Hello,
I wrote the program below to do basic text preprocessing for use in text mining tools. The problem is probably something trivial, but I cannot see what is wrong.
#####################################################
##
## Text mining example - preprocessing
##
#####################################################

# Clear the global environment.
# NOTE(review): this removes objects only; attached packages and options are
# left as they are, so restarting R is the more reliable "clean slate".
rm(list = ls())

# Use the folder of the script currently open in RStudio as working directory.
# The rstudioapi call only works inside RStudio, so it is skipped elsewhere
# (Rscript, plain R console) instead of aborting the whole script.
if (requireNamespace("rstudioapi", quietly = TRUE) &&
    rstudioapi::isAvailable()) {
  my.d <- dirname(rstudioapi::getActiveDocumentContext()$path)
  # An unsaved document reports an empty path, which setwd() cannot use.
  if (nzchar(my.d)) {
    setwd(my.d)
  }
}

# Close all open graphics devices
graphics.off()
#####################################################
# tm provides the corpus infrastructure and the text transformations.
library(tm)

# Sample sentences (Portuguese) mixing accents, misspellings, digits,
# punctuation and stray symbols.
text <- c(
  "Isto nao é um têste.",
  "São Paulo está <*> sob os atáques de histéricos: 1280 pesoas já chegaram!",
  "Goiânia é capital de Goias - localizada no centro-oeste;",
  "Três catacrismas deverão // acontecer em menos de duas horas."
)
text

# dplyr re-exports tibble(); data_frame() is deprecated in its favour.
library(dplyr)
# One row per sentence; number the rows from the data instead of hard-coding
# the count.
dados <- tibble(line = seq_along(text), Texto = text)
dados
##########################################
# Build a corpus with one document per sentence and show the first document.
docs_corpus <- Corpus(VectorSource(dados$Texto))
inspect(docs_corpus[1])
# Work on a copy so the untouched corpus stays available in docs_corpus.
docs <- docs_corpus

# Preprocessing
# Replace special characters with a space.
# Every pattern is matched literally (fixed = TRUE). As regular expressions,
# "|" is the alternation operator and "<*>" means "zero or more '<' followed
# by '>'", so neither would match the intended text. Note that "\|" is not a
# valid escape inside an R string and makes the script fail to parse.
replace_with_space <- content_transformer(
  function(x, pattern) gsub(pattern, " ", x, fixed = TRUE)
)
for (pattern in c("/", "@", "|", "<>", "<*>")) {
  docs <- tm_map(docs, replace_with_space, pattern)
}

# Show the content
for (i in seq_along(docs)) {
  print(docs$content[i])
}
# Convert everything to lower case.
# Continue from `docs` rather than `docs_corpus`: starting again from the raw
# corpus would throw away the changes already made to `docs`.
# content_transformer() wraps a plain character function such as tolower()
# so that tm_map() can apply it to the documents' content.
docs <- tm_map(docs, content_transformer(tolower))

# Remove unwanted elements: digits first, then punctuation
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, removePunctuation)

# Show the content
for (i in seq_along(docs)) {
  print(docs$content[i])
}
# Fix misspelled words and join compound terms that must not be split
# (also the place to handle plural/singular and infinitive verb forms).
# Each entry is c(pattern, replacement). Patterns are regular expressions and
# are applied to every document in the order listed.
corrections <- list(
  c("são paulo", "são_paulo"),
  c("têste", "teste"),
  c("atáque", "ataque"),
  c("catacrismas", "cataclismas"),
  c("pesoas", "pessoas"),
  c("nao", "não"),
  c("goias", "goiás"),
  c("aaa*", "aaaaa"),
  c('^[ha]+$', "aaaaa")
)
for (j in seq_along(docs)) {
  for (rule in corrections) {
    docs[[j]] <- gsub(rule[1], rule[2], docs[[j]])
  }
}

# Collapse runs of whitespace between words
docs <- tm_map(docs, stripWhitespace)

# Show the content
for (i in seq_along(docs)) {
  print(docs$content[i])
}
# Build the Portuguese stop-word list: tm's built-in list plus extra words to
# leave out of the analysis (append further words to the vector below).
stw <- c(stopwords("portuguese"), "que", "como", "são", "até", "ser", "é")

# Normalise the list to UTF-8 based on each string's declared encoding.
# iconv(stw, "UTF-8", "latin1") assumes every entry is UTF-8: an entry that is
# not valid UTF-8 comes back as NA and drops out of the list, and on a UTF-8
# system the list ends up in a different encoding from the corpus.
# NOTE(review): possibly the reason an accented word such as "é" stayed in
# the corpus - confirm on the affected machine.
stw <- enc2utf8(stw)
str(stw)

docs <- tm_map(docs, removeWords, stw) # Remove the stop words
docs <- tm_map(docs, stripWhitespace)  # Collapse leftover whitespace
inspect(docs[1])

# Show the content
for (i in seq_along(docs)) {
  print(docs$content[i])
}
My list of stop words contains "é" ("is"), among others, but this is the only term that is not removed from my corpus.
Does anyone know what might be happening?