Friday, April 8, 2016

Wordcloud romana


# Number formatting for printed output, set in a single call:
#   scipen = 5                biases printing towards fixed notation
#   digits = 2                prints numbers with two significant digits
#   R2HTML.format.digits = 2  stores the same digit count under the
#                             R2HTML-named option
options(scipen = 5, digits = 2, R2HTML.format.digits = 2)
# Set a nice colour palette for lattice plots.
# "Set1" is requested with 9 colours because the index vector below reaches
# position 9; asking for only 8 colours left that last entry as NA. The
# index vector itself is unchanged: the first two colours are swapped and
# the sixth is skipped.
# lattice.options() is called through its namespace so this step does not
# depend on lattice already being attached.
local({
  set1 <- RColorBrewer::brewer.pal(9, "Set1")[c(2:1, 3:5, 7:9)]
  lattice::lattice.options(
    default.theme = latticeExtra::custom.theme(
      symbol = set1,
      fill = set1,
      region = RColorBrewer::brewer.pal(n = 11, name = "Spectral")
    )
  )
})
# Load the comments spreadsheet into `corpusDataset` through ODBC.

# Remove a global object named `lengths` (rm() warns if there is none).
# NOTE(review): presumably this unmasks base::lengths() - confirm.
rm(lengths)
library(RODBC)
channel <-
  odbcConnectExcel2007("C:/Users/cc/Documents/DateKnime/adevarulCOMENTARII_Siriamartie21.xlsx")
# A failed connection does not yield an "RODBC" object; stop here instead of
# failing later with a less helpful message.
if (!inherits(channel, "RODBC")) {
  stop("Could not open the Excel workbook through ODBC", call. = FALSE)
}
corpusDataset <- sqlQuery(channel = channel, "select * from [Sheet1$]")
# Close only the connection opened above, so that any other ODBC channels
# open in this session are left alone.
odbcClose(channel)
# On failure sqlQuery() returns something other than a data frame (error
# messages or an error code), so check before using the result.
if (!is.data.frame(corpusDataset)) {
  stop("Reading [Sheet1$] failed: ",
    paste(corpusDataset, collapse = "; "), call. = FALSE)
}
# Build the corpus from the "TextComentariu" column, with the reader
# language set to "ro".
corpus <- Corpus(
  DataframeSource(corpusDataset["TextComentariu"]),
  readerControl = list(language = "ro")
)

# All columns other than the comment text ...
corpusVars <- corpusDataset[names(corpusDataset) != "TextComentariu"]
# ... which is replaced straight away by a placeholder: a single factor
# column `var1` of empty strings, one row per document, with the corpus
# document names as row names.
corpusVars <- data.frame(
  var1 = factor(character(length(corpus))),
  row.names = names(corpus)
)

# Make `corpusVars` the active data set and attach it to the corpus.
# NOTE(review): activeDataSet() and setCorpusVariables() are not defined in
# this file; presumably they come from R Commander and its text-mining
# plug-in - confirm.
activeDataSet("corpusVars")
setCorpusVariables()
# Clean a working copy of the corpus and build the document-term matrix.
dtmCorpus <- corpus
dtmCorpus <- tm_map(dtmCorpus, content_transformer(tolower))
# Collapse every run of apostrophes, punctuation, whitespace and control
# characters into a single space.
# The two Unicode spaces (U+202F narrow no-break space, U+2009 thin space)
# are written as escapes. The original held the literal text "<U+202F>" and
# "<U+2009>", so the class matched the characters < U + 2 0 F > 9 instead
# of those spaces. In particular the digits 0, 2 and 9 became spaces while
# other digits were deleted later; now removeNumbers() handles all digits
# the same way.
# NOTE(review): the "?" in the class may itself be a mangled typographic
# apostrophe (U+2019) - confirm against the original script.
dtmCorpus <- tm_map(dtmCorpus, content_transformer(function(x)
  gsub("(['?\n\u202F\u2009]|[[:punct:]]|[[:space:]]|[[:cntrl:]])+", " ",
  x)))
dtmCorpus <- tm_map(dtmCorpus, removeNumbers)
# The text is already lower-cased above, so tolower is switched off here;
# terms need at least two characters.
dtm <- DocumentTermMatrix(dtmCorpus, control=list(tolower=FALSE,
  wordLengths=c(2, Inf)))
# The cleaned copy is no longer needed once the matrix exists.
rm(dtmCorpus)
# Term dictionary: one row per term of the unfiltered matrix, holding its
# total occurrence count and a flag for Romanian stopwords. It must be built
# before the stopword columns are dropped from `dtm`.
dictionary <- local({
  terms <- colnames(dtm)
  data.frame(
    row.names = terms,
    Occurrences = col_sums(dtm),
    Stopword = ifelse(terms %in% stopwords("ro"), "Stopword", ""),
    stringsAsFactors = FALSE
  )
})

# Drop the stopword columns, then attach the full dictionary as an attribute.
dtm <- dtm[, !(colnames(dtm) %in% stopwords("ro"))]
attr(dtm, "dictionary") <- dictionary
rm(dictionary)

# Record language and processing steps on both the matrix and the corpus.
attr(dtm, "language") <- "ro"
meta(corpus, type = "corpus", tag = "language") <- "ro"

attr(dtm, "processing") <- c(
  lowercase = TRUE,
  punctuation = TRUE,
  digits = TRUE,
  stopwords = TRUE,
  stemming = FALSE,
  customStemming = FALSE,
  twitter = FALSE,
  removeHashtags = NA,
  removeNames = NA
)
meta(corpus, type = "corpus", tag = "processing") <- attr(dtm, "processing")

# Show summaries of both objects.
corpus
dtm
# Build a term-document matrix from the corpus and draw a word cloud of the
# most frequent terms.

# Strip Romanian stopwords from the corpus. The original passed the literal
# string "romanian" to removeWords(), which only deleted that one word.
corpus <- tm_map(corpus, removeWords, stopwords("ro"))
# Show the stopword list being applied.
stopwords(kind = "ro")
# Terms in rows, documents in columns. `wordLengths` replaces the old
# `minWordLength` entry, which is not a documented control option in current
# tm, so the intended minimum length of 1 was not being applied.
myDtm <- TermDocumentMatrix(corpus,
  control = list(wordLengths = c(1, Inf), stopwords = TRUE))
# Peek at a small window of the matrix. The requested rows and columns are
# clipped to the matrix dimensions so a smaller corpus does not raise a
# subscript error.
local({
  termRows <- intersect(266:270, seq_len(nrow(myDtm)))
  docCols <- intersect(31:40, seq_len(ncol(myDtm)))
  if (length(termRows) > 0 && length(docCols) > 0) {
    inspect(myDtm[termRows, docCols])
  }
})
library(wordcloud)
# Total frequency per term, most frequent first.
m <- as.matrix(myDtm)
v <- sort(rowSums(m), decreasing = TRUE)
myNames <- names(v)
# Keep the words as character strings on every R version.
d <- data.frame(word = myNames, freq = v, stringsAsFactors = FALSE)
# Only terms occurring at least 33 times are drawn.
wordcloud(d$word, d$freq, min.freq = 33)


No comments:

Post a Comment