# References:
#   http://www.rdatamining.com/examples/social-network-analysis
#   http://www.rdatamining.com/examples/text-mining
# Numeric display settings: favour fixed over scientific notation (scipen)
# and show two significant digits, both on the console (digits) and in
# R2HTML output (R2HTML.format.digits).
options(scipen = 5, digits = 2, R2HTML.format.digits = 2)

# Default lattice theme: symbols and fills come from a reordered "Set1"
# palette, regions from the 11-step "Spectral" palette.
# Note: the index vector reaches position 9 of an 8-colour palette, so the
# last symbol/fill entry is NA.
lattice.options(
  default.theme = latticeExtra::custom.theme(
    symbol = RColorBrewer::brewer.pal(8, "Set1")[c(2:1, 3:5, 7:9)],
    fill = RColorBrewer::brewer.pal(8, "Set1")[c(2:1, 3:5, 7:9)],
    region = RColorBrewer::brewer.pal(n = 11, name = "Spectral")
  )
)

# Remove the object named `lengths` from the current environment.
rm(list = "lengths")
library(RODBC)

# Load every row of worksheet "Sheet1" from the Excel workbook into
# `corpusDataset` through an ODBC channel.
channel <- odbcConnectExcel(
  "C:/Users/cristian.chirita/Documents/DateKnime/adevarulCOMENTARII_Siriamartie21.xls"
)
# A failed connection does not yield an "RODBC" handle; stop here instead of
# letting the query below fail with a less helpful message.
if (!inherits(channel, "RODBC")) {
  stop("Could not open the Excel workbook through ODBC", call. = FALSE)
}

corpusDataset <- sqlQuery(channel = channel, "select * from [Sheet1$]")

# Close only the channel opened above; odbcCloseAll() would also close any
# unrelated channels that happen to be open in this session.
odbcClose(channel)

# On failure sqlQuery() returns the error text rather than a data frame.
if (!is.data.frame(corpusDataset)) {
  stop(
    "Reading [Sheet1$] failed: ",
    paste(corpusDataset, collapse = "; "),
    call. = FALSE
  )
}
# Build the corpus from the "TextComentariu" column, declared as Romanian.
corpus <- Corpus(
  DataframeSource(corpusDataset["TextComentariu"]),
  readerControl = list(language = "ro")
)

# All remaining columns are kept aside as `corpusVars`.
corpusVars <- corpusDataset[names(corpusDataset) != "TextComentariu"]

# Make `corpusVars` the active data set, then give its columns syntactically
# valid names before registering them with the corpus.
# NOTE(review): activeDataSet() and setCorpusVariables() are not defined in
# this file (presumably Rcmdr / R.TeMiS helpers); the call order is preserved.
activeDataSet("corpusVars")
names(corpusVars) <- make.names(names(corpusVars))
setCorpusVariables()
# Normalise a working copy of the corpus and turn it into a
# document-term matrix (documents in rows, terms in columns).
dtmCorpus <- corpus

# 1. Lower-case every document.
dtmCorpus <- tm_map(dtmCorpus, content_transformer(tolower))

# 2. Collapse each run of apostrophes, question marks, newlines, narrow
#    no-break spaces (U+202F), thin spaces (U+2009), punctuation, whitespace
#    and control characters into a single space.
#    The two Unicode spaces are written as \u escapes: the previous literal
#    text "<U+202F><U+2009>" inside the character class matched the
#    characters '<', 'U', '+', '2', '0', 'F', '9' and '>' instead.
#    NOTE(review): the '?' may itself be a mis-encoded character; it is
#    matched by [[:punct:]] anyway, so it is kept as is.
dtmCorpus <- tm_map(
  dtmCorpus,
  content_transformer(function(x) {
    gsub(
      "(['?\n\u202f\u2009]|[[:punct:]]|[[:space:]]|[[:cntrl:]])+",
      " ",
      x
    )
  })
)

# 3. Strip digits.
dtmCorpus <- tm_map(dtmCorpus, removeNumbers)

# Keep every term of at least two characters.
dtm <- DocumentTermMatrix(
  dtmCorpus,
  control = list(tolower = TRUE, wordLengths = c(2, Inf))
)
rm(dtmCorpus)
# Per-term dictionary: one row per term, with its total number of
# occurrences and a flag marking Romanian stopwords.
dictionary <- data.frame(
  row.names = colnames(dtm),
  "Occurrences" = col_sums(dtm),
  "Stopword" = ifelse(colnames(dtm) %in% stopwords("ro"), "Stopword", ""),
  stringsAsFactors = FALSE
)

# Drop the stopword columns from the matrix; the dictionary (which still
# lists them) travels along as an attribute.
dtm <- dtm[, dictionary$Stopword != "Stopword"]
attr(dtm, "dictionary") <- dictionary
rm(dictionary)

# Record the language on both the matrix and the corpus.
attr(dtm, "language") <- "ro"
meta(corpus, type = "corpus", tag = "language") <- "ro"

# Record which processing steps were applied, again on both objects.
attr(dtm, "processing") <- c(
  lowercase = TRUE,
  punctuation = TRUE,
  digits = TRUE,
  stopwords = TRUE,
  stemming = FALSE,
  customStemming = FALSE,
  twitter = FALSE,
  removeHashtags = NA,
  removeNames = NA
)
meta(corpus, type = "corpus", tag = "processing") <- attr(dtm, "processing")
# Print summaries of the corpus and of the document-term matrix.
corpus
dtm

# Term-document view of the same counts. `dtm` is already a matrix, not a
# corpus, so it is transposed rather than passed to TermDocumentMatrix().
myDtm <- t(dtm)

# Look at a slice of each orientation (terms x documents, then
# documents x terms).
inspect(myDtm[100:170, 10:20])
inspect(dtm[100:170, 10:20])

# Terms occurring at least 10 times, and terms whose correlation with
# "kurzi" is at least 0.30.
findFreqTerms(dtm, lowfreq = 10)
findAssocs(dtm, "kurzi", 0.30)
library(wordcloud)

# Total frequency of every term, most frequent first.
# `dtm` has documents in rows and terms in columns, so term totals are
# column sums; row sums would give per-document token counts and the cloud
# would show document ids. col_sums() works on the sparse matrix directly,
# so no dense copy is created.
v <- sort(col_sums(dtm), decreasing = TRUE)

# Display "siria" with a capital letter in the cloud.
myNames <- names(v)
k <- which(names(v) == "siria")
myNames[k] <- "Siria"

d <- data.frame(word = myNames, freq = v, stringsAsFactors = FALSE)
wordcloud(d$word, d$freq, min.freq = 3)
# Term-document matrix built straight from the unprocessed corpus.
# `wordLengths` is the control option used elsewhere in this script for
# the minimum term length; `minWordLength` is the obsolete spelling.
myDtm <- TermDocumentMatrix(corpus, control = list(wordLengths = c(1, Inf)))
inspect(myDtm[100:170, 10:20])

# Lower-case through content_transformer(), as the earlier pipeline does,
# so the documents stay valid corpus entries.
myCorpus <- tm_map(corpus, content_transformer(tolower))
# remove punctuation
myCorpus <- tm_map(myCorpus, removePunctuation)
# remove numbers
myCorpus <- tm_map(myCorpus, removeNumbers)
# remove stopwords
# keep "r" by removing it from stopwords
# Logical filtering is used on purpose: `x[-which(x == "r")]` returns an
# EMPTY vector when "r" is absent, which silently disabled stopword removal.
# NOTE(review): the corpus is declared Romanian ("ro") but the English
# stopword list is used here - confirm whether stopwords("ro") was intended.
myStopwords <- c(stopwords("english"), "available", "via")
myStopwords <- myStopwords[myStopwords != "r"]
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
myCorpus

myDtm <- TermDocumentMatrix(myCorpus, control = list(wordLengths = c(1, Inf)))
# NOTE(review): these hard-coded index ranges assume the matrix has at least
# 4270 terms and 4340 documents - adjust to the actual dimensions.
inspect(myDtm[4266:4270, 4331:4340])
findFreqTerms(myDtm, lowfreq = 20)
findAssocs(myDtm, "victime", 0.30)

# Dense copy of the term-document matrix.
termDocMatrix <- as.matrix(myDtm)
# Note: the file is too large :)
No comments:
Post a Comment