# ---- Session-wide display options ----
# Prefer fixed to scientific notation
options(scipen = 5)
# Print numbers with two significant digits (console and R2HTML output)
options(digits = 2)
options(R2HTML.format.digits = 2)
# Set a nice color palette for plots.
# "Set1" is requested with 9 colours because the index vector below reaches
# position 9; with only 8 colours requested the last entry came out as NA.
# local() keeps the helper vector out of the global workspace.
local({
  set1Colors <- RColorBrewer::brewer.pal(9, "Set1")[c(2:1, 3:5, 7:9)]
  lattice.options(default.theme = latticeExtra::custom.theme(
    symbol = set1Colors,
    fill = set1Colors,
    region = RColorBrewer::brewer.pal(n = 11, name = "Spectral")
  ))
})
# ---- First import: comment text only, no metadata variables ----
# Drop a stale `lengths` object if an earlier run left one behind; checking
# first avoids the "object not found" warning of a bare rm().
if (exists("lengths", inherits = FALSE)) {
  rm(lengths)
}
library(RODBC)
# Read the whole first worksheet of the Excel workbook through ODBC
channel <-
  odbcConnectExcel2007("C:/Users/CCCC/Documents/DateKnime/adevarulCOMENTARII_Siriamartie21.xlsx")
corpusDataset <- sqlQuery(channel = channel, "select * from [Sheet1$]")
odbcCloseAll()
# On failure sqlQuery() returns error text (or -1) rather than a data frame;
# stop here instead of failing later with a confusing subsetting error.
if (!is.data.frame(corpusDataset)) {
  stop("Could not read [Sheet1$] from the workbook: ",
       paste(corpusDataset, collapse = "; "), call. = FALSE)
}
# One document per row of the "TextComentariu" column, tagged as Romanian
corpus <- Corpus(DataframeSource(corpusDataset["TextComentariu"]),
                 readerControl = list(language = "ro"))
# No metadata columns are kept in this pass: corpusVars is a placeholder with
# a single empty factor, one row per document.
corpusVars <- data.frame(var1 = factor(rep("", length(corpus))),
                         row.names = names(corpus))
activeDataSet("corpusVars")
setCorpusVariables()
# ---- Build the document-term matrix (first pass) ----
# Work on a copy so that `corpus` itself keeps the original text.
dtmCorpus <- tm_map(corpus, content_transformer(tolower))
# Collapse every run of punctuation, whitespace and control characters into a
# single space. The narrow no-break space (U+202F) and thin space (U+2009) are
# written as escapes: the pasted log had the literal text "<U+202F><U+2009>",
# which made the class match the characters < U + 2 0 F 9 > instead.
# NOTE(review): the "?" in the class may also stand for a character lost in
# the same way -- confirm against the original script.
dtmCorpus <- tm_map(dtmCorpus, content_transformer(function(x)
  gsub("(['?\n\u202f\u2009]|[[:punct:]]|[[:space:]]|[[:cntrl:]])+", " ", x)))
dtmCorpus <- tm_map(dtmCorpus, removeNumbers)
# Text is already lower-cased above, hence tolower = FALSE; keep terms of two
# or more characters.
dtm <- DocumentTermMatrix(dtmCorpus, control = list(tolower = FALSE,
                                                    wordLengths = c(2, Inf)))
rm(dtmCorpus)
# The dictionary records every term with its total count and a stopword flag;
# it is built before stopwords are dropped from the matrix.
isStopword <- colnames(dtm) %in% stopwords("ro")
dictionary <- data.frame(row.names = colnames(dtm),
                         "Occurrences" = col_sums(dtm),
                         "Stopword" = ifelse(isStopword, "Stopword", ""),
                         stringsAsFactors = FALSE)
dtm <- dtm[, !isStopword]
attr(dtm, "dictionary") <- dictionary
rm(dictionary, isStopword)
# Store language and processing flags on both the corpus and the matrix
meta(corpus, type = "corpus", tag = "language") <-
  attr(dtm, "language") <- "ro"
meta(corpus, type = "corpus", tag = "processing") <-
  attr(dtm, "processing") <-
  c(lowercase = TRUE, punctuation = TRUE, digits = TRUE, stopwords = TRUE,
    stemming = FALSE, customStemming = FALSE, twitter = FALSE,
    removeHashtags = NA, removeNames = NA)
# Print summaries of both objects
corpus
dtm
# ---- Word cloud and term co-occurrence (first pass) ----
# Remove the Romanian stopword list from the documents. Passing the literal
# string "romanian" to removeWords only removed that one word.
corpus <- tm_map(corpus, removeWords, stopwords("ro"))
# Print the stopword list for reference
stopwords(kind = "ro")
# Terms x documents matrix. `wordLengths` is the control option that sets the
# accepted term length; `minWordLength` is not among the documented termFreq
# control options, so the intended minimum of 1 was not being applied.
myDtm <- TermDocumentMatrix(corpus, control = list(wordLengths = c(1, Inf),
                                                   stopwords = TRUE))
# Peek at a small window (assumes at least 270 terms and 40 documents)
inspect(myDtm[266:270, 31:40])
library(wordcloud)
# Total frequency of each term over all documents, most frequent first
m <- as.matrix(myDtm)
v <- sort(rowSums(m), decreasing = TRUE)
myNames <- names(v)
d <- data.frame(word = myNames, freq = v)
# Draw only terms that occur at least 33 times
wordcloud(d$word, d$freq, min.freq = 33)
# Binary incidence matrix: 1 when the term occurs in the document at all.
# `m` is reused rather than converting myDtm a second time.
termDocMatrix <- m
termDocMatrix[termDocMatrix >= 1] <- 1
termDocMatrix[5:10, 1:20]
# Term x term co-occurrence counts (number of documents shared by two terms);
# tcrossprod(x) computes x %*% t(x).
termMatrix <- tcrossprod(termDocMatrix)
# inspect terms numbered 5 to 10
termMatrix[5:10, 5:10]
# Save the incidence matrix as space-separated text
write.table(termDocMatrix, file = "e:/matrix", sep = " ")
# ---- Second import: comment text plus four metadata variables ----
# Clear the objects of the previous pass. Only names that actually exist are
# removed, so no "object not found" warning is raised (the original removed
# `lengths` twice in a row).
rm(list = intersect(c("corpus", "corpusVars", "dtm", "lengths"), ls()))
library(RODBC)
# Read the whole first worksheet of the Excel workbook through ODBC
channel <-
  odbcConnectExcel2007("C:/Users/cristian.chirita/Documents/DateKnime/adevarulCOMENTARII_Siriamartie21.xlsx")
corpusDataset <- sqlQuery(channel = channel, "select * from [Sheet1$]")
odbcCloseAll()
# On failure sqlQuery() returns error text (or -1) rather than a data frame;
# stop here instead of failing later with a confusing subsetting error.
if (!is.data.frame(corpusDataset)) {
  stop("Could not read [Sheet1$] from the workbook: ",
       paste(corpusDataset, collapse = "; "), call. = FALSE)
}
# One document per row of the "TextComentariu" column, tagged as Romanian
corpus <- Corpus(DataframeSource(corpusDataset["TextComentariu"]),
                 readerControl = list(language = "ro"))
# Keep four of the remaining columns as per-document variables
corpusVars <- corpusDataset[!names(corpusDataset) == "TextComentariu"]
corpusVars <- corpusVars[c("nume articol", "rankcomentator", "Autor",
                           "like-uri")]
activeDataSet("corpusVars")
# Turn the column names into syntactic R names (e.g. "like-uri" -> "like.uri")
names(corpusVars) <- make.names(names(corpusVars))
setCorpusVariables()
# ---- Build the document-term matrix (second pass) ----
# Work on a copy so that `corpus` itself keeps the original text.
dtmCorpus <- tm_map(corpus, content_transformer(tolower))
# Collapse every run of punctuation, whitespace and control characters into a
# single space. The narrow no-break space (U+202F) and thin space (U+2009) are
# written as escapes: the pasted log had the literal text "<U+202F><U+2009>",
# which made the class match the characters < U + 2 0 F 9 > instead.
# NOTE(review): the "?" in the class may also stand for a character lost in
# the same way -- confirm against the original script.
dtmCorpus <- tm_map(dtmCorpus, content_transformer(function(x)
  gsub("(['?\n\u202f\u2009]|[[:punct:]]|[[:space:]]|[[:cntrl:]])+", " ", x)))
dtmCorpus <- tm_map(dtmCorpus, removeNumbers)
# Text is already lower-cased above, hence tolower = FALSE; keep terms of two
# or more characters.
dtm <- DocumentTermMatrix(dtmCorpus, control = list(tolower = FALSE,
                                                    wordLengths = c(2, Inf)))
rm(dtmCorpus)
# The dictionary records every term with its total count and a stopword flag;
# it is built before stopwords are dropped from the matrix.
isStopword <- colnames(dtm) %in% stopwords("ro")
dictionary <- data.frame(row.names = colnames(dtm),
                         "Occurrences" = col_sums(dtm),
                         "Stopword" = ifelse(isStopword, "Stopword", ""),
                         stringsAsFactors = FALSE)
dtm <- dtm[, !isStopword]
attr(dtm, "dictionary") <- dictionary
rm(dictionary, isStopword)
# Store language and processing flags on both the corpus and the matrix
meta(corpus, type = "corpus", tag = "language") <-
  attr(dtm, "language") <- "ro"
meta(corpus, type = "corpus", tag = "processing") <-
  attr(dtm, "processing") <-
  c(lowercase = TRUE, punctuation = TRUE, digits = TRUE, stopwords = TRUE,
    stemming = FALSE, customStemming = FALSE, twitter = FALSE,
    removeHashtags = NA, removeNames = NA)
# Print summaries of both objects
corpus
dtm
# Logical mask over the documents: TRUE where the document's "like.uri" value
# is one of the labels listed below. Every listed label starts with a negative
# number, except "0 (16 voturi)".
# NOTE(review): presumably the leading number is the net score and the number
# in parentheses the vote count -- confirm against the spreadsheet.
keep <- is.element(
  meta(corpus, "like.uri")[[1]],
  c(
    "-2 (6 voturi)",
    "-4 (6 voturi)", "-4 (4 voturi)", "-1 (5 voturi)", "-1 (7 voturi)",
    "-1 (1 voturi)", "-2 (4 voturi)", "-6 (8 voturi)", "-4 (8 voturi)",
    "-3 (3 voturi)", "-5 (13 voturi)", "-6 (14 voturi)", "-4 (12 voturi)",
    "-3 (11 voturi)", "-2 (2 voturi)", "-3 (15 voturi)", "-5 (15 voturi)",
    "-1 (11 voturi)", "-5 (11 voturi)", "-3 (7 voturi)", "-2 (18 voturi)",
    "-1 (3 voturi)", "-3 (9 voturi)", "-3 (5 voturi)", "0 (16 voturi)",
    "-6 (10 voturi)", "-6 (18 voturi)", "-10 (18 voturi)", "-14 (16 voturi)",
    "-11 (15 voturi)", "-12 (14 voturi)", "-2 (14 voturi)", "-4 (10 voturi)",
    "-1 (9 voturi)", "-5 (21 voturi)", "-4 (14 voturi)", "-1 (13 voturi)",
    "-2 (10 voturi)", "-5 (7 voturi)", "-5 (5 voturi)", "-6 (6 voturi)",
    "-7 (25 voturi)", "-13 (27 voturi)", "-2 (16 voturi)", "-5 (17 voturi)",
    "-7 (9 voturi)", "-7 (7 voturi)", "-12 (52 voturi)", "-1 (29 voturi)",
    "-5 (31 voturi)", "-5 (41 voturi)", "-1 (35 voturi)", "-2 (34 voturi)",
    "-5 (37 voturi)", "-6 (28 voturi)", "-6 (36 voturi)", "-9 (37 voturi)",
    "-8 (24 voturi)", "-2 (8 voturi)", "-5 (9 voturi)", "-7 (11 voturi)",
    "-8 (8 voturi)", "-2 (12 voturi)", "-8 (36 voturi)", "-3 (29 voturi)",
    "-8 (28 voturi)", "-1 (15 voturi)", "-13 (23 voturi)", "-12 (18 voturi)",
    "-10 (14 voturi)", "-10 (12 voturi)", "-6 (12 voturi)", "-8 (18 voturi)",
    "-6 (44 voturi)", "-17 (57 voturi)", "-11 (57 voturi)", "-3 (21 voturi)",
    "-4 (20 voturi)", "-7 (19 voturi)", "-8 (14 voturi)", "-3 (19 voturi)",
    "-4 (36 voturi)", "-8 (26 voturi)", "-1 (17 voturi)", "-6 (16 voturi)",
    "-8 (10 voturi)", "-9 (13 voturi)", "-9 (9 voturi)", "-4 (22 voturi)",
    "-2 (20 voturi)", "-2 (22 voturi)", "-6 (24 voturi)", "-7 (15 voturi)",
    "-11 (25 voturi)", "-7 (13 voturi)", "-5 (19 voturi)", "-4 (16 voturi)",
    "-15 (29 voturi)", "-3 (17 voturi)", "-8 (12 voturi)", "-1 (21 voturi)",
    "-5 (23 voturi)", "-1 (25 voturi)", "-16 (20 voturi)", "-9 (11 voturi)",
    "-16 (16 voturi)", "-3 (13 voturi)", "-5 (29 voturi)", "-10 (20 voturi)",
    "-8 (20 voturi)", "-9 (17 voturi)", "-8 (30 voturi)", "-2 (38 voturi)",
    "-10 (28 voturi)", "-1 (53 voturi)", "-1 (43 voturi)", "-3 (47 voturi)",
    "-4 (40 voturi)", "-8 (34 voturi)", "-7 (49 voturi)", "-4 (26 voturi)",
    "-7 (23 voturi)", "-24 (30 voturi)", "-3 (27 voturi)", "-15 (23 voturi)",
    "-12 (12 voturi)", "-17 (19 voturi)", "-31 (43 voturi)", "-16 (30 voturi)",
    "-17 (27 voturi)", "-14 (26 voturi)"
  )
)
# ---- Restrict corpus, matrix and variables to the documents in `keep` ----
# Keep the unfiltered objects under orig* names, and remember the matrix
# attributes: the custom ones are re-attached by hand after subsetting.
origCorpus <- corpus
origDtm <- dtm
dtmAttr <- attributes(dtm)
corpus <- corpus[keep]
dtm <- dtm[keep, ]
# Rebuild the dictionary so that the counts describe the kept documents only
isStopword <- colnames(dtm) %in% stopwords("ro")
dictionary <- data.frame(row.names = colnames(dtm),
                         "Occurrences" = col_sums(dtm),
                         "Stopword" = ifelse(isStopword, "Stopword", ""),
                         stringsAsFactors = FALSE)
dtm <- dtm[, !isStopword]
attr(dtm, "dictionary") <- dictionary
rm(dictionary, isStopword)
# Use the full attribute name: `dtmAttr$lang` only worked through partial
# matching of `$` on lists.
attr(dtm, "language") <- dtmAttr$language
attr(dtm, "processing") <- dtmAttr$processing
rm(dtmAttr)
corpusVars <- corpusVars[keep, , drop = FALSE]
# Remove temporaries; only names that exist are removed, so a missing
# `lengths` no longer triggers a warning.
rm(list = intersect(c("keep", "lengths"), ls()))
# Print summaries of both objects
corpus
dtm
# ---- Interactive inspection of the per-document variables ----
# relimp is attached at position 23 of the search path.
library(relimp, pos = 23)
# Open a viewer window on corpusVars, using the R Commander log font
showData(
  corpusVars,
  placement = "-20+200",
  font = getRcmdr("logFont"),
  maxwidth = 80,
  maxheight = 30
)
# The data editor is opened twice, as in the recorded session
editDataset(corpusVars)
editDataset(corpusVars)
####################################################
# ---- Word cloud and term co-occurrence (filtered corpus) ----
# Remove the Romanian stopword list from the documents. Passing the literal
# string "romanian" to removeWords only removed that one word.
corpus <- tm_map(corpus, removeWords, stopwords("ro"))
# Print the stopword list for reference
stopwords(kind = "ro")
# Terms x documents matrix. `wordLengths` is the control option that sets the
# accepted term length; `minWordLength` is not among the documented termFreq
# control options, so the intended minimum of 1 was not being applied.
myDtm <- TermDocumentMatrix(corpus, control = list(wordLengths = c(1, Inf),
                                                   stopwords = TRUE))
# Peek at a small window (assumes at least 270 terms and 40 documents)
inspect(myDtm[266:270, 31:40])
library(wordcloud)
# Total frequency of each term over all documents, most frequent first
m <- as.matrix(myDtm)
v <- sort(rowSums(m), decreasing = TRUE)
myNames <- names(v)
d <- data.frame(word = myNames, freq = v)
# Draw only terms that occur at least 33 times
wordcloud(d$word, d$freq, min.freq = 33)
# Binary incidence matrix: 1 when the term occurs in the document at all.
# `m` is reused rather than converting myDtm a second time.
termDocMatrix <- m
termDocMatrix[termDocMatrix >= 1] <- 1
termDocMatrix[5:10, 1:20]
# Term x term co-occurrence counts (number of documents shared by two terms);
# tcrossprod(x) computes x %*% t(x).
termMatrix <- tcrossprod(termDocMatrix)
# inspect terms numbered 5 to 10
termMatrix[5:10, 5:10]
# ---- Term co-occurrence network ----
library(igraph)
# Build an undirected graph from the term x term matrix; the matrix entries
# become edge weights. TRUE is spelled out because `T` is an ordinary variable
# that can be reassigned. The current igraph function names are used in place
# of the deprecated graph.adjacency / layout.fruchterman.reingold /
# write.graph.
g <- graph_from_adjacency_matrix(termMatrix, weighted = TRUE,
                                 mode = "undirected")
# Remove self-loops (the matrix diagonal) and merge any multiple edges
g <- simplify(g)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
# set seed to make the layout reproducible
set.seed(3952)
layout1 <- layout_with_fr(g)
plot(g, layout = layout1)
# Export the graph in GraphML format
write_graph(g, "e:/coocgraph", format = "graphml")
# https://rstudio-pubs-static.s3.amazonaws.com/31867_8236987cf0a8444e962ccd2aec46d9c3.html#clustering-by-term-similarity
# Graficul rezultat nu este filtrat din punct de vedere gramatical, motiv pentru care conține prea mult zgomot.

No comments:
Post a Comment