(freq.terms <- findFreqTerms(dtm, lowfreq = 30))
agraph <- graphplot(dtm,term = freq.terms,corThreshold = 0.1)
write.graph(agraph,"e:/agraphsiria.graphml", format=c("graphml"))
Thursday, April 21, 2016
Alternate plot function for tm with Igraph support.
graphplot <-
function(x,
terms = sample(Terms(x), 20),
corThreshold = 0.7,
weighted=TRUE,
diag=FALSE,
...)
{
if (system.file(package = "igraph") == "")
stop("Plotting requires package 'igraph'.")
m <- if (inherits(x, "TermDocumentMatrix")) t(x) else x
m <- as.matrix(m[, terms])
c <- cor(m)
c[c < corThreshold] <- 0
c[is.na(c)] <- 0
diag(c) <- 0
tmgraph <- graph.adjacency(c, mode=c("undirected"), weighted=TRUE, diag=FALSE,
add.colnames=NULL, add.rownames=NA)
plot(tmgraph)
invisible(tmgraph)
}
example usage:
graphplot(dtm,term = freq.terms,corThreshold = 0.5)
dtm- document term matrix
terms- similar to plot.tm
threshold-similar with plot.pm
for saving/export:
agraph <- graphplot(dtm,term = freq.terms,corThreshold = 0.3)
write.graph(agraph,"e:/agraph.graphml", format=c("graphml"))
function(x,
terms = sample(Terms(x), 20),
corThreshold = 0.7,
weighted=TRUE,
diag=FALSE,
...)
{
if (system.file(package = "igraph") == "")
stop("Plotting requires package 'igraph'.")
m <- if (inherits(x, "TermDocumentMatrix")) t(x) else x
m <- as.matrix(m[, terms])
c <- cor(m)
c[c < corThreshold] <- 0
c[is.na(c)] <- 0
diag(c) <- 0
tmgraph <- graph.adjacency(c, mode=c("undirected"), weighted=TRUE, diag=FALSE,
add.colnames=NULL, add.rownames=NA)
plot(tmgraph)
invisible(tmgraph)
}
example usage:
graphplot(dtm,term = freq.terms,corThreshold = 0.5)
dtm- document term matrix
terms- similar to plot.tm
threshold-similar with plot.pm
for saving/export:
agraph <- graphplot(dtm,term = freq.terms,corThreshold = 0.3)
write.graph(agraph,"e:/agraph.graphml", format=c("graphml"))
Wednesday, April 20, 2016
Topic model
dtm
library(topicmodels)
lda <- LDA(dtm, k = 10) # find 8 topics
(term <- terms(lda, 10)) # first 6 terms of every topic
dtm <- dtm[, !colnames(dtm) %in% c("si", "in", "dl", "dupa", "face",
"arata")]
dtm <- dtm[, colnames(dtm) %in% c("cred", "spus", "asa", "poate", "doua")
dim(dtm)
summary(col_sums(dtm))
summary(col_sums(dtm))
term_tfidf <-
tapply(dtm$v/row_sums(dtm)[dtm$i], dtm$j, mean) *
log2(nDocs(dtm)/col_sums(dtm > 0))
summary(term_tfidf)
dtm <- dtm[,term_tfidf >= 0.1]
dtm <- dtm[row_sums(dtm) > 0,]
summary(col_sums(dtm))
dim(dtm)
k <- 30
SEED <- 2010
jss_TM <-
list(VEM = LDA(dtm, k = k, control = list(seed = SEED)),
VEM_fixed = LDA(dtm, k = k,
control = list(estimate.alpha = FALSE, seed = SEED)),
Gibbs = LDA(dtm, k = k, method = "Gibbs",
control = list(seed = SEED, burnin = 1000,
thin = 100, iter = 1000)),
CTM = CTM(dtm, k = k,
control = list(seed = SEED,
var = list(tol = 10^-4), em = list(tol = 10^-3))))
Topic <- topics(jss_TM[["Gibbs"]], 10)
Terms <- terms(jss_TM[["Gibbs"]], 10)
Terms[,1:10]
ctm <- CTM(dtm, 10, method = "VEM", control = NULL, model = NULL)
g <- build_graph(ctm, 0.9, and = TRUE)
library(topicmodels)
lda <- LDA(dtm, k = 10) # find 8 topics
(term <- terms(lda, 10)) # first 6 terms of every topic
dtm <- dtm[, !colnames(dtm) %in% c("si", "in", "dl", "dupa", "face",
"arata")]
dtm <- dtm[, colnames(dtm) %in% c("cred", "spus", "asa", "poate", "doua")
dim(dtm)
summary(col_sums(dtm))
summary(col_sums(dtm))
term_tfidf <-
tapply(dtm$v/row_sums(dtm)[dtm$i], dtm$j, mean) *
log2(nDocs(dtm)/col_sums(dtm > 0))
summary(term_tfidf)
dtm <- dtm[,term_tfidf >= 0.1]
dtm <- dtm[row_sums(dtm) > 0,]
summary(col_sums(dtm))
dim(dtm)
k <- 30
SEED <- 2010
jss_TM <-
list(VEM = LDA(dtm, k = k, control = list(seed = SEED)),
VEM_fixed = LDA(dtm, k = k,
control = list(estimate.alpha = FALSE, seed = SEED)),
Gibbs = LDA(dtm, k = k, method = "Gibbs",
control = list(seed = SEED, burnin = 1000,
thin = 100, iter = 1000)),
CTM = CTM(dtm, k = k,
control = list(seed = SEED,
var = list(tol = 10^-4), em = list(tol = 10^-3))))
Topic <- topics(jss_TM[["Gibbs"]], 10)
Terms <- terms(jss_TM[["Gibbs"]], 10)
Terms[,1:10]
ctm <- CTM(dtm, 10, method = "VEM", control = NULL, model = NULL)
g <- build_graph(ctm, 0.9, and = TRUE)
Metoda inspirata din DataminingwithR
# Prefer fixed to scientific notation
options(scipen=5)
# Print numbers with two significant digits
options(digits=2)
options(R2HTML.format.digits=2)
# Set a nice color palette for plots
lattice.options(default.theme=latticeExtra::custom.theme(symbol=RColorBrewer::brewer.pal(8,
"Set1")[c(2:1, 3:5, 7:9)], fill=RColorBrewer::brewer.pal(8, "Set1")[c(2:1,
3:5, 7:9)], region=RColorBrewer::brewer.pal(n=11, name="Spectral")))
rm(corpus, corpusVars, dtm, lengths)
library(RODBC)
channel <-
odbcConnectExcel("C:/Users/cristian.chirita/Documents/newQdap/DATA/RAW_DATA/PAPER_ARTIFACTS/MMeditoriale.xls")
corpusDataset <- sqlQuery(channel=channel, "select * from [Sheet1$]")
odbcCloseAll()
corpus <- Corpus(DataframeSource(corpusDataset["Discurs"]),
readerControl=list(language="ro"))
corpusVars <- corpusDataset[!names(corpusDataset) == "Discurs"]
activeDataSet("corpusVars")
setCorpusVariables()
dtmCorpus <- corpus
dtmCorpus <- tm_map(dtmCorpus, content_transformer(tolower))
dtmCorpus <- tm_map(dtmCorpus, content_transformer(function(x)
gsub("(['?\n<U+202F><U+2009>]|[[:punct:]]|[[:space:]]|[[:cntrl:]])+", " ",
x)))
dtmCorpus <- tm_map(dtmCorpus, removeNumbers)
dtm <- DocumentTermMatrix(dtmCorpus, control=list(tolower=FALSE,
wordLengths=c(2, Inf)))
rm(dtmCorpus)
dictionary <- data.frame(row.names=colnames(dtm),
"Occurrences"=col_sums(dtm), "Stopword"=ifelse(colnames(dtm) %in%
stopwords("ro"), "Stopword", ""), stringsAsFactors=FALSE)
dtm <- dtm[, !colnames(dtm) %in% stopwords("ro")]
attr(dtm, "dictionary") <- dictionary
rm(dictionary)
meta(corpus, type="corpus", tag="language") <- attr(dtm, "language") <- "ro"
meta(corpus, type="corpus", tag="processing") <- attr(dtm, "processing") <-
c(lowercase=TRUE, punctuation=TRUE, digits=TRUE, stopwords=TRUE,
stemming=FALSE, customStemming=FALSE, twitter=FALSE, removeHashtags=NA,
removeNames=NA)
corpus
dtm
library(qdap)
mytdm <- as.Corpus(dtm)
my_tdm <- TermDocumentMatrix(mytdm)
inspect(my_tdm)
(freq.terms <- findFreqTerms(my_tdm, lowfreq = 40))
term.freq <- rowSums(as.matrix(my_tdm))
term.freq <- subset(term.freq, term.freq >= 25)
df <- data.frame(term = names(term.freq), freq = term.freq)
library(ggplot2)
ggplot(df, aes(x = term, y = freq)) + geom_bar(stat = "identity") +
xlab("Terms") + ylab("Count") + coord_flip()
findAssocs(my_tdm, "munteanu", 0.2)
findAssocs(my_tdm, "pnl", 0.6)
findAssocs(my_tdm, "ciucu", 0.6)
library(graph)
library(Rgraphviz)
plot(my_tdm, term = freq.terms, corThreshold = 0.1, weighting = T)
tdm.s = removeSparseTerms(my_tdm, sparse=0.7)
# we'll need the TDM as a matrix
m = as.matrix(tdm.s)
# datavis time
# convert matri to data frame
m.df = data.frame(m)
# quick hack to make keywords - which got stuck in row.names - into a variable
m.df$keywords = rownames(m.df)
library(reshape)
# "melt" the data frame ; ?melt at R console for info
m.df.melted = melt(m.df)
?
# not necessary, but I like decent column names
colnames(m.df.melted) = c("Keyword","Post","Freq")
# generate the heatmap
library(ggthemes)
hm = ggplot(m.df.melted, aes(x=Post, y=Keyword)) +
geom_tile(aes(fill=Freq), colour="white") +
scale_fill_gradient(low="black", high="darkorange") +
labs(title="Cuvinte cheie folosite in articolele cu Marian Munteanu") +
theme_few() +
theme(axis.text.x = element_text(size=6))
ggsave(plot=hm,filename="e:/mm_munteanudocs-hm.png",width=11,height=8.5)
# better? way to view frequencies
# sum rows of the tdm to get term freq count
tf = rowSums(as.matrix(my_tdm))
# we don't want all the words, so choose ones with 10+ freq
tf.10 = subset(tf, tf>=40)
#nu merge :)
# wimping out and using qplot so I don't have to make another data frame
#bf = qplot(names(tf.10), value(tf.10), geom="bar")
#ggsave(plot=bf,filename="e:/marianmunteanufreq-bars.png",width=8.5,height=11)
termDocMatrix <- as.matrix(my_tdm)
termDocMatrix[5:10,1:20]
# change it to a Boolean matrix
m[5:10,1:20]
termDocMatrix <- m
termDocMatrix[termDocMatrix>=1] <- 1
# transform into a term-term adjacency matrix
termMatrix <- termDocMatrix %*% t(termDocMatrix)
termMatrix[5:10,5:10]
library(igraph)
# build a graph from the above matrix
g <- graph.adjacency(termMatrix, weighted=TRUE, mode = "undirected")
# remove loops
g <- simplify(g)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
# set seed to make the layout reproducible
set.seed(3952)
layout1 <- layout.fruchterman.reingold(g)
plot(g, layout=layout1)
write.graph(g,"e:/marianmunteanuart.graphml", format=c("graphml"))
word.freq <- sort(rowSums(m), decreasing = T)
# colors
pal <- brewer.pal(9, "BuGn")
pal <- pal[-(1:4)]
library(wordcloud)
wordcloud(words = names(word.freq), freq = word.freq, min.freq = 10,
random.order = F, colors = pal)
tdm2 <- removeSparseTerms(my_tdm, sparse = 0.95)
tdm2 <- tdm.s
m2 <- as.matrix(tdm2)
# cluster terms
distMatrix <- dist(scale(m2))
fit <- hclust(distMatrix, method = "ward")
plot(fit)
rect.hclust(fit, k = 10) # cut tree into 6 clusters
m3 <- t(m2) # transpose the matrix to cluster documents
set.seed(122) # set a fixed random seed
k <- 10 # number of clusters
kmeansResult <- kmeans(m3, k)
round(kmeansResult$centers, digits = 3)
for (i in 1:k) {
cat(paste("cluster ", i, ": ", sep = ""))
s <- sort(kmeansResult$centers[i, ], decreasing = T)
cat(names(s)[1:5], "\n")
# print the keywords of every cluster
# print(keywords[which(kmeansResult$cluster==i)])
}
===============
https://rdatamining.wordpress.com/2012/05/17/an-example-of-social-network-analysis-with-r-using-package-igraph/
Pt plot tm vezi:
https://github.com/rforge/tm/blob/master/pkg/R/plot.R
======
http://www.leydesdorff.net/words/
======
http://www.r-bloggers.com/visualizing-risky-words-part-2/
Monday, April 18, 2016
Analiza discurs articolecotidianul Adevarul pe tema Marian Munteanu.
Wednesday, April 13, 2016
Acelasi Corpus cu Qdap
http://trinker.github.io/qdap/vignettes/qdap_vignette.html#import_export
require(qdap)
qdap_dat<- as.data.frame(corpus)
qview(qdap_dat)
(tm_dat)
sentSplit
(qdap_dat, "state", stem = TRUE)
bag_o_words(qdap_dat$text)
by(qdap_dat$text, bag_o_words)
head(corpus)
out <- ngrams(qdap_dat$text, 2)
lapply(out[["all_n"]], function(x) sapply(x, paste, collapse = " "))
DATA$state <- qdap_dat$text
rm_stopwords(qdap_dat$text, Top200Words)
ms <- c(" Siria ", "Rusia" ,"teroristi")
et <- c(" Turcia", " americani")
word_associate(qdap_dat$text, qdap_dat$docs, match.string = ms, wordcloud = TRUE, proportional = TRUE, network.plot = TRUE, nw.label.proportional = TRUE, extra.terms = et,cloud.legend =c("A", "B", "C"), title.color = "blue", cloud.colors = c("red", "purple", "gray70"))
DATA$state <- qdap_dat$text
rm_stopwords(qdap_dat$text, Top200Words)
ms <- c(" Siria ", "Rusia" ,"teroristi")
et <- c(" Turcia", " americani")
word_associate(qdap_dat$text, qdap_dat$docs, match.string = ms, wordcloud = TRUE, proportional = TRUE, network.plot = TRUE, nw.label.proportional = TRUE, extra.terms = et,cloud.legend =c("A", "B", "C"), title.color = "blue", cloud.colors = c("red", "purple", "gray70"))
term_match(text.var = qdap_dat$text, terms = qcv(siria, sirieni), return.list = FALSE)
term_match(qdap_dat$text, "america", FALSE)
with(qdap_dat, Dissimilarity(text, docs, method = "minkowski"))
dat <- qdap_dat[qdap_dat$text %in% qcv(Siria, Rusia),]
dat
(div.mod <- with(qdap_dat, diversity(docs, text)))
plot(div.mod, low = "yellow", grid = FALSE, values = TRUE)
words <- c("Siria", "rusia", "america", "turcia", "romania")
with(qdap_dat, word_cor(qdap_dat$text, x, words, r = .5))
x <- factor(with(qdap_dat, paste(text, pad(TOT(tot)), sep = "|")))
word_network_plot(text.var=qdap_dat$text, stopwords=NULL, label.cex = .95)
require(qdap)
qdap_dat<- as.data.frame(corpus)
qview(qdap_dat)
(tm_dat)
sentSplit
(qdap_dat, "state", stem = TRUE)
bag_o_words(qdap_dat$text)
by(qdap_dat$text, bag_o_words)
head(corpus)
out <- ngrams(qdap_dat$text, 2)
lapply(out[["all_n"]], function(x) sapply(x, paste, collapse = " "))
DATA$state <- qdap_dat$text
rm_stopwords(qdap_dat$text, Top200Words)
ms <- c(" Siria ", "Rusia" ,"teroristi")
et <- c(" Turcia", " americani")
word_associate(qdap_dat$text, qdap_dat$docs, match.string = ms, wordcloud = TRUE, proportional = TRUE, network.plot = TRUE, nw.label.proportional = TRUE, extra.terms = et,cloud.legend =c("A", "B", "C"), title.color = "blue", cloud.colors = c("red", "purple", "gray70"))
DATA$state <- qdap_dat$text
rm_stopwords(qdap_dat$text, Top200Words)
ms <- c(" Siria ", "Rusia" ,"teroristi")
et <- c(" Turcia", " americani")
word_associate(qdap_dat$text, qdap_dat$docs, match.string = ms, wordcloud = TRUE, proportional = TRUE, network.plot = TRUE, nw.label.proportional = TRUE, extra.terms = et,cloud.legend =c("A", "B", "C"), title.color = "blue", cloud.colors = c("red", "purple", "gray70"))
term_match(text.var = qdap_dat$text, terms = qcv(siria, sirieni), return.list = FALSE)
term_match(qdap_dat$text, "america", FALSE)
with(qdap_dat, Dissimilarity(text, docs, method = "minkowski"))
dat <- qdap_dat[qdap_dat$text %in% qcv(Siria, Rusia),]
dat
(div.mod <- with(qdap_dat, diversity(docs, text)))
plot(div.mod, low = "yellow", grid = FALSE, values = TRUE)
words <- c("Siria", "rusia", "america", "turcia", "romania")
with(qdap_dat, word_cor(qdap_dat$text, x, words, r = .5))
x <- factor(with(qdap_dat, paste(text, pad(TOT(tot)), sep = "|")))
word_network_plot(text.var=qdap_dat$text, stopwords=NULL, label.cex = .95)
Bibliografie R
http://stackoverflow.com/questions/16880411/r-tm-big-data-based-on-a-termdocumentmatrix-how-to-set-term-freq-bound-to-ext
Text mining Willian Shakespere:
http://www.r-bloggers.com/text-mining-the-complete-works-of-william-shakespeare/
http://www.r-bloggers.com/an-example-of-social-network-analysis-with-r-using-package-igraph/
Disimilarities Cosine
http://stats.stackexchange.com/questions/78321/term-frequency-inverse-document-frequency-tf-idf-weighting
library(proxy)
cosine_dist_mat <- as.matrix(dist(t(myDtm), method = "cosine"))
docsdissim <- dist(as.matrix(termDocMatrix), method = "cosine")
n <- as.matrix(docsdissim)
v <- sort(rowSums(n), decreasing=TRUE)
myNames <- names(v)
d <- data.frame(word=myNames, freq=v)
wordcloud(d$word, d$freq, min.freq=33)
termDocMatrix <- n
termDocMatrix[termDocMatrix>=1] <- 1
termDocMatrix[5:10,1:20]
termMatrix <- termDocMatrix %*% t(termDocMatrix)
# inspect terms numbered 5 to 10
termMatrix[5:10,5:10]
library(igraph)
# build a graph from the above matrix
g <- graph.adjacency(termMatrix, weighted=T, mode = "undirected")
# remove loops
g <- simplify(g)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
# set seed to make the layout reproducible
set.seed(3952)
layout1 <- layout.fruchterman.reingold(g)
plot(g, layout=layout1)
write.graph(g, "e:/siriacosinedisimilarity.graphml", format=c( "graphml"))
Cosine 1 =>mai degraba disimilaritate :)
termDocMatrix <- n
#termDocMatrix[termDocMatrix>=1] <- 1
termDocMatrix[termDocMatrix<1] <- 0
termDocMatrix[5:10,1:20]
termMatrix <- termDocMatrix %*% t(termDocMatrix)
# inspect terms numbered 5 to 10
termMatrix[5:10,5:10]
library(igraph)
# build a graph from the above matrix
g <- graph.adjacency(termMatrix, weighted=T, mode = "undirected")
# remove loops
g <- simplify(g)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
# set seed to make the layout reproducible
set.seed(3952)
layout1 <- layout.fruchterman.reingold(g)
plot(g, layout=layout1)
write.graph(g, "e:/siriacosinedisimilaritynonbollean.graphml", format=c( "graphml"))
Text mining Willian Shakespere:
http://www.r-bloggers.com/text-mining-the-complete-works-of-william-shakespeare/
http://www.r-bloggers.com/an-example-of-social-network-analysis-with-r-using-package-igraph/
Disimilarities Cosine
http://stats.stackexchange.com/questions/78321/term-frequency-inverse-document-frequency-tf-idf-weighting
library(proxy)
cosine_dist_mat <- as.matrix(dist(t(myDtm), method = "cosine"))
docsdissim <- dist(as.matrix(termDocMatrix), method = "cosine")
n <- as.matrix(docsdissim)
v <- sort(rowSums(n), decreasing=TRUE)
myNames <- names(v)
d <- data.frame(word=myNames, freq=v)
wordcloud(d$word, d$freq, min.freq=33)
termDocMatrix <- n
termDocMatrix[termDocMatrix>=1] <- 1
termDocMatrix[5:10,1:20]
termMatrix <- termDocMatrix %*% t(termDocMatrix)
# inspect terms numbered 5 to 10
termMatrix[5:10,5:10]
library(igraph)
# build a graph from the above matrix
g <- graph.adjacency(termMatrix, weighted=T, mode = "undirected")
# remove loops
g <- simplify(g)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
# set seed to make the layout reproducible
set.seed(3952)
layout1 <- layout.fruchterman.reingold(g)
plot(g, layout=layout1)
write.graph(g, "e:/siriacosinedisimilarity.graphml", format=c( "graphml"))
Cosine 1 =>mai degraba disimilaritate :)

termDocMatrix <- n
#termDocMatrix[termDocMatrix>=1] <- 1
termDocMatrix[termDocMatrix<1] <- 0
termDocMatrix[5:10,1:20]
termMatrix <- termDocMatrix %*% t(termDocMatrix)
# inspect terms numbered 5 to 10
termMatrix[5:10,5:10]
library(igraph)
# build a graph from the above matrix
g <- graph.adjacency(termMatrix, weighted=T, mode = "undirected")
# remove loops
g <- simplify(g)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
# set seed to make the layout reproducible
set.seed(3952)
layout1 <- layout.fruchterman.reingold(g)
plot(g, layout=layout1)
write.graph(g, "e:/siriacosinedisimilaritynonbollean.graphml", format=c( "graphml"))
Tuesday, April 12, 2016
Modificari corpus
Prin eliminarea de tip Sparse:
dtm <- dtm[, !colnames(dtm) %in% c("in", "asa", "cand", "poate",
+ "comentariu", "neaprobat", "dupa", "face", "cat", "fara", "ie", "pare",
+ "pana", "ani", "refugiati", "tara", "nato", "spune", "ru", "astia", "vrea",
+ "mi", "toti", "decat", "fata", "isi", "pt", "adica", "intr", "urma", "dup",
+ "niste", "ori", "inca", "spus", "tine", "as", "vezi", "articol", "doi",
+ "imi", "ni", "ta", "ul", "tii", "at", "ati", "iti", "aproape", "cate",
+ "cazul", "dvs", "erau", "il", "ilor", "pina", "ptr", "catre", "doua", "etc",
+ "iei", "trei", "parca", "pun", "caci", "cind", "for")]
In principiu am matricea este redusa prin:
m<-as.matrix(removeSparseTerms(myDtm, .99))
In rest ramane la fel :)
dict <- termsDictionary(dtm, "occurrences")
attr(dict, "title") <- "Terms dictionary sorted by number of occurrences"
dict
dissDtm2 <- rollup(dtm, 1, meta(corpus, "like.uri"))
diss <- corpusDissimilarity(dtm, dissDtm2)
rm(dissDtm2)
attr(diss, "title") <- "Documents by like.uri dissimilarity table"
diss
absVarFreqs <- table(meta(corpus, "like.uri"), dnn="like.uri")
varFreqs <- prop.table(absVarFreqs) * 100
barchart(varFreqs, xlab="% of documents",
main="Distribution of documents by like.uri", auto.key=TRUE)
varFreqs <- addmargins(varFreqs)
attr(varFreqs, "title") <- "Distribution of documents by like.uri (%)"
varFreqs
myDtm <- TermDocumentMatrix(corpus, control = list(minWordLength = 1,stopwords = TRUE))
inspect(myDtm[266:270,31:40])
m <- as.matrix(myDtm)
m<-as.matrix(removeSparseTerms(myDtm, .99))
v <- sort(rowSums(m), decreasing=TRUE)
myNames <- names(v)
d <- data.frame(word=myNames, freq=v)
wordcloud(d$word, d$freq, min.freq=33)
termDocMatrix <- as.matrix(myDtm)
termDocMatrix <- m
termDocMatrix[termDocMatrix>=1] <- 1
termDocMatrix[5:10,1:20]
termMatrix <- termDocMatrix %*% t(termDocMatrix)
# inspect terms numbered 5 to 10
termMatrix[5:10,5:10]
library(igraph)
# build a graph from the above matrix
g <- graph.adjacency(termMatrix, weighted=T, mode = "undirected")
# remove loops
g <- simplify(g)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
# set seed to make the layout reproducible
set.seed(3952)
layout1 <- layout.fruchterman.reingold(g)
plot(g, layout=layout1)
write.graph(g, "e:/siriasparsity099.graphml", format=c( "graphml"))
Un grafic al co-aparitiei destul de inteligibil.
dtm <- dtm[, !colnames(dtm) %in% c("in", "asa", "cand", "poate",
+ "comentariu", "neaprobat", "dupa", "face", "cat", "fara", "ie", "pare",
+ "pana", "ani", "refugiati", "tara", "nato", "spune", "ru", "astia", "vrea",
+ "mi", "toti", "decat", "fata", "isi", "pt", "adica", "intr", "urma", "dup",
+ "niste", "ori", "inca", "spus", "tine", "as", "vezi", "articol", "doi",
+ "imi", "ni", "ta", "ul", "tii", "at", "ati", "iti", "aproape", "cate",
+ "cazul", "dvs", "erau", "il", "ilor", "pina", "ptr", "catre", "doua", "etc",
+ "iei", "trei", "parca", "pun", "caci", "cind", "for")]
In principiu am matricea este redusa prin:
m<-as.matrix(removeSparseTerms(myDtm, .99))
In rest ramane la fel :)
dict <- termsDictionary(dtm, "occurrences")
attr(dict, "title") <- "Terms dictionary sorted by number of occurrences"
dict
dissDtm2 <- rollup(dtm, 1, meta(corpus, "like.uri"))
diss <- corpusDissimilarity(dtm, dissDtm2)
rm(dissDtm2)
attr(diss, "title") <- "Documents by like.uri dissimilarity table"
diss
absVarFreqs <- table(meta(corpus, "like.uri"), dnn="like.uri")
varFreqs <- prop.table(absVarFreqs) * 100
barchart(varFreqs, xlab="% of documents",
main="Distribution of documents by like.uri", auto.key=TRUE)
varFreqs <- addmargins(varFreqs)
attr(varFreqs, "title") <- "Distribution of documents by like.uri (%)"
varFreqs
myDtm <- TermDocumentMatrix(corpus, control = list(minWordLength = 1,stopwords = TRUE))
inspect(myDtm[266:270,31:40])
m <- as.matrix(myDtm)
m<-as.matrix(removeSparseTerms(myDtm, .99))
v <- sort(rowSums(m), decreasing=TRUE)
myNames <- names(v)
d <- data.frame(word=myNames, freq=v)
wordcloud(d$word, d$freq, min.freq=33)
termDocMatrix <- as.matrix(myDtm)
termDocMatrix <- m
termDocMatrix[termDocMatrix>=1] <- 1
termDocMatrix[5:10,1:20]
termMatrix <- termDocMatrix %*% t(termDocMatrix)
# inspect terms numbered 5 to 10
termMatrix[5:10,5:10]
library(igraph)
# build a graph from the above matrix
g <- graph.adjacency(termMatrix, weighted=T, mode = "undirected")
# remove loops
g <- simplify(g)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
# set seed to make the layout reproducible
set.seed(3952)
layout1 <- layout.fruchterman.reingold(g)
plot(g, layout=layout1)
write.graph(g, "e:/siriasparsity099.graphml", format=c( "graphml"))
Friday, April 8, 2016
Coocurences graph negative comments only
# Prefer fixed to scientific notation
options(scipen=5)
# Print numbers with two significant digits
options(digits=2)
options(R2HTML.format.digits=2)
# Set a nice color palette for plots
lattice.options(default.theme=latticeExtra::custom.theme(symbol=RColorBrewer::brewer.pal(8,
"Set1")[c(2:1, 3:5, 7:9)], fill=RColorBrewer::brewer.pal(8, "Set1")[c(2:1,
3:5, 7:9)], region=RColorBrewer::brewer.pal(n=11, name="Spectral")))
rm(lengths)
library(RODBC)
channel <-
odbcConnectExcel2007("C:/Users/CCCC/Documents/DateKnime/adevarulCOMENTARII_Siriamartie21.xlsx")
corpusDataset <- sqlQuery(channel=channel, "select * from [Sheet1$]")
odbcCloseAll()
corpus <- Corpus(DataframeSource(corpusDataset["TextComentariu"]),
readerControl=list(language="ro"))
corpusVars <- corpusDataset[!names(corpusDataset) == "TextComentariu"]
corpusVars <- data.frame(var1=factor(rep("", length(corpus))),
row.names=names(corpus))
activeDataSet("corpusVars")
setCorpusVariables()
dtmCorpus <- corpus
dtmCorpus <- tm_map(dtmCorpus, content_transformer(tolower))
dtmCorpus <- tm_map(dtmCorpus, content_transformer(function(x)
gsub("(['?\n<U+202F><U+2009>]|[[:punct:]]|[[:space:]]|[[:cntrl:]])+", " ",
x)))
dtmCorpus <- tm_map(dtmCorpus, removeNumbers)
dtm <- DocumentTermMatrix(dtmCorpus, control=list(tolower=FALSE,
wordLengths=c(2, Inf)))
rm(dtmCorpus)
dictionary <- data.frame(row.names=colnames(dtm),
"Occurrences"=col_sums(dtm), "Stopword"=ifelse(colnames(dtm) %in%
stopwords("ro"), "Stopword", ""), stringsAsFactors=FALSE)
dtm <- dtm[, !colnames(dtm) %in% stopwords("ro")]
attr(dtm, "dictionary") <- dictionary
rm(dictionary)
meta(corpus, type="corpus", tag="language") <- attr(dtm, "language") <- "ro"
meta(corpus, type="corpus", tag="processing") <- attr(dtm, "processing") <-
c(lowercase=TRUE, punctuation=TRUE, digits=TRUE, stopwords=TRUE,
stemming=FALSE, customStemming=FALSE, twitter=FALSE, removeHashtags=NA,
removeNames=NA)
corpus
dtm
corpus <- tm_map(corpus, removeWords, "romanian")
stopwords(kind = "ro")
myDtm <- TermDocumentMatrix(corpus, control = list(minWordLength = 1,stopwords = TRUE))
inspect(myDtm[266:270,31:40])
library(wordcloud)
m <- as.matrix(myDtm)
v <- sort(rowSums(m), decreasing=TRUE)
myNames <- names(v)
d <- data.frame(word=myNames, freq=v)
wordcloud(d$word, d$freq, min.freq=33)
termDocMatrix <- as.matrix(myDtm)
termDocMatrix[termDocMatrix>=1] <- 1
termDocMatrix[5:10,1:20]
termMatrix <- termDocMatrix %*% t(termDocMatrix)
# inspect terms numbered 5 to 10
termMatrix[5:10,5:10]
write.table(termDocMatrix, file = "e:/matrix", sep = " ")
rm(corpus, corpusVars, dtm, lengths)
rm(lengths)
library(RODBC)
channel <-
odbcConnectExcel2007("C:/Users/cristian.chirita/Documents/DateKnime/adevarulCOMENTARII_Siriamartie21.xlsx")
corpusDataset <- sqlQuery(channel=channel, "select * from [Sheet1$]")
odbcCloseAll()
corpus <- Corpus(DataframeSource(corpusDataset["TextComentariu"]),
readerControl=list(language="ro"))
corpusVars <- corpusDataset[!names(corpusDataset) == "TextComentariu"]
corpusVars <- corpusVars[c("nume articol", "rankcomentator", "Autor",
"like-uri")]
activeDataSet("corpusVars")
names(corpusVars) <- make.names(names(corpusVars))
setCorpusVariables()
dtmCorpus <- corpus
dtmCorpus <- tm_map(dtmCorpus, content_transformer(tolower))
dtmCorpus <- tm_map(dtmCorpus, content_transformer(function(x)
gsub("(['?\n<U+202F><U+2009>]|[[:punct:]]|[[:space:]]|[[:cntrl:]])+", " ",
x)))
dtmCorpus <- tm_map(dtmCorpus, removeNumbers)
dtm <- DocumentTermMatrix(dtmCorpus, control=list(tolower=FALSE,
wordLengths=c(2, Inf)))
rm(dtmCorpus)
dictionary <- data.frame(row.names=colnames(dtm),
"Occurrences"=col_sums(dtm), "Stopword"=ifelse(colnames(dtm) %in%
stopwords("ro"), "Stopword", ""), stringsAsFactors=FALSE)
dtm <- dtm[, !colnames(dtm) %in% stopwords("ro")]
attr(dtm, "dictionary") <- dictionary
rm(dictionary)
meta(corpus, type="corpus", tag="language") <- attr(dtm, "language") <- "ro"
meta(corpus, type="corpus", tag="processing") <- attr(dtm, "processing") <-
c(lowercase=TRUE, punctuation=TRUE, digits=TRUE, stopwords=TRUE,
stemming=FALSE, customStemming=FALSE, twitter=FALSE, removeHashtags=NA,
removeNames=NA)
corpus
dtm
keep <- meta(corpus, "like.uri")[[1]] %in% c("-2 (6 voturi)",
"-4 (6 voturi)", "-4 (4 voturi)", "-1 (5 voturi)", "-1 (7 voturi)",
"-1 (1 voturi)", "-2 (4 voturi)", "-6 (8 voturi)", "-4 (8 voturi)",
"-3 (3 voturi)", "-5 (13 voturi)", "-6 (14 voturi)", "-4 (12 voturi)",
"-3 (11 voturi)", "-2 (2 voturi)", "-3 (15 voturi)", "-5 (15 voturi)",
"-1 (11 voturi)", "-5 (11 voturi)", "-3 (7 voturi)", "-2 (18 voturi)",
"-1 (3 voturi)", "-3 (9 voturi)", "-3 (5 voturi)", "0 (16 voturi)",
"-6 (10 voturi)", "-6 (18 voturi)", "-10 (18 voturi)", "-14 (16 voturi)",
"-11 (15 voturi)", "-12 (14 voturi)", "-2 (14 voturi)", "-4 (10 voturi)",
"-1 (9 voturi)", "-5 (21 voturi)", "-4 (14 voturi)", "-1 (13 voturi)",
"-2 (10 voturi)", "-5 (7 voturi)", "-5 (5 voturi)", "-6 (6 voturi)",
"-7 (25 voturi)", "-13 (27 voturi)", "-2 (16 voturi)", "-5 (17 voturi)",
"-7 (9 voturi)", "-7 (7 voturi)", "-12 (52 voturi)", "-1 (29 voturi)",
"-5 (31 voturi)", "-5 (41 voturi)", "-1 (35 voturi)", "-2 (34 voturi)",
"-5 (37 voturi)", "-6 (28 voturi)", "-6 (36 voturi)", "-9 (37 voturi)",
"-8 (24 voturi)", "-2 (8 voturi)", "-5 (9 voturi)", "-7 (11 voturi)",
"-8 (8 voturi)", "-2 (12 voturi)", "-8 (36 voturi)", "-3 (29 voturi)",
"-8 (28 voturi)", "-1 (15 voturi)", "-13 (23 voturi)", "-12 (18 voturi)",
"-10 (14 voturi)", "-10 (12 voturi)", "-6 (12 voturi)", "-8 (18 voturi)",
"-6 (44 voturi)", "-17 (57 voturi)", "-11 (57 voturi)", "-3 (21 voturi)",
"-4 (20 voturi)", "-7 (19 voturi)", "-8 (14 voturi)", "-3 (19 voturi)",
"-4 (36 voturi)", "-8 (26 voturi)", "-1 (17 voturi)", "-6 (16 voturi)",
"-8 (10 voturi)", "-9 (13 voturi)", "-9 (9 voturi)", "-4 (22 voturi)",
"-2 (20 voturi)", "-2 (22 voturi)", "-6 (24 voturi)", "-7 (15 voturi)",
"-11 (25 voturi)", "-7 (13 voturi)", "-5 (19 voturi)", "-4 (16 voturi)",
"-15 (29 voturi)", "-3 (17 voturi)", "-8 (12 voturi)", "-1 (21 voturi)",
"-5 (23 voturi)", "-1 (25 voturi)", "-16 (20 voturi)", "-9 (11 voturi)",
"-16 (16 voturi)", "-3 (13 voturi)", "-5 (29 voturi)", "-10 (20 voturi)",
"-8 (20 voturi)", "-9 (17 voturi)", "-8 (30 voturi)", "-2 (38 voturi)",
"-10 (28 voturi)", "-1 (53 voturi)", "-1 (43 voturi)", "-3 (47 voturi)",
"-4 (40 voturi)", "-8 (34 voturi)", "-7 (49 voturi)", "-4 (26 voturi)",
"-7 (23 voturi)", "-24 (30 voturi)", "-3 (27 voturi)", "-15 (23 voturi)",
"-12 (12 voturi)", "-17 (19 voturi)", "-31 (43 voturi)", "-16 (30 voturi)",
"-17 (27 voturi)", "-14 (26 voturi)")
origCorpus <- corpus
corpus <- corpus[keep]
origDtm <- dtm
dtmAttr <- attributes(dtm)
origDictionary <- attr(dtm, "dictionary")
dtm <- dtm[keep,]
dictionary <- data.frame(row.names=colnames(dtm),
"Occurrences"=col_sums(dtm), "Stopword"=ifelse(colnames(dtm) %in%
stopwords("ro"), "Stopword", ""), stringsAsFactors=FALSE)
dtm <- dtm[, !colnames(dtm) %in% stopwords("ro")]
attr(dtm, "dictionary") <- dictionary
rm(dictionary)
attr(dtm, "language") <- dtmAttr$lang
attr(dtm, "processing") <- dtmAttr$processing
rm(dtmAttr, origDictionary)
corpusVars <- corpusVars[keep,, drop=FALSE]
rm(list=c("keep", "lengths"))
corpus
dtm
library(relimp, pos=23)
showData(corpusVars, placement='-20+200', font=getRcmdr('logFont'),
maxwidth=80, maxheight=30)
editDataset(corpusVars)
editDataset(corpusVars)
####################################################
corpus <- tm_map(corpus, removeWords, "romanian")
stopwords(kind = "ro")
myDtm <- TermDocumentMatrix(corpus, control = list(minWordLength = 1,stopwords = TRUE))
inspect(myDtm[266:270,31:40])
library(wordcloud)
m <- as.matrix(myDtm)
v <- sort(rowSums(m), decreasing=TRUE)
myNames <- names(v)
d <- data.frame(word=myNames, freq=v)
wordcloud(d$word, d$freq, min.freq=33)
termDocMatrix <- as.matrix(myDtm)
termDocMatrix[termDocMatrix>=1] <- 1
termDocMatrix[5:10,1:20]
termMatrix <- termDocMatrix %*% t(termDocMatrix)
# inspect terms numbered 5 to 10
termMatrix[5:10,5:10]
library(igraph)
# build a graph from the above matrix
g <- graph.adjacency(termMatrix, weighted=T, mode = "undirected")
# remove loops
g <- simplify(g)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
# set seed to make the layout reproducible
set.seed(3952)
layout1 <- layout.fruchterman.reingold(g)
plot(g, layout=layout1)
write.graph(g, "e:/coocgraph", format=c( "graphml"))
https://rstudio-pubs-static.s3.amazonaws.com/31867_8236987cf0a8444e962ccd2aec46d9c3.html#clustering-by-term-similarity
Graficul rezultat contine nu este filtrat din punct de vedere gramatical motiv pentru care contine prea mult zgomot.

Wordcloud romana
# Prefer fixed to scientific notation
options(scipen=5)
# Print numbers with two significant digits
options(digits=2)
options(R2HTML.format.digits=2)
# Set a nice color palette for plots
lattice.options(default.theme=latticeExtra::custom.theme(symbol=RColorBrewer::brewer.pal(8,
"Set1")[c(2:1, 3:5, 7:9)], fill=RColorBrewer::brewer.pal(8, "Set1")[c(2:1,
3:5, 7:9)], region=RColorBrewer::brewer.pal(n=11, name="Spectral")))
rm(lengths)
library(RODBC)
channel <-
odbcConnectExcel2007("C:/Users/cc/Documents/DateKnime/adevarulCOMENTARII_Siriamartie21.xlsx")
corpusDataset <- sqlQuery(channel=channel, "select * from [Sheet1$]")
odbcCloseAll()
corpus <- Corpus(DataframeSource(corpusDataset["TextComentariu"]),
readerControl=list(language="ro"))
corpusVars <- corpusDataset[!names(corpusDataset) == "TextComentariu"]
corpusVars <- data.frame(var1=factor(rep("", length(corpus))),
row.names=names(corpus))
activeDataSet("corpusVars")
setCorpusVariables()
dtmCorpus <- corpus
dtmCorpus <- tm_map(dtmCorpus, content_transformer(tolower))
dtmCorpus <- tm_map(dtmCorpus, content_transformer(function(x)
gsub("(['?\n<U+202F><U+2009>]|[[:punct:]]|[[:space:]]|[[:cntrl:]])+", " ",
x)))
dtmCorpus <- tm_map(dtmCorpus, removeNumbers)
dtm <- DocumentTermMatrix(dtmCorpus, control=list(tolower=FALSE,
wordLengths=c(2, Inf)))
rm(dtmCorpus)
dictionary <- data.frame(row.names=colnames(dtm),
"Occurrences"=col_sums(dtm), "Stopword"=ifelse(colnames(dtm) %in%
stopwords("ro"), "Stopword", ""), stringsAsFactors=FALSE)
dtm <- dtm[, !colnames(dtm) %in% stopwords("ro")]
attr(dtm, "dictionary") <- dictionary
rm(dictionary)
meta(corpus, type="corpus", tag="language") <- attr(dtm, "language") <- "ro"
meta(corpus, type="corpus", tag="processing") <- attr(dtm, "processing") <-
c(lowercase=TRUE, punctuation=TRUE, digits=TRUE, stopwords=TRUE,
stemming=FALSE, customStemming=FALSE, twitter=FALSE, removeHashtags=NA,
removeNames=NA)
corpus
dtm
corpus <- tm_map(corpus, removeWords, "romanian")
stopwords(kind = "ro")
myDtm <- TermDocumentMatrix(corpus, control = list(minWordLength = 1,stopwords = TRUE))
inspect(myDtm[266:270,31:40])
library(wordcloud)
m <- as.matrix(myDtm)
v <- sort(rowSums(m), decreasing=TRUE)
myNames <- names(v)
d <- data.frame(word=myNames, freq=v)
wordcloud(d$word, d$freq, min.freq=33)
Thursday, April 7, 2016
Matricea Apritiei Simultane
http://www.rdatamining.com/examples/social-network-analysis
http://www.rdatamining.com/examples/text-mining
# Prefer fixed to scientific notation
options(scipen=5)
# Print numbers with two significant digits
options(digits=2)
options(R2HTML.format.digits=2)
# Set a nice color palette for plots
lattice.options(default.theme=latticeExtra::custom.theme(symbol=RColorBrewer::brewer.pal(8,
"Set1")[c(2:1, 3:5, 7:9)], fill=RColorBrewer::brewer.pal(8, "Set1")[c(2:1,
3:5, 7:9)], region=RColorBrewer::brewer.pal(n=11, name="Spectral")))
rm(lengths)
library(RODBC)
channel <-
odbcConnectExcel("C:/Users/cristian.chirita/Documents/DateKnime/adevarulCOMENTARII_Siriamartie21.xls")
corpusDataset <- sqlQuery(channel=channel, "select * from [Sheet1$]")
odbcCloseAll()
corpus <- Corpus(DataframeSource(corpusDataset["TextComentariu"]),
readerControl=list(language="ro"))
corpusVars <- corpusDataset[!names(corpusDataset) == "TextComentariu"]
activeDataSet("corpusVars")
names(corpusVars) <- make.names(names(corpusVars))
setCorpusVariables()
dtmCorpus <- corpus
dtmCorpus <- tm_map(dtmCorpus, content_transformer(tolower))
dtmCorpus <- tm_map(dtmCorpus, content_transformer(function(x)
gsub("(['?\n<U+202F><U+2009>]|[[:punct:]]|[[:space:]]|[[:cntrl:]])+", " ",
x)))
dtmCorpus <- tm_map(dtmCorpus, removeNumbers)
dtm <- DocumentTermMatrix(dtmCorpus, control=list(tolower=TRUE,
wordLengths=c(2, Inf)))
rm(dtmCorpus)
dictionary <- data.frame(row.names=colnames(dtm),
"Occurrences"=col_sums(dtm), "Stopword"=ifelse(colnames(dtm) %in%
stopwords("ro"), "Stopword", ""), stringsAsFactors=FALSE)
dtm <- dtm[, !colnames(dtm) %in% stopwords("ro")]
attr(dtm, "dictionary") <- dictionary
rm(dictionary)
meta(corpus, type="corpus", tag="language") <- attr(dtm, "language") <- "ro"
meta(corpus, type="corpus", tag="processing") <- attr(dtm, "processing") <-
c(lowercase=TRUE, punctuation=TRUE, digits=TRUE, stopwords=TRUE,
stemming=FALSE, customStemming=FALSE, twitter=FALSE, removeHashtags=NA,
removeNames=NA)
corpus
dtm
myDtm <- TermDocumentMatrix(dtm, control = list(minWordLength = 1))
inspect(myDtm[100:170,10:20])
inspect(dtm[100:170,10:20])
findFreqTerms(dtm, lowfreq=10)
findAssocs(dtm, 'kurzi', 0.30)
library(wordcloud)
m <- as.matrix(dtm)
v <- sort(rowSums(m), decreasing=TRUE)
myNames <- names(v)
k <- which(names(v)=="siria")
myNames[k] <- "Siria"
d <- data.frame(word=myNames, freq=v)
wordcloud(d$word, d$freq, min.freq=3)
myDtm <- TermDocumentMatrix(corpus, control = list(minWordLength = 1))
inspect(myDtm[100:170,10:20])
myCorpus <- tm_map(corpus, tolower)
# remove punctuation
myCorpus <- tm_map(myCorpus, removePunctuation)
# remove numbers
myCorpus <- tm_map(myCorpus, removeNumbers)
# remove stopwords
# keep "r" by removing it from stopwords
myStopwords <- c(stopwords('english'), "available", "via")
idx <- which(myStopwords == "r")
myStopwords <- myStopwords[-idx]
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
myCorpus
myDtm <- TermDocumentMatrix(myCorpus, control = list(minWordLength = 1))
inspect(myDtm[4266:4270,4331:4340])
findFreqTerms(myDtm, lowfreq=20)
findAssocs(myDtm, 'victime', 0.30)
termDocMatrix <- as.matrix(myDtm)
Obs:
Fisier prea mare :)
http://www.rdatamining.com/examples/text-mining
# Prefer fixed to scientific notation
options(scipen=5)
# Print numbers with two significant digits
options(digits=2)
options(R2HTML.format.digits=2)
# Set a nice color palette for plots
lattice.options(default.theme=latticeExtra::custom.theme(symbol=RColorBrewer::brewer.pal(8,
"Set1")[c(2:1, 3:5, 7:9)], fill=RColorBrewer::brewer.pal(8, "Set1")[c(2:1,
3:5, 7:9)], region=RColorBrewer::brewer.pal(n=11, name="Spectral")))
rm(lengths)
library(RODBC)
channel <-
odbcConnectExcel("C:/Users/cristian.chirita/Documents/DateKnime/adevarulCOMENTARII_Siriamartie21.xls")
corpusDataset <- sqlQuery(channel=channel, "select * from [Sheet1$]")
odbcCloseAll()
corpus <- Corpus(DataframeSource(corpusDataset["TextComentariu"]),
readerControl=list(language="ro"))
corpusVars <- corpusDataset[!names(corpusDataset) == "TextComentariu"]
activeDataSet("corpusVars")
names(corpusVars) <- make.names(names(corpusVars))
setCorpusVariables()
dtmCorpus <- corpus
dtmCorpus <- tm_map(dtmCorpus, content_transformer(tolower))
dtmCorpus <- tm_map(dtmCorpus, content_transformer(function(x)
gsub("(['?\n<U+202F><U+2009>]|[[:punct:]]|[[:space:]]|[[:cntrl:]])+", " ",
x)))
dtmCorpus <- tm_map(dtmCorpus, removeNumbers)
dtm <- DocumentTermMatrix(dtmCorpus, control=list(tolower=TRUE,
wordLengths=c(2, Inf)))
rm(dtmCorpus)
dictionary <- data.frame(row.names=colnames(dtm),
"Occurrences"=col_sums(dtm), "Stopword"=ifelse(colnames(dtm) %in%
stopwords("ro"), "Stopword", ""), stringsAsFactors=FALSE)
dtm <- dtm[, !colnames(dtm) %in% stopwords("ro")]
attr(dtm, "dictionary") <- dictionary
rm(dictionary)
meta(corpus, type="corpus", tag="language") <- attr(dtm, "language") <- "ro"
meta(corpus, type="corpus", tag="processing") <- attr(dtm, "processing") <-
c(lowercase=TRUE, punctuation=TRUE, digits=TRUE, stopwords=TRUE,
stemming=FALSE, customStemming=FALSE, twitter=FALSE, removeHashtags=NA,
removeNames=NA)
corpus
dtm
myDtm <- TermDocumentMatrix(dtm, control = list(minWordLength = 1))
inspect(myDtm[100:170,10:20])
inspect(dtm[100:170,10:20])
findFreqTerms(dtm, lowfreq=10)
findAssocs(dtm, 'kurzi', 0.30)
library(wordcloud)
m <- as.matrix(dtm)
v <- sort(rowSums(m), decreasing=TRUE)
myNames <- names(v)
k <- which(names(v)=="siria")
myNames[k] <- "Siria"
d <- data.frame(word=myNames, freq=v)
wordcloud(d$word, d$freq, min.freq=3)
myDtm <- TermDocumentMatrix(corpus, control = list(minWordLength = 1))
inspect(myDtm[100:170,10:20])
myCorpus <- tm_map(corpus, tolower)
# remove punctuation
myCorpus <- tm_map(myCorpus, removePunctuation)
# remove numbers
myCorpus <- tm_map(myCorpus, removeNumbers)
# remove stopwords
# keep "r" by removing it from stopwords
myStopwords <- c(stopwords('english'), "available", "via")
idx <- which(myStopwords == "r")
myStopwords <- myStopwords[-idx]
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
myCorpus
myDtm <- TermDocumentMatrix(myCorpus, control = list(minWordLength = 1))
inspect(myDtm[4266:4270,4331:4340])
findFreqTerms(myDtm, lowfreq=20)
findAssocs(myDtm, 'victime', 0.30)
termDocMatrix <- as.matrix(myDtm)
Obs:
Fisier prea mare :)
Tuesday, April 5, 2016
Resurse Statistice R
http://quantlet.de/index.php?p=info
https://eight2late.wordpress.com/2015/12/02/a-gentle-introduction-to-network-graphs-using-r-and-gephi/
autori<-corpusVars$Autor In loc de filename
autori
filekey <- cbind(rownames(m),autori)
# Prefer fixed to scientific notation
options(scipen=5)
# Print numbers with two significant digits
options(digits=2)
options(R2HTML.format.digits=2)
# Set a nice color palette for plots
lattice.options(default.theme=latticeExtra::custom.theme(symbol=RColorBrewer::brewer.pal(8,
"Set1")[c(2:1, 3:5, 7:9)], fill=RColorBrewer::brewer.pal(8, "Set1")[c(2:1,
3:5, 7:9)], region=RColorBrewer::brewer.pal(n=11, name="Spectral")))
rm(lengths)
library(RODBC)
channel <-
odbcConnectExcel("C:/Users/c.c/Documents/DateKnime/adevarulCOMENTARII_Siriamartie21.xls")
corpusDataset <- sqlQuery(channel=channel, "select * from [Sheet1$]")
odbcCloseAll()
corpus <- Corpus(DataframeSource(corpusDataset["TextComentariu"]),
readerControl=list(language="ro"))
corpusVars <- corpusDataset[!names(corpusDataset) == "TextComentariu"]
activeDataSet("corpusVars")
names(corpusVars) <- make.names(names(corpusVars))
setCorpusVariables()
dtmCorpus <- corpus
dtmCorpus <- tm_map(dtmCorpus, content_transformer(tolower))
dtmCorpus <- tm_map(dtmCorpus, content_transformer(function(x)
gsub("(['?\n<U+202F><U+2009>]|[[:punct:]]|[[:space:]]|[[:cntrl:]])+", " ",
x)))
dtmCorpus <- tm_map(dtmCorpus, removeNumbers)
dtm <- DocumentTermMatrix(dtmCorpus, control=list(tolower=FALSE,
wordLengths=c(2, Inf)))
rm(dtmCorpus)
dictionary <- data.frame(row.names=colnames(dtm),
"Occurrences"=col_sums(dtm), "Stopword"=ifelse(colnames(dtm) %in%
stopwords("ro"), "Stopword", ""), stringsAsFactors=FALSE)
dtm <- dtm[, !colnames(dtm) %in% stopwords("ro")]
attr(dtm, "dictionary") <- dictionary
rm(dictionary)
meta(corpus, type="corpus", tag="language") <- attr(dtm, "language") <- "ro"
meta(corpus, type="corpus", tag="processing") <- attr(dtm, "processing") <-
c(lowercase=TRUE, punctuation=TRUE, digits=TRUE, stopwords=TRUE,
stemming=FALSE, customStemming=FALSE, twitter=FALSE, removeHashtags=NA,
removeNames=NA)
corpus
dtm
m<-as.matrix(dtm)
write.csv(m,file="E:/dtmEight2Late.csv")
filekey <- cbind(rownames(m),filenames)
write.csv(filekey,"E:/filekey.csv")
library(relimp, pos=21)
showData(corpusVars, placement='-20+200', font=getRcmdr('logFont'),
maxwidth=80, maxheight=30)
autori<-corpusVars$Autor
autori
filekey <- cbind(rownames(m),autori)
write.csv(filekey,"e:/filekey.csv")
cosineSim <- function(x){
as.dist(x%*%t(x)/(sqrt(rowSums(x^2) %*% t(rowSums(x^2)))))
}
cs <- cosineSim(m)
write.csv(as.matrix(cs),file="E:/csEight2Late.csv")
cs[cs < max(cs)/2] <- 0
cs <- round(cs,3)
write.csv(as.matrix(cs),file="e:AdjacencyMatrix.csv")
dat=read.csv(file.choose(),header=TRUE,row.names=1,check.names=FALSE)
m1=as.matrix(dat) # coerces the data set as a matrix
g=graph.adjacency(m1,mode="undirected",weighted=NULL) # this will create an 'igraph object'
g
http://www.slideshare.net/rdatamining/text-mining-with-r-an-analysis-of-twitter-data
http://www.n3labs.com/pdf/rank-co-occur.pdf
https://cran.r-project.org/web/packages/cooccur/cooccur.pdf
https://eight2late.wordpress.com/2015/09/29/a-gentle-introduction-to-topic-modeling-using-r/
http://www.rdatamining.com/examples/text-mining
http://www.linguisticdna.org/2015/09/10/proximity-data-ii-co-occurrence-and-distance-measurements/
https://eight2late.wordpress.com/2015/05/27/a-gentle-introduction-to-text-mining-using-r/
http://faculty.washington.edu/jwilker/CAP/R_Sample_Script.R
https://eight2late.wordpress.com/2015/12/02/a-gentle-introduction-to-network-graphs-using-r-and-gephi/
autori<-corpusVars$Autor In loc de filename
autori
filekey <- cbind(rownames(m),autori)
# Prefer fixed to scientific notation
options(scipen=5)
# Print numbers with two significant digits
options(digits=2)
options(R2HTML.format.digits=2)
# Set a nice color palette for plots
lattice.options(default.theme=latticeExtra::custom.theme(symbol=RColorBrewer::brewer.pal(8,
"Set1")[c(2:1, 3:5, 7:9)], fill=RColorBrewer::brewer.pal(8, "Set1")[c(2:1,
3:5, 7:9)], region=RColorBrewer::brewer.pal(n=11, name="Spectral")))
rm(lengths)
library(RODBC)
channel <-
odbcConnectExcel("C:/Users/c.c/Documents/DateKnime/adevarulCOMENTARII_Siriamartie21.xls")
corpusDataset <- sqlQuery(channel=channel, "select * from [Sheet1$]")
odbcCloseAll()
corpus <- Corpus(DataframeSource(corpusDataset["TextComentariu"]),
readerControl=list(language="ro"))
corpusVars <- corpusDataset[!names(corpusDataset) == "TextComentariu"]
activeDataSet("corpusVars")
names(corpusVars) <- make.names(names(corpusVars))
setCorpusVariables()
dtmCorpus <- corpus
dtmCorpus <- tm_map(dtmCorpus, content_transformer(tolower))
dtmCorpus <- tm_map(dtmCorpus, content_transformer(function(x)
gsub("(['?\n<U+202F><U+2009>]|[[:punct:]]|[[:space:]]|[[:cntrl:]])+", " ",
x)))
dtmCorpus <- tm_map(dtmCorpus, removeNumbers)
dtm <- DocumentTermMatrix(dtmCorpus, control=list(tolower=FALSE,
wordLengths=c(2, Inf)))
rm(dtmCorpus)
dictionary <- data.frame(row.names=colnames(dtm),
"Occurrences"=col_sums(dtm), "Stopword"=ifelse(colnames(dtm) %in%
stopwords("ro"), "Stopword", ""), stringsAsFactors=FALSE)
dtm <- dtm[, !colnames(dtm) %in% stopwords("ro")]
attr(dtm, "dictionary") <- dictionary
rm(dictionary)
meta(corpus, type="corpus", tag="language") <- attr(dtm, "language") <- "ro"
meta(corpus, type="corpus", tag="processing") <- attr(dtm, "processing") <-
c(lowercase=TRUE, punctuation=TRUE, digits=TRUE, stopwords=TRUE,
stemming=FALSE, customStemming=FALSE, twitter=FALSE, removeHashtags=NA,
removeNames=NA)
corpus
dtm
m<-as.matrix(dtm)
write.csv(m,file="E:/dtmEight2Late.csv")
filekey <- cbind(rownames(m),filenames)
write.csv(filekey,"E:/filekey.csv")
library(relimp, pos=21)
showData(corpusVars, placement='-20+200', font=getRcmdr('logFont'),
maxwidth=80, maxheight=30)
autori<-corpusVars$Autor
autori
filekey <- cbind(rownames(m),autori)
write.csv(filekey,"e:/filekey.csv")
cosineSim <- function(x){
as.dist(x%*%t(x)/(sqrt(rowSums(x^2) %*% t(rowSums(x^2)))))
}
cs <- cosineSim(m)
write.csv(as.matrix(cs),file="E:/csEight2Late.csv")
cs[cs < max(cs)/2] <- 0
cs <- round(cs,3)
write.csv(as.matrix(cs),file="e:AdjacencyMatrix.csv")
dat=read.csv(file.choose(),header=TRUE,row.names=1,check.names=FALSE)
m1=as.matrix(dat) # coerces the data set as a matrix
g=graph.adjacency(m1,mode="undirected",weighted=NULL) # this will create an 'igraph object'
g
http://www.slideshare.net/rdatamining/text-mining-with-r-an-analysis-of-twitter-data
http://www.n3labs.com/pdf/rank-co-occur.pdf
https://cran.r-project.org/web/packages/cooccur/cooccur.pdf
https://eight2late.wordpress.com/2015/09/29/a-gentle-introduction-to-topic-modeling-using-r/
http://www.rdatamining.com/examples/text-mining
http://www.linguisticdna.org/2015/09/10/proximity-data-ii-co-occurrence-and-distance-measurements/
https://eight2late.wordpress.com/2015/05/27/a-gentle-introduction-to-text-mining-using-r/
http://faculty.washington.edu/jwilker/CAP/R_Sample_Script.R
Subscribe to:
Comments (Atom)

































