Thursday, April 21, 2016

Perceptia Siriei in 5000 de comentarii Adevarul.ro

(freq.terms <- findFreqTerms(dtm, lowfreq = 30))
agraph <- graphplot(dtm,term = freq.terms,corThreshold = 0.1)
write.graph(agraph,"e:/agraphsiria.graphml", format=c("graphml"))

GGplot exemple

http://matthewlincoln.net/2014/12/20/adjacency-matrix-plots-with-r-and-ggplot2.html

Alternate plot function for tm with Igraph support.

graphplot <-
function(x,
         terms = sample(Terms(x), 20),
         corThreshold = 0.7,
         weighted=TRUE,
   diag=FALSE,
         ...)
{
    if (system.file(package = "igraph") == "")
        stop("Plotting requires package 'igraph'.")
    m <- if (inherits(x, "TermDocumentMatrix")) t(x) else x
    m <- as.matrix(m[, terms])
    c <- cor(m)
    c[c < corThreshold] <- 0
    c[is.na(c)] <- 0
    diag(c) <- 0
 tmgraph <- graph.adjacency(c, mode=c("undirected"), weighted=TRUE, diag=FALSE,
        add.colnames=NULL, add.rownames=NA)
        plot(tmgraph)
   
    invisible(tmgraph)
}

example usage:
graphplot(dtm,term = freq.terms,corThreshold = 0.5)
dtm- document term matrix
terms- similar to plot.tm
threshold-similar with plot.pm

for saving/export:
agraph <- graphplot(dtm,term = freq.terms,corThreshold = 0.3)
write.graph(agraph,"e:/agraph.graphml", format=c("graphml"))

Wednesday, April 20, 2016

Topic model

dtm
library(topicmodels)
lda <- LDA(dtm, k = 10) # find 8 topics
(term <- terms(lda, 10)) # first 6 terms of every topic
dtm <- dtm[, !colnames(dtm) %in% c("si", "in", "dl", "dupa", "face",
  "arata")]
dtm <- dtm[, colnames(dtm) %in% c("cred", "spus", "asa", "poate", "doua")
dim(dtm)
summary(col_sums(dtm))
summary(col_sums(dtm))
term_tfidf <-
  tapply(dtm$v/row_sums(dtm)[dtm$i], dtm$j, mean) *
    log2(nDocs(dtm)/col_sums(dtm > 0))
summary(term_tfidf)
dtm <- dtm[,term_tfidf >= 0.1]
dtm <- dtm[row_sums(dtm) > 0,]
summary(col_sums(dtm))
dim(dtm)

k <- 30
SEED <- 2010
jss_TM <-
  list(VEM = LDA(dtm, k = k, control = list(seed = SEED)),
       VEM_fixed = LDA(dtm, k = k,
         control = list(estimate.alpha = FALSE, seed = SEED)),
       Gibbs = LDA(dtm, k = k, method = "Gibbs",
         control = list(seed = SEED, burnin = 1000,
           thin = 100, iter = 1000)),
       CTM = CTM(dtm, k = k,
         control = list(seed = SEED,
           var = list(tol = 10^-4), em = list(tol = 10^-3))))
Topic <- topics(jss_TM[["Gibbs"]], 10)
Terms <- terms(jss_TM[["Gibbs"]], 10)
Terms[,1:10]
 ctm <- CTM(dtm, 10, method = "VEM", control = NULL, model = NULL)
 g <- build_graph(ctm, 0.9, and = TRUE)

Metoda inspirata din DataminingwithR


# Prefer fixed to scientific notation
options(scipen=5)
# Print numbers with two significant digits
options(digits=2)
options(R2HTML.format.digits=2)
# Set a nice color palette for plots
lattice.options(default.theme=latticeExtra::custom.theme(symbol=RColorBrewer::brewer.pal(8,
   "Set1")[c(2:1, 3:5, 7:9)], fill=RColorBrewer::brewer.pal(8, "Set1")[c(2:1,
  3:5, 7:9)], region=RColorBrewer::brewer.pal(n=11, name="Spectral")))
rm(corpus, corpusVars, dtm, lengths)
library(RODBC)
channel <-
  odbcConnectExcel("C:/Users/cristian.chirita/Documents/newQdap/DATA/RAW_DATA/PAPER_ARTIFACTS/MMeditoriale.xls")
corpusDataset <- sqlQuery(channel=channel, "select * from [Sheet1$]")
odbcCloseAll()
corpus <- Corpus(DataframeSource(corpusDataset["Discurs"]),
  readerControl=list(language="ro"))
corpusVars <- corpusDataset[!names(corpusDataset) == "Discurs"]
activeDataSet("corpusVars")
setCorpusVariables()
dtmCorpus <- corpus
dtmCorpus <- tm_map(dtmCorpus, content_transformer(tolower))
dtmCorpus <- tm_map(dtmCorpus, content_transformer(function(x)
  gsub("(['?\n<U+202F><U+2009>]|[[:punct:]]|[[:space:]]|[[:cntrl:]])+", " ",
  x)))
dtmCorpus <- tm_map(dtmCorpus, removeNumbers)
dtm <- DocumentTermMatrix(dtmCorpus, control=list(tolower=FALSE,
  wordLengths=c(2, Inf)))
rm(dtmCorpus)
dictionary <- data.frame(row.names=colnames(dtm),
  "Occurrences"=col_sums(dtm), "Stopword"=ifelse(colnames(dtm) %in%
  stopwords("ro"), "Stopword", ""), stringsAsFactors=FALSE)
dtm <- dtm[, !colnames(dtm) %in% stopwords("ro")]
attr(dtm, "dictionary") <- dictionary
rm(dictionary)
meta(corpus, type="corpus", tag="language") <- attr(dtm, "language") <- "ro"
meta(corpus, type="corpus", tag="processing") <- attr(dtm, "processing") <-
  c(lowercase=TRUE, punctuation=TRUE, digits=TRUE, stopwords=TRUE,
  stemming=FALSE, customStemming=FALSE, twitter=FALSE, removeHashtags=NA,
  removeNames=NA)
corpus
dtm
library(qdap)
mytdm <- as.Corpus(dtm)
my_tdm <- TermDocumentMatrix(mytdm)
inspect(my_tdm)
(freq.terms <- findFreqTerms(my_tdm, lowfreq = 40))
term.freq <- rowSums(as.matrix(my_tdm))
term.freq <- subset(term.freq, term.freq >= 25)
df <- data.frame(term = names(term.freq), freq = term.freq)
library(ggplot2)
ggplot(df, aes(x = term, y = freq)) + geom_bar(stat = "identity") +
xlab("Terms") + ylab("Count") + coord_flip()
findAssocs(my_tdm, "munteanu", 0.2)
findAssocs(my_tdm, "pnl", 0.6)
findAssocs(my_tdm, "ciucu", 0.6)
library(graph)
library(Rgraphviz)
plot(my_tdm, term = freq.terms, corThreshold = 0.1, weighting = T)
tdm.s = removeSparseTerms(my_tdm, sparse=0.7)
# we'll need the TDM as a matrix
m = as.matrix(tdm.s)

# datavis time

# convert matri to data frame
m.df = data.frame(m)

# quick hack to make keywords - which got stuck in row.names - into a variable
m.df$keywords = rownames(m.df)
library(reshape)
# "melt" the data frame ; ?melt at R console for info
m.df.melted = melt(m.df)
?
# not necessary, but I like decent column names
colnames(m.df.melted) = c("Keyword","Post","Freq")

# generate the heatmap
library(ggthemes)
hm = ggplot(m.df.melted, aes(x=Post, y=Keyword)) +
  geom_tile(aes(fill=Freq), colour="white") +
  scale_fill_gradient(low="black", high="darkorange") +
  labs(title="Cuvinte cheie folosite in articolele cu Marian Munteanu") +
  theme_few() +
  theme(axis.text.x  = element_text(size=6))
ggsave(plot=hm,filename="e:/mm_munteanudocs-hm.png",width=11,height=8.5)
# better? way to view frequencies
# sum rows of the tdm to get term freq count
tf = rowSums(as.matrix(my_tdm))
# we don't want all the words, so choose ones with 10+ freq
tf.10 = subset(tf, tf>=40)
 #nu merge :)
# wimping out and using qplot so I don't have to make another data frame
#bf = qplot(names(tf.10), value(tf.10), geom="bar")
#ggsave(plot=bf,filename="e:/marianmunteanufreq-bars.png",width=8.5,height=11)
termDocMatrix <- as.matrix(my_tdm)
termDocMatrix[5:10,1:20]
# change it to a Boolean matrix
m[5:10,1:20]
termDocMatrix <- m
termDocMatrix[termDocMatrix>=1] <- 1
# transform into a term-term adjacency matrix
termMatrix <- termDocMatrix %*% t(termDocMatrix)
termMatrix[5:10,5:10]
library(igraph)
# build a graph from the above matrix
g <- graph.adjacency(termMatrix, weighted=TRUE, mode = "undirected")
# remove loops
g <- simplify(g)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
# set seed to make the layout reproducible
set.seed(3952)
layout1 <- layout.fruchterman.reingold(g)
plot(g, layout=layout1)
write.graph(g,"e:/marianmunteanuart.graphml", format=c("graphml"))
word.freq <- sort(rowSums(m), decreasing = T)
# colors
pal <- brewer.pal(9, "BuGn")
pal <- pal[-(1:4)]
library(wordcloud)
wordcloud(words = names(word.freq), freq = word.freq, min.freq = 10,
random.order = F, colors = pal)
tdm2 <- removeSparseTerms(my_tdm, sparse = 0.95)
tdm2 <- tdm.s
m2 <- as.matrix(tdm2)
# cluster terms
distMatrix <- dist(scale(m2))
fit <- hclust(distMatrix, method = "ward")
plot(fit)
rect.hclust(fit, k = 10) # cut tree into 6 clusters
m3 <- t(m2) # transpose the matrix to cluster documents
set.seed(122) # set a fixed random seed
k <- 10 # number of clusters
kmeansResult <- kmeans(m3, k)
round(kmeansResult$centers, digits = 3)
for (i in 1:k) {
cat(paste("cluster ", i, ": ", sep = ""))
s <- sort(kmeansResult$centers[i, ], decreasing = T)
cat(names(s)[1:5], "\n")
# print the keywords of every cluster
# print(keywords[which(kmeansResult$cluster==i)])
}






===============

https://rdatamining.wordpress.com/2012/05/17/an-example-of-social-network-analysis-with-r-using-package-igraph/



Pt plot tm vezi:
https://github.com/rforge/tm/blob/master/pkg/R/plot.R

======
http://www.leydesdorff.net/words/

======


http://www.r-bloggers.com/visualizing-risky-words-part-2/

Monday, April 18, 2016

Analiza discurs articolecotidianul Adevarul pe tema Marian Munteanu.




Wordnet topici  Marian Munteanu

 


Două tabere polarizate în jurul cămpului semantinc numit Europa: Decență proclamată vs Xenofobie
Cu o temă secundara revenirea lui Băsescu/PSD.

Grupuri identificate:












Analiza Autori-Vocabular:





Numar de articole per autor:




Clustering:






 

 



 

 


 
Cluster
 

 

Wednesday, April 13, 2016

Acelasi Corpus cu Qdap

http://trinker.github.io/qdap/vignettes/qdap_vignette.html#import_export
require(qdap)
qdap_dat<- as.data.frame(corpus)
qview(qdap_dat)
(tm_dat)
sentSplit
(qdap_dat, "state", stem = TRUE)
bag_o_words(qdap_dat$text)
by(qdap_dat$text, bag_o_words)
head(corpus)
out <- ngrams(qdap_dat$text, 2)
lapply(out[["all_n"]], function(x) sapply(x, paste, collapse = " "))
DATA$state <- qdap_dat$text
rm_stopwords(qdap_dat$text, Top200Words)
ms <- c(" Siria ", "Rusia" ,"teroristi")
et <- c(" Turcia", " americani")
word_associate(qdap_dat$text, qdap_dat$docs, match.string = ms, wordcloud = TRUE,  proportional = TRUE, network.plot = TRUE,  nw.label.proportional = TRUE, extra.terms = et,cloud.legend =c("A", "B", "C"), title.color = "blue", cloud.colors = c("red", "purple", "gray70"))


DATA$state <- qdap_dat$text
rm_stopwords(qdap_dat$text, Top200Words)
ms <- c(" Siria ", "Rusia" ,"teroristi")
et <- c(" Turcia", " americani")
word_associate(qdap_dat$text, qdap_dat$docs, match.string = ms, wordcloud = TRUE,  proportional = TRUE, network.plot = TRUE,  nw.label.proportional = TRUE, extra.terms = et,cloud.legend =c("A", "B", "C"), title.color = "blue", cloud.colors = c("red", "purple", "gray70"))
term_match(text.var = qdap_dat$text, terms = qcv(siria, sirieni), return.list = FALSE)
term_match(qdap_dat$text, "america", FALSE)
with(qdap_dat, Dissimilarity(text, docs, method = "minkowski"))
dat <- qdap_dat[qdap_dat$text %in% qcv(Siria, Rusia),]
dat
(div.mod <- with(qdap_dat, diversity(docs, text)))
plot(div.mod, low = "yellow", grid = FALSE, values = TRUE)
words <- c("Siria", "rusia", "america", "turcia", "romania")
with(qdap_dat, word_cor(qdap_dat$text, x, words, r = .5))
x <- factor(with(qdap_dat, paste(text, pad(TOT(tot)), sep = "|")))
word_network_plot(text.var=qdap_dat$text, stopwords=NULL, label.cex = .95)
 
 

Curs Detaliat

https://www.csc.ncsu.edu/faculty/healey/maa-16/text/

http://staff.scem.uws.edu.au/~lapark/lt.pdf

http://nm.wu-wien.ac.at/research/publications/b675.pdf


https://cran.r-project.org/web/packages/qdap/vignettes/tm_package_compatibility.pdf

Bibliografie R

http://stackoverflow.com/questions/16880411/r-tm-big-data-based-on-a-termdocumentmatrix-how-to-set-term-freq-bound-to-ext
Text mining Willian Shakespere:
http://www.r-bloggers.com/text-mining-the-complete-works-of-william-shakespeare/


http://www.r-bloggers.com/an-example-of-social-network-analysis-with-r-using-package-igraph/


Disimilarities Cosine
http://stats.stackexchange.com/questions/78321/term-frequency-inverse-document-frequency-tf-idf-weighting
library(proxy)
cosine_dist_mat <- as.matrix(dist(t(myDtm), method = "cosine"))
docsdissim <- dist(as.matrix(termDocMatrix), method = "cosine")
n <- as.matrix(docsdissim)
v <- sort(rowSums(n), decreasing=TRUE)
myNames <- names(v)
d <- data.frame(word=myNames, freq=v)
wordcloud(d$word, d$freq, min.freq=33)
termDocMatrix <- n
termDocMatrix[termDocMatrix>=1] <- 1
termDocMatrix[5:10,1:20]
termMatrix <- termDocMatrix %*% t(termDocMatrix)
# inspect terms numbered 5 to 10
termMatrix[5:10,5:10]
library(igraph)
# build a graph from the above matrix
g <- graph.adjacency(termMatrix, weighted=T, mode = "undirected")
# remove loops
g <- simplify(g)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
# set seed to make the layout reproducible
set.seed(3952)
layout1 <- layout.fruchterman.reingold(g)
plot(g, layout=layout1)
write.graph(g, "e:/siriacosinedisimilarity.graphml", format=c( "graphml"))

Cosine 1 =>mai degraba disimilaritate :)


termDocMatrix <- n
#termDocMatrix[termDocMatrix>=1] <- 1
termDocMatrix[termDocMatrix<1] <- 0
termDocMatrix[5:10,1:20]
termMatrix <- termDocMatrix %*% t(termDocMatrix)
# inspect terms numbered 5 to 10
termMatrix[5:10,5:10]
library(igraph)
# build a graph from the above matrix
g <- graph.adjacency(termMatrix, weighted=T, mode = "undirected")
# remove loops
g <- simplify(g)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
# set seed to make the layout reproducible
set.seed(3952)
layout1 <- layout.fruchterman.reingold(g)
plot(g, layout=layout1)
write.graph(g, "e:/siriacosinedisimilaritynonbollean.graphml", format=c( "graphml"))

Tuesday, April 12, 2016

Modificari corpus

Prin eliminarea de tip Sparse:
 
Un grafic al co-aparitiei destul de inteligibil.


dtm <- dtm[, !colnames(dtm) %in% c("in", "asa", "cand", "poate",
+   "comentariu", "neaprobat", "dupa", "face", "cat", "fara", "ie", "pare",
+   "pana", "ani", "refugiati", "tara", "nato", "spune", "ru", "astia", "vrea",
+   "mi", "toti", "decat", "fata", "isi", "pt", "adica", "intr", "urma", "dup",
+   "niste", "ori", "inca", "spus", "tine", "as", "vezi", "articol", "doi",
+   "imi", "ni", "ta", "ul", "tii", "at", "ati", "iti", "aproape", "cate",
+   "cazul", "dvs", "erau", "il", "ilor", "pina", "ptr", "catre", "doua", "etc",
+    "iei", "trei", "parca", "pun", "caci", "cind", "for")]

In principiu am matricea este redusa prin:
m<-as.matrix(removeSparseTerms(myDtm, .99))
In rest ramane la fel :)
dict <- termsDictionary(dtm, "occurrences")
attr(dict, "title") <- "Terms dictionary sorted by number of occurrences"
dict
dissDtm2 <- rollup(dtm, 1, meta(corpus, "like.uri"))
diss <- corpusDissimilarity(dtm, dissDtm2)
rm(dissDtm2)
attr(diss, "title") <- "Documents by like.uri dissimilarity table"
diss
absVarFreqs <- table(meta(corpus, "like.uri"), dnn="like.uri")
varFreqs <- prop.table(absVarFreqs) * 100
barchart(varFreqs, xlab="% of documents",
  main="Distribution of documents by like.uri", auto.key=TRUE)
varFreqs <- addmargins(varFreqs)
attr(varFreqs, "title") <- "Distribution of documents by like.uri (%)"
varFreqs
myDtm <- TermDocumentMatrix(corpus, control = list(minWordLength = 1,stopwords = TRUE))
inspect(myDtm[266:270,31:40])
m <- as.matrix(myDtm)
m<-as.matrix(removeSparseTerms(myDtm, .99))
v <- sort(rowSums(m), decreasing=TRUE)
myNames <- names(v)
d <- data.frame(word=myNames, freq=v)
wordcloud(d$word, d$freq, min.freq=33)
termDocMatrix <- as.matrix(myDtm)
termDocMatrix <- m
termDocMatrix[termDocMatrix>=1] <- 1
termDocMatrix[5:10,1:20]
termMatrix <- termDocMatrix %*% t(termDocMatrix)
# inspect terms numbered 5 to 10
termMatrix[5:10,5:10]
library(igraph)
# build a graph from the above matrix
g <- graph.adjacency(termMatrix, weighted=T, mode = "undirected")
# remove loops
g <- simplify(g)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
# set seed to make the layout reproducible
set.seed(3952)
layout1 <- layout.fruchterman.reingold(g)
plot(g, layout=layout1)
write.graph(g, "e:/siriasparsity099.graphml", format=c( "graphml"))

Friday, April 8, 2016

Coocurences graph negative comments only


# Prefer fixed to scientific notation
options(scipen=5)
# Print numbers with two significant digits
options(digits=2)
options(R2HTML.format.digits=2)
# Set a nice color palette for plots
lattice.options(default.theme=latticeExtra::custom.theme(symbol=RColorBrewer::brewer.pal(8,
   "Set1")[c(2:1, 3:5, 7:9)], fill=RColorBrewer::brewer.pal(8, "Set1")[c(2:1,
  3:5, 7:9)], region=RColorBrewer::brewer.pal(n=11, name="Spectral")))
rm(lengths)
library(RODBC)
channel <-
  odbcConnectExcel2007("C:/Users/CCCC/Documents/DateKnime/adevarulCOMENTARII_Siriamartie21.xlsx")
corpusDataset <- sqlQuery(channel=channel, "select * from [Sheet1$]")
odbcCloseAll()
corpus <- Corpus(DataframeSource(corpusDataset["TextComentariu"]),
  readerControl=list(language="ro"))
corpusVars <- corpusDataset[!names(corpusDataset) == "TextComentariu"]
corpusVars <- data.frame(var1=factor(rep("", length(corpus))),
  row.names=names(corpus))
activeDataSet("corpusVars")
setCorpusVariables()
dtmCorpus <- corpus
dtmCorpus <- tm_map(dtmCorpus, content_transformer(tolower))
dtmCorpus <- tm_map(dtmCorpus, content_transformer(function(x)
  gsub("(['?\n<U+202F><U+2009>]|[[:punct:]]|[[:space:]]|[[:cntrl:]])+", " ",
  x)))
dtmCorpus <- tm_map(dtmCorpus, removeNumbers)
dtm <- DocumentTermMatrix(dtmCorpus, control=list(tolower=FALSE,
  wordLengths=c(2, Inf)))
rm(dtmCorpus)
dictionary <- data.frame(row.names=colnames(dtm),
  "Occurrences"=col_sums(dtm), "Stopword"=ifelse(colnames(dtm) %in%
  stopwords("ro"), "Stopword", ""), stringsAsFactors=FALSE)
dtm <- dtm[, !colnames(dtm) %in% stopwords("ro")]
attr(dtm, "dictionary") <- dictionary
rm(dictionary)
meta(corpus, type="corpus", tag="language") <- attr(dtm, "language") <- "ro"
meta(corpus, type="corpus", tag="processing") <- attr(dtm, "processing") <-
  c(lowercase=TRUE, punctuation=TRUE, digits=TRUE, stopwords=TRUE,
  stemming=FALSE, customStemming=FALSE, twitter=FALSE, removeHashtags=NA,
  removeNames=NA)
corpus
dtm
corpus <- tm_map(corpus, removeWords, "romanian")
stopwords(kind = "ro")
myDtm <- TermDocumentMatrix(corpus, control = list(minWordLength = 1,stopwords = TRUE))
inspect(myDtm[266:270,31:40])
library(wordcloud)
m <- as.matrix(myDtm)
v <- sort(rowSums(m), decreasing=TRUE)
myNames <- names(v)
d <- data.frame(word=myNames, freq=v)
wordcloud(d$word, d$freq, min.freq=33)
termDocMatrix <- as.matrix(myDtm)
termDocMatrix[termDocMatrix>=1] <- 1
termDocMatrix[5:10,1:20]
termMatrix <- termDocMatrix %*% t(termDocMatrix)
# inspect terms numbered 5 to 10
termMatrix[5:10,5:10]
write.table(termDocMatrix, file = "e:/matrix", sep = " ")
rm(corpus, corpusVars, dtm, lengths)
rm(lengths)
library(RODBC)
channel <-
  odbcConnectExcel2007("C:/Users/cristian.chirita/Documents/DateKnime/adevarulCOMENTARII_Siriamartie21.xlsx")
corpusDataset <- sqlQuery(channel=channel, "select * from [Sheet1$]")
odbcCloseAll()
corpus <- Corpus(DataframeSource(corpusDataset["TextComentariu"]),
  readerControl=list(language="ro"))
corpusVars <- corpusDataset[!names(corpusDataset) == "TextComentariu"]
corpusVars <- corpusVars[c("nume articol", "rankcomentator", "Autor",
  "like-uri")]
activeDataSet("corpusVars")
names(corpusVars) <- make.names(names(corpusVars))
setCorpusVariables()
dtmCorpus <- corpus
dtmCorpus <- tm_map(dtmCorpus, content_transformer(tolower))
dtmCorpus <- tm_map(dtmCorpus, content_transformer(function(x)
  gsub("(['?\n<U+202F><U+2009>]|[[:punct:]]|[[:space:]]|[[:cntrl:]])+", " ",
  x)))
dtmCorpus <- tm_map(dtmCorpus, removeNumbers)
dtm <- DocumentTermMatrix(dtmCorpus, control=list(tolower=FALSE,
  wordLengths=c(2, Inf)))
rm(dtmCorpus)
dictionary <- data.frame(row.names=colnames(dtm),
  "Occurrences"=col_sums(dtm), "Stopword"=ifelse(colnames(dtm) %in%
  stopwords("ro"), "Stopword", ""), stringsAsFactors=FALSE)
dtm <- dtm[, !colnames(dtm) %in% stopwords("ro")]
attr(dtm, "dictionary") <- dictionary
rm(dictionary)
meta(corpus, type="corpus", tag="language") <- attr(dtm, "language") <- "ro"
meta(corpus, type="corpus", tag="processing") <- attr(dtm, "processing") <-
  c(lowercase=TRUE, punctuation=TRUE, digits=TRUE, stopwords=TRUE,
  stemming=FALSE, customStemming=FALSE, twitter=FALSE, removeHashtags=NA,
  removeNames=NA)
corpus
dtm
keep <- meta(corpus, "like.uri")[[1]] %in% c("-2 (6 voturi)",
  "-4 (6 voturi)", "-4 (4 voturi)", "-1 (5 voturi)", "-1 (7 voturi)",
  "-1 (1 voturi)", "-2 (4 voturi)", "-6 (8 voturi)", "-4 (8 voturi)",
  "-3 (3 voturi)", "-5 (13 voturi)", "-6 (14 voturi)", "-4 (12 voturi)",
  "-3 (11 voturi)", "-2 (2 voturi)", "-3 (15 voturi)", "-5 (15 voturi)",
  "-1 (11 voturi)", "-5 (11 voturi)", "-3 (7 voturi)", "-2 (18 voturi)",
  "-1 (3 voturi)", "-3 (9 voturi)", "-3 (5 voturi)", "0 (16 voturi)",
  "-6 (10 voturi)", "-6 (18 voturi)", "-10 (18 voturi)", "-14 (16 voturi)",
  "-11 (15 voturi)", "-12 (14 voturi)", "-2 (14 voturi)", "-4 (10 voturi)",
  "-1 (9 voturi)", "-5 (21 voturi)", "-4 (14 voturi)", "-1 (13 voturi)",
  "-2 (10 voturi)", "-5 (7 voturi)", "-5 (5 voturi)", "-6 (6 voturi)",
  "-7 (25 voturi)", "-13 (27 voturi)", "-2 (16 voturi)", "-5 (17 voturi)",
  "-7 (9 voturi)", "-7 (7 voturi)", "-12 (52 voturi)", "-1 (29 voturi)",
  "-5 (31 voturi)", "-5 (41 voturi)", "-1 (35 voturi)", "-2 (34 voturi)",
  "-5 (37 voturi)", "-6 (28 voturi)", "-6 (36 voturi)", "-9 (37 voturi)",
  "-8 (24 voturi)", "-2 (8 voturi)", "-5 (9 voturi)", "-7 (11 voturi)",
  "-8 (8 voturi)", "-2 (12 voturi)", "-8 (36 voturi)", "-3 (29 voturi)",
  "-8 (28 voturi)", "-1 (15 voturi)", "-13 (23 voturi)", "-12 (18 voturi)",
  "-10 (14 voturi)", "-10 (12 voturi)", "-6 (12 voturi)", "-8 (18 voturi)",
  "-6 (44 voturi)", "-17 (57 voturi)", "-11 (57 voturi)", "-3 (21 voturi)",
  "-4 (20 voturi)", "-7 (19 voturi)", "-8 (14 voturi)", "-3 (19 voturi)",
  "-4 (36 voturi)", "-8 (26 voturi)", "-1 (17 voturi)", "-6 (16 voturi)",
  "-8 (10 voturi)", "-9 (13 voturi)", "-9 (9 voturi)", "-4 (22 voturi)",
  "-2 (20 voturi)", "-2 (22 voturi)", "-6 (24 voturi)", "-7 (15 voturi)",
  "-11 (25 voturi)", "-7 (13 voturi)", "-5 (19 voturi)", "-4 (16 voturi)",
  "-15 (29 voturi)", "-3 (17 voturi)", "-8 (12 voturi)", "-1 (21 voturi)",
  "-5 (23 voturi)", "-1 (25 voturi)", "-16 (20 voturi)", "-9 (11 voturi)",
  "-16 (16 voturi)", "-3 (13 voturi)", "-5 (29 voturi)", "-10 (20 voturi)",
  "-8 (20 voturi)", "-9 (17 voturi)", "-8 (30 voturi)", "-2 (38 voturi)",
  "-10 (28 voturi)", "-1 (53 voturi)", "-1 (43 voturi)", "-3 (47 voturi)",
  "-4 (40 voturi)", "-8 (34 voturi)", "-7 (49 voturi)", "-4 (26 voturi)",
  "-7 (23 voturi)", "-24 (30 voturi)", "-3 (27 voturi)", "-15 (23 voturi)",
  "-12 (12 voturi)", "-17 (19 voturi)", "-31 (43 voturi)", "-16 (30 voturi)",
  "-17 (27 voturi)", "-14 (26 voturi)")
origCorpus <- corpus
corpus <- corpus[keep]
origDtm <- dtm
dtmAttr <- attributes(dtm)
origDictionary <- attr(dtm, "dictionary")
dtm <- dtm[keep,]
dictionary <- data.frame(row.names=colnames(dtm),
  "Occurrences"=col_sums(dtm), "Stopword"=ifelse(colnames(dtm) %in%
  stopwords("ro"), "Stopword", ""), stringsAsFactors=FALSE)
dtm <- dtm[, !colnames(dtm) %in% stopwords("ro")]
attr(dtm, "dictionary") <- dictionary
rm(dictionary)
attr(dtm, "language") <- dtmAttr$lang
attr(dtm, "processing") <- dtmAttr$processing
rm(dtmAttr, origDictionary)
corpusVars <- corpusVars[keep,, drop=FALSE]
rm(list=c("keep", "lengths"))
corpus
dtm
library(relimp, pos=23)
showData(corpusVars, placement='-20+200', font=getRcmdr('logFont'),
  maxwidth=80, maxheight=30)
editDataset(corpusVars)
editDataset(corpusVars)
####################################################
corpus <- tm_map(corpus, removeWords, "romanian")
stopwords(kind = "ro")
myDtm <- TermDocumentMatrix(corpus, control = list(minWordLength = 1,stopwords = TRUE))
inspect(myDtm[266:270,31:40])
library(wordcloud)
m <- as.matrix(myDtm)
v <- sort(rowSums(m), decreasing=TRUE)
myNames <- names(v)
d <- data.frame(word=myNames, freq=v)
wordcloud(d$word, d$freq, min.freq=33)
termDocMatrix <- as.matrix(myDtm)
termDocMatrix[termDocMatrix>=1] <- 1
termDocMatrix[5:10,1:20]
termMatrix <- termDocMatrix %*% t(termDocMatrix)
# inspect terms numbered 5 to 10
termMatrix[5:10,5:10]
library(igraph)
# build a graph from the above matrix
g <- graph.adjacency(termMatrix, weighted=T, mode = "undirected")
# remove loops
g <- simplify(g)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
# set seed to make the layout reproducible
set.seed(3952)
layout1 <- layout.fruchterman.reingold(g)
plot(g, layout=layout1)
write.graph(g, "e:/coocgraph", format=c( "graphml"))



https://rstudio-pubs-static.s3.amazonaws.com/31867_8236987cf0a8444e962ccd2aec46d9c3.html#clustering-by-term-similarity

Wordcloud romana


# Prefer fixed to scientific notation
options(scipen=5)
# Print numbers with two significant digits
options(digits=2)
options(R2HTML.format.digits=2)
# Set a nice color palette for plots
lattice.options(default.theme=latticeExtra::custom.theme(symbol=RColorBrewer::brewer.pal(8,
   "Set1")[c(2:1, 3:5, 7:9)], fill=RColorBrewer::brewer.pal(8, "Set1")[c(2:1,
  3:5, 7:9)], region=RColorBrewer::brewer.pal(n=11, name="Spectral")))
rm(lengths)
library(RODBC)
channel <-
  odbcConnectExcel2007("C:/Users/cc/Documents/DateKnime/adevarulCOMENTARII_Siriamartie21.xlsx")
corpusDataset <- sqlQuery(channel=channel, "select * from [Sheet1$]")
odbcCloseAll()
corpus <- Corpus(DataframeSource(corpusDataset["TextComentariu"]),
  readerControl=list(language="ro"))
corpusVars <- corpusDataset[!names(corpusDataset) == "TextComentariu"]
corpusVars <- data.frame(var1=factor(rep("", length(corpus))),
  row.names=names(corpus))
activeDataSet("corpusVars")
setCorpusVariables()
dtmCorpus <- corpus
dtmCorpus <- tm_map(dtmCorpus, content_transformer(tolower))
dtmCorpus <- tm_map(dtmCorpus, content_transformer(function(x)
  gsub("(['?\n<U+202F><U+2009>]|[[:punct:]]|[[:space:]]|[[:cntrl:]])+", " ",
  x)))
dtmCorpus <- tm_map(dtmCorpus, removeNumbers)
dtm <- DocumentTermMatrix(dtmCorpus, control=list(tolower=FALSE,
  wordLengths=c(2, Inf)))
rm(dtmCorpus)
dictionary <- data.frame(row.names=colnames(dtm),
  "Occurrences"=col_sums(dtm), "Stopword"=ifelse(colnames(dtm) %in%
  stopwords("ro"), "Stopword", ""), stringsAsFactors=FALSE)
dtm <- dtm[, !colnames(dtm) %in% stopwords("ro")]
attr(dtm, "dictionary") <- dictionary
rm(dictionary)
meta(corpus, type="corpus", tag="language") <- attr(dtm, "language") <- "ro"
meta(corpus, type="corpus", tag="processing") <- attr(dtm, "processing") <-
  c(lowercase=TRUE, punctuation=TRUE, digits=TRUE, stopwords=TRUE,
  stemming=FALSE, customStemming=FALSE, twitter=FALSE, removeHashtags=NA,
  removeNames=NA)
corpus
dtm
corpus <- tm_map(corpus, removeWords, "romanian")
stopwords(kind = "ro")
myDtm <- TermDocumentMatrix(corpus, control = list(minWordLength = 1,stopwords = TRUE))
inspect(myDtm[266:270,31:40])
library(wordcloud)
m <- as.matrix(myDtm)
v <- sort(rowSums(m), decreasing=TRUE)
myNames <- names(v)
d <- data.frame(word=myNames, freq=v)
wordcloud(d$word, d$freq, min.freq=33)


Thursday, April 7, 2016

Matricea Apritiei Simultane

http://www.rdatamining.com/examples/social-network-analysis
http://www.rdatamining.com/examples/text-mining

# Prefer fixed to scientific notation
options(scipen=5)
# Print numbers with two significant digits
options(digits=2)
options(R2HTML.format.digits=2)
# Set a nice color palette for plots
lattice.options(default.theme=latticeExtra::custom.theme(symbol=RColorBrewer::brewer.pal(8,
   "Set1")[c(2:1, 3:5, 7:9)], fill=RColorBrewer::brewer.pal(8, "Set1")[c(2:1,
  3:5, 7:9)], region=RColorBrewer::brewer.pal(n=11, name="Spectral")))
rm(lengths)
library(RODBC)
channel <-
  odbcConnectExcel("C:/Users/cristian.chirita/Documents/DateKnime/adevarulCOMENTARII_Siriamartie21.xls")
corpusDataset <- sqlQuery(channel=channel, "select * from [Sheet1$]")
odbcCloseAll()
corpus <- Corpus(DataframeSource(corpusDataset["TextComentariu"]),
  readerControl=list(language="ro"))
corpusVars <- corpusDataset[!names(corpusDataset) == "TextComentariu"]
activeDataSet("corpusVars")
names(corpusVars) <- make.names(names(corpusVars))
setCorpusVariables()
dtmCorpus <- corpus
dtmCorpus <- tm_map(dtmCorpus, content_transformer(tolower))
dtmCorpus <- tm_map(dtmCorpus, content_transformer(function(x)
  gsub("(['?\n<U+202F><U+2009>]|[[:punct:]]|[[:space:]]|[[:cntrl:]])+", " ",
  x)))
dtmCorpus <- tm_map(dtmCorpus, removeNumbers)
dtm <- DocumentTermMatrix(dtmCorpus, control=list(tolower=TRUE,
  wordLengths=c(2, Inf)))
rm(dtmCorpus)
dictionary <- data.frame(row.names=colnames(dtm),
  "Occurrences"=col_sums(dtm), "Stopword"=ifelse(colnames(dtm) %in%
  stopwords("ro"), "Stopword", ""), stringsAsFactors=FALSE)
dtm <- dtm[, !colnames(dtm) %in% stopwords("ro")]
attr(dtm, "dictionary") <- dictionary
rm(dictionary)
meta(corpus, type="corpus", tag="language") <- attr(dtm, "language") <- "ro"
meta(corpus, type="corpus", tag="processing") <- attr(dtm, "processing") <-
  c(lowercase=TRUE, punctuation=TRUE, digits=TRUE, stopwords=TRUE,
  stemming=FALSE, customStemming=FALSE, twitter=FALSE, removeHashtags=NA,
  removeNames=NA)
corpus
dtm
myDtm <- TermDocumentMatrix(dtm, control = list(minWordLength = 1))
inspect(myDtm[100:170,10:20])
inspect(dtm[100:170,10:20])
findFreqTerms(dtm, lowfreq=10)
findAssocs(dtm, 'kurzi', 0.30)
library(wordcloud)
m <- as.matrix(dtm)
v <- sort(rowSums(m), decreasing=TRUE)
 myNames <- names(v)
k <- which(names(v)=="siria")
myNames[k] <- "Siria"
d <- data.frame(word=myNames, freq=v)
wordcloud(d$word, d$freq, min.freq=3)
 myDtm <- TermDocumentMatrix(corpus, control = list(minWordLength = 1))
inspect(myDtm[100:170,10:20])
myCorpus <- tm_map(corpus, tolower)
# remove punctuation
myCorpus <- tm_map(myCorpus, removePunctuation)
# remove numbers
myCorpus <- tm_map(myCorpus, removeNumbers)
# remove stopwords
# keep "r" by removing it from stopwords
myStopwords <- c(stopwords('english'), "available", "via")
idx <- which(myStopwords == "r")
myStopwords <- myStopwords[-idx]
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
myCorpus
myDtm <- TermDocumentMatrix(myCorpus, control = list(minWordLength = 1))
inspect(myDtm[4266:4270,4331:4340])
findFreqTerms(myDtm, lowfreq=20)
findAssocs(myDtm, 'victime', 0.30)
termDocMatrix <- as.matrix(myDtm)



Obs:
Fisier prea mare :)

Tuesday, April 5, 2016

Resurse Statistice R

http://quantlet.de/index.php?p=info

https://eight2late.wordpress.com/2015/12/02/a-gentle-introduction-to-network-graphs-using-r-and-gephi/

autori<-corpusVars$Autor In loc de filename
autori
filekey <- cbind(rownames(m),autori)


# Prefer fixed to scientific notation
options(scipen=5)
# Print numbers with two significant digits
options(digits=2)
options(R2HTML.format.digits=2)
# Set a nice color palette for plots
lattice.options(default.theme=latticeExtra::custom.theme(symbol=RColorBrewer::brewer.pal(8,
   "Set1")[c(2:1, 3:5, 7:9)], fill=RColorBrewer::brewer.pal(8, "Set1")[c(2:1,
  3:5, 7:9)], region=RColorBrewer::brewer.pal(n=11, name="Spectral")))
rm(lengths)
library(RODBC)
channel <-
  odbcConnectExcel("C:/Users/c.c/Documents/DateKnime/adevarulCOMENTARII_Siriamartie21.xls")
corpusDataset <- sqlQuery(channel=channel, "select * from [Sheet1$]")
odbcCloseAll()
corpus <- Corpus(DataframeSource(corpusDataset["TextComentariu"]),
  readerControl=list(language="ro"))
corpusVars <- corpusDataset[!names(corpusDataset) == "TextComentariu"]
activeDataSet("corpusVars")
names(corpusVars) <- make.names(names(corpusVars))
setCorpusVariables()
dtmCorpus <- corpus
dtmCorpus <- tm_map(dtmCorpus, content_transformer(tolower))
dtmCorpus <- tm_map(dtmCorpus, content_transformer(function(x)
  gsub("(['?\n<U+202F><U+2009>]|[[:punct:]]|[[:space:]]|[[:cntrl:]])+", " ",
  x)))
dtmCorpus <- tm_map(dtmCorpus, removeNumbers)
dtm <- DocumentTermMatrix(dtmCorpus, control=list(tolower=FALSE,
  wordLengths=c(2, Inf)))
rm(dtmCorpus)
dictionary <- data.frame(row.names=colnames(dtm),
  "Occurrences"=col_sums(dtm), "Stopword"=ifelse(colnames(dtm) %in%
  stopwords("ro"), "Stopword", ""), stringsAsFactors=FALSE)
dtm <- dtm[, !colnames(dtm) %in% stopwords("ro")]
attr(dtm, "dictionary") <- dictionary
rm(dictionary)
meta(corpus, type="corpus", tag="language") <- attr(dtm, "language") <- "ro"
meta(corpus, type="corpus", tag="processing") <- attr(dtm, "processing") <-
  c(lowercase=TRUE, punctuation=TRUE, digits=TRUE, stopwords=TRUE,
  stemming=FALSE, customStemming=FALSE, twitter=FALSE, removeHashtags=NA,
  removeNames=NA)
corpus
dtm
m<-as.matrix(dtm)
write.csv(m,file="E:/dtmEight2Late.csv")
filekey <- cbind(rownames(m),filenames)
write.csv(filekey,"E:/filekey.csv")
library(relimp, pos=21)
showData(corpusVars, placement='-20+200', font=getRcmdr('logFont'),
  maxwidth=80, maxheight=30)
autori<-corpusVars$Autor
autori
filekey <- cbind(rownames(m),autori)
write.csv(filekey,"e:/filekey.csv")
cosineSim <- function(x){
as.dist(x%*%t(x)/(sqrt(rowSums(x^2) %*% t(rowSums(x^2)))))
}
cs <- cosineSim(m)
write.csv(as.matrix(cs),file="E:/csEight2Late.csv")
cs[cs < max(cs)/2] <- 0
cs <- round(cs,3)
write.csv(as.matrix(cs),file="e:AdjacencyMatrix.csv")

dat=read.csv(file.choose(),header=TRUE,row.names=1,check.names=FALSE)
m1=as.matrix(dat) # coerces the data set as a matrix
g=graph.adjacency(m1,mode="undirected",weighted=NULL) # this will create an 'igraph object'
g
http://www.slideshare.net/rdatamining/text-mining-with-r-an-analysis-of-twitter-data

http://www.n3labs.com/pdf/rank-co-occur.pdf

https://cran.r-project.org/web/packages/cooccur/cooccur.pdf


https://eight2late.wordpress.com/2015/09/29/a-gentle-introduction-to-topic-modeling-using-r/

http://www.rdatamining.com/examples/text-mining

http://www.linguisticdna.org/2015/09/10/proximity-data-ii-co-occurrence-and-distance-measurements/


https://eight2late.wordpress.com/2015/05/27/a-gentle-introduction-to-text-mining-using-r/


http://faculty.washington.edu/jwilker/CAP/R_Sample_Script.R