Date Mici: 2016

Sunday, December 18, 2016

Campania electorala 2016

corpus <- Corpus(DirSource("E:/Big Data/texte", encoding="windows-1250"),

readerControl=list(language="ro"))

corpusVars <- data.frame(var1=factor(rep("", length(corpus))),

row.names=names(corpus))

activeDataSet("corpusVars")

setCorpusVariables()

corpus <- splitTexts(corpus, 20)

meta(corpus, type="corpus", tag="split") <- TRUE

dtmCorpus <- corpus

dtmCorpus <- tm_map(dtmCorpus, content_transformer(tolower))

dtmCorpus <- tm_map(dtmCorpus, content_transformer(function(x)

gsub("(['’\n<U+202F><U+2009>]|[[:punct:]]|[[:space:]]|[[:cntrl:]])+", " ",

x)))

dtmCorpus <- tm_map(dtmCorpus, removeNumbers)

dtm <- DocumentTermMatrix(dtmCorpus, control=list(tolower=FALSE,

wordLengths=c(2, Inf)))

rm(dtmCorpus)

library(SnowballC)

dictionary <- data.frame(row.names=colnames(dtm),

"Occurrences"=col_sums(dtm), "Stemmed.Term"=wordStem(colnames(dtm), "ro"),

"Stopword"=ifelse(colnames(dtm) %in% stopwords("ro"), "Stopword", ""),

stringsAsFactors=FALSE)

dtm <- dtm[, !colnames(dtm) %in% stopwords("ro")]

dtm <- rollup(dtm, 2, dictionary[colnames(dtm), 2])

attr(dtm, "dictionary") <- dictionary

rm(dictionary)

meta(corpus, type="corpus", tag="language") <- attr(dtm, "language") <- "ro"

meta(corpus, type="corpus", tag="processing") <- attr(dtm, "processing") <-

c(lowercase=TRUE, punctuation=TRUE, digits=TRUE, stopwords=TRUE,

stemming=TRUE, customStemming=FALSE, twitter=FALSE, removeHashtags=NA,

removeNames=NA)

corpus

dtm

graphplot <-

function(x,

terms = sample(Terms(x), 20),

corThreshold = 0.7,

weighted=TRUE,

diag=FALSE,

...)

{

if (system.file(package = "igraph") == "")

stop("Plotting requires package 'igraph'.")

m <- if (inherits(x, "TermDocumentMatrix")) t(x) else x

m <- as.matrix(m[, terms])

c <- cor(m)

c[c < corThreshold] <- 0

c[is.na(c)] <- 0

diag(c) <- 0

tmgraph <- graph.adjacency(c, mode=c("undirected"), weighted=TRUE, diag=FALSE,

add.colnames=NULL, add.rownames=NA)

plot(tmgraph)

invisible(tmgraph)

}

library(igraph)

(freq.terms <- findFreqTerms(dtm, lowfreq = 20))

agraph <- graphplot(dtm,term = freq.terms,corThreshold = 0.2)

write.graph(agraph,"e:/agraphPoliticAdevarul.graphml", format=c("graphml"))

Thursday, June 30, 2016

Thursday, April 21, 2016

Perceptia Siriei in 5000 de comentarii Adevarul.ro

(freq.terms <- findFreqTerms(dtm, lowfreq = 30))
agraph <- graphplot(dtm,term = freq.terms,corThreshold = 0.1)
write.graph(agraph,"e:/agraphsiria.graphml", format=c("graphml"))

GGplot exemple

http://matthewlincoln.net/2014/12/20/adjacency-matrix-plots-with-r-and-ggplot2.html

Alternate plot function for tm with Igraph support.

graphplot <-
function(x,
         terms = sample(Terms(x), 20),
         corThreshold = 0.7,
         weighted=TRUE,
   diag=FALSE,
         ...)
{
    if (system.file(package = "igraph") == "")
        stop("Plotting requires package 'igraph'.")
    m <- if (inherits(x, "TermDocumentMatrix")) t(x) else x
    m <- as.matrix(m[, terms])
    c <- cor(m)
    c[c < corThreshold] <- 0
    c[is.na(c)] <- 0
    diag(c) <- 0
tmgraph <- graph.adjacency(c, mode=c("undirected"), weighted=TRUE, diag=FALSE,
        add.colnames=NULL, add.rownames=NA)
        plot(tmgraph)

    invisible(tmgraph)
}

example usage:
graphplot(dtm,term = freq.terms,corThreshold = 0.5)
dtm- document term matrix
terms- similar to plot.tm
threshold-similar with plot.pm

for saving/export:
agraph <- graphplot(dtm,term = freq.terms,corThreshold = 0.3)
write.graph(agraph,"e:/agraph.graphml", format=c("graphml"))

Wednesday, April 20, 2016

Topic model

dtm
library(topicmodels)
lda <- LDA(dtm, k = 10) # find 8 topics
(term <- terms(lda, 10)) # first 6 terms of every topic
dtm <- dtm[, !colnames(dtm) %in% c("si", "in", "dl", "dupa", "face",
"arata")]
dtm <- dtm[, colnames(dtm) %in% c("cred", "spus", "asa", "poate", "doua")
dim(dtm)
summary(col_sums(dtm))
summary(col_sums(dtm))
term_tfidf <-
tapply(dtm$v/row_sums(dtm)[dtm$i], dtm$j, mean) *
    log2(nDocs(dtm)/col_sums(dtm > 0))
summary(term_tfidf)
dtm <- dtm[,term_tfidf >= 0.1]
dtm <- dtm[row_sums(dtm) > 0,]
summary(col_sums(dtm))
dim(dtm)

k <- 30
SEED <- 2010
jss_TM <-
list(VEM = LDA(dtm, k = k, control = list(seed = SEED)),
       VEM_fixed = LDA(dtm, k = k,
         control = list(estimate.alpha = FALSE, seed = SEED)),
       Gibbs = LDA(dtm, k = k, method = "Gibbs",
         control = list(seed = SEED, burnin = 1000,
           thin = 100, iter = 1000)),
       CTM = CTM(dtm, k = k,
         control = list(seed = SEED,
           var = list(tol = 10^-4), em = list(tol = 10^-3))))
Topic <- topics(jss_TM[["Gibbs"]], 10)
Terms <- terms(jss_TM[["Gibbs"]], 10)
Terms[,1:10]
ctm <- CTM(dtm, 10, method = "VEM", control = NULL, model = NULL)
g <- build_graph(ctm, 0.9, and = TRUE)

Metoda inspirata din DataminingwithR

Monday, April 18, 2016

Analiza discurs articolecotidianul Adevarul pe tema Marian Munteanu.

Wordnet topici Marian Munteanu

Două tabere polarizate în jurul cămpului semantinc numit Europa: Decență proclamată vs Xenofobie
Cu o temă secundara revenirea lui Băsescu/PSD.

Grupuri identificate:

Analiza Autori-Vocabular:

Numar de articole per autor:

Clustering:

Cluster

Wednesday, April 13, 2016

Acelasi Corpus cu Qdap

http://trinker.github.io/qdap/vignettes/qdap_vignette.html#import_export
require(qdap)
qdap_dat<- as.data.frame(corpus)
qview(qdap_dat)
(tm_dat)
sentSplit
(qdap_dat, "state", stem = TRUE)
bag_o_words(qdap_dat$text)
by(qdap_dat$text, bag_o_words)
head(corpus)
out <- ngrams(qdap_dat$text, 2)
lapply(out[["all_n"]], function(x) sapply(x, paste, collapse = " "))
DATA$state <- qdap_dat$text
rm_stopwords(qdap_dat$text, Top200Words)
ms <- c(" Siria ", "Rusia" ,"teroristi")
et <- c(" Turcia", " americani")
word_associate(qdap_dat$text, qdap_dat$docs, match.string = ms, wordcloud = TRUE, proportional = TRUE, network.plot = TRUE, nw.label.proportional = TRUE, extra.terms = et,cloud.legend =c("A", "B", "C"), title.color = "blue", cloud.colors = c("red", "purple", "gray70"))

DATA$state <- qdap_dat$text
rm_stopwords(qdap_dat$text, Top200Words)
ms <- c(" Siria ", "Rusia" ,"teroristi")
et <- c(" Turcia", " americani")
word_associate(qdap_dat$text, qdap_dat$docs, match.string = ms, wordcloud = TRUE, proportional = TRUE, network.plot = TRUE, nw.label.proportional = TRUE, extra.terms = et,cloud.legend =c("A", "B", "C"), title.color = "blue", cloud.colors = c("red", "purple", "gray70"))
term_match(text.var = qdap_dat$text, terms = qcv(siria, sirieni), return.list = FALSE)
term_match(qdap_dat$text, "america", FALSE)
with(qdap_dat, Dissimilarity(text, docs, method = "minkowski"))
dat <- qdap_dat[qdap_dat$text %in% qcv(Siria, Rusia),]
dat
(div.mod <- with(qdap_dat, diversity(docs, text)))
plot(div.mod, low = "yellow", grid = FALSE, values = TRUE)
words <- c("Siria", "rusia", "america", "turcia", "romania")
with(qdap_dat, word_cor(qdap_dat$text, x, words, r = .5))
x <- factor(with(qdap_dat, paste(text, pad(TOT(tot)), sep = "|")))
word_network_plot(text.var=qdap_dat$text, stopwords=NULL, label.cex = .95)

Curs Detaliat

https://www.csc.ncsu.edu/faculty/healey/maa-16/text/

http://staff.scem.uws.edu.au/~lapark/lt.pdf

http://nm.wu-wien.ac.at/research/publications/b675.pdf

https://cran.r-project.org/web/packages/qdap/vignettes/tm_package_compatibility.pdf

Bibliografie R

http://stackoverflow.com/questions/16880411/r-tm-big-data-based-on-a-termdocumentmatrix-how-to-set-term-freq-bound-to-ext
Text mining Willian Shakespere:
http://www.r-bloggers.com/text-mining-the-complete-works-of-william-shakespeare/

http://www.r-bloggers.com/an-example-of-social-network-analysis-with-r-using-package-igraph/

Disimilarities Cosine
http://stats.stackexchange.com/questions/78321/term-frequency-inverse-document-frequency-tf-idf-weighting
library(proxy)
cosine_dist_mat <- as.matrix(dist(t(myDtm), method = "cosine"))
docsdissim <- dist(as.matrix(termDocMatrix), method = "cosine")
n <- as.matrix(docsdissim)
v <- sort(rowSums(n), decreasing=TRUE)
myNames <- names(v)
d <- data.frame(word=myNames, freq=v)
wordcloud(d$word, d$freq, min.freq=33)
termDocMatrix <- n
termDocMatrix[termDocMatrix>=1] <- 1
termDocMatrix[5:10,1:20]
termMatrix <- termDocMatrix %*% t(termDocMatrix)
# inspect terms numbered 5 to 10
termMatrix[5:10,5:10]
library(igraph)
# build a graph from the above matrix
g <- graph.adjacency(termMatrix, weighted=T, mode = "undirected")
# remove loops
g <- simplify(g)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
# set seed to make the layout reproducible
set.seed(3952)
layout1 <- layout.fruchterman.reingold(g)
plot(g, layout=layout1)
write.graph(g, "e:/siriacosinedisimilarity.graphml", format=c( "graphml"))

Cosine 1 =>mai degraba disimilaritate :)

termDocMatrix <- n
#termDocMatrix[termDocMatrix>=1] <- 1
termDocMatrix[termDocMatrix<1] <- 0
termDocMatrix[5:10,1:20]
termMatrix <- termDocMatrix %*% t(termDocMatrix)
# inspect terms numbered 5 to 10
termMatrix[5:10,5:10]
library(igraph)
# build a graph from the above matrix
g <- graph.adjacency(termMatrix, weighted=T, mode = "undirected")
# remove loops
g <- simplify(g)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
# set seed to make the layout reproducible
set.seed(3952)
layout1 <- layout.fruchterman.reingold(g)
plot(g, layout=layout1)
write.graph(g, "e:/siriacosinedisimilaritynonbollean.graphml", format=c( "graphml"))

Tuesday, April 12, 2016

Modificari corpus

Prin eliminarea de tip Sparse:

Un grafic al co-aparitiei destul de inteligibil.

dtm <- dtm[, !colnames(dtm) %in% c("in", "asa", "cand", "poate",
+   "comentariu", "neaprobat", "dupa", "face", "cat", "fara", "ie", "pare",
+   "pana", "ani", "refugiati", "tara", "nato", "spune", "ru", "astia", "vrea",
+   "mi", "toti", "decat", "fata", "isi", "pt", "adica", "intr", "urma", "dup",
+   "niste", "ori", "inca", "spus", "tine", "as", "vezi", "articol", "doi",
+   "imi", "ni", "ta", "ul", "tii", "at", "ati", "iti", "aproape", "cate",
+   "cazul", "dvs", "erau", "il", "ilor", "pina", "ptr", "catre", "doua", "etc",
+    "iei", "trei", "parca", "pun", "caci", "cind", "for")]

In principiu am matricea este redusa prin:
m<-as.matrix(removeSparseTerms(myDtm, .99))
In rest ramane la fel :)
dict <- termsDictionary(dtm, "occurrences")
attr(dict, "title") <- "Terms dictionary sorted by number of occurrences"
dict
dissDtm2 <- rollup(dtm, 1, meta(corpus, "like.uri"))
diss <- corpusDissimilarity(dtm, dissDtm2)
rm(dissDtm2)
attr(diss, "title") <- "Documents by like.uri dissimilarity table"
diss
absVarFreqs <- table(meta(corpus, "like.uri"), dnn="like.uri")
varFreqs <- prop.table(absVarFreqs) * 100
barchart(varFreqs, xlab="% of documents",
main="Distribution of documents by like.uri", auto.key=TRUE)
varFreqs <- addmargins(varFreqs)
attr(varFreqs, "title") <- "Distribution of documents by like.uri (%)"
varFreqs
myDtm <- TermDocumentMatrix(corpus, control = list(minWordLength = 1,stopwords = TRUE))
inspect(myDtm[266:270,31:40])
m <- as.matrix(myDtm)
m<-as.matrix(removeSparseTerms(myDtm, .99))
v <- sort(rowSums(m), decreasing=TRUE)
myNames <- names(v)
d <- data.frame(word=myNames, freq=v)
wordcloud(d$word, d$freq, min.freq=33)
termDocMatrix <- as.matrix(myDtm)
termDocMatrix <- m
termDocMatrix[termDocMatrix>=1] <- 1
termDocMatrix[5:10,1:20]
termMatrix <- termDocMatrix %*% t(termDocMatrix)
# inspect terms numbered 5 to 10
termMatrix[5:10,5:10]
library(igraph)
# build a graph from the above matrix
g <- graph.adjacency(termMatrix, weighted=T, mode = "undirected")
# remove loops
g <- simplify(g)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
# set seed to make the layout reproducible
set.seed(3952)
layout1 <- layout.fruchterman.reingold(g)
plot(g, layout=layout1)
write.graph(g, "e:/siriasparsity099.graphml", format=c( "graphml"))

Friday, April 8, 2016

Coocurences graph negative comments only

# Prefer fixed to scientific notation
options(scipen=5)
# Print numbers with two significant digits
options(digits=2)
options(R2HTML.format.digits=2)
# Set a nice color palette for plots
lattice.options(default.theme=latticeExtra::custom.theme(symbol=RColorBrewer::brewer.pal(8,
"Set1")[c(2:1, 3:5, 7:9)], fill=RColorBrewer::brewer.pal(8, "Set1")[c(2:1,
3:5, 7:9)], region=RColorBrewer::brewer.pal(n=11, name="Spectral")))
rm(lengths)
library(RODBC)
channel <-
odbcConnectExcel2007("C:/Users/CCCC/Documents/DateKnime/adevarulCOMENTARII_Siriamartie21.xlsx")
corpusDataset <- sqlQuery(channel=channel, "select * from [Sheet1$]")
odbcCloseAll()
corpus <- Corpus(DataframeSource(corpusDataset["TextComentariu"]),
readerControl=list(language="ro"))
corpusVars <- corpusDataset[!names(corpusDataset) == "TextComentariu"]
corpusVars <- data.frame(var1=factor(rep("", length(corpus))),
row.names=names(corpus))
activeDataSet("corpusVars")
setCorpusVariables()
dtmCorpus <- corpus
dtmCorpus <- tm_map(dtmCorpus, content_transformer(tolower))
dtmCorpus <- tm_map(dtmCorpus, content_transformer(function(x)
gsub("(['?\n<U+202F><U+2009>]|[[:punct:]]|[[:space:]]|[[:cntrl:]])+", " ",
x)))
dtmCorpus <- tm_map(dtmCorpus, removeNumbers)
dtm <- DocumentTermMatrix(dtmCorpus, control=list(tolower=FALSE,
wordLengths=c(2, Inf)))
rm(dtmCorpus)
dictionary <- data.frame(row.names=colnames(dtm),
"Occurrences"=col_sums(dtm), "Stopword"=ifelse(colnames(dtm) %in%
stopwords("ro"), "Stopword", ""), stringsAsFactors=FALSE)
dtm <- dtm[, !colnames(dtm) %in% stopwords("ro")]
attr(dtm, "dictionary") <- dictionary
rm(dictionary)
meta(corpus, type="corpus", tag="language") <- attr(dtm, "language") <- "ro"
meta(corpus, type="corpus", tag="processing") <- attr(dtm, "processing") <-
c(lowercase=TRUE, punctuation=TRUE, digits=TRUE, stopwords=TRUE,
stemming=FALSE, customStemming=FALSE, twitter=FALSE, removeHashtags=NA,
removeNames=NA)
corpus
dtm
corpus <- tm_map(corpus, removeWords, "romanian")
stopwords(kind = "ro")
myDtm <- TermDocumentMatrix(corpus, control = list(minWordLength = 1,stopwords = TRUE))
inspect(myDtm[266:270,31:40])
library(wordcloud)
m <- as.matrix(myDtm)
v <- sort(rowSums(m), decreasing=TRUE)
myNames <- names(v)
d <- data.frame(word=myNames, freq=v)
wordcloud(d$word, d$freq, min.freq=33)
termDocMatrix <- as.matrix(myDtm)
termDocMatrix[termDocMatrix>=1] <- 1
termDocMatrix[5:10,1:20]
termMatrix <- termDocMatrix %*% t(termDocMatrix)
# inspect terms numbered 5 to 10
termMatrix[5:10,5:10]
write.table(termDocMatrix, file = "e:/matrix", sep = " ")
rm(corpus, corpusVars, dtm, lengths)
rm(lengths)
library(RODBC)
channel <-
odbcConnectExcel2007("C:/Users/cristian.chirita/Documents/DateKnime/adevarulCOMENTARII_Siriamartie21.xlsx")
corpusDataset <- sqlQuery(channel=channel, "select * from [Sheet1$]")
odbcCloseAll()
corpus <- Corpus(DataframeSource(corpusDataset["TextComentariu"]),
readerControl=list(language="ro"))
corpusVars <- corpusDataset[!names(corpusDataset) == "TextComentariu"]
corpusVars <- corpusVars[c("nume articol", "rankcomentator", "Autor",
"like-uri")]
activeDataSet("corpusVars")
names(corpusVars) <- make.names(names(corpusVars))
setCorpusVariables()
dtmCorpus <- corpus
dtmCorpus <- tm_map(dtmCorpus, content_transformer(tolower))
dtmCorpus <- tm_map(dtmCorpus, content_transformer(function(x)
gsub("(['?\n<U+202F><U+2009>]|[[:punct:]]|[[:space:]]|[[:cntrl:]])+", " ",
x)))
dtmCorpus <- tm_map(dtmCorpus, removeNumbers)
dtm <- DocumentTermMatrix(dtmCorpus, control=list(tolower=FALSE,
wordLengths=c(2, Inf)))
rm(dtmCorpus)
dictionary <- data.frame(row.names=colnames(dtm),
"Occurrences"=col_sums(dtm), "Stopword"=ifelse(colnames(dtm) %in%
stopwords("ro"), "Stopword", ""), stringsAsFactors=FALSE)
dtm <- dtm[, !colnames(dtm) %in% stopwords("ro")]
attr(dtm, "dictionary") <- dictionary
rm(dictionary)
meta(corpus, type="corpus", tag="language") <- attr(dtm, "language") <- "ro"
meta(corpus, type="corpus", tag="processing") <- attr(dtm, "processing") <-
c(lowercase=TRUE, punctuation=TRUE, digits=TRUE, stopwords=TRUE,
stemming=FALSE, customStemming=FALSE, twitter=FALSE, removeHashtags=NA,
removeNames=NA)
corpus
dtm
keep <- meta(corpus, "like.uri")[[1]] %in% c("-2 (6 voturi)",
"-4 (6 voturi)", "-4 (4 voturi)", "-1 (5 voturi)", "-1 (7 voturi)",
"-1 (1 voturi)", "-2 (4 voturi)", "-6 (8 voturi)", "-4 (8 voturi)",
"-3 (3 voturi)", "-5 (13 voturi)", "-6 (14 voturi)", "-4 (12 voturi)",
"-3 (11 voturi)", "-2 (2 voturi)", "-3 (15 voturi)", "-5 (15 voturi)",
"-1 (11 voturi)", "-5 (11 voturi)", "-3 (7 voturi)", "-2 (18 voturi)",
"-1 (3 voturi)", "-3 (9 voturi)", "-3 (5 voturi)", "0 (16 voturi)",
"-6 (10 voturi)", "-6 (18 voturi)", "-10 (18 voturi)", "-14 (16 voturi)",
"-11 (15 voturi)", "-12 (14 voturi)", "-2 (14 voturi)", "-4 (10 voturi)",
"-1 (9 voturi)", "-5 (21 voturi)", "-4 (14 voturi)", "-1 (13 voturi)",
"-2 (10 voturi)", "-5 (7 voturi)", "-5 (5 voturi)", "-6 (6 voturi)",
"-7 (25 voturi)", "-13 (27 voturi)", "-2 (16 voturi)", "-5 (17 voturi)",
"-7 (9 voturi)", "-7 (7 voturi)", "-12 (52 voturi)", "-1 (29 voturi)",
"-5 (31 voturi)", "-5 (41 voturi)", "-1 (35 voturi)", "-2 (34 voturi)",
"-5 (37 voturi)", "-6 (28 voturi)", "-6 (36 voturi)", "-9 (37 voturi)",
"-8 (24 voturi)", "-2 (8 voturi)", "-5 (9 voturi)", "-7 (11 voturi)",
"-8 (8 voturi)", "-2 (12 voturi)", "-8 (36 voturi)", "-3 (29 voturi)",
"-8 (28 voturi)", "-1 (15 voturi)", "-13 (23 voturi)", "-12 (18 voturi)",
"-10 (14 voturi)", "-10 (12 voturi)", "-6 (12 voturi)", "-8 (18 voturi)",
"-6 (44 voturi)", "-17 (57 voturi)", "-11 (57 voturi)", "-3 (21 voturi)",
"-4 (20 voturi)", "-7 (19 voturi)", "-8 (14 voturi)", "-3 (19 voturi)",
"-4 (36 voturi)", "-8 (26 voturi)", "-1 (17 voturi)", "-6 (16 voturi)",
"-8 (10 voturi)", "-9 (13 voturi)", "-9 (9 voturi)", "-4 (22 voturi)",
"-2 (20 voturi)", "-2 (22 voturi)", "-6 (24 voturi)", "-7 (15 voturi)",
"-11 (25 voturi)", "-7 (13 voturi)", "-5 (19 voturi)", "-4 (16 voturi)",
"-15 (29 voturi)", "-3 (17 voturi)", "-8 (12 voturi)", "-1 (21 voturi)",
"-5 (23 voturi)", "-1 (25 voturi)", "-16 (20 voturi)", "-9 (11 voturi)",
"-16 (16 voturi)", "-3 (13 voturi)", "-5 (29 voturi)", "-10 (20 voturi)",
"-8 (20 voturi)", "-9 (17 voturi)", "-8 (30 voturi)", "-2 (38 voturi)",
"-10 (28 voturi)", "-1 (53 voturi)", "-1 (43 voturi)", "-3 (47 voturi)",
"-4 (40 voturi)", "-8 (34 voturi)", "-7 (49 voturi)", "-4 (26 voturi)",
"-7 (23 voturi)", "-24 (30 voturi)", "-3 (27 voturi)", "-15 (23 voturi)",
"-12 (12 voturi)", "-17 (19 voturi)", "-31 (43 voturi)", "-16 (30 voturi)",
"-17 (27 voturi)", "-14 (26 voturi)")
origCorpus <- corpus
corpus <- corpus[keep]
origDtm <- dtm
dtmAttr <- attributes(dtm)
origDictionary <- attr(dtm, "dictionary")
dtm <- dtm[keep,]
dictionary <- data.frame(row.names=colnames(dtm),
"Occurrences"=col_sums(dtm), "Stopword"=ifelse(colnames(dtm) %in%
stopwords("ro"), "Stopword", ""), stringsAsFactors=FALSE)
dtm <- dtm[, !colnames(dtm) %in% stopwords("ro")]
attr(dtm, "dictionary") <- dictionary
rm(dictionary)
attr(dtm, "language") <- dtmAttr$lang
attr(dtm, "processing") <- dtmAttr$processing
rm(dtmAttr, origDictionary)
corpusVars <- corpusVars[keep,, drop=FALSE]
rm(list=c("keep", "lengths"))
corpus
dtm
library(relimp, pos=23)
showData(corpusVars, placement='-20+200', font=getRcmdr('logFont'),
maxwidth=80, maxheight=30)
editDataset(corpusVars)
editDataset(corpusVars)
####################################################
corpus <- tm_map(corpus, removeWords, "romanian")
stopwords(kind = "ro")
myDtm <- TermDocumentMatrix(corpus, control = list(minWordLength = 1,stopwords = TRUE))
inspect(myDtm[266:270,31:40])
library(wordcloud)
m <- as.matrix(myDtm)
v <- sort(rowSums(m), decreasing=TRUE)
myNames <- names(v)
d <- data.frame(word=myNames, freq=v)
wordcloud(d$word, d$freq, min.freq=33)
termDocMatrix <- as.matrix(myDtm)
termDocMatrix[termDocMatrix>=1] <- 1
termDocMatrix[5:10,1:20]
termMatrix <- termDocMatrix %*% t(termDocMatrix)
# inspect terms numbered 5 to 10
termMatrix[5:10,5:10]
library(igraph)
# build a graph from the above matrix
g <- graph.adjacency(termMatrix, weighted=T, mode = "undirected")
# remove loops
g <- simplify(g)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
# set seed to make the layout reproducible
set.seed(3952)
layout1 <- layout.fruchterman.reingold(g)
plot(g, layout=layout1)
write.graph(g, "e:/coocgraph", format=c( "graphml"))

https://rstudio-pubs-static.s3.amazonaws.com/31867_8236987cf0a8444e962ccd2aec46d9c3.html#clustering-by-term-similarity

Graficul rezultat contine nu este filtrat din punct de vedere gramatical motiv pentru care contine prea mult zgomot.

Wordcloud romana

Thursday, April 7, 2016

Matricea Apritiei Simultane

http://www.rdatamining.com/examples/social-network-analysis
http://www.rdatamining.com/examples/text-mining

# Prefer fixed to scientific notation
options(scipen=5)
# Print numbers with two significant digits
options(digits=2)
options(R2HTML.format.digits=2)
# Set a nice color palette for plots
lattice.options(default.theme=latticeExtra::custom.theme(symbol=RColorBrewer::brewer.pal(8,
"Set1")[c(2:1, 3:5, 7:9)], fill=RColorBrewer::brewer.pal(8, "Set1")[c(2:1,
3:5, 7:9)], region=RColorBrewer::brewer.pal(n=11, name="Spectral")))
rm(lengths)
library(RODBC)
channel <-
odbcConnectExcel("C:/Users/cristian.chirita/Documents/DateKnime/adevarulCOMENTARII_Siriamartie21.xls")
corpusDataset <- sqlQuery(channel=channel, "select * from [Sheet1$]")
odbcCloseAll()
corpus <- Corpus(DataframeSource(corpusDataset["TextComentariu"]),
readerControl=list(language="ro"))
corpusVars <- corpusDataset[!names(corpusDataset) == "TextComentariu"]
activeDataSet("corpusVars")
names(corpusVars) <- make.names(names(corpusVars))
setCorpusVariables()
dtmCorpus <- corpus
dtmCorpus <- tm_map(dtmCorpus, content_transformer(tolower))
dtmCorpus <- tm_map(dtmCorpus, content_transformer(function(x)
gsub("(['?\n<U+202F><U+2009>]|[[:punct:]]|[[:space:]]|[[:cntrl:]])+", " ",
x)))
dtmCorpus <- tm_map(dtmCorpus, removeNumbers)
dtm <- DocumentTermMatrix(dtmCorpus, control=list(tolower=TRUE,
wordLengths=c(2, Inf)))
rm(dtmCorpus)
dictionary <- data.frame(row.names=colnames(dtm),
"Occurrences"=col_sums(dtm), "Stopword"=ifelse(colnames(dtm) %in%
stopwords("ro"), "Stopword", ""), stringsAsFactors=FALSE)
dtm <- dtm[, !colnames(dtm) %in% stopwords("ro")]
attr(dtm, "dictionary") <- dictionary
rm(dictionary)
meta(corpus, type="corpus", tag="language") <- attr(dtm, "language") <- "ro"
meta(corpus, type="corpus", tag="processing") <- attr(dtm, "processing") <-
c(lowercase=TRUE, punctuation=TRUE, digits=TRUE, stopwords=TRUE,
stemming=FALSE, customStemming=FALSE, twitter=FALSE, removeHashtags=NA,
removeNames=NA)
corpus
dtm
myDtm <- TermDocumentMatrix(dtm, control = list(minWordLength = 1))
inspect(myDtm[100:170,10:20])
inspect(dtm[100:170,10:20])
findFreqTerms(dtm, lowfreq=10)
findAssocs(dtm, 'kurzi', 0.30)
library(wordcloud)
m <- as.matrix(dtm)
v <- sort(rowSums(m), decreasing=TRUE)
myNames <- names(v)
k <- which(names(v)=="siria")
myNames[k] <- "Siria"
d <- data.frame(word=myNames, freq=v)
wordcloud(d$word, d$freq, min.freq=3)
myDtm <- TermDocumentMatrix(corpus, control = list(minWordLength = 1))
inspect(myDtm[100:170,10:20])
myCorpus <- tm_map(corpus, tolower)
# remove punctuation
myCorpus <- tm_map(myCorpus, removePunctuation)
# remove numbers
myCorpus <- tm_map(myCorpus, removeNumbers)
# remove stopwords
# keep "r" by removing it from stopwords
myStopwords <- c(stopwords('english'), "available", "via")
idx <- which(myStopwords == "r")
myStopwords <- myStopwords[-idx]
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)
myCorpus
myDtm <- TermDocumentMatrix(myCorpus, control = list(minWordLength = 1))
inspect(myDtm[4266:4270,4331:4340])
findFreqTerms(myDtm, lowfreq=20)
findAssocs(myDtm, 'victime', 0.30)
termDocMatrix <- as.matrix(myDtm)

Obs:
Fisier prea mare :)

Tuesday, April 5, 2016

Resurse Statistice R

http://quantlet.de/index.php?p=info

https://eight2late.wordpress.com/2015/12/02/a-gentle-introduction-to-network-graphs-using-r-and-gephi/

autori<-corpusVars$Autor In loc de filename
autori
filekey <- cbind(rownames(m),autori)

# Prefer fixed to scientific notation
options(scipen=5)
# Print numbers with two significant digits
options(digits=2)
options(R2HTML.format.digits=2)
# Set a nice color palette for plots
lattice.options(default.theme=latticeExtra::custom.theme(symbol=RColorBrewer::brewer.pal(8,
"Set1")[c(2:1, 3:5, 7:9)], fill=RColorBrewer::brewer.pal(8, "Set1")[c(2:1,
3:5, 7:9)], region=RColorBrewer::brewer.pal(n=11, name="Spectral")))
rm(lengths)
library(RODBC)
channel <-
odbcConnectExcel("C:/Users/c.c/Documents/DateKnime/adevarulCOMENTARII_Siriamartie21.xls")
corpusDataset <- sqlQuery(channel=channel, "select * from [Sheet1$]")
odbcCloseAll()
corpus <- Corpus(DataframeSource(corpusDataset["TextComentariu"]),
readerControl=list(language="ro"))
corpusVars <- corpusDataset[!names(corpusDataset) == "TextComentariu"]
activeDataSet("corpusVars")
names(corpusVars) <- make.names(names(corpusVars))
setCorpusVariables()
dtmCorpus <- corpus
dtmCorpus <- tm_map(dtmCorpus, content_transformer(tolower))
dtmCorpus <- tm_map(dtmCorpus, content_transformer(function(x)
gsub("(['?\n<U+202F><U+2009>]|[[:punct:]]|[[:space:]]|[[:cntrl:]])+", " ",
x)))
dtmCorpus <- tm_map(dtmCorpus, removeNumbers)
dtm <- DocumentTermMatrix(dtmCorpus, control=list(tolower=FALSE,
wordLengths=c(2, Inf)))
rm(dtmCorpus)
dictionary <- data.frame(row.names=colnames(dtm),
"Occurrences"=col_sums(dtm), "Stopword"=ifelse(colnames(dtm) %in%
stopwords("ro"), "Stopword", ""), stringsAsFactors=FALSE)
dtm <- dtm[, !colnames(dtm) %in% stopwords("ro")]
attr(dtm, "dictionary") <- dictionary
rm(dictionary)
meta(corpus, type="corpus", tag="language") <- attr(dtm, "language") <- "ro"
meta(corpus, type="corpus", tag="processing") <- attr(dtm, "processing") <-
c(lowercase=TRUE, punctuation=TRUE, digits=TRUE, stopwords=TRUE,
stemming=FALSE, customStemming=FALSE, twitter=FALSE, removeHashtags=NA,
removeNames=NA)
corpus
dtm
m<-as.matrix(dtm)
write.csv(m,file="E:/dtmEight2Late.csv")
filekey <- cbind(rownames(m),filenames)
write.csv(filekey,"E:/filekey.csv")
library(relimp, pos=21)
showData(corpusVars, placement='-20+200', font=getRcmdr('logFont'),
maxwidth=80, maxheight=30)
autori<-corpusVars$Autor
autori
filekey <- cbind(rownames(m),autori)
write.csv(filekey,"e:/filekey.csv")
cosineSim <- function(x){
as.dist(x%*%t(x)/(sqrt(rowSums(x^2) %*% t(rowSums(x^2)))))
}
cs <- cosineSim(m)
write.csv(as.matrix(cs),file="E:/csEight2Late.csv")
cs[cs < max(cs)/2] <- 0
cs <- round(cs,3)
write.csv(as.matrix(cs),file="e:AdjacencyMatrix.csv")

dat=read.csv(file.choose(),header=TRUE,row.names=1,check.names=FALSE)
m1=as.matrix(dat) # coerces the data set as a matrix
g=graph.adjacency(m1,mode="undirected",weighted=NULL) # this will create an 'igraph object'
g
http://www.slideshare.net/rdatamining/text-mining-with-r-an-analysis-of-twitter-data

http://www.n3labs.com/pdf/rank-co-occur.pdf

https://cran.r-project.org/web/packages/cooccur/cooccur.pdf

https://eight2late.wordpress.com/2015/09/29/a-gentle-introduction-to-topic-modeling-using-r/

http://www.rdatamining.com/examples/text-mining

http://www.linguisticdna.org/2015/09/10/proximity-data-ii-co-occurrence-and-distance-measurements/

https://eight2late.wordpress.com/2015/05/27/a-gentle-introduction-to-text-mining-using-r/

http://faculty.washington.edu/jwilker/CAP/R_Sample_Script.R

Monday, March 28, 2016

Legaturi utile

Instrumente pentru etichetarea POS.
http://www.martinschweinberger.de/blog/part-of-speech-tagging-with-r/

Wednesday, March 16, 2016

POS Romanian

Proiect Romanian POS Tagger
In acest moment nu exista un instrument ușor accesibil pentru etichetare POS pt. limba romana.
Desi atat OPEN NLP sau TreeTagger ar permite acest lucru.

Teoretic utilizând Tree Tagger se poate obține un istrument pentru Romană.
Sunt necesare trei fișiere:
1. Lexiconul

Training
--------

Training is done with the *train-tree-tagger* program. If the program is 
called without arguments, the following output is printed:

USAGE: train-tree-tagger <lexicon> <open class file> <infile> <outfile> 
       {-cl <context length>} {-dtg <min. decision tree gain>}
       {-ecw <eq. class weight>} {-atg <affix tree gain>} {-st <sent. tag>}

Description of the command line arguments:
* <lexicon>: name of a file which contains the fullform lexicon. Each line 
  of the lexicon corresponds to one word form and contains the word form 
  itself followed by a Tab character and a sequence of tag-lemma pairs.
  The tags and lemmata are separated by whitespace.

Example:
aback RB aback
abacuses NNS abacus
abandon VB abandon VBP abandon
abandoned JJ abandoned VBD abandon VBN abandon
abandoning VBG abandon

  Important: Ordinal and cardinal numbers which consist of digits
  should not be included in the lexicon. Otherwise, the tagger will
  not be able to learn how to tag numbers which are not listed in the
  lexicon. Numbers with unusual tags should be added to the lexicon,
  however.

  Remark: The tagger doesn't need the lemmata for tagging. If
  you do not have the lemma information or if you do not plan to
  annotate corpora with lemmas, you can replace the lemma with a dummy
  value, e.g. "-".

* <open class file>: name of a file which contains a list of open class tags
  i.e. possible tags of unknown word forms. This information is needed to
  estimate likely tags of unknown words. This file would typically contain
  adverb, adjective, noun, proper name and perhaps verb tags, but not
  prepositions, determiners, pronouns or numbers.
* <input file>: name of a file which contains tagged training data. The data
  must be in one-word-per-line format. This means that each line contains 
  one token and one tag in that order separated by a tabulator. 
  Punctuation marks are considered as tokens and must have been tagged as well.

Cum din varii motive un corpus adnotat nu a putut fi utilizat o varianta este utilizarea romanului adnotat 1984

Example: Pierre NP Vinken NP , , 61 CD years NNS * <output file>: name of the file in which the resulting tagger parameters are stored.

Presupune utilizarea unui corpus gata etichetat.
Sunt disponibile o serie de materiale ce pot fi utilizate pentru acest lucru:
1.1Lexicon
Free
https://www.clarin.si/repository/xmlui/handle/11356/1041

MULTEXT-East free lexicons 4.0 BIBTEXCMDI
Credit:
Erjavec, Tomaž; Bruda, Stefan; Derzhanski, Ivan; Dimitrova, Ludmila; Garabík, Radovan; Holozan, Peter; Ide, Nancy; Kaalep, Heiki-Jaan; Kotsyba, Natalia; Oravecz, Csaba; Petkevič, Vladimír; Priest-Dorman, Greg; Shevchenko, Igor; Simov, Kiril; Sinapova, Lydia; Steenwijk, Han; Tihanyi, Laszlo; Tufiş, Dan and Véronis, Jean, 2010, MULTEXT-East free lexicons 4.0, Slovenian language resource repository CLARIN.SI, http://hdl.handle.net/11356/1041.

Forma fisiere lexic free:

The MULTEXT-East morphosyntactic lexicons have a simple structure, where each line is a lexical entry with three tab-separated fields: (1) the word-form, the inflected form of the word; (2) the lemma, the base-form of the word; (3) the MSD, the morphosyntactic description of the word-form, i.e., its fine-grained PoS tag, as defined in the MULTEXT-East morphosyntactic specifications.

Exemplu:

ţurţuri ţurţur Ncmp-n
ţurţurii ţurţur Ncmpry
ţurţurilor ţurţur Ncmpoy
ţurţurul ţurţur Ncmsry
ţurţurului ţurţur Ncmsoy
ţuşti ţuşti I
ţânc ţânc Ncms-n
ţânci ţânc Ncmp-n
ţâncii ţânc Ncmpry
ţâncilor ţânc Ncmpoy
ţâncul ţânc Ncmsry

Autorii fisierului romanesc sunt:
Romanian:
S.Bruda, C.Diaconu, L.Diaconu, and D.Tufis
Center for Research in Machine Learning,
Natural Language Processing and Conceptual Modelling
Romanian Academy of Sciences

1.2 Document Adnotat 1984 George Orwell
https://www.clarin.si/repository/xmlui/browse?value=Romanian&type=language

2. Treetagger http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/
Odată ce exista un tagger (fie via Maxtent Open NLP fie TreeTagger ) se pot utiliza instrumente tipice pentru analiza.

TreeTagger este free in scop necomercial.
In order to use the TreeTagger commercially, you need to obtain a commercial license (see contact address below)!

3.koRpus https://cran.r-project.org/web/packages/koRpus/index.html
Mai multe detalii aici:http://reaktanz.de/?c=hacking&s=koRpus
Exista si varinate comerciale pt. Lexic.

4. Romanian Treebank http://universal.elra.info/search.php
Un tutorial pentru antrenarea unui TreeBank
4.1Exemple:
Exemplu corpus rusesc cu utilizare Multiest:
Designing and Evaluating a Russian Tagsethttps://msuweb.montclair.edu/~feldmana/publications/2008-lrec-mocky.pdf
4.2 Eforturi academice si comerciale:

RACAI-RoTb: nucleu de corpus de limbă română adnotat sintactic cu relaţii de dependenţă

http://rochi.utcluj.ro/rrioc/articole/RRIOC-8-2-Irimia.pdf

Parser de dependenţe pentru limba română realizat

pe baza parserelor pentru alte limbi romanicehttp://rochi.utcluj.ro/rrioc/articole/RRIOC-7-1-Florea.pdf

O interfata Windows pentruTreeBank
5. Train the Romanian TreeBank http://www.smo.uhi.ac.uk/~oduibhin/oideasra/interfaces/winttinterface.htm 5.1 Trained corpus 5.2

1. Lexicon Util fisierul
2. Corpus
3. Fisier de parsare. http://www.racai.ro/external/static/awde/tufiscor11.html
Penetru a avea un fisier de parsare corespunzator acest atrebuie adaptat pentru descrierea din Lexicon. Pentru Romana descrierea este aici:
http://nl.ijs.si/ME/Vault/V3/msd/html/msd.html#SECTION05200000000000000000
fisierul cu descrierea MSD este disponibil aici:

MULTEXT-East
                    Morphosyntactic Specifications
                              Version 4

http://nl.ijs.si/ME/V4/msd/tables/

Tabel in Romana:
http://nl.ijs.si/ME/V4/msd/tables/msd-human-ro.tbl
In format xml:
http://nl.ijs.si/ME/V4/msd/tables/msd-canon-ro.tbl

Documentatie completa:
http://nl.ijs.si/ME/V4/msd/html/msd-ro.html
Ontologie:
http://nl.ijs.si/ME/owl/msd-ro.owl
Alternative:
https://www.sketchengine.co.uk/romanian-tagset/

Rezultat:
Tree Tagger are fișier de parametrizare in limba româna.