Tuesday, April 12, 2016

Modificari corpus

Prin eliminarea de tip Sparse:
 
Un grafic al co-aparitiei destul de inteligibil.


# Drop the columns of `dtm` whose names appear in this hand-picked list of
# uninformative words. The list is matched against colnames(dtm) exactly.
# The leading "+" console continuation prompts from the original paste were
# removed: in a script they parse as unary plus applied to a string and make
# the statement fail.
dtm <- dtm[, !colnames(dtm) %in% c(
  "in", "asa", "cand", "poate", "comentariu", "neaprobat", "dupa", "face",
  "cat", "fara", "ie", "pare", "pana", "ani", "refugiati", "tara", "nato",
  "spune", "ru", "astia", "vrea", "mi", "toti", "decat", "fata", "isi", "pt",
  "adica", "intr", "urma", "dup", "niste", "ori", "inca", "spus", "tine",
  "as", "vezi", "articol", "doi", "imi", "ni", "ta", "ul", "tii", "at", "ati",
  "iti", "aproape", "cate", "cazul", "dvs", "erau", "il", "ilor", "pina",
  "ptr", "catre", "doua", "etc", "iei", "trei", "parca", "pun", "caci",
  "cind", "for"
)]

In principiu, matricea este redusa prin:
# Apply removeSparseTerms() to `myDtm` with a sparsity threshold of 0.99 and
# store the result as a dense base matrix in `m`.
m <- as.matrix(
  removeSparseTerms(myDtm, sparse = 0.99)
)
In rest ramane la fel :)
# --- Terms dictionary -------------------------------------------------------
# Built from `dtm` with the "occurrences" ordering, tagged with a title
# attribute, then printed.
dict <- termsDictionary(dtm, "occurrences")
attr(dict, "title") <- "Terms dictionary sorted by number of occurrences"
dict

# --- Dissimilarity by like.uri ----------------------------------------------
# Roll `dtm` up along its first margin using the "like.uri" metadata of
# `corpus`, compare it with the original matrix, and discard the temporary.
likeUriDtm <- rollup(dtm, 1, meta(corpus, "like.uri"))
diss <- corpusDissimilarity(dtm, likeUriDtm)
rm(likeUriDtm)
attr(diss, "title") <- "Documents by like.uri dissimilarity table"
diss

# --- Distribution of documents by like.uri ----------------------------------
# Absolute counts per "like.uri" value, converted to percentages.
absVarFreqs <- table(meta(corpus, "like.uri"), dnn = "like.uri")
varFreqs <- 100 * prop.table(absVarFreqs)
barchart(
  varFreqs,
  xlab = "% of documents",
  main = "Distribution of documents by like.uri",
  auto.key = TRUE
)

# Append the margin totals before printing the percentage table.
varFreqs <- addmargins(varFreqs)
attr(varFreqs, "title") <- "Distribution of documents by like.uri (%)"
varFreqs
# Build a term-document matrix from `corpus` (terms in rows, documents in
# columns, as used by rowSums() and the row-wise products below).
# `wordLengths = c(1, Inf)` keeps words of every length. The former
# `minWordLength = 1` option is not a recognised tm control option (it was
# replaced by `wordLengths`), so the default minimum length would have been
# applied instead of the intended 1.
myDtm <- TermDocumentMatrix(
  corpus,
  control = list(wordLengths = c(1, Inf), stopwords = TRUE)
)
inspect(myDtm[266:270, 31:40])

# Reduce the matrix with removeSparseTerms() (threshold 0.99) BEFORE turning
# it into a dense matrix. The original also densified the full matrix first,
# only to overwrite that copy on the next line; that wasted memory for no
# effect and has been removed.
m <- as.matrix(removeSparseTerms(myDtm, 0.99))

# Total count per remaining term, most frequent first, drawn as a word cloud
# restricted to terms with a total of at least 33.
v <- sort(rowSums(m), decreasing = TRUE)
myNames <- names(v)
d <- data.frame(word = myNames, freq = v)
wordcloud(d$word, d$freq, min.freq = 33)

# Turn counts into presence/absence flags (any count >= 1 becomes 1).
termDocMatrix <- m
termDocMatrix[termDocMatrix >= 1] <- 1
termDocMatrix[5:10, 1:20]

# Term-by-term matrix: entry (i, j) is the number of documents in which both
# term i and term j are present. tcrossprod(x) is x %*% t(x).
termMatrix <- tcrossprod(termDocMatrix)
# inspect terms numbered 5 to 10
termMatrix[5:10, 5:10]

library(igraph)
# build a weighted, undirected graph from the above matrix
g <- graph.adjacency(termMatrix, weighted = TRUE, mode = "undirected")
# remove loops
g <- simplify(g)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
# set seed to make the layout reproducible
set.seed(3952)
layout1 <- layout.fruchterman.reingold(g)
plot(g, layout = layout1)
# export the graph as GraphML (hard-coded output path)
write.graph(g, "e:/siriasparsity099.graphml", format = "graphml")

No comments:

Post a Comment