# Read every file found in the source directory (decoded as Windows-1250) into
# a corpus whose reader is told the language is Romanian ("ro").
corpus <- Corpus(
  DirSource("E:/Big Data/texte", encoding = "windows-1250"),
  readerControl = list(language = "ro")
)

# Placeholder variable table: one row per document, a single empty factor.
# It is made the active data set before setCorpusVariables() is called.
corpusVars <- data.frame(
  var1 = factor(character(length(corpus))),
  row.names = names(corpus)
)
activeDataSet("corpusVars")
setCorpusVariables()

# Split the texts (second argument 20 -- presumably the chunk size; confirm
# against the splitTexts() documentation) and flag the corpus as split.
corpus <- splitTexts(corpus, 20)
meta(corpus, type = "corpus", tag = "split") <- TRUE
# Build the document-term matrix from a normalised working copy, so the texts
# stored in `corpus` are left untouched.
dtmCorpus <- corpus
dtmCorpus <- tm_map(dtmCorpus, content_transformer(tolower))

# Collapse every run of apostrophes, punctuation, whitespace and control
# characters into a single space. The typographic apostrophe (U+2019), the
# narrow no-break space (U+202F) and the thin space (U+2009) are written as
# \u escapes: the pattern previously contained the literal text
# "<U+202F><U+2009>", which inside a character class matches the characters
# <, U, +, 2, 0, F, >, 9 instead of the two Unicode spaces.
dtmCorpus <- tm_map(
  dtmCorpus,
  content_transformer(function(x) {
    gsub("(['\u2019\n\u202F\u2009]|[[:punct:]]|[[:space:]]|[[:cntrl:]])+",
         " ", x)
  })
)
dtmCorpus <- tm_map(dtmCorpus, removeNumbers)

# Lower-casing was already applied above, so it is switched off here; terms
# shorter than two characters are excluded.
dtm <- DocumentTermMatrix(
  dtmCorpus,
  control = list(tolower = FALSE, wordLengths = c(2, Inf))
)
rm(dtmCorpus)
# wordStem() comes from SnowballC.
library(SnowballC)

# Lookup table with one row per term of the matrix: its total count over all
# documents, its stem, and a marker telling whether it is a stop word.
dictionary <- data.frame(
  Occurrences = col_sums(dtm),
  Stemmed.Term = wordStem(colnames(dtm), "ro"),
  Stopword = ifelse(colnames(dtm) %in% stopwords("ro"), "Stopword", ""),
  row.names = colnames(dtm),
  stringsAsFactors = FALSE
)

# Remove the stop-word columns, then let rollup() aggregate the remaining
# columns (dimension 2) grouped by their stem.
dtm <- dtm[, !(colnames(dtm) %in% stopwords("ro"))]
dtm <- rollup(dtm, 2, dictionary[colnames(dtm), "Stemmed.Term"])

# The table travels with the matrix as an attribute; the free-standing copy
# is no longer needed.
attr(dtm, "dictionary") <- dictionary
rm(dictionary)
# Store the language code on the matrix and in the corpus-level metadata.
attr(dtm, "language") <- "ro"
meta(corpus, type = "corpus", tag = "language") <- "ro"

# Store the same set of preprocessing flags in both places.
attr(dtm, "processing") <- c(
  lowercase = TRUE,
  punctuation = TRUE,
  digits = TRUE,
  stopwords = TRUE,
  stemming = TRUE,
  customStemming = FALSE,
  twitter = FALSE,
  removeHashtags = NA,
  removeNames = NA
)
meta(corpus, type = "corpus", tag = "processing") <-
  attr(dtm, "processing", exact = TRUE)

# Bare names: at top level these auto-print a summary of each object.
corpus
dtm
#' Plot a correlation network of terms with igraph
#'
#' Correlates the selected term columns across documents, keeps the pairs
#' whose correlation reaches `corThreshold`, builds an undirected graph from
#' them and plots it.
#'
#' @param x A matrix-like object with documents in rows and terms in columns.
#'   An object inheriting from "TermDocumentMatrix" is transposed first.
#' @param terms Names of the term columns to include. Defaults to 20 terms
#'   sampled from `Terms(x)`.
#' @param corThreshold Pairs whose correlation is below this value, or
#'   undefined (`NA`), get no edge.
#' @param weighted If `TRUE` (default) each edge stores the correlation in its
#'   `weight` attribute. Otherwise an unweighted graph with exactly one edge
#'   per retained pair is built.
#' @param diag If `FALSE` (default) the diagonal is cleared so that no term is
#'   linked to itself; if `TRUE` the diagonal is passed on to igraph.
#' @param ... Further arguments forwarded to `plot()`.
#' @return The igraph graph object, invisibly.
graphplot <- function(x,
                      terms = sample(Terms(x), 20),
                      corThreshold = 0.7,
                      weighted = TRUE,
                      diag = FALSE,
                      ...) {
  # igraph is called through its namespace below, so the function also works
  # when the package is installed but not attached.
  if (!requireNamespace("igraph", quietly = TRUE)) {
    stop("Plotting requires package 'igraph'.")
  }

  m <- if (inherits(x, "TermDocumentMatrix")) t(x) else x
  m <- as.matrix(m[, terms])

  # Term-by-term correlations; NA arises for constant columns.
  cors <- cor(m)
  cors[is.na(cors) | cors < corThreshold] <- 0
  if (!isTRUE(diag)) {
    # `diag<-` is looked up as a function, so the logical argument named
    # `diag` does not interfere with this replacement call.
    diag(cors) <- 0
  }

  if (isTRUE(weighted)) {
    adjacency <- cors
    weight_arg <- TRUE
  } else {
    # For an unweighted graph igraph reads the entries as edge counts, so
    # reduce the retained correlations to 0/1 (dimnames are preserved).
    adjacency <- (cors != 0) * 1
    weight_arg <- NULL
  }

  tmgraph <- igraph::graph.adjacency(
    adjacency,
    mode = "undirected",
    weighted = weight_arg,
    diag = isTRUE(diag),
    add.colnames = NULL,
    add.rownames = NA
  )
  plot(tmgraph, ...)
  invisible(tmgraph)
}
library(igraph)

# Terms occurring at least 20 times; the surrounding parentheses also print
# the result.
(freq.terms <- findFreqTerms(dtm, lowfreq = 20))

# Correlation network of the frequent terms. The argument is spelled `terms`
# in full so the call does not depend on partial argument matching.
agraph <- graphplot(dtm, terms = freq.terms, corThreshold = 0.2)

# Export the graph in GraphML format.
write.graph(agraph, "e:/agraphPoliticAdevarul.graphml", format = "graphml")






# Blog page footer captured with the script (not R code): No comments:
# Post a Comment