# Thursday, June 30, 2016
# Guardian comments
# Display settings ----
# Prefer fixed to scientific notation and print numbers with two
# significant digits (the same precision is set for R2HTML output).
options(scipen = 5, digits = 2, R2HTML.format.digits = 2)

# Set a nice color palette for plots.
# "Set1" is requested with 9 colours because the selection below indexes up
# to position 9; requesting only 8 colours leaves an NA as the last entry.
# The selection swaps the first two colours and skips the sixth one.
# local() keeps the temporary colour vector out of the global environment.
local({
  set1_colors <- RColorBrewer::brewer.pal(9, "Set1")[c(2:1, 3:5, 7:9)]
  lattice::lattice.options(
    default.theme = latticeExtra::custom.theme(
      symbol = set1_colors,
      fill = set1_colors,
      region = RColorBrewer::brewer.pal(n = 11, name = "Spectral")
    )
  )
})
# Import the comments data set ----
# Drop objects left over from a previous run. Only names that currently exist
# are removed, so a fresh session does not raise "object not found" warnings.
rm(list = intersect(c("corpus", "corpusVars", "dtm", "lengths"), ls()))

# The data is read from the Excel workbook through ODBC. An earlier read.csv()
# of "guardian comments2.csv" was removed because its result was always
# overwritten by the query below.
library(RODBC)
channel <-
  odbcConnectExcel("C:/Users/cristian.chirita/Documents/guardian comments2.xls")
if (!inherits(channel, "RODBC")) {
  stop("Could not open the Excel workbook through ODBC.", call. = FALSE)
}
corpusDataset <- sqlQuery(channel = channel,
                          "select * from [guardian comments2$]")
# Close only the connection opened here instead of every open ODBC channel.
odbcClose(channel)
# On failure sqlQuery() returns the error messages instead of a data frame.
if (!is.data.frame(corpusDataset)) {
  stop("Query on the Excel workbook failed: ",
       paste(corpusDataset, collapse = "\n"), call. = FALSE)
}
if (!all(c("p", "Autor") %in% names(corpusDataset))) {
  stop("The imported data must contain the columns 'p' and 'Autor'.",
       call. = FALSE)
}

# Column "p" holds the texts; "Autor" is kept as the only corpus variable.
corpus <- Corpus(DataframeSource(corpusDataset["p"]),
                 readerControl = list(language = "en"))
corpusVars <- corpusDataset["Autor"]
# NOTE(review): activeDataSet() and setCorpusVariables() are not defined in
# this file; presumably they come from R Commander / RcmdrPlugin.temis, which
# must already be loaded -- confirm.
activeDataSet("corpusVars")
setCorpusVariables()
# Build the document-term matrix ----
# Work on a copy so the original corpus keeps its unprocessed texts.
dtmCorpus <- tm_map(corpus, content_transformer(tolower))
# Replace every run of punctuation, whitespace and control characters with a
# single space. The first character class lists the straight apostrophe, the
# newline and three Unicode characters written as escapes: U+2019 (typographic
# apostrophe), U+202F (narrow no-break space) and U+2009 (thin space).
# The previous pattern contained the printed form of these code points (the
# text "U+202F" and "U+2009" in angle brackets), which made the class match
# the letters U and F and the digits 2, 0 and 9 instead of the characters.
# NOTE(review): the "?" in the previous class is assumed to be a mangled
# U+2019; a literal "?" is still matched through [[:punct:]].
dtmCorpus <- tm_map(
  dtmCorpus,
  content_transformer(function(x) {
    gsub("(['\u2019\n\u202f\u2009]|[[:punct:]]|[[:space:]]|[[:cntrl:]])+",
         " ", x)
  })
)
dtmCorpus <- tm_map(dtmCorpus, removeNumbers)
# Texts are already lower case; keep terms of two characters or more.
dtm <- DocumentTermMatrix(dtmCorpus,
                          control = list(tolower = FALSE,
                                         wordLengths = c(2, Inf)))
rm(dtmCorpus)

# Record every term with its number of occurrences and a stopword flag
# before the stopwords are dropped from the matrix.
dictionary <- data.frame(
  row.names = colnames(dtm),
  "Occurrences" = slam::col_sums(dtm),
  "Stopword" = ifelse(colnames(dtm) %in% stopwords("en"), "Stopword", ""),
  stringsAsFactors = FALSE
)
dtm <- dtm[, !rownames(dictionary) %in% stopwords("en")]
attr(dtm, "dictionary") <- dictionary
rm(dictionary)

# Store the language and the processing steps on both the corpus and the
# matrix.
meta(corpus, type = "corpus", tag = "language") <-
  attr(dtm, "language") <- "en"
meta(corpus, type = "corpus", tag = "processing") <-
  attr(dtm, "processing") <- c(lowercase = TRUE, punctuation = TRUE,
                               digits = TRUE, stopwords = TRUE,
                               stemming = FALSE, customStemming = FALSE,
                               twitter = FALSE, removeHashtags = NA,
                               removeNames = NA)
corpus
dtm
# Term frequencies ----
library(topicmodels)
# lda <- LDA(dtm, k = 10) # fit a topic model with 10 topics (disabled)
library(qdap)
# Turn the document-term matrix back into a corpus and build a
# term-document matrix from it.
mytdm <- as.Corpus(dtm)
my_tdm <- TermDocumentMatrix(mytdm)
inspect(my_tdm)
# Terms that occur at least 10 times (printed because of the parentheses).
(freq.terms <- findFreqTerms(my_tdm, lowfreq = 10))
# Sum the counts per term directly on the sparse matrix; converting it to a
# dense matrix first would allocate terms x documents cells for no benefit.
term.freq <- slam::row_sums(my_tdm)
# Only terms with at least 25 occurrences are plotted.
term.freq <- term.freq[term.freq >= 25]
df <- data.frame(term = names(term.freq), freq = term.freq)
library(ggplot2)
# Horizontal bar chart of the term counts.
ggplot(df, aes(x = term, y = freq)) +
  geom_bar(stat = "identity") +
  xlab("Terms") +
  ylab("Count") +
  coord_flip()
#' Plot a term correlation network with igraph
#'
#' Computes the correlations between the selected terms of a document-term
#' matrix, drops correlations below a threshold and plots the remaining ones
#' as an undirected graph.
#'
#' @param x A document-term matrix (documents in rows), a
#'   `TermDocumentMatrix` (transposed internally), or any matrix-like object
#'   with one column per term.
#' @param terms Names of the terms (columns) to include. By default 20 terms
#'   are sampled from `Terms(x)`.
#' @param corThreshold Correlations below this value are set to 0 and so do
#'   not produce an edge.
#' @param weighted If `TRUE`, the remaining correlations are stored as edge
#'   weights. If `FALSE`, an unweighted graph with one edge per remaining
#'   correlation is built.
#' @param diag If `FALSE`, the diagonal of the correlation matrix is zeroed so
#'   that no self-loops are created.
#' @param ... Further arguments passed on to `plot()` for the graph.
#' @return The igraph graph object, invisibly.
graphplot <-
  function(x,
           terms = sample(Terms(x), 20),
           corThreshold = 0.7,
           weighted = TRUE,
           diag = FALSE,
           ...) {
  if (!requireNamespace("igraph", quietly = TRUE)) {
    stop("Plotting requires package 'igraph'.")
  }
  doc_term <- if (inherits(x, "TermDocumentMatrix")) t(x) else x
  doc_term <- as.matrix(doc_term[, terms])
  cor_mat <- cor(doc_term)
  # Undefined correlations (e.g. constant columns) count as "no link".
  cor_mat[is.na(cor_mat)] <- 0
  cor_mat[cor_mat < corThreshold] <- 0
  if (!isTRUE(diag)) {
    # `diag<-` is looked up as a function, so the logical argument named
    # `diag` does not interfere with this assignment.
    diag(cor_mat) <- 0
  }
  if (!isTRUE(weighted)) {
    # Without weights the matrix entries are edge counts, so reduce every
    # remaining correlation to a single edge.
    cor_mat <- (cor_mat != 0) * 1
  }
  tmgraph <- igraph::graph_from_adjacency_matrix(
    cor_mat,
    mode = "undirected",
    weighted = if (isTRUE(weighted)) TRUE else NULL,
    diag = isTRUE(diag),
    add.colnames = NULL,
    add.rownames = NA
  )
  plot(tmgraph, ...)
  invisible(tmgraph)
}
# Correlation network of the frequent terms ----
library(igraph)
# Terms that occur at least 20 times in the document-term matrix (printed
# because of the parentheses).
(freq.terms <- findFreqTerms(dtm, lowfreq = 20))
if (length(freq.terms) == 0) {
  stop("No term occurs at least 20 times; nothing to plot.", call. = FALSE)
}
# The argument is spelled out as `terms`; `term` only worked through partial
# argument matching.
agraph <- graphplot(dtm, terms = freq.terms, corThreshold = 0.2)
# Save the graph in GraphML format.
write.graph(agraph, "e:/agraphGuardian1.graphml", format = "graphml")
# Subscribe to:
# Post Comments (Atom)
# No comments:
# Post a Comment