Wednesday, April 20, 2016
A method inspired by DataminingwithR
# Prefer fixed to scientific notation
options(scipen=5)
# Print numbers with two significant digits
options(digits=2)
options(R2HTML.format.digits=2)
# Set a nice color palette for plots (lattice.options() needs lattice attached)
library(lattice)
lattice.options(default.theme=latticeExtra::custom.theme(
    symbol=RColorBrewer::brewer.pal(8, "Set1")[c(2:1, 3:5, 7:9)],
    fill=RColorBrewer::brewer.pal(8, "Set1")[c(2:1, 3:5, 7:9)],
    region=RColorBrewer::brewer.pal(n=11, name="Spectral")))
# clear any objects left over from a previous session
rm(corpus, corpusVars, dtm, lengths)
library(RODBC)
# odbcConnectExcel() only works in 32-bit R on Windows
channel <- odbcConnectExcel("C:/Users/cristian.chirita/Documents/newQdap/DATA/RAW_DATA/PAPER_ARTIFACTS/MMeditoriale.xls")
corpusDataset <- sqlQuery(channel=channel, "select * from [Sheet1$]")
odbcCloseAll()
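If the ODBC route is unavailable, a possible alternative (not from the original post, and assuming the sheet has the same column names) is to read the file with readxl:
library(readxl)
# read the same worksheet without ODBC; column names are assumed to match the SQL import
corpusDataset <- as.data.frame(read_excel(
    "C:/Users/cristian.chirita/Documents/newQdap/DATA/RAW_DATA/PAPER_ARTIFACTS/MMeditoriale.xls",
    sheet = "Sheet1"))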
library(tm)
corpus <- Corpus(DataframeSource(corpusDataset["Discurs"]),
    readerControl=list(language="ro"))
corpusVars <- corpusDataset[!names(corpusDataset) == "Discurs"]
# activeDataSet() and setCorpusVariables() come from Rcmdr / RcmdrPlugin.temis
activeDataSet("corpusVars")
setCorpusVariables()
dtmCorpus <- corpus
dtmCorpus <- tm_map(dtmCorpus, content_transformer(tolower))
dtmCorpus <- tm_map(dtmCorpus, content_transformer(function(x)
    gsub("(['\u2019\n\u202f\u2009]|[[:punct:]]|[[:space:]]|[[:cntrl:]])+", " ",
    x)))
dtmCorpus <- tm_map(dtmCorpus, removeNumbers)
dtm <- DocumentTermMatrix(dtmCorpus, control=list(tolower=FALSE,
wordLengths=c(2, Inf)))
rm(dtmCorpus)
dictionary <- data.frame(row.names=colnames(dtm),
    "Occurrences"=slam::col_sums(dtm), "Stopword"=ifelse(colnames(dtm) %in%
    stopwords("ro"), "Stopword", ""), stringsAsFactors=FALSE)
dtm <- dtm[, !colnames(dtm) %in% stopwords("ro")]
attr(dtm, "dictionary") <- dictionary
rm(dictionary)
meta(corpus, type="corpus", tag="language") <- attr(dtm, "language") <- "ro"
meta(corpus, type="corpus", tag="processing") <- attr(dtm, "processing") <-
c(lowercase=TRUE, punctuation=TRUE, digits=TRUE, stopwords=TRUE,
stemming=FALSE, customStemming=FALSE, twitter=FALSE, removeHashtags=NA,
removeNames=NA)
corpus
dtm
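An optional glance (not part of the original code) at the most frequent terms, using the dictionary stored as an attribute above:
# top 10 terms by occurrence count (stopwords are flagged in the Stopword column)
dict <- attr(dtm, "dictionary")
head(dict[order(-dict$Occurrences), ], 10)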
library(qdap)
# convert the cleaned DocumentTermMatrix back into a corpus, then rebuild a TermDocumentMatrix
mytdm <- as.Corpus(dtm)
my_tdm <- TermDocumentMatrix(mytdm)
inspect(my_tdm)
(freq.terms <- findFreqTerms(my_tdm, lowfreq = 40))
term.freq <- rowSums(as.matrix(my_tdm))
term.freq <- subset(term.freq, term.freq >= 25)
df <- data.frame(term = names(term.freq), freq = term.freq)
library(ggplot2)
ggplot(df, aes(x = term, y = freq)) + geom_bar(stat = "identity") +
xlab("Terms") + ylab("Count") + coord_flip()
findAssocs(my_tdm, "munteanu", 0.2)
findAssocs(my_tdm, "pnl", 0.6)
findAssocs(my_tdm, "ciucu", 0.6)
library(graph)
library(Rgraphviz)
# plot correlations between the frequent terms (the tm plot method needs Rgraphviz)
plot(my_tdm, term = freq.terms, corThreshold = 0.1, weighting = TRUE)
tdm.s = removeSparseTerms(my_tdm, sparse=0.7)
# we'll need the TDM as a matrix
m = as.matrix(tdm.s)
# datavis time
# convert matrix to data frame
m.df = data.frame(m)
# quick hack to make keywords - which got stuck in row.names - into a variable
m.df$keywords = rownames(m.df)
library(reshape)
# "melt" the data frame ; ?melt at R console for info
m.df.melted = melt(m.df)
# not necessary, but I like decent column names
colnames(m.df.melted) = c("Keyword","Post","Freq")
# generate the heatmap
library(ggthemes)
hm = ggplot(m.df.melted, aes(x=Post, y=Keyword)) +
geom_tile(aes(fill=Freq), colour="white") +
scale_fill_gradient(low="black", high="darkorange") +
labs(title="Cuvinte cheie folosite in articolele cu Marian Munteanu") +
theme_few() +
theme(axis.text.x = element_text(size=6))
ggsave(plot=hm,filename="e:/mm_munteanudocs-hm.png",width=11,height=8.5)
# better? way to view frequencies
# sum rows of the tdm to get term frequency counts
tf = rowSums(as.matrix(my_tdm))
# we don't want all the words, so keep only those appearing at least 40 times
tf.40 = subset(tf, tf >= 40)
# bar chart of the frequent terms, reusing the geom_bar() approach from above
bf = ggplot(data.frame(term = names(tf.40), freq = as.numeric(tf.40)),
    aes(x = term, y = freq)) + geom_bar(stat = "identity") + coord_flip()
ggsave(plot=bf, filename="e:/marianmunteanufreq-bars.png", width=8.5, height=11)
termDocMatrix <- as.matrix(my_tdm)
termDocMatrix[5:10,1:20]
# use the sparse-reduced matrix from above instead, so the graph keeps only the frequent terms
m[5:10,1:20]
termDocMatrix <- m
# change it to a Boolean matrix of term presence/absence
termDocMatrix[termDocMatrix>=1] <- 1
# transform into a term-term adjacency matrix
termMatrix <- termDocMatrix %*% t(termDocMatrix)
termMatrix[5:10,5:10]
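A tiny toy matrix (made up for illustration, not taken from the corpus) shows why this product counts co-occurrences:
# toy check: 3 terms x 2 documents, Boolean occurrence matrix
toy <- matrix(c(1, 1,
                1, 0,
                0, 1), nrow = 3, byrow = TRUE,
              dimnames = list(c("a", "b", "c"), c("d1", "d2")))
toy %*% t(toy)
# off-diagonal entry ["a", "b"] is 1: terms a and b share exactly one document (d1)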
library(igraph)
# build a graph from the above matrix
g <- graph.adjacency(termMatrix, weighted=TRUE, mode = "undirected")
# remove loops
g <- simplify(g)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
# set seed to make the layout reproducible
set.seed(3952)
layout1 <- layout.fruchterman.reingold(g)
plot(g, layout=layout1)
write.graph(g,"e:/marianmunteanuart.graphml", format=c("graphml"))
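The vertex degrees computed above are not used in the plot; an optional sketch (an assumption, not part of the original) scales label and vertex sizes by degree so well-connected terms stand out:
# emphasise high-degree terms (optional tweak)
V(g)$label.cex <- 1 + 2 * V(g)$degree / max(V(g)$degree)
V(g)$size <- 5 + 10 * V(g)$degree / max(V(g)$degree)
plot(g, layout=layout1)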
word.freq <- sort(rowSums(m), decreasing = TRUE)
# colors (brewer.pal() needs RColorBrewer attached)
library(RColorBrewer)
pal <- brewer.pal(9, "BuGn")
pal <- pal[-(1:4)]
library(wordcloud)
wordcloud(words = names(word.freq), freq = word.freq, min.freq = 10,
    random.order = FALSE, colors = pal)
tdm2 <- removeSparseTerms(my_tdm, sparse = 0.95)
# the 0.7-sparsity matrix built above is reused instead, overriding the line before
tdm2 <- tdm.s
m2 <- as.matrix(tdm2)
# cluster terms
distMatrix <- dist(scale(m2))
fit <- hclust(distMatrix, method = "ward.D")  # "ward" was renamed to "ward.D" in recent R
plot(fit)
rect.hclust(fit, k = 10) # cut tree into 10 clusters
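rect.hclust() only draws the rectangles; to get the actual cluster membership of each term (not shown in the original), cutree() can be applied to the same fit:
# cluster membership for each term, same cut as the rectangles above
groups <- cutree(fit, k = 10)
sort(groups)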
m3 <- t(m2) # transpose the matrix to cluster documents
set.seed(122) # set a fixed random seed
k <- 10 # number of clusters
kmeansResult <- kmeans(m3, k)
round(kmeansResult$centers, digits = 3)
for (i in 1:k) {
cat(paste("cluster ", i, ": ", sep = ""))
s <- sort(kmeansResult$centers[i, ], decreasing = T)
cat(names(s)[1:5], "\n")
# print the documents that fall in each cluster (optional)
# print(rownames(m3)[kmeansResult$cluster == i])
}
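A quick sanity check (not in the original) is the number of documents per cluster:
# how many documents fall in each k-means cluster
table(kmeansResult$cluster)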
===============
https://rdatamining.wordpress.com/2012/05/17/an-example-of-social-network-analysis-with-r-using-package-igraph/
For the tm plot method, see:
https://github.com/rforge/tm/blob/master/pkg/R/plot.R
======
http://www.leydesdorff.net/words/
======
http://www.r-bloggers.com/visualizing-risky-words-part-2/