source("http://bioconductor.org/biocLite.R")
biocLite("Rgraphviz")
biocLite("graph")

library(twitteR)
library(cluster)
library(tm)
library(ggplot2)
library(graph)
library(Rgraphviz)
library(wordcloud)
library(topicmodels)
library(SnowballC)

Pre-processing and inspection using the tm package

Let’s load the dataset from an R data file. It should be in your working directory, which we set to the path below.

path="~/Dropbox/MST_COURSES/SS16/practicals/"
setwd(path)

load(file = "rdmTweets-201306.RData")
tweets

How many tweets are there?

n.tweet <- length(tweets)
n.tweet

Let’s convert the tweets list to a data frame in order to process it.

tweets.df <- twListToDF(tweets)
dim(tweets.df)

Pre-processing and corpus preparation

We use the tm package methods to pre-process the text content effectively. The tm package offers the pre-processing tools that text typically needs, including lower-case transformation, stemming, punctuation removal, and so on.

# build a corpus, and specify the source to be character vectors
myCorpus <- Corpus(VectorSource(tweets.df$text))

# convert to lower case
myCorpus <- tm_map(myCorpus, content_transformer(tolower))

# remove URLs
removeURL <- function(x) gsub("http[^[:space:]]*", "", x)

# tm method of removing "user-defined" transformation like the urls
myCorpus <- tm_map(myCorpus, content_transformer(removeURL))

# remove anything other than English letters or space
removeNumPunct <- function(x) gsub("[^[:alpha:][:space:]]*", "", x)
myCorpus <- tm_map(myCorpus, content_transformer(removeNumPunct))

# remove punctuation
myCorpus <- tm_map(myCorpus, removePunctuation)

# remove numbers
myCorpus <- tm_map(myCorpus, removeNumbers)

# add two extra stop words: "available" and "via"
myStopwords <- c(stopwords('english'), "available", "via")

# remove "r" and "big" from stopwords
myStopwords <- setdiff(myStopwords, c("r", "big"))

# remove stopwords from corpus
myCorpus <- tm_map(myCorpus, removeWords, myStopwords)

# remove extra whitespace
myCorpus <- tm_map(myCorpus, stripWhitespace)

Then, let’s keep a copy of the corpus to use later as a dictionary for stem completion, which will help us recover the original word forms.

# keep a copy of corpus to use later as a dictionary for stem completion
myCorpusCopy <- myCorpus
# stem words
myCorpus <- tm_map(myCorpus, stemDocument)
# inspect the first 5 documents (tweets)
inspect(myCorpus[1:5])
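
For reference, here is a minimal sketch of how the copy could be used for stem completion with tm’s stemCompletion function. The helper stemCompletion2 is only an illustration (not part of the original pipeline), and the calls are left commented out so that the rest of the practical keeps working on the stemmed corpus.

# optional: complete stems back to full words, using the un-stemmed copy as a dictionary
stemCompletion2 <- function(x, dictionary) {
  words <- unlist(strsplit(as.character(x), " "))
  words <- words[words != ""]
  paste(stemCompletion(words, dictionary = dictionary), collapse = " ")
}
# completed <- sapply(myCorpus, stemCompletion2, dictionary = myCorpusCopy)
# myCorpusCompleted <- Corpus(VectorSource(completed))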

Basic statistics and replacement

Then, let’s do some basic discovery of specific words. For example, let’s search for the frequency of specific words.

# count frequency of "mining"
miningCases <- lapply(myCorpusCopy,
                      function(x) { grep(as.character(x), pattern = "\\<mining") })
sum(unlist(miningCases))

Can you do the same for “miner”?
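
As a hint, here is a sketch following the same pattern (the object name minerCases is just for illustration). Note that "\\<miner" matches words starting with “miner”, such as “miner” and “miners”, but not “mining”.

# count frequency of "miner"
minerCases <- lapply(myCorpusCopy,
                     function(x) { grep(as.character(x), pattern = "\\<miner") })
sum(unlist(minerCases))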

The following code replaces “miner” with “mining”.

# replace "miner" with "mining"
myCorpus <- tm_map(myCorpus, content_transformer(gsub),
                   pattern = "miner", replacement = "mining")

Conversion to term-document-matrix

Most text-mining algorithms work on term-document matrices, so we want to convert the corpus into such a matrix. The following steps summarize this process.

tdm <- TermDocumentMatrix(myCorpus,
                          control = list(wordLengths = c(1, Inf)))
tdm

Basic operations with the tdm matrix

Let’s first look at which documents contain the term “mine” (the stem of “mining”). The code below inspects that term and the next five terms, across documents 101 to 110.

idx <- which(dimnames(tdm)$Terms == "mine")
inspect(tdm[idx + (0:5), 101:110])

Let’s now see which are the most frequent terms (say, those appearing at least 15 times).

freq.terms <- findFreqTerms(tdm, lowfreq = 15)
freq.terms

Similarly, we can list these terms together with their frequencies.

term.freq <- rowSums(as.matrix(tdm))
term.freq <- subset(term.freq, term.freq >= 15)
df <- data.frame(term = names(term.freq), freq = term.freq)

And of course, we can plot the results.

ggplot(df, aes(x = term, y = freq)) + geom_bar(stat = "identity") +
  xlab("Terms") + ylab("Count") + coord_flip()

Moreover, we can find which words are associated with “r” (and choose the association level).

findAssocs(tdm, "r", 0.2)

Can you find which terms are associated with “mining” at a correlation above 0.25?
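
A possible answer, using the same findAssocs call as above (this assumes the term appears as “mining” in the term-document matrix after the replacement step):

findAssocs(tdm, "mining", 0.25)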

We can also produce some nice plots that show the associations between terms above a given correlation threshold. That provides us with a nice graph of the different relations.

plot(tdm, term = freq.terms, corThreshold = 0.1, weighting = T)

Another useful kind of plot (also really trendy lately) is the word cloud. Here is code that builds one from the term-document matrix.

m <- as.matrix(tdm)
# calculate the frequency of words and sort it by frequency
word.freq <- sort(rowSums(m), decreasing = T)
# colors
pal <- brewer.pal(9, "BuGn")
pal <- pal[-(1:4)]
# plot word cloud
wordcloud(words = names(word.freq), freq = word.freq, min.freq = 3,
          random.order = F, colors = pal)

Clustering

Can we apply clustering on this dataset? Sure! But first, let’s do some more pre-processing by removing sparse terms.

tdm2 <- removeSparseTerms(tdm, sparse = 0.95)
m2 <- as.matrix(tdm2)
dim(m2)

How many terms are left (i.e., how many are not sparse)?
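
A quick way to check is to count the rows of the reduced matrix, since rows correspond to terms:

nrow(m2)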

In the next steps, try to apply both hierarchical clustering (using the Ward method) and k-means clustering, and experiment with the number of clusters (a sketch for choosing k follows the k-means code below).

distMatrix <- dist(scale(t(m2)))
fit <- agnes(distMatrix, method = "ward") # agglomerative hierarchical clustering (cluster package)
hc <- as.hclust(fit) # convert to an hclust object so the tree can be plotted and cut
plot(hc, hang = -1)
rect.hclust(hc, k = 6) # cut tree into 6 clusters

Then, let’s apply the k-means algorithm and also see how the k clusters are represented by their centroids.

m3 <- t(m2) # transpose the matrix to cluster documents (tweets)
set.seed(122) # set a fixed random seed
k <- 6 # number of clusters
kmeansResult <- kmeans(m3, k)

#represent the centroids
round(kmeansResult$centers, digits = 3) # cluster centers

# print the tweets in every cluster
for (i in 1:k) {
  cat("=== cluster", i, "===\n")
  print(tweets[which(kmeansResult$cluster == i)])
}
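
To experiment with the number of clusters, one simple heuristic (a sketch that goes beyond the original code) is to compare the total within-cluster sum of squares for a range of k values and look for an “elbow” in the curve:

# compare total within-cluster sum of squares for k = 2..10 (elbow heuristic)
wss <- sapply(2:10, function(k) kmeans(m3, centers = k, nstart = 10)$tot.withinss)
plot(2:10, wss, type = "b",
     xlab = "Number of clusters k", ylab = "Total within-cluster sum of squares")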

Topic modeling

Finally, let’s apply a topic modeling approach. Remember that, although clustering and LDA work in technically similar ways, the results they produce are not necessarily similar.

dtm <- as.DocumentTermMatrix(tdm)
lda <- LDA(dtm, k = 6) # find 6 topics

With the following statements we can find the most common terms for each topic, and the most prevalent topic for each document. Notice that we are dealing with tweets here, so LDA is not expected to perform very well. Can you think why?

term <- terms(lda, 6) # first 6 terms of every topic

topic <- topics(lda, 1) # first topic for each document

And finally, let’s create a nice visualization based on the topics and the dates the tweets were created. It shows the relative prevalence of the topics over time.

# label each topic by its top terms and attach the tweet dates
topic.label <- apply(term, 2, paste, collapse = ", ")
topics7 <- data.frame(date = as.Date(tweets.df$created),
                      topic = topic,
                      label = topic.label[topic])

# stacked density of topics over time
ggplot(topics7, aes(date, fill = label)) +
  geom_density(position = "stack") +
  labs(x = "Date", fill = "Topic")