This is a fairly verbose code documentation of a talk I gave at the Social Science Data Lab at the Mannheimer Zentrum für Europäische Sozialforschung (MZES), at the invitation of Christiane Grill. Thanks for having me!

Overview of this talk

  1. Why quanteda?
  2. Using quanteda
  3. Applying dictionaries
  4. Unsupervised machine learning
  5. Supervised machine learning
  6. Closing remarks

Why quanteda?

Kenneth Benoit, creator of quanteda

Using quanteda

Most analyses with quanteda consist of three steps:

  1. Import the data
  2. Build a corpus
  3. Calculate a DFM
Model of a DTM.
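
In code, these three steps boil down to something like the following minimal sketch (the two inline example texts are made up; the real import with readtext follows below):

library(quanteda)
my.texts <- c(doc1 = "A first example text.", doc2 = "A second example text.")  # 1. import (here: inline)
my.corpus <- corpus(my.texts)  # 2. build a corpus
my.dfm <- dfm(my.corpus)       # 3. calculate a DFM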

Using quanteda: Reading data

library(readtext)   # for reading text files
library(tidyverse)  # dplyr, stringr, tidyr, ggplot2 -- used throughout
library(lubridate)  # date handling, used further below

sherlock <- readtext("data/sherlock/novels/[0-9]*.txt")
sherlock$doc_id <- str_sub(sherlock$doc_id, start = 4, end = -5)  # strip numeric prefix and ".txt"
mycorpus <- corpus(sherlock, docid_field = "doc_id")
docvars(mycorpus, "Textno") <- sprintf("%02d", 1:ndoc(mycorpus))
mycorpus
## Corpus consisting of 12 documents and 1 docvar.

Using quanteda: Generating corpus statistics

mycorpus.stats <- summary(mycorpus)
mycorpus.stats$Text <- reorder(mycorpus.stats$Text, 1:ndoc(mycorpus), order = T)
mycorpus.stats

Using quanteda: What makes DFMs nifty

Things to remember about DFMs:
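
In short, a DFM is a sparse documents-by-features count matrix, and most of quanteda's analysis functions operate on it. A toy sketch of common DFM operations (the two documents are made up):

toy.dfm <- dfm(c(d1 = "the cat sat on the mat", d2 = "the cat ran"))
topfeatures(toy.dfm)                  # most frequent features overall
dfm_weight(toy.dfm, scheme = "prop")  # counts as within-document proportions
dfm_trim(toy.dfm, min_docfreq = 2)    # drop features occurring in only one document
dfm_sort(toy.dfm)                     # sort documents and features by frequency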

Using quanteda: Calculating a DFM (1)

mydfm <- dfm(mycorpus, remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE, remove = stopwords("english"))
mydfm
## Document-feature matrix of: 12 documents, 8,489 features (79.1% sparse).
head(dfm_sort(mydfm, decreasing = TRUE, margin = "both"), n = 12, nf = 10) 
## Document-feature matrix of: 12 documents, 10 features (0.0% sparse).
## 12 x 10 sparse Matrix of class "dfm"
##                                        features
## docs                                    said upon holmes one man mr little
##   The Adventure of the Speckled Band      44   41     55  33  11  5     17
##   The Adventure of the Copper Beeches     47   33     42  36  34 44     37
##   The Boscombe Valley Mystery             37   42     43  31  41 24     25
##   The Man with the Twisted Lip            28   54     28  36  30 20     21
##   The Adventure of the Beryl Coronet      45   33     26  32  27 20     22
##   The Red-headed League                   51   50     51  29  25 55     25
##   A Scandal in Bohemia                    33   25     47  27  23  9     14
##   The Adventure of the Engineer's Thumb   47   38     12  33  17 11     25
##   The Adventure of the Noble Bachelor     33   29     34  31  10 17     26
##   The Adventure of the Blue Carbuncle     43   38     34  38  37 17     24
##   The Five Orange Pips                    32   47     25  29  19  3      5
##   A Case of Identity                      45   35     46  17  16 50     28
##                                        features
## docs                                    now see may
##   The Adventure of the Speckled Band     21  22  19
##   The Adventure of the Copper Beeches    18  17  21
##   The Boscombe Valley Mystery            16  24  19
##   The Man with the Twisted Lip           27  18  15
##   The Adventure of the Beryl Coronet     29  20  25
##   The Red-headed League                  14  23   8
##   A Scandal in Bohemia                   17  15  21
##   The Adventure of the Engineer's Thumb  16  16   9
##   The Adventure of the Noble Bachelor    16  16  18
##   The Adventure of the Blue Carbuncle    33  27   7
##   The Five Orange Pips                   12  16  24
##   A Case of Identity                     15  15  11

Using quanteda: Calculating a DFM (2)

load("data/euspeech/euspeech.korpus.RData")
korpus.euspeech
## Corpus consisting of 17,505 documents and 10 docvars.
mydfm.eu <- dfm(korpus.euspeech, groups = "Typ")
mydfm.eu.prop <- dfm_weight(mydfm.eu, scheme = "prop")
head(dfm_sort(mydfm.eu.prop, decreasing = TRUE, margin = "both"), nf = 8) 
## Document-feature matrix of: 2 documents, 8 features (0.0% sparse).
## 2 x 8 sparse Matrix of class "dfm"
##            features
## docs           european        also     countri        need        year
##   Regierung 0.006177315 0.007775821 0.008114498 0.003875806 0.005878028
##   EU        0.010959076 0.006602713 0.004304388 0.006419974 0.004401859
##            features
## docs              europ      polici      govern
##   Regierung 0.003943332 0.002321008 0.006384211
##   EU        0.005781216 0.006610397 0.002485433

Applying dictionaries: Defining an ad-hoc dictionary

populism.liberalism.dict <- dictionary(list(populism = c("elit*", "consensus*", "undemocratic*", "referend*", "corrupt*", "propagand", "politici*", "*deceit*", "*deceiv*", "*betray*", "shame*", "scandal*", "truth*", "dishonest*", "establishm*", "ruling*"), liberalism = c("liber*", "free*", "indiv*", "open*", "law*", "rules", "order", "rights", "trade", "global", "inter*", "trans*", "minori*", "exchange", "market*")))
populism.liberalism.dict
## Dictionary object with 2 key entries.
## - [populism]:
##   - elit*, consensus*, undemocratic*, referend*, corrupt*, propagand, politici*, *deceit*, *deceiv*, *betray*, shame*, scandal*, truth*, dishonest*, establishm*, ruling*
## - [liberalism]:
##   - liber*, free*, indiv*, open*, law*, rules, order, rights, trade, global, inter*, trans*, minori*, exchange, market*
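
The next step passes the dictionary directly to dfm(); equivalently, an already computed DFM can be matched against a dictionary after the fact with dfm_lookup(), as in this sketch:

dfm_lookup(dfm(korpus.euspeech), dictionary = populism.liberalism.dict)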

Applying dictionaries: Applying an ad-hoc dictionary

mydfm.eu <- dfm(korpus.euspeech, dictionary = populism.liberalism.dict)
mydfm.eu.prop <- dfm_weight(mydfm.eu, scheme = "prop")
eu.poplib <- convert(mydfm.eu.prop, "data.frame") %>% 
  bind_cols(korpus.euspeech.stats) %>% 
  filter(length >= 1200, populism > 0 | liberalism > 0)
ggplot(eu.poplib, aes(country, populism)) + geom_boxplot(outlier.size = 0) + geom_jitter(aes(country,populism), position = position_jitter(width = 0.4, height = 0), alpha = 0.1, size = 0.2, show.legend = F) + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) + xlab("") + ylab("Populism share") + ggtitle("Populism share in the EUspeech corpus based on our ad-hoc dictionary (%)")

Applying dictionaries: Lexicoder Policy Agendas (PA)

load("dictionaries/policy_agendas_english.RData")
policyagendas.dict <- dictionary(dictLexic2Topics)
mydfm.eu.pa <- dfm(korpus.euspeech, groups = "country", dictionary = policyagendas.dict)
eu.topics.pa <- convert(mydfm.eu.pa, "data.frame") %>%
  rename(Land = document) %>%
  select(Land, macroeconomics, finance, foreign_trade, labour, healthcare, immigration, education, intl_affairs, defence) %>%
  gather(macroeconomics:defence, key = "Thema", value = "Anteil") %>% 
  group_by(Land) %>% 
  mutate(Anteil = Anteil/sum(Anteil)) %>% 
  mutate(Thema = as_factor(Thema))
ggplot(eu.topics.pa, aes(Land, Anteil, colour = Thema, fill = Thema)) + geom_bar(stat="identity") + scale_colour_brewer(palette = "Set1") + scale_fill_brewer(palette = "Pastel1") + ggtitle("Distribution of PA topics in the EUspeech corpus") + xlab("") + ylab("Topic share (%)") + theme(axis.text.x = element_text(angle = 45, hjust = 1))

Dictionaries: LIWC German (1)

load("data/facebook/facebook.korpus.RData")
korpus.facebook
## Corpus consisting of 20,000 documents and 3 docvars.
texts(corpus_sample(corpus_subset(korpus.facebook, corpus == "populism"), size = 1))
##                                                                                    pegidaevdresden3881 
## "Meinungsfreiheit wird bestraft aber uns zu unterstellen das wir Nazis sind, das wird nicht bestraft."
texts(corpus_sample(corpus_subset(korpus.facebook, corpus == "news"), size = 1))
##                                                                                                                                                                faz828 
## "Liebe FAZ es macht mir den Eindruck als ob ihr es nicht mitbekommen habt das der Kalte Krieg vorbei ist man braucht keine Propaganda von euch  das ist echt traurig"

Dictionaries: LIWC German (2)

facebook.aktivitaet <- korpus.facebook.stats %>%
  mutate(Quelle = factor(source, levels = c("pegidaevdresden", "alternativefuerde", "FAZ", "SZ", "Welt", "Zeit"))) %>% 
  group_by(Datum = floor_date(created_time, "1 month"), Quelle) %>%
  summarise(Kommentare = n())
ggplot(facebook.aktivitaet, aes(as.Date(Datum), Kommentare, group = Quelle, col = Quelle)) + geom_line(size = 1) + scale_colour_brewer(palette = "Set1") + scale_x_date(date_breaks = "2 months", date_labels = "%b %Y") + ggtitle("Volume of comments on six public Facebook pages") + xlab("") + ylab("") + theme(axis.text.x = element_text(angle = 45, hjust = 1))

Dictionaries: LIWC German (3)

liwc.ger <- dictionary(file = "dictionaries/LIWC_German.dic", format = "LIWC")
## note: ignoring undefined categories:
##   27 for anhoer*
##   29 for anhoer*
##   27 for anhör*
##   29 for anhör*
##   27 for auge*
##   28 for auge*
##   27 for brenn*
##   30 for brenn*
##   27 for darbietung
##   28 for darbietung
##   27 for empfang*
##   29 for empfang*
##   27 for empfindung*
##   30 for empfindung*
##   27 for essen*
##   27 for gefuttert*
##   27 for gegessen*
##   27 for geschmack*
##   27 for geschmäck*
##   27 for geschmaeck*
##   27 for getraenk*
##   27 for getränk*
##   27 for hoer*
##   29 for hoer*
##   27 for hoerte*
##   29 for hoerte*
##   27 for hör*
##   29 for hör*
##   27 for hörte*
##   29 for hörte*
##   27 for isst*
##   27 for juck*
##   30 for juck*
##   27 for knuddel*
##   30 for knuddel*
##   27 for ohr*
##   29 for ohr*
##   27 for schau
##   28 for schau
##   27 for schmackhaft*
##   27 for schmerz*
##   30 for schmerz*
##   27 for show*
##   28 for show*
##   27 for sinnes*
##   30 for sinnes*
##   27 for trank*
##   27 for trink*
##   27 for umarm*
##   30 for umarm*
##   27 for zuhoer*
##   29 for zuhoer*
##   27 for zuhör*
##   29 for zuhör*
head(liwc.ger, 2)
## Dictionary object with 2 key entries.
## - [Pronoun]:
##   - dein*, denen, deren, derer, dessen, dich, dir, du, er, es, euch, euer*, eure*, ich, ihm, ihn, ihnen, ihr, ihre, ihrem, ihren, ihrer, ihres, irgendei*, irgendet*, irgendj*, irgendwe*, jedermann*, jemand*, mein, meine, meinem, meinen, meiner, mich, mir, nichts, niemand*, sein*, sie, uns, unser, unsere, unserem, unseren, wir
## - [I]:
##   - ich, mein, meine, meinem, meinen, meiner, mich, mir

Dictionaries: LIWC German (4)

mydfm.fb.liwc <- dfm(korpus.facebook, groups = "corpus", dictionary = liwc.ger)
liwc.shares <- convert(mydfm.fb.liwc, "data.frame") %>%
  rename(Korpus = document) %>% 
  gather(key = Kategorie, value = Words, -Korpus) %>% 
  filter(!Kategorie %in% c("Article", "Down", "Eat", "Fillers", "Grooming", "Humans", "Money", "Motion", "Music", "Nonfluency", "Numbers", "Physical", "Preps", "Relig", "Sex", "Sleep", "Sports", "Time", "Up")) %>% 
  mutate(Korpus = factor(Korpus, levels = c("populism", "news")))
ggplot(liwc.shares, aes(Kategorie, Words, fill = Korpus)) + geom_bar(stat = "identity", position = position_dodge()) + scale_fill_brewer(palette = "Set1") + ggtitle("Share of LIWC categories in Facebook comments by page") + xlab("LIWC categories") + theme(axis.text.x = element_text(size = 7, angle = 45, hjust = 1))

Unsupervised machine learning: Latent semantic analysis (LSA)

korpus.facebook.sample <- corpus_sample(korpus.facebook, size = 500)
mydfm.fb <- dfm(korpus.facebook.sample)
mylsa <- textmodel_lsa(mydfm.fb, nd = 10)
sources <- str_remove_all(rownames(mylsa$docs), "[0-9]")
sources.color <- rep("blue", times = length(sources))
sources.color[sources %in% c("alternativefuerde", "pegidaevdresden")] <- "red"
# plot successive pairs of LSA dimensions, coloured by subcorpus
for (d in 1:9) {
  plot(mylsa$docs[, d:(d + 1)], col = sources.color, pch = 19, xlab = paste("Dimension", d), ylab = paste("Dimension", d + 1), main = "LSA dimensions by subcorpus")
}
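
Besides the document scores plotted above, the fitted model also contains feature loadings; a quick way to inspect which terms anchor a given dimension (a sketch based on the mylsa object from above):

head(sort(mylsa$features[, 1], decreasing = TRUE), 10)  # features loading highest on dimension 1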

Unsupervised machine learning: LDA (1)

load("data/zeit/zeit.sample.korpus.RData")
as.data.frame(zeit.korpus.stats)

Unsupervised machine learning: LDA (2)

mydfm.zeit <- dfm(zeit.korpus, remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE, remove = stopwords("german"))
mydfm.zeit.trim <-  dfm_trim(mydfm.zeit, min_docfreq = 3, max_docfreq = 65)
mydfm.zeit.trim
## Document-feature matrix of: 377 documents, 5,260 features (97.9% sparse).

Unsupervised machine learning: LDA (3)

library(topicmodels)
topic.count <- 15
dfm2topicmodels <- convert(mydfm.zeit.trim, to = "topicmodels")
lda.model <- LDA(dfm2topicmodels, k = topic.count)
lda.model
## A LDA_VEM topic model with 15 topics.
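
Note that LDA estimation is stochastic, so repeated runs produce (slightly) different topics. For reproducible results, a seed can be passed through the control argument (a sketch):

lda.model <- LDA(dfm2topicmodels, k = topic.count, control = list(seed = 42))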

Unsupervised machine learning: LDA (4)

as.data.frame(terms(lda.model, 6))
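
terms() lists the top terms per topic; its counterpart topics() returns the most likely topic for each document, e.g.:

head(topics(lda.model))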

Unsupervised machine learning: LDA (5)

lda.similarity <- as.data.frame(lda.model@beta) %>% 
  scale() %>% 
  dist(method = "euclidean") %>% 
  hclust(method = "ward.D2")
par(mar = c(0, 4, 4, 2))
plot(lda.similarity, main = "LDA topic similarity by features", xlab = "", sub = "")

Unsupervised machine learning: STM (1)
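
The structural topic model (STM) extends LDA by allowing topic prevalence (and/or content) to vary with document metadata. The model loaded below was fitted in advance; a call producing an object like modell.stm might look like the following sketch (mydfm.un, the covariate country, and K = 10 are assumptions for illustration):

library(stm)
stm.input <- convert(mydfm.un, to = "stm")
modell.stm <- stm(stm.input$documents, stm.input$vocab, K = 10, prevalence = ~ country, data = stm.input$meta, seed = 42)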

Unsupervised machine learning: STM (2)

load("data/un/un.stm.RData")
par(mar=c(0.5, 0.5, 0.5, 0.5))
cloud(modell.stm, topic = 1, scale = c(2.25,.5))

cloud(modell.stm, topic = 2, scale = c(2.25,.5))

cloud(modell.stm, topic = 7, scale = c(2.25,.5))

cloud(modell.stm, topic = 9, scale = c(2.25,.5))

plot(modell.stm, type = "summary", text.cex = 0.5, main = "STM topic shares", xlab = "Share estimation")
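
Given document metadata, the relationship between covariates and topic prevalence can then be estimated with estimateEffect (a sketch under the same illustrative assumptions as the fitting call above):

prep <- estimateEffect(1:10 ~ country, modell.stm, metadata = stm.input$meta)
plot(prep, covariate = "country", topics = 1, method = "pointestimate")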

Supervised machine learning: NB classifier (1)

load("data/nytimes/nyt.korpus.RData")
korpus.nyt
## Corpus consisting of 30,862 documents and 3 docvars.
labels.cats <- scan("data/nytimes/majortopics2digits.txt", what = "char", sep = "\n", quiet = T)
labels.cats <- data.frame(Kategorie = as.character(1:length(labels.cats)), Label = labels.cats, stringsAsFactors = F)
labels.cats

Supervised machine learning: NB classifier (2)

mydfm.nyt <- dfm(korpus.nyt, remove_numbers = TRUE, remove_punct = TRUE, remove = stopwords("english"))
mydfm.nyt
## Document-feature matrix of: 30,862 documents, 16,499 features (100.0% sparse).
mydfm.nyt.trim <- dfm_trim(mydfm.nyt, min_docfreq = 0.0005, docfreq_type = "prop") # optional: min_count = 10
mydfm.nyt.trim
## Document-feature matrix of: 30,862 documents, 1,905 features (99.8% sparse).

Supervised machine learning: NB classifier (3)

modell.NB <- textmodel_nb(mydfm.nyt.trim, korpus.nyt.stats$Topic_2digit, prior = "docfreq")
head(as.character(predict(modell.NB)))
## [1] "12" "28" "20" "16" "20" "10"
prop.table(table(predict(modell.NB) == korpus.nyt.stats$Topic_2digit))*100
## 
##    FALSE     TRUE 
## 25.13771 74.86229
prop.table(table(sample(predict(modell.NB)) == korpus.nyt.stats$Topic_2digit))*100
## 
##    FALSE     TRUE 
## 89.94232 10.05768
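
Both accuracy figures above are computed in-sample, i.e. the model is evaluated on the very data it was trained on, which flatters the classifier. A more honest check holds out part of the data; a sketch (the split size is arbitrary):

train.ids <- sample(ndoc(mydfm.nyt.trim), 25000)
modell.NB.train <- textmodel_nb(mydfm.nyt.trim[train.ids, ], korpus.nyt.stats$Topic_2digit[train.ids], prior = "docfreq")
pred.test <- predict(modell.NB.train, newdata = mydfm.nyt.trim[-train.ids, ])
prop.table(table(pred.test == korpus.nyt.stats$Topic_2digit[-train.ids]))*100  # share correct vs. incorrect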

Supervised machine learning: NB classifier (4)

model.NB.classification <- bind_cols(korpus.nyt.stats, Klassifikation = as.character(predict(modell.NB))) %>%
  mutate(Kategorie = as.character(Topic_2digit)) %>% 
  mutate(RichtigKodiert = Klassifikation == Kategorie) %>% 
  group_by(Kategorie, RichtigKodiert) %>% 
  summarise(n = n()) %>% 
  mutate(Anteil = n/sum(n)) %>% 
  filter(RichtigKodiert == TRUE) %>% 
  left_join(labels.cats, by = "Kategorie") %>% 
  select(Kategorie, Label, n, Anteil)
ggplot(model.NB.classification, aes(Label, Anteil)) + geom_bar(stat = "identity") + geom_hline(yintercept = mean(model.NB.classification$Anteil), color = "blue") + ylim(0, 1) + ggtitle("Share of correctly classified texts in 26\ncontent categories with a Naive Bayes classifier") + xlab("") + ylab("") + coord_flip()

Supervised machine learning: RTextTools (1)

library(RTextTools)
container <- create_container(convert(mydfm.nyt.trim, to = "matrix"), korpus.nyt.stats$Topic_2digit, trainSize = 1:27775, testSize = 27776:30862, virgin = FALSE)

Supervised machine learning: RTextTools (2)

load("data/nytimes/nyt.modelle.RData")

# primary models
#modell.SVM <- train_model(container,"SVM")
#modell.GLMNET <- train_model(container,"GLMNET")
#modell.MAXENT <- train_model(container,"MAXENT")
#modell.SLDA <- train_model(container,"SLDA")

# further models, some of which do not work reliably
#modell.BOOSTING <- train_model(container,"BOOSTING")
#modell.BAGGING <- train_model(container,"BAGGING")
#modell.RF <- train_model(container,"RF")
#modell.NNET <- train_model(container,"NNET")
#modell.TREE <- train_model(container,"TREE")

Supervised machine learning: RTextTools (3)

SVM_CLASSIFY <- classify_model(container, modell.SVM)
GLMNET_CLASSIFY <- classify_model(container, modell.GLMNET)
MAXENT_CLASSIFY <- classify_model(container, modell.MAXENT)
SLDA_CLASSIFY <- classify_model(container, modell.SLDA)
#BOOSTING_CLASSIFY <- classify_model(container, modell.BOOSTING)
#BAGGING_CLASSIFY <- classify_model(container, modell.BAGGING)
#RF_CLASSIFY <- classify_model(container, modell.RF)
#NNET_CLASSIFY <- classify_model(container, modell.NNET)
#TREE_CLASSIFY <- classify_model(container, modell.TREE)

Supervised machine learning: RTextTools (4)

analytics <- create_analytics(container, cbind(SVM_CLASSIFY, GLMNET_CLASSIFY, MAXENT_CLASSIFY, SLDA_CLASSIFY))
summary(analytics)
## ENSEMBLE SUMMARY
## 
##        n-ENSEMBLE COVERAGE n-ENSEMBLE RECALL
## n >= 1                1.00              0.60
## n >= 2                0.98              0.61
## n >= 3                0.77              0.69
## n >= 4                0.43              0.83
## 
## 
## ALGORITHM PERFORMANCE
## 
##        SVM_PRECISION           SVM_RECALL           SVM_FSCORE 
##            0.5215385            0.5034615            0.4996154 
##       SLDA_PRECISION          SLDA_RECALL          SLDA_FSCORE 
##            0.4988462            0.5223077            0.4857692 
##     GLMNET_PRECISION        GLMNET_RECALL        GLMNET_FSCORE 
##            0.6319231            0.4350000            0.4846154 
## MAXENTROPY_PRECISION    MAXENTROPY_RECALL    MAXENTROPY_FSCORE 
##            0.4788462            0.4673077            0.4550000

Supervised machine learning: RTextTools (5)

topic.codes <- scan("data/nytimes/majortopics2digits.txt", what = "char", sep = "\n", quiet = T)
topic.codes <- data.frame(category = as.factor(1:length(topic.codes)), category.label = topic.codes)
alg.summary <- analytics@algorithm_summary
alg.parts <- str_split(colnames(alg.summary), "_", simplify = TRUE)
algdf <- data.frame(
  algorithm = alg.parts[, 1],
  measure = factor(alg.parts[, 2], levels = alg.parts[1:3, 2]),
  category = factor(rep(rownames(alg.summary), each = ncol(alg.summary)), levels = rownames(alg.summary)),
  score = as.vector(t(alg.summary))
)
algdf <- left_join(algdf, topic.codes, by = "category")
## Warning: Column `category` joining factors with different levels, coercing
## to character vector

Supervised machine learning: RTextTools (6)

algdf <- filter(algdf, category.label %in% c("Civil Rights, Minority Issues, and Civil Liberties", "Education", "Environment", "Law, Crime, and Family Issues", "Macroeconomics", "Sports and Recreation"))
ggplot(algdf, aes(score, algorithm, color = measure, shape = measure)) + geom_point(size = 1.75) + facet_grid(category.label ~ ., switch = "y") + xlim(c(0,1)) + theme(strip.text.y = element_text(angle = 180)) + scale_colour_manual(values = c("lightblue", "lightgreen", "red")) + ggtitle("Precision, recall and f-score for four\nalgorithms measured across six\ncontent categories") + xlab("") + ylab("")

Closing remarks