These is a very verbose code documentation of a talk that I held at the Social Science Data Lab at the Mannheimer Zentrum für Europäische Sozialforschung (MZES) upon invitation from Christiane Grill. Thanks for having me!
Most analyses with quanteda onsist of three steps:
sherlock <- readtext("data/sherlock/novels/[0-9]*.txt")
sherlock$doc_id <- str_sub(sherlock$doc_id, start = 4, end = -5)
mycorpus <- corpus(sherlock, docid_field = "doc_id")
docvars(mycorpus, "Textno") <- sprintf("%02d", 1:ndoc(mycorpus))
mycorpus
## Corpus consisting of 12 documents and 1 docvar.
mycorpus.stats <- summary(mycorpus)
mycorpus.stats$Text <- reorder(mycorpus.stats$Text, 1:ndoc(mycorpus), order = T)
mycorpus.stats
Things to remember about DFMS:
mydfm <- dfm(mycorpus, remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE, remove = stopwords("english"))
mydfm
## Document-feature matrix of: 12 documents, 8,489 features (79.1% sparse).
head(dfm_sort(mydfm, decreasing = TRUE, margin = "both"), n = 12, nf = 10)
## Document-feature matrix of: 12 documents, 10 features (0.0% sparse).
## 12 x 10 sparse Matrix of class "dfm"
## features
## docs said upon holmes one man mr little
## The Adventure of the Speckled Band 44 41 55 33 11 5 17
## The Adventure of the Copper Beeches 47 33 42 36 34 44 37
## The Boscombe Valley Mystery 37 42 43 31 41 24 25
## The Man with the Twisted Lip 28 54 28 36 30 20 21
## The Adventure of the Beryl Coronet 45 33 26 32 27 20 22
## The Red-headed League 51 50 51 29 25 55 25
## A Scandal in Bohemia 33 25 47 27 23 9 14
## The Adventure of the Engineer's Thumb 47 38 12 33 17 11 25
## The Adventure of the Noble Bachelor 33 29 34 31 10 17 26
## The Adventure of the Blue Carbuncle 43 38 34 38 37 17 24
## The Five Orange Pips 32 47 25 29 19 3 5
## A Case of Identity 45 35 46 17 16 50 28
## features
## docs now see may
## The Adventure of the Speckled Band 21 22 19
## The Adventure of the Copper Beeches 18 17 21
## The Boscombe Valley Mystery 16 24 19
## The Man with the Twisted Lip 27 18 15
## The Adventure of the Beryl Coronet 29 20 25
## The Red-headed League 14 23 8
## A Scandal in Bohemia 17 15 21
## The Adventure of the Engineer's Thumb 16 16 9
## The Adventure of the Noble Bachelor 16 16 18
## The Adventure of the Blue Carbuncle 33 27 7
## The Five Orange Pips 12 16 24
## A Case of Identity 15 15 11
load("data/euspeech/euspeech.korpus.RData")
korpus.euspeech
## Corpus consisting of 17,505 documents and 10 docvars.
mydfm.eu <- dfm(korpus.euspeech, groups = "Typ")
mydfm.eu.prop <- dfm_weight(mydfm.eu, scheme = "prop")
head(dfm_sort(mydfm.eu.prop, decreasing = TRUE, margin = "both"), nf = 8)
## Document-feature matrix of: 2 documents, 8 features (0.0% sparse).
## 2 x 8 sparse Matrix of class "dfm"
## features
## docs european also countri need year
## Regierung 0.006177315 0.007775821 0.008114498 0.003875806 0.005878028
## EU 0.010959076 0.006602713 0.004304388 0.006419974 0.004401859
## features
## docs europ polici govern
## Regierung 0.003943332 0.002321008 0.006384211
## EU 0.005781216 0.006610397 0.002485433
populism.liberalism.dict <- dictionary(list(populism = c("elit*", "consensus*", "undemocratic*", "referend*", "corrupt*", "propagand", "politici*", "*deceit*", "*deceiv*", "*betray*", "shame*", "scandal*", "truth*", "dishonest*", "establishm*", "ruling*"), liberalism = c("liber*", "free*", "indiv*", "open*", "law*", "rules", "order", "rights", "trade", "global", "inter*", "trans*", "minori*", "exchange", "market*")))
populism.liberalism.dict
## Dictionary object with 2 key entries.
## - [populism]:
## - elit*, consensus*, undemocratic*, referend*, corrupt*, propagand, politici*, *deceit*, *deceiv*, *betray*, shame*, scandal*, truth*, dishonest*, establishm*, ruling*
## - [liberalism]:
## - liber*, free*, indiv*, open*, law*, rules, order, rights, trade, global, inter*, trans*, minori*, exchange, market*
mydfm.eu <- dfm(korpus.euspeech, dictionary = populism.liberalism.dict)
mydfm.eu.prop <- dfm_weight(mydfm.eu, scheme = "prop")
eu.poplib <- convert(mydfm.eu.prop, "data.frame") %>%
bind_cols(korpus.euspeech.stats) %>%
filter(length >= 1200, populism > 0 | liberalism > 0)
ggplot(eu.poplib, aes(country, populism)) + geom_boxplot(outlier.size = 0) + geom_jitter(aes(country,populism), position = position_jitter(width = 0.4, height = 0), alpha = 0.1, size = 0.2, show.legend = F) + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) + xlab("") + ylab("Populism share") + ggtitle("Populism share in the EUspeech corpus based on our ad-hoc dictionary (%)")
load("dictionaries/policy_agendas_english.RData")
policyagendas.dict <- dictionary(dictLexic2Topics)
mydfm.eu.pa <- dfm(korpus.euspeech, groups = "country", dictionary = policyagendas.dict)
eu.topics.pa <- convert(mydfm.eu.pa, "data.frame") %>%
rename(Land = document) %>%
select(Land, macroeconomics, finance, foreign_trade, labour, healthcare, immigration, education, intl_affairs, defence) %>%
gather(macroeconomics:defence, key = "Thema", value = "Anteil") %>%
group_by(Land) %>%
mutate(Anteil = Anteil/sum(Anteil)) %>%
mutate(Thema = as_factor(Thema))
ggplot(eu.topics.pa, aes(Land, Anteil, colour = Thema, fill = Thema)) + geom_bar(stat="identity") + scale_colour_brewer(palette = "Set1") + scale_fill_brewer(palette = "Pastel1") + ggtitle("Distribution of PA topics in the EUspeech corpus") + xlab("") + ylab("Topic share (%)") + theme(axis.text.x = element_text(angle = 45, hjust = 1))
load("data/facebook/facebook.korpus.RData")
korpus.facebook
## Corpus consisting of 20,000 documents and 3 docvars.
texts(corpus_sample(corpus_subset(korpus.facebook, corpus == "populism"), size = 1))
## pegidaevdresden3881
## "Meinungsfreiheit wird bestraft aber uns zu unterstellen das wir Nazis sind, das wird nicht bestraft."
texts(corpus_sample(corpus_subset(korpus.facebook, corpus == "news"), size = 1))
## faz828
## "Liebe FAZ es macht mir den Eindruck als ob ihr es nicht mitbekommen habt das der Kalte Krieg vorbei ist man braucht keine Propaganda von euch das ist echt traurig"
facebook.aktivitaet <- korpus.facebook.stats %>%
mutate(Quelle = factor(source, levels = c("pegidaevdresden", "alternativefuerde", "FAZ", "SZ", "Welt", "Zeit"))) %>%
group_by(Datum = floor_date(created_time, "1 month"), Quelle) %>%
summarise(Kommentare = n())
ggplot(facebook.aktivitaet, aes(as.Date(Datum), Kommentare, group = Quelle, col = Quelle)) + geom_line(size = 1) + scale_colour_brewer(palette = "Set1") + scale_x_date(date_breaks = "2 months", date_labels = "%b %Y") + ggtitle("Volume of comments on six public Facebook pages") + xlab("") + ylab("") + theme(axis.text.x = element_text(angle = 45, hjust = 1))
liwc.ger <- dictionary(file = "dictionaries/LIWC_German.dic", format = "LIWC")
## note: ignoring undefined categories:
## 27 for anhoer*
## 29 for anhoer*
## 27 for anhör*
## 29 for anhör*
## 27 for auge*
## 28 for auge*
## 27 for brenn*
## 30 for brenn*
## 27 for darbietung
## 28 for darbietung
## 27 for empfang*
## 29 for empfang*
## 27 for empfindung*
## 30 for empfindung*
## 27 for essen*
## 27 for gefuttert*
## 27 for gegessen*
## 27 for geschmack*
## 27 for geschmäck*
## 27 for geschmaeck*
## 27 for getraenk*
## 27 for getränk*
## 27 for hoer*
## 29 for hoer*
## 27 for hoerte*
## 29 for hoerte*
## 27 for hör*
## 29 for hör*
## 27 for hörte*
## 29 for hörte*
## 27 for isst*
## 27 for juck*
## 30 for juck*
## 27 for knuddel*
## 30 for knuddel*
## 27 for ohr*
## 29 for ohr*
## 27 for schau
## 28 for schau
## 27 for schmackhaft*
## 27 for schmerz*
## 30 for schmerz*
## 27 for show*
## 28 for show*
## 27 for sinnes*
## 30 for sinnes*
## 27 for trank*
## 27 for trink*
## 27 for umarm*
## 30 for umarm*
## 27 for zuhoer*
## 29 for zuhoer*
## 27 for zuhör*
## 29 for zuhör*
head(liwc.ger, 2)
## Dictionary object with 2 key entries.
## - [Pronoun]:
## - dein*, denen, deren, derer, dessen, dich, dir, du, er, es, euch, euer*, eure*, ich, ihm, ihn, ihnen, ihr, ihre, ihrem, ihren, ihrer, ihres, irgendei*, irgendet*, irgendj*, irgendwe*, jedermann*, jemand*, mein, meine, meinem, meinen, meiner, mich, mir, nichts, niemand*, sein*, sie, uns, unser, unsere, unserem, unseren, wir
## - [I]:
## - ich, mein, meine, meinem, meinen, meiner, mich, mir
mydfm.fb.liwc <- dfm(korpus.facebook, groups = "corpus", dictionary = liwc.ger)
liwc.shares <- convert(mydfm.fb.liwc, "data.frame") %>%
rename(Korpus = document) %>%
gather(key = Kategorie, value = Words, -Korpus) %>%
filter(!Kategorie %in% c("Article", "Down", "Eat", "Fillers", "Grooming", "Humans", "Money", "Motion", "Music", "Nonfluency", "Numbers", "Physical", "Preps", "Relig", "Sex", "Sleep", "Sports", "Time", "Up")) %>%
mutate(Korpus = factor(Korpus, levels = c("populism", "news")))
ggplot(liwc.shares, aes(Kategorie, Words, fill = Korpus)) + geom_bar(stat = "identity", position = position_dodge()) + scale_fill_brewer(palette = "Set1") + ggtitle("Share of LIWC categories in Facebook comments by page") + xlab("LIWC categories") + theme(axis.text.x = element_text(size = 7, angle = 45, hjust = 1))
korpus.facebook.sample <- corpus_sample(korpus.facebook, size = 500)
mydfm.fb <- dfm(korpus.facebook.sample)
mylsa <- textmodel_lsa(mydfm.fb, nd = 10)
sources <- str_remove_all(rownames(mylsa$docs), "[0-9]")
sources.color <- rep("blue", times = length(sources))
sources.color[sources %in% c("alternativefuerde", "pegidaevdresden")] <- "red"
plot(mylsa$docs[,1:2], col = sources.color, pch = 19, xlab = "Dimension 1", ylab = "Dimension 2", main = "LSA dimensions by subcorpus")
plot(mylsa$docs[,2:3], col = sources.color, pch = 19, xlab = "Dimension 2", ylab = "Dimension 3", main = "LSA dimensions by subcorpus")
plot(mylsa$docs[,3:4], col = sources.color, pch = 19, xlab = "Dimension 3", ylab = "Dimension 4", main = "LSA dimensions by subcorpus")
plot(mylsa$docs[,4:5], col = sources.color, pch = 19, xlab = "Dimension 4", ylab = "Dimension 5", main = "LSA dimensions by subcorpus")
plot(mylsa$docs[,5:6], col = sources.color, pch = 19, xlab = "Dimension 5", ylab = "Dimension 6", main = "LSA dimensions by subcorpus")
plot(mylsa$docs[,6:7], col = sources.color, pch = 19, xlab = "Dimension 6", ylab = "Dimension 7", main = "LSA dimensions by subcorpus")
plot(mylsa$docs[,7:8], col = sources.color, pch = 19, xlab = "Dimension 7", ylab = "Dimension 8", main = "LSA dimensions by subcorpus")
plot(mylsa$docs[,8:9], col = sources.color, pch = 19, xlab = "Dimension 8", ylab = "Dimension 9", main = "LSA dimensions by subcorpus")
plot(mylsa$docs[,9:10], col = sources.color, pch = 19, xlab = "Dimension 9", ylab = "Dimension 10", main = "LSA dimensions by subcorpus")
load("data/zeit/zeit.sample.korpus.RData")
as.data.frame(zeit.korpus.stats)
mydfm.zeit <- dfm(zeit.korpus, remove_numbers = TRUE, remove_punct = TRUE, remove_symbols = TRUE, remove = stopwords("german"))
mydfm.zeit.trim <- dfm_trim(mydfm.zeit, min_docfreq = 3, max_docfreq = 65)
mydfm.zeit.trim
## Document-feature matrix of: 377 documents, 5,260 features (97.9% sparse).
topic.count <- 15
dfm2topicmodels <- convert(mydfm.zeit.trim, to = "topicmodels")
lda.model <- LDA(dfm2topicmodels, topic.count)
lda.model
## A LDA_VEM topic model with 15 topics.
as.data.frame(terms(lda.model, 6))
lda.similarity <- as.data.frame(lda.model@beta) %>%
scale() %>%
dist(method = "euclidean") %>%
hclust(method = "ward.D2")
par(mar = c(0, 4, 4, 2))
plot(lda.similarity, main = "LDA topic similarity by features", xlab = "", sub = "")
load("data/un/un.stm.RData")
par(mar=c(0.5, 0.5, 0.5, 0.5))
cloud(modell.stm, topic = 1, scale = c(2.25,.5))
cloud(modell.stm, topic = 2, scale = c(2.25,.5))
cloud(modell.stm, topic = 7, scale = c(2.25,.5))
cloud(modell.stm, topic = 9, scale = c(2.25,.5))
plot(modell.stm, type = "summary", text.cex = 0.5, main = "STM topic shares", xlab = "Share estimation")
load("data/nytimes/nyt.korpus.RData")
korpus.nyt
## Corpus consisting of 30,862 documents and 3 docvars.
labels.cats <- scan("data/nytimes/majortopics2digits.txt", what = "char", sep = "\n", quiet = T)
labels.cats <- data.frame(Kategorie = as.character(1:length(labels.cats)), Label = labels.cats, stringsAsFactors = F)
labels.cats
mydfm.nyt <- dfm(korpus.nyt, remove_numbers = TRUE, remove_punct = TRUE, remove = stopwords("english"))
mydfm.nyt
## Document-feature matrix of: 30,862 documents, 16,499 features (100.0% sparse).
mydfm.nyt.trim <- dfm_trim(mydfm.nyt, min_docfreq = 0.0005, docfreq_type = "prop") # optional: min_count = 10
mydfm.nyt.trim
## Document-feature matrix of: 30,862 documents, 1,905 features (99.8% sparse).
modell.NB <- textmodel_nb(mydfm.nyt.trim, korpus.nyt.stats$Topic_2digit, prior = "docfreq")
head(as.character(predict(modell.NB)))
## [1] "12" "28" "20" "16" "20" "10"
prop.table(table(predict(modell.NB) == korpus.nyt.stats$Topic_2digit))*100
##
## FALSE TRUE
## 25.13771 74.86229
prop.table(table(sample(predict(modell.NB)) == korpus.nyt.stats$Topic_2digit))*100
##
## FALSE TRUE
## 89.94232 10.05768
model.NB.classification <- bind_cols(korpus.nyt.stats, Klassifikation = as.character(predict(modell.NB))) %>%
mutate(Kategorie = as.character(Topic_2digit)) %>%
mutate(RichtigKodiert = Klassifikation == Kategorie) %>%
group_by(Kategorie, RichtigKodiert) %>%
summarise(n = n()) %>%
mutate(Anteil = n/sum(n)) %>%
filter(RichtigKodiert == TRUE) %>%
left_join(labels.cats, by = "Kategorie") %>%
select(Kategorie, Label, n, Anteil)
ggplot(model.NB.classification, aes(Label, Anteil)) + geom_bar(stat = "identity") + geom_hline(yintercept = mean(model.NB.classification$Anteil), color = "blue") + ylim(0, 1) + ggtitle("Share of correctly classified texts in 26\ncontent categories with a Naive Bayes classifier") + xlab("") + ylab("") + coord_flip()
container <- create_container(convert(mydfm.nyt.trim, to = "matrix"), korpus.nyt.stats$Topic_2digit, trainSize = 1:27775, testSize = 27776:30862, virgin = FALSE)
load("data/nytimes/nyt.modelle.RData")
# primäre Modelle
#modell.SVM <- train_model(container,"SVM")
#modell.GLMNET <- train_model(container,"GLMNET")
#modell.MAXENT <- train_model(container,"MAXENT")
#modell.SLDA <- train_model(container,"SLDA")
# weitere Modelle, funktionieren z.T. nicht
#modell.BOOSTING <- train_model(container,"BOOSTING")
#modell.BAGGING <- train_model(container,"BAGGING")
#modell.RF <- train_model(container,"RF")
#modell.NNET <- train_model(container,"NNET")
#modell.TREE <- train_model(container,"TREE")
SVM_CLASSIFY <- classify_model(container, modell.SVM)
GLMNET_CLASSIFY <- classify_model(container, modell.GLMNET)
MAXENT_CLASSIFY <- classify_model(container, modell.MAXENT)
SLDA_CLASSIFY <- classify_model(container, modell.SLDA)
#BOOSTING_CLASSIFY <- classify_model(container, modell.BOOSTING)
#BAGGING_CLASSIFY <- classify_model(container, modell.BAGGING)
#RF_CLASSIFY <- classify_model(container, modell.RF)
#NNET_CLASSIFY <- classify_model(container, modell.NNET)
#TREE_CLASSIFY <- classify_model(container, modell.TREE)
analytics <- create_analytics(container, cbind(SVM_CLASSIFY, GLMNET_CLASSIFY, MAXENT_CLASSIFY, SLDA_CLASSIFY))
summary(analytics)
## ENSEMBLE SUMMARY
##
## n-ENSEMBLE COVERAGE n-ENSEMBLE RECALL
## n >= 1 1.00 0.60
## n >= 2 0.98 0.61
## n >= 3 0.77 0.69
## n >= 4 0.43 0.83
##
##
## ALGORITHM PERFORMANCE
##
## SVM_PRECISION SVM_RECALL SVM_FSCORE
## 0.5215385 0.5034615 0.4996154
## SLDA_PRECISION SLDA_RECALL SLDA_FSCORE
## 0.4988462 0.5223077 0.4857692
## GLMNET_PRECISION GLMNET_RECALL GLMNET_FSCORE
## 0.6319231 0.4350000 0.4846154
## MAXENTROPY_PRECISION MAXENTROPY_RECALL MAXENTROPY_FSCORE
## 0.4788462 0.4673077 0.4550000
topic.codes <- scan("data/nytimes/majortopics2digits.txt", what = "char", sep = "\n", quiet = T)
topic.codes <- data.frame(category = as.factor(1:length(topic.codes)), category.label = topic.codes)
algdf <- data.frame(algorithm = str_split(colnames(analytics@algorithm_summary), "_", simplify = T)[,1], measure = factor(str_split(colnames(analytics@algorithm_summary), "_", simplify = T)[,2], levels = str_split(colnames(analytics@algorithm_summary), "_", simplify = T)[1:3,2]), category = factor(rep(rownames(analytics@algorithm_summary), each = ncol(analytics@algorithm_summary)), levels = rownames(analytics@algorithm_summary)), score = as.vector(t(analytics@algorithm_summary)))
algdf <- left_join(algdf, topic.codes, by = "category")
## Warning: Column `category` joining factors with different levels, coercing
## to character vector
algdf <- filter(algdf, category.label %in% c("Civil Rights, Minority Issues, and Civil Liberties", "Education", "Environment", "Law, Crime, and Family Issues", "Macroeconomics", "Sports and Recreation"))
ggplot(algdf, aes(score, algorithm, color = measure, shape = measure)) + geom_point(size = 1.75) + facet_grid(category.label ~ ., switch = "y") + xlim(c(0,1)) + theme(strip.text.y = element_text(angle = 180)) + scale_colour_manual(values = c("lightblue", "lightgreen", "red")) + ggtitle("Precision, recall and f-score for four\nalgorithms measured across six\ncontent categories") + xlab("") + ylab("")