A re-worked version of a previous post. A very small survey of some simple, but effective approaches to text classification using R, with a focus on Naive Bayes and FastText classifiers.

1 Labeled data

For demonstration purposes, we build a corpus using the quicknews package. The corpus is composed of articles returned from a set of health-related queries; the search terms, then, serve as classification labels. An imperfect annotation process, but fine for our purposes here, as a form of “distant” supervision.

library(tidyverse)

terms <- c('heart disease',
           'diabetes',
           'mental health',
           'substance abuse',
           'obesity',
           'kidney disease')

# Build a Google News RSS feed per query, then extract its metadata
rss <- lapply(terms, function(x) {
  quicknews::qnews_build_rss(x) %>%
    quicknews::qnews_strip_rss() })
names(rss) <- terms

rss0 <- rss %>%
  bind_rows(.id = 'term') %>%
  mutate(term = gsub(' ', '_', term)) %>%
  distinct(link, .keep_all = TRUE) %>%  # drop articles returned by multiple queries
  mutate(doc_id = as.character(row_number())) %>%
  mutate(term = as.factor(term)) %>%
  select(doc_id, term:link)
articles <- quicknews::qnews_extract_article(url = rss0$link, cores = 7)
articles0 <- articles %>% left_join(rss0)
Descriptives for the resulting corpus by search term. So, a super small demo corpus.
articles0 %>%
  mutate(words = tokenizers::count_words(text)) %>%
  group_by(term) %>%
  summarize(n = n(), words = sum(words)) %>%
  janitor::adorn_totals() %>%
  knitr::kable()
| term            |   n |  words |
|:----------------|----:|-------:|
| diabetes        |  81 |  72045 |
| heart_disease   |  82 |  65505 |
| kidney_disease  |  80 |  68379 |
| mental_health   |  82 |  73804 |
| obesity         |  81 |  69179 |
| substance_abuse |  84 |  64133 |
| Total           | 490 | 413045 |
A sample of articles from the GoogleNews/quicknews query:
set.seed(99)
articles0 %>%
  select(term, date, source, title) %>%
  sample_n(5) %>%
  knitr::kable()
| term            | date       | source                 | title |
|:----------------|:-----------|:-----------------------|:------|
| kidney_disease  | 2022-05-23 | Reuters.com            | U.S. Task Force to consider routine kidney disease screening as new drugs available |
| substance_abuse | 2022-06-15 | The Sylva Herald       | District Court \| Court News \| thesylvaherald.com |
| substance_abuse | 2022-05-28 | WUSF News              | Northeast Florida’s health care concerns: from substance abuse to food deserts |
| heart_disease   | 2022-05-23 | Bloomberg              | Take Breaks and Watch Less TV to Slash Heart Disease Risk, Experts Say |
| substance_abuse | 2022-06-14 | Williston Daily Herald | Report: U.S. records highest ever rates of substance abuse, suicide-related deaths |
2 Data structures
2.1 Document-Term Matrix
Documents represented as bags-of-words, weighted by tf-idf.
dtm <- articles0 %>%
  mutate(wds = tokenizers::count_words(text)) %>%
  filter(wds > 200 & wds < 1500) %>%  # drop very short & very long articles
  text2df::tif2token() %>%
  text2df::token2df() %>%
  mutate(token = tolower(token))
  # mutate(stem = quanteda::char_wordstem(token))

dtm %>% head() %>% knitr::kable()
| doc_id | token   | token_id |
|:-------|:--------|---------:|
| 2      | heart   |        1 |
| 2      | disease |        2 |
| 2      | is      |        3 |
| 2      | a       |        4 |
| 2      | very    |        5 |
| 2      | general |        6 |
dtm_tok <- dtm %>%
  count(doc_id, token) %>%
  group_by(token) %>%
  mutate(docf = length(unique(doc_id))) %>% ungroup() %>%
  # document frequency as a proportion of all documents
  mutate(docf = round(docf / length(unique(doc_id)), 3)) %>%
  # filter very rare & very common tokens, numbers, punctuation
  filter(docf >= 0.01 & docf < 0.5 &
           !grepl('^[0-9]|^[[:punct:]]', token))

dtm_tok %>% head() %>% knitr::kable()
| doc_id | token       |  n |  docf |
|:-------|:------------|---:|------:|
| 10     | acknowledge |  1 | 0.031 |
| 10     | across      |  1 | 0.207 |
| 10     | additional  |  1 | 0.151 |
| 10     | address     |  3 | 0.217 |
| 10     | adhere      |  1 | 0.018 |
| 10     | ads         |  1 | 0.026 |
dtm_sparse <- dtm_tok %>%
  tidytext::bind_tf_idf(term = token,
                        document = doc_id,
                        n = n) %>%
  tidytext::cast_sparse(row = doc_id,
                        column = token,
                        value = tf_idf)
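A quick sanity check on the resulting matrix (a sketch; exact dimensions depend on whatever the queries happen to return):

# documents x vocabulary
dim(dtm_sparse)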
2.2 Cleaned text

A whitespace-delimited, lower-cased version of each document, one per row; fastText (below) expects pre-tokenized plain text in exactly this shape.
ctext <- dtm %>%
  group_by(doc_id) %>%
  summarize(text = paste0(token, collapse = ' ')) %>%
  ungroup()
strwrap(ctext$text[5], width = 60)[1:5]
[1] "upon receiving a diagnosis of type 1 diabetes ( t1d ) ,"
[2] "many people have the same reaction : “ but why me ? ” some"
[3] "people have t1d that runs in their family , while others"
[4] "have no idea how or why they received a diagnosis . often ,"
[5] "to their frustration , those questions go unanswered . but"
2.3 Word embeddings

Custom skip-gram embeddings trained on the cleaned demo corpus via fastText.
## devtools::install_github("pommedeterresautee/fastrtext")
tmp_file_txt <- tempfile()
tmp_file_model <- tempfile()
writeLines(text = ctext$text, con = tmp_file_txt)

dims <- 25
window <- 5

# Train unsupervised skip-gram embeddings on the cleaned corpus
fastrtext::execute(commands = c("skipgram",
                                "-input", tmp_file_txt,
                                "-output", tmp_file_model,
                                "-dim", dims,
                                "-ws", window,
                                "-verbose", 1))
Read 0M words
Number of words: 5318
Number of labels: 0
Progress: 100.0% words/sec/thread: 121781 lr: 0.000000 avg.loss: 2.421614 ETA: 0h 0m 0s
fast.model <- fastrtext::load_model(tmp_file_model)
add .bin extension to the path
fast.dict <- fastrtext::get_dictionary(fast.model)
embeddings <- fastrtext::get_word_vectors(fast.model, fast.dict)
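As a rough check that the embeddings have picked up something sensible, we can inspect nearest neighbors by cosine similarity. A minimal base-R sketch; the probe word 'insulin' is an illustrative choice, and assumes the token made it into the model's vocabulary:

# cosine similarity of one word's vector to every row of the matrix
cos_sim <- function(m, word) {
  v <- m[word, ]
  sims <- (m %*% v) / (sqrt(rowSums(m^2)) * sqrt(sum(v^2)))
  sort(sims[, 1], decreasing = TRUE)
}
head(cos_sim(embeddings, 'insulin'), 8)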
3 Classifiers
A stratified 70/30 train/test split, held constant across the three classifiers:

articles1 <- articles0 %>%
  arrange(doc_id) %>%
  filter(doc_id %in% unique(dtm_tok$doc_id))

set.seed(99)
trainIndex <- caret::createDataPartition(articles1$term, p = .7)$Resample1
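createDataPartition samples within each level of term, so class proportions should carry over to the training set. A quick check (counts will vary with seed and corpus):

table(articles1$term[trainIndex])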
3.1 Bag-of-words & Naive Bayes
Documents represented as tf-idf-weighted bags-of-words (the sparse matrix built above).
dtm_train <- dtm_sparse[trainIndex, ]
dtm_test <- dtm_sparse[-trainIndex, ]

dtm_classifier <- e1071::naiveBayes(as.matrix(dtm_train),
                                    articles1[trainIndex, ]$term,
                                    laplace = 0.5)

dtm_predicted <- predict(dtm_classifier, as.matrix(dtm_test))
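Worth noting: for numeric predictors, e1071::naiveBayes fits a Gaussian per feature and class, so the tf-idf weights are being modeled as (roughly) normal within each topic. The fitted per-class means and standard deviations can be inspected directly; a peek at the first feature:

# each element of $tables holds class-wise means & sds for one feature
dtm_classifier$tables[[1]]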
3.2 Word embeddings & Naive Bayes
Documents represented as an aggregate (here, the mean) of their constituent word embeddings, using the custom FastText embeddings derived from the quicknews corpus above.
v1 <- embeddings %>%
  data.frame() %>%
  mutate(token = rownames(embeddings)) %>%
  filter(token %in% unique(dtm_tok$token)) %>%
  inner_join(dtm)
avg0 <- lapply(unique(dtm$doc_id), function(y){
  d0 <- subset(v1, doc_id == y)
  d1 <- as.matrix(d0[, 1:dims])
  # mean-pool the token vectors into a single document vector
  d2 <- Matrix.utils::aggregate.Matrix(d1,
                                       groupings = rep(y, nrow(d0)),
                                       fun = 'mean')
  as.matrix(d2)
})

doc_embeddings <- do.call(rbind, avg0)
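An aside: since each iteration aggregates a single document, the Matrix.utils dependency (since archived on CRAN) isn't strictly necessary; a base-R sketch of the same mean-pooling, preserving the doc_id ordering used above:

doc_embeddings2 <- t(vapply(unique(dtm$doc_id),
                            function(y) colMeans(as.matrix(v1[v1$doc_id == y, 1:dims])),
                            numeric(dims)))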
emb_train <- doc_embeddings[trainIndex, ]
emb_test <- doc_embeddings[-trainIndex, ]

emb_classifier <- e1071::naiveBayes(as.matrix(emb_train),
                                    articles1[trainIndex, ]$term,
                                    laplace = 0.5)

emb_predicted <- predict(emb_classifier, as.matrix(emb_test))
3.3 FastText classifier
fast_train <- articles1[trainIndex, ]
fast_test <- articles1[-trainIndex, ]
Prepare data for FastText:
tmp_file_model <- tempfile()

# fastText format: "__label__<class>" prefix, then the document text
train_labels <- paste0("__label__", fast_train$term)
train_texts <- tolower(fast_train$text)
train_to_write <- paste(train_labels, train_texts)

train_tmp_file_txt <- tempfile()
writeLines(text = train_to_write, con = train_tmp_file_txt)

test_labels <- paste0("__label__", fast_test$term)
test_texts <- tolower(fast_test$text)
test_to_write <- paste(test_labels, test_texts)
fastrtext::execute(commands = c("supervised",
                                "-input", train_tmp_file_txt,
                                "-output", tmp_file_model,
                                "-dim", 25,
                                "-lr", 1,
                                "-epoch", 20,
                                "-wordNgrams", 2,
                                "-verbose", 1))

model <- fastrtext::load_model(tmp_file_model)

# predictions come back as a list of named probabilities;
# the names are the predicted labels
fast_predicted0 <- predict(model, sentences = test_to_write)
fast_predicted <- as.factor(names(unlist(fast_predicted0)))
4 Evaluation
predictions <- list('BOW' = dtm_predicted,
                    'Word embeddings' = emb_predicted,
                    'FastText' = fast_predicted)
4.1 Model accuracy
conf_mats <- lapply(predictions,
                    caret::confusionMatrix,
                    reference = articles1[-trainIndex, ]$term)

# element 3 of a confusionMatrix object is $overall
sums <- lapply(conf_mats, '[[', 3)

sums0 <- as.data.frame(do.call(rbind, sums)) %>%
  select(1:4) %>%
  mutate_at(1:4, round, 3)

sums0 %>% arrange(desc(Accuracy)) %>% knitr::kable()
|                 | Accuracy | Kappa | AccuracyLower | AccuracyUpper |
|:----------------|---------:|------:|--------------:|--------------:|
| FastText        |    0.757 | 0.708 |         0.668 |         0.832 |
| BOW             |    0.661 | 0.593 |         0.567 |         0.747 |
| Word embeddings |    0.452 | 0.341 |         0.359 |         0.548 |
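For context, a majority-class baseline sits near chance here given the roughly balanced labels; caret reports this as AccuracyNull in each confusionMatrix. A sketch of the same figure computed by hand:

# proportion of the most frequent class in the test set
max(table(articles1[-trainIndex, ]$term)) /
  length(articles1[-trainIndex, ]$term)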
4.2 FastText classifier: Model accuracy by class
conf_mats[['FastText']]$byClass %>% data.frame() %>%
  select(Sensitivity, Specificity, Balanced.Accuracy) %>%
  rownames_to_column(var = 'topic') %>%
  mutate(topic = gsub('Class: ', '', topic)) %>%
  mutate_if(is.numeric, round, 2) %>%
  knitr::kable()
| topic           | Sensitivity | Specificity | Balanced.Accuracy |
|:----------------|------------:|------------:|------------------:|
| diabetes        |        0.75 |        1.00 |              0.88 |
| heart_disease   |        0.56 |        0.99 |              0.77 |
| kidney_disease  |        0.89 |        0.86 |              0.88 |
| mental_health   |        0.89 |        0.96 |              0.93 |
| obesity         |        0.74 |        0.93 |              0.83 |
| substance_abuse |        0.70 |        0.97 |              0.83 |
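Balanced accuracy is simply the mean of sensitivity and specificity; e.g., for the diabetes class:

(0.75 + 1.00) / 2  # 0.875, shown rounded as 0.88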
4.3 FastText classifier: Confusion matrix
dp <- as.data.frame(conf_mats[['FastText']]$table)

ggplot(data = dp,
       aes(x = Reference, y = Prediction)) +
  geom_tile(aes(fill = log(Freq)),  # log scale tempers the dominant diagonal
            colour = "white") +
  scale_fill_gradient(low = "white",
                      high = "steelblue") +
  geom_text(data = dp,
            aes(x = Reference,
                y = Prediction,
                label = Freq)) +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 45,
                                   hjust = 1)) +
  labs(title = "Confusion Matrix")