|
@@ -0,0 +1,159 @@ |
|
|
|
|
|
|
|
|
|
|
|
--- |
|
|
|
|
|
```{r} |
|
|
|
|
|
library(textmineR) |
|
|
|
|
|
|
|
|
|
|
|
set.seed <- 30 |
|
|
|
|
|
|
|
|
|
|
|
data("ai_pubmed") |
|
|
|
|
|
|
|
|
|
|
|
ai_pubmed <- filter(ai_pubmed, abstract != "") |
|
|
|
|
|
|
|
|
|
|
|
idx <- sample(1:nrow(ai_pubmed), 1000) |
|
|
|
|
|
|
|
|
|
|
|
ai_pubmed <- ai_pubmed[idx,] |
|
|
|
|
|
|
|
|
|
|
|
# create a document term matrix |
|
|
|
|
|
dtm <- CreateDtm(doc_vec = ai_pubmed$abstract, # character vector of documents |
|
|
|
|
|
doc_names = ai_pubmed$pmid, # document names, optional |
|
|
|
|
|
ngram_window = c(1, 2), # minimum and maximum n-gram length |
|
|
|
|
|
stopword_vec = c(stopwords::stopwords("en"), # stopwords from tm |
|
|
|
|
|
stopwords::stopwords(source = "smart")), # this is the default value |
|
|
|
|
|
lower = TRUE, # lowercase - this is the default value |
|
|
|
|
|
remove_punctuation = TRUE, # punctuation - this is the default |
|
|
|
|
|
remove_numbers = TRUE, # numbers - this is the default |
|
|
|
|
|
verbose = FALSE, # Turn off status bar for this demo |
|
|
|
|
|
cpus = 4) # by default, this will be the max number of cpus available |
|
|
|
|
|
|
|
|
|
|
|
tf_mat <- TermDocFreq(dtm = dtm) |
|
|
|
|
|
dtm <- dtm[, tf_mat$term_freq > 10] |
|
|
|
|
|
``` |
|
|
|
|
|
|
|
|
|
|
|
```{r} |
|
|
|
|
|
tf_mat <- TermDocFreq(dtm = dtm) |
|
|
|
|
|
``` |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
```{r} |
|
|
|
|
|
compute_tfidf <- function(dtm, tf_mat) { |
|
|
|
|
|
tfidf <- t(dtm[ , tf_mat$term ]) * tf_mat$idf |
|
|
|
|
|
t(tfidf) |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
tfidf <-compute_tfidf(dtm, tf_mat) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
csim <- tfidf / sqrt(rowSums(tfidf * tfidf)) |
|
|
|
|
|
csim <- csim %*% t(csim) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cdist <- as.dist(1 - csim) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
``` |
|
|
|
|
|
|
|
|
|
|
|
```{r} |
|
|
|
|
|
hc <- hclust(cdist, "ward.D") |
|
|
|
|
|
|
|
|
|
|
|
clustering <- cutree(hc, 10) |
|
|
|
|
|
|
|
|
|
|
|
``` |
|
|
|
|
|
```{r} |
|
|
|
|
|
p_words <- colSums(dtm) / sum(dtm) |
|
|
|
|
|
|
|
|
|
|
|
cluster_words <- lapply(unique(clustering), function(x){ |
|
|
|
|
|
rows <- dtm[ clustering == x , ] |
|
|
|
|
|
|
|
|
|
|
|
# for memory's sake, drop all words that don't appear in the cluster |
|
|
|
|
|
rows <- rows[ , colSums(rows) > 0 ] |
|
|
|
|
|
|
|
|
|
|
|
colSums(rows) / sum(rows) - p_words[ colnames(rows) ] |
|
|
|
|
|
}) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cluster_words <- lapply(1:10, function(i) { |
|
|
|
|
|
dt <- data.frame(term = names(cluster_words[[i]]), prob = cluster_words[[i]], stringsAsFactors = F) |
|
|
|
|
|
dt$cluster <- i |
|
|
|
|
|
dt |
|
|
|
|
|
}) |
|
|
|
|
|
|
|
|
|
|
|
cluster_words <- bind_rows(cluster_words) |
|
|
|
|
|
|
|
|
|
|
|
bla <- cluster_words %>% |
|
|
|
|
|
group_by(cluster) %>% |
|
|
|
|
|
arrange(cluster, desc(prob)) %>% |
|
|
|
|
|
filter(prob > 0) %>% |
|
|
|
|
|
# top_n(5, prob) %>% |
|
|
|
|
|
summarize(size = n(), top_words = paste0(term[1:5], collapse =',')) |
|
|
|
|
|
|
|
|
|
|
|
``` |
|
|
|
|
|
|
|
|
|
|
|
```{r} |
|
|
|
|
|
cluster_summary <- data.frame(cluster = unique(clustering), |
|
|
|
|
|
size = as.numeric(table(clustering)), |
|
|
|
|
|
top_words = sapply(cluster_words, function(d){ |
|
|
|
|
|
paste( |
|
|
|
|
|
names(d)[ order(d, decreasing = TRUE) ][ 1:5 ], |
|
|
|
|
|
collapse = ", ") |
|
|
|
|
|
}), |
|
|
|
|
|
stringsAsFactors = FALSE) |
|
|
|
|
|
|
|
|
|
|
|
cluster_summary |
|
|
|
|
|
``` |
|
|
|
|
|
|
|
|
|
|
|
```{r} |
|
|
|
|
|
|
|
|
|
|
|
i <- 3 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
library(ggwordcloud) |
|
|
|
|
|
|
|
|
|
|
|
get_word_cloud <- function(cluster_words, i) { |
|
|
|
|
|
cluster_words %>% |
|
|
|
|
|
filter(cluster == i) %>% |
|
|
|
|
|
arrange(desc(prob)) %>% |
|
|
|
|
|
top_n(20, prob) %>% |
|
|
|
|
|
mutate(angle = 90 * sample(c(0, 1), n(), replace = TRUE, prob = c(60, 40))) %>% |
|
|
|
|
|
ggplot( aes(label=term, size = prob^2, angle = angle)) + |
|
|
|
|
|
geom_text_wordcloud_area(eccentricity = 1, rm_outside = TRUE) + |
|
|
|
|
|
scale_size_area(max_size = 12) + |
|
|
|
|
|
theme_void() |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
plots = list() |
|
|
|
|
|
|
|
|
|
|
|
for (i in 1:10) { |
|
|
|
|
|
plots[[i]] = get_word_cloud(cluster_words, i) |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
library(patchwork) |
|
|
|
|
|
|
|
|
|
|
|
plots[[1]] + plots[[2]] + plots[[3]] + plots[[4]] + |
|
|
|
|
|
plots[[5]] + plots[[6]] + plots[[7]] + plots[[8]] + |
|
|
|
|
|
plots[[9]] + plots[[10]] + plot_layout(ncol = 3) |
|
|
|
|
|
|
|
|
|
|
|
``` |
|
|
|
|
|
```{r} |
|
|
|
|
|
|
|
|
|
|
|
palette = rev(color_hex_palette_from_number("1294")) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cl1 <- cluster_words %>% |
|
|
|
|
|
filter(cluster == 1) %>% |
|
|
|
|
|
arrange(desc(prob)) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wordcloud::wordcloud(words = cl1$term, |
|
|
|
|
|
freq = cl1$prob, |
|
|
|
|
|
max.words = 50, |
|
|
|
|
|
random.order = F, |
|
|
|
|
|
colors = palette, |
|
|
|
|
|
main = "Top words in cluster 100",) |
|
|
|
|
|
|
|
|
|
|
|
``` |
|
|
|
|
|
|