---

```{r}
# Load libraries and prepare a 1,000-document sample of PubMed AI abstracts.
library(textmineR)
library(dplyr)   # filter(), %>%, group_by(), bind_rows() are used below

# BUG FIX: the original did `set.seed <- 30`, which creates a variable named
# `set.seed` instead of seeding the RNG. Call the function so the sample()
# below is reproducible.
set.seed(30)

data("ai_pubmed")
ai_pubmed <- filter(ai_pubmed, abstract != "")
idx <- sample(seq_len(nrow(ai_pubmed)), 1000)
ai_pubmed <- ai_pubmed[idx, ]

# Create a document-term matrix of unigrams and bigrams.
dtm <- CreateDtm(doc_vec = ai_pubmed$abstract,  # character vector of documents
                 doc_names = ai_pubmed$pmid,    # document names, optional
                 ngram_window = c(1, 2),        # minimum and maximum n-gram length
                 stopword_vec = c(stopwords::stopwords("en"),
                                  stopwords::stopwords(source = "smart")),
                 lower = TRUE,                  # lowercase - default value
                 remove_punctuation = TRUE,     # default value
                 remove_numbers = TRUE,         # default value
                 verbose = FALSE,               # turn off status bar for this demo
                 cpus = 4)                      # by default, the max cpus available

# Drop rare terms (corpus frequency <= 10) to shrink the vocabulary.
tf_mat <- TermDocFreq(dtm = dtm)
dtm <- dtm[, tf_mat$term_freq > 10]
```

```{r}
# Recompute term statistics on the pruned vocabulary so the IDF values
# reflect the filtered matrix.
tf_mat <- TermDocFreq(dtm = dtm)
```

```{r}
# TF-IDF weight the document-term matrix, then compute cosine distance
# between documents.
compute_tfidf <- function(dtm, tf_mat) {
  tfidf <- t(dtm[, tf_mat$term]) * tf_mat$idf
  t(tfidf)
}

tfidf <- compute_tfidf(dtm, tf_mat)

# Cosine similarity: L2-normalize the rows, then take the cross-product.
csim <- tfidf / sqrt(rowSums(tfidf * tfidf))
csim <- csim %*% t(csim)
cdist <- as.dist(1 - csim)
```

```{r}
# Hierarchical clustering (Ward linkage) cut into a fixed number of clusters.
# Parameterized so the cluster count is not hard-coded in several places.
n_clusters <- 10
hc <- hclust(cdist, "ward.D")
clustering <- cutree(hc, n_clusters)
```

```{r}
# Score each word in each cluster by how over-represented it is relative to
# the corpus-wide word probabilities.
p_words <- colSums(dtm) / sum(dtm)

# BUG FIX: the original reused the name `cluster_words` for both the list of
# named numeric vectors AND the bound data frame; the `cluster_summary`
# chunk then tried to use the (overwritten) list. Keep them as two objects.
cluster_word_list <- lapply(unique(clustering), function(x) {
  rows <- dtm[clustering == x, ]
  # For memory's sake, drop all words that don't appear in the cluster.
  rows <- rows[, colSums(rows) > 0]
  colSums(rows) / sum(rows) - p_words[colnames(rows)]
})

cluster_words <- lapply(seq_len(n_clusters), function(i) {
  data.frame(term = names(cluster_word_list[[i]]),
             prob = cluster_word_list[[i]],
             cluster = i,
             stringsAsFactors = FALSE)
}) %>%
  bind_rows()

# Per-cluster size and a comma-joined string of the top five words.
top_words_by_cluster <- cluster_words %>%
  group_by(cluster) %>%
  arrange(cluster, desc(prob)) %>%
  filter(prob > 0) %>%
  summarize(size = n(),
            top_words = paste0(term[1:5], collapse = ","))
```

```{r}
# Summarize each cluster: its size and five most over-represented words,
# computed from the list of named score vectors.
cluster_summary <- data.frame(
  cluster = unique(clustering),
  size = as.numeric(table(clustering)),
  top_words = sapply(cluster_word_list, function(d) {
    paste(names(d)[order(d, decreasing = TRUE)][1:5], collapse = ", ")
  }),
  stringsAsFactors = FALSE
)

cluster_summary
```

```{r}
library(ggwordcloud)
library(patchwork)

# Word cloud of the 20 most over-represented terms in cluster `i`; roughly
# 40% of the words are rotated 90 degrees for visual variety.
get_word_cloud <- function(cluster_words, i) {
  cluster_words %>%
    filter(cluster == i) %>%
    arrange(desc(prob)) %>%
    top_n(20, prob) %>%
    mutate(angle = 90 * sample(c(0, 1), n(), replace = TRUE, prob = c(60, 40))) %>%
    ggplot(aes(label = term, size = prob^2, angle = angle)) +
    geom_text_wordcloud_area(eccentricity = 1, rm_outside = TRUE) +
    scale_size_area(max_size = 12) +
    theme_void()
}

# Build all plots functionally instead of growing a list in a for loop, and
# let patchwork lay them out (equivalent to chaining `+` ten times).
plots <- lapply(seq_len(n_clusters), function(i) get_word_cloud(cluster_words, i))
wrap_plots(plots, ncol = 3)
```

```{r}
# NOTE(review): `color_hex_palette_from_number()` is not provided by any
# package loaded above — confirm where it is defined before running.
palette <- rev(color_hex_palette_from_number("1294"))

cl1 <- cluster_words %>%
  filter(cluster == 1) %>%
  arrange(desc(prob))

# BUG FIX: the original call ended with a trailing comma (a syntax error in
# R), claimed "cluster 100" in the title, and passed `main =`, which
# wordcloud() does not accept. Add the title with title() instead.
wordcloud::wordcloud(words = cl1$term,
                     freq = cl1$prob,
                     max.words = 50,
                     random.order = FALSE,
                     colors = palette)
title("Top words in cluster 1")
```