diff --git a/courses/ai_pubmed_cloud.R b/courses/ai_pubmed_cloud.R new file mode 100644 index 0000000..f64141b --- /dev/null +++ b/courses/ai_pubmed_cloud.R @@ -0,0 +1,159 @@ + +--- + ```{r} +library(textmineR) + +set.seed <- 30 + +data("ai_pubmed") + +ai_pubmed <- filter(ai_pubmed, abstract != "") + +idx <- sample(1:nrow(ai_pubmed), 1000) + +ai_pubmed <- ai_pubmed[idx,] + +# create a document term matrix +dtm <- CreateDtm(doc_vec = ai_pubmed$abstract, # character vector of documents + doc_names = ai_pubmed$pmid, # document names, optional + ngram_window = c(1, 2), # minimum and maximum n-gram length + stopword_vec = c(stopwords::stopwords("en"), # stopwords from tm + stopwords::stopwords(source = "smart")), # this is the default value + lower = TRUE, # lowercase - this is the default value + remove_punctuation = TRUE, # punctuation - this is the default + remove_numbers = TRUE, # numbers - this is the default + verbose = FALSE, # Turn off status bar for this demo + cpus = 4) # by default, this will be the max number of cpus available + +tf_mat <- TermDocFreq(dtm = dtm) +dtm <- dtm[, tf_mat$term_freq > 10] +``` + +```{r} +tf_mat <- TermDocFreq(dtm = dtm) +``` + + +```{r} +compute_tfidf <- function(dtm, tf_mat) { + tfidf <- t(dtm[ , tf_mat$term ]) * tf_mat$idf + t(tfidf) +} + +tfidf <-compute_tfidf(dtm, tf_mat) + + +csim <- tfidf / sqrt(rowSums(tfidf * tfidf)) +csim <- csim %*% t(csim) + + + +cdist <- as.dist(1 - csim) + + +``` + +```{r} +hc <- hclust(cdist, "ward.D") + +clustering <- cutree(hc, 10) + +``` +```{r} +p_words <- colSums(dtm) / sum(dtm) + +cluster_words <- lapply(unique(clustering), function(x){ + rows <- dtm[ clustering == x , ] + + # for memory's sake, drop all words that don't appear in the cluster + rows <- rows[ , colSums(rows) > 0 ] + + colSums(rows) / sum(rows) - p_words[ colnames(rows) ] +}) + + +cluster_words <- lapply(1:10, function(i) { + dt <- data.frame(term = names(cluster_words[[i]]), prob = cluster_words[[i]], stringsAsFactors = F) + dt$cluster <- i + dt +}) + +cluster_words <- bind_rows(cluster_words) + +bla <- cluster_words %>% + group_by(cluster) %>% + arrange(cluster, desc(prob)) %>% + filter(prob > 0) %>% + # top_n(5, prob) %>% + summarize(size = n(), top_words = paste0(term[1:5], collapse =',')) + +``` + +```{r} +cluster_summary <- data.frame(cluster = unique(clustering), + size = as.numeric(table(clustering)), + top_words = sapply(cluster_words, function(d){ + paste( + names(d)[ order(d, decreasing = TRUE) ][ 1:5 ], + collapse = ", ") + }), + stringsAsFactors = FALSE) + +cluster_summary +``` + +```{r} + +i <- 3 + + + + + + +library(ggwordcloud) + +get_word_cloud <- function(cluster_words, i) { + cluster_words %>% + filter(cluster == i) %>% + arrange(desc(prob)) %>% + top_n(20, prob) %>% + mutate(angle = 90 * sample(c(0, 1), n(), replace = TRUE, prob = c(60, 40))) %>% + ggplot( aes(label=term, size = prob^2, angle = angle)) + + geom_text_wordcloud_area(eccentricity = 1, rm_outside = TRUE) + + scale_size_area(max_size = 12) + + theme_void() +} + +plots = list() + +for (i in 1:10) { + plots[[i]] = get_word_cloud(cluster_words, i) +} + +library(patchwork) + +plots[[1]] + plots[[2]] + plots[[3]] + plots[[4]] + + plots[[5]] + plots[[6]] + plots[[7]] + plots[[8]] + + plots[[9]] + plots[[10]] + plot_layout(ncol = 3) + +``` +```{r} + +palette = rev(color_hex_palette_from_number("1294")) + + +cl1 <- cluster_words %>% + filter(cluster == 1) %>% + arrange(desc(prob)) + + +wordcloud::wordcloud(words = cl1$term, + freq = cl1$prob, + max.words = 50, + random.order = F, + colors = palette, + main = "Top words in cluster 100",) + +``` +