diff --git a/courses/ai_pubmed_cloud.R b/courses/ai_pubmed_cloud.R
new file mode 100644
index 0000000..f64141b
--- /dev/null
+++ b/courses/ai_pubmed_cloud.R
@@ -0,0 +1,159 @@
+
+---
+  ```{r}
+library(textmineR)
+
+set.seed <- 30
+
+data("ai_pubmed")
+
+ai_pubmed <- filter(ai_pubmed, abstract != "")
+
+idx <- sample(1:nrow(ai_pubmed), 1000)
+
+ai_pubmed <- ai_pubmed[idx,]
+
+# create a document term matrix 
+dtm <- CreateDtm(doc_vec = ai_pubmed$abstract, # character vector of documents
+                 doc_names = ai_pubmed$pmid, # document names, optional
+                 ngram_window = c(1, 2), # minimum and maximum n-gram length
+                 stopword_vec = c(stopwords::stopwords("en"), # stopwords from tm
+                                  stopwords::stopwords(source = "smart")), # this is the default value
+                 lower = TRUE, # lowercase - this is the default value
+                 remove_punctuation = TRUE, # punctuation - this is the default
+                 remove_numbers = TRUE, # numbers - this is the default
+                 verbose = FALSE, # Turn off status bar for this demo
+                 cpus = 4) # by default, this will be the max number of cpus available
+
+tf_mat <- TermDocFreq(dtm = dtm)
+dtm <- dtm[, tf_mat$term_freq > 10]
+```
+
+```{r}
+tf_mat <- TermDocFreq(dtm = dtm)
+```
+
+
+```{r}
+compute_tfidf <- function(dtm, tf_mat) {
+  tfidf <- t(dtm[ , tf_mat$term ]) * tf_mat$idf
+  t(tfidf)
+}
+
+tfidf <-compute_tfidf(dtm, tf_mat)
+
+
+csim <- tfidf / sqrt(rowSums(tfidf * tfidf))
+csim <- csim %*% t(csim)
+
+
+
+cdist <- as.dist(1 - csim)
+
+
+```
+
+```{r}
+hc <- hclust(cdist, "ward.D")
+
+clustering <- cutree(hc, 10)
+
+```
+```{r}
+p_words <- colSums(dtm) / sum(dtm)
+
+cluster_words <- lapply(unique(clustering), function(x){
+  rows <- dtm[ clustering == x , ]
+  
+  # for memory's sake, drop all words that don't appear in the cluster
+  rows <- rows[ , colSums(rows) > 0 ]
+  
+  colSums(rows) / sum(rows) - p_words[ colnames(rows) ]
+})
+
+
+cluster_words <- lapply(1:10, function(i) {
+  dt <- data.frame(term = names(cluster_words[[i]]), prob = cluster_words[[i]], stringsAsFactors = F) 
+  dt$cluster <- i
+  dt
+})
+
+cluster_words  <- bind_rows(cluster_words)
+
+bla <- cluster_words %>%
+  group_by(cluster) %>%
+  arrange(cluster, desc(prob))  %>%
+  filter(prob > 0) %>%
+  #  top_n(5, prob) %>%
+  summarize(size = n(), top_words = paste0(term[1:5], collapse =','))
+
+```
+
+```{r}
+cluster_summary <- data.frame(cluster = unique(clustering),
+                              size = as.numeric(table(clustering)),
+                              top_words = sapply(cluster_words, function(d){
+                                paste(
+                                  names(d)[ order(d, decreasing = TRUE) ][ 1:5 ], 
+                                  collapse = ", ")
+                              }),
+                              stringsAsFactors = FALSE)
+
+cluster_summary
+```
+
+```{r}
+
+i <- 3
+
+
+
+
+
+
+library(ggwordcloud)
+
+get_word_cloud <- function(cluster_words, i) {
+  cluster_words %>%
+    filter(cluster == i) %>%
+    arrange(desc(prob))  %>%
+    top_n(20, prob) %>%
+    mutate(angle = 90 * sample(c(0, 1), n(), replace = TRUE, prob = c(60, 40))) %>%
+    ggplot( aes(label=term, size = prob^2, angle = angle)) + 
+    geom_text_wordcloud_area(eccentricity = 1, rm_outside = TRUE)  +
+    scale_size_area(max_size = 12) +
+    theme_void()
+}
+
+plots = list()
+
+for (i in 1:10) {
+  plots[[i]] = get_word_cloud(cluster_words, i)
+}
+
+library(patchwork)
+
+plots[[1]] + plots[[2]] + plots[[3]] + plots[[4]] +
+  plots[[5]] + plots[[6]] + plots[[7]] + plots[[8]] + 
+  plots[[9]] + plots[[10]] + plot_layout(ncol = 3) 
+
+```
+```{r}
+
+palette = rev(color_hex_palette_from_number("1294"))
+
+
+cl1 <- cluster_words %>%
+  filter(cluster == 1) %>%
+  arrange(desc(prob)) 
+
+
+wordcloud::wordcloud(words = cl1$term, 
+                     freq = cl1$prob, 
+                     max.words = 50, 
+                     random.order = F,
+                     colors =  palette,
+                     main = "Top words in cluster 100",)
+
+```
+