You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

ai_pubmed_cloud.R 3.7KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159
  1. ---
  2. ```{r}
  3. library(textmineR)
  4. set.seed <- 30
  5. data("ai_pubmed")
  6. ai_pubmed <- filter(ai_pubmed, abstract != "")
  7. idx <- sample(1:nrow(ai_pubmed), 1000)
  8. ai_pubmed <- ai_pubmed[idx,]
  9. # create a document term matrix
  10. dtm <- CreateDtm(doc_vec = ai_pubmed$abstract, # character vector of documents
  11. doc_names = ai_pubmed$pmid, # document names, optional
  12. ngram_window = c(1, 2), # minimum and maximum n-gram length
  13. stopword_vec = c(stopwords::stopwords("en"), # stopwords from tm
  14. stopwords::stopwords(source = "smart")), # this is the default value
  15. lower = TRUE, # lowercase - this is the default value
  16. remove_punctuation = TRUE, # punctuation - this is the default
  17. remove_numbers = TRUE, # numbers - this is the default
  18. verbose = FALSE, # Turn off status bar for this demo
  19. cpus = 4) # by default, this will be the max number of cpus available
  20. tf_mat <- TermDocFreq(dtm = dtm)
  21. dtm <- dtm[, tf_mat$term_freq > 10]
  22. ```
  23. ```{r}
  24. tf_mat <- TermDocFreq(dtm = dtm)
  25. ```
  26. ```{r}
  27. compute_tfidf <- function(dtm, tf_mat) {
  28. tfidf <- t(dtm[ , tf_mat$term ]) * tf_mat$idf
  29. t(tfidf)
  30. }
  31. tfidf <-compute_tfidf(dtm, tf_mat)
  32. csim <- tfidf / sqrt(rowSums(tfidf * tfidf))
  33. csim <- csim %*% t(csim)
  34. cdist <- as.dist(1 - csim)
  35. ```
  36. ```{r}
  37. hc <- hclust(cdist, "ward.D")
  38. clustering <- cutree(hc, 10)
  39. ```
  40. ```{r}
  41. p_words <- colSums(dtm) / sum(dtm)
  42. cluster_words <- lapply(unique(clustering), function(x){
  43. rows <- dtm[ clustering == x , ]
  44. # for memory's sake, drop all words that don't appear in the cluster
  45. rows <- rows[ , colSums(rows) > 0 ]
  46. colSums(rows) / sum(rows) - p_words[ colnames(rows) ]
  47. })
  48. cluster_words <- lapply(1:10, function(i) {
  49. dt <- data.frame(term = names(cluster_words[[i]]), prob = cluster_words[[i]], stringsAsFactors = F)
  50. dt$cluster <- i
  51. dt
  52. })
  53. cluster_words <- bind_rows(cluster_words)
  54. bla <- cluster_words %>%
  55. group_by(cluster) %>%
  56. arrange(cluster, desc(prob)) %>%
  57. filter(prob > 0) %>%
  58. # top_n(5, prob) %>%
  59. summarize(size = n(), top_words = paste0(term[1:5], collapse =','))
  60. ```
  61. ```{r}
  62. cluster_summary <- data.frame(cluster = unique(clustering),
  63. size = as.numeric(table(clustering)),
  64. top_words = sapply(cluster_words, function(d){
  65. paste(
  66. names(d)[ order(d, decreasing = TRUE) ][ 1:5 ],
  67. collapse = ", ")
  68. }),
  69. stringsAsFactors = FALSE)
  70. cluster_summary
  71. ```
  72. ```{r}
  73. i <- 3
  74. library(ggwordcloud)
  75. get_word_cloud <- function(cluster_words, i) {
  76. cluster_words %>%
  77. filter(cluster == i) %>%
  78. arrange(desc(prob)) %>%
  79. top_n(20, prob) %>%
  80. mutate(angle = 90 * sample(c(0, 1), n(), replace = TRUE, prob = c(60, 40))) %>%
  81. ggplot( aes(label=term, size = prob^2, angle = angle)) +
  82. geom_text_wordcloud_area(eccentricity = 1, rm_outside = TRUE) +
  83. scale_size_area(max_size = 12) +
  84. theme_void()
  85. }
  86. plots = list()
  87. for (i in 1:10) {
  88. plots[[i]] = get_word_cloud(cluster_words, i)
  89. }
  90. library(patchwork)
  91. plots[[1]] + plots[[2]] + plots[[3]] + plots[[4]] +
  92. plots[[5]] + plots[[6]] + plots[[7]] + plots[[8]] +
  93. plots[[9]] + plots[[10]] + plot_layout(ncol = 3)
  94. ```
  95. ```{r}
  96. palette = rev(color_hex_palette_from_number("1294"))
  97. cl1 <- cluster_words %>%
  98. filter(cluster == 1) %>%
  99. arrange(desc(prob))
  100. wordcloud::wordcloud(words = cl1$term,
  101. freq = cl1$prob,
  102. max.words = 50,
  103. random.order = F,
  104. colors = palette,
  105. main = "Top words in cluster 100",)
  106. ```