|
-
- ---
- ```{r}
- library(textmineR)
-
- set.seed <- 30
-
- data("ai_pubmed")
-
- ai_pubmed <- filter(ai_pubmed, abstract != "")
-
- idx <- sample(1:nrow(ai_pubmed), 1000)
-
- ai_pubmed <- ai_pubmed[idx,]
-
- # create a document term matrix
- dtm <- CreateDtm(doc_vec = ai_pubmed$abstract, # character vector of documents
- doc_names = ai_pubmed$pmid, # document names, optional
- ngram_window = c(1, 2), # minimum and maximum n-gram length
- stopword_vec = c(stopwords::stopwords("en"), # stopwords from tm
- stopwords::stopwords(source = "smart")), # this is the default value
- lower = TRUE, # lowercase - this is the default value
- remove_punctuation = TRUE, # punctuation - this is the default
- remove_numbers = TRUE, # numbers - this is the default
- verbose = FALSE, # Turn off status bar for this demo
- cpus = 4) # by default, this will be the max number of cpus available
-
- tf_mat <- TermDocFreq(dtm = dtm)
- dtm <- dtm[, tf_mat$term_freq > 10]
- ```
-
- ```{r}
- tf_mat <- TermDocFreq(dtm = dtm)
- ```
-
-
- ```{r}
- compute_tfidf <- function(dtm, tf_mat) {
- tfidf <- t(dtm[ , tf_mat$term ]) * tf_mat$idf
- t(tfidf)
- }
-
- tfidf <-compute_tfidf(dtm, tf_mat)
-
-
- csim <- tfidf / sqrt(rowSums(tfidf * tfidf))
- csim <- csim %*% t(csim)
-
-
-
- cdist <- as.dist(1 - csim)
-
-
- ```
-
- ```{r}
- hc <- hclust(cdist, "ward.D")
-
- clustering <- cutree(hc, 10)
-
- ```
- ```{r}
- p_words <- colSums(dtm) / sum(dtm)
-
- cluster_words <- lapply(unique(clustering), function(x){
- rows <- dtm[ clustering == x , ]
-
- # for memory's sake, drop all words that don't appear in the cluster
- rows <- rows[ , colSums(rows) > 0 ]
-
- colSums(rows) / sum(rows) - p_words[ colnames(rows) ]
- })
-
-
- cluster_words <- lapply(1:10, function(i) {
- dt <- data.frame(term = names(cluster_words[[i]]), prob = cluster_words[[i]], stringsAsFactors = F)
- dt$cluster <- i
- dt
- })
-
- cluster_words <- bind_rows(cluster_words)
-
- bla <- cluster_words %>%
- group_by(cluster) %>%
- arrange(cluster, desc(prob)) %>%
- filter(prob > 0) %>%
- # top_n(5, prob) %>%
- summarize(size = n(), top_words = paste0(term[1:5], collapse =','))
-
- ```
-
- ```{r}
- cluster_summary <- data.frame(cluster = unique(clustering),
- size = as.numeric(table(clustering)),
- top_words = sapply(cluster_words, function(d){
- paste(
- names(d)[ order(d, decreasing = TRUE) ][ 1:5 ],
- collapse = ", ")
- }),
- stringsAsFactors = FALSE)
-
- cluster_summary
- ```
-
- ```{r}
-
- i <- 3
-
-
-
-
-
-
- library(ggwordcloud)
-
- get_word_cloud <- function(cluster_words, i) {
- cluster_words %>%
- filter(cluster == i) %>%
- arrange(desc(prob)) %>%
- top_n(20, prob) %>%
- mutate(angle = 90 * sample(c(0, 1), n(), replace = TRUE, prob = c(60, 40))) %>%
- ggplot( aes(label=term, size = prob^2, angle = angle)) +
- geom_text_wordcloud_area(eccentricity = 1, rm_outside = TRUE) +
- scale_size_area(max_size = 12) +
- theme_void()
- }
-
- plots = list()
-
- for (i in 1:10) {
- plots[[i]] = get_word_cloud(cluster_words, i)
- }
-
- library(patchwork)
-
- plots[[1]] + plots[[2]] + plots[[3]] + plots[[4]] +
- plots[[5]] + plots[[6]] + plots[[7]] + plots[[8]] +
- plots[[9]] + plots[[10]] + plot_layout(ncol = 3)
-
- ```
- ```{r}
-
- palette = rev(color_hex_palette_from_number("1294"))
-
-
- cl1 <- cluster_words %>%
- filter(cluster == 1) %>%
- arrange(desc(prob))
-
-
- wordcloud::wordcloud(words = cl1$term,
- freq = cl1$prob,
- max.words = 50,
- random.order = F,
- colors = palette,
- main = "Top words in cluster 100",)
-
- ```
|