Antoine Neuraz пре 5 година
родитељ
комит
5ff4425c52
1 измењених фајлова са 159 додато и 0 уклоњено
  1. +159
    -0
      courses/ai_pubmed_cloud.R

+ 159
- 0
courses/ai_pubmed_cloud.R Прегледај датотеку

@@ -0,0 +1,159 @@

---
```{r}
library(textmineR)

set.seed <- 30

data("ai_pubmed")

ai_pubmed <- filter(ai_pubmed, abstract != "")

idx <- sample(1:nrow(ai_pubmed), 1000)

ai_pubmed <- ai_pubmed[idx,]

# create a document term matrix
dtm <- CreateDtm(doc_vec = ai_pubmed$abstract, # character vector of documents
doc_names = ai_pubmed$pmid, # document names, optional
ngram_window = c(1, 2), # minimum and maximum n-gram length
stopword_vec = c(stopwords::stopwords("en"), # stopwords from tm
stopwords::stopwords(source = "smart")), # this is the default value
lower = TRUE, # lowercase - this is the default value
remove_punctuation = TRUE, # punctuation - this is the default
remove_numbers = TRUE, # numbers - this is the default
verbose = FALSE, # Turn off status bar for this demo
cpus = 4) # by default, this will be the max number of cpus available

tf_mat <- TermDocFreq(dtm = dtm)
dtm <- dtm[, tf_mat$term_freq > 10]
```

```{r}
tf_mat <- TermDocFreq(dtm = dtm)
```


```{r}
compute_tfidf <- function(dtm, tf_mat) {
tfidf <- t(dtm[ , tf_mat$term ]) * tf_mat$idf
t(tfidf)
}

tfidf <-compute_tfidf(dtm, tf_mat)


csim <- tfidf / sqrt(rowSums(tfidf * tfidf))
csim <- csim %*% t(csim)



cdist <- as.dist(1 - csim)


```

```{r}
hc <- hclust(cdist, "ward.D")

clustering <- cutree(hc, 10)

```
```{r}
p_words <- colSums(dtm) / sum(dtm)

cluster_words <- lapply(unique(clustering), function(x){
rows <- dtm[ clustering == x , ]
# for memory's sake, drop all words that don't appear in the cluster
rows <- rows[ , colSums(rows) > 0 ]
colSums(rows) / sum(rows) - p_words[ colnames(rows) ]
})


cluster_words <- lapply(1:10, function(i) {
dt <- data.frame(term = names(cluster_words[[i]]), prob = cluster_words[[i]], stringsAsFactors = F)
dt$cluster <- i
dt
})

cluster_words <- bind_rows(cluster_words)

bla <- cluster_words %>%
group_by(cluster) %>%
arrange(cluster, desc(prob)) %>%
filter(prob > 0) %>%
# top_n(5, prob) %>%
summarize(size = n(), top_words = paste0(term[1:5], collapse =','))

```

```{r}
cluster_summary <- data.frame(cluster = unique(clustering),
size = as.numeric(table(clustering)),
top_words = sapply(cluster_words, function(d){
paste(
names(d)[ order(d, decreasing = TRUE) ][ 1:5 ],
collapse = ", ")
}),
stringsAsFactors = FALSE)

cluster_summary
```

```{r}

i <- 3






library(ggwordcloud)

get_word_cloud <- function(cluster_words, i) {
cluster_words %>%
filter(cluster == i) %>%
arrange(desc(prob)) %>%
top_n(20, prob) %>%
mutate(angle = 90 * sample(c(0, 1), n(), replace = TRUE, prob = c(60, 40))) %>%
ggplot( aes(label=term, size = prob^2, angle = angle)) +
geom_text_wordcloud_area(eccentricity = 1, rm_outside = TRUE) +
scale_size_area(max_size = 12) +
theme_void()
}

plots = list()

for (i in 1:10) {
plots[[i]] = get_word_cloud(cluster_words, i)
}

library(patchwork)

plots[[1]] + plots[[2]] + plots[[3]] + plots[[4]] +
plots[[5]] + plots[[6]] + plots[[7]] + plots[[8]] +
plots[[9]] + plots[[10]] + plot_layout(ncol = 3)

```
```{r}

palette = rev(color_hex_palette_from_number("1294"))


cl1 <- cluster_words %>%
filter(cluster == 1) %>%
arrange(desc(prob))


wordcloud::wordcloud(words = cl1$term,
freq = cl1$prob,
max.words = 50,
random.order = F,
colors = palette,
main = "Top words in cluster 100",)

```


Loading…
Откажи
Сачувај