You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

61 lines
1.6KB

  1. library(tidyverse)
  2. library(RSQLite)
  3. library(magrittr)
  4. library(httr)
  5. library(rvest)
  6. library(stringr)
  7. annee <- 2020
  8. db <- dbConnect(SQLite(), "ecn.db")
  9. celine <- GET("http://www.cngsante.fr/chiron/celine/listing.html")
  10. celine %>%
  11. content %>%
  12. html_node("tr:first-child") %>%
  13. html_text ->
  14. timestamp
  15. timestamp %>%
  16. str_extract("\\d+h\\d+") %>%
  17. str_replace("h", ":") -> heure
  18. timestamp %>%
  19. str_replace(" \\d+h\\d+ ", "") %>%
  20. str_extract("\\w+") -> date
  21. date %>%
  22. str_replace("([a-z]+)(\\d+)([a-z]+)", "\\2") -> jour
  23. date %>%
  24. str_replace("([a-z]+)(\\d+)([a-z]+)", "\\3") -> mois
  25. case_when(mois == "jul" ~ "07",
  26. mois == "sep" ~ "09",
  27. mois %>% str_detect("^ao") ~ "08") -> mois
  28. timestamp <- str_c(annee, "-", mois, "-", jour, " ", heure)
  29. celine %>%
  30. str_replace_all("\n", "") %>%
  31. str_replace("(<tr>.*?</tr>){8}", "") %>%
  32. read_html %>%
  33. html_table(header = T) %>%
  34. .[[1]] ->
  35. listing
  36. listing %>%
  37. select(-SubDis) %>%
  38. filter(Etat != "déclassé") %>%
  39. mutate(Etudiant = ifelse(Etudiant %>% str_detect("\\d+ \\( (\\d+) \\)"),
  40. Etudiant %>% str_replace("\\d+ \\( (\\d+) \\)", "\\1"),
  41. Etudiant)
  42. %>% as.numeric,
  43. Discipline = Discipline %>% str_replace("Discipline .*? : ", ""),
  44. Subdivision = Subdivision %>% str_replace("CHU ((d')|(de ))?", ""),
  45. timestamp = timestamp) %>%
  46. mutate_if(is.character, factor) %>%
  47. dbWriteTable(conn = db, value = ., name = "trajectoires", overwrite = T, indexes = list("Etudiant", "Discipline", "Subdivision"))
  48. db %>% dbDisconnect