|
# Scrape the "celine" listing published for year `annee` and store it in the
# `trajectoires` table of the local SQLite database `ecn.db`.

library(tidyverse)
library(RSQLite)
library(magrittr)
library(httr)
library(rvest)
library(stringr)

annee <- 2017

# ---- Download ----

celine <- GET(
  str_c("http://cngsante.fr/chiron", annee, "/celine/listing.html")
)
# Fail early on an HTTP error rather than parsing an error page as data.
stop_for_status(celine)

# ---- Timestamp of the listing ----

# Text of the first table row; the date and time are parsed out of it below.
entete <- celine %>%
  content() %>%
  html_node("tr:first-child") %>%
  html_text()

# "14h05" -> "14:05"; zero-padded so that "9h05" becomes "09:05".
heure <- entete %>%
  str_extract("\\d+h\\d+") %>%
  str_replace("h", ":") %>%
  str_pad(width = 5, pad = "0")

# First word once the time has been removed; the patterns below expect the
# shape <letters><day><month letters>.
date <- entete %>%
  str_replace(" \\d+h\\d+ ", "") %>%
  str_extract("\\w+")

# Extract the pieces directly instead of replacing the whole match: a
# replace-based approach leaves any unmatched trailing characters (for
# instance the accented letters of "août") glued to the result.
jour <- date %>%
  str_extract("\\d+") %>%
  str_pad(width = 2, pad = "0")

mois_txt <- str_extract(date, "(?<=\\d)\\p{L}+")

mois <- case_when(
  mois_txt == "jul" ~ "07",
  str_detect(mois_txt, "^ao") ~ "08",
  mois_txt == "sep" ~ "09"
)

if (anyNA(mois)) {
  warning(
    "Unrecognised month in listing header: ", mois_txt,
    "; the stored timestamp will be NA.",
    call. = FALSE
  )
}

timestamp <- str_c(annee, "-", mois, "-", jour, " ", heure)

# ---- Listing table ----

# Work on the page body as text (not on the response object itself).
# NOTE(review): the text encoding is taken from the response headers; pass
# `encoding =` to content() explicitly if the server does not declare one.
listing <- celine %>%
  content(as = "text") %>%
  # `.` does not match newlines, so flatten the page before the next regex.
  str_replace_all("\n", "") %>%
  # Drop the first eight rows so the following row becomes the table header.
  str_replace("(<tr>.*?</tr>){8}", "") %>%
  read_html() %>%
  html_table(header = TRUE) %>%
  .[[1]]

trajectoires <- listing %>%
  select(-SubDis) %>%
  filter(Etat != "déclassé") %>%
  mutate(
    # "123 ( 45 )" -> 45; values without the parenthesised part are kept
    # unchanged by str_replace() and simply converted to numeric.
    Etudiant = Etudiant %>%
      str_replace("\\d+ \\( (\\d+) \\)", "\\1") %>%
      as.numeric(),
    Discipline = str_replace(Discipline, "Discipline .*? : ", ""),
    Subdivision = str_replace(Subdivision, "CHU ((d')|(de ))?", ""),
    timestamp = timestamp
  ) %>%
  mutate_if(is.character, factor)

# ---- Storage ----

# Open the connection only once there is data to write, and always close it,
# even when the write fails.
db <- dbConnect(SQLite(), "ecn.db")

tryCatch(
  {
    dbWriteTable(
      conn = db,
      name = "trajectoires",
      value = trajectoires,
      overwrite = TRUE
    )
    # dbWriteTable() has no `indexes` argument (it would be silently
    # swallowed by `...`), so the indexes are created explicitly.
    for (colonne in c("Etudiant", "Discipline", "Subdivision")) {
      DBI::dbExecute(
        db,
        str_c(
          "CREATE INDEX IF NOT EXISTS idx_trajectoires_", colonne,
          " ON trajectoires (", colonne, ")"
        )
      )
    }
  },
  finally = dbDisconnect(db)
)
|