# Scrape the CELINE listing page, clean the table it contains, and store the
# result in the local SQLite database "ecn.db" as table "trajectoires"
# (the table is overwritten on every run).

library(tidyverse)
library(RSQLite)
library(magrittr)
library(httr)
library(rvest)
library(stringr)

# Year used as the first component of the snapshot timestamp built below.
annee <- 2020

# Fetch ----------------------------------------------------------------------

celine <- GET("http://www.cngsante.fr/chiron/celine/listing.html")
# Fail early on an HTTP error instead of parsing an error page.
stop_for_status(celine)

# Timestamp ------------------------------------------------------------------

# Text of the first matching table row; the time and date are parsed from it.
timestamp <- celine %>%
  content() %>%
  html_node("tr:first-child") %>%
  html_text()

# "14h30" -> "14:30"
heure <- timestamp %>%
  str_extract("\\d+h\\d+") %>%
  str_replace("h", ":")

# Drop the time, then keep the first run of word characters.
# NOTE(review): presumably a token of the form <weekday><day><month>
# (letters, digits, letters) -- confirm against the live page.
date <- timestamp %>%
  str_replace(" \\d+h\\d+ ", "") %>%
  str_extract("\\w+")

# Take the digits directly. The previous back-reference replacement left any
# unmatched tail in place: `[a-z]` does not match "û", so a month such as
# "août" produced a day like "31ût".
jour <- str_extract(date, "\\d+")

# Trailing letters of the token; a non-ASCII tail is kept, which is why the
# month mapping below tests "^ao" rather than an exact string.
mois <- str_replace(date, "([a-z]+)(\\d+)([a-z]+)", "\\3")
mois <- case_when(
  mois == "jul" ~ "07",
  mois == "sep" ~ "09",
  str_detect(mois, "^ao") ~ "08"
)
if (is.na(mois)) {
  # Only three months are mapped; anything else yields an NA timestamp.
  warning("Unrecognised month in page timestamp; timestamp will be NA.",
          call. = FALSE)
}

timestamp <- str_c(annee, "-", mois, "-", jour, " ", heure)

# Listing table --------------------------------------------------------------

# Work on the page body as text. The response object itself is a list, so
# passing it straight to stringr does not operate on the HTML.
# NOTE(review): encoding is taken from the response headers (httr falls back
# to UTF-8) -- confirm this matches the page.
# NOTE(review): the pattern "(.*?){8}" only ever matches the empty string, so
# this replacement is a no-op; it looks like the original pattern was damaged
# (e.g. markup stripped out of it). Kept as-is until the intent is confirmed.
listing <- celine %>%
  content(as = "text") %>%
  str_replace_all("\n", "") %>%
  str_replace("(.*?){8}", "") %>%
  read_html() %>%
  html_table(header = TRUE) %>%
  .[[1]]

trajectoires <- listing %>%
  select(-SubDis) %>%
  filter(Etat != "déclassé") %>%
  mutate(
    # "12 ( 34 )" -> 34; values that do not match are converted unchanged
    # (str_replace() leaves non-matching strings as they are).
    Etudiant = Etudiant %>%
      str_replace("\\d+ \\( (\\d+) \\)", "\\1") %>%
      as.numeric(),
    Discipline = str_replace(Discipline, "Discipline .*? : ", ""),
    Subdivision = str_replace(Subdivision, "CHU ((d')|(de ))?", ""),
    # Stamp every row with the snapshot time parsed above.
    timestamp = timestamp
  ) %>%
  mutate_if(is.character, factor)

# Persist --------------------------------------------------------------------

# Open the connection only once the data is ready, and always close it, even
# if the write fails.
db <- dbConnect(SQLite(), "ecn.db")
tryCatch(
  dbWriteTable(
    conn = db,
    name = "trajectoires",
    value = trajectoires,
    overwrite = TRUE,
    # NOTE(review): `indexes` is not a documented dbWriteTable() argument in
    # RSQLite; verify that these indexes are actually created.
    indexes = list("Etudiant", "Discipline", "Subdivision")
  ),
  finally = dbDisconnect(db)
)