# init_db.R
#
# Download the "celine" listing page for year `annee`, parse the snapshot
# time from the first table row of the page, clean the listing table and
# store it in the `trajectoires` table of the SQLite database `ecn.db`
# (the table is overwritten on every run).

library(tidyverse)
library(RSQLite)
library(magrittr)
library(httr)
library(rvest)
library(stringr)

annee <- 2017

# Download ----

celine <- GET(str_c("http://cngsante.fr/chiron", annee, "/celine/listing.html"))

# Fail early on an HTTP error instead of parsing an error page as data.
stop_for_status(celine)

# Snapshot timestamp ----

# Text of the first table row of the page.
timestamp <- celine %>%
  content() %>%
  html_node("tr:first-child") %>%
  html_text()

# Time of day: "<digits>h<digits>" rewritten as "<digits>:<digits>".
heure <- timestamp %>%
  str_extract("\\d+h\\d+") %>%
  str_replace("h", ":")

# First word left once the time of day has been removed.
# NOTE(review): presumably of the form <weekday><day><month>, e.g. "lun04sep"
# -- confirm against the live page.
date <- timestamp %>%
  str_replace(" \\d+h\\d+ ", "") %>%
  str_extract("\\w+")

# Day of month: the first run of digits. Extracting (instead of replacing an
# `[a-z]+\\d+[a-z]+` match) avoids keeping stray non-ASCII letters of the
# month name next to the digits.
jour <- str_extract(date, "\\d+")

# Month abbreviation, mapped to its two-digit number. Only the three months
# below are handled; the last one is matched on its "ao" prefix.
mois <- str_replace(date, "([a-z]+)(\\d+)([a-z]+)", "\\3")
mois <- case_when(
  mois == "jul" ~ "07",
  mois == "sep" ~ "09",
  str_detect(mois, "^ao") ~ "08"
)
if (anyNA(mois)) {
  warning(
    "Could not map the month found in the page header; ",
    "the stored timestamp will be NA.",
    call. = FALSE
  )
}

timestamp <- str_c(annee, "-", mois, "-", jour, " ", heure)

# Listing table ----

# The body is extracted as text first: the string helpers below need the HTML
# source, not the httr response object.
# NOTE(review): "(.*?){8}" only ever matches the empty string, so this
# replacement is a no-op as written. The pattern looks mangled (HTML tags may
# have been stripped from it) -- confirm the intended pattern.
listing <- celine %>%
  content(as = "text") %>%
  str_replace_all("\n", "") %>%
  str_replace("(.*?){8}", "") %>%
  read_html() %>%
  html_table(header = TRUE) %>%
  .[[1]]

# Cleaning ----

trajectoires <- listing %>%
  select(-SubDis) %>%
  filter(Etat != "déclassé") %>%
  mutate(
    # Values of the form "<n> ( <m> )" are reduced to <m>; str_replace()
    # leaves every other value untouched.
    Etudiant = Etudiant %>%
      str_replace("\\d+ \\( (\\d+) \\)", "\\1") %>%
      as.numeric(),
    Discipline = Discipline %>% str_replace("Discipline .*? : ", ""),
    Subdivision = Subdivision %>% str_replace("CHU ((d')|(de ))?", ""),
    timestamp = timestamp
  ) %>%
  mutate_if(is.character, factor)

# Storage ----

# The connection is opened only once the data is ready and is always closed,
# even if writing fails.
db <- dbConnect(SQLite(), "ecn.db")

tryCatch(
  {
    dbWriteTable(
      conn = db,
      name = "trajectoires",
      value = trajectoires,
      overwrite = TRUE
    )

    # RSQLite's dbWriteTable() has no `indexes` argument, so the indexes are
    # created explicitly. Overwriting the table drops any previous ones.
    for (colonne in c("Etudiant", "Discipline", "Subdivision")) {
      DBI::dbExecute(
        db,
        str_c(
          "CREATE INDEX IF NOT EXISTS \"idx_trajectoires_", colonne, "\" ",
          "ON \"trajectoires\" (\"", colonne, "\")"
        )
      )
    }
  },
  finally = dbDisconnect(db)
)