|
|
@@ -0,0 +1,60 @@ |
|
|
|
library(tidyverse) |
|
|
|
library(RSQLite) |
|
|
|
library(magrittr) |
|
|
|
library(httr) |
|
|
|
library(rvest) |
|
|
|
library(stringr) |
|
|
|
|
|
|
|
# ---- Configuration and download ----------------------------------------

# Year interpolated into the listing URL below and reused as the year part
# of the timestamp stored with the data.
annee <- 2017

# Download the listing page for that year.
celine <- GET(str_c("http://cngsante.fr/chiron", annee, "/celine/listing.html"))

# GET() does not raise on HTTP errors (4xx/5xx); stop here instead of
# parsing an error page further down.
stop_for_status(celine)

# Open the SQLite database only once the download has succeeded, so a failed
# request does not leave an open connection behind.
db <- dbConnect(SQLite(), "ecn.db")
|
|
|
|
|
|
|
# ---- Timestamp of the listing ------------------------------------------

# Text of the first node matching "tr:first-child" in the parsed page.
# NOTE(review): the regexes below expect this text to contain a time such as
# "14h30" surrounded by spaces and a compact date token of the form
# <letters><digits><letters> -- confirm against the live page.
raw_stamp <- celine %>%
  content() %>%
  html_node("tr:first-child") %>%
  html_text()

# Time of day: "<digits>h<digits>" becomes "<digits>:<digits>".
heure <- raw_stamp %>%
  str_extract("\\d+h\\d+") %>%
  str_replace("h", ":")

# Date token: drop the time, then keep the first run of word characters.
date <- raw_stamp %>%
  str_replace(" \\d+h\\d+ ", "") %>%
  str_extract("\\w+")

# Extract the capture groups with str_match() rather than str_replace():
# str_replace() leaves every character outside the match in place, so a
# token ending in a letter that [a-z] does not cover (e.g. an accented one)
# used to leak into `jour`. Columns: 1 = full match, 2 = leading letters,
# 3 = digits, 4 = trailing letters.
date_parts <- str_match(date, "([a-z]+)(\\d+)([a-z]+)")
jour <- date_parts[, 3]
mois <- date_parts[, 4]

# Map the month abbreviation to its two-digit number. Only the three months
# below are recognised; anything else gives NA. The "^ao" prefix test avoids
# depending on the characters that follow "ao".
mois <- case_when(
  mois == "jul" ~ "07",
  mois == "sep" ~ "09",
  mois %>% str_detect("^ao") ~ "08"
)

# str_c() propagates NA, so a single unparsed component would otherwise
# produce an NA timestamp without any notice.
if (anyNA(c(heure, jour, mois))) {
  warning(
    "Could not fully parse the listing timestamp from: ", raw_stamp,
    call. = FALSE
  )
}

# Final value has the form "<annee>-<mois>-<jour> <heure>".
timestamp <- str_c(annee, "-", mois, "-", jour, " ", heure)
|
|
|
|
|
|
|
# ---- Listing table -----------------------------------------------------

# `celine` is the httr response object, not the page source, so the body has
# to be extracted as text before any string manipulation can apply to it.
# NOTE(review): no encoding is forced here; httr takes it from the response
# headers and otherwise falls back to UTF-8 -- confirm that accented values
# come through intact.
page_html <- celine %>%
  content(as = "text") %>%
  # Put the whole document on one line so that ".*?" can span what used to
  # be several lines.
  str_replace_all("\n", "") %>%
  # Remove the first run of eight consecutive <tr>...</tr> elements
  # (presumably rows preceding the real column headers -- confirm).
  str_replace("(<tr>.*?</tr>){8}", "")

# Parse the trimmed HTML and keep the first table, using its first row as
# column names.
listing <- page_html %>%
  read_html() %>%
  html_table(header = TRUE) %>%
  .[[1]]
|
|
|
|
|
|
|
# ---- Clean the listing and store it ------------------------------------

trajectoires <- listing %>%
  select(-SubDis) %>%
  filter(Etat != "déclassé") %>%
  mutate(
    # Values shaped like "<digits> ( <digits> )" are reduced to the number
    # inside the parentheses; str_replace() returns non-matching values
    # unchanged, so no explicit ifelse()/str_detect() guard is needed.
    Etudiant = Etudiant %>%
      str_replace("\\d+ \\( (\\d+) \\)", "\\1") %>%
      as.numeric(),
    # Strip the "Discipline ... : " prefix.
    Discipline = Discipline %>% str_replace("Discipline .*? : ", ""),
    # Strip a leading "CHU", "CHU d'" or "CHU de ".
    Subdivision = Subdivision %>% str_replace("CHU ((d')|(de ))?", ""),
    # Tag every row with the timestamp computed earlier in the script.
    timestamp = timestamp
  ) %>%
  mutate_if(is.character, factor)

# Replace the "trajectoires" table with the cleaned listing. The connection
# is closed in `finally`, so it is released even when the write fails.
# NOTE(review): `indexes` is not among the documented arguments of RSQLite's
# dbWriteTable() (it is an argument of dplyr::copy_to()); presumably it is
# swallowed by `...` and no index is created -- confirm.
tryCatch(
  dbWriteTable(
    conn = db,
    name = "trajectoires",
    value = trajectoires,
    overwrite = TRUE,
    indexes = list("Etudiant", "Discipline", "Subdivision")
  ),
  finally = dbDisconnect(db)
)