# Cours / Parcours-ML

# Setup ----
library(tidyverse)

# Example data: the built-in iris measurements.
data(iris)

# Baseline scatter plot of the full data set, coloured by species.
iris %>%
  ggplot() +
  aes(x = Petal.Length, y = Sepal.Length, color = Species) +
  geom_point()

# Training set: 120 rows sampled without replacement.
# Every row is tagged with an id before sampling so that the held-out rows
# can be recovered by id. An anti_join() on all measurement columns would
# silently drop any test row that is identical to a training row (iris
# contains duplicated observations), leaving fewer than 30 test rows.
set.seed(1000)
train <- iris %>%
  mutate(row_id = row_number()) %>%
  sample_n(120)

# Test set: exactly the rows that were not sampled into `train`.
test <- iris %>%
  mutate(row_id = row_number()) %>%
  anti_join(train, by = "row_id") %>%
  select(-row_id)

# The helper id is no longer needed; keep `train` with the original columns.
train <- train %>%
  select(-row_id)

# Scatter plot of the training set, stored in `g` so later sections can
# overlay their predictions on it.
g <- train %>%
  ggplot() +
  aes(x = Petal.Length, y = Sepal.Length, color = Species) +
  geom_point()

# kNN ----
library(class)

# Classify the held-out flowers with k-nearest neighbours (k = 10), using
# petal and sepal length as features and the training species as labels.
test$Species_knn <- knn(
  train = select(train, Petal.Length, Sepal.Length),
  test = select(test, Petal.Length, Sepal.Length),
  cl = train$Species,
  k = 10
)

# Overlay the test rows on the training plot: the large point is coloured by
# the kNN prediction, the small point drawn on top by the true species.
g +
  geom_point(
    mapping = aes(Petal.Length, Sepal.Length, color = Species_knn),
    data = test,
    size = 4
  ) +
  geom_point(
    mapping = aes(Petal.Length, Sepal.Length, color = Species),
    data = test,
    size = 2
  )

# Accuracy: share of test rows whose prediction equals the true species.
mean(test$Species == test$Species_knn)

# New, unlabelled observations: 10 points drawn from normal distributions
# with the mean and standard deviation of each iris measurement.
# rnorm() expects a standard deviation as its third argument, so sd() is
# used here; passing var() would give the simulated points the wrong spread.
test2 <- tibble(
  Petal.Length = rnorm(
    10,
    mean = mean(iris$Petal.Length),
    sd = sd(iris$Petal.Length)
  ),
  Sepal.Length = rnorm(
    10,
    mean = mean(iris$Sepal.Length),
    sd = sd(iris$Sepal.Length)
  )
)
## Special cases
# tibble(Petal.Length = rep(5, 10),
#        Sepal.Length = seq(5, 7, length.out = 10)) %>%
# bind_rows(test2) -> test2

# Classify the new points with kNN (k = 5), this time using the whole iris
# data set as the reference sample.
test2$Species <- knn(
  train = iris %>% select(Petal.Length, Sepal.Length),
  test = test2 %>% select(Petal.Length, Sepal.Length),
  cl = iris$Species,
  k = 5
)

# Overlay the new points, coloured by predicted species, on the training plot.
g +
  geom_point(
    data = test2,
    aes(x = Petal.Length, y = Sepal.Length, color = Species),
    size = 4
  )

# Decision trees ----
library(rpart)

# Fit a classification tree predicting species from petal and sepal length.
arbre <- rpart(Species ~ Petal.Length + Sepal.Length, data = train)

# Draw the tree skeleton, then add the node labels on top of it.
# plot() and text() are called directly: the tee pipe `%T>%` belongs to
# magrittr, which library(tidyverse) does not attach, so the piped form
# stops with "could not find function".
plot(arbre, branch = .5, margin = .5)
text(arbre, use.n = TRUE, all = TRUE, pretty = TRUE, fancy = TRUE)

# Predict the species class of each test row with the fitted tree.
test$Species_cart <- predict(
  arbre,
  newdata = test %>% select(Petal.Length, Sepal.Length),
  type = "class"
)

# Overlay the test rows on the training plot: the large point is coloured by
# the tree prediction, the small point drawn on top by the true species.
g +
  geom_point(
    data = test,
    aes(x = Petal.Length, y = Sepal.Length, color = Species_cart),
    size = 4
  ) +
  geom_point(
    data = test,
    aes(x = Petal.Length, y = Sepal.Length, color = Species),
    size = 2
  )

# Accuracy: share of test rows whose prediction equals the true species.
mean(test$Species == test$Species_cart)

# Logistic regression ----
# Data: the built-in mtcars data set; `am` is the outcome modelled below.
data(mtcars)

# Baseline scatter plot of the full data set, coloured by `am`
# (red for low values, blue for high values).
mtcars %>%
  ggplot() +
  aes(x = hp, y = mpg, color = am) +
  geom_point() +
  scale_color_gradient(low = "#FF0000", high = "#0000FF")

# Training set: 24 rows sampled without replacement.
# Rows are tagged with an id before sampling so the test set is defined by
# row identity. An anti_join() without `by` would match on every column
# (floating-point equality) and would drop any test row that happens to
# equal a training row.
set.seed(1000)
train <- mtcars %>%
  mutate(row_id = row_number()) %>%
  sample_n(24)

# Test set: exactly the rows that were not sampled into `train`.
test <- mtcars %>%
  mutate(row_id = row_number()) %>%
  anti_join(train, by = "row_id") %>%
  select(-row_id)

# The helper id is no longer needed; keep `train` with the original columns.
train <- train %>%
  select(-row_id)

# Scatter plot of the training set, stored in `g` so the predictions can be
# overlaid on it later.
g <- train %>%
  ggplot() +
  aes(x = hp, y = mpg, color = am) +
  geom_point() +
  scale_color_gradient(low = "#FF0000", high = "#0000FF")

# Model: logistic regression of `am` on horsepower and fuel consumption,
# fitted on the training rows.
logi <- glm(am ~ hp + mpg, data = train, family = "binomial")

# Print the coefficient table and draw the standard diagnostic plots.
summary(logi)
plot(logi)

# Prediction: fitted probabilities (response scale) for the test rows.
test$am_logi <- predict(
  logi,
  newdata = select(test, hp, mpg),
  type = "response"
)

# Overlay the test rows on the training plot: the large point is coloured by
# the predicted probability, the small point drawn on top by the observed `am`.
g +
  geom_point(
    mapping = aes(hp, mpg, color = am_logi),
    data = test,
    size = 4
  ) +
  geom_point(
    mapping = aes(hp, mpg, color = am),
    data = test,
    size = 2
  )

# Validation
library(pROC)

# Build the ROC curve of the predicted probabilities against the observed
# `am` values of the test set, keep it in `ROC`, then draw it.
ROC <- roc(am ~ am_logi, data = test)
plot(ROC)