Cours
/
Parcours-ML


			
							library(tidyverse)
library(shiny)
library(miniUI)
library(broom)

tibble(x = runif(20, 0, 10),
       y = x^2 - 2*x + rnorm(20, 0, 5)) -> df

df %>%
  ggplot() +
  aes(x = x, y = y) +
  geom_point() -> g

df %>%
  sample_n(10) -> train

df %>%
  anti_join(train) -> test

train %>%
  ggplot() +
  aes(x = x, y = y) +
  geom_point() -> gtrain

gtrain +
  geom_point(data = test, aes(color = "red")) +
  theme(legend.position = "none") -> g

ggsave(filename = "set.png", plot = g)

powers <- function(df, n)
{
  for (i in 1:n)
    df[[str_c("x", i)]] <- df$x^i

  df
}

LM <- function(df, n)
{
  str_c("y ~ ", str_c("x", 1:n, collapse = " + ")) %>% as.formula -> form

  df <- powers(df, n)

  lm(form, data = df)
}

PREDICT <- function(train, test, n)
{
  model <- LM(train, n)

  test <- powers(test, n)

  mean((test$y - predict(model, test))^2)
}

Fggfun <- function(model)
{
  model %>%
    pull(estimate) -> est
  Vectorize(
            function(x)
            {
              sum(x^(0:(length(est) - 1)) * est)
            })
}

1:9 %>% map(~LM(train, .) %>% tidy %>% Fggfun) -> ggfuns

1:9 %>%
  map(function(n)
      {
        LM(train, n) %>%
          tidy %>%
          Fggfun %>%
          stat_function(fun = .)
      }) -> regs

1:9 %>%
  map(function(n)
      {
        (gtrain + regs[n]) %>%
          ggsave(filename = str_c("train_", n, ".png"))
      })

1:9 %>%
  map(function(n)
      {
        (g + regs[n]) %>%
          ggsave(filename = str_c("all_", n, ".png"))
      })

(
1:9 %>%
  map_dbl(~LM(train, .) %>% glance %>% pull(r.squared)) %>%
  tibble(x = 1:9, y = .) %>%
  ggplot() +
  aes(x = x, y = y) +
  geom_point() +
  geom_line() +
  scale_x_continuous(breaks = 0:10)
) %>%
ggsave(filename = "rsquared.png", plot = .)

(
1:9 %>%
  map_dbl(~PREDICT(train, test, .)) %>%
  tibble(x = 1:9, y = .) %>%
  ggplot() +
  aes(x = x, y = y) +
  geom_point() +
  geom_line() +
  scale_x_continuous(breaks = 0:10)
) %>%
ggsave(filename = "error.png", plot = .)


# Bootstrap ----

x <- rnorm(30)
hist(x)
summary(x)
t.test(x)

bootstrap <- function(x, b)
{
  1:b %>%
    map_dbl(function(y)
        {
          sample(x, replace = T) %>%
            mean
        }) -> bootstraped

  hist(bootstraped)
  abline(v = mean(bootstraped))
  abline(v = quantile(bootstraped, probs = c(.025, .975)), lty = 2)

  list(mean = mean(bootstraped), IC = quantile(bootstraped, probs = c(.025,.975)))
}

bootstrap(x, 10000)

# CV ----

cvpart <- function(df, k)
{
  n <- nrow(df)
  rep(1:k, length.out = n) %>%
    sample -> parts

  df %>%
    split(parts) -> tests

  tests %>%
    map(~anti_join(df, .)) -> trains

  transpose(list(train = trains, test = tests))
}

cvdegree <- function(df, k, N)
{
  df %>%
    cvpart(k) -> parts

  1:N %>%
    map_dbl(function(n)
        {
          parts %>%
            map_dbl(function(df)
                {
                  PREDICT(df$train, df$test, n)
                }) %>%
          mean
        }) %>%
  which.min
}

tibble(x = runif(200, 0, 10),
       y = x^2 - 2*x + rnorm(200, 0, 5)) -> df

df %>%
  ggplot() +
  aes(x = x, y = y) +
  geom_point() -> g