You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

186 lines
3.0KB

  1. library(tidyverse)
  2. library(shiny)
  3. library(miniUI)
  4. library(broom)
  5. tibble(x = runif(20, 0, 10),
  6. y = x^2 - 2*x + rnorm(20, 0, 5)) -> df
  7. df %>%
  8. ggplot() +
  9. aes(x = x, y = y) +
  10. geom_point() -> g
  11. df %>%
  12. sample_n(10) -> train
  13. df %>%
  14. anti_join(train) -> test
  15. train %>%
  16. ggplot() +
  17. aes(x = x, y = y) +
  18. geom_point() -> gtrain
  19. gtrain +
  20. geom_point(data = test, aes(color = "red")) +
  21. theme(legend.position = "none") -> g
  22. ggsave(filename = "set.png", plot = g)
  23. powers <- function(df, n)
  24. {
  25. for (i in 1:n)
  26. df[[str_c("x", i)]] <- df$x^i
  27. df
  28. }
  29. LM <- function(df, n)
  30. {
  31. str_c("y ~ ", str_c("x", 1:n, collapse = " + ")) %>% as.formula -> form
  32. df <- powers(df, n)
  33. lm(form, data = df)
  34. }
  35. PREDICT <- function(train, test, n)
  36. {
  37. model <- LM(train, n)
  38. test <- powers(test, n)
  39. mean((test$y - predict(model, test))^2)
  40. }
  41. Fggfun <- function(model)
  42. {
  43. model %>%
  44. pull(estimate) -> est
  45. Vectorize(
  46. function(x)
  47. {
  48. sum(x^(0:(length(est) - 1)) * est)
  49. })
  50. }
  51. 1:9 %>% map(~LM(train, .) %>% tidy %>% Fggfun) -> ggfuns
  52. 1:9 %>%
  53. map(function(n)
  54. {
  55. LM(train, n) %>%
  56. tidy %>%
  57. Fggfun %>%
  58. stat_function(fun = .)
  59. }) -> regs
  60. 1:9 %>%
  61. map(function(n)
  62. {
  63. (gtrain + regs[n]) %>%
  64. ggsave(filename = str_c("train_", n, ".png"))
  65. })
  66. 1:9 %>%
  67. map(function(n)
  68. {
  69. (g + regs[n]) %>%
  70. ggsave(filename = str_c("all_", n, ".png"))
  71. })
  72. (
  73. 1:9 %>%
  74. map_dbl(~LM(train, .) %>% glance %>% pull(r.squared)) %>%
  75. tibble(x = 1:9, y = .) %>%
  76. ggplot() +
  77. aes(x = x, y = y) +
  78. geom_point() +
  79. geom_line() +
  80. scale_x_continuous(breaks = 0:10)
  81. ) %>%
  82. ggsave(filename = "rsquared.png", plot = .)
  83. (
  84. 1:9 %>%
  85. map_dbl(~PREDICT(train, test, .)) %>%
  86. tibble(x = 1:9, y = .) %>%
  87. ggplot() +
  88. aes(x = x, y = y) +
  89. geom_point() +
  90. geom_line() +
  91. scale_x_continuous(breaks = 0:10)
  92. ) %>%
  93. ggsave(filename = "error.png", plot = .)
  94. # Bootstrap ----
  95. x <- rnorm(30)
  96. hist(x)
  97. summary(x)
  98. t.test(x)
  99. bootstrap <- function(x, b)
  100. {
  101. 1:b %>%
  102. map_dbl(function(y)
  103. {
  104. sample(x, replace = T) %>%
  105. mean
  106. }) -> bootstraped
  107. hist(bootstraped)
  108. abline(v = mean(bootstraped))
  109. abline(v = quantile(bootstraped, probs = c(.025, .975)), lty = 2)
  110. list(mean = mean(bootstraped), IC = quantile(bootstraped, probs = c(.025,.975)))
  111. }
  112. bootstrap(x, 10000)
  113. # CV ----
  114. cvpart <- function(df, k)
  115. {
  116. n <- nrow(df)
  117. rep(1:k, length.out = n) %>%
  118. sample -> parts
  119. df %>%
  120. split(parts) -> tests
  121. tests %>%
  122. map(~anti_join(df, .)) -> trains
  123. transpose(list(train = trains, test = tests))
  124. }
  125. cvdegree <- function(df, k, N)
  126. {
  127. df %>%
  128. cvpart(k) -> parts
  129. 1:N %>%
  130. map_dbl(function(n)
  131. {
  132. parts %>%
  133. map_dbl(function(df)
  134. {
  135. PREDICT(df$train, df$test, n)
  136. }) %>%
  137. mean
  138. }) %>%
  139. which.min
  140. }
  141. tibble(x = runif(200, 0, 10),
  142. y = x^2 - 2*x + rnorm(200, 0, 5)) -> df
  143. df %>%
  144. ggplot() +
  145. aes(x = x, y = y) +
  146. geom_point() -> g