You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

284 lines
9.5KB

  1. #' Generate one statistic for all variables
  2. #'
  3. #' @param stat The statistic to use
  4. #' @param data The dataframe to apply the statistic to
  5. #' @return A vector for one statistic column
  6. statColumn <- function(stat, data)
  7. {
  8. data %>%
  9. lapply(statify, stat) %>%
  10. unlist()
  11. }
  12. #' Generate the table of all statistics for all variables
  13. #'
  14. #' @param data The dataframe to apply the statistic to
  15. #' @param stats A list of named statistics to use
  16. #' @return A dataframe of all statistics for all variables
  17. statTable <- function(data, stats)
  18. {
  19. # Call the stats arg_function passed, or use the provided list as-is
  20. if (is.function(stats))
  21. stats = stats(data)
  22. stats %>%
  23. lapply(statColumn, data) %>%
  24. data.frame(check.names = F, row.names = NULL, stringsAsFactors = F)
  25. }
  26. #' Generate the variable column to display as row names
  27. #'
  28. #' Generates the variable column.
  29. #' Replaces the variable names by their label if given in the named character vector labels, and inserts levels for factors.
  30. #'
  31. #' labels is an option named character vector used to make the table prettier.
  32. #' If given, the variable names for which there is a label will be replaced by their corresponding label.
  33. #' Not all variables need to have a label, and labels for non-existing variables are ignored.
  34. #'
  35. #' @param data The dataframe to get the names from
  36. #' @param labels The optional named character vector containing the keypairs var = "Label"
  37. #' @return A dataframe with one variable named "Variables", a character vector of variable names/labels and levels
  38. varColumn <- function(data, labels = NULL)
  39. {
  40. # Replace variable names by their labels, if they exist
  41. names(data) -> base_names
  42. base_names[base_names %in% names(labels)] <- labels[base_names[base_names %in% names(labels)]]
  43. # Insert levels for factors after the variable name
  44. if (any(data %>% lapply(is.factor) %>% unlist()))
  45. {
  46. data %>%
  47. lapply(is.factor) %>%
  48. unlist() %>%
  49. which() -> factors_idx
  50. base_names[factors_idx] <- paste0("**", base_names[factors_idx], "**")
  51. factor_levels <-
  52. factors_idx %>%
  53. lapply(function(x)
  54. {
  55. paste0(base_names[x],
  56. ": ",
  57. "*",
  58. levels(data[[x]]),
  59. "*")
  60. })
  61. insert(x = base_names,
  62. y = factor_levels,
  63. position = factors_idx) -> base_names
  64. }
  65. data.frame(Variables = base_names, check.names = F, row.names = NULL, stringsAsFactors = F)
  66. }
  67. #' Generate a statistics table
  68. #'
  69. #' Generate a statistics table with the chosen statistical functions, and tests if given a \code{"grouped"} dataframe.
  70. #'
  71. #' @section Labels:
  72. #' labels is an option named character vector used to make the table prettier.
  73. #'
  74. #' If given, the variable names for which there is a label will be replaced by their corresponding label.
  75. #'
  76. #' Not all variables need to have a label, and labels for non-existing variables are ignored.
  77. #'
  78. #' labels must be given in the form c(unquoted_variable_name = "label")
  79. #'
  80. #' @section Stats:
  81. #' The stats can be a function which takes a dataframe and returns a list of statistical functions to use.
  82. #'
  83. #' stats can also be a named list of statistical functions, or formulas.
  84. #'
  85. #' The names will be used as column names in the resulting table. If an element of the list is a function, it will be used as-is for the stats. If an element of the list is a formula, it can be used to conditionally use stats depending on the variable.
  86. #'
  87. #' The general form is \code{condition ~ T | F}, and can be nested, such as \code{is.factor ~ percent | (is.normal ~ mean | median)}, for example.
  88. #'
  89. #' @section Tests:
  90. #' The tests can be a function which takes a variable and a grouping variable, and returns an appropriate statistical test to use in that case.
  91. #'
  92. #' tests can also be a named list of statistical test functions, associating the name of a variable in the data, and a test to use specifically for that variable.
  93. #'
  94. #' That test name must be expressed as a single-term formula (e.g. \code{~t.test}). You don't have to specify tests for all the variables: a default test for all other variables can be defined with the name \code{.default}, and an automatic test can be defined with the name \code{.auto}.
  95. #'
  96. #' If data is a grouped dataframe (using \code{group_by}), subtables are created and statistic tests are performed over each sub-group.
  97. #'
  98. #' @section Output:
  99. #' The output is a desctable object, which is a list of named dataframes that can be further manipulated. Methods for printing, using in \pkg{pander} and \pkg{DT} are present. Printing reduces the object to a dataframe.
  100. #'
  101. #' @param data The dataframe to analyze
  102. #' @param stats A list of named statistics to apply to each element of the dataframe, or a function returning a list of named statistics
  103. #' @param tests A list of statistical tests to use when calling desctable with a grouped_df
  104. #' @param labels A named character vector of labels to use instead of variable names
  105. #' @return A desctable object, which prints to a table of statistics for all variables
  106. #' @seealso \code{\link{stats_auto}}
  107. #' @seealso \code{\link{tests_auto}}
  108. #' @seealso \code{\link{print.desctable}}
  109. #' @seealso \code{\link{pander.desctable}}
  110. #' @seealso \code{\link{datatable.desctable}}
  111. #' @export
  112. #' @examples
  113. #' iris %>%
  114. #' desctable()
  115. #'
  116. #' # Does the same as stats_auto here
  117. #' iris %>%
  118. #' desctable(stats = list("N" = length,
  119. #' "%/Mean" = is.factor ~ percent | (is.normal ~ mean),
  120. #' "sd" = is.normal ~ sd,
  121. #' "Med" = is.normal ~ NA | median,
  122. #' "IQR" = is.normal ~ NA | IQR))
  123. #'
  124. #' # With labels
  125. #' mtcars %>% desctable(labels = c(hp = "Horse Power",
  126. #' cyl = "Cylinders",
  127. #' mpg = "Miles per gallon"))
  128. #'
  129. #' # With grouping on a factor
  130. #' iris %>%
  131. #' group_by(Species) %>%
  132. #' desctable(stats = stats_default)
  133. #'
  134. #' # With nested grouping, on arbitrary variables
  135. #' mtcars %>%
  136. #' group_by(vs, cyl) %>%
  137. #' desctable()
  138. #'
  139. #' # With grouping on a condition, and choice of tests
  140. #' iris %>%
  141. #' group_by(Petal.Length > 5) %>%
  142. #' desctable(tests = list(.auto = tests_auto, Species = ~chisq.test))
  143. desctable <- function(data, stats, tests, labels)
  144. {
  145. UseMethod("desctable", data)
  146. }
  147. #' @rdname desctable
  148. #' @export
  149. desctable.default <- function(data, stats = stats_auto, tests, labels = NULL)
  150. {
  151. # Build the complete table
  152. list(Variables = varColumn(data, labels),
  153. stats = statTable(data, stats)) %>%
  154. `class<-`("desctable")
  155. }
  156. #' @rdname desctable
  157. #' @export
  158. desctable.grouped_df <- function(data, stats = stats_auto, tests = tests_auto, labels = NULL)
  159. {
  160. # Get groups then ungroup dataframe
  161. grps <- data %>% dplyr::groups()
  162. data <- dplyr::ungroup(data)
  163. # Build the complete table recursively, assign "desctable" class
  164. c(Variables = list(varColumn(data[!names(data) %in% (grps %>% lapply(as.character) %>% unlist())], labels)),
  165. subTable(data, stats, tests, grps)) %>%
  166. `class<-`("desctable")
  167. }
  168. #' Create the subtables names
  169. #'
  170. #' Create the subtables names, as
  171. #' factor: level (n=sub-group length)
  172. #'
  173. #' @param grp Grouping factor
  174. #' @param df Dataframe containing the grouping factor
  175. #' @return A character vector with the names for the subtables
  176. subNames <- function(grp, df)
  177. {
  178. paste0(as.character(grp),
  179. ": ",
  180. eval(grp, df) %>% factor() %>% levels(),
  181. " (n=",
  182. summary(eval(grp, df) %>% factor() %>% stats::na.omit(), maxsum = Inf),
  183. ")")
  184. }
  185. #' Create the pvalues column
  186. #'
  187. #' @param df Dataframe to use for the tests
  188. #' @param tests Test function or list of functions
  189. #' @param grp Grouping factor
  190. #' @return A numeric vector of pvalues
  191. testColumn <- function(df, tests, grp)
  192. {
  193. group <- eval(grp, df)
  194. df <- df %>%
  195. dplyr::select(-!!(grp))
  196. if (is.function(tests))
  197. {
  198. ftests <- df %>%
  199. lapply(tests, group %>% factor())
  200. tests <- ftests
  201. } else if (!is.null(tests$.auto))
  202. {
  203. ftests <- df %>%
  204. lapply(tests$.auto, group %>% factor)
  205. } else if (!is.null(tests$.default))
  206. {
  207. ftests <- df %>%
  208. lapply(function(x){tests$.default})
  209. } else
  210. {
  211. ftests <- df %>%
  212. lapply(function(x){stats::kruskal.test})
  213. }
  214. names(tests) %>% setdiff(".auto") %>% intersect(names(df)) -> forced_tests
  215. ftests[names(ftests) %in% forced_tests][forced_tests] <- tests[forced_tests]
  216. df %>%
  217. purrr::map2(ftests, testify, group) %>%
  218. dplyr::bind_rows()
  219. }
  220. #' Create a subtable in a grouped desctable
  221. #'
  222. #' @param df Dataframe to use
  223. #' @param stats Stats list/function to use
  224. #' @param tests Tests list/function to use
  225. #' @param grps List of symbols for grouping factors
  226. #' @return A nested list of statTables and testColumns
  227. subTable <- function(df, stats, tests, grps)
  228. {
  229. # Final group, make tests
  230. if (length(grps) == 1)
  231. {
  232. group <- eval(grps[[1]], df) %>% factor()
  233. # Create the subtable stats
  234. df %>%
  235. dplyr::select(-!!(grps[[1]])) %>%
  236. by(group, statTable, stats) %>%
  237. # Name the subtables with info about group and group size
  238. stats::setNames(subNames(grps[[1]], df)) -> stats
  239. # Create the subtable tests
  240. testColumn(df, tests, grps[[1]]) -> pvalues
  241. c(stats, tests = list(pvalues))
  242. }
  243. else
  244. {
  245. group <- eval(grps[[1]], df)
  246. # Go through the next grouping levels and build the subtables
  247. df %>%
  248. dplyr::select(-!!(grps[[1]])) %>%
  249. by(group, subTable, stats, tests, grps[-1]) %>%
  250. # Name the subtables with info about group and group size
  251. stats::setNames(subNames(grps[[1]], df))
  252. }
  253. }