You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

286 lines
11KB

  1. #' Generate one statistic for all variables
  2. #'
  3. #' Use one stat function (made safe using statify) on all the data
  4. #' to produce a single statistics column.
  5. #'
  6. #' The result is either a numeric vector, or a character vector if
  7. #' the content of the column is not made entirely of numbers.
  8. #'
  9. #' @param stat The statistic to use
  10. #' @param data The dataframe to apply the statistic to
  11. #' @return A vector for one statistic column
  12. statColumn <- function(stat, data) {
  13. # Apply one statified stat function to every variable in the data
  14. # Return a simple vector for the column
  15. # Statify checks types and output for the stat function. Returns a numeric vector or a character vector if needed.
  16. data %>%
  17. lapply(statify, stat) %>%
  18. unlist()
  19. }
  20. #' Generate the table of all statistics for all variables
  21. #'
  22. #' If stats is a list of functions, use them.
  23. #' If it is a single function, use it with the entire data as
  24. #' its argument to produce a list of statistical functions to use.
  25. #'
  26. #' @param data The dataframe to apply the statistic to
  27. #' @param stats A list of named statistics to use
  28. #' @return A dataframe of all statistics for all variables
  29. statTable <- function(data, stats) {
  30. # If stats is a function, apply it to the data to obtain a list of stat functions
  31. # Else use the function list as-is
  32. if (is.function(stats)) stats = stats(data)
  33. # Compute a statColumn for every stat function in stats
  34. # Assemble the result in a dataframe
  35. stats %>%
  36. lapply(statColumn, data) %>%
  37. data.frame(check.names = F,
  38. row.names = NULL,
  39. stringsAsFactors = F)
  40. }
  41. #' Generate the variable column to display as row names
  42. #'
  43. #' Generates the variable column.
  44. #' Replaces the variable names by their label if given in the named character vector labels, and inserts levels for factors.
  45. #'
  46. #' labels is an option named character vector used to make the table prettier.
  47. #' If given, the variable names for which there is a label will be replaced by their corresponding label.
  48. #' Not all variables need to have a label, and labels for non-existing variables are ignored.
  49. #'
  50. #' @param data The dataframe to get the names from
  51. #' @param labels The optional named character vector containing the keypairs var = "Label"
  52. #' @return A dataframe with one variable named "Variables", a character vector of variable names/labels and levels
  53. varColumn <- function(data, labels = NULL) {
  54. # Every variable name that exists in the labels is to be replaced with its corresponding label
  55. # Labels for non-existing variables are ignored
  56. # Variables with no label are not replaced and used as-is
  57. base_names <- names(data)
  58. base_names[base_names %in% names(labels)] <- labels[base_names[base_names %in% names(labels)]]
  59. # Check if there are factors
  60. data %>%
  61. lapply(is.factor) %>%
  62. unlist() -> factors
  63. # Insert levels for factors after the variable name
  64. if (any(factors)) {
  65. factors_idx <- which(factors)
  66. # Factor names in **bold**
  67. base_names[factors_idx] <- paste0("**", base_names[factors_idx], "**")
  68. # Factor levels in *italic*
  69. factor_levels <- lapply(factors_idx, function(x) paste0(base_names[x], ": ", "*", levels(data[[x]]), "*"))
  70. # Insert the factor levels after each factor name
  71. base_names <- insert(x = base_names,
  72. y = factor_levels,
  73. position = factors_idx)
  74. }
  75. data.frame(Variables = base_names,
  76. check.names = F,
  77. row.names = NULL,
  78. stringsAsFactors = F)
  79. }
  80. #' Generate a statistics table
  81. #'
  82. #' Generate a statistics table with the chosen statistical functions, and tests if given a \code{"grouped"} dataframe.
  83. #'
  84. #' @section Labels:
  85. #' labels is an option named character vector used to make the table prettier.
  86. #'
  87. #' If given, the variable names for which there is a label will be replaced by their corresponding label.
  88. #'
  89. #' Not all variables need to have a label, and labels for non-existing variables are ignored.
  90. #'
  91. #' labels must be given in the form c(unquoted_variable_name = "label")
  92. #'
  93. #' @section Stats:
  94. #' The stats can be a function which takes a dataframe and returns a list of statistical functions to use.
  95. #'
  96. #' stats can also be a named list of statistical functions, or formulas.
  97. #'
  98. #' The names will be used as column names in the resulting table. If an element of the list is a function, it will be used as-is for the stats. If an element of the list is a formula, it can be used to conditionally use stats depending on the variable.
  99. #'
  100. #' The general form is \code{condition ~ T | F}, and can be nested, such as \code{is.factor ~ percent | (is.normal ~ mean | median)}, for example.
  101. #'
  102. #' @section Tests:
  103. #' The tests can be a function which takes a variable and a grouping variable, and returns an appropriate statistical test to use in that case.
  104. #'
  105. #' tests can also be a named list of statistical test functions, associating the name of a variable in the data, and a test to use specifically for that variable.
  106. #'
  107. #' That test name must be expressed as a single-term formula (e.g. \code{~t.test}). You don't have to specify tests for all the variables: a default test for all other variables can be defined with the name \code{.default}, and an automatic test can be defined with the name \code{.auto}.
  108. #'
  109. #' If data is a grouped dataframe (using \code{group_by}), subtables are created and statistic tests are performed over each sub-group.
  110. #'
  111. #' @section Output:
  112. #' The output is a desctable object, which is a list of named dataframes that can be further manipulated. Methods for printing, using in \pkg{pander} and \pkg{DT} are present. Printing reduces the object to a dataframe.
  113. #'
  114. #' @param data The dataframe to analyze
  115. #' @param stats A list of named statistics to apply to each element of the dataframe, or a function returning a list of named statistics
  116. #' @param tests A list of statistical tests to use when calling desctable with a grouped_df
  117. #' @param labels A named character vector of labels to use instead of variable names
  118. #' @return A desctable object, which prints to a table of statistics for all variables
  119. #' @seealso \code{\link{stats_auto}}
  120. #' @seealso \code{\link{tests_auto}}
  121. #' @seealso \code{\link{print.desctable}}
  122. #' @seealso \code{\link{pander.desctable}}
  123. #' @seealso \code{\link{datatable.desctable}}
  124. #' @export
  125. #' @examples
  126. #' iris %>%
  127. #' desctable()
  128. #'
  129. #' # Does the same as stats_auto here
  130. #' iris %>%
  131. #' desctable(stats = list("N" = length,
  132. #' "%/Mean" = is.factor ~ percent | (is.normal ~ mean),
  133. #' "sd" = is.normal ~ sd,
  134. #' "Med" = is.normal ~ NA | median,
  135. #' "IQR" = is.normal ~ NA | IQR))
  136. #'
  137. #' # With labels
  138. #' mtcars %>% desctable(labels = c(hp = "Horse Power",
  139. #' cyl = "Cylinders",
  140. #' mpg = "Miles per gallon"))
  141. #'
  142. #' # With grouping on a factor
  143. #' iris %>%
  144. #' group_by(Species) %>%
  145. #' desctable(stats = stats_default)
  146. #'
  147. #' # With nested grouping, on arbitrary variables
  148. #' mtcars %>%
  149. #' group_by(vs, cyl) %>%
  150. #' desctable()
  151. #'
  152. #' # With grouping on a condition, and choice of tests
  153. #' iris %>%
  154. #' group_by(Petal.Length > 5) %>%
  155. #' desctable(tests = list(.auto = tests_auto, Species = ~chisq.test))
  156. desctable <- function(data, stats, tests, labels) {
  157. UseMethod("desctable", data)
  158. }
  159. #' @rdname desctable
  160. #' @export
  161. desctable.default <- function(data, stats = stats_auto, tests, labels = NULL) {
  162. # Assemble the Variables and the statTable in a single desctable object
  163. list(Variables = varColumn(data, labels),
  164. stats = statTable(data, stats)) %>%
  165. set_desctable_class()
  166. }
  167. #' @rdname desctable
  168. #' @export
  169. desctable.grouped_df <- function(data, stats = stats_auto, tests = tests_auto, labels = NULL) {
  170. # Get groups then ungroup dataframe
  171. grps <- dplyr::groups(data)
  172. data <- dplyr::ungroup(data)
  173. # Assemble the Variables (excluding the grouping ones) and the subTables recursively in a single desctable object
  174. c(Variables = list(varColumn(data[!names(data) %in% (grps %>% lapply(as.character) %>% unlist())], labels)),
  175. subTable(data, stats, tests, grps)) %>%
  176. set_desctable_class()
  177. }
  178. #' Create the subtables names
  179. #'
  180. #' Create the subtables names, as
  181. #' factor: level (n=sub-group length)
  182. #'
  183. #' @param grp Grouping factor
  184. #' @param df Dataframe containing the grouping factor
  185. #' @return A character vector with the names for the subtables
  186. subNames <- function(grp, df) {
  187. paste0(as.character(grp),
  188. ": ",
  189. eval(grp, df) %>% factor() %>% levels(),
  190. " (n=",
  191. summary(eval(grp, df) %>% factor() %>% stats::na.omit(), maxsum = Inf),
  192. ")")
  193. }
  194. #' Create the pvalues column
  195. #'
  196. #' @param df Dataframe to use for the tests
  197. #' @param tests Test function or list of functions
  198. #' @param grp Grouping factor
  199. #' @return A numeric vector of pvalues
  200. testColumn <- function(df, tests, grp) {
  201. group <- eval(grp, df)
  202. df <- df[!names(df) %in% as.character(grp)]
  203. # If tests is a function, apply it to the data and the grouping factor to produce a list of tests
  204. # If there is an .auto element in the list of tests, apply the function as previously to select the relevant test
  205. # If there is a .default element, use it as tests
  206. # Else fall back on kruskal.test
  207. if (is.function(tests)) {
  208. ftests <- lapply(df, tests, factor(group))
  209. tests <- ftests
  210. } else if (!is.null(tests$.auto)) ftests <- lapply(df, tests$.auto, factor(group))
  211. else if (!is.null(tests$.default)) ftests <- lapply(df, function(x){tests$.default})
  212. else ftests <- lapply(df, function(x){stats::kruskal.test})
  213. # Select the forced (named) tests
  214. tests %>%
  215. names() %>%
  216. setdiff(".auto") %>%
  217. intersect(names(df)) -> forced_tests
  218. # Assemble the complete list of tests to compute
  219. ftests[names(ftests) %in% forced_tests][forced_tests] <- tests[forced_tests]
  220. # Compute the tests (made safe with testify) on the variable, using the grouping variable
  221. df %>%
  222. purrr::map2(ftests, testify, group) %>%
  223. Reduce(f = rbind)
  224. }
  225. #' Create a subtable in a grouped desctable
  226. #'
  227. #' @param df Dataframe to use
  228. #' @param stats Stats list/function to use
  229. #' @param tests Tests list/function to use
  230. #' @param grps List of symbols for grouping factors
  231. #' @return A nested list of statTables and testColumns
  232. subTable <- function(df, stats, tests, grps) {
  233. # Final group, compute tests
  234. if (length(grps) == 1) {
  235. group <- factor(eval(grps[[1]], df))
  236. # Create the subtable stats
  237. df[!names(df) %in% as.character(grps[[1]])] %>%
  238. by(group, statTable, stats) %>%
  239. # Name the subtables with info about group and group size
  240. stats::setNames(subNames(grps[[1]], df)) -> stats
  241. # Create the subtable tests
  242. pvalues <- testColumn(df, tests, grps[[1]])
  243. c(stats, tests = list(pvalues))
  244. } else {
  245. group <- eval(grps[[1]], df)
  246. # Go through the next grouping levels and build the subtables
  247. df[!names(df) %in% as.character(grps[[1]])] %>%
  248. by(group, subTable, stats, tests, grps[-1]) %>%
  249. # Name the subtables with info about group and group size
  250. stats::setNames(subNames(grps[[1]], df))
  251. }
  252. }