You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

319 lines
12KB

  1. #' Generate one statistic for all variables
  2. #'
  3. #' Use one stat function (made safe using statify) on all the data
  4. #' to produce a single statistics column.
  5. #'
  6. #' The result is either a numeric vector, or a character vector if
  7. #' the content of the column is not made entirely of numbers.
  8. #'
  9. #' @param stat The statistic to use
  10. #' @param data The dataframe to apply the statistic to
  11. #' @keywords internal
  12. #' @return A vector for one statistic column
  13. statColumn <- function(stat, data) {
  14. # Apply one statified stat function to every variable in the data
  15. # Return a simple vector for the column
  16. # Statify checks types and output for the stat function. Returns a numeric vector or a character vector if needed.
  17. if (length(stat) == 3) # remove after 1.0
  18. warning("Conditional formulas are deprecated and will be removed in 1.0.0
  19. purrr::map style formulas are used now.
  20. For example, `is.normal ~ mean | median` becomes `~ if (is.normal(.)) mean(.) else median(.)`")
  21. data %>%
  22. lapply(statify, stat) %>%
  23. unlist()
  24. }
  25. #' Generate the table of all statistics for all variables
  26. #'
  27. #' If stats is a list of functions or purrr::map like formulas, use them.
  28. #' If it is a single function, use it with the entire data as
  29. #' its argument to produce a list of statistical functions to use.
  30. #'
  31. #' @param data The dataframe to apply the statistic to
  32. #' @param stats A list of named statistics to use
  33. #' @keywords internal
  34. #' @return A dataframe of all statistics for all variables
  35. statTable <- function(data, stats) {
  36. # If stats is a function, apply it to the data to obtain a list of stat functions
  37. # Else use the function list as-is
  38. if (is.function(stats)) stats = stats(data) # remove after 1.0
  39. # Compute a statColumn for every stat function in stats
  40. # Assemble the result in a dataframe
  41. stats %>%
  42. lapply(statColumn, data) %>%
  43. data.frame(check.names = F,
  44. row.names = NULL,
  45. stringsAsFactors = F)
  46. }
  47. #' Generate the variable column to display as row names
  48. #'
  49. #' Generates the variable column.
  50. #' Replaces the variable names by their label if given in the named character vector labels, and inserts levels for factors.
  51. #'
  52. #' labels is an option named character vector used to make the table prettier.
  53. #' If given, the variable names for which there is a label will be replaced by their corresponding label.
  54. #' Not all variables need to have a label, and labels for non-existing variables are ignored.
  55. #'
  56. #' @param data The dataframe to get the names from
  57. #' @param labels The optional named character vector containing the keypairs var = "Label"
  58. #' @keywords internal
  59. #' @return A dataframe with one variable named "Variables", a character vector of variable names/labels and levels
  60. varColumn <- function(data, labels = NULL) {
  61. # Every variable name that exists in the labels is to be replaced with its corresponding label
  62. # Labels for non-existing variables are ignored
  63. # Variables with no label are not replaced and used as-is
  64. base_names <- names(data)
  65. base_names[base_names %in% names(labels)] <- labels[base_names[base_names %in% names(labels)]]
  66. # Check if there are factors
  67. data %>%
  68. lapply(is.factor) %>%
  69. unlist() -> factors
  70. # Insert levels for factors after the variable name
  71. if (any(factors)) {
  72. factors_idx <- which(factors)
  73. # Factor names in **bold**
  74. base_names[factors_idx] <- paste0("**", base_names[factors_idx], "**")
  75. # Factor levels in *italic*
  76. factor_levels <- lapply(factors_idx, function(x) paste0(base_names[x], ": ", "*", levels(data[[x]]), "*"))
  77. # Insert the factor levels after each factor name
  78. base_names <- insert(x = base_names,
  79. y = factor_levels,
  80. position = factors_idx)
  81. }
  82. data.frame(Variables = base_names,
  83. check.names = F,
  84. row.names = NULL,
  85. stringsAsFactors = F)
  86. }
  87. #' Create the pvalues column
  88. #'
  89. #' @param df Dataframe to use for the tests
  90. #' @param tests Test function or list of functions
  91. #' @param grp Grouping factor
  92. #' @keywords internal
  93. #' @return A numeric vector of pvalues
  94. testColumn <- function(df, tests, grp) {
  95. group <- eval(grp, df)
  96. df <- df[!names(df) %in% as.character(grp)]
  97. # If tests is a function, apply it to the data and the grouping factor to produce a list of tests
  98. # If there is an .auto element in the list of tests, apply the function as previously to select the relevant test
  99. # If there is a .default element, use it as tests
  100. # Else fall back on kruskal.test
  101. if (is.function(tests)) { # remove after 1.0
  102. ftests <- lapply(df, tests, factor(group))
  103. tests <- ftests
  104. } else if (!is.null(tests$.default)) ftests <- lapply(df, function(x){tests$.default})
  105. else if (!is.null(tests$.auto)) ftests <- lapply(df, tests$.auto, factor(group))
  106. else ftests <- lapply(df, function(x){stats::kruskal.test})
  107. # Select the forced (named) tests
  108. tests %>%
  109. names() %>%
  110. setdiff(".auto") %>%
  111. intersect(names(df)) -> forced_tests
  112. # Assemble the complete list of tests to compute
  113. ftests[names(ftests) %in% forced_tests][forced_tests] <- tests[forced_tests]
  114. # Compute the tests (made safe with testify) on the variable, using the grouping variable
  115. mapply(testify, df, ftests, MoreArgs = list(group = group), SIMPLIFY = F) %>%
  116. Reduce(f = rbind)
  117. }
  118. #' Generate a statistics table
  119. #'
  120. #' Generate a statistics table with the chosen statistical functions, nested if called with a grouped dataframe.
  121. #'
  122. #' @section Stats:
  123. #' The statistical functions to use in the table are passed as additional arguments.
  124. #' If the argument is named (eg. \code{N = length}) the name will be used as the column title instead of the function
  125. #' name (here, \strong{N} instead of \strong{length}).
  126. #'
  127. #' Any R function can be a statistical function, as long as it returns only one value when applied to a vector, or as
  128. #' many values as there are levels in a factor, plus one.
  129. #'
  130. #' Users can also use \code{purrr::map}-like formulas as quick anonymous functions (eg. \code{Q1 = ~ quantile(., .25)} to get the first quantile in a
  131. #' column named \strong{Q1})
  132. #'
  133. #' If no statistical function is given to \code{desc_table}, the \code{.auto} argument is used to provide a function
  134. #' that automatically determines the most appropriate statistical functions to use based on the contents of the table.
  135. #'
  136. #' @section Labels:
  137. #' \code{.labels} is a named character vector to provide "pretty" labels to variables.
  138. #'
  139. #' If given, the variable names for which there is a label will be replaced by their corresponding label.
  140. #'
  141. #' Not all variables need to have a label, and labels for non-existing variables are ignored.
  142. #'
  143. #' labels must be given in the form \code{c(unquoted_variable_name = "label")}
  144. #'
  145. #' @section Output:
  146. #' The output is either a dataframe in the case of a simple descriptive table,
  147. #' or nested dataframes in the case of a comparative table.
  148. #'
  149. #' @param data The dataframe to analyze
  150. #' @param ... A list of named statistics to apply to each element of the dataframe, or a function returning a list of named statistics
  151. #' @param .auto A function to automatically determine appropriate statistics
  152. #' @param .labels A named character vector of variable labels
  153. #' @return A simple or grouped descriptive table
  154. #' @seealso \code{\link{stats_auto}}
  155. #' @seealso \code{\link{IQR}}
  156. #' @seealso \code{\link{percent}}
  157. #' @export
  158. #' @family desc_table core functions
  159. #' @examples
  160. #' iris %>%
  161. #' desc_table()
  162. #'
  163. #' # Does the same as stats_auto here
  164. #' iris %>%
  165. #' desc_table("N" = length,
  166. #' "Min" = min,
  167. #' "Q1" = ~quantile(., .25),
  168. #' "Med" = median,
  169. #' "Mean" = mean,
  170. #' "Q3" = ~quantile(., .75),
  171. #' "Max" = max,
  172. #' "sd" = sd,
  173. #' "IQR" = IQR)
  174. #'
  175. #' # With grouping on a factor
  176. #' iris %>%
  177. #' group_by(Species) %>%
  178. #' desc_table(.auto = stats_auto)
  179. desc_table <- function(data, ..., .auto, .labels) {
  180. UseMethod("desc_table", data)
  181. }
  182. #' @rdname desc_table
  183. #' @export
  184. desc_table.default <- function(data, ..., .auto, .labels) {
  185. stop("`desc_table` must be called on a data.frame")
  186. }
  187. #' @rdname desc_table
  188. #' @export
  189. desc_table.data.frame <- function(data, ..., .labels = NULL, .auto = stats_auto) {
  190. stats <- rlang::dots_list(..., .named = T)
  191. if (length(stats) == 0 & is.null(.auto)) {
  192. stop("desc_table needs at least one statistic function, or an automatic function in .stats_auto")
  193. } else if (length(stats) == 0) {
  194. stats <- .auto(data)
  195. }
  196. # Assemble the Variables and the statTable in a single desctable object
  197. cbind(varColumn(data, .labels),
  198. statTable(data, stats))
  199. }
  200. #' @rdname desc_table
  201. #' @export
  202. desc_table.grouped_df <- function(data, ..., .auto = stats_auto, .labels = NULL) {
  203. # Get groups then ungroup dataframe
  204. grps <- dplyr::groups(data)
  205. if (length(grps) > 1) {
  206. warning("Only the first group will be used")
  207. data <- dplyr::ungroup(data, !!! grps[-1])
  208. }
  209. stats <- rlang::dots_list(..., .named = T)
  210. desctable <- tidyr::nest(data)
  211. if (length(stats) == 0 & is.null(.auto)) {
  212. stop("desc_table needs at least one statistic function, or an automatic function in .stats_auto")
  213. } else if (length(stats) == 0) {
  214. stats <- lapply(desctable$data, .auto)
  215. }
  216. if (is.list(stats[[1]])) {
  217. desctable$.stats <- mapply(statTable, desctable$data, stats, SIMPLIFY = F)
  218. } else {
  219. desctable$.stats <- lapply(desctable$data, statTable, stats)
  220. }
  221. desctable$.vars <- list(varColumn(data[!names(data) %in% (grps %>% lapply(as.character) %>% unlist())], .labels))
  222. desctable
  223. }
  224. #' Add tests to a desc_table
  225. #'
  226. #' Add test statistics to a grouped desc_table, with the tests specified as \code{variable = test}.
  227. #'
  228. #' @section Tests:
  229. #' The statistical test functions to use in the table are passed as additional named arguments. Tests must be preceded
  230. #' by a formula tilde (\code{~}).
  231. #' \code{name = ~test} will apply test \code{test} to variable \code{name}.
  232. #'
  233. #' Any R test function can be used, as long as it returns an object containing a \code{p.value} element, which is the
  234. #' case for most tests returning an object of class \code{htest}.
  235. #'
  236. #' Users can also use \code{purrr::map}-like formulas as quick anonymous functions (eg. \code{~ t.test(., var.equal = T)} to
  237. #' compute a t test without the Welch correction.
  238. #'
  239. #' @param desctable A desc_table
  240. #' @param ... A list of statistical tests associated to variable names
  241. #' @param .auto A function to automatically determine the appropriate tests
  242. #' @param .default A default fallback test
  243. #' @seealso \code{\link{tests_auto}}
  244. #' @seealso \code{\link{no.test}}
  245. #' @seealso \code{\link{ANOVA}}
  246. #' @return A desc_table with tests
  247. #' @export
  248. #' @family desc_table core functions
  249. #' @examples
  250. #' iris %>%
  251. #' group_by(Species) %>%
  252. #' desc_table() %>%
  253. #' desc_tests(Sepal.Length = ~kruskal.test,
  254. #' Sepal.Width = ~oneway.test,
  255. #' Petal.Length = ~oneway.test(., var.equal = T),
  256. #' Petal.Length = ~oneway.test(., var.equal = F))
  257. desc_tests <- function(desctable, .auto = tests_auto, .default = NULL, ...) {
  258. if (which.desctable(desctable) != "grouped")
  259. stop("Unexpected input. `desc_tests` must be used on the output of `desc_table` on a grouped dataframe.\n
  260. For example: iris %>% group_by(Species) %>% desc_table() %>% desc_tests")
  261. fulldata <- tidyr::unnest(desctable, "data")
  262. fulldata$.tests <- NULL
  263. fulldata$.stats <- NULL
  264. fulldata$.vars <- NULL
  265. tests <- list(...)
  266. if (!(all(names(desctable$data[[1]]) %in% names(tests))) & is.null(.auto) & is.null(.default)) {
  267. stop("desc_tests needs either a full specification of tests, or include a .auto or a .default function for non specified-tests")
  268. } else {
  269. tests <- c(list(...), list(.auto = .auto, .default = .default))
  270. }
  271. desctable$.tests <- list(testColumn(fulldata, tests, as.symbol(names(desctable)[1])))
  272. desctable
  273. }