You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

267 lines
9.5KB

  1. #' Generate one statistic for all variables
  2. #'
  3. #' @param stat The statistic to use
  4. #' @param data The dataframe to apply the statistic to
  5. #' @return A vector for one statistic column
  6. statColumn <- function(stat, data)
  7. {
  8. data %>%
  9. lapply(statify, stat) %>%
  10. unlist
  11. }
  12. #' Generate the table of all statistics for all variables
  13. #'
  14. #' @param data The dataframe to apply the statistic to
  15. #' @param stats A list of named statistics to use
  16. #' @return A dataframe of all statistics for all variables
  17. statTable <- function(data, stats)
  18. {
  19. # Call the stats arg_function passed, or use the provided list as-is
  20. if (is.function(stats))
  21. stats = stats(data)
  22. stats %>%
  23. lapply(statColumn, data) %>%
  24. data.frame(check.names = F, row.names = NULL, stringsAsFactors = F)
  25. }
  26. #' Generate the variable column to display as row names
  27. #'
  28. #' Generates the variable column.
  29. #' Replaces the variable names by their label if given in the named character vector labels, and inserts levels for factors.
  30. #'
  31. #' labels is an option named character vector used to make the table prettier.
  32. #' If given, the variable names for which there is a label will be replaced by their corresponding label.
  33. #' Not all variables need to have a label, and labels for non-existing variables are ignored.
  34. #'
  35. #' @param data The dataframe to get the names from
  36. #' @param labels The optional named character vector containing the keypairs var = "Label"
  37. #' @return A dataframe with one variable named "Variables", a character vector of variable names/labels and levels
  38. varColumn <- function(data, labels = NULL)
  39. {
  40. # Replace variable names by their labels, if they exist
  41. names(data) -> base_names
  42. base_names[base_names %in% names(labels)] <- labels[base_names[base_names %in% names(labels)]]
  43. # Insert levels for factors after the variable name
  44. if (any(data %>% lapply(is.factor) %>% unlist))
  45. {
  46. data %>%
  47. lapply(is.factor) %>%
  48. unlist %>%
  49. which -> factors_idx
  50. base_names[factors_idx] <- paste0("**", base_names[factors_idx], "**")
  51. factor_levels <-
  52. factors_idx %>%
  53. lapply(function(x)
  54. {
  55. paste0(base_names[x],
  56. ": ",
  57. "*",
  58. levels(data[[x]]),
  59. "*")
  60. })
  61. insert(x = base_names,
  62. y = factor_levels,
  63. position = factors_idx) -> base_names
  64. }
  65. data.frame(Variables = base_names, check.names = F, row.names = NULL, stringsAsFactors = F)
  66. }
  67. #' Generate a statistics table
  68. #'
  69. #' Generate a statistics table with variable names/labels and levels
  70. #'
  71. #' labels is an option named character vector used to make the table prettier.
  72. #' If given, the variable names for which there is a label will be replaced by their corresponding label.
  73. #' Not all variables need to have a label, and labels for non-existing variables are ignored.
  74. #' labels must be given in the form c(unquoted_variable_name = "label")
  75. #'
  76. #' The stats can be a function which takes a dataframe and returns a list of statistical functions to use.
  77. #' stats can also be a named list of statistical functions, or formulas. The names will be used as column names in the resulting table. If an element of the list is a function, it will be used as-is for the stats. If an element of the list is a formula, it can be used to conditionally use stats depending on the variable. The general form is `condition ~ T | F`, and can be nested, such as `is.factor ~ percent | (is.normal ~ mean | median)`, for example.
  78. #'
  79. #' The tests can be a function which takes a variable and a grouping variable, and returns an appropriate statistical test to use in that case.
  80. #' tests can also be a named list of statistical test functions, associating the name of a variable in the data, and a test to use specifically for that variable. That test name must be expressed as a single-term formula (e.g. ~t.test). You don't have to specify tests for all the variables: a default test for all other variables can be defined with the name .default, and an automatic test can be defined with the name .auto.
  81. #'
  82. #' If data is a grouped dataframe (using group_by), subtables are created and statistic tests are performed over each sub-group.
  83. #'
  84. #' The output is a desctable object, which is a list of named dataframes that can be further manipulated. Methods for printing, using in pander and DT::datatable are present. Printing reduces the object to a dataframe.
  85. #'
  86. #' @param data The dataframe to analyze
  87. #' @param stats A list of named statistics to apply to each element of the dataframe, or a function returning a list of named statistics
  88. #' @param tests A list of statistical tests to use when calling desctable with a grouped_df
  89. #' @param labels A named character vector of labels to use instead of variable names
  90. #' @return A desctable object, which prints to a table of statistics for all variables
  91. #' @seealso \code{\link{stats_auto}}
  92. #' @seealso \code{\link{tests_auto}}
  93. #' @seealso \code{\link{print.desctable}}
  94. #' @seealso \code{\link{pander.desctable}}
  95. #' @seealso \code{\link{datatable.desctable}}
  96. #' @export
  97. #' @examples
  98. #' \dontrun{
  99. #' iris %>%
  100. #' desctable
  101. #'
  102. #' # Does the same as stats_auto here
  103. #' iris %>%
  104. #' desctable(stats = list("N" = length,
  105. #' "%/Mean" = is.factor ~ percent | (is.normal ~ mean),
  106. #' "sd" = is.normal ~ sd,
  107. #' "Med" = is.normal ~ NA | median,
  108. #' "IQR" = is.normal ~ NA | IQR))
  109. #'
  110. #' # With labels
  111. #' mtcars %>% desctable(labels = c(hp = "Horse Power",
  112. #' cyl = "Cylinders",
  113. #' mpg = "Miles per gallon"))
  114. #'
  115. #' # With grouping on a factor
  116. #' iris %>%
  117. #' group_by(Species) %>%
  118. #' desctable(stats = stats_default)
  119. #'
  120. #' # With nested grouping, on arbitrary variables
  121. #' mtcars %>%
  122. #' group_by(vs, cyl) %>%
  123. #' desctable
  124. #'
  125. #' # With grouping on a condition, and choice of tests
  126. #' iris %>%
  127. #' group_by(Petal.Length > 5) %>%
  128. #' desctable(tests = list(.auto = tests_auto, Species = ~chisq.test))
  129. #' }
  130. desctable <- function(data, stats, tests, labels)
  131. {
  132. # Replace every logical vector with a factor and nice labels
  133. if (any(data %>% lapply(is.logical) %>% unlist))
  134. data %>% purrr::dmap_if(is.logical, factor, levels = c(F, T), labels = c("No", "Yes")) -> data
  135. UseMethod("desctable", data)
  136. }
  137. #' @rdname desctable
  138. #' @export
  139. desctable.default <- function(data, stats = stats_auto, tests, labels = NULL)
  140. {
  141. # Build the complete table
  142. list(Variables = varColumn(data, labels),
  143. stats = statTable(data, stats)) %>%
  144. `class<-`("desctable")
  145. }
  146. #' @rdname desctable
  147. #' @export
  148. desctable.grouped_df <- function(data, stats = stats_auto, tests = tests_auto, labels = NULL)
  149. {
  150. # Get groups then ungroup dataframe
  151. grps <- data %>% dplyr::groups()
  152. data <- dplyr::ungroup(data)
  153. # Build the complete table recursively, assign "desctable" class
  154. c(Variables = list(varColumn(data[!names(data) %in% (grps %>% lapply(as.character) %>% unlist)], labels)),
  155. subTable(data, stats, tests, grps)) %>%
  156. `class<-`("desctable")
  157. }
  158. #' Create the subtables names
  159. #'
  160. #' Create the subtables names, as
  161. #' factor: level (n=sub-group length)
  162. #'
  163. #' @param grp Grouping factor
  164. #' @param df Dataframe containing the grouping factor
  165. #' @return A character vector with the names for the subtables
  166. subNames <- function(grp, df)
  167. {
  168. paste0(as.character(grp),
  169. ": ",
  170. eval(grp, df) %>% factor %>% levels,
  171. " (n=",
  172. summary(eval(grp, df) %>% factor %>% stats::na.omit(), maxsum = Inf),
  173. ")")
  174. }
  175. #' Create the pvalues column
  176. #'
  177. #' @param df Dataframe to use for the tests
  178. #' @param tests Test function or list of functions
  179. #' @param grp Grouping factor
  180. #' @return A numeric vector of pvalues
  181. testColumn <- function(df, tests, grp)
  182. {
  183. group <- eval(grp, df)
  184. df <- df %>%
  185. dplyr::select(- eval(grp))
  186. if (is.function(tests))
  187. {
  188. ftests <- df %>%
  189. lapply(tests, group %>% factor)
  190. tests <- ftests
  191. } else if (!is.null(tests$.auto))
  192. {
  193. ftests <- df %>%
  194. lapply(tests$.auto, group %>% factor)
  195. } else if (!is.null(tests$.default))
  196. {
  197. ftests <- df %>%
  198. lapply(function(x){tests$.default})
  199. } else
  200. {
  201. ftests <- df %>%
  202. lapply(function(x){stats::kruskal.test})
  203. }
  204. names(tests) %>% setdiff(".auto") %>% intersect(names(df)) -> forced_tests
  205. ftests[names(ftests) %in% forced_tests][forced_tests] <- tests[forced_tests]
  206. df %>%
  207. purrr::map2(ftests, testify, group) %>%
  208. dplyr::bind_rows()
  209. }
  210. #' Create a subtable in a grouped desctable
  211. #'
  212. #' @param df Dataframe to use
  213. #' @param stats Stats list/function to use
  214. #' @param tests Tests list/function to use
  215. #' @param grps List of symbols for grouping factors
  216. #' @return A nested list of statTables and testColumns
  217. subTable <- function(df, stats, tests, grps)
  218. {
  219. # Final group, make tests
  220. if (length(grps) == 1)
  221. {
  222. group <- eval(grps[[1]], df) %>% factor
  223. # Create the subtable stats
  224. df %>%
  225. dplyr::select(- eval(grps[[1]])) %>%
  226. by(group, statTable, stats) %>%
  227. # Name the subtables with info about group and group size
  228. stats::setNames(subNames(grps[[1]], df)) -> stats
  229. # Create the subtable tests
  230. testColumn(df, tests, grps[[1]]) -> pvalues
  231. c(stats, tests = list(pvalues))
  232. }
  233. else
  234. {
  235. group <- eval(grps[[1]], df)
  236. # Go through the next grouping levels and build the subtables
  237. df %>%
  238. dplyr::select(- eval(grps[[1]])) %>%
  239. by(group, subTable, stats, tests, grps[-1]) %>%
  240. # Name the subtables with info about group and group size
  241. stats::setNames(subNames(grps[[1]], df))
  242. }
  243. }