|
- #' Generate one statistic for all variables
- #'
- #' Use one stat function (made safe using statify) on all the data
- #' to produce a single statistics column.
- #'
- #' The result is either a numeric vector, or a character vector if
- #' the content of the column is not made entirely of numbers.
- #'
- #' @param stat The statistic to use
- #' @param data The dataframe to apply the statistic to
- #' @return A vector for one statistic column
- statColumn <- function(stat, data) {
- # Apply one statified stat function to every variable in the data
- # Return a simple vector for the column
- # Statify checks types and output for the stat function. Returns a numeric vector or a character vector if needed.
- data %>%
- lapply(statify, stat) %>%
- unlist()
- }
-
-
- #' Generate the table of all statistics for all variables
- #'
- #' If stats is a list of functions, use them.
- #' If it is a single function, use it with the entire data as
- #' its argument to produce a list of statistical functions to use.
- #'
- #' @param data The dataframe to apply the statistic to
- #' @param stats A list of named statistics to use
- #' @return A dataframe of all statistics for all variables
- statTable <- function(data, stats) {
- # If stats is a function, apply it to the data to obtain a list of stat functions
- # Else use the function list as-is
- if (is.function(stats)) stats = stats(data)
-
- # Compute a statColumn for every stat function in stats
- # Assemble the result in a dataframe
- stats %>%
- lapply(statColumn, data) %>%
- data.frame(check.names = F,
- row.names = NULL,
- stringsAsFactors = F)
- }
-
-
- #' Generate the variable column to display as row names
- #'
- #' Generates the variable column.
- #' Replaces the variable names by their label if given in the named character vector labels, and inserts levels for factors.
- #'
- #' labels is an option named character vector used to make the table prettier.
- #' If given, the variable names for which there is a label will be replaced by their corresponding label.
- #' Not all variables need to have a label, and labels for non-existing variables are ignored.
- #'
- #' @param data The dataframe to get the names from
- #' @param labels The optional named character vector containing the keypairs var = "Label"
- #' @return A dataframe with one variable named "Variables", a character vector of variable names/labels and levels
- varColumn <- function(data, labels = NULL) {
- # Every variable name that exists in the labels is to be replaced with its corresponding label
- # Labels for non-existing variables are ignored
- # Variables with no label are not replaced and used as-is
- base_names <- names(data)
- base_names[base_names %in% names(labels)] <- labels[base_names[base_names %in% names(labels)]]
-
- # Check if there are factors
- data %>%
- lapply(is.factor) %>%
- unlist() -> factors
-
- # Insert levels for factors after the variable name
- if (any(factors)) {
- factors_idx <- which(factors)
-
- # Factor names in **bold**
- base_names[factors_idx] <- paste0("**", base_names[factors_idx], "**")
-
- # Factor levels in *italic*
- factor_levels <- lapply(factors_idx, function(x) paste0(base_names[x], ": ", "*", levels(data[[x]]), "*"))
-
- # Insert the factor levels after each factor name
- base_names <- insert(x = base_names,
- y = factor_levels,
- position = factors_idx)
- }
-
- data.frame(Variables = base_names,
- check.names = F,
- row.names = NULL,
- stringsAsFactors = F)
- }
-
-
- #' Generate a statistics table
- #'
- #' Generate a statistics table with the chosen statistical functions, and tests if given a \code{"grouped"} dataframe.
- #'
- #' @section Labels:
- #' labels is an option named character vector used to make the table prettier.
- #'
- #' If given, the variable names for which there is a label will be replaced by their corresponding label.
- #'
- #' Not all variables need to have a label, and labels for non-existing variables are ignored.
- #'
- #' labels must be given in the form c(unquoted_variable_name = "label")
- #'
- #' @section Stats:
- #' The stats can be a function which takes a dataframe and returns a list of statistical functions to use.
- #'
- #' stats can also be a named list of statistical functions, or formulas.
- #'
- #' The names will be used as column names in the resulting table. If an element of the list is a function, it will be used as-is for the stats. If an element of the list is a formula, it can be used to conditionally use stats depending on the variable.
- #'
- #' The general form is \code{condition ~ T | F}, and can be nested, such as \code{is.factor ~ percent | (is.normal ~ mean | median)}, for example.
- #'
- #' @section Tests:
- #' The tests can be a function which takes a variable and a grouping variable, and returns an appropriate statistical test to use in that case.
- #'
- #' tests can also be a named list of statistical test functions, associating the name of a variable in the data, and a test to use specifically for that variable.
- #'
- #' That test name must be expressed as a single-term formula (e.g. \code{~t.test}). You don't have to specify tests for all the variables: a default test for all other variables can be defined with the name \code{.default}, and an automatic test can be defined with the name \code{.auto}.
- #'
- #' If data is a grouped dataframe (using \code{group_by}), subtables are created and statistic tests are performed over each sub-group.
- #'
- #' @section Output:
- #' The output is a desctable object, which is a list of named dataframes that can be further manipulated. Methods for printing, using in \pkg{pander} and \pkg{DT} are present. Printing reduces the object to a dataframe.
- #'
- #' @param data The dataframe to analyze
- #' @param stats A list of named statistics to apply to each element of the dataframe, or a function returning a list of named statistics
- #' @param tests A list of statistical tests to use when calling desctable with a grouped_df
- #' @param labels A named character vector of labels to use instead of variable names
- #' @return A desctable object, which prints to a table of statistics for all variables
- #' @seealso \code{\link{stats_auto}}
- #' @seealso \code{\link{tests_auto}}
- #' @seealso \code{\link{print.desctable}}
- #' @seealso \code{\link{pander.desctable}}
- #' @seealso \code{\link{datatable.desctable}}
- #' @export
- #' @examples
- #' iris %>%
- #' desctable()
- #'
- #' # Does the same as stats_auto here
- #' iris %>%
- #' desctable(stats = list("N" = length,
- #' "%/Mean" = is.factor ~ percent | (is.normal ~ mean),
- #' "sd" = is.normal ~ sd,
- #' "Med" = is.normal ~ NA | median,
- #' "IQR" = is.normal ~ NA | IQR))
- #'
- #' # With labels
- #' mtcars %>% desctable(labels = c(hp = "Horse Power",
- #' cyl = "Cylinders",
- #' mpg = "Miles per gallon"))
- #'
- #' # With grouping on a factor
- #' iris %>%
- #' group_by(Species) %>%
- #' desctable(stats = stats_default)
- #'
- #' # With nested grouping, on arbitrary variables
- #' mtcars %>%
- #' group_by(vs, cyl) %>%
- #' desctable()
- #'
- #' # With grouping on a condition, and choice of tests
- #' iris %>%
- #' group_by(Petal.Length > 5) %>%
- #' desctable(tests = list(.auto = tests_auto, Species = ~chisq.test))
- desctable <- function(data, stats, tests, labels) {
- UseMethod("desctable", data)
- }
-
-
- #' @rdname desctable
- #' @export
- desctable.default <- function(data, stats = stats_auto, tests, labels = NULL) {
- # Assemble the Variables and the statTable in a single desctable object
- list(Variables = varColumn(data, labels),
- stats = statTable(data, stats)) %>%
- set_desctable_class()
- }
-
-
- #' @rdname desctable
- #' @export
- desctable.grouped_df <- function(data, stats = stats_auto, tests = tests_auto, labels = NULL) {
- # Get groups then ungroup dataframe
- grps <- dplyr::groups(data)
- data <- dplyr::ungroup(data)
-
- # Assemble the Variables (excluding the grouping ones) and the subTables recursively in a single desctable object
- c(Variables = list(varColumn(data[!names(data) %in% (grps %>% lapply(as.character) %>% unlist())], labels)),
- subTable(data, stats, tests, grps)) %>%
- set_desctable_class()
- }
-
-
- #' Create the subtables names
- #'
- #' Create the subtables names, as
- #' factor: level (n=sub-group length)
- #'
- #' @param grp Grouping factor
- #' @param df Dataframe containing the grouping factor
- #' @return A character vector with the names for the subtables
- subNames <- function(grp, df) {
- paste0(as.character(grp),
- ": ",
- eval(grp, df) %>% factor() %>% levels(),
- " (n=",
- summary(eval(grp, df) %>% factor() %>% stats::na.omit(), maxsum = Inf),
- ")")
- }
-
-
- #' Create the pvalues column
- #'
- #' @param df Dataframe to use for the tests
- #' @param tests Test function or list of functions
- #' @param grp Grouping factor
- #' @return A numeric vector of pvalues
- testColumn <- function(df, tests, grp) {
- group <- eval(grp, df)
-
- df <- df[!names(df) %in% as.character(grp)]
-
- # If tests is a function, apply it to the data and the grouping factor to produce a list of tests
- # If there is an .auto element in the list of tests, apply the function as previously to select the relevant test
- # If there is a .default element, use it as tests
- # Else fall back on kruskal.test
- if (is.function(tests)) {
- ftests <- lapply(df, tests, factor(group))
- tests <- ftests
- } else if (!is.null(tests$.auto)) ftests <- lapply(df, tests$.auto, factor(group))
- else if (!is.null(tests$.default)) ftests <- lapply(df, function(x){tests$.default})
- else ftests <- lapply(df, function(x){stats::kruskal.test})
-
- # Select the forced (named) tests
- tests %>%
- names() %>%
- setdiff(".auto") %>%
- intersect(names(df)) -> forced_tests
-
- # Assemble the complete list of tests to compute
- ftests[names(ftests) %in% forced_tests][forced_tests] <- tests[forced_tests]
-
- # Compute the tests (made safe with testify) on the variable, using the grouping variable
- df %>%
- purrr::map2(ftests, testify, group) %>%
- Reduce(f = rbind)
- }
-
-
- #' Create a subtable in a grouped desctable
- #'
- #' @param df Dataframe to use
- #' @param stats Stats list/function to use
- #' @param tests Tests list/function to use
- #' @param grps List of symbols for grouping factors
- #' @return A nested list of statTables and testColumns
- subTable <- function(df, stats, tests, grps) {
- # Final group, compute tests
- if (length(grps) == 1) {
- group <- factor(eval(grps[[1]], df))
-
- # Create the subtable stats
- df[!names(df) %in% as.character(grps[[1]])] %>%
- by(group, statTable, stats) %>%
- # Name the subtables with info about group and group size
- stats::setNames(subNames(grps[[1]], df)) -> stats
-
- # Create the subtable tests
- pvalues <- testColumn(df, tests, grps[[1]])
-
- c(stats, tests = list(pvalues))
- } else {
- group <- eval(grps[[1]], df)
-
- # Go through the next grouping levels and build the subtables
- df[!names(df) %in% as.character(grps[[1]])] %>%
- by(group, subTable, stats, tests, grps[-1]) %>%
- # Name the subtables with info about group and group size
- stats::setNames(subNames(grps[[1]], df))
- }
- }
|