maxx
/
desctable
mirror of https://github.com/maximewack/desctable


			
							#' Generate one statistic for all variables
#'
#' Use one stat function (made safe using statify) on all the data
#' to produce a single statistics column.
#'
#' The result is either a numeric vector, or a character vector if
#' the content of the column is not made entirely of numbers.
#'
#' @param stat The statistic to use
#' @param data The dataframe to apply the statistic to
#' @return A vector for one statistic column
statColumn <- function(stat, data) {
  # Apply one statified stat function to every variable in the data
  # Return a simple vector for the column
  # Statify checks types and output for the stat function. Returns a numeric vector or a character vector if needed.
  data %>%
    lapply(statify, stat) %>%
    unlist()
}


#' Generate the table of all statistics for all variables
#'
#' If stats is a list of functions, use them.
#' If it is a single function, use it with the entire data as
#' its argument to produce a list of statistical functions to use.
#'
#' @param data The dataframe to apply the statistic to
#' @param stats A list of named statistics to use
#' @return A dataframe of all statistics for all variables
statTable <- function(data, stats) {
  # If stats is a function, apply it to the data to obtain a list of stat functions
  # Else use the function list as-is
  if (is.function(stats)) stats = stats(data)

  # Compute a statColumn for every stat function in stats
  # Assemble the result in a dataframe
  stats %>%
    lapply(statColumn, data) %>%
    data.frame(check.names = F,
               row.names = NULL,
               stringsAsFactors = F)
}


#' Generate the variable column to display as row names
#'
#' Generates the variable column.
#' Replaces the variable names by their label if given in the named character vector labels, and inserts levels for factors.
#'
#' labels is an option named character vector used to make the table prettier.
#' If given, the variable names for which there is a label will be replaced by their corresponding label.
#' Not all variables need to have a label, and labels for non-existing variables are ignored.
#'
#' @param data The dataframe to get the names from
#' @param labels The optional named character vector containing the keypairs var = "Label"
#' @return A dataframe with one variable named "Variables", a character vector of variable names/labels and levels
varColumn <- function(data, labels = NULL) {
  # Every variable name that exists in the labels is to be replaced with its corresponding label
  # Labels for non-existing variables are ignored
  # Variables with no label are not replaced and used as-is
  base_names <- names(data)
  base_names[base_names %in% names(labels)] <- labels[base_names[base_names %in% names(labels)]]

  # Check if there are factors
  data %>%
    lapply(is.factor) %>%
    unlist() -> factors

  # Insert levels for factors after the variable name
  if (any(factors)) {
    factors_idx <- which(factors)

    # Factor names in **bold**
    base_names[factors_idx] <- paste0("**", base_names[factors_idx], "**")

    # Factor levels in *italic*
    factor_levels <- lapply(factors_idx, function(x) paste0(base_names[x], ": ", "*", levels(data[[x]]), "*"))

    # Insert the factor levels after each factor name
    base_names <- insert(x = base_names,
                         y = factor_levels,
                         position = factors_idx)
  }

  data.frame(Variables = base_names,
             check.names = F,
             row.names = NULL,
             stringsAsFactors = F)
}


#' Generate a statistics table
#'
#' Generate a statistics table with the chosen statistical functions, and tests if given a \code{"grouped"} dataframe.
#'
#' @section Labels:
#' labels is an option named character vector used to make the table prettier.
#'
#' If given, the variable names for which there is a label will be replaced by their corresponding label.
#'
#' Not all variables need to have a label, and labels for non-existing variables are ignored.
#'
#' labels must be given in the form c(unquoted_variable_name = "label")
#'
#' @section Stats:
#' The stats can be a function which takes a dataframe and returns a list of statistical functions to use.
#'
#' stats can also be a named list of statistical functions, or formulas.
#'
#' The names will be used as column names in the resulting table. If an element of the list is a function, it will be used as-is for the stats. If an element of the list is a formula, it can be used to conditionally use stats depending on the variable.
#'
#' The general form is \code{condition ~ T | F}, and can be nested, such as \code{is.factor ~ percent | (is.normal ~ mean | median)}, for example.
#'
#' @section Tests:
#' The tests can be a function which takes a variable and a grouping variable, and returns an appropriate statistical test to use in that case.
#'
#' tests can also be a named list of statistical test functions, associating the name of a variable in the data, and a test to use specifically for that variable.
#'
#' That test name must be expressed as a single-term formula (e.g. \code{~t.test}). You don't have to specify tests for all the variables: a default test for all other variables can be defined with the name \code{.default}, and an automatic test can be defined with the name \code{.auto}.
#'
#' If data is a grouped dataframe (using \code{group_by}), subtables are created and statistic tests are performed over each sub-group.
#'
#' @section Output:
#' The output is a desctable object, which is a list of named dataframes that can be further manipulated. Methods for printing, using in \pkg{pander} and \pkg{DT} are present. Printing reduces the object to a dataframe.
#'
#' @param data The dataframe to analyze
#' @param stats A list of named statistics to apply to each element of the dataframe, or a function returning a list of named statistics
#' @param tests A list of statistical tests to use when calling desctable with a grouped_df
#' @param labels A named character vector of labels to use instead of variable names
#' @return A desctable object, which prints to a table of statistics for all variables
#' @seealso \code{\link{stats_auto}}
#' @seealso \code{\link{tests_auto}}
#' @seealso \code{\link{print.desctable}}
#' @seealso \code{\link{pander.desctable}}
#' @seealso \code{\link{datatable.desctable}}
#' @export
#' @examples
#' iris %>%
#'   desctable()
#'
#' # Does the same as stats_auto here
#' iris %>%
#'   desctable(stats = list("N"      = length,
#'                          "%/Mean" = is.factor ~ percent | (is.normal ~ mean),
#'                          "sd"     = is.normal ~ sd,
#'                          "Med"    = is.normal ~ NA | median,
#'                          "IQR"    = is.normal ~ NA | IQR))
#'
#' # With labels
#' mtcars %>% desctable(labels = c(hp  = "Horse Power",
#'                                 cyl = "Cylinders",
#'                                 mpg = "Miles per gallon"))
#'
#' # With grouping on a factor
#' iris %>%
#'   group_by(Species) %>%
#'   desctable(stats = stats_default)
#'
#' # With nested grouping, on arbitrary variables
#' mtcars %>%
#'   group_by(vs, cyl) %>%
#'   desctable()
#'
#' # With grouping on a condition, and choice of tests
#' iris %>%
#'   group_by(Petal.Length > 5) %>%
#'   desctable(tests = list(.auto = tests_auto, Species = ~chisq.test))
desctable <- function(data, stats, tests, labels) {
  UseMethod("desctable", data)
}


#' @rdname desctable
#' @export
desctable.default <- function(data, stats = stats_auto, tests, labels = NULL) {
  # Assemble the Variables and the statTable in a single desctable object
  list(Variables = varColumn(data, labels),
       stats = statTable(data, stats)) %>%
  set_desctable_class()
}


#' @rdname desctable
#' @export
desctable.grouped_df <- function(data, stats = stats_auto, tests = tests_auto, labels = NULL) {
  # Get groups then ungroup dataframe
  grps <- dplyr::groups(data)
  data <- dplyr::ungroup(data)

  # Assemble the Variables (excluding the grouping ones) and the subTables recursively in a single desctable object
  c(Variables = list(varColumn(data[!names(data) %in% (grps %>% lapply(as.character) %>% unlist())], labels)),
    subTable(data, stats, tests, grps)) %>%
    set_desctable_class()
}


#' Create the subtables names
#'
#' Create the subtables names, as
#' factor: level (n=sub-group length)
#'
#' @param grp Grouping factor
#' @param df Dataframe containing the grouping factor
#' @return A character vector with the names for the subtables
subNames <- function(grp, df) {
  paste0(as.character(grp),
         ": ",
         eval(grp, df) %>% factor() %>% levels(),
         " (n=",
         summary(eval(grp, df) %>% factor() %>% stats::na.omit(), maxsum = Inf),
         ")")
}


#' Create the pvalues column
#'
#' @param df Dataframe to use for the tests
#' @param tests Test function or list of functions
#' @param grp Grouping factor
#' @return A numeric vector of pvalues
testColumn <- function(df, tests, grp) {
  group <- eval(grp, df)

  df <- df[!names(df) %in% as.character(grp)]

  # If tests is a function, apply it to the data and the grouping factor to produce a list of tests
  # If there is an .auto element in the list of tests, apply the function as previously to select the relevant test
  # If there is a .default element, use it as tests
  # Else fall back on kruskal.test
  if (is.function(tests)) {
    ftests <- lapply(df, tests, factor(group))
    tests <- ftests
  } else if (!is.null(tests$.auto))  ftests <- lapply(df, tests$.auto, factor(group))
  else if (!is.null(tests$.default)) ftests <- lapply(df, function(x){tests$.default})
  else                               ftests <- lapply(df, function(x){stats::kruskal.test})

  # Select the forced (named) tests
  tests %>%
    names() %>%
    setdiff(".auto") %>%
    intersect(names(df)) -> forced_tests

  # Assemble the complete list of tests to compute
  ftests[names(ftests) %in% forced_tests][forced_tests] <- tests[forced_tests]

  # Compute the tests (made safe with testify) on the variable, using the grouping variable
  df %>%
    purrr::map2(ftests, testify, group) %>%
    Reduce(f = rbind)
}


#' Create a subtable in a grouped desctable
#'
#' @param df Dataframe to use
#' @param stats Stats list/function to use
#' @param tests Tests list/function to use
#' @param grps List of symbols for grouping factors
#' @return A nested list of statTables and testColumns
subTable <- function(df, stats, tests, grps) {
  # Final group, compute tests
  if (length(grps) == 1) {
    group <- factor(eval(grps[[1]], df))

    # Create the subtable stats
    df[!names(df) %in% as.character(grps[[1]])] %>%
      by(group, statTable, stats) %>%
      # Name the subtables with info about group and group size
      stats::setNames(subNames(grps[[1]], df)) -> stats

    # Create the subtable tests
    pvalues <- testColumn(df, tests, grps[[1]])

    c(stats, tests = list(pvalues))
  } else {
    group <- eval(grps[[1]], df)

    # Go through the next grouping levels and build the subtables
    df[!names(df) %in% as.character(grps[[1]])] %>%
      by(group, subTable, stats, tests, grps[-1]) %>%
      # Name the subtables with info about group and group size
      stats::setNames(subNames(grps[[1]], df))
  }
}