maxx
/
desctable
mirror of https://github.com/maximewack/desctable


			
							#' Return the percentages for the levels of a factor
#'
#' Return a compatible vector of length nlevels(x) + 1
#' to print the percentages of each level of a factor
#'
#' @param x A factor
#' @export
#' @return A nlevels(x) + 1 length vector of percentages
percent <- function(x) {
  if (is.factor(x)) c(NA, summary(x, maxsum = Inf) / length(x)) * 100
  else              NA
}


#' Return the inter-quartile range
#'
#' Safe version of IQR for statify
#' @param x A vector
#' @return The IQR
#' @export
IQR <- function(x) {
  if (!is.factor(x))
    base::diff(stats::quantile(x, c(0.25, 0.75), na.rm = T))
}


#' Test if distribution is normal
#'
#' Test if distribution is normal.
#' The condition for normality is length > 30 and non-significant Shapiro-Wilks test with p > .1
#'
#' @param x A numerical vector
#' @export
#' @return A boolean
is.normal <- function(x) {
  if (!is.numeric(x)) F
  else if (length(stats::na.omit(x)) >= 30) tryCatch(stats::shapiro.test(x)$p.value > .1, error = function(e) F)
  else F
}


#' Fisher's Exact Test for Count Data
#'
#' Performs Fisher's exact test for testing the null of independence
#' of rows and columns in a contingency table with fixed marginals, or with a formula expression.
#'
#' If \code{x} is a matrix, it is taken as a two-dimensional contingency
#' table, and hence its entries should be nonnegative integers.
#' Otherwise, both \code{x} and \code{y} must be vectors of the same length.
#' Incomplete cases are removed, the vectors are coerced into factor
#' objects, and the contingency table is computed from these.
#'
#' For 2 by 2 cases, p-values are obtained directly using the
#' (central or non-central) hypergeometric distribution. Otherwise,
#' computations are based on a C version of the FORTRAN subroutine
#' FEXACT which implements the network developed by Mehta and Patel
#' (1986) and improved by Clarkson, Fan and Joe (1993).  The FORTRAN
#' code can be obtained from \url{http://www.netlib.org/toms/643}.
#' Note this fails (with an error message) when the entries of the
#' table are too large.  (It transposes the table if necessary so it
#' has no more rows than columns.  One constraint is that the product
#' of the row marginals be less than 2^31 - 1.)
#'
#' For 2 by 2 tables, the null of conditional independence is
#' equivalent to the hypothesis that the odds ratio equals one.
#' \code{Exact} inference can be based on observing that in general, given
#' all marginal totals fixed, the first element of the contingency
#' table has a non-central hypergeometric distribution with
#' non-centrality parameter given by the odds ratio (Fisher, 1935).
#' The alternative for a one-sided test is based on the odds ratio,
#' so \code{alternative = "greater"} is a test of the odds ratio being
#' bigger than \code{or}.
#'
#' Two-sided tests are based on the probabilities of the tables, and
#' take as \code{more extreme} all tables with probabilities less than or
#' equal to that of the observed table, the p-value being the sum of
#' such probabilities.
#'
#' For larger than 2 by 2 tables and \code{hybrid = TRUE}, asymptotic
#' chi-squared probabilities are only used if the ‘Cochran
#' conditions’ are satisfied, that is if no cell has count zero, and
#' more than 80% of the cells have counts at least 5: otherwise the
#' exact calculation is used.
#'
#' Simulation is done conditional on the row and column marginals,
#' and works only if the marginals are strictly positive.  (A C
#' translation of the algorithm of Patefield (1981) is used.)
#' @param x either a two-dimensional contingency table in matrix form, a factor object, or a formula of the form \code{lhs ~ rhs} where \code{lhs} and \code{rhs} are factors.
#' @param y a factor object; ignored if \code{x} is a matrix or a formula.
#' @param ... additional params to feed to original fisher.test
#' @inheritParams stats::fisher.test
#' @return A list with class \code{"htest"} containing the following components:
#'
#' p.value: the p-value of the test.
#'
#' conf.int: a confidence interval for the odds ratio.  Only present in
#'           the 2 by 2 case and if argument \code{conf.int = TRUE}.
#'
#' estimate: an estimate of the odds ratio.  Note that the _conditional_
#'           Maximum Likelihood Estimate (MLE) rather than the
#'           unconditional MLE (the sample odds ratio) is used.  Only
#'           present in the 2 by 2 case.
#'
#' null.value: the odds ratio under the null, \code{or}.  Only present in the 2
#'           by 2 case.
#'
#' alternative: a character string describing the alternative hypothesis.
#'
#' method: the character string \code{"Fisher's Exact Test for Count Data"}.
#'
#' data.name: a character string giving the names of the data.
#' @references
#' Agresti, A. (1990) _Categorical data analysis_.  New York: Wiley.
#' Pages 59-66.
#'
#' Agresti, A. (2002) _Categorical data analysis_. Second edition.
#' New York: Wiley.  Pages 91-101.
#'
#' Fisher, R. A. (1935) The logic of inductive inference.  _Journal
#' of the Royal Statistical Society Series A_ *98*, 39-54.
#'
#' Fisher, R. A. (1962) Confidence limits for a cross-product ratio.
#' _Australian Journal of Statistics_ *4*, 41.
#'
#' Fisher, R. A. (1970) _Statistical Methods for Research Workers._
#' Oliver & Boyd.
#'
#' Mehta, C. R. and Patel, N. R. (1986) Algorithm 643. FEXACT: A
#' Fortran subroutine for Fisher's exact test on unordered r*c
#' contingency tables.  _ACM Transactions on Mathematical Software_,
#' *12*, 154-161.
#'
#' Clarkson, D. B., Fan, Y. and Joe, H. (1993) A Remark on Algorithm
#' 643: FEXACT: An Algorithm for Performing Fisher's Exact Test in r
#' x c Contingency Tables.  _ACM Transactions on Mathematical
#' Software_, *19*, 484-488.
#'
#' Patefield, W. M. (1981) Algorithm AS159.  An efficient method of
#' generating r x c tables with given row and column totals.
#' _Applied Statistics_ *30*, 91-97.
#' @seealso
#' \code{\link{chisq.test}}
#'
#' \code{fisher.exact} in package \pkg{kexact2x2} for alternative
#' interpretations of two-sided tests and confidence intervals for 2
#' by 2 tables.
#' @examples
#' \dontrun{
#' ## Agresti (1990, p. 61f; 2002, p. 91) Fisher's Tea Drinker
#' ## A British woman claimed to be able to distinguish whether milk or
#' ##  tea was added to the cup first.  To test, she was given 8 cups of
#' ##  tea, in four of which milk was added first.  The null hypothesis
#' ##  is that there is no association between the true order of pouring
#' ##  and the woman's guess, the alternative that there is a positive
#' ##  association (that the odds ratio is greater than 1).
#' TeaTasting <-
#' matrix(c(3, 1, 1, 3),
#'        nrow = 2,
#'        dimnames = list(Guess = c("Milk", "Tea"),
#'                        Truth = c("Milk", "Tea")))
#' fisher.test(TeaTasting, alternative = "greater")
#' ## => p = 0.2429, association could not be established
#'
#' ## Fisher (1962, 1970), Criminal convictions of like-sex twins
#' Convictions <-
#' matrix(c(2, 10, 15, 3),
#'        nrow = 2,
#'        dimnames =
#'        list(c("Dizygotic", "Monozygotic"),
#'             c("Convicted", "Not convicted")))
#' Convictions
#' fisher.test(Convictions, alternative = "less")
#' fisher.test(Convictions, conf.int = FALSE)
#' fisher.test(Convictions, conf.level = 0.95)$conf.int
#' fisher.test(Convictions, conf.level = 0.99)$conf.int
#'
#' ## A r x c table  Agresti (2002, p. 57) Job Satisfaction
#' Job <- matrix(c(1,2,1,0, 3,3,6,1, 10,10,14,9, 6,7,12,11), 4, 4,
#' dimnames = list(income = c("< 15k", "15-25k", "25-40k", "> 40k"),
#'                 satisfaction = c("VeryD", "LittleD", "ModerateS", "VeryS")))
#' fisher.test(Job)
#' fisher.test(Job, simulate.p.value = TRUE, B = 1e5)
#'
#' ###
#' }
#' @export
fisher.test <- function(x, y, workspace, hybrid, control, or, alternative, conf.int, conf.level, simulate.p.value, B) {
  UseMethod("fisher.test")
}


#' @rdname fisher.test
fisher.test.default <- function(x, ...) {
  stats::fisher.test(x, ...)
}

#' @rdname fisher.test
fisher.test.formula <- function(x,
                                y = NULL,
                                workspace = 200000,
                                hybrid = F,
                                control = list(),
                                or = 1,
                                alternative = "two.sided",
                                conf.int = T,
                                conf.level = .95,
                                simulate.p.value = F,
                                B = 2000) {
  stats::fisher.test(x = eval(x[[2]], envir = parent.frame()),
                     y = eval(x[[3]], envir = parent.frame()),
                     workspace = workspace,
                     hybrid = hybrid,
                     control = control,
                     or = or,
                     alternative = alternative,
                     conf.int = conf.int,
                     conf.level = conf.level,
                     simulate.p.value = simulate.p.value,
                     B = B)
}


#' Pearson's Chi-squared Test for Count Data
#'
#' \code{chisq.test} performs chi-squared contingency table tests and goodness-of-fit tests, with an added method for formulas.
#'
#' If \code{x} is a matrix with one row or column, or if \code{x} is a vector
#' and \code{y} is not given, then a _goodness-of-fit test_ is performed
#' (\code{x} is treated as a one-dimensional contingency table).  The
#' entries of \code{x} must be non-negative integers.  In this case, the
#' hypothesis tested is whether the population probabilities equal
#' those in \code{p}, or are all equal if \code{p} is not given.
#'
#' If \code{x} is a matrix with at least two rows and columns, it is taken
#' as a two-dimensional contingency table: the entries of \code{x} must be
#' non-negative integers.  Otherwise, \code{x} and \code{y} must be vectors or
#' factors of the same length; cases with missing values are removed,
#' the objects are coerced to factors, and the contingency table is
#' computed from these.  Then Pearson's chi-squared test is performed
#' of the null hypothesis that the joint distribution of the cell
#' counts in a 2-dimensional contingency table is the product of the
#' row and column marginals.
#'
#' If \code{simulate.p.value} is \code{FALSE}, the p-value is computed from the
#' asymptotic chi-squared distribution of the test statistic;
#' continuity correction is only used in the 2-by-2 case (if
#' \code{correct} is \code{TRUE}, the default).  Otherwise the p-value is
#' computed for a Monte Carlo test (Hope, 1968) with \code{B} replicates.
#'
#' In the contingency table case simulation is done by random
#' sampling from the set of all contingency tables with given
#' marginals, and works only if the marginals are strictly positive.
#' Continuity correction is never used, and the statistic is quoted
#' without it.  Note that this is not the usual sampling situation
#' assumed for the chi-squared test but rather that for Fisher's
#' exact test.
#'
#' In the goodness-of-fit case simulation is done by random sampling
#' from the discrete distribution specified by \code{p}, each sample being
#' of size \code{n = sum(x)}.  This simulation is done in R and may be
#' slow.
#' @param x a numeric vector, or matrix, or formula of the form \code{lhs ~ rhs} where \code{lhs} and \code{rhs} are factors. \code{x} and \code{y} can also both be factors.
#' @param y a numeric vector; ignored if \code{x} is a matrix or a formula. If \code{x} is a factor, \code{y} should be a factor of the same length.
#' @inheritParams stats::chisq.test
#' @return A list with class \code{"htest"} containing the following components:
#' statistic: the value the chi-squared test statistic.
#'
#' parameter: the degrees of freedom of the approximate chi-squared
#'           distribution of the test statistic, \code{NA} if the p-value is
#'           computed by Monte Carlo simulation.
#'
#'  p.value: the p-value for the test.
#'
#'   method: a character string indicating the type of test performed, and
#'           whether Monte Carlo simulation or continuity correction was
#'           used.
#'
#' data.name: a character string giving the name(s) of the data.
#'
#' observed: the observed counts.
#'
#' expected: the expected counts under the null hypothesis.
#'
#' residuals: the Pearson residuals, ‘(observed - expected) /
#'           sqrt(expected)’.
#'
#'   stdres: standardized residuals, \code{(observed - expected) / sqrt(V)},
#'           where \code{V} is the residual cell variance (Agresti, 2007,
#'           section 2.4.5 for the case where \code{x} is a matrix, ‘n * p * (1
#'           - p)’ otherwise).
#' @source The code for Monte Carlo simulation is a C translation of the Fortran algorithm of Patefield (1981).
#' @references
#' Hope, A. C. A. (1968) A simplified Monte Carlo significance test
#' procedure.  _J. Roy, Statist. Soc. B_ *30*, 582-598.
#'
#' Patefield, W. M. (1981) Algorithm AS159.  An efficient method of
#' generating r x c tables with given row and column totals.
#' _Applied Statistics_ *30*, 91-97.
#'
#' Agresti, A. (2007) _An Introduction to Categorical Data Analysis,
#' 2nd ed._, New York: John Wiley & Sons.  Page 38.
#' @seealso For goodness-of-fit testing, notably of continuous distributions, \code{\link{ks.test}}.
#' @examples
#' \dontrun{
#' ## From Agresti(2007) p.39
#' M <- as.table(rbind(c(762, 327, 468), c(484, 239, 477)))
#' dimnames(M) <- list(gender = c("F", "M"),
#'                     party = c("Democrat","Independent", "Republican"))
#' (Xsq <- chisq.test(M))  # Prints test summary
#' Xsq$observed   # observed counts (same as M)
#' Xsq$expected   # expected counts under the null
#' Xsq$residuals  # Pearson residuals
#' Xsq$stdres     # standardized residuals
#'
#'
#' ## Effect of simulating p-values
#' x <- matrix(c(12, 5, 7, 7), ncol = 2)
#' chisq.test(x)$p.value           # 0.4233
#' chisq.test(x, simulate.p.value = TRUE, B = 10000)$p.value
#'                                 # around 0.29!
#'
#' ## Testing for population probabilities
#' ## Case A. Tabulated data
#' x <- c(A = 20, B = 15, C = 25)
#' chisq.test(x)
#' chisq.test(as.table(x))             # the same
#' x <- c(89,37,30,28,2)
#' p <- c(40,20,20,15,5)
#' try(
#' chisq.test(x, p = p)                # gives an error
#' )
#' chisq.test(x, p = p, rescale.p = TRUE)
#'                                 # works
#' p <- c(0.40,0.20,0.20,0.19,0.01)
#'                                 # Expected count in category 5
#'                                 # is 1.86 < 5 ==> chi square approx.
#' chisq.test(x, p = p)            #               maybe doubtful, but is ok!
#' chisq.test(x, p = p, simulate.p.value = TRUE)
#'
#' ## Case B. Raw data
#' x <- trunc(5 * runif(100))
#' chisq.test(table(x))            # NOT 'chisq.test(x)'!
#'
#' ###
#' }
#' @export
chisq.test <- function(x, y, correct, p, rescale.p, simulate.p.value, B) {
  UseMethod("chisq.test")
}


#' @rdname chisq.test
chisq.test.default <- stats::chisq.test

#' @rdname chisq.test

chisq.test.formula <- function(x,
                               y = NULL,
                               correct = T,
                               p = rep(1/length(x), length(x)),
                               rescale.p = F,
                               simulate.p.value = F,
                               B = 2000) {
  stats::chisq.test(x = eval(x[[2]], envir = parent.frame()),
                    y = eval(x[[3]], envir = parent.frame()),
                    correct = correct,
                    p = p,
                    rescale.p = rescale.p,
                    simulate.p.value = simulate.p.value,
                    B = B)
}


#' Wrapper for oneway.test(var.equal = T)
#'
#' @param formula An anova formula (\code{variable ~ grouping variable})
#' @seealso \code{\link{oneway.test}}
#' @export
ANOVA <- function(formula) {
  stats::oneway.test(formula, var.equal = T)
}


#' No test
#'
#' An empty test
#' @param formula A formula
no.test <- function(formula) {
  data.frame(p.value = NA)
}