|
- #' Return the percentages for the levels of a factor
- #'
- #' Return a compatible vector of length nlevels(x) + 1
- #' to print the percentages of each level of a factor
- #'
- #' @param x A factor
- #' @export
- #' @return A nlevels(x) + 1 length vector of percentages
- percent <- function(x) {
- if (is.factor(x)) c(NA, summary(x, maxsum = Inf) / length(x)) * 100
- else NA
- }
-
-
- #' Return the inter-quartile range
- #'
- #' Safe version of IQR for statify
- #' @param x A vector
- #' @return The IQR
- #' @export
- IQR <- function(x) {
- base::diff(stats::quantile(x, c(0.25, 0.75), na.rm = T))
- }
-
-
- #' Test if distribution is normal
- #'
- #' Test if distribution is normal.
- #' The condition for normality is length > 30 and non-significant Shapiro-Wilks test with p > .1
- #'
- #' @param x A numerical vector
- #' @export
- #' @return A boolean
- is.normal <- function(x) {
- if (!is.numeric(x)) F
- else if (length(stats::na.omit(x)) >= 30) tryCatch(stats::shapiro.test(x)$p.value > .1, error = function(e) F)
- else F
- }
-
-
- #' Fisher's Exact Test for Count Data
- #'
- #' Performs Fisher's exact test for testing the null of independence
- #' of rows and columns in a contingency table with fixed marginals, or with a formula expression.
- #'
- #' If \code{x} is a matrix, it is taken as a two-dimensional contingency
- #' table, and hence its entries should be nonnegative integers.
- #' Otherwise, both \code{x} and \code{y} must be vectors of the same length.
- #' Incomplete cases are removed, the vectors are coerced into factor
- #' objects, and the contingency table is computed from these.
- #'
- #' For 2 by 2 cases, p-values are obtained directly using the
- #' (central or non-central) hypergeometric distribution. Otherwise,
- #' computations are based on a C version of the FORTRAN subroutine
- #' FEXACT which implements the network developed by Mehta and Patel
- #' (1986) and improved by Clarkson, Fan and Joe (1993). The FORTRAN
- #' code can be obtained from \url{http://www.netlib.org/toms/643}.
- #' Note this fails (with an error message) when the entries of the
- #' table are too large. (It transposes the table if necessary so it
- #' has no more rows than columns. One constraint is that the product
- #' of the row marginals be less than 2^31 - 1.)
- #'
- #' For 2 by 2 tables, the null of conditional independence is
- #' equivalent to the hypothesis that the odds ratio equals one.
- #' \code{Exact} inference can be based on observing that in general, given
- #' all marginal totals fixed, the first element of the contingency
- #' table has a non-central hypergeometric distribution with
- #' non-centrality parameter given by the odds ratio (Fisher, 1935).
- #' The alternative for a one-sided test is based on the odds ratio,
- #' so \code{alternative = "greater"} is a test of the odds ratio being
- #' bigger than \code{or}.
- #'
- #' Two-sided tests are based on the probabilities of the tables, and
- #' take as \code{more extreme} all tables with probabilities less than or
- #' equal to that of the observed table, the p-value being the sum of
- #' such probabilities.
- #'
- #' For larger than 2 by 2 tables and \code{hybrid = TRUE}, asymptotic
- #' chi-squared probabilities are only used if the ‘Cochran
- #' conditions’ are satisfied, that is if no cell has count zero, and
- #' more than 80% of the cells have counts at least 5: otherwise the
- #' exact calculation is used.
- #'
- #' Simulation is done conditional on the row and column marginals,
- #' and works only if the marginals are strictly positive. (A C
- #' translation of the algorithm of Patefield (1981) is used.)
- #' @param x either a two-dimensional contingency table in matrix form, a factor object, or a formula of the form \code{lhs ~ rhs} where \code{lhs} and \code{rhs} are factors.
- #' @param y a factor object; ignored if \code{x} is a matrix or a formula.
- #' @param ... additional params to feed to original fisher.test
- #' @inheritParams stats::fisher.test
- #' @return A list with class \code{"htest"} containing the following components:
- #'
- #' p.value: the p-value of the test.
- #'
- #' conf.int: a confidence interval for the odds ratio. Only present in
- #' the 2 by 2 case and if argument \code{conf.int = TRUE}.
- #'
- #' estimate: an estimate of the odds ratio. Note that the _conditional_
- #' Maximum Likelihood Estimate (MLE) rather than the
- #' unconditional MLE (the sample odds ratio) is used. Only
- #' present in the 2 by 2 case.
- #'
- #' null.value: the odds ratio under the null, \code{or}. Only present in the 2
- #' by 2 case.
- #'
- #' alternative: a character string describing the alternative hypothesis.
- #'
- #' method: the character string \code{"Fisher's Exact Test for Count Data"}.
- #'
- #' data.name: a character string giving the names of the data.
- #' @references
- #' Agresti, A. (1990) _Categorical data analysis_. New York: Wiley.
- #' Pages 59-66.
- #'
- #' Agresti, A. (2002) _Categorical data analysis_. Second edition.
- #' New York: Wiley. Pages 91-101.
- #'
- #' Fisher, R. A. (1935) The logic of inductive inference. _Journal
- #' of the Royal Statistical Society Series A_ *98*, 39-54.
- #'
- #' Fisher, R. A. (1962) Confidence limits for a cross-product ratio.
- #' _Australian Journal of Statistics_ *4*, 41.
- #'
- #' Fisher, R. A. (1970) _Statistical Methods for Research Workers._
- #' Oliver & Boyd.
- #'
- #' Mehta, C. R. and Patel, N. R. (1986) Algorithm 643. FEXACT: A
- #' Fortran subroutine for Fisher's exact test on unordered r*c
- #' contingency tables. _ACM Transactions on Mathematical Software_,
- #' *12*, 154-161.
- #'
- #' Clarkson, D. B., Fan, Y. and Joe, H. (1993) A Remark on Algorithm
- #' 643: FEXACT: An Algorithm for Performing Fisher's Exact Test in r
- #' x c Contingency Tables. _ACM Transactions on Mathematical
- #' Software_, *19*, 484-488.
- #'
- #' Patefield, W. M. (1981) Algorithm AS159. An efficient method of
- #' generating r x c tables with given row and column totals.
- #' _Applied Statistics_ *30*, 91-97.
- #' @seealso
- #' \code{\link{chisq.test}}
- #'
- #' \code{fisher.exact} in package \pkg{kexact2x2} for alternative
- #' interpretations of two-sided tests and confidence intervals for 2
- #' by 2 tables.
- #' @examples
- #' \dontrun{
- #' ## Agresti (1990, p. 61f; 2002, p. 91) Fisher's Tea Drinker
- #' ## A British woman claimed to be able to distinguish whether milk or
- #' ## tea was added to the cup first. To test, she was given 8 cups of
- #' ## tea, in four of which milk was added first. The null hypothesis
- #' ## is that there is no association between the true order of pouring
- #' ## and the woman's guess, the alternative that there is a positive
- #' ## association (that the odds ratio is greater than 1).
- #' TeaTasting <-
- #' matrix(c(3, 1, 1, 3),
- #' nrow = 2,
- #' dimnames = list(Guess = c("Milk", "Tea"),
- #' Truth = c("Milk", "Tea")))
- #' fisher.test(TeaTasting, alternative = "greater")
- #' ## => p = 0.2429, association could not be established
- #'
- #' ## Fisher (1962, 1970), Criminal convictions of like-sex twins
- #' Convictions <-
- #' matrix(c(2, 10, 15, 3),
- #' nrow = 2,
- #' dimnames =
- #' list(c("Dizygotic", "Monozygotic"),
- #' c("Convicted", "Not convicted")))
- #' Convictions
- #' fisher.test(Convictions, alternative = "less")
- #' fisher.test(Convictions, conf.int = FALSE)
- #' fisher.test(Convictions, conf.level = 0.95)$conf.int
- #' fisher.test(Convictions, conf.level = 0.99)$conf.int
- #'
- #' ## A r x c table Agresti (2002, p. 57) Job Satisfaction
- #' Job <- matrix(c(1,2,1,0, 3,3,6,1, 10,10,14,9, 6,7,12,11), 4, 4,
- #' dimnames = list(income = c("< 15k", "15-25k", "25-40k", "> 40k"),
- #' satisfaction = c("VeryD", "LittleD", "ModerateS", "VeryS")))
- #' fisher.test(Job)
- #' fisher.test(Job, simulate.p.value = TRUE, B = 1e5)
- #'
- #' ###
- #' }
- #' @export
- fisher.test <- function(x, y, workspace, hybrid, control, or, alternative, conf.int, conf.level, simulate.p.value, B) {
- UseMethod("fisher.test")
- }
-
-
- #' @rdname fisher.test
- fisher.test.default <- function(x, ...) {
- stats::fisher.test(x, ...)
- }
-
- #' @rdname fisher.test
- fisher.test.formula <- function(x,
- y = NULL,
- workspace = 200000,
- hybrid = F,
- control = list(),
- or = 1,
- alternative = "two.sided",
- conf.int = T,
- conf.level = .95,
- simulate.p.value = F,
- B = 2000) {
- stats::fisher.test(x = eval(x[[2]], envir = parent.frame()),
- y = eval(x[[3]], envir = parent.frame()),
- workspace = workspace,
- hybrid = hybrid,
- control = control,
- or = or,
- alternative = alternative,
- conf.int = conf.int,
- conf.level = conf.level,
- simulate.p.value = simulate.p.value,
- B = B)
- }
-
-
- #' Pearson's Chi-squared Test for Count Data
- #'
- #' \code{chisq.test} performs chi-squared contingency table tests and goodness-of-fit tests, with an added method for formulas.
- #'
- #' If \code{x} is a matrix with one row or column, or if \code{x} is a vector
- #' and \code{y} is not given, then a _goodness-of-fit test_ is performed
- #' (\code{x} is treated as a one-dimensional contingency table). The
- #' entries of \code{x} must be non-negative integers. In this case, the
- #' hypothesis tested is whether the population probabilities equal
- #' those in \code{p}, or are all equal if \code{p} is not given.
- #'
- #' If \code{x} is a matrix with at least two rows and columns, it is taken
- #' as a two-dimensional contingency table: the entries of \code{x} must be
- #' non-negative integers. Otherwise, \code{x} and \code{y} must be vectors or
- #' factors of the same length; cases with missing values are removed,
- #' the objects are coerced to factors, and the contingency table is
- #' computed from these. Then Pearson's chi-squared test is performed
- #' of the null hypothesis that the joint distribution of the cell
- #' counts in a 2-dimensional contingency table is the product of the
- #' row and column marginals.
- #'
- #' If \code{simulate.p.value} is \code{FALSE}, the p-value is computed from the
- #' asymptotic chi-squared distribution of the test statistic;
- #' continuity correction is only used in the 2-by-2 case (if
- #' \code{correct} is \code{TRUE}, the default). Otherwise the p-value is
- #' computed for a Monte Carlo test (Hope, 1968) with \code{B} replicates.
- #'
- #' In the contingency table case simulation is done by random
- #' sampling from the set of all contingency tables with given
- #' marginals, and works only if the marginals are strictly positive.
- #' Continuity correction is never used, and the statistic is quoted
- #' without it. Note that this is not the usual sampling situation
- #' assumed for the chi-squared test but rather that for Fisher's
- #' exact test.
- #'
- #' In the goodness-of-fit case simulation is done by random sampling
- #' from the discrete distribution specified by \code{p}, each sample being
- #' of size \code{n = sum(x)}. This simulation is done in R and may be
- #' slow.
- #' @param x a numeric vector, or matrix, or formula of the form \code{lhs ~ rhs} where \code{lhs} and \code{rhs} are factors. \code{x} and \code{y} can also both be factors.
- #' @param y a numeric vector; ignored if \code{x} is a matrix or a formula. If \code{x} is a factor, \code{y} should be a factor of the same length.
- #' @inheritParams stats::chisq.test
- #' @return A list with class \code{"htest"} containing the following components:
- #' statistic: the value the chi-squared test statistic.
- #'
- #' parameter: the degrees of freedom of the approximate chi-squared
- #' distribution of the test statistic, \code{NA} if the p-value is
- #' computed by Monte Carlo simulation.
- #'
- #' p.value: the p-value for the test.
- #'
- #' method: a character string indicating the type of test performed, and
- #' whether Monte Carlo simulation or continuity correction was
- #' used.
- #'
- #' data.name: a character string giving the name(s) of the data.
- #'
- #' observed: the observed counts.
- #'
- #' expected: the expected counts under the null hypothesis.
- #'
- #' residuals: the Pearson residuals, ‘(observed - expected) /
- #' sqrt(expected)’.
- #'
- #' stdres: standardized residuals, \code{(observed - expected) / sqrt(V)},
- #' where \code{V} is the residual cell variance (Agresti, 2007,
- #' section 2.4.5 for the case where \code{x} is a matrix, ‘n * p * (1
- #' - p)’ otherwise).
- #' @source The code for Monte Carlo simulation is a C translation of the Fortran algorithm of Patefield (1981).
- #' @references
- #' Hope, A. C. A. (1968) A simplified Monte Carlo significance test
- #' procedure. _J. Roy, Statist. Soc. B_ *30*, 582-598.
- #'
- #' Patefield, W. M. (1981) Algorithm AS159. An efficient method of
- #' generating r x c tables with given row and column totals.
- #' _Applied Statistics_ *30*, 91-97.
- #'
- #' Agresti, A. (2007) _An Introduction to Categorical Data Analysis,
- #' 2nd ed._, New York: John Wiley & Sons. Page 38.
- #' @seealso For goodness-of-fit testing, notably of continuous distributions, \code{\link{ks.test}}.
- #' @examples
- #' \dontrun{
- #' ## From Agresti(2007) p.39
- #' M <- as.table(rbind(c(762, 327, 468), c(484, 239, 477)))
- #' dimnames(M) <- list(gender = c("F", "M"),
- #' party = c("Democrat","Independent", "Republican"))
- #' (Xsq <- chisq.test(M)) # Prints test summary
- #' Xsq$observed # observed counts (same as M)
- #' Xsq$expected # expected counts under the null
- #' Xsq$residuals # Pearson residuals
- #' Xsq$stdres # standardized residuals
- #'
- #'
- #' ## Effect of simulating p-values
- #' x <- matrix(c(12, 5, 7, 7), ncol = 2)
- #' chisq.test(x)$p.value # 0.4233
- #' chisq.test(x, simulate.p.value = TRUE, B = 10000)$p.value
- #' # around 0.29!
- #'
- #' ## Testing for population probabilities
- #' ## Case A. Tabulated data
- #' x <- c(A = 20, B = 15, C = 25)
- #' chisq.test(x)
- #' chisq.test(as.table(x)) # the same
- #' x <- c(89,37,30,28,2)
- #' p <- c(40,20,20,15,5)
- #' try(
- #' chisq.test(x, p = p) # gives an error
- #' )
- #' chisq.test(x, p = p, rescale.p = TRUE)
- #' # works
- #' p <- c(0.40,0.20,0.20,0.19,0.01)
- #' # Expected count in category 5
- #' # is 1.86 < 5 ==> chi square approx.
- #' chisq.test(x, p = p) # maybe doubtful, but is ok!
- #' chisq.test(x, p = p, simulate.p.value = TRUE)
- #'
- #' ## Case B. Raw data
- #' x <- trunc(5 * runif(100))
- #' chisq.test(table(x)) # NOT 'chisq.test(x)'!
- #'
- #' ###
- #' }
- #' @export
- chisq.test <- function(x, y, correct, p, rescale.p, simulate.p.value, B) {
- UseMethod("chisq.test")
- }
-
-
- #' @rdname chisq.test
- chisq.test.default <- stats::chisq.test
-
- #' @rdname chisq.test
-
- chisq.test.formula <- function(x,
- y = NULL,
- correct = T,
- p = rep(1/length(x), length(x)),
- rescale.p = F,
- simulate.p.value = F,
- B = 2000) {
- stats::chisq.test(x = eval(x[[2]], envir = parent.frame()),
- y = eval(x[[3]], envir = parent.frame()),
- correct = correct,
- p = p,
- rescale.p = rescale.p,
- simulate.p.value = simulate.p.value,
- B = B)
- }
-
-
- #' Wrapper for oneway.test(var.equal = T)
- #'
- #' @param formula An anova formula (\code{variable ~ grouping variable})
- #' @seealso \code{\link{oneway.test}}
- #' @export
- ANOVA <- function(formula) {
- stats::oneway.test(formula, var.equal = T)
- }
-
-
- #' No test
- #'
- #' An empty test
- #' @param formula A formula
- no.test <- function(formula) {
- data.frame(p.value = NA)
- }
|