|
|
@@ -40,38 +40,332 @@ is.normal <- function(x) |
|
|
|
F |
|
|
|
} |
|
|
|
|
|
|
|
#' Fisher test |
|
|
|
#' Fisher's Exact Test for Count Data |
|
|
|
#' |
|
|
|
#' Performs Fisher's exact test for testing the null of independence |
|
|
|
#' of rows and columns in a contingency table with fixed marginals. |
|
|
|
#' |
|
|
|
#' If ‘x’ is a matrix, it is taken as a two-dimensional contingency |
|
|
|
#' table, and hence its entries should be nonnegative integers. |
|
|
|
#' Otherwise, both ‘x’ and ‘y’ must be vectors of the same length. |
|
|
|
#' Incomplete cases are removed, the vectors are coerced into factor |
|
|
|
#' objects, and the contingency table is computed from these. |
|
|
|
#' |
|
|
|
#' For 2 by 2 cases, p-values are obtained directly using the |
|
|
|
#' (central or non-central) hypergeometric distribution. Otherwise, |
|
|
|
#' computations are based on a C version of the FORTRAN subroutine |
|
|
|
#' FEXACT which implements the network developed by Mehta and Patel |
|
|
|
#' (1986) and improved by Clarkson, Fan and Joe (1993). The FORTRAN |
|
|
|
#' code can be obtained from |http://www.netlib.org/toms/643|. |
|
|
|
#' Note this fails (with an error message) when the entries of the |
|
|
|
#' table are too large. (It transposes the table if necessary so it |
|
|
|
#' has no more rows than columns. One constraint is that the product |
|
|
|
#' of the row marginals be less than 2^31 - 1.) |
|
|
|
#' |
|
|
|
#' For 2 by 2 tables, the null of conditional independence is |
|
|
|
#' equivalent to the hypothesis that the odds ratio equals one. |
|
|
|
#' ‘Exact’ inference can be based on observing that in general, given |
|
|
|
#' all marginal totals fixed, the first element of the contingency |
|
|
|
#' table has a non-central hypergeometric distribution with |
|
|
|
#' non-centrality parameter given by the odds ratio (Fisher, 1935). |
|
|
|
#' The alternative for a one-sided test is based on the odds ratio, |
|
|
|
#' so ‘alternative = "greater"’ is a test of the odds ratio being |
|
|
|
#' bigger than ‘or’. |
|
|
|
#' |
|
|
|
#' Two-sided tests are based on the probabilities of the tables, and |
|
|
|
#' take as ‘more extreme’ all tables with probabilities less than or |
|
|
|
#' equal to that of the observed table, the p-value being the sum of |
|
|
|
#' such probabilities. |
|
|
|
#' |
|
|
|
#' For larger than 2 by 2 tables and ‘hybrid = TRUE’, asymptotic |
|
|
|
#' chi-squared probabilities are only used if the ‘Cochran |
|
|
|
#' conditions’ are satisfied, that is if no cell has count zero, and |
|
|
|
#' more than 80% of the cells have counts at least 5: otherwise the |
|
|
|
#' exact calculation is used. |
|
|
|
#' |
|
|
|
#' Simulation is done conditional on the row and column marginals, |
|
|
|
#' and works only if the marginals are strictly positive. (A C |
|
|
|
#' translation of the algorithm of Patefield (1981) is used.) |
|
|
|
#' @param x either a two-dimensional contingency table in matrix form, a factor object, or a formula of the form \code{lhs ~ rhs} where \code{lhs} and \code{rhs} are factors. |
|
|
|
#' @param y a factor object; ignored if \code{x} is a matrix or a formula. |
|
|
|
#' @inheritParams stats::fisher.test |
|
|
|
#' @seealso stats::fisher.test |
|
|
|
#' @return A list with class ‘"htest"’ containing the following components: |
|
|
|
#' |
|
|
|
#' p.value: the p-value of the test. |
|
|
|
#' |
|
|
|
#' conf.int: a confidence interval for the odds ratio. Only present in |
|
|
|
#' the 2 by 2 case and if argument ‘conf.int = TRUE’. |
|
|
|
#' |
|
|
|
#' estimate: an estimate of the odds ratio. Note that the _conditional_ |
|
|
|
#' Maximum Likelihood Estimate (MLE) rather than the |
|
|
|
#' unconditional MLE (the sample odds ratio) is used. Only |
|
|
|
#' present in the 2 by 2 case. |
|
|
|
#' |
|
|
|
#' null.value: the odds ratio under the null, ‘or’. Only present in the 2 |
|
|
|
#' by 2 case. |
|
|
|
#' |
|
|
|
#' alternative: a character string describing the alternative hypothesis. |
|
|
|
#' |
|
|
|
#' method: the character string ‘"Fisher's Exact Test for Count Data"’. |
|
|
|
#' |
|
|
|
#' data.name: a character string giving the names of the data. |
|
|
|
#' @references |
|
|
|
#' Agresti, A. (1990) _Categorical data analysis_. New York: Wiley. |
|
|
|
#' Pages 59-66. |
|
|
|
#' |
|
|
|
#' Agresti, A. (2002) _Categorical data analysis_. Second edition. |
|
|
|
#' New York: Wiley. Pages 91-101. |
|
|
|
#' |
|
|
|
#' Fisher, R. A. (1935) The logic of inductive inference. _Journal |
|
|
|
#' of the Royal Statistical Society Series A_ *98*, 39-54. |
|
|
|
#' |
|
|
|
#' Fisher, R. A. (1962) Confidence limits for a cross-product ratio. |
|
|
|
#' _Australian Journal of Statistics_ *4*, 41. |
|
|
|
#' |
|
|
|
#' Fisher, R. A. (1970) _Statistical Methods for Research Workers._ |
|
|
|
#' Oliver & Boyd. |
|
|
|
#' |
|
|
|
#' Mehta, C. R. and Patel, N. R. (1986) Algorithm 643. FEXACT: A |
|
|
|
#' Fortran subroutine for Fisher's exact test on unordered r*c |
|
|
|
#' contingency tables. _ACM Transactions on Mathematical Software_, |
|
|
|
#' *12*, 154-161. |
|
|
|
#' |
|
|
|
#' Clarkson, D. B., Fan, Y. and Joe, H. (1993) A Remark on Algorithm |
|
|
|
#' 643: FEXACT: An Algorithm for Performing Fisher's Exact Test in r |
|
|
|
#' x c Contingency Tables. _ACM Transactions on Mathematical |
|
|
|
#' Software_, *19*, 484-488. |
|
|
|
#' |
|
|
|
#' Patefield, W. M. (1981) Algorithm AS159. An efficient method of |
|
|
|
#' generating r x c tables with given row and column totals. |
|
|
|
#' _Applied Statistics_ *30*, 91-97. |
|
|
|
#' @seealso |
|
|
|
#' ‘chisq.test’ |
|
|
|
#' |
|
|
|
#' ‘fisher.exact’ in package ‘exact2x2’ for alternative |
|
|
|
#' interpretations of two-sided tests and confidence intervals for 2 |
|
|
|
#' by 2 tables. |
|
|
|
#' @examples |
|
|
|
#' \dontrun{ |
|
|
|
#' ## Agresti (1990, p. 61f; 2002, p. 91) Fisher's Tea Drinker |
|
|
|
#' ## A British woman claimed to be able to distinguish whether milk or |
|
|
|
#' ## tea was added to the cup first. To test, she was given 8 cups of |
|
|
|
#' ## tea, in four of which milk was added first. The null hypothesis |
|
|
|
#' ## is that there is no association between the true order of pouring |
|
|
|
#' ## and the woman's guess, the alternative that there is a positive |
|
|
|
#' ## association (that the odds ratio is greater than 1). |
|
|
|
#' TeaTasting <- |
|
|
|
#' matrix(c(3, 1, 1, 3), |
|
|
|
#' nrow = 2, |
|
|
|
#' dimnames = list(Guess = c("Milk", "Tea"), |
|
|
|
#' Truth = c("Milk", "Tea"))) |
|
|
|
#' fisher.test(TeaTasting, alternative = "greater") |
|
|
|
#' ## => p = 0.2429, association could not be established |
|
|
|
#' |
|
|
|
#' ## Fisher (1962, 1970), Criminal convictions of like-sex twins |
|
|
|
#' Convictions <- |
|
|
|
#' matrix(c(2, 10, 15, 3), |
|
|
|
#' nrow = 2, |
|
|
|
#' dimnames = |
|
|
|
#' list(c("Dizygotic", "Monozygotic"), |
|
|
|
#' c("Convicted", "Not convicted"))) |
|
|
|
#' Convictions |
|
|
|
#' fisher.test(Convictions, alternative = "less") |
|
|
|
#' fisher.test(Convictions, conf.int = FALSE) |
|
|
|
#' fisher.test(Convictions, conf.level = 0.95)$conf.int |
|
|
|
#' fisher.test(Convictions, conf.level = 0.99)$conf.int |
|
|
|
#' |
|
|
|
#' ## A r x c table Agresti (2002, p. 57) Job Satisfaction |
|
|
|
#' Job <- matrix(c(1,2,1,0, 3,3,6,1, 10,10,14,9, 6,7,12,11), 4, 4, |
|
|
|
#' dimnames = list(income = c("< 15k", "15-25k", "25-40k", "> 40k"), |
|
|
|
#' satisfaction = c("VeryD", "LittleD", "ModerateS", "VeryS"))) |
|
|
|
#' fisher.test(Job) |
|
|
|
#' fisher.test(Job, simulate.p.value = TRUE, B = 1e5) |
|
|
|
#' |
|
|
|
#' ### |
|
|
|
#' } |
|
|
|
#' @export |
|
|
|
fisher.test <- function(x, y = NULL, workspace = 200000, hybrid = FALSE, control = list(), or = 1, alternative = "two.sided", conf.int = TRUE, conf.level = 0.95, simulate.p.value = FALSE, B = 2000) |
|
|
|
fisher.test <- function(x, y, workspace, hybrid, control, or, alternative, conf.int, conf.level, simulate.p.value, B) |
|
|
|
{ |
|
|
|
UseMethod("fisher.test") |
|
|
|
} |
|
|
|
|
|
|
|
#' @rdname fisher.test |
|
|
|
fisher.test.default <- stats::fisher.test |
|
|
|
|
|
|
|
fisher.test.formula <- function(x, ...) |
|
|
|
#' @rdname fisher.test |
|
|
|
fisher.test.formula <- function(x, |
|
|
|
y = NULL, |
|
|
|
workspace = 200000, |
|
|
|
hybrid = F, |
|
|
|
control = list(), |
|
|
|
or = 1, |
|
|
|
alternative = "two.sided", |
|
|
|
conf.int = T, |
|
|
|
conf.level = .95, |
|
|
|
simulate.p.value = F, |
|
|
|
B = 2000) |
|
|
|
{ |
|
|
|
fisher.test.default(x = eval(x[[2]], envir = parent.frame()), y = eval(x[[3]], envir = parent.frame()), ...) |
|
|
|
stats::fisher.test(x = eval(x[[2]], envir = parent.frame()), |
|
|
|
y = eval(x[[3]], envir = parent.frame()), |
|
|
|
workspace = workspace, |
|
|
|
hybrid = hybrid, |
|
|
|
control = control, |
|
|
|
or = or, |
|
|
|
alternative = alternative, |
|
|
|
conf.int = conf.int, |
|
|
|
conf.level = conf.level, |
|
|
|
simulate.p.value = simulate.p.value, |
|
|
|
B = B) |
|
|
|
} |
|
|
|
|
|
|
|
#' Chi-square test |
|
|
|
#' Pearson's Chi-squared Test for Count Data |
|
|
|
#' |
|
|
|
#' ‘chisq.test’ performs chi-squared contingency table tests and goodness-of-fit tests. |
|
|
|
#' |
|
|
|
#' If ‘x’ is a matrix with one row or column, or if ‘x’ is a vector |
|
|
|
#' and ‘y’ is not given, then a _goodness-of-fit test_ is performed |
|
|
|
#' (‘x’ is treated as a one-dimensional contingency table). The |
|
|
|
#' entries of ‘x’ must be non-negative integers. In this case, the |
|
|
|
#' hypothesis tested is whether the population probabilities equal |
|
|
|
#' those in ‘p’, or are all equal if ‘p’ is not given. |
|
|
|
#' |
|
|
|
#' If ‘x’ is a matrix with at least two rows and columns, it is taken |
|
|
|
#' as a two-dimensional contingency table: the entries of ‘x’ must be |
|
|
|
#' non-negative integers. Otherwise, ‘x’ and ‘y’ must be vectors or |
|
|
|
#' factors of the same length; cases with missing values are removed, |
|
|
|
#' the objects are coerced to factors, and the contingency table is |
|
|
|
#' computed from these. Then Pearson's chi-squared test is performed |
|
|
|
#' of the null hypothesis that the joint distribution of the cell |
|
|
|
#' counts in a 2-dimensional contingency table is the product of the |
|
|
|
#' row and column marginals. |
|
|
|
#' |
|
|
|
#' If ‘simulate.p.value’ is ‘FALSE’, the p-value is computed from the |
|
|
|
#' asymptotic chi-squared distribution of the test statistic; |
|
|
|
#' continuity correction is only used in the 2-by-2 case (if |
|
|
|
#' ‘correct’ is ‘TRUE’, the default). Otherwise the p-value is |
|
|
|
#' computed for a Monte Carlo test (Hope, 1968) with ‘B’ replicates. |
|
|
|
#' |
|
|
|
#' In the contingency table case simulation is done by random |
|
|
|
#' sampling from the set of all contingency tables with given |
|
|
|
#' marginals, and works only if the marginals are strictly positive. |
|
|
|
#' Continuity correction is never used, and the statistic is quoted |
|
|
|
#' without it. Note that this is not the usual sampling situation |
|
|
|
#' assumed for the chi-squared test but rather that for Fisher's |
|
|
|
#' exact test. |
|
|
|
#' |
|
|
|
#' In the goodness-of-fit case simulation is done by random sampling |
|
|
|
#' from the discrete distribution specified by ‘p’, each sample being |
|
|
|
#' of size ‘n = sum(x)’. This simulation is done in R and may be |
|
|
|
#' slow. |
|
|
|
#' @param x a numeric vector, or matrix, or formula of the form \code{lhs ~ rhs} where \code{lhs} and \code{rhs} are factors. ‘x’ and ‘y’ can also both be factors. |
|
|
|
#' @param y a numeric vector; ignored if ‘x’ is a matrix or a formula. If ‘x’ is a factor, ‘y’ should be a factor of the same length. |
|
|
|
#' @inheritParams stats::chisq.test |
|
|
|
#' @seealso stats::chisq.test |
|
|
|
#' @return A list with class ‘"htest"’ containing the following components: |
|
|
|
#' statistic: the value the chi-squared test statistic. |
|
|
|
#' |
|
|
|
#' parameter: the degrees of freedom of the approximate chi-squared |
|
|
|
#' distribution of the test statistic, ‘NA’ if the p-value is |
|
|
|
#' computed by Monte Carlo simulation. |
|
|
|
#' |
|
|
|
#' p.value: the p-value for the test. |
|
|
|
#' |
|
|
|
#' method: a character string indicating the type of test performed, and |
|
|
|
#' whether Monte Carlo simulation or continuity correction was |
|
|
|
#' used. |
|
|
|
#' |
|
|
|
#' data.name: a character string giving the name(s) of the data. |
|
|
|
#' |
|
|
|
#' observed: the observed counts. |
|
|
|
#' |
|
|
|
#' expected: the expected counts under the null hypothesis. |
|
|
|
#' |
|
|
|
#' residuals: the Pearson residuals, ‘(observed - expected) / |
|
|
|
#' sqrt(expected)’. |
|
|
|
#' |
|
|
|
#' stdres: standardized residuals, ‘(observed - expected) / sqrt(V)’, |
|
|
|
#' where ‘V’ is the residual cell variance (Agresti, 2007, |
|
|
|
#' section 2.4.5 for the case where ‘x’ is a matrix, ‘n * p * (1 |
|
|
|
#' - p)’ otherwise). |
|
|
|
#' @source The code for Monte Carlo simulation is a C translation of the Fortran algorithm of Patefield (1981). |
|
|
|
#' @references |
|
|
|
#' Hope, A. C. A. (1968) A simplified Monte Carlo significance test |
|
|
|
#' procedure. _J. Roy, Statist. Soc. B_ *30*, 582-598. |
|
|
|
#' |
|
|
|
#' Patefield, W. M. (1981) Algorithm AS159. An efficient method of |
|
|
|
#' generating r x c tables with given row and column totals. |
|
|
|
#' _Applied Statistics_ *30*, 91-97. |
|
|
|
#' |
|
|
|
#' Agresti, A. (2007) _An Introduction to Categorical Data Analysis, |
|
|
|
#' 2nd ed._, New York: John Wiley & Sons. Page 38. |
|
|
|
#' @seealso For goodness-of-fit testing, notably of continuous distributions, ‘ks.test’. |
|
|
|
#' @examples |
|
|
|
#' \dontrun{ |
|
|
|
#' ## From Agresti(2007) p.39 |
|
|
|
#' M <- as.table(rbind(c(762, 327, 468), c(484, 239, 477))) |
|
|
|
#' dimnames(M) <- list(gender = c("F", "M"), |
|
|
|
#' party = c("Democrat","Independent", "Republican")) |
|
|
|
#' (Xsq <- chisq.test(M)) # Prints test summary |
|
|
|
#' Xsq$observed # observed counts (same as M) |
|
|
|
#' Xsq$expected # expected counts under the null |
|
|
|
#' Xsq$residuals # Pearson residuals |
|
|
|
#' Xsq$stdres # standardized residuals |
|
|
|
#' |
|
|
|
#' |
|
|
|
#' ## Effect of simulating p-values |
|
|
|
#' x <- matrix(c(12, 5, 7, 7), ncol = 2) |
|
|
|
#' chisq.test(x)$p.value # 0.4233 |
|
|
|
#' chisq.test(x, simulate.p.value = TRUE, B = 10000)$p.value |
|
|
|
#' # around 0.29! |
|
|
|
#' |
|
|
|
#' ## Testing for population probabilities |
|
|
|
#' ## Case A. Tabulated data |
|
|
|
#' x <- c(A = 20, B = 15, C = 25) |
|
|
|
#' chisq.test(x) |
|
|
|
#' chisq.test(as.table(x)) # the same |
|
|
|
#' x <- c(89,37,30,28,2) |
|
|
|
#' p <- c(40,20,20,15,5) |
|
|
|
#' try( |
|
|
|
#' chisq.test(x, p = p) # gives an error |
|
|
|
#' ) |
|
|
|
#' chisq.test(x, p = p, rescale.p = TRUE) |
|
|
|
#' # works |
|
|
|
#' p <- c(0.40,0.20,0.20,0.19,0.01) |
|
|
|
#' # Expected count in category 5 |
|
|
|
#' # is 1.86 < 5 ==> chi square approx. |
|
|
|
#' chisq.test(x, p = p) # maybe doubtful, but is ok! |
|
|
|
#' chisq.test(x, p = p, simulate.p.value = TRUE) |
|
|
|
#' |
|
|
|
#' ## Case B. Raw data |
|
|
|
#' x <- trunc(5 * runif(100)) |
|
|
|
#' chisq.test(table(x)) # NOT 'chisq.test(x)'! |
|
|
|
#' |
|
|
|
#' ### |
|
|
|
#' } |
|
|
|
#' @export |
|
|
|
chisq.test <- function(x, y = NULL, correct = TRUE, p = rep(1/length(x), length(x)), rescale.p = FALSE, simulate.p.value = FALSE, B = 2000) |
|
|
|
chisq.test <- function(x, y, correct, p, rescale.p, simulate.p.value, B) |
|
|
|
{ |
|
|
|
UseMethod("chisq.test") |
|
|
|
} |
|
|
|
|
|
|
|
#' @rdname chisq.test |
|
|
|
chisq.test.default <- stats::chisq.test |
|
|
|
|
|
|
|
chisq.test.formula <- function(x, y = NULL, correct = TRUE, p = rep(1/length(x), length(x)), rescale.p = FALSE, simulate.p.value = FALSE, B = 2000, ...) |
|
|
|
#' @rdname chisq.test |
|
|
|
chisq.test.formula <- function(x, |
|
|
|
y = NULL, |
|
|
|
correct = T, |
|
|
|
p = rep(1/length(x), length(x)), |
|
|
|
rescale.p = F, |
|
|
|
simulate.p.value = F, |
|
|
|
B = 2000) |
|
|
|
{ |
|
|
|
chisq.test.default(x = eval(x[[2]], envir = parent.frame()), y = eval(x[[3]], envir = parent.frame()), ...) |
|
|
|
stats::chisq.test(x = eval(x[[2]], envir = parent.frame()), |
|
|
|
y = eval(x[[3]], envir = parent.frame()), |
|
|
|
correct = correct, |
|
|
|
p = p, |
|
|
|
rescale.p = rescale.p, |
|
|
|
simulate.p.value = simulate.p.value, |
|
|
|
B = B) |
|
|
|
} |
|
|
|
|
|
|
|
#' Wrapper for summary(aov) |
|
|
|