You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

391 lines
16KB

  1. #' Return the percentages for the levels of a factor
  2. #'
  3. #' Return a compatible vector of length nlevels(x) + 1
  4. #' to print the percentages of each level of a factor
  5. #'
  6. #' @param x A factor
  7. #' @export
  8. #' @return A nlevels(x) + 1 length vector of percentages
  9. percent <- function(x) {
  10. if (is.factor(x)) c(NA, summary(x, maxsum = Inf) / length(x)) * 100
  11. else NA
  12. }
  13. #' Return the inter-quartile range
  14. #'
  15. #' Safe version of IQR for statify
  16. #' @param x A vector
  17. #' @return The IQR
  18. #' @export
  19. IQR <- function(x) {
  20. if (!is.factor(x))
  21. base::diff(stats::quantile(x, c(0.25, 0.75), na.rm = T))
  22. }
  23. #' Test if distribution is normal
  24. #'
  25. #' Test if distribution is normal.
  26. #' The condition for normality is length > 30 and non-significant Shapiro-Wilks test with p > .1
  27. #'
  28. #' @param x A numerical vector
  29. #' @export
  30. #' @return A boolean
  31. is.normal <- function(x) {
  32. if (!is.numeric(x)) F
  33. else if (length(stats::na.omit(x)) >= 30) tryCatch(stats::shapiro.test(x)$p.value > .1, error = function(e) F)
  34. else F
  35. }
  36. #' Fisher's Exact Test for Count Data
  37. #'
  38. #' Performs Fisher's exact test for testing the null of independence
  39. #' of rows and columns in a contingency table with fixed marginals, or with a formula expression.
  40. #'
  41. #' If \code{x} is a matrix, it is taken as a two-dimensional contingency
  42. #' table, and hence its entries should be nonnegative integers.
  43. #' Otherwise, both \code{x} and \code{y} must be vectors of the same length.
  44. #' Incomplete cases are removed, the vectors are coerced into factor
  45. #' objects, and the contingency table is computed from these.
  46. #'
  47. #' For 2 by 2 cases, p-values are obtained directly using the
  48. #' (central or non-central) hypergeometric distribution. Otherwise,
  49. #' computations are based on a C version of the FORTRAN subroutine
  50. #' FEXACT which implements the network developed by Mehta and Patel
  51. #' (1986) and improved by Clarkson, Fan and Joe (1993). The FORTRAN
  52. #' code can be obtained from \url{http://www.netlib.org/toms/643}.
  53. #' Note this fails (with an error message) when the entries of the
  54. #' table are too large. (It transposes the table if necessary so it
  55. #' has no more rows than columns. One constraint is that the product
  56. #' of the row marginals be less than 2^31 - 1.)
  57. #'
  58. #' For 2 by 2 tables, the null of conditional independence is
  59. #' equivalent to the hypothesis that the odds ratio equals one.
  60. #' \code{Exact} inference can be based on observing that in general, given
  61. #' all marginal totals fixed, the first element of the contingency
  62. #' table has a non-central hypergeometric distribution with
  63. #' non-centrality parameter given by the odds ratio (Fisher, 1935).
  64. #' The alternative for a one-sided test is based on the odds ratio,
  65. #' so \code{alternative = "greater"} is a test of the odds ratio being
  66. #' bigger than \code{or}.
  67. #'
  68. #' Two-sided tests are based on the probabilities of the tables, and
  69. #' take as \code{more extreme} all tables with probabilities less than or
  70. #' equal to that of the observed table, the p-value being the sum of
  71. #' such probabilities.
  72. #'
  73. #' For larger than 2 by 2 tables and \code{hybrid = TRUE}, asymptotic
  74. #' chi-squared probabilities are only used if the ‘Cochran
  75. #' conditions’ are satisfied, that is if no cell has count zero, and
  76. #' more than 80% of the cells have counts at least 5: otherwise the
  77. #' exact calculation is used.
  78. #'
  79. #' Simulation is done conditional on the row and column marginals,
  80. #' and works only if the marginals are strictly positive. (A C
  81. #' translation of the algorithm of Patefield (1981) is used.)
  82. #' @param x either a two-dimensional contingency table in matrix form, a factor object, or a formula of the form \code{lhs ~ rhs} where \code{lhs} and \code{rhs} are factors.
  83. #' @param y a factor object; ignored if \code{x} is a matrix or a formula.
  84. #' @param ... additional params to feed to original fisher.test
  85. #' @inheritParams stats::fisher.test
  86. #' @return A list with class \code{"htest"} containing the following components:
  87. #'
  88. #' p.value: the p-value of the test.
  89. #'
  90. #' conf.int: a confidence interval for the odds ratio. Only present in
  91. #' the 2 by 2 case and if argument \code{conf.int = TRUE}.
  92. #'
  93. #' estimate: an estimate of the odds ratio. Note that the _conditional_
  94. #' Maximum Likelihood Estimate (MLE) rather than the
  95. #' unconditional MLE (the sample odds ratio) is used. Only
  96. #' present in the 2 by 2 case.
  97. #'
  98. #' null.value: the odds ratio under the null, \code{or}. Only present in the 2
  99. #' by 2 case.
  100. #'
  101. #' alternative: a character string describing the alternative hypothesis.
  102. #'
  103. #' method: the character string \code{"Fisher's Exact Test for Count Data"}.
  104. #'
  105. #' data.name: a character string giving the names of the data.
  106. #' @references
  107. #' Agresti, A. (1990) _Categorical data analysis_. New York: Wiley.
  108. #' Pages 59-66.
  109. #'
  110. #' Agresti, A. (2002) _Categorical data analysis_. Second edition.
  111. #' New York: Wiley. Pages 91-101.
  112. #'
  113. #' Fisher, R. A. (1935) The logic of inductive inference. _Journal
  114. #' of the Royal Statistical Society Series A_ *98*, 39-54.
  115. #'
  116. #' Fisher, R. A. (1962) Confidence limits for a cross-product ratio.
  117. #' _Australian Journal of Statistics_ *4*, 41.
  118. #'
  119. #' Fisher, R. A. (1970) _Statistical Methods for Research Workers._
  120. #' Oliver & Boyd.
  121. #'
  122. #' Mehta, C. R. and Patel, N. R. (1986) Algorithm 643. FEXACT: A
  123. #' Fortran subroutine for Fisher's exact test on unordered r*c
  124. #' contingency tables. _ACM Transactions on Mathematical Software_,
  125. #' *12*, 154-161.
  126. #'
  127. #' Clarkson, D. B., Fan, Y. and Joe, H. (1993) A Remark on Algorithm
  128. #' 643: FEXACT: An Algorithm for Performing Fisher's Exact Test in r
  129. #' x c Contingency Tables. _ACM Transactions on Mathematical
  130. #' Software_, *19*, 484-488.
  131. #'
  132. #' Patefield, W. M. (1981) Algorithm AS159. An efficient method of
  133. #' generating r x c tables with given row and column totals.
  134. #' _Applied Statistics_ *30*, 91-97.
  135. #' @seealso
  136. #' \code{\link{chisq.test}}
  137. #'
  138. #' \code{fisher.exact} in package \pkg{kexact2x2} for alternative
  139. #' interpretations of two-sided tests and confidence intervals for 2
  140. #' by 2 tables.
  141. #' @examples
  142. #' \dontrun{
  143. #' ## Agresti (1990, p. 61f; 2002, p. 91) Fisher's Tea Drinker
  144. #' ## A British woman claimed to be able to distinguish whether milk or
  145. #' ## tea was added to the cup first. To test, she was given 8 cups of
  146. #' ## tea, in four of which milk was added first. The null hypothesis
  147. #' ## is that there is no association between the true order of pouring
  148. #' ## and the woman's guess, the alternative that there is a positive
  149. #' ## association (that the odds ratio is greater than 1).
  150. #' TeaTasting <-
  151. #' matrix(c(3, 1, 1, 3),
  152. #' nrow = 2,
  153. #' dimnames = list(Guess = c("Milk", "Tea"),
  154. #' Truth = c("Milk", "Tea")))
  155. #' fisher.test(TeaTasting, alternative = "greater")
  156. #' ## => p = 0.2429, association could not be established
  157. #'
  158. #' ## Fisher (1962, 1970), Criminal convictions of like-sex twins
  159. #' Convictions <-
  160. #' matrix(c(2, 10, 15, 3),
  161. #' nrow = 2,
  162. #' dimnames =
  163. #' list(c("Dizygotic", "Monozygotic"),
  164. #' c("Convicted", "Not convicted")))
  165. #' Convictions
  166. #' fisher.test(Convictions, alternative = "less")
  167. #' fisher.test(Convictions, conf.int = FALSE)
  168. #' fisher.test(Convictions, conf.level = 0.95)$conf.int
  169. #' fisher.test(Convictions, conf.level = 0.99)$conf.int
  170. #'
  171. #' ## A r x c table Agresti (2002, p. 57) Job Satisfaction
  172. #' Job <- matrix(c(1,2,1,0, 3,3,6,1, 10,10,14,9, 6,7,12,11), 4, 4,
  173. #' dimnames = list(income = c("< 15k", "15-25k", "25-40k", "> 40k"),
  174. #' satisfaction = c("VeryD", "LittleD", "ModerateS", "VeryS")))
  175. #' fisher.test(Job)
  176. #' fisher.test(Job, simulate.p.value = TRUE, B = 1e5)
  177. #'
  178. #' ###
  179. #' }
  180. #' @export
  181. fisher.test <- function(x, y, workspace, hybrid, control, or, alternative, conf.int, conf.level, simulate.p.value, B) {
  182. UseMethod("fisher.test")
  183. }
  184. #' @rdname fisher.test
  185. fisher.test.default <- function(x, ...) {
  186. stats::fisher.test(x, ...)
  187. }
  188. #' @rdname fisher.test
  189. fisher.test.formula <- function(x,
  190. y = NULL,
  191. workspace = 200000,
  192. hybrid = F,
  193. control = list(),
  194. or = 1,
  195. alternative = "two.sided",
  196. conf.int = T,
  197. conf.level = .95,
  198. simulate.p.value = F,
  199. B = 2000) {
  200. stats::fisher.test(x = eval(x[[2]], envir = parent.frame()),
  201. y = eval(x[[3]], envir = parent.frame()),
  202. workspace = workspace,
  203. hybrid = hybrid,
  204. control = control,
  205. or = or,
  206. alternative = alternative,
  207. conf.int = conf.int,
  208. conf.level = conf.level,
  209. simulate.p.value = simulate.p.value,
  210. B = B)
  211. }
  212. #' Pearson's Chi-squared Test for Count Data
  213. #'
  214. #' \code{chisq.test} performs chi-squared contingency table tests and goodness-of-fit tests, with an added method for formulas.
  215. #'
  216. #' If \code{x} is a matrix with one row or column, or if \code{x} is a vector
  217. #' and \code{y} is not given, then a _goodness-of-fit test_ is performed
  218. #' (\code{x} is treated as a one-dimensional contingency table). The
  219. #' entries of \code{x} must be non-negative integers. In this case, the
  220. #' hypothesis tested is whether the population probabilities equal
  221. #' those in \code{p}, or are all equal if \code{p} is not given.
  222. #'
  223. #' If \code{x} is a matrix with at least two rows and columns, it is taken
  224. #' as a two-dimensional contingency table: the entries of \code{x} must be
  225. #' non-negative integers. Otherwise, \code{x} and \code{y} must be vectors or
  226. #' factors of the same length; cases with missing values are removed,
  227. #' the objects are coerced to factors, and the contingency table is
  228. #' computed from these. Then Pearson's chi-squared test is performed
  229. #' of the null hypothesis that the joint distribution of the cell
  230. #' counts in a 2-dimensional contingency table is the product of the
  231. #' row and column marginals.
  232. #'
  233. #' If \code{simulate.p.value} is \code{FALSE}, the p-value is computed from the
  234. #' asymptotic chi-squared distribution of the test statistic;
  235. #' continuity correction is only used in the 2-by-2 case (if
  236. #' \code{correct} is \code{TRUE}, the default). Otherwise the p-value is
  237. #' computed for a Monte Carlo test (Hope, 1968) with \code{B} replicates.
  238. #'
  239. #' In the contingency table case simulation is done by random
  240. #' sampling from the set of all contingency tables with given
  241. #' marginals, and works only if the marginals are strictly positive.
  242. #' Continuity correction is never used, and the statistic is quoted
  243. #' without it. Note that this is not the usual sampling situation
  244. #' assumed for the chi-squared test but rather that for Fisher's
  245. #' exact test.
  246. #'
  247. #' In the goodness-of-fit case simulation is done by random sampling
  248. #' from the discrete distribution specified by \code{p}, each sample being
  249. #' of size \code{n = sum(x)}. This simulation is done in R and may be
  250. #' slow.
  251. #' @param x a numeric vector, or matrix, or formula of the form \code{lhs ~ rhs} where \code{lhs} and \code{rhs} are factors. \code{x} and \code{y} can also both be factors.
  252. #' @param y a numeric vector; ignored if \code{x} is a matrix or a formula. If \code{x} is a factor, \code{y} should be a factor of the same length.
  253. #' @inheritParams stats::chisq.test
  254. #' @return A list with class \code{"htest"} containing the following components:
  255. #' statistic: the value the chi-squared test statistic.
  256. #'
  257. #' parameter: the degrees of freedom of the approximate chi-squared
  258. #' distribution of the test statistic, \code{NA} if the p-value is
  259. #' computed by Monte Carlo simulation.
  260. #'
  261. #' p.value: the p-value for the test.
  262. #'
  263. #' method: a character string indicating the type of test performed, and
  264. #' whether Monte Carlo simulation or continuity correction was
  265. #' used.
  266. #'
  267. #' data.name: a character string giving the name(s) of the data.
  268. #'
  269. #' observed: the observed counts.
  270. #'
  271. #' expected: the expected counts under the null hypothesis.
  272. #'
  273. #' residuals: the Pearson residuals, ‘(observed - expected) /
  274. #' sqrt(expected)’.
  275. #'
  276. #' stdres: standardized residuals, \code{(observed - expected) / sqrt(V)},
  277. #' where \code{V} is the residual cell variance (Agresti, 2007,
  278. #' section 2.4.5 for the case where \code{x} is a matrix, ‘n * p * (1
  279. #' - p)’ otherwise).
  280. #' @source The code for Monte Carlo simulation is a C translation of the Fortran algorithm of Patefield (1981).
  281. #' @references
  282. #' Hope, A. C. A. (1968) A simplified Monte Carlo significance test
  283. #' procedure. _J. Roy, Statist. Soc. B_ *30*, 582-598.
  284. #'
  285. #' Patefield, W. M. (1981) Algorithm AS159. An efficient method of
  286. #' generating r x c tables with given row and column totals.
  287. #' _Applied Statistics_ *30*, 91-97.
  288. #'
  289. #' Agresti, A. (2007) _An Introduction to Categorical Data Analysis,
  290. #' 2nd ed._, New York: John Wiley & Sons. Page 38.
  291. #' @seealso For goodness-of-fit testing, notably of continuous distributions, \code{\link{ks.test}}.
  292. #' @examples
  293. #' \dontrun{
  294. #' ## From Agresti(2007) p.39
  295. #' M <- as.table(rbind(c(762, 327, 468), c(484, 239, 477)))
  296. #' dimnames(M) <- list(gender = c("F", "M"),
  297. #' party = c("Democrat","Independent", "Republican"))
  298. #' (Xsq <- chisq.test(M)) # Prints test summary
  299. #' Xsq$observed # observed counts (same as M)
  300. #' Xsq$expected # expected counts under the null
  301. #' Xsq$residuals # Pearson residuals
  302. #' Xsq$stdres # standardized residuals
  303. #'
  304. #'
  305. #' ## Effect of simulating p-values
  306. #' x <- matrix(c(12, 5, 7, 7), ncol = 2)
  307. #' chisq.test(x)$p.value # 0.4233
  308. #' chisq.test(x, simulate.p.value = TRUE, B = 10000)$p.value
  309. #' # around 0.29!
  310. #'
  311. #' ## Testing for population probabilities
  312. #' ## Case A. Tabulated data
  313. #' x <- c(A = 20, B = 15, C = 25)
  314. #' chisq.test(x)
  315. #' chisq.test(as.table(x)) # the same
  316. #' x <- c(89,37,30,28,2)
  317. #' p <- c(40,20,20,15,5)
  318. #' try(
  319. #' chisq.test(x, p = p) # gives an error
  320. #' )
  321. #' chisq.test(x, p = p, rescale.p = TRUE)
  322. #' # works
  323. #' p <- c(0.40,0.20,0.20,0.19,0.01)
  324. #' # Expected count in category 5
  325. #' # is 1.86 < 5 ==> chi square approx.
  326. #' chisq.test(x, p = p) # maybe doubtful, but is ok!
  327. #' chisq.test(x, p = p, simulate.p.value = TRUE)
  328. #'
  329. #' ## Case B. Raw data
  330. #' x <- trunc(5 * runif(100))
  331. #' chisq.test(table(x)) # NOT 'chisq.test(x)'!
  332. #'
  333. #' ###
  334. #' }
  335. #' @export
  336. chisq.test <- function(x, y, correct, p, rescale.p, simulate.p.value, B) {
  337. UseMethod("chisq.test")
  338. }
  339. #' @rdname chisq.test
  340. chisq.test.default <- stats::chisq.test
  341. #' @rdname chisq.test
  342. chisq.test.formula <- function(x,
  343. y = NULL,
  344. correct = T,
  345. p = rep(1/length(x), length(x)),
  346. rescale.p = F,
  347. simulate.p.value = F,
  348. B = 2000) {
  349. stats::chisq.test(x = eval(x[[2]], envir = parent.frame()),
  350. y = eval(x[[3]], envir = parent.frame()),
  351. correct = correct,
  352. p = p,
  353. rescale.p = rescale.p,
  354. simulate.p.value = simulate.p.value,
  355. B = B)
  356. }
  357. #' Wrapper for oneway.test(var.equal = T)
  358. #'
  359. #' @param formula An anova formula (\code{variable ~ grouping variable})
  360. #' @seealso \code{\link{oneway.test}}
  361. #' @export
  362. ANOVA <- function(formula) {
  363. stats::oneway.test(formula, var.equal = T)
  364. }
  365. #' No test
  366. #'
  367. #' An empty test
  368. #' @param formula A formula
  369. no.test <- function(formula) {
  370. data.frame(p.value = NA)
  371. }