#' Return the percentages for the levels of a factor
#'
#' Return a vector of length nlevels(x) + 1 (a leading NA followed by one
#' value per level), used to print the percentage of each level of a factor
#'
#' @param x A factor
#' @export
#' @return A vector of length \code{nlevels(x) + 1}: NA followed by the percentage of each level, or NA if x is not a factor
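#' @examples
#' ## Minimal usage sketch (illustrative factor, not package data):
#' ## the first element is NA, then one percentage per level.
#' percent(factor(c("a", "a", "b", "c")))  # NA 50 25 25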
percent <- function(x) {
  if (is.factor(x)) c(NA, summary(x, maxsum = Inf) / length(x)) * 100
  else NA
}
#' Return the inter-quartile range
#'
#' Safe version of IQR for statify: missing values are removed before the
#' quartiles are computed
#' @param x A numeric vector
#' @return The IQR
#' @export
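#' @examples
#' ## Minimal usage sketch: NA values are dropped before taking the quartiles.
#' IQR(c(1, 2, 3, 4, NA))  # 1.5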
IQR <- function(x) {
  base::diff(stats::quantile(x, c(0.25, 0.75), na.rm = TRUE))
}
#' Test if a distribution is normal
#'
#' Test if a distribution can be considered normal.
#' The condition for normality is at least 30 non-missing values and a non-significant Shapiro-Wilk test (p > .1)
#'
#' @param x A numeric vector
#' @export
#' @return TRUE if x is considered normally distributed, FALSE otherwise
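#' @examples
#' ## Minimal usage sketch with simulated data (values are illustrative):
#' is.normal(rnorm(100))  # usually TRUE for a large sample drawn from a normal
#' is.normal(runif(10))   # fewer than 30 values -> FALSE
#' is.normal(letters)     # not numeric -> FALSE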
is.normal <- function(x) {
  if (!is.numeric(x)) FALSE
  else if (length(stats::na.omit(x)) >= 30) tryCatch(stats::shapiro.test(x)$p.value > .1,
                                                     error = function(e) FALSE)
  else FALSE
}
#' Fisher's Exact Test for Count Data
#'
#' Performs Fisher's exact test for testing the null of independence
#' of rows and columns in a contingency table with fixed marginals, with an added method for formulas.
#'
#' If \code{x} is a matrix, it is taken as a two-dimensional contingency
#' table, and hence its entries should be nonnegative integers.
#' Otherwise, both \code{x} and \code{y} must be vectors of the same length.
#' Incomplete cases are removed, the vectors are coerced into factor
#' objects, and the contingency table is computed from these.
#'
#' For 2 by 2 cases, p-values are obtained directly using the
#' (central or non-central) hypergeometric distribution. Otherwise,
#' computations are based on a C version of the FORTRAN subroutine
#' FEXACT which implements the network algorithm developed by Mehta and Patel
#' (1986) and improved by Clarkson, Fan and Joe (1993). The FORTRAN
#' code can be obtained from \url{http://www.netlib.org/toms/643}.
#' Note this fails (with an error message) when the entries of the
#' table are too large. (It transposes the table if necessary so it
#' has no more rows than columns. One constraint is that the product
#' of the row marginals be less than 2^31 - 1.)
#'
#' For 2 by 2 tables, the null of conditional independence is
#' equivalent to the hypothesis that the odds ratio equals one.
#' Exact inference can be based on observing that in general, given
#' all marginal totals fixed, the first element of the contingency
#' table has a non-central hypergeometric distribution with
#' non-centrality parameter given by the odds ratio (Fisher, 1935).
#' The alternative for a one-sided test is based on the odds ratio,
#' so \code{alternative = "greater"} is a test of the odds ratio being
#' bigger than \code{or}.
#'
#' Two-sided tests are based on the probabilities of the tables, and
#' take as \dQuote{more extreme} all tables with probabilities less than or
#' equal to that of the observed table, the p-value being the sum of
#' such probabilities.
#'
#' For larger than 2 by 2 tables and \code{hybrid = TRUE}, asymptotic
#' chi-squared probabilities are only used if the \dQuote{Cochran
#' conditions} are satisfied, that is if no cell has count zero, and
#' more than 80\% of the cells have counts at least 5: otherwise the
#' exact calculation is used.
#'
#' Simulation is done conditional on the row and column marginals,
#' and works only if the marginals are strictly positive. (A C
#' translation of the algorithm of Patefield (1981) is used.)
#' @param x either a two-dimensional contingency table in matrix form, a factor object, or a formula of the form \code{lhs ~ rhs} where \code{lhs} and \code{rhs} are factors.
#' @param y a factor object; ignored if \code{x} is a matrix or a formula.
#' @param ... additional arguments passed to \code{\link[stats]{fisher.test}}
#' @inheritParams stats::fisher.test
#' @return A list with class \code{"htest"} containing the following components:
#'
#' p.value: the p-value of the test.
#'
#' conf.int: a confidence interval for the odds ratio. Only present in
#' the 2 by 2 case and if argument \code{conf.int = TRUE}.
#'
#' estimate: an estimate of the odds ratio. Note that the _conditional_
#' Maximum Likelihood Estimate (MLE) rather than the
#' unconditional MLE (the sample odds ratio) is used. Only
#' present in the 2 by 2 case.
#'
#' null.value: the odds ratio under the null, \code{or}. Only present in the 2
#' by 2 case.
#'
#' alternative: a character string describing the alternative hypothesis.
#'
#' method: the character string \code{"Fisher's Exact Test for Count Data"}.
#'
#' data.name: a character string giving the names of the data.
#' @references
#' Agresti, A. (1990) _Categorical data analysis_. New York: Wiley.
#' Pages 59-66.
#'
#' Agresti, A. (2002) _Categorical data analysis_. Second edition.
#' New York: Wiley. Pages 91-101.
#'
#' Fisher, R. A. (1935) The logic of inductive inference. _Journal
#' of the Royal Statistical Society Series A_ *98*, 39-54.
#'
#' Fisher, R. A. (1962) Confidence limits for a cross-product ratio.
#' _Australian Journal of Statistics_ *4*, 41.
#'
#' Fisher, R. A. (1970) _Statistical Methods for Research Workers._
#' Oliver & Boyd.
#'
#' Mehta, C. R. and Patel, N. R. (1986) Algorithm 643. FEXACT: A
#' Fortran subroutine for Fisher's exact test on unordered r*c
#' contingency tables. _ACM Transactions on Mathematical Software_,
#' *12*, 154-161.
#'
#' Clarkson, D. B., Fan, Y. and Joe, H. (1993) A Remark on Algorithm
#' 643: FEXACT: An Algorithm for Performing Fisher's Exact Test in r
#' x c Contingency Tables. _ACM Transactions on Mathematical
#' Software_, *19*, 484-488.
#'
#' Patefield, W. M. (1981) Algorithm AS159. An efficient method of
#' generating r x c tables with given row and column totals.
#' _Applied Statistics_ *30*, 91-97.
#' @seealso
#' \code{\link{chisq.test}}
#'
#' \code{fisher.exact} in package \pkg{exact2x2} for alternative
#' interpretations of two-sided tests and confidence intervals for 2
#' by 2 tables.
#' @examples
#' \dontrun{
#' ## Agresti (1990, p. 61f; 2002, p. 91) Fisher's Tea Drinker
#' ## A British woman claimed to be able to distinguish whether milk or
#' ## tea was added to the cup first. To test, she was given 8 cups of
#' ## tea, in four of which milk was added first. The null hypothesis
#' ## is that there is no association between the true order of pouring
#' ## and the woman's guess, the alternative that there is a positive
#' ## association (that the odds ratio is greater than 1).
#' TeaTasting <-
#'   matrix(c(3, 1, 1, 3),
#'          nrow = 2,
#'          dimnames = list(Guess = c("Milk", "Tea"),
#'                          Truth = c("Milk", "Tea")))
#' fisher.test(TeaTasting, alternative = "greater")
#' ## => p = 0.2429, association could not be established
#'
#' ## Fisher (1962, 1970), Criminal convictions of like-sex twins
#' Convictions <-
#'   matrix(c(2, 10, 15, 3),
#'          nrow = 2,
#'          dimnames =
#'            list(c("Dizygotic", "Monozygotic"),
#'                 c("Convicted", "Not convicted")))
#' Convictions
#' fisher.test(Convictions, alternative = "less")
#' fisher.test(Convictions, conf.int = FALSE)
#' fisher.test(Convictions, conf.level = 0.95)$conf.int
#' fisher.test(Convictions, conf.level = 0.99)$conf.int
#'
#' ## An r x c table: Agresti (2002, p. 57) Job Satisfaction
#' Job <- matrix(c(1, 2, 1, 0, 3, 3, 6, 1, 10, 10, 14, 9, 6, 7, 12, 11), 4, 4,
#'               dimnames = list(income = c("< 15k", "15-25k", "25-40k", "> 40k"),
#'                               satisfaction = c("VeryD", "LittleD", "ModerateS", "VeryS")))
#' fisher.test(Job)
#' fisher.test(Job, simulate.p.value = TRUE, B = 1e5)
#' }
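#' \dontrun{
#' ## Sketch of the formula interface added by this package (illustrative,
#' ## made-up variable names; the two terms are looked up in the calling
#' ## frame). This reproduces the TeaTasting table above:
#' guess <- factor(c("Milk", "Milk", "Milk", "Tea", "Milk", "Tea", "Tea", "Tea"))
#' truth <- factor(c("Milk", "Milk", "Milk", "Milk", "Tea", "Tea", "Tea", "Tea"))
#' fisher.test(guess ~ truth, alternative = "greater")
#' }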
#' @export
fisher.test <- function(x, y, workspace, hybrid, control, or, alternative, conf.int, conf.level, simulate.p.value, B) {
  UseMethod("fisher.test")
}
#' @rdname fisher.test
fisher.test.default <- function(x, ...) {
  stats::fisher.test(x, ...)
}
#' @rdname fisher.test
fisher.test.formula <- function(x,
                                y = NULL,
                                workspace = 200000,
                                hybrid = FALSE,
                                control = list(),
                                or = 1,
                                alternative = "two.sided",
                                conf.int = TRUE,
                                conf.level = .95,
                                simulate.p.value = FALSE,
                                B = 2000) {
  stats::fisher.test(x = eval(x[[2]], envir = parent.frame()),
                     y = eval(x[[3]], envir = parent.frame()),
                     workspace = workspace,
                     hybrid = hybrid,
                     control = control,
                     or = or,
                     alternative = alternative,
                     conf.int = conf.int,
                     conf.level = conf.level,
                     simulate.p.value = simulate.p.value,
                     B = B)
}
#' Pearson's Chi-squared Test for Count Data
#'
#' \code{chisq.test} performs chi-squared contingency table tests and goodness-of-fit tests, with an added method for formulas.
#'
#' If \code{x} is a matrix with one row or column, or if \code{x} is a vector
#' and \code{y} is not given, then a _goodness-of-fit test_ is performed
#' (\code{x} is treated as a one-dimensional contingency table). The
#' entries of \code{x} must be non-negative integers. In this case, the
#' hypothesis tested is whether the population probabilities equal
#' those in \code{p}, or are all equal if \code{p} is not given.
#'
#' If \code{x} is a matrix with at least two rows and columns, it is taken
#' as a two-dimensional contingency table: the entries of \code{x} must be
#' non-negative integers. Otherwise, \code{x} and \code{y} must be vectors or
#' factors of the same length; cases with missing values are removed,
#' the objects are coerced to factors, and the contingency table is
#' computed from these. Then Pearson's chi-squared test is performed
#' of the null hypothesis that the joint distribution of the cell
#' counts in a 2-dimensional contingency table is the product of the
#' row and column marginals.
#'
#' If \code{simulate.p.value} is \code{FALSE}, the p-value is computed from the
#' asymptotic chi-squared distribution of the test statistic;
#' continuity correction is only used in the 2 by 2 case (if
#' \code{correct} is \code{TRUE}, the default). Otherwise the p-value is
#' computed for a Monte Carlo test (Hope, 1968) with \code{B} replicates.
#'
#' In the contingency table case simulation is done by random
#' sampling from the set of all contingency tables with given
#' marginals, and works only if the marginals are strictly positive.
#' Continuity correction is never used, and the statistic is quoted
#' without it. Note that this is not the usual sampling situation
#' assumed for the chi-squared test but rather that for Fisher's
#' exact test.
#'
#' In the goodness-of-fit case simulation is done by random sampling
#' from the discrete distribution specified by \code{p}, each sample being
#' of size \code{n = sum(x)}. This simulation is done in R and may be
#' slow.
#' @param x a numeric vector, or matrix, or formula of the form \code{lhs ~ rhs} where \code{lhs} and \code{rhs} are factors. \code{x} and \code{y} can also both be factors.
#' @param y a numeric vector; ignored if \code{x} is a matrix or a formula. If \code{x} is a factor, \code{y} should be a factor of the same length.
#' @inheritParams stats::chisq.test
#' @return A list with class \code{"htest"} containing the following components:
#' statistic: the value of the chi-squared test statistic.
#'
#' parameter: the degrees of freedom of the approximate chi-squared
#' distribution of the test statistic, \code{NA} if the p-value is
#' computed by Monte Carlo simulation.
#'
#' p.value: the p-value for the test.
#'
#' method: a character string indicating the type of test performed, and
#' whether Monte Carlo simulation or continuity correction was
#' used.
#'
#' data.name: a character string giving the name(s) of the data.
#'
#' observed: the observed counts.
#'
#' expected: the expected counts under the null hypothesis.
#'
#' residuals: the Pearson residuals, \code{(observed - expected) / sqrt(expected)}.
#'
#' stdres: standardized residuals, \code{(observed - expected) / sqrt(V)},
#' where \code{V} is the residual cell variance (Agresti, 2007,
#' section 2.4.5 for the case where \code{x} is a matrix, \code{n * p * (1 - p)}
#' otherwise).
#' @source The code for Monte Carlo simulation is a C translation of the Fortran algorithm of Patefield (1981).
#' @references
#' Hope, A. C. A. (1968) A simplified Monte Carlo significance test
#' procedure. _J. Roy. Statist. Soc. B_ *30*, 582-598.
#'
#' Patefield, W. M. (1981) Algorithm AS159. An efficient method of
#' generating r x c tables with given row and column totals.
#' _Applied Statistics_ *30*, 91-97.
#'
#' Agresti, A. (2007) _An Introduction to Categorical Data Analysis,
#' 2nd ed._, New York: John Wiley & Sons. Page 38.
#' @seealso For goodness-of-fit testing, notably of continuous distributions, \code{\link{ks.test}}.
#' @examples
#' \dontrun{
#' ## From Agresti (2007), p. 39
#' M <- as.table(rbind(c(762, 327, 468), c(484, 239, 477)))
#' dimnames(M) <- list(gender = c("F", "M"),
#'                     party = c("Democrat", "Independent", "Republican"))
#' (Xsq <- chisq.test(M))  # Prints test summary
#' Xsq$observed   # observed counts (same as M)
#' Xsq$expected   # expected counts under the null
#' Xsq$residuals  # Pearson residuals
#' Xsq$stdres     # standardized residuals
#'
#' ## Effect of simulating p-values
#' x <- matrix(c(12, 5, 7, 7), ncol = 2)
#' chisq.test(x)$p.value  # 0.4233
#' chisq.test(x, simulate.p.value = TRUE, B = 10000)$p.value
#' # around 0.29!
#'
#' ## Testing for population probabilities
#' ## Case A. Tabulated data
#' x <- c(A = 20, B = 15, C = 25)
#' chisq.test(x)
#' chisq.test(as.table(x))  # the same
#' x <- c(89, 37, 30, 28, 2)
#' p <- c(40, 20, 20, 15, 5)
#' try(
#'   chisq.test(x, p = p)  # gives an error
#' )
#' chisq.test(x, p = p, rescale.p = TRUE)  # works
#' p <- c(0.40, 0.20, 0.20, 0.19, 0.01)
#' # Expected count in category 5 is 1.86 < 5 ==> chi square approx.
#' chisq.test(x, p = p)  # maybe doubtful, but is ok!
#' chisq.test(x, p = p, simulate.p.value = TRUE)
#'
#' ## Case B. Raw data
#' x <- trunc(5 * runif(100))
#' chisq.test(table(x))  # NOT 'chisq.test(x)'!
#' }
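#' \dontrun{
#' ## Sketch of the formula interface added by this package (illustrative,
#' ## made-up variable names; the two terms are looked up in the calling frame):
#' gender <- factor(sample(c("F", "M"), 200, replace = TRUE))
#' party <- factor(sample(c("Democrat", "Independent", "Republican"), 200, replace = TRUE))
#' chisq.test(party ~ gender)
#' }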
#' @export
chisq.test <- function(x, y, correct, p, rescale.p, simulate.p.value, B) {
  UseMethod("chisq.test")
}
#' @rdname chisq.test
chisq.test.default <- stats::chisq.test
#' @rdname chisq.test
chisq.test.formula <- function(x,
                               y = NULL,
                               correct = TRUE,
                               p = rep(1/length(x), length(x)),
                               rescale.p = FALSE,
                               simulate.p.value = FALSE,
                               B = 2000) {
  stats::chisq.test(x = eval(x[[2]], envir = parent.frame()),
                    y = eval(x[[3]], envir = parent.frame()),
                    correct = correct,
                    p = p,
                    rescale.p = rescale.p,
                    simulate.p.value = simulate.p.value,
                    B = B)
}
#' Wrapper for oneway.test(var.equal = TRUE)
#'
#' @param formula An ANOVA formula (\code{variable ~ grouping.variable})
#' @return The result of \code{\link[stats]{oneway.test}}, an object of class \code{"htest"}
#' @seealso \code{\link{oneway.test}}
#' @export
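#' @examples
#' ## Minimal usage sketch on a built-in dataset:
#' ANOVA(iris$Sepal.Length ~ iris$Species)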
ANOVA <- function(formula) {
  stats::oneway.test(formula, var.equal = TRUE)
}
#' No test
#'
#' An empty test: the formula is ignored and a one-row data.frame with a missing p value is returned
#' @param formula A formula (ignored)
#' @return A data.frame with a single \code{NA} \code{p.value}
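#' @examples
#' \dontrun{
#' ## Sketch: no.test can stand in wherever a test function is expected,
#' ## always yielding a missing p value (the formula here is arbitrary).
#' no.test(Sepal.Width ~ Species)  # data.frame(p.value = NA)
#' }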
no.test <- function(formula) {
  data.frame(p.value = NA)
}