openGPMP
Open Source Mathematics Package
hypothesis.cpp
Go to the documentation of this file.
1 /*************************************************************************
2  *
3  * Project
4  * _____ _____ __ __ _____
5  * / ____| __ \| \/ | __ \
6  * ___ _ __ ___ _ __ | | __| |__) | \ / | |__) |
7  * / _ \| '_ \ / _ \ '_ \| | |_ | ___/| |\/| | ___/
8  *| (_) | |_) | __/ | | | |__| | | | | | | |
9  * \___/| .__/ \___|_| |_|\_____|_| |_| |_|_|
10  * | |
11  * |_|
12  *
13  * Copyright (C) Akiel Aries, <akiel@akiel.org>, et al.
14  *
15  * This software is licensed as described in the file LICENSE, which
16  * you should have received as part of this distribution. The terms
17  * among other details are referenced in the official documentation
18  * seen here : https://akielaries.github.io/openGPMP/ along with
19  * important files seen in this project.
20  *
21  * You may opt to use, copy, modify, merge, publish, distribute
22  * and/or sell copies of the Software, and permit persons to whom
23  * the Software is furnished to do so, under the terms of the
24  * LICENSE file.
25  *
26  *
27  *
28  * This software is distributed on an AS IS basis, WITHOUT
29  * WARRANTY OF ANY KIND, either express or implied.
30  *
31  ************************************************************************/
32 #include <algorithm>
33 #include <cmath>
34 #include <iostream>
35 #include <limits>
38 #include <vector>
39 
40 // Method for conducting a one-sample t-test
41 double
42 gpmp::stats::HypothesisTest::one_sample_ttest(const std::vector<double> &sample,
43  double populationMean) {
44  int n = sample.size();
45  double sampleMean = gpmp::stats::Describe::mean_arith(sample);
46  double sampleStdDev = gpmp::stats::Describe::stdev(sample, sampleMean);
47  double standardError = sampleStdDev / sqrt(n);
48  return (sampleMean - populationMean) / standardError;
49 }
50 
51 // Method for conducting a two-sample t-test
53  const std::vector<double> &sample1,
54  const std::vector<double> &sample2) {
55  int n1 = sample1.size();
56  int n2 = sample2.size();
57  double sampleMean1 = gpmp::stats::Describe::mean_arith(sample1);
58  double sampleMean2 = gpmp::stats::Describe::mean_arith(sample2);
59  double sampleVar1 = gpmp::stats::Describe::variance(sample1, sampleMean1);
60  double sampleVar2 = gpmp::stats::Describe::variance(sample2, sampleMean2);
61  double pooledVar =
62  ((n1 - 1) * sampleVar1 + (n2 - 1) * sampleVar2) / (n1 + n2 - 2);
63  double t =
64  (sampleMean1 - sampleMean2) / sqrt(pooledVar * (1.0 / n1 + 1.0 / n2));
65  return t;
66 }
67 
68 // Method for conducting ANOVA (Analysis of Variance)
70  const std::vector<std::vector<double>> &samples) {
71  int k = samples.size();
72  int n = 0;
73  double grandMean = 0.0;
74  double SSB = 0.0;
75  double SSW = 0.0;
76 
77  // Calculate total number of observations and grand mean
78  for (const auto &sample : samples) {
79  n += sample.size();
80  double sampleMean = gpmp::stats::Describe::mean_arith(sample);
81  grandMean += sampleMean;
82  }
83  grandMean /= k;
84 
85  // Calculate sum of squares between groups (SSB) and within groups (SSW)
86  for (int i = 0; i < k; ++i) {
87  double sampleMean = gpmp::stats::Describe::mean_arith(samples[i]);
88  for (double x : samples[i]) {
89  SSB += pow((sampleMean - grandMean), 2);
90  SSW += pow((x - sampleMean), 2);
91  }
92  }
93 
94  // Calculate degrees of freedom
95  int dfBetweenGroups = k - 1;
96  int dfWithinGroups = n - k;
97 
98  // Calculate F-statistic
99  double MSB = SSB / dfBetweenGroups;
100  double MSW = SSW / dfWithinGroups;
101  double F = MSB / MSW;
102 
103  return F;
104 }
105 
106 // Method for conducting chi-square test of independence
108  const std::vector<std::vector<int>> &observed,
109  const std::vector<std::vector<double>> &expected) {
110  int rows = observed.size();
111  int cols = observed[0].size();
112  double chiSquare = 0.0;
113 
114  for (int i = 0; i < rows; ++i) {
115  for (int j = 0; j < cols; ++j) {
116  chiSquare +=
117  pow((observed[i][j] - expected[i][j]), 2) / expected[i][j];
118  }
119  }
120 
121  return chiSquare;
122 }
123 
124 // Method for conducting z-test for proportions
126  double p2,
127  double n1,
128  double n2) {
129  double p = (p1 * n1 + p2 * n2) / (n1 + n2);
130  double z = (p1 - p2) / sqrt(p * (1 - p) * (1 / n1 + 1 / n2));
131  return z;
132 }
133 
134 // Method for conducting Wilcoxon signed-rank test
136  const std::vector<double> &sample1,
137  const std::vector<double> &sample2) {
138  int n = sample1.size();
139  if (n != static_cast<int>(sample2.size())) {
140  std::cerr << "Sample sizes must be equal for Wilcoxon signed-rank test."
141  << std::endl;
142  return std::numeric_limits<double>::quiet_NaN();
143  }
144 
145  std::vector<double> differences;
146  for (int i = 0; i < n; ++i) {
147  differences.push_back(sample1[i] - sample2[i]);
148  }
149  std::sort(differences.begin(), differences.end(), [](double a, double b) {
150  return std::abs(a) < std::abs(b);
151  });
152 
153  double Tplus = 0;
154  double Tminus = 0;
155  int numPositive = 0;
156  int numNegative = 0;
157  for (double diff : differences) {
158  if (diff > 0) {
159  Tplus += diff;
160  numPositive++;
161  } else if (diff < 0) {
162  Tminus -= diff;
163  numNegative++;
164  }
165  }
166  int T = std::min(Tplus, Tminus);
167 
168  // Calculate the critical value using the normal approximation
169  double mean = n * (n + 1) / 4.0;
170  double stdDev = sqrt(n * (n + 1) * (2 * n + 1) / 24.0);
171  double z = (T - mean) / stdDev;
172 
173  return z;
174 }
175 
176 // Method for conducting Mann-Whitney U test
178  const std::vector<double> &sample1,
179  const std::vector<double> &sample2) {
180  int n1 = sample1.size();
181  int n2 = sample2.size();
182  double U1 = gpmp::stats::Describe::u_stat(sample1, sample2);
183  double U2 = gpmp::stats::Describe::u_stat(sample2, sample1);
184  double U = std::min(U1, U2);
185 
186  // Calculate the expected value of U
187  double expectedU = n1 * n2 / 2.0;
188 
189  // Calculate the standard deviation of U
190  double stdDev = sqrt(n1 * n2 * (n1 + n2 + 1) / 12.0);
191 
192  // Calculate the z-score
193  double z = (U - expectedU) / stdDev;
194 
195  return z;
196 }
197 
198 // Method for conducting Fisher's Exact Test
200  const std::vector<std::vector<int>> &table) {
201  int nRows = table.size();
202  int nCols = table[0].size();
203 
204  if (nRows != 2 || nCols != 2) {
205  std::cerr << "Fisher's Exact Test requires a 2x2 contingency table."
206  << std::endl;
207  return std::numeric_limits<double>::quiet_NaN();
208  }
209 
210  int a = table[0][0];
211  int b = table[0][1];
212  int c = table[1][0];
213  int d = table[1][1];
214 
215  double p = (factorial(a + b) * factorial(c + d) * factorial(a + c) *
216  factorial(b + d)) /
217  (factorial(a) * factorial(b) * factorial(c) * factorial(d) *
218  factorial(a + b + c + d));
219 
220  return p;
221 }
222 
223 // Method for conducting Kolmogorov-Smirnov Test
225  const std::vector<double> &sample1,
226  const std::vector<double> &sample2) {
227  int n1 = sample1.size();
228  int n2 = sample2.size();
229 
230  std::vector<double> combinedSamples = sample1;
231  combinedSamples.insert(combinedSamples.end(),
232  sample2.begin(),
233  sample2.end());
234  std::sort(combinedSamples.begin(), combinedSamples.end());
235 
236  double maxDPlus = 0.0;
237  double maxDMinus = 0.0;
238 
239  for (size_t i = 0; i < combinedSamples.size(); ++i) {
240  double DPlus = (i + 1) / static_cast<double>(n1) - combinedSamples[i];
241  double DMinus = combinedSamples[i] - i / static_cast<double>(n2);
242 
243  maxDPlus = std::max(maxDPlus, DPlus);
244  maxDMinus = std::max(maxDMinus, DMinus);
245  }
246 
247  return std::max(maxDPlus, maxDMinus);
248 }
249 
250 // Method for conducting Wilcoxon Rank Sum Test (Mann-Whitney U Test)
252  const std::vector<double> &sample1,
253  const std::vector<double> &sample2) {
254  int n1 = sample1.size();
255  int n2 = sample2.size();
256  std::vector<double> ranks;
257  ranks.reserve(n1 + n2);
258 
259  for (double x : sample1) {
260  ranks.push_back(x);
261  }
262  for (double x : sample2) {
263  ranks.push_back(x);
264  }
265 
266  std::sort(ranks.begin(), ranks.end());
267 
268  double rankSum1 = 0.0;
269  for (double x : sample1) {
270  rankSum1 +=
271  std::distance(ranks.begin(),
272  std::lower_bound(ranks.begin(), ranks.end(), x));
273  }
274 
275  double U1 = rankSum1 - (n1 * (n1 + 1)) / 2.0;
276  double U2 = n1 * n2 - U1;
277 
278  return std::min(U1, U2);
279 }
280 
281 // Method for conducting Kruskal-Wallis Test
283  const std::vector<std::vector<double>> &samples) {
284  int k = samples.size();
285  std::vector<std::pair<double, int>> combinedData;
286 
287  for (int i = 0; i < k; ++i) {
288  for (double x : samples[i]) {
289  combinedData.push_back(std::make_pair(x, i));
290  }
291  }
292 
293  std::sort(combinedData.begin(), combinedData.end());
294 
295  std::vector<double> ranks;
296  ranks.reserve(combinedData.size());
297 
298  int rank = 1;
299  ranks.push_back(rank);
300  for (size_t i = 1; i < combinedData.size(); ++i) {
301  if (std::abs(combinedData[i].first - combinedData[i - 1].first) >
302  std::numeric_limits<double>::epsilon()) {
303  rank++;
304  }
305  ranks.push_back(rank);
306  }
307 
308  double H = 0.0;
309  for (int i = 0; i < k; ++i) {
310  double rankSum = 0.0;
311  for (size_t j = 0; j < samples[i].size(); ++j) {
312  rankSum += ranks[i * samples[i].size() + j];
313  }
314  H += (rankSum * rankSum) / samples[i].size();
315  }
316  H = (12.0 / (combinedData.size() * (combinedData.size() + 1))) * H -
317  3.0 * (combinedData.size() + 1);
318 
319  return H;
320 }
321 
322 // Method for conducting Runs Test
323 double
324 gpmp::stats::HypothesisTest::runs_test(const std::vector<bool> &sequence) {
325  int n = sequence.size();
326  int numRuns = 1;
327 
328  for (int i = 1; i < n; ++i) {
329  if (sequence[i] != sequence[i - 1]) {
330  numRuns++;
331  }
332  }
333 
334  double expectedRuns = (2.0 * n - 1) / 3.0;
335  double varianceRuns = (16.0 * n - 29) / 90.0;
336  double z = (numRuns - expectedRuns) / sqrt(varianceRuns);
337 
338  return z;
339 }
340 
341 // Helper method to calculate factorial
343  if (n <= 1) {
344  return 1;
345  }
346  return n * factorial(n - 1);
347 }
static double u_stat(const std::vector< double > &sample1, const std::vector< double > &sample2)
Calculates U statistic given two samples.
Definition: describe.cpp:38
static double stdev(const std::vector< double > &data, double mean)
Calculates the standard deviation of a given dataset, given the mean.
Definition: describe.cpp:184
static double variance(const std::vector< double > &data, double mean)
Calculates the variance of a given dataset, given the mean.
Definition: describe.cpp:194
static double mean_arith(const std::vector< double > &data)
Calculates the arithmetic mean of a given dataset.
Definition: describe.cpp:52
static int factorial(int n)
Helper method to calculate factorial.
Definition: hypothesis.cpp:342
static double fisher_test(const std::vector< std::vector< int >> &table)
Fisher's Exact Test.
Definition: hypothesis.cpp:199
static double mann_whitney_test(const std::vector< double > &sample1, const std::vector< double > &sample2)
Mann-Whitney U test.
Definition: hypothesis.cpp:177
static double two_sample_ttest(const std::vector< double > &sample1, const std::vector< double > &sample2)
Two-sample t-test.
Definition: hypothesis.cpp:52
static double chi_square_test(const std::vector< std::vector< int >> &observed, const std::vector< std::vector< double >> &expected)
Chi-square test of independence.
Definition: hypothesis.cpp:107
static double runs_test(const std::vector< bool > &sequence)
Runs Test.
Definition: hypothesis.cpp:324
static double kol_smirnov_test(const std::vector< double > &sample1, const std::vector< double > &sample2)
Kolmogorov-Smirnov Test.
Definition: hypothesis.cpp:224
static double ANOVA(const std::vector< std::vector< double >> &samples)
Analysis of Variance (ANOVA)
Definition: hypothesis.cpp:69
static double one_sample_ttest(const std::vector< double > &sample, double populationMean)
One-sample t-test.
Definition: hypothesis.cpp:42
static double proportion_z_test(double p1, double p2, double n1, double n2)
Z-test for proportions.
Definition: hypothesis.cpp:125
static double kruskal_wallis_test(const std::vector< std::vector< double >> &samples)
Kruskal-Wallis Test.
Definition: hypothesis.cpp:282
static double wilcoxon_rank_test(const std::vector< double > &sample1, const std::vector< double > &sample2)
Wilcoxon signed-rank test.
Definition: hypothesis.cpp:135
static double wilcoxon_rank_sum_test(const std::vector< double > &sample1, const std::vector< double > &sample2)
Wilcoxon Rank Sum Test (Mann-Whitney U Test)
Definition: hypothesis.cpp:251