Line data Source code
1 : /*************************************************************************
2 : *
3 : * Project
4 : * _____ _____ __ __ _____
5 : * / ____| __ \| \/ | __ \
6 : * ___ _ __ ___ _ __ | | __| |__) | \ / | |__) |
7 : * / _ \| '_ \ / _ \ '_ \| | |_ | ___/| |\/| | ___/
8 : *| (_) | |_) | __/ | | | |__| | | | | | | |
9 : * \___/| .__/ \___|_| |_|\_____|_| |_| |_|_|
10 : * | |
11 : * |_|
12 : *
13 : * Copyright (C) Akiel Aries, <akiel@akiel.org>, et al.
14 : *
15 : * This software is licensed as described in the file LICENSE, which
16 : * you should have received as part of this distribution. The terms
17 : * among other details are referenced in the official documentation
18 : * seen here : https://akielaries.github.io/openGPMP/ along with
19 : * important files seen in this project.
20 : *
21 : * You may opt to use, copy, modify, merge, publish, distribute
22 : * and/or sell copies of the Software, and permit persons to whom
23 : * the Software is furnished to do so, under the terms of the
24 : * LICENSE file.
25 : *
26 : *
27 : *
28 : * This software is distributed on an AS IS basis, WITHOUT
29 : * WARRANTY OF ANY KIND, either express or implied.
30 : *
31 : ************************************************************************/
32 : #include <algorithm>
33 : #include <cmath>
34 : #include <iostream>
35 : #include <limits>
36 : #include <openGPMP/stats/describe.hpp>
37 : #include <openGPMP/stats/hypothesis.hpp>
38 : #include <vector>
39 :
40 : // Method for conducting a one-sample t-test
41 : double
42 0 : gpmp::stats::HypothesisTest::one_sample_ttest(const std::vector<double> &sample,
43 : double populationMean) {
44 0 : int n = sample.size();
45 0 : double sampleMean = gpmp::stats::Describe::mean_arith(sample);
46 0 : double sampleStdDev = gpmp::stats::Describe::stdev(sample, sampleMean);
47 0 : double standardError = sampleStdDev / sqrt(n);
48 0 : return (sampleMean - populationMean) / standardError;
49 : }
50 :
51 : // Method for conducting a two-sample t-test
52 0 : double gpmp::stats::HypothesisTest::two_sample_ttest(
53 : const std::vector<double> &sample1,
54 : const std::vector<double> &sample2) {
55 0 : int n1 = sample1.size();
56 0 : int n2 = sample2.size();
57 0 : double sampleMean1 = gpmp::stats::Describe::mean_arith(sample1);
58 0 : double sampleMean2 = gpmp::stats::Describe::mean_arith(sample2);
59 0 : double sampleVar1 = gpmp::stats::Describe::variance(sample1, sampleMean1);
60 0 : double sampleVar2 = gpmp::stats::Describe::variance(sample2, sampleMean2);
61 0 : double pooledVar =
62 0 : ((n1 - 1) * sampleVar1 + (n2 - 1) * sampleVar2) / (n1 + n2 - 2);
63 : double t =
64 0 : (sampleMean1 - sampleMean2) / sqrt(pooledVar * (1.0 / n1 + 1.0 / n2));
65 0 : return t;
66 : }
67 :
68 : // Method for conducting ANOVA (Analysis of Variance)
69 0 : double gpmp::stats::HypothesisTest::ANOVA(
70 : const std::vector<std::vector<double>> &samples) {
71 0 : int k = samples.size();
72 0 : int n = 0;
73 0 : double grandMean = 0.0;
74 0 : double SSB = 0.0;
75 0 : double SSW = 0.0;
76 :
77 : // Calculate total number of observations and grand mean
78 0 : for (const auto &sample : samples) {
79 0 : n += sample.size();
80 0 : double sampleMean = gpmp::stats::Describe::mean_arith(sample);
81 0 : grandMean += sampleMean;
82 : }
83 0 : grandMean /= k;
84 :
85 : // Calculate sum of squares between groups (SSB) and within groups (SSW)
86 0 : for (int i = 0; i < k; ++i) {
87 0 : double sampleMean = gpmp::stats::Describe::mean_arith(samples[i]);
88 0 : for (double x : samples[i]) {
89 0 : SSB += pow((sampleMean - grandMean), 2);
90 0 : SSW += pow((x - sampleMean), 2);
91 : }
92 : }
93 :
94 : // Calculate degrees of freedom
95 0 : int dfBetweenGroups = k - 1;
96 0 : int dfWithinGroups = n - k;
97 :
98 : // Calculate F-statistic
99 0 : double MSB = SSB / dfBetweenGroups;
100 0 : double MSW = SSW / dfWithinGroups;
101 0 : double F = MSB / MSW;
102 :
103 0 : return F;
104 : }
105 :
106 : // Method for conducting chi-square test of independence
107 0 : double gpmp::stats::HypothesisTest::chi_square_test(
108 : const std::vector<std::vector<int>> &observed,
109 : const std::vector<std::vector<double>> &expected) {
110 0 : int rows = observed.size();
111 0 : int cols = observed[0].size();
112 0 : double chiSquare = 0.0;
113 :
114 0 : for (int i = 0; i < rows; ++i) {
115 0 : for (int j = 0; j < cols; ++j) {
116 0 : chiSquare +=
117 0 : pow((observed[i][j] - expected[i][j]), 2) / expected[i][j];
118 : }
119 : }
120 :
121 0 : return chiSquare;
122 : }
123 :
124 : // Method for conducting z-test for proportions
125 0 : double gpmp::stats::HypothesisTest::proportion_z_test(double p1,
126 : double p2,
127 : double n1,
128 : double n2) {
129 0 : double p = (p1 * n1 + p2 * n2) / (n1 + n2);
130 0 : double z = (p1 - p2) / sqrt(p * (1 - p) * (1 / n1 + 1 / n2));
131 0 : return z;
132 : }
133 :
134 : // Method for conducting Wilcoxon signed-rank test
135 0 : double gpmp::stats::HypothesisTest::wilcoxon_rank_test(
136 : const std::vector<double> &sample1,
137 : const std::vector<double> &sample2) {
138 0 : int n = sample1.size();
139 0 : if (n != static_cast<int>(sample2.size())) {
140 0 : std::cerr << "Sample sizes must be equal for Wilcoxon signed-rank test."
141 0 : << std::endl;
142 0 : return std::numeric_limits<double>::quiet_NaN();
143 : }
144 :
145 0 : std::vector<double> differences;
146 0 : for (int i = 0; i < n; ++i) {
147 0 : differences.push_back(sample1[i] - sample2[i]);
148 : }
149 0 : std::sort(differences.begin(), differences.end(), [](double a, double b) {
150 0 : return std::abs(a) < std::abs(b);
151 : });
152 :
153 0 : double Tplus = 0;
154 0 : double Tminus = 0;
155 0 : int numPositive = 0;
156 0 : int numNegative = 0;
157 0 : for (double diff : differences) {
158 0 : if (diff > 0) {
159 0 : Tplus += diff;
160 0 : numPositive++;
161 0 : } else if (diff < 0) {
162 0 : Tminus -= diff;
163 0 : numNegative++;
164 : }
165 : }
166 0 : int T = std::min(Tplus, Tminus);
167 :
168 : // Calculate the critical value using the normal approximation
169 0 : double mean = n * (n + 1) / 4.0;
170 0 : double stdDev = sqrt(n * (n + 1) * (2 * n + 1) / 24.0);
171 0 : double z = (T - mean) / stdDev;
172 :
173 0 : return z;
174 0 : }
175 :
176 : // Method for conducting Mann-Whitney U test
177 0 : double gpmp::stats::HypothesisTest::mann_whitney_test(
178 : const std::vector<double> &sample1,
179 : const std::vector<double> &sample2) {
180 0 : int n1 = sample1.size();
181 0 : int n2 = sample2.size();
182 0 : double U1 = gpmp::stats::Describe::u_stat(sample1, sample2);
183 0 : double U2 = gpmp::stats::Describe::u_stat(sample2, sample1);
184 0 : double U = std::min(U1, U2);
185 :
186 : // Calculate the expected value of U
187 0 : double expectedU = n1 * n2 / 2.0;
188 :
189 : // Calculate the standard deviation of U
190 0 : double stdDev = sqrt(n1 * n2 * (n1 + n2 + 1) / 12.0);
191 :
192 : // Calculate the z-score
193 0 : double z = (U - expectedU) / stdDev;
194 :
195 0 : return z;
196 : }
197 :
198 : // Method for conducting Fisher's Exact Test
199 0 : double gpmp::stats::HypothesisTest::fisher_test(
200 : const std::vector<std::vector<int>> &table) {
201 0 : int nRows = table.size();
202 0 : int nCols = table[0].size();
203 :
204 0 : if (nRows != 2 || nCols != 2) {
205 0 : std::cerr << "Fisher's Exact Test requires a 2x2 contingency table."
206 0 : << std::endl;
207 0 : return std::numeric_limits<double>::quiet_NaN();
208 : }
209 :
210 0 : int a = table[0][0];
211 0 : int b = table[0][1];
212 0 : int c = table[1][0];
213 0 : int d = table[1][1];
214 :
215 0 : double p = (factorial(a + b) * factorial(c + d) * factorial(a + c) *
216 0 : factorial(b + d)) /
217 0 : (factorial(a) * factorial(b) * factorial(c) * factorial(d) *
218 0 : factorial(a + b + c + d));
219 :
220 0 : return p;
221 : }
222 :
223 : // Method for conducting Kolmogorov-Smirnov Test
224 0 : double gpmp::stats::HypothesisTest::kol_smirnov_test(
225 : const std::vector<double> &sample1,
226 : const std::vector<double> &sample2) {
227 0 : int n1 = sample1.size();
228 0 : int n2 = sample2.size();
229 :
230 0 : std::vector<double> combinedSamples = sample1;
231 0 : combinedSamples.insert(combinedSamples.end(),
232 : sample2.begin(),
233 : sample2.end());
234 0 : std::sort(combinedSamples.begin(), combinedSamples.end());
235 :
236 0 : double maxDPlus = 0.0;
237 0 : double maxDMinus = 0.0;
238 :
239 0 : for (size_t i = 0; i < combinedSamples.size(); ++i) {
240 0 : double DPlus = (i + 1) / static_cast<double>(n1) - combinedSamples[i];
241 0 : double DMinus = combinedSamples[i] - i / static_cast<double>(n2);
242 :
243 0 : maxDPlus = std::max(maxDPlus, DPlus);
244 0 : maxDMinus = std::max(maxDMinus, DMinus);
245 : }
246 :
247 0 : return std::max(maxDPlus, maxDMinus);
248 0 : }
249 :
250 : // Method for conducting Wilcoxon Rank Sum Test (Mann-Whitney U Test)
251 0 : double gpmp::stats::HypothesisTest::wilcoxon_rank_sum_test(
252 : const std::vector<double> &sample1,
253 : const std::vector<double> &sample2) {
254 0 : int n1 = sample1.size();
255 0 : int n2 = sample2.size();
256 0 : std::vector<double> ranks;
257 0 : ranks.reserve(n1 + n2);
258 :
259 0 : for (double x : sample1) {
260 0 : ranks.push_back(x);
261 : }
262 0 : for (double x : sample2) {
263 0 : ranks.push_back(x);
264 : }
265 :
266 0 : std::sort(ranks.begin(), ranks.end());
267 :
268 0 : double rankSum1 = 0.0;
269 0 : for (double x : sample1) {
270 0 : rankSum1 +=
271 0 : std::distance(ranks.begin(),
272 : std::lower_bound(ranks.begin(), ranks.end(), x));
273 : }
274 :
275 0 : double U1 = rankSum1 - (n1 * (n1 + 1)) / 2.0;
276 0 : double U2 = n1 * n2 - U1;
277 :
278 0 : return std::min(U1, U2);
279 0 : }
280 :
281 : // Method for conducting Kruskal-Wallis Test
282 0 : double gpmp::stats::HypothesisTest::kruskal_wallis_test(
283 : const std::vector<std::vector<double>> &samples) {
284 0 : int k = samples.size();
285 0 : std::vector<std::pair<double, int>> combinedData;
286 :
287 0 : for (int i = 0; i < k; ++i) {
288 0 : for (double x : samples[i]) {
289 0 : combinedData.push_back(std::make_pair(x, i));
290 : }
291 : }
292 :
293 0 : std::sort(combinedData.begin(), combinedData.end());
294 :
295 0 : std::vector<double> ranks;
296 0 : ranks.reserve(combinedData.size());
297 :
298 0 : int rank = 1;
299 0 : ranks.push_back(rank);
300 0 : for (size_t i = 1; i < combinedData.size(); ++i) {
301 0 : if (std::abs(combinedData[i].first - combinedData[i - 1].first) >
302 0 : std::numeric_limits<double>::epsilon()) {
303 0 : rank++;
304 : }
305 0 : ranks.push_back(rank);
306 : }
307 :
308 0 : double H = 0.0;
309 0 : for (int i = 0; i < k; ++i) {
310 0 : double rankSum = 0.0;
311 0 : for (size_t j = 0; j < samples[i].size(); ++j) {
312 0 : rankSum += ranks[i * samples[i].size() + j];
313 : }
314 0 : H += (rankSum * rankSum) / samples[i].size();
315 : }
316 0 : H = (12.0 / (combinedData.size() * (combinedData.size() + 1))) * H -
317 0 : 3.0 * (combinedData.size() + 1);
318 :
319 0 : return H;
320 0 : }
321 :
322 : // Method for conducting Runs Test
323 : double
324 0 : gpmp::stats::HypothesisTest::runs_test(const std::vector<bool> &sequence) {
325 0 : int n = sequence.size();
326 0 : int numRuns = 1;
327 :
328 0 : for (int i = 1; i < n; ++i) {
329 0 : if (sequence[i] != sequence[i - 1]) {
330 0 : numRuns++;
331 : }
332 : }
333 :
334 0 : double expectedRuns = (2.0 * n - 1) / 3.0;
335 0 : double varianceRuns = (16.0 * n - 29) / 90.0;
336 0 : double z = (numRuns - expectedRuns) / sqrt(varianceRuns);
337 :
338 0 : return z;
339 : }
340 :
341 : // Helper method to calculate factorial
342 0 : int gpmp::stats::HypothesisTest::factorial(int n) {
343 0 : if (n <= 1) {
344 0 : return 1;
345 : }
346 0 : return n * factorial(n - 1);
347 : }
|