openGPMP
Open Source Mathematics Package
Public Member Functions | Static Public Member Functions | Static Private Member Functions | List of all members
gpmp::stats::HypothesisTest Class Reference

A class for conducting various statistical hypothesis tests. More...

#include <hypothesis.hpp>

Public Member Functions

 HypothesisTest ()
 Constructor. More...
 
 ~HypothesisTest ()
 Destructor. More...
 

Static Public Member Functions

static double one_sample_ttest (const std::vector< double > &sample, double populationMean)
 One-sample t-test. More...
 
static double two_sample_ttest (const std::vector< double > &sample1, const std::vector< double > &sample2)
 Two-sample t-test. More...
 
static double ANOVA (const std::vector< std::vector< double >> &samples)
 Analysis of Variance (ANOVA) More...
 
static double chi_square_test (const std::vector< std::vector< int >> &observed, const std::vector< std::vector< double >> &expected)
 Chi-square test of independence. More...
 
static double proportion_z_test (double p1, double p2, double n1, double n2)
 Z-test for proportions. More...
 
static double wilcoxon_rank_test (const std::vector< double > &sample1, const std::vector< double > &sample2)
 Wilcoxon signed-rank test. More...
 
static double mann_whitney_test (const std::vector< double > &sample1, const std::vector< double > &sample2)
 Mann-Whitney U test. More...
 
static double fisher_test (const std::vector< std::vector< int >> &table)
 Fisher's Exact Test. More...
 
static double kol_smirnov_test (const std::vector< double > &sample1, const std::vector< double > &sample2)
 Kolmogorov-Smirnov Test. More...
 
static double wilcoxon_rank_sum_test (const std::vector< double > &sample1, const std::vector< double > &sample2)
 Wilcoxon Rank Sum Test (Mann-Whitney U Test) More...
 
static double kruskal_wallis_test (const std::vector< std::vector< double >> &samples)
 Kruskal-Wallis Test. More...
 
static double runs_test (const std::vector< bool > &sequence)
 Runs Test. More...
 

Static Private Member Functions

static int factorial (int n)
 Helper method to calculate factorial. More...
 

Detailed Description

A class for conducting various statistical hypothesis tests.

Definition at line 47 of file hypothesis.hpp.

Constructor & Destructor Documentation

◆ HypothesisTest()

gpmp::stats::HypothesisTest::HypothesisTest ( )
inline

Constructor.

Definition at line 52 of file hypothesis.hpp.

52  {
53  }

◆ ~HypothesisTest()

gpmp::stats::HypothesisTest::~HypothesisTest ( )
inline

Destructor.

Definition at line 58 of file hypothesis.hpp.

58  {
59  }

Member Function Documentation

◆ ANOVA()

double gpmp::stats::HypothesisTest::ANOVA ( const std::vector< std::vector< double >> &  samples)
static

Analysis of Variance (ANOVA)

Parameters
samples — The vector of sample data (one inner vector per group)
Returns
The F-statistic

Definition at line 69 of file hypothesis.cpp.

70  {
71  int k = samples.size();
72  int n = 0;
73  double grandMean = 0.0;
74  double SSB = 0.0;
75  double SSW = 0.0;
76 
77  // Calculate total number of observations and grand mean
78  for (const auto &sample : samples) {
79  n += sample.size();
80  double sampleMean = gpmp::stats::Describe::mean_arith(sample);
81  grandMean += sampleMean;
82  }
83  grandMean /= k;
84 
85  // Calculate sum of squares between groups (SSB) and within groups (SSW)
86  for (int i = 0; i < k; ++i) {
87  double sampleMean = gpmp::stats::Describe::mean_arith(samples[i]);
88  for (double x : samples[i]) {
89  SSB += pow((sampleMean - grandMean), 2);
90  SSW += pow((x - sampleMean), 2);
91  }
92  }
93 
94  // Calculate degrees of freedom
95  int dfBetweenGroups = k - 1;
96  int dfWithinGroups = n - k;
97 
98  // Calculate F-statistic
99  double MSB = SSB / dfBetweenGroups;
100  double MSW = SSW / dfWithinGroups;
101  double F = MSB / MSW;
102 
103  return F;
104 }
static double mean_arith(const std::vector< double > &data)
Calculates the arithmetic mean of a given dataset.
Definition: describe.cpp:52

References gpmp::stats::Describe::mean_arith().

Referenced by main().

◆ chi_square_test()

double gpmp::stats::HypothesisTest::chi_square_test ( const std::vector< std::vector< int >> &  observed,
const std::vector< std::vector< double >> &  expected 
)
static

Chi-square test of independence.

Parameters
observed — The observed data
expected — The expected data
Returns
The chi-square statistic

Definition at line 107 of file hypothesis.cpp.

109  {
110  int rows = observed.size();
111  int cols = observed[0].size();
112  double chiSquare = 0.0;
113 
114  for (int i = 0; i < rows; ++i) {
115  for (int j = 0; j < cols; ++j) {
116  chiSquare +=
117  pow((observed[i][j] - expected[i][j]), 2) / expected[i][j];
118  }
119  }
120 
121  return chiSquare;
122 }

References test_linalg::cols, and test_linalg::rows.

Referenced by main().

◆ factorial()

int gpmp::stats::HypothesisTest::factorial ( int  n)
static private

Helper method to calculate factorial.

Parameters
n — The integer value
Returns
The factorial of n

Definition at line 342 of file hypothesis.cpp.

342  {
343  if (n <= 1) {
344  return 1;
345  }
346  return n * factorial(n - 1);
347 }
static int factorial(int n)
Helper method to calculate factorial.
Definition: hypothesis.cpp:342

◆ fisher_test()

double gpmp::stats::HypothesisTest::fisher_test ( const std::vector< std::vector< int >> &  table)
static

Fisher's Exact Test.

Parameters
table — The 2x2 contingency table
Returns
The p-value

Definition at line 199 of file hypothesis.cpp.

200  {
201  int nRows = table.size();
202  int nCols = table[0].size();
203 
204  if (nRows != 2 || nCols != 2) {
205  std::cerr << "Fisher's Exact Test requires a 2x2 contingency table."
206  << std::endl;
207  return std::numeric_limits<double>::quiet_NaN();
208  }
209 
210  int a = table[0][0];
211  int b = table[0][1];
212  int c = table[1][0];
213  int d = table[1][1];
214 
215  double p = (factorial(a + b) * factorial(c + d) * factorial(a + c) *
216  factorial(b + d)) /
217  (factorial(a) * factorial(b) * factorial(c) * factorial(d) *
218  factorial(a + b + c + d));
219 
220  return p;
221 }

◆ kol_smirnov_test()

double gpmp::stats::HypothesisTest::kol_smirnov_test ( const std::vector< double > &  sample1,
const std::vector< double > &  sample2 
)
static

Kolmogorov-Smirnov Test.

Parameters
sample1 — The first sample data
sample2 — The second sample data
Returns
The test statistic

Definition at line 224 of file hypothesis.cpp.

226  {
227  int n1 = sample1.size();
228  int n2 = sample2.size();
229 
230  std::vector<double> combinedSamples = sample1;
231  combinedSamples.insert(combinedSamples.end(),
232  sample2.begin(),
233  sample2.end());
234  std::sort(combinedSamples.begin(), combinedSamples.end());
235 
236  double maxDPlus = 0.0;
237  double maxDMinus = 0.0;
238 
239  for (size_t i = 0; i < combinedSamples.size(); ++i) {
240  double DPlus = (i + 1) / static_cast<double>(n1) - combinedSamples[i];
241  double DMinus = combinedSamples[i] - i / static_cast<double>(n2);
242 
243  maxDPlus = std::max(maxDPlus, DPlus);
244  maxDMinus = std::max(maxDMinus, DMinus);
245  }
246 
247  return std::max(maxDPlus, maxDMinus);
248 }

◆ kruskal_wallis_test()

double gpmp::stats::HypothesisTest::kruskal_wallis_test ( const std::vector< std::vector< double >> &  samples)
static

Kruskal-Wallis Test.

Parameters
samples — The vector of sample data (one inner vector per group)
Returns
The test statistic

Definition at line 282 of file hypothesis.cpp.

283  {
284  int k = samples.size();
285  std::vector<std::pair<double, int>> combinedData;
286 
287  for (int i = 0; i < k; ++i) {
288  for (double x : samples[i]) {
289  combinedData.push_back(std::make_pair(x, i));
290  }
291  }
292 
293  std::sort(combinedData.begin(), combinedData.end());
294 
295  std::vector<double> ranks;
296  ranks.reserve(combinedData.size());
297 
298  int rank = 1;
299  ranks.push_back(rank);
300  for (size_t i = 1; i < combinedData.size(); ++i) {
301  if (std::abs(combinedData[i].first - combinedData[i - 1].first) >
302  std::numeric_limits<double>::epsilon()) {
303  rank++;
304  }
305  ranks.push_back(rank);
306  }
307 
308  double H = 0.0;
309  for (int i = 0; i < k; ++i) {
310  double rankSum = 0.0;
311  for (size_t j = 0; j < samples[i].size(); ++j) {
312  rankSum += ranks[i * samples[i].size() + j];
313  }
314  H += (rankSum * rankSum) / samples[i].size();
315  }
316  H = (12.0 / (combinedData.size() * (combinedData.size() + 1))) * H -
317  3.0 * (combinedData.size() + 1);
318 
319  return H;
320 }

◆ mann_whitney_test()

double gpmp::stats::HypothesisTest::mann_whitney_test ( const std::vector< double > &  sample1,
const std::vector< double > &  sample2 
)
static

Mann-Whitney U test.

Parameters
sample1 — The first sample data
sample2 — The second sample data
Returns
The z-score

Definition at line 177 of file hypothesis.cpp.

179  {
180  int n1 = sample1.size();
181  int n2 = sample2.size();
182  double U1 = gpmp::stats::Describe::u_stat(sample1, sample2);
183  double U2 = gpmp::stats::Describe::u_stat(sample2, sample1);
184  double U = std::min(U1, U2);
185 
186  // Calculate the expected value of U
187  double expectedU = n1 * n2 / 2.0;
188 
189  // Calculate the standard deviation of U
190  double stdDev = sqrt(n1 * n2 * (n1 + n2 + 1) / 12.0);
191 
192  // Calculate the z-score
193  double z = (U - expectedU) / stdDev;
194 
195  return z;
196 }
static double u_stat(const std::vector< double > &sample1, const std::vector< double > &sample2)
Calculates U statistic given two samples.
Definition: describe.cpp:38

References gpmp::stats::Describe::u_stat().

Referenced by main().

◆ one_sample_ttest()

double gpmp::stats::HypothesisTest::one_sample_ttest ( const std::vector< double > &  sample,
double  populationMean 
)
static

One-sample t-test.

Parameters
sample — The sample data
populationMean — The population mean
Returns
The t-statistic

Definition at line 42 of file hypothesis.cpp.

43  {
44  int n = sample.size();
45  double sampleMean = gpmp::stats::Describe::mean_arith(sample);
46  double sampleStdDev = gpmp::stats::Describe::stdev(sample, sampleMean);
47  double standardError = sampleStdDev / sqrt(n);
48  return (sampleMean - populationMean) / standardError;
49 }
static double stdev(const std::vector< double > &data, double mean)
Calculates the standard deviation of a given dataset, given the mean.
Definition: describe.cpp:184

References gpmp::stats::Describe::mean_arith(), and gpmp::stats::Describe::stdev().

Referenced by main().

◆ proportion_z_test()

double gpmp::stats::HypothesisTest::proportion_z_test ( double  p1,
double  p2,
double  n1,
double  n2 
)
static

Z-test for proportions.

Parameters
p1 — The proportion for sample 1
p2 — The proportion for sample 2
n1 — The sample size for sample 1
n2 — The sample size for sample 2
Returns
The z-score

Definition at line 125 of file hypothesis.cpp.

128  {
129  double p = (p1 * n1 + p2 * n2) / (n1 + n2);
130  double z = (p1 - p2) / sqrt(p * (1 - p) * (1 / n1 + 1 / n2));
131  return z;
132 }

Referenced by main().

◆ runs_test()

double gpmp::stats::HypothesisTest::runs_test ( const std::vector< bool > &  sequence)
static

Runs Test.

Parameters
sequence — The binary sequence
Returns
The z-score

Definition at line 324 of file hypothesis.cpp.

324  {
325  int n = sequence.size();
326  int numRuns = 1;
327 
328  for (int i = 1; i < n; ++i) {
329  if (sequence[i] != sequence[i - 1]) {
330  numRuns++;
331  }
332  }
333 
334  double expectedRuns = (2.0 * n - 1) / 3.0;
335  double varianceRuns = (16.0 * n - 29) / 90.0;
336  double z = (numRuns - expectedRuns) / sqrt(varianceRuns);
337 
338  return z;
339 }

◆ two_sample_ttest()

double gpmp::stats::HypothesisTest::two_sample_ttest ( const std::vector< double > &  sample1,
const std::vector< double > &  sample2 
)
static

Two-sample t-test.

Parameters
sample1 — The first sample data
sample2 — The second sample data
Returns
The t-statistic

Definition at line 52 of file hypothesis.cpp.

54  {
55  int n1 = sample1.size();
56  int n2 = sample2.size();
57  double sampleMean1 = gpmp::stats::Describe::mean_arith(sample1);
58  double sampleMean2 = gpmp::stats::Describe::mean_arith(sample2);
59  double sampleVar1 = gpmp::stats::Describe::variance(sample1, sampleMean1);
60  double sampleVar2 = gpmp::stats::Describe::variance(sample2, sampleMean2);
61  double pooledVar =
62  ((n1 - 1) * sampleVar1 + (n2 - 1) * sampleVar2) / (n1 + n2 - 2);
63  double t =
64  (sampleMean1 - sampleMean2) / sqrt(pooledVar * (1.0 / n1 + 1.0 / n2));
65  return t;
66 }
static double variance(const std::vector< double > &data, double mean)
Calculates the variance of a given dataset, given the mean.
Definition: describe.cpp:194

References gpmp::stats::Describe::mean_arith(), and gpmp::stats::Describe::variance().

Referenced by main().

◆ wilcoxon_rank_sum_test()

double gpmp::stats::HypothesisTest::wilcoxon_rank_sum_test ( const std::vector< double > &  sample1,
const std::vector< double > &  sample2 
)
static

Wilcoxon Rank Sum Test (Mann-Whitney U Test)

Parameters
sample1 — The first sample data
sample2 — The second sample data
Returns
The U statistic

Definition at line 251 of file hypothesis.cpp.

253  {
254  int n1 = sample1.size();
255  int n2 = sample2.size();
256  std::vector<double> ranks;
257  ranks.reserve(n1 + n2);
258 
259  for (double x : sample1) {
260  ranks.push_back(x);
261  }
262  for (double x : sample2) {
263  ranks.push_back(x);
264  }
265 
266  std::sort(ranks.begin(), ranks.end());
267 
268  double rankSum1 = 0.0;
269  for (double x : sample1) {
270  rankSum1 +=
271  std::distance(ranks.begin(),
272  std::lower_bound(ranks.begin(), ranks.end(), x));
273  }
274 
275  double U1 = rankSum1 - (n1 * (n1 + 1)) / 2.0;
276  double U2 = n1 * n2 - U1;
277 
278  return std::min(U1, U2);
279 }

◆ wilcoxon_rank_test()

double gpmp::stats::HypothesisTest::wilcoxon_rank_test ( const std::vector< double > &  sample1,
const std::vector< double > &  sample2 
)
static

Wilcoxon signed-rank test.

Parameters
sample1 — The first sample data (paired with sample2 element by element)
sample2 — The second sample data
Returns
The z-score

Definition at line 135 of file hypothesis.cpp.

137  {
138  int n = sample1.size();
139  if (n != static_cast<int>(sample2.size())) {
140  std::cerr << "Sample sizes must be equal for Wilcoxon signed-rank test."
141  << std::endl;
142  return std::numeric_limits<double>::quiet_NaN();
143  }
144 
145  std::vector<double> differences;
146  for (int i = 0; i < n; ++i) {
147  differences.push_back(sample1[i] - sample2[i]);
148  }
149  std::sort(differences.begin(), differences.end(), [](double a, double b) {
150  return std::abs(a) < std::abs(b);
151  });
152 
153  double Tplus = 0;
154  double Tminus = 0;
155  int numPositive = 0;
156  int numNegative = 0;
157  for (double diff : differences) {
158  if (diff > 0) {
159  Tplus += diff;
160  numPositive++;
161  } else if (diff < 0) {
162  Tminus -= diff;
163  numNegative++;
164  }
165  }
166  int T = std::min(Tplus, Tminus);
167 
168  // Calculate the critical value using the normal approximation
169  double mean = n * (n + 1) / 4.0;
170  double stdDev = sqrt(n * (n + 1) * (2 * n + 1) / 24.0);
171  double z = (T - mean) / stdDev;
172 
173  return z;
174 }

Referenced by main().


The documentation for this class was generated from the following files: