openGPMP
Open Source Mathematics Package
describe.cpp
Go to the documentation of this file.
1 /*************************************************************************
2  *
3  * Project
4  * _____ _____ __ __ _____
5  * / ____| __ \| \/ | __ \
6  * ___ _ __ ___ _ __ | | __| |__) | \ / | |__) |
7  * / _ \| '_ \ / _ \ '_ \| | |_ | ___/| |\/| | ___/
8  *| (_) | |_) | __/ | | | |__| | | | | | | |
9  * \___/| .__/ \___|_| |_|\_____|_| |_| |_|_|
10  * | |
11  * |_|
12  *
13  * Copyright (C) Akiel Aries, <akiel@akiel.org>, et al.
14  *
15  * This software is licensed as described in the file LICENSE, which
16  * you should have received as part of this distribution. The terms
17  * among other details are referenced in the official documentation
18  * seen here : https://akielaries.github.io/openGPMP/ along with
19  * important files seen in this project.
20  *
21  * You may opt to use, copy, modify, merge, publish, distribute
22  * and/or sell copies of the Software, and permit persons to whom
23  * the Software is furnished to do so, under the terms of the
24  * LICENSE file. As this is an Open Source effort, all implementations
25  * must be of the same methodology.
26  *
27  *
28  *
29  * This software is distributed on an AS IS basis, WITHOUT
30  * WARRANTY OF ANY KIND, either express or implied.
31  *
32  ************************************************************************/
33 #include <algorithm>
34 #include <cmath>
36 #include <vector>
37 
38 double gpmp::stats::Describe::u_stat(const std::vector<double> &sample1,
39  const std::vector<double> &sample2) {
40  double U = 0;
41  for (double x1 : sample1) {
42  for (double x2 : sample2) {
43  if (x1 < x2) {
44  U++;
45  }
46  }
47  }
48  return U;
49 }
50 
51 // Arithmetic Mean
52 double gpmp::stats::Describe::mean_arith(const std::vector<double> &data) {
53  double sum = 0.0;
54  for (const auto &value : data) {
55  sum += value;
56  }
57  return sum / static_cast<double>(data.size());
58 }
59 
60 // Geometric Mean
61 double gpmp::stats::Describe::mean_geo(const std::vector<double> &data) {
62  double product = 1.0;
63  for (const auto &value : data) {
64  product *= value;
65  }
66  return std::pow(product, 1.0 / static_cast<double>(data.size()));
67 }
68 
69 // Cubic Generalized Mean
70 double gpmp::stats::Describe::mean_cubic(const std::vector<double> &data,
71  double p) {
72  double sum = 0.0;
73  for (const auto &value : data) {
74  sum += std::pow(value, p);
75  }
76  return std::pow(sum / static_cast<double>(data.size()), 1.0 / p);
77 }
78 
79 // Power Geometric Mean
80 double gpmp::stats::Describe::mean_geo_pow(const std::vector<double> &data,
81  double p) {
82  double product = 1.0;
83  for (const auto &value : data) {
84  product *= std::pow(value, p);
85  }
86  return std::pow(product, 1.0 / static_cast<double>(data.size()));
87 }
88 
89 // Harmonic Mean
90 double gpmp::stats::Describe::mean_harmonic(const std::vector<double> &data) {
91  double sum = 0.0;
92  for (const auto &value : data) {
93  sum += 1.0 / value;
94  }
95  return static_cast<double>(data.size()) / sum;
96 }
97 
98 // Heronian Mean
99 double gpmp::stats::Describe::mean_heronian(const std::vector<double> &data) {
100  double product = 1.0;
101  for (const auto &value : data) {
102  product *= std::sqrt(value);
103  }
104  return std::pow(product, 2.0 / static_cast<double>(data.size()));
105 }
106 
107 // Heinz Mean
108 double gpmp::stats::Describe::mean_heinz(const std::vector<double> &data) {
109  double sum = 0.0;
110  for (const auto &value : data) {
111  sum += value * std::log(value);
112  }
113  return std::exp(sum / static_cast<double>(data.size()));
114 }
115 
116 // Lehmer Mean
117 double gpmp::stats::Describe::mean_lehmer(const std::vector<double> &data,
118  double p) {
119  double sum = 0.0;
120  for (const auto &value : data) {
121  sum += std::pow(value, p);
122  }
123  return sum / static_cast<double>(data.size());
124 }
125 
126 // Median
127 double gpmp::stats::Describe::Median(std::vector<double> data) {
128  std::sort(data.begin(), data.end());
129  size_t size = data.size();
130  if (size % 2 == 0) {
131  return (data[size / 2 - 1] + data[size / 2]) / 2.0;
132  } else {
133  return data[size / 2];
134  }
135 }
136 
137 // Average Absolute Deviation
138 double gpmp::stats::Describe::avg_abs_dev(const std::vector<double> &data) {
139  double mean = mean_arith(data);
140  double sum = 0.0;
141  for (const auto &value : data) {
142  sum += std::abs(value - mean);
143  }
144  return sum / static_cast<double>(data.size());
145 }
146 
147 // Coefficient of Variation
148 double gpmp::stats::Describe::var_coeff(const std::vector<double> &data) {
149  double mean = mean_arith(data);
150  double stddev = stdev(data, mean);
151  return (stddev / mean) * 100.0; // Multiply by 100 for percentage
152 }
153 
154 // Interquartile Range
155 double gpmp::stats::Describe::iq_range(const std::vector<double> &data) {
156  std::vector<double> sortedData = data;
157  std::sort(sortedData.begin(), sortedData.end());
158 
159  size_t size = sortedData.size();
160  size_t lowerIndex = size / 4;
161  size_t upperIndex = 3 * size / 4;
162 
163  return sortedData[upperIndex] - sortedData[lowerIndex];
164 }
165 
166 // percentile
167 double gpmp::stats::Describe::percentile(const std::vector<double> &data,
168  double percentile) {
169  std::vector<double> sortedData = data;
170  std::sort(sortedData.begin(), sortedData.end());
171 
172  size_t size = sortedData.size();
173  size_t index = static_cast<size_t>(percentile * (size - 1));
174  return sortedData[index];
175 }
176 
177 // Range
178 double gpmp::stats::Describe::range(const std::vector<double> &data) {
179  auto result = std::minmax_element(data.begin(), data.end());
180  return *result.second - *result.first;
181 }
182 
183 // Standard Deviation
184 double gpmp::stats::Describe::stdev(const std::vector<double> &data,
185  double mean) {
186  double sum = 0.0;
187  for (const auto &value : data) {
188  sum += std::pow(value - mean, 2.0);
189  }
190  return std::sqrt(sum / static_cast<double>(data.size()));
191 }
192 
193 // variance
194 double gpmp::stats::Describe::variance(const std::vector<double> &data,
195  double mean) {
196  double sum = 0.0;
197  for (const auto &value : data) {
198  sum += std::pow(value - mean, 2.0);
199  }
200  return sum / static_cast<double>(data.size());
201 }
202 
203 // central limit theorem
204 double gpmp::stats::Describe::clt(const std::vector<double> &data,
205  int numSamples) {
206  double mean = mean_arith(data);
207  double stddev = stdev(data, mean);
208  return stddev / std::sqrt(static_cast<double>(numSamples));
209 }
210 
211 // Kurtosis
212 double gpmp::stats::Describe::kurtosis(const std::vector<double> &data,
213  double mean) {
214  double sum = 0.0;
215  for (const auto &value : data) {
216  sum += std::pow(value - mean, 4.0);
217  }
218  double var = variance(data, mean);
219  return sum / (data.size() * std::pow(var, 2.0)) - 3.0;
220 }
221 
222 // l-moments (first two)
223 double gpmp::stats::Describe::lmoment1(const std::vector<double> &data,
224  double mean) {
225  double sum = 0.0;
226  for (const auto &value : data) {
227  sum += std::pow(value - mean, 3.0);
228  }
229  return sum / data.size();
230 }
231 
232 double gpmp::stats::Describe::lmoment2(const std::vector<double> &data,
233  double mean) {
234  double sum = 0.0;
235  for (const auto &value : data) {
236  sum += std::pow(value - mean, 4.0);
237  }
238  return sum / data.size();
239 }
240 
241 // skewness
242 double gpmp::stats::Describe::skewness(const std::vector<double> &data,
243  double mean,
244  double stddev) {
245  double sum = 0.0;
246  for (const auto &value : data) {
247  sum += std::pow((value - mean) / stddev, 3.0);
248  }
249  return sum / static_cast<double>(data.size());
250 }
251 
252 std::vector<size_t>
253 gpmp::stats::Describe::rank_data(const std::vector<double> &data) {
254  std::vector<size_t> ranks(data.size());
255 
256  for (size_t i = 0; i < data.size(); ++i) {
257  size_t rank = 1;
258  for (size_t j = 0; j < data.size(); ++j) {
259  if (j != i && data[j] < data[i]) {
260  rank++;
261  }
262  }
263  ranks[i] = rank;
264  }
265 
266  return ranks;
267 }
268 
269 double gpmp::stats::Describe::partial_corr(const std::vector<double> &x,
270  const std::vector<double> &y,
271  const std::vector<double> &z) {
272  double r_xy = ppmc(x, y);
273  double r_xz = ppmc(x, z);
274  double r_yz = ppmc(y, z);
275 
276  return (r_xy - (r_xz * r_yz)) /
277  std::sqrt((1.0 - std::pow(r_xz, 2.0)) * (1.0 - std::pow(r_yz, 2.0)));
278 }
279 
280 // Pearson Product-Moment Correlation
281 double gpmp::stats::Describe::ppmc(const std::vector<double> &x,
282  const std::vector<double> &y) {
283  double mean_x = mean_arith(x);
284  double mean_y = mean_arith(y);
285 
286  double numerator = 0.0;
287  double denominator_x = 0.0;
288  double denominator_y = 0.0;
289 
290  for (size_t i = 0; i < x.size(); ++i) {
291  numerator += (x[i] - mean_x) * (y[i] - mean_y);
292  denominator_x += std::pow(x[i] - mean_x, 2.0);
293  denominator_y += std::pow(y[i] - mean_y, 2.0);
294  }
295 
296  return numerator / std::sqrt(denominator_x * denominator_y);
297 }
298 
299 // Kendall's Tau Rank Correlation
300 double gpmp::stats::Describe::kendalls_tau(const std::vector<double> &x,
301  const std::vector<double> &y) {
302  size_t concordant = 0;
303  size_t discordant = 0;
304 
305  for (size_t i = 0; i < x.size() - 1; ++i) {
306  for (size_t j = i + 1; j < x.size(); ++j) {
307  if ((x[i] < x[j] && y[i] < y[j]) || (x[i] > x[j] && y[i] > y[j])) {
308  concordant++;
309  } else if ((x[i] < x[j] && y[i] > y[j]) ||
310  (x[i] > x[j] && y[i] < y[j])) {
311  discordant++;
312  }
313  }
314  }
315 
316  return static_cast<double>(concordant - discordant) /
317  std::sqrt(static_cast<double>((concordant + discordant) *
318  (x.size() * (x.size() - 1)) / 2));
319 }
320 
321 // Spearman's Rank Correlation
322 double gpmp::stats::Describe::spearmans_rho(const std::vector<double> &x,
323  const std::vector<double> &y) {
324  std::vector<size_t> ranks_x = rank_data(x);
325  std::vector<size_t> ranks_y = rank_data(y);
326 
327  double d_squared = 0.0;
328  for (size_t i = 0; i < x.size(); ++i) {
329  d_squared += std::pow(ranks_x[i] - ranks_y[i], 2.0);
330  }
331 
332  return 1.0 -
333  (6.0 * d_squared) / (x.size() * (std::pow(x.size(), 2.0) - 1.0));
334 }
static double u_stat(const std::vector< double > &sample1, const std::vector< double > &sample2)
Calculates U statistic given two samples.
Definition: describe.cpp:38
double kurtosis(const std::vector< double > &data, double mean)
Calculates the kurtosis of a given dataset.
Definition: describe.cpp:212
static double stdev(const std::vector< double > &data, double mean)
Calculates the standard deviation of a given dataset, given the mean.
Definition: describe.cpp:184
double lmoment2(const std::vector< double > &data, double mean)
Calculates the second L-moment of a given dataset.
Definition: describe.cpp:232
double kendalls_tau(const std::vector< double > &x, const std::vector< double > &y)
Calculates Kendall's Tau Rank Correlation between two variables.
Definition: describe.cpp:300
double lmoment1(const std::vector< double > &data, double mean)
Calculates the first L-moment of a given dataset.
Definition: describe.cpp:223
double clt(const std::vector< double > &data, int numSamples)
Calculates the standard error of the mean using the Central Limit Theorem.
Definition: describe.cpp:204
double spearmans_rho(const std::vector< double > &x, const std::vector< double > &y)
Calculates Spearman's Rank Correlation between two variables.
Definition: describe.cpp:322
static double variance(const std::vector< double > &data, double mean)
Calculates the variance of a given dataset, given the mean.
Definition: describe.cpp:194
double mean_lehmer(const std::vector< double > &data, double p)
Calculates the Lehmer mean of a given dataset with a specified power.
Definition: describe.cpp:117
double Median(std::vector< double > data)
Calculates the median of a given dataset.
Definition: describe.cpp:127
double mean_heronian(const std::vector< double > &data)
Calculates the Heronian mean of a given dataset.
Definition: describe.cpp:99
static double mean_arith(const std::vector< double > &data)
Calculates the arithmetic mean of a given dataset.
Definition: describe.cpp:52
double mean_geo_pow(const std::vector< double > &data, double p)
Calculates the power geometric mean of a given dataset with a specified power.
Definition: describe.cpp:80
double percentile(const std::vector< double > &data, double percentile)
Calculates the specified percentile of a given dataset.
Definition: describe.cpp:167
double partial_corr(const std::vector< double > &x, const std::vector< double > &y, const std::vector< double > &z)
Calculates the partial correlation coefficient between two variables, controlling for a third variable.
Definition: describe.cpp:269
double skewness(const std::vector< double > &data, double mean, double stddev)
Calculates the skewness of a given dataset.
Definition: describe.cpp:242
double iq_range(const std::vector< double > &data)
Calculates the interquartile range of a given dataset.
Definition: describe.cpp:155
std::vector< size_t > rank_data(const std::vector< double > &data)
Ranks the data in ascending order.
Definition: describe.cpp:253
double mean_heinz(const std::vector< double > &data)
Calculates the Heinz mean of a given dataset.
Definition: describe.cpp:108
double mean_cubic(const std::vector< double > &data, double p)
Calculates the cubic generalized mean of a given dataset with a specified power.
Definition: describe.cpp:70
double var_coeff(const std::vector< double > &data)
Calculates the coefficient of variation of a given dataset.
Definition: describe.cpp:148
double mean_harmonic(const std::vector< double > &data)
Calculates the harmonic mean of a given dataset.
Definition: describe.cpp:90
double avg_abs_dev(const std::vector< double > &data)
Calculates the average absolute deviation of a given dataset.
Definition: describe.cpp:138
double ppmc(const std::vector< double > &x, const std::vector< double > &y)
Calculates the Pearson Product-Moment Correlation between two variables.
Definition: describe.cpp:281
double range(const std::vector< double > &data)
Calculates the range of a given dataset.
Definition: describe.cpp:178
double mean_geo(const std::vector< double > &data)
Calculates the geometric mean of a given dataset.
Definition: describe.cpp:61