openGPMP
Open Source Mathematics Package
resampling.cpp
Go to the documentation of this file.
1 /*************************************************************************
2  *
3  * Project
4  * _____ _____ __ __ _____
5  * / ____| __ \| \/ | __ \
6  * ___ _ __ ___ _ __ | | __| |__) | \ / | |__) |
7  * / _ \| '_ \ / _ \ '_ \| | |_ | ___/| |\/| | ___/
8  *| (_) | |_) | __/ | | | |__| | | | | | | |
9  * \___/| .__/ \___|_| |_|\_____|_| |_| |_|_|
10  * | |
11  * |_|
12  *
13  * Copyright (C) Akiel Aries, <akiel@akiel.org>, et al.
14  *
15  * This software is licensed as described in the file LICENSE, which
16  * you should have received as part of this distribution. The terms
17  * among other details are referenced in the official documentation
18  * seen here : https://akielaries.github.io/openGPMP/ along with
19  * important files seen in this project.
20  *
21  * You may opt to use, copy, modify, merge, publish, distribute
22  * and/or sell copies of the Software, and permit persons to whom
23  * the Software is furnished to do so, under the terms of the
24  * LICENSE file.
25  *
26  *
27  *
28  * This software is distributed on an AS IS basis, WITHOUT
29  * WARRANTY OF ANY KIND, either express or implied.
30  *
31  ************************************************************************/
32 #include <algorithm>
34 #include <random>
35 #include <stdexcept>
36 #include <vector>
37 
38 // Resample with replacement
39 std::vector<int>
40 gpmp::stats::Resampling::bootstrap(const std::vector<int> &data,
41  int numSamples) {
42  std::vector<int> resampledData;
43  resampledData.reserve(numSamples);
44 
45  std::mt19937 rng(std::random_device{}());
46  std::uniform_int_distribution<int> dist(0, data.size() - 1);
47 
48  for (int i = 0; i < numSamples; ++i) {
49  int index = dist(rng);
50  resampledData.push_back(data[index]);
51  }
52 
53  return resampledData;
54 }
55 
56 // Resample without replacement
57 std::vector<int>
58 gpmp::stats::Resampling::subsample(const std::vector<int> &data,
59  int numSamples) {
60  if (numSamples > static_cast<int>(data.size())) {
61  throw std::invalid_argument(
62  "Number of samples cannot exceed data size");
63  }
64 
65  std::vector<int> resampledData = data;
66  std::shuffle(resampledData.begin(),
67  resampledData.end(),
68  std::mt19937(std::random_device()()));
69 
70  resampledData.resize(numSamples);
71 
72  return resampledData;
73 }
74 
75 // Jackknife resampling
76 std::vector<std::vector<int>>
77 gpmp::stats::Resampling::jackknife(const std::vector<int> &data) {
78  int n = data.size();
79  std::vector<std::vector<int>> resampledDatasets;
80  resampledDatasets.reserve(n);
81 
82  for (int i = 0; i < n; ++i) {
83  std::vector<int> resampledData = data;
84  resampledData.erase(resampledData.begin() + i);
85  resampledDatasets.push_back(resampledData);
86  }
87 
88  return resampledDatasets;
89 }
90 
91 // Permutation test
92 std::vector<std::vector<int>>
93 gpmp::stats::Resampling::permutation_test(const std::vector<int> &data,
94  int numPermutations) {
95  std::vector<std::vector<int>> permutedDatasets;
96  permutedDatasets.reserve(numPermutations);
97 
98  std::mt19937 rng(std::random_device{}());
99 
100  for (int i = 0; i < numPermutations; ++i) {
101  std::vector<int> permutedData = data;
102  std::shuffle(permutedData.begin(), permutedData.end(), rng);
103  permutedDatasets.push_back(permutedData);
104  }
105 
106  return permutedDatasets;
107 }
108 
109 // Bootstrap-t method
110 std::vector<double>
111 gpmp::stats::Resampling::bootstrap_t(const std::vector<double> &data,
112  int numSamples) {
113  std::vector<double> resampledMeans;
114  resampledMeans.reserve(numSamples);
115 
116  std::mt19937 rng(std::random_device{}());
117  std::uniform_int_distribution<int> dist(0, data.size() - 1);
118 
119  for (int i = 0; i < numSamples; ++i) {
120  std::vector<double> resampledData;
121  resampledData.reserve(data.size());
122  for (int j = 0; j < static_cast<int>(data.size()); ++j) {
123  int index = dist(rng);
124  resampledData.push_back(data[index]);
125  }
126  double mean =
127  std::accumulate(resampledData.begin(), resampledData.end(), 0.0) /
128  resampledData.size();
129  resampledMeans.push_back(mean);
130  }
131 
132  return resampledMeans;
133 }
134 
135 // Bootstrapped confidence interval
136 std::pair<double, double>
137 gpmp::stats::Resampling::bootstrap_ci(const std::vector<double> &data,
138  double alpha,
139  int numSamples) {
140  std::vector<double> resampledMeans = bootstrap_t(data, numSamples);
141  std::sort(resampledMeans.begin(), resampledMeans.end());
142  int lowerIndex = static_cast<int>((alpha / 2) * numSamples);
143  int upperIndex = static_cast<int>((1 - alpha / 2) * numSamples) - 1;
144  return std::make_pair(resampledMeans[lowerIndex],
145  resampledMeans[upperIndex]);
146 }
147 
148 // Smoothed bootstrap
149 std::vector<double>
150 gpmp::stats::Resampling::smoothed_bootstrap(const std::vector<double> &data,
151  int numSamples) {
152  std::vector<double> resampledData;
153  resampledData.reserve(numSamples);
154 
155  std::mt19937 rng(std::random_device{}());
156  std::uniform_int_distribution<int> dist(0, data.size() - 1);
157 
158  for (int i = 0; i < numSamples; ++i) {
159  double sum = 0.0;
160  for (int j = 0; j < static_cast<int>(data.size()); ++j) {
161  int index = dist(rng);
162  sum += data[index];
163  }
164  resampledData.push_back(sum / data.size());
165  }
166 
167  return resampledData;
168 }
169 
170 // Circular block bootstrap
172  const std::vector<double> &data,
173  int blockSize,
174  int numSamples) {
175 
176  if (blockSize <= 0 || blockSize > static_cast<int>(data.size())) {
177  throw std::invalid_argument("Invalid block size");
178  }
179 
180  std::vector<double> resampledData;
181  resampledData.reserve(numSamples);
182 
183  std::mt19937 rng(std::random_device{}());
184  std::uniform_int_distribution<int> dist(0, data.size() - 1);
185 
186  for (int i = 0; i < numSamples; ++i) {
187  std::vector<double> blockMeans;
188  blockMeans.reserve(data.size() / blockSize);
189  for (int j = 0; j < static_cast<int>(data.size()) / blockSize; ++j) {
190  double sum = 0.0;
191  for (int k = 0; k < blockSize; ++k) {
192  int index = dist(rng);
193  sum += data[index];
194  }
195  blockMeans.push_back(sum / blockSize);
196  }
197  std::shuffle(blockMeans.begin(), blockMeans.end(), rng);
198  resampledData.insert(resampledData.end(),
199  blockMeans.begin(),
200  blockMeans.end());
201  }
202 
203  return resampledData;
204 }
205 
206 // Time series bootstrap
207 std::vector<double>
208 gpmp::stats::Resampling::time_series_bootstrap(const std::vector<double> &data,
209  int numSamples) {
210  std::vector<double> resampledData;
211  resampledData.reserve(numSamples);
212 
213  std::mt19937 rng(std::random_device{}());
214  std::uniform_int_distribution<int> dist(0, data.size() - 1);
215 
216  for (int i = 0; i < numSamples; ++i) {
217  std::vector<double> resampledSequence;
218  resampledSequence.reserve(data.size());
219  int startIndex = dist(rng);
220  for (int j = 0; j < static_cast<int>(data.size()); ++j) {
221  int index = (startIndex + j) % data.size();
222  resampledSequence.push_back(data[index]);
223  }
224  resampledData.insert(resampledData.end(),
225  resampledSequence.begin(),
226  resampledSequence.end());
227  }
228 
229  return resampledData;
230 }
231 
232 std::vector<double>
233 gpmp::stats::Resampling::weighted_bootstrap(const std::vector<double> &data,
234  const std::vector<double> &weights,
235  int size) {
236  std::vector<double> resampledData;
237  std::random_device rd;
238  std::mt19937 gen(rd());
239  std::discrete_distribution<> dis(weights.begin(), weights.end());
240  for (int i = 0; i < size; ++i) {
241  resampledData.push_back(data[dis(gen)]);
242  }
243  return resampledData;
244 }
245 
246 double
247 gpmp::stats::Resampling::permutation_p_value(const std::vector<double> &data1,
248  const std::vector<double> &data2,
249  double observedStatistic) {
250  int count = 0;
251  std::vector<double> combinedData = data1;
252  combinedData.insert(combinedData.end(), data2.begin(), data2.end());
253  std::shuffle(combinedData.begin(),
254  combinedData.end(),
255  std::mt19937(std::random_device()()));
256  std::vector<double> permutedData1(data1.begin(), data1.end());
257  std::vector<double> permutedData2(data2.begin(), data2.end());
258  for (int i = 0; i < 1000; ++i) {
259  std::shuffle(combinedData.begin(),
260  combinedData.end(),
261  std::mt19937(std::random_device()()));
262  auto permutedStatistic =
263  (std::accumulate(combinedData.begin(),
264  combinedData.begin() + data1.size(),
265  0.0)) /
266  data1.size();
267  if (permutedStatistic >= observedStatistic) {
268  count++;
269  }
270  }
271  return count / 1000.0;
272 }
static std::vector< double > bootstrap_t(const std::vector< double > &data, int numSamples)
Perform bootstrap t-statistic resampling.
Definition: resampling.cpp:111
static std::vector< double > circular_block_bootstrap(const std::vector< double > &data, int blockSize, int numSamples)
Perform circular block bootstrap resampling.
Definition: resampling.cpp:171
static std::vector< double > smoothed_bootstrap(const std::vector< double > &data, int numSamples)
Perform smoothed bootstrap resampling.
Definition: resampling.cpp:150
static std::vector< double > weighted_bootstrap(const std::vector< double > &data, const std::vector< double > &weights, int size)
Perform weighted bootstrap resampling.
Definition: resampling.cpp:233
static std::vector< int > subsample(const std::vector< int > &data, int numSamples)
Perform subsampling.
Definition: resampling.cpp:58
static std::vector< double > time_series_bootstrap(const std::vector< double > &data, int numSamples)
Perform time series bootstrap resampling.
Definition: resampling.cpp:208
static double permutation_p_value(const std::vector< double > &data1, const std::vector< double > &data2, double observedStatistic)
Calculate the p-value using permutation test.
Definition: resampling.cpp:247
static std::vector< std::vector< int > > jackknife(const std::vector< int > &data)
Perform jackknife resampling.
Definition: resampling.cpp:77
static std::vector< std::vector< int > > permutation_test(const std::vector< int > &data, int numPermutations)
Perform permutation test.
Definition: resampling.cpp:93
static std::pair< double, double > bootstrap_ci(const std::vector< double > &data, double alpha, int numSamples)
Calculate confidence interval using bootstrap.
Definition: resampling.cpp:137
static std::vector< int > bootstrap(const std::vector< int > &data, int numSamples)
Perform bootstrap resampling.
Definition: resampling.cpp:40