Line data Source code
1 : /*************************************************************************
2 : *
3 : * Project
4 : * _____ _____ __ __ _____
5 : * / ____| __ \| \/ | __ \
6 : * ___ _ __ ___ _ __ | | __| |__) | \ / | |__) |
7 : * / _ \| '_ \ / _ \ '_ \| | |_ | ___/| |\/| | ___/
8 : *| (_) | |_) | __/ | | | |__| | | | | | | |
9 : * \___/| .__/ \___|_| |_|\_____|_| |_| |_|_|
10 : * | |
11 : * |_|
12 : *
13 : * Copyright (C) Akiel Aries, <akiel@akiel.org>, et al.
14 : *
15 : * This software is licensed as described in the file LICENSE, which
16 : * you should have received as part of this distribution. The terms
17 : * among other details are referenced in the official documentation
18 : * seen here : https://akielaries.github.io/openGPMP/ along with
19 : * important files seen in this project.
20 : *
21 : * You may opt to use, copy, modify, merge, publish, distribute
22 : * and/or sell copies of the Software, and permit persons to whom
23 : * the Software is furnished to do so, under the terms of the
24 : * LICENSE file.
25 : *
26 : *
27 : *
28 : * This software is distributed on an AS IS basis, WITHOUT
29 : * WARRANTY OF ANY KIND, either express or implied.
30 : *
31 : ************************************************************************/
32 : #include <algorithm>
33 : #include <openGPMP/stats/resampling.hpp>
34 : #include <random>
35 : #include <stdexcept>
36 : #include <vector>
37 :
38 : // Resample with replacement
39 : std::vector<int>
40 0 : gpmp::stats::Resampling::bootstrap(const std::vector<int> &data,
41 : int numSamples) {
42 0 : std::vector<int> resampledData;
43 0 : resampledData.reserve(numSamples);
44 :
45 0 : std::mt19937 rng(std::random_device{}());
46 0 : std::uniform_int_distribution<int> dist(0, data.size() - 1);
47 :
48 0 : for (int i = 0; i < numSamples; ++i) {
49 0 : int index = dist(rng);
50 0 : resampledData.push_back(data[index]);
51 : }
52 :
53 0 : return resampledData;
54 0 : }
55 :
56 : // Resample without replacement
57 : std::vector<int>
58 0 : gpmp::stats::Resampling::subsample(const std::vector<int> &data,
59 : int numSamples) {
60 0 : if (numSamples > static_cast<int>(data.size())) {
61 0 : throw std::invalid_argument(
62 0 : "Number of samples cannot exceed data size");
63 : }
64 :
65 0 : std::vector<int> resampledData = data;
66 0 : std::shuffle(resampledData.begin(),
67 : resampledData.end(),
68 0 : std::mt19937(std::random_device()()));
69 :
70 0 : resampledData.resize(numSamples);
71 :
72 0 : return resampledData;
73 0 : }
74 :
75 : // Jackknife resampling
76 : std::vector<std::vector<int>>
77 0 : gpmp::stats::Resampling::jackknife(const std::vector<int> &data) {
78 0 : int n = data.size();
79 0 : std::vector<std::vector<int>> resampledDatasets;
80 0 : resampledDatasets.reserve(n);
81 :
82 0 : for (int i = 0; i < n; ++i) {
83 0 : std::vector<int> resampledData = data;
84 0 : resampledData.erase(resampledData.begin() + i);
85 0 : resampledDatasets.push_back(resampledData);
86 0 : }
87 :
88 0 : return resampledDatasets;
89 0 : }
90 :
91 : // Permutation test
92 : std::vector<std::vector<int>>
93 0 : gpmp::stats::Resampling::permutation_test(const std::vector<int> &data,
94 : int numPermutations) {
95 0 : std::vector<std::vector<int>> permutedDatasets;
96 0 : permutedDatasets.reserve(numPermutations);
97 :
98 0 : std::mt19937 rng(std::random_device{}());
99 :
100 0 : for (int i = 0; i < numPermutations; ++i) {
101 0 : std::vector<int> permutedData = data;
102 0 : std::shuffle(permutedData.begin(), permutedData.end(), rng);
103 0 : permutedDatasets.push_back(permutedData);
104 0 : }
105 :
106 0 : return permutedDatasets;
107 0 : }
108 :
109 : // Bootstrap-t method
110 : std::vector<double>
111 0 : gpmp::stats::Resampling::bootstrap_t(const std::vector<double> &data,
112 : int numSamples) {
113 0 : std::vector<double> resampledMeans;
114 0 : resampledMeans.reserve(numSamples);
115 :
116 0 : std::mt19937 rng(std::random_device{}());
117 0 : std::uniform_int_distribution<int> dist(0, data.size() - 1);
118 :
119 0 : for (int i = 0; i < numSamples; ++i) {
120 0 : std::vector<double> resampledData;
121 0 : resampledData.reserve(data.size());
122 0 : for (int j = 0; j < static_cast<int>(data.size()); ++j) {
123 0 : int index = dist(rng);
124 0 : resampledData.push_back(data[index]);
125 : }
126 : double mean =
127 0 : std::accumulate(resampledData.begin(), resampledData.end(), 0.0) /
128 0 : resampledData.size();
129 0 : resampledMeans.push_back(mean);
130 0 : }
131 :
132 0 : return resampledMeans;
133 0 : }
134 :
135 : // Bootstrapped confidence interval
136 : std::pair<double, double>
137 0 : gpmp::stats::Resampling::bootstrap_ci(const std::vector<double> &data,
138 : double alpha,
139 : int numSamples) {
140 0 : std::vector<double> resampledMeans = bootstrap_t(data, numSamples);
141 0 : std::sort(resampledMeans.begin(), resampledMeans.end());
142 0 : int lowerIndex = static_cast<int>((alpha / 2) * numSamples);
143 0 : int upperIndex = static_cast<int>((1 - alpha / 2) * numSamples) - 1;
144 0 : return std::make_pair(resampledMeans[lowerIndex],
145 0 : resampledMeans[upperIndex]);
146 0 : }
147 :
148 : // Smoothed bootstrap
149 : std::vector<double>
150 0 : gpmp::stats::Resampling::smoothed_bootstrap(const std::vector<double> &data,
151 : int numSamples) {
152 0 : std::vector<double> resampledData;
153 0 : resampledData.reserve(numSamples);
154 :
155 0 : std::mt19937 rng(std::random_device{}());
156 0 : std::uniform_int_distribution<int> dist(0, data.size() - 1);
157 :
158 0 : for (int i = 0; i < numSamples; ++i) {
159 0 : double sum = 0.0;
160 0 : for (int j = 0; j < static_cast<int>(data.size()); ++j) {
161 0 : int index = dist(rng);
162 0 : sum += data[index];
163 : }
164 0 : resampledData.push_back(sum / data.size());
165 : }
166 :
167 0 : return resampledData;
168 0 : }
169 :
170 : // Circular block bootstrap
171 0 : std::vector<double> gpmp::stats::Resampling::circular_block_bootstrap(
172 : const std::vector<double> &data,
173 : int blockSize,
174 : int numSamples) {
175 :
176 0 : if (blockSize <= 0 || blockSize > static_cast<int>(data.size())) {
177 0 : throw std::invalid_argument("Invalid block size");
178 : }
179 :
180 0 : std::vector<double> resampledData;
181 0 : resampledData.reserve(numSamples);
182 :
183 0 : std::mt19937 rng(std::random_device{}());
184 0 : std::uniform_int_distribution<int> dist(0, data.size() - 1);
185 :
186 0 : for (int i = 0; i < numSamples; ++i) {
187 0 : std::vector<double> blockMeans;
188 0 : blockMeans.reserve(data.size() / blockSize);
189 0 : for (int j = 0; j < static_cast<int>(data.size()) / blockSize; ++j) {
190 0 : double sum = 0.0;
191 0 : for (int k = 0; k < blockSize; ++k) {
192 0 : int index = dist(rng);
193 0 : sum += data[index];
194 : }
195 0 : blockMeans.push_back(sum / blockSize);
196 : }
197 0 : std::shuffle(blockMeans.begin(), blockMeans.end(), rng);
198 0 : resampledData.insert(resampledData.end(),
199 : blockMeans.begin(),
200 : blockMeans.end());
201 0 : }
202 :
203 0 : return resampledData;
204 0 : }
205 :
206 : // Time series bootstrap
207 : std::vector<double>
208 0 : gpmp::stats::Resampling::time_series_bootstrap(const std::vector<double> &data,
209 : int numSamples) {
210 0 : std::vector<double> resampledData;
211 0 : resampledData.reserve(numSamples);
212 :
213 0 : std::mt19937 rng(std::random_device{}());
214 0 : std::uniform_int_distribution<int> dist(0, data.size() - 1);
215 :
216 0 : for (int i = 0; i < numSamples; ++i) {
217 0 : std::vector<double> resampledSequence;
218 0 : resampledSequence.reserve(data.size());
219 0 : int startIndex = dist(rng);
220 0 : for (int j = 0; j < static_cast<int>(data.size()); ++j) {
221 0 : int index = (startIndex + j) % data.size();
222 0 : resampledSequence.push_back(data[index]);
223 : }
224 0 : resampledData.insert(resampledData.end(),
225 : resampledSequence.begin(),
226 : resampledSequence.end());
227 0 : }
228 :
229 0 : return resampledData;
230 0 : }
231 :
232 : std::vector<double>
233 0 : gpmp::stats::Resampling::weighted_bootstrap(const std::vector<double> &data,
234 : const std::vector<double> &weights,
235 : int size) {
236 0 : std::vector<double> resampledData;
237 0 : std::random_device rd;
238 0 : std::mt19937 gen(rd());
239 0 : std::discrete_distribution<> dis(weights.begin(), weights.end());
240 0 : for (int i = 0; i < size; ++i) {
241 0 : resampledData.push_back(data[dis(gen)]);
242 : }
243 0 : return resampledData;
244 0 : }
245 :
246 : double
247 0 : gpmp::stats::Resampling::permutation_p_value(const std::vector<double> &data1,
248 : const std::vector<double> &data2,
249 : double observedStatistic) {
250 0 : int count = 0;
251 0 : std::vector<double> combinedData = data1;
252 0 : combinedData.insert(combinedData.end(), data2.begin(), data2.end());
253 0 : std::shuffle(combinedData.begin(),
254 : combinedData.end(),
255 0 : std::mt19937(std::random_device()()));
256 0 : std::vector<double> permutedData1(data1.begin(), data1.end());
257 0 : std::vector<double> permutedData2(data2.begin(), data2.end());
258 0 : for (int i = 0; i < 1000; ++i) {
259 0 : std::shuffle(combinedData.begin(),
260 : combinedData.end(),
261 0 : std::mt19937(std::random_device()()));
262 : auto permutedStatistic =
263 0 : (std::accumulate(combinedData.begin(),
264 0 : combinedData.begin() + data1.size(),
265 0 : 0.0)) /
266 0 : data1.size();
267 0 : if (permutedStatistic >= observedStatistic) {
268 0 : count++;
269 : }
270 : }
271 0 : return count / 1000.0;
272 0 : }
|