Line data Source code
1 : /*************************************************************************
2 : *
3 : * Project
4 : * _____ _____ __ __ _____
5 : * / ____| __ \| \/ | __ \
6 : * ___ _ __ ___ _ __ | | __| |__) | \ / | |__) |
7 : * / _ \| '_ \ / _ \ '_ \| | |_ | ___/| |\/| | ___/
8 : *| (_) | |_) | __/ | | | |__| | | | | | | |
9 : * \___/| .__/ \___|_| |_|\_____|_| |_| |_|_|
10 : * | |
11 : * |_|
12 : *
13 : * Copyright (C) Akiel Aries, <akiel@akiel.org>, et al.
14 : *
15 : * This software is licensed as described in the file LICENSE, which
16 : * you should have received as part of this distribution. The terms
17 : * among other details are referenced in the official documentation
18 : * seen here : https://akielaries.github.io/openGPMP/ along with
19 : * important files seen in this project.
20 : *
21 : * You may opt to use, copy, modify, merge, publish, distribute
22 : * and/or sell copies of the Software, and permit persons to whom
23 : * the Software is furnished to do so, under the terms of the
24 : * LICENSE file. As this is an Open Source effort, all implementations
25 : * must be of the same methodology.
26 : *
27 : *
28 : *
29 : * This software is distributed on an AS IS basis, WITHOUT
30 : * WARRANTY OF ANY KIND, either express or implied.
31 : *
32 : ************************************************************************/
#include <algorithm>
#include <cmath>
#include <iostream>
#include <limits>
#include <numeric>
#include <openGPMP/ml/bayes_clf.hpp>
#include <string>
#include <unordered_map>
#include <vector>
41 :
42 0 : gpmp::ml::BayesClf::BayesClf(double alpha_param,
43 : bool fit_prior_param,
44 0 : const std::vector<double> &class_prior)
45 0 : : alpha(alpha_param), fit_prior(fit_prior_param),
46 0 : class_log_prior(class_prior.begin(), class_prior.end()) {
47 0 : }
48 :
49 0 : gpmp::ml::BayesClf::~BayesClf() {
50 0 : }
51 :
52 0 : void gpmp::ml::BayesClf::train(const std::vector<std::vector<double>> &data,
53 : const std::vector<std::string> &labels) {
54 : // count class occurrences
55 0 : for (const auto &label : labels) {
56 0 : class_probs[label] += 1.0;
57 : }
58 :
59 : // count feature occurrences for each class
60 0 : for (size_t i = 0; i < data.size(); ++i) {
61 0 : const std::string &label = labels[i];
62 0 : const std::vector<double> &features = data[i];
63 :
64 0 : class_probs[label] += 1.0;
65 :
66 : // initialize feature_probs[label] if not present
67 0 : if (feature_probs.find(label) == feature_probs.end()) {
68 0 : feature_probs[label] = std::vector<double>(features.size(), 0.0);
69 : }
70 :
71 0 : for (size_t j = 0; j < features.size(); ++j) {
72 0 : feature_probs[label][j] += features[j];
73 : }
74 : }
75 :
76 : // calculate class probabilities and feature probabilities
77 0 : double smoothing_factor = alpha * 2.0;
78 0 : for (const auto &entry : class_probs) {
79 0 : const std::string &label = entry.first;
80 0 : double class_count = entry.second;
81 :
82 : // calculate class probability
83 0 : class_probs[label] =
84 0 : (class_count + alpha) / (data.size() + smoothing_factor);
85 :
86 : // calculate feature probabilities
87 0 : for (size_t j = 0; j < feature_probs[label].size(); ++j) {
88 0 : feature_probs[label][j] = (feature_probs[label][j] + alpha) /
89 0 : (class_count + smoothing_factor);
90 : }
91 : }
92 :
93 : // calculate class log priors
94 0 : if (fit_prior) {
95 0 : double total = std::accumulate(
96 : class_probs.begin(),
97 : class_probs.end(),
98 : 0.0,
99 0 : [](double sum, const auto &entry) { return sum + entry.second; });
100 :
101 0 : for (auto &entry : class_probs) {
102 0 : entry.second /= total;
103 : }
104 :
105 0 : std::transform(
106 : class_probs.begin(),
107 : class_probs.end(),
108 : class_log_prior.begin(),
109 0 : [total](const auto &entry) { return log(entry.second); });
110 : }
111 0 : }
112 :
113 : std::string
114 0 : gpmp::ml::BayesClf::predict(const std::vector<double> &new_data) const {
115 0 : double max_prob = -std::numeric_limits<double>::infinity();
116 0 : std::string predicted_class;
117 :
118 0 : for (const auto &entry : class_probs) {
119 0 : const std::string &label = entry.first;
120 0 : double probability = log(entry.second);
121 :
122 0 : for (size_t j = 0; j < new_data.size(); ++j) {
123 0 : probability += new_data[j] * log(feature_probs.at(label).at(j));
124 : }
125 :
126 0 : if (probability > max_prob) {
127 0 : max_prob = probability;
128 0 : predicted_class = label;
129 : }
130 : }
131 :
132 0 : return predicted_class;
133 0 : }
134 :
135 0 : void gpmp::ml::BayesClf::display() const {
136 0 : std::cout << "Class Probabilities:\n";
137 0 : for (const auto &entry : class_probs) {
138 0 : std::cout << entry.first << ": " << entry.second << "\n";
139 : }
140 :
141 0 : std::cout << "\nFeature Probabilities:\n";
142 0 : for (const auto &class_entry : feature_probs) {
143 0 : std::cout << class_entry.first << ":\n";
144 0 : for (size_t j = 0; j < class_entry.second.size(); ++j) {
145 0 : std::cout << " Feature " << j << ": " << class_entry.second[j]
146 0 : << "\n";
147 : }
148 : }
149 :
150 0 : std::cout << "\nClass Log Priors:\n";
151 0 : for (const auto &logPrior : class_log_prior) {
152 0 : std::cout << logPrior << "\n";
153 : }
154 0 : }
155 0 : void gpmp::ml::BayesBernoulli::train(
156 : const std::vector<std::vector<size_t>> &data,
157 : const std::vector<std::string> &labels) {
158 0 : size_t numInstances = data.size();
159 0 : size_t num_feats = data[0].size();
160 :
161 0 : for (size_t i = 0; i < numInstances; ++i) {
162 0 : std::string classLabel = labels[i];
163 :
164 : // update class probabilities
165 0 : class_probs[classLabel] += 1.0;
166 :
167 : // update feature probabilities
168 0 : for (size_t j = 0; j < num_feats; ++j) {
169 0 : feat_probs[classLabel][j] += data[i][j];
170 : }
171 0 : }
172 :
173 : // laplace smoothing
174 0 : double smoothing_factor = alpha * 2.0;
175 0 : for (auto &entry : class_probs) {
176 0 : entry.second =
177 0 : (entry.second + alpha) / (numInstances + smoothing_factor);
178 : }
179 :
180 0 : for (auto &class_entry : feat_probs) {
181 0 : for (auto &feat_entry : class_entry.second) {
182 0 : feat_entry.second =
183 0 : (feat_entry.second + alpha) /
184 0 : (class_probs[class_entry.first] + smoothing_factor);
185 : }
186 : }
187 0 : }
188 :
189 : // predict the class of a new data point
190 : std::string
191 0 : gpmp::ml::BayesBernoulli::predict(const std::vector<size_t> &new_data) const {
192 0 : double max_prob = -std::numeric_limits<double>::infinity();
193 0 : std::string predicted_class;
194 :
195 0 : for (const auto &class_entry : class_probs) {
196 0 : double probability = log(class_entry.second);
197 :
198 0 : for (size_t i = 0; i < new_data.size(); ++i) {
199 0 : probability +=
200 0 : new_data[i] * log(feat_probs.at(class_entry.first).at(i));
201 : }
202 :
203 0 : if (probability > max_prob) {
204 0 : max_prob = probability;
205 0 : predicted_class = class_entry.first;
206 : }
207 : }
208 :
209 0 : return predicted_class;
210 0 : }
211 :
212 0 : void gpmp::ml::BayesBernoulli::display() const {
213 0 : std::cout << "Class Probabilities:\n";
214 0 : for (const auto &entry : class_probs) {
215 0 : std::cout << entry.first << ": " << entry.second << "\n";
216 : }
217 :
218 0 : std::cout << "\nFeature Probabilities:\n";
219 0 : for (const auto &class_entry : feat_probs) {
220 0 : std::cout << class_entry.first << ":\n";
221 0 : for (const auto &feat_entry : class_entry.second) {
222 0 : std::cout << " Feature " << feat_entry.first << ": "
223 0 : << feat_entry.second << "\n";
224 : }
225 : }
226 0 : }
227 :
228 0 : void gpmp::ml::BayesGauss::train(const std::vector<std::vector<double>> &data,
229 : const std::vector<std::string> &labels) {
230 : // calculate class occurrences
231 0 : for (const auto &label : labels) {
232 0 : class_probs[label] += 1.0;
233 : }
234 :
235 : // calculate mean and variance for each feature in each class
236 0 : mean_and_var(data, labels);
237 :
238 : // calculate class probabilities
239 0 : double total_instances = static_cast<double>(data.size());
240 0 : for (auto &entry : class_probs) {
241 0 : entry.second /= total_instances;
242 : }
243 0 : }
244 :
245 0 : void gpmp::ml::BayesGauss::mean_and_var(
246 : const std::vector<std::vector<double>> &data,
247 : const std::vector<std::string> &labels) {
248 0 : size_t num_features = data[0].size();
249 :
250 0 : for (size_t i = 0; i < data.size(); ++i) {
251 0 : const std::string &label = labels[i];
252 0 : const std::vector<double> &features = data[i];
253 :
254 0 : class_probs[label] += 1.0;
255 :
256 : // initialize mean[label] and variance[label] if not present
257 0 : if (mean.find(label) == mean.end()) {
258 0 : mean[label] = std::vector<double>(num_features, 0.0);
259 0 : variance[label] = std::vector<double>(num_features, 0.0);
260 : }
261 :
262 : // update mean
263 0 : for (size_t j = 0; j < num_features; ++j) {
264 0 : mean[label][j] += features[j];
265 : }
266 : }
267 :
268 : // calculate mean
269 0 : for (auto &entry : mean) {
270 0 : const std::string &label = entry.first;
271 0 : double class_count = class_probs[label];
272 :
273 0 : for (size_t j = 0; j < num_features; ++j) {
274 0 : entry.second[j] /= class_count;
275 : }
276 : }
277 :
278 : // calculate variance
279 0 : for (size_t i = 0; i < data.size(); ++i) {
280 0 : const std::string &label = labels[i];
281 0 : const std::vector<double> &features = data[i];
282 :
283 0 : for (size_t j = 0; j < num_features; ++j) {
284 0 : variance[label][j] += std::pow(features[j] - mean[label][j], 2);
285 : }
286 : }
287 :
288 0 : for (auto &entry : variance) {
289 0 : const std::string &label = entry.first;
290 0 : double class_count = class_probs[label];
291 :
292 0 : for (size_t j = 0; j < num_features; ++j) {
293 0 : entry.second[j] /= class_count;
294 : }
295 : }
296 0 : }
297 :
298 : std::string
299 0 : gpmp::ml::BayesGauss::predict(const std::vector<double> &newData) const {
300 0 : double max_prob = -std::numeric_limits<double>::infinity();
301 0 : std::string predicted_class;
302 :
303 0 : for (const auto &entry : class_probs) {
304 0 : const std::string &label = entry.first;
305 0 : double probability = log(entry.second);
306 :
307 0 : for (size_t j = 0; j < newData.size(); ++j) {
308 0 : probability -=
309 0 : 0.5 * (std::log(2 * M_PI * variance.at(label).at(j)) +
310 0 : std::pow(newData[j] - mean.at(label).at(j), 2) /
311 0 : (2 * variance.at(label).at(j)));
312 : }
313 :
314 0 : if (probability > max_prob) {
315 0 : max_prob = probability;
316 0 : predicted_class = label;
317 : }
318 : }
319 :
320 0 : return predicted_class;
321 0 : }
322 :
323 0 : void gpmp::ml::BayesGauss::display() const {
324 0 : std::cout << "Class Probabilities:\n";
325 0 : for (const auto &entry : class_probs) {
326 0 : std::cout << entry.first << ": " << entry.second << "\n";
327 : }
328 :
329 0 : std::cout << "\nMean and Variance:\n";
330 0 : for (const auto &class_entry : mean) {
331 0 : std::cout << class_entry.first << ":\n";
332 0 : std::cout << " Mean: ";
333 0 : for (size_t j = 0; j < class_entry.second.size(); ++j) {
334 0 : std::cout << class_entry.second[j] << " ";
335 : }
336 0 : std::cout << "\n Variance: ";
337 0 : for (size_t j = 0; j < variance.at(class_entry.first).size(); ++j) {
338 0 : std::cout << variance.at(class_entry.first).at(j) << " ";
339 : }
340 0 : std::cout << "\n";
341 : }
342 0 : }
343 :
344 0 : gpmp::ml::BayesMultiNom::BayesMultiNom(double alpha_param,
345 : bool fit_prior_param,
346 0 : const std::vector<double> &class_prior)
347 0 : : alpha(alpha_param), fit_prior(fit_prior_param),
348 0 : class_log_prior(class_prior.begin(), class_prior.end()) {
349 0 : }
350 :
351 0 : gpmp::ml::BayesMultiNom::~BayesMultiNom() {
352 0 : }
353 :
354 0 : void gpmp::ml::BayesMultiNom::train(
355 : const std::vector<std::vector<size_t>> &data,
356 : const std::vector<std::string> &labels) {
357 0 : size_t num_instances = data.size();
358 0 : size_t num_features = data[0].size();
359 :
360 : // count class occurrences
361 0 : for (const auto &label : labels) {
362 0 : class_probs[label] += 1.0;
363 : }
364 :
365 : // count feature occurrences for each class
366 0 : for (size_t i = 0; i < num_instances; ++i) {
367 0 : const std::string &label = labels[i];
368 0 : const std::vector<size_t> &features = data[i];
369 :
370 0 : class_probs[label] += 1.0;
371 :
372 : // Initialize feature_probs[label] if not present
373 0 : if (feature_probs.find(label) == feature_probs.end()) {
374 0 : feature_probs[label] = std::vector<double>(num_features, 0.0);
375 : }
376 :
377 0 : for (size_t j = 0; j < num_features; ++j) {
378 0 : feature_probs[label][j] += features[j];
379 : }
380 : }
381 :
382 : // calculate class probabilities and feature probabilities
383 0 : double smoothing_factor = alpha * num_features;
384 0 : for (const auto &entry : class_probs) {
385 0 : const std::string &label = entry.first;
386 0 : double class_count = entry.second;
387 :
388 : // calculate class probability
389 0 : class_probs[label] =
390 0 : (class_count + alpha) / (num_instances + smoothing_factor);
391 :
392 : // calculate feature probabilities
393 0 : for (size_t j = 0; j < feature_probs[label].size(); ++j) {
394 0 : feature_probs[label][j] = (feature_probs[label][j] + alpha) /
395 0 : (class_count + smoothing_factor);
396 : }
397 : }
398 :
399 : // calculate class log priors
400 0 : if (fit_prior) {
401 0 : double total = std::accumulate(
402 : class_probs.begin(),
403 : class_probs.end(),
404 : 0.0,
405 0 : [](double sum, const auto &entry) { return sum + entry.second; });
406 :
407 0 : for (auto &entry : class_probs) {
408 0 : entry.second /= total;
409 : }
410 :
411 0 : std::transform(
412 : class_probs.begin(),
413 : class_probs.end(),
414 : class_log_prior.begin(),
415 0 : [total](const auto &entry) { return log(entry.second); });
416 : }
417 0 : }
418 :
419 : std::string
420 0 : gpmp::ml::BayesMultiNom::predict(const std::vector<size_t> &new_data) const {
421 0 : double max_prob = -std::numeric_limits<double>::infinity();
422 0 : std::string predicted_class;
423 :
424 0 : for (const auto &entry : class_probs) {
425 0 : const std::string &label = entry.first;
426 0 : double probability = log(entry.second);
427 :
428 0 : for (size_t j = 0; j < new_data.size(); ++j) {
429 0 : probability += new_data[j] * log(feature_probs.at(label).at(j));
430 : }
431 :
432 0 : if (probability > max_prob) {
433 0 : max_prob = probability;
434 0 : predicted_class = label;
435 : }
436 : }
437 :
438 0 : return predicted_class;
439 0 : }
440 :
441 0 : void gpmp::ml::BayesMultiNom::display() const {
442 0 : std::cout << "Class Probabilities:\n";
443 0 : for (const auto &entry : class_probs) {
444 0 : std::cout << entry.first << ": " << entry.second << "\n";
445 : }
446 :
447 0 : std::cout << "\nFeature Probabilities:\n";
448 0 : for (const auto &class_entry : feature_probs) {
449 0 : std::cout << class_entry.first << ":\n";
450 0 : for (size_t j = 0; j < class_entry.second.size(); ++j) {
451 0 : std::cout << " Feature " << j << ": " << class_entry.second[j]
452 0 : << "\n";
453 : }
454 : }
455 :
456 0 : std::cout << "\nClass Log Priors:\n";
457 0 : for (const auto &log_prior : class_log_prior) {
458 0 : std::cout << log_prior << "\n";
459 : }
460 0 : }
|