/*************************************************************************
 *
 * Project
 *                          _____ _____  __  __ _____
 *                         / ____|  __ \|  \/  |  __ \
 *   ___  _ __   ___ _ __ | |  __| |__) | \  / | |__) |
 *  / _ \| '_ \ / _ \ '_ \| | |_ |  ___/| |\/| |  ___/
 * | (_) | |_) |  __/ | | | |__| | |    | |  | | |
 *  \___/| .__/ \___|_| |_|\_____|_|    |_|  |_|_|
 *       | |
 *       |_|
 *
 * Copyright (C) Akiel Aries, <akiel@akiel.org>, et al.
 *
 * This software is licensed as described in the file LICENSE, which
 * you should have received as part of this distribution. The terms
 * among other details are referenced in the official documentation
 * seen here: https://akielaries.github.io/openGPMP/ along with
 * important files seen in this project.
 *
 * You may opt to use, copy, modify, merge, publish, distribute
 * and/or sell copies of the Software, and permit persons to whom
 * the Software is furnished to do so, under the terms of the
 * LICENSE file. As this is an Open Source effort, all implementations
 * must be of the same methodology.
 *
 * This software is distributed on an AS IS basis, WITHOUT
 * WARRANTY OF ANY KIND, either express or implied.
 *
 ************************************************************************/
#include <algorithm>
#include <cmath>
#include <iostream>
#include <limits>
#include <numeric>
// project header declaring gpmp::ml::BayesClf, BayesBernoulli, BayesGauss,
// and BayesMultiNom (path assumed from the include ordering)
#include <openGPMP/ml/bayes_clf.hpp>
#include <string>
#include <unordered_map>
#include <vector>

gpmp::ml::BayesClf::BayesClf(double alpha_param,
                             bool fit_prior_param,
                             const std::vector<double> &class_prior)
    : alpha(alpha_param), fit_prior(fit_prior_param),
      class_log_prior(class_prior.begin(), class_prior.end()) {
}

gpmp::ml::BayesClf::~BayesClf() {
}

void gpmp::ml::BayesClf::train(const std::vector<std::vector<double>> &data,
                               const std::vector<std::string> &labels) {
    // count class occurrences
    for (const auto &label : labels) {
        class_probs[label] += 1.0;
    }

    // accumulate feature totals for each class (classes were already
    // counted above, so they are not incremented again here)
    for (size_t i = 0; i < data.size(); ++i) {
        const std::string &label = labels[i];
        const std::vector<double> &features = data[i];

        // initialize feature_probs[label] if not present
        if (feature_probs.find(label) == feature_probs.end()) {
            feature_probs[label] = std::vector<double>(features.size(), 0.0);
        }

        for (size_t j = 0; j < features.size(); ++j) {
            feature_probs[label][j] += features[j];
        }
    }

    // calculate class probabilities and feature probabilities
    double smoothing_factor = alpha * 2.0;
    for (const auto &entry : class_probs) {
        const std::string &label = entry.first;
        double class_count = entry.second;

        // calculate class probability
        class_probs[label] =
            (class_count + alpha) / (data.size() + smoothing_factor);

        // calculate feature probabilities
        for (size_t j = 0; j < feature_probs[label].size(); ++j) {
            feature_probs[label][j] = (feature_probs[label][j] + alpha) /
                                      (class_count + smoothing_factor);
        }
    }

    // calculate class log priors
    if (fit_prior) {
        double total = std::accumulate(
            class_probs.begin(),
            class_probs.end(),
            0.0,
            [](double sum, const auto &entry) { return sum + entry.second; });

        for (auto &entry : class_probs) {
            entry.second /= total;
        }

        // ensure the destination holds one log prior per class before
        // writing through its iterator
        class_log_prior.resize(class_probs.size());
        std::transform(
            class_probs.begin(),
            class_probs.end(),
            class_log_prior.begin(),
            [](const auto &entry) { return std::log(entry.second); });
    }
}
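
/* Written out, the smoothed estimates computed by train() above (with
 * N = data.size(), N_c = the count of class c, and S_cj = the summed value
 * of feature j over instances of class c) are:
 *
 *   P(c)     = (N_c  + alpha) / (N   + 2 * alpha)
 *   P(x_j|c) = (S_cj + alpha) / (N_c + 2 * alpha)
 *
 * i.e. Laplace-smoothed relative frequencies; when fit_prior is set, the
 * class probabilities are re-normalized and their logs are cached in
 * class_log_prior.
 */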

std::string
gpmp::ml::BayesClf::predict(const std::vector<double> &new_data) const {
    double max_prob = -std::numeric_limits<double>::infinity();
    std::string predicted_class;

    for (const auto &entry : class_probs) {
        const std::string &label = entry.first;
        double probability = std::log(entry.second);

        for (size_t j = 0; j < new_data.size(); ++j) {
            probability +=
                new_data[j] * std::log(feature_probs.at(label).at(j));
        }

        if (probability > max_prob) {
            max_prob = probability;
            predicted_class = label;
        }
    }

    return predicted_class;
}
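
/* Usage sketch (illustrative only, not part of this file; the feature
 * values and labels are made up). With the defaults declared in the
 * header (alpha = 1.0, fit_prior = true, empty class_prior), a minimal
 * train/predict round trip looks like:
 *
 *   gpmp::ml::BayesClf clf;
 *   clf.train({{2.0, 1.0}, {0.0, 3.0}, {2.5, 0.5}},
 *             {"spam", "ham", "spam"});
 *   std::string label = clf.predict({1.0, 1.0}); // highest log-posterior
 *   clf.display();
 */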

void gpmp::ml::BayesClf::display() const {
    std::cout << "Class Probabilities:\n";
    for (const auto &entry : class_probs) {
        std::cout << entry.first << ": " << entry.second << "\n";
    }

    std::cout << "\nFeature Probabilities:\n";
    for (const auto &class_entry : feature_probs) {
        std::cout << class_entry.first << ":\n";
        for (size_t j = 0; j < class_entry.second.size(); ++j) {
            std::cout << "  Feature " << j << ": " << class_entry.second[j]
                      << "\n";
        }
    }

    std::cout << "\nClass Log Priors:\n";
    for (const auto &logPrior : class_log_prior) {
        std::cout << logPrior << "\n";
    }
}

void gpmp::ml::BayesBernoulli::train(
    const std::vector<std::vector<size_t>> &data,
    const std::vector<std::string> &labels) {
    size_t num_instances = data.size();
    size_t num_feats = data[0].size();

    for (size_t i = 0; i < num_instances; ++i) {
        const std::string &class_label = labels[i];

        // update class counts
        class_probs[class_label] += 1.0;

        // update feature counts
        for (size_t j = 0; j < num_feats; ++j) {
            feat_probs[class_label][j] += data[i][j];
        }
    }

    // laplace smoothing: smooth the feature counts against the raw class
    // counts first, before those counts are normalized into probabilities
    double smoothing_factor = alpha * 2.0;
    for (auto &class_entry : feat_probs) {
        for (auto &feat_entry : class_entry.second) {
            feat_entry.second =
                (feat_entry.second + alpha) /
                (class_probs[class_entry.first] + smoothing_factor);
        }
    }

    for (auto &entry : class_probs) {
        entry.second =
            (entry.second + alpha) / (num_instances + smoothing_factor);
    }
}

// predict the class of a new data point
std::string
gpmp::ml::BayesBernoulli::predict(const std::vector<size_t> &new_data) const {
    double max_prob = -std::numeric_limits<double>::infinity();
    std::string predicted_class;

    for (const auto &class_entry : class_probs) {
        double probability = std::log(class_entry.second);

        for (size_t i = 0; i < new_data.size(); ++i) {
            probability +=
                new_data[i] * std::log(feat_probs.at(class_entry.first).at(i));
        }

        if (probability > max_prob) {
            max_prob = probability;
            predicted_class = class_entry.first;
        }
    }

    return predicted_class;
}
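
/* Usage sketch (illustrative only; the 0/1 feature vectors are made up,
 * and default construction assumes the constructor declared in the
 * header, not shown in this file). BayesBernoulli expects binary
 * presence/absence features, hence the smoothing denominator of
 * alpha * 2.0 in train(); note that, as written, predict() only scores
 * present features (x = 1), since absent ones contribute zero to the sum:
 *
 *   gpmp::ml::BayesBernoulli clf;
 *   clf.train({{1, 0, 1}, {0, 1, 0}, {1, 1, 1}},
 *             {"pos", "neg", "pos"});
 *   std::string label = clf.predict({1, 0, 0});
 */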

void gpmp::ml::BayesBernoulli::display() const {
    std::cout << "Class Probabilities:\n";
    for (const auto &entry : class_probs) {
        std::cout << entry.first << ": " << entry.second << "\n";
    }

    std::cout << "\nFeature Probabilities:\n";
    for (const auto &class_entry : feat_probs) {
        std::cout << class_entry.first << ":\n";
        for (const auto &feat_entry : class_entry.second) {
            std::cout << "  Feature " << feat_entry.first << ": "
                      << feat_entry.second << "\n";
        }
    }
}

void gpmp::ml::BayesGauss::train(const std::vector<std::vector<double>> &data,
                                 const std::vector<std::string> &labels) {
    // count class occurrences
    for (const auto &label : labels) {
        class_probs[label] += 1.0;
    }

    // calculate mean and variance for each feature in each class
    mean_and_var(data, labels);

    // calculate class probabilities
    double total_instances = static_cast<double>(data.size());
    for (auto &entry : class_probs) {
        entry.second /= total_instances;
    }
}

void gpmp::ml::BayesGauss::mean_and_var(
    const std::vector<std::vector<double>> &data,
    const std::vector<std::string> &labels) {
    size_t num_features = data[0].size();

    for (size_t i = 0; i < data.size(); ++i) {
        const std::string &label = labels[i];
        const std::vector<double> &features = data[i];

        // initialize mean[label] and variance[label] if not present
        if (mean.find(label) == mean.end()) {
            mean[label] = std::vector<double>(num_features, 0.0);
            variance[label] = std::vector<double>(num_features, 0.0);
        }

        // accumulate feature sums for the mean
        for (size_t j = 0; j < num_features; ++j) {
            mean[label][j] += features[j];
        }
    }

    // calculate mean; class_probs still holds the raw class counts here,
    // since train() normalizes them only after this function returns
    for (auto &entry : mean) {
        const std::string &label = entry.first;
        double class_count = class_probs[label];

        for (size_t j = 0; j < num_features; ++j) {
            entry.second[j] /= class_count;
        }
    }

    // calculate variance
    for (size_t i = 0; i < data.size(); ++i) {
        const std::string &label = labels[i];
        const std::vector<double> &features = data[i];

        for (size_t j = 0; j < num_features; ++j) {
            variance[label][j] += std::pow(features[j] - mean[label][j], 2);
        }
    }

    for (auto &entry : variance) {
        const std::string &label = entry.first;
        double class_count = class_probs[label];

        for (size_t j = 0; j < num_features; ++j) {
            entry.second[j] /= class_count;
        }
    }
}
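
/* Note: mean_and_var() is a two-pass estimate. The first pass sums each
 * feature per class and divides by the class count to get the mean; the
 * second pass sums squared deviations from that mean and divides by the
 * same count, i.e. the population (biased) variance. A class observed
 * only once therefore gets zero variance, which makes the log term in
 * predict() below diverge for that class.
 */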

std::string
gpmp::ml::BayesGauss::predict(const std::vector<double> &newData) const {
    double max_prob = -std::numeric_limits<double>::infinity();
    std::string predicted_class;

    for (const auto &entry : class_probs) {
        const std::string &label = entry.first;
        double probability = std::log(entry.second);

        for (size_t j = 0; j < newData.size(); ++j) {
            double var = variance.at(label).at(j);
            double diff = newData[j] - mean.at(label).at(j);

            // gaussian log-density: -0.5 * log(2*pi*var) - diff^2 / (2*var)
            probability -=
                0.5 * std::log(2 * M_PI * var) + (diff * diff) / (2 * var);
        }

        if (probability > max_prob) {
            max_prob = probability;
            predicted_class = label;
        }
    }

    return predicted_class;
}
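
/* Usage sketch (illustrative only; the measurements and labels are made
 * up, and default construction assumes the constructor declared in the
 * header). Continuous features are scored per class with the Gaussian
 * log-density shown above:
 *
 *   gpmp::ml::BayesGauss clf;
 *   clf.train({{5.1, 3.5}, {4.9, 3.0}, {6.7, 3.1}, {6.3, 2.9}},
 *             {"setosa", "setosa", "versicolor", "versicolor"});
 *   std::string label = clf.predict({6.4, 3.0});
 */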

void gpmp::ml::BayesGauss::display() const {
    std::cout << "Class Probabilities:\n";
    for (const auto &entry : class_probs) {
        std::cout << entry.first << ": " << entry.second << "\n";
    }

    std::cout << "\nMean and Variance:\n";
    for (const auto &class_entry : mean) {
        std::cout << class_entry.first << ":\n";
        std::cout << "  Mean: ";
        for (size_t j = 0; j < class_entry.second.size(); ++j) {
            std::cout << class_entry.second[j] << " ";
        }
        std::cout << "\n  Variance: ";
        for (size_t j = 0; j < variance.at(class_entry.first).size(); ++j) {
            std::cout << variance.at(class_entry.first).at(j) << " ";
        }
        std::cout << "\n";
    }
}

gpmp::ml::BayesMultiNom::BayesMultiNom(double alpha_param,
                                       bool fit_prior_param,
                                       const std::vector<double> &class_prior)
    : alpha(alpha_param), fit_prior(fit_prior_param),
      class_log_prior(class_prior.begin(), class_prior.end()) {
}

gpmp::ml::BayesMultiNom::~BayesMultiNom() {
}

void gpmp::ml::BayesMultiNom::train(
    const std::vector<std::vector<size_t>> &data,
    const std::vector<std::string> &labels) {
    size_t num_instances = data.size();
    size_t num_features = data[0].size();

    // count class occurrences
    for (const auto &label : labels) {
        class_probs[label] += 1.0;
    }

    // count feature occurrences for each class (classes were already
    // counted above, so they are not incremented again here)
    for (size_t i = 0; i < num_instances; ++i) {
        const std::string &label = labels[i];
        const std::vector<size_t> &features = data[i];

        // initialize feature_probs[label] if not present
        if (feature_probs.find(label) == feature_probs.end()) {
            feature_probs[label] = std::vector<double>(num_features, 0.0);
        }

        for (size_t j = 0; j < num_features; ++j) {
            feature_probs[label][j] += features[j];
        }
    }

    // calculate class probabilities and feature probabilities
    double smoothing_factor = alpha * num_features;
    for (const auto &entry : class_probs) {
        const std::string &label = entry.first;
        double class_count = entry.second;

        // calculate class probability
        class_probs[label] =
            (class_count + alpha) / (num_instances + smoothing_factor);

        // calculate feature probabilities
        for (size_t j = 0; j < feature_probs[label].size(); ++j) {
            feature_probs[label][j] = (feature_probs[label][j] + alpha) /
                                      (class_count + smoothing_factor);
        }
    }

    // calculate class log priors
    if (fit_prior) {
        double total = std::accumulate(
            class_probs.begin(),
            class_probs.end(),
            0.0,
            [](double sum, const auto &entry) { return sum + entry.second; });

        for (auto &entry : class_probs) {
            entry.second /= total;
        }

        // ensure the destination holds one log prior per class before
        // writing through its iterator
        class_log_prior.resize(class_probs.size());
        std::transform(
            class_probs.begin(),
            class_probs.end(),
            class_log_prior.begin(),
            [](const auto &entry) { return std::log(entry.second); });
    }
}

std::string
gpmp::ml::BayesMultiNom::predict(const std::vector<size_t> &new_data) const {
    double max_prob = -std::numeric_limits<double>::infinity();
    std::string predicted_class;

    for (const auto &entry : class_probs) {
        const std::string &label = entry.first;
        double probability = std::log(entry.second);

        for (size_t j = 0; j < new_data.size(); ++j) {
            probability +=
                new_data[j] * std::log(feature_probs.at(label).at(j));
        }

        if (probability > max_prob) {
            max_prob = probability;
            predicted_class = label;
        }
    }

    return predicted_class;
}
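
/* Usage sketch (illustrative only; the count vectors are made up). With
 * the defaults declared in the header (alpha = 1.0, fit_prior = true,
 * empty class_prior), BayesMultiNom takes integer count features such as
 * term frequencies, smoothed here with alpha * num_features:
 *
 *   gpmp::ml::BayesMultiNom clf;
 *   clf.train({{3, 0, 1}, {0, 2, 4}, {2, 1, 0}},
 *             {"sports", "tech", "sports"});
 *   std::string label = clf.predict({1, 0, 2});
 */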

void gpmp::ml::BayesMultiNom::display() const {
    std::cout << "Class Probabilities:\n";
    for (const auto &entry : class_probs) {
        std::cout << entry.first << ": " << entry.second << "\n";
    }

    std::cout << "\nFeature Probabilities:\n";
    for (const auto &class_entry : feature_probs) {
        std::cout << class_entry.first << ":\n";
        for (size_t j = 0; j < class_entry.second.size(); ++j) {
            std::cout << "  Feature " << j << ": " << class_entry.second[j]
                      << "\n";
        }
    }

    std::cout << "\nClass Log Priors:\n";
    for (const auto &log_prior : class_log_prior) {
        std::cout << log_prior << "\n";
    }
}