LCOV - code coverage report
Current view: top level - modules/ml - bayes_clf.cpp (source / functions) Hit Total Coverage
Test: lcov.info Lines: 0 235 0.0 %
Date: 2024-05-13 05:06:06 Functions: 0 21 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*************************************************************************
       2             :  *
       3             :  *  Project
       4             :  *                         _____ _____  __  __ _____
       5             :  *                        / ____|  __ \|  \/  |  __ \
       6             :  *  ___  _ __   ___ _ __ | |  __| |__) | \  / | |__) |
       7             :  * / _ \| '_ \ / _ \ '_ \| | |_ |  ___/| |\/| |  ___/
       8             :  *| (_) | |_) |  __/ | | | |__| | |    | |  | | |
       9             :  * \___/| .__/ \___|_| |_|\_____|_|    |_|  |_|_|
      10             :  *      | |
      11             :  *      |_|
      12             :  *
      13             :  * Copyright (C) Akiel Aries, <akiel@akiel.org>, et al.
      14             :  *
      15             :  * This software is licensed as described in the file LICENSE, which
      16             :  * you should have received as part of this distribution. The terms
      17             :  * among other details are referenced in the official documentation
      18             :  * seen here : https://akielaries.github.io/openGPMP/ along with
      19             :  * important files seen in this project.
      20             :  *
      21             :  * You may opt to use, copy, modify, merge, publish, distribute
      22             :  * and/or sell copies of the Software, and permit persons to whom
      23             :  * the Software is furnished to do so, under the terms of the
      24             :  * LICENSE file. As this is an Open Source effort, all implementations
      25             :  * must be of the same methodology.
      26             :  *
      27             :  *
      28             :  *
      29             :  * This software is distributed on an AS IS basis, WITHOUT
      30             :  * WARRANTY OF ANY KIND, either express or implied.
      31             :  *
      32             :  ************************************************************************/
#include <algorithm>
#include <cmath>
#include <iostream>
#include <limits>
#include <numeric>
#include <openGPMP/ml/bayes_clf.hpp>
#include <string>
#include <unordered_map>
#include <vector>
      41             : 
// Constructs a naive Bayes classifier.
// alpha_param     : Laplace smoothing strength applied during train().
// fit_prior_param : when true, train() re-derives the class log priors from
//                   the training labels; when false the supplied priors stand.
// class_prior     : optional user-supplied priors, copied verbatim into
//                   class_log_prior. NOTE(review): the values are stored
//                   as-is despite the member's "log" name — train() only
//                   takes logarithms when fit_prior is set; confirm callers
//                   pass log-probabilities here.
gpmp::ml::BayesClf::BayesClf(double alpha_param,
                             bool fit_prior_param,
                             const std::vector<double> &class_prior)
    : alpha(alpha_param), fit_prior(fit_prior_param),
      class_log_prior(class_prior.begin(), class_prior.end()) {
}
      48             : 
// Destructor: no explicit cleanup needed, all members are standard containers.
gpmp::ml::BayesClf::~BayesClf() {
}
      51             : 
      52           0 : void gpmp::ml::BayesClf::train(const std::vector<std::vector<double>> &data,
      53             :                                const std::vector<std::string> &labels) {
      54             :     // count class occurrences
      55           0 :     for (const auto &label : labels) {
      56           0 :         class_probs[label] += 1.0;
      57             :     }
      58             : 
      59             :     // count feature occurrences for each class
      60           0 :     for (size_t i = 0; i < data.size(); ++i) {
      61           0 :         const std::string &label = labels[i];
      62           0 :         const std::vector<double> &features = data[i];
      63             : 
      64           0 :         class_probs[label] += 1.0;
      65             : 
      66             :         // initialize feature_probs[label] if not present
      67           0 :         if (feature_probs.find(label) == feature_probs.end()) {
      68           0 :             feature_probs[label] = std::vector<double>(features.size(), 0.0);
      69             :         }
      70             : 
      71           0 :         for (size_t j = 0; j < features.size(); ++j) {
      72           0 :             feature_probs[label][j] += features[j];
      73             :         }
      74             :     }
      75             : 
      76             :     // calculate class probabilities and feature probabilities
      77           0 :     double smoothing_factor = alpha * 2.0;
      78           0 :     for (const auto &entry : class_probs) {
      79           0 :         const std::string &label = entry.first;
      80           0 :         double class_count = entry.second;
      81             : 
      82             :         // calculate class probability
      83           0 :         class_probs[label] =
      84           0 :             (class_count + alpha) / (data.size() + smoothing_factor);
      85             : 
      86             :         // calculate feature probabilities
      87           0 :         for (size_t j = 0; j < feature_probs[label].size(); ++j) {
      88           0 :             feature_probs[label][j] = (feature_probs[label][j] + alpha) /
      89           0 :                                       (class_count + smoothing_factor);
      90             :         }
      91             :     }
      92             : 
      93             :     // calculate class log priors
      94           0 :     if (fit_prior) {
      95           0 :         double total = std::accumulate(
      96             :             class_probs.begin(),
      97             :             class_probs.end(),
      98             :             0.0,
      99           0 :             [](double sum, const auto &entry) { return sum + entry.second; });
     100             : 
     101           0 :         for (auto &entry : class_probs) {
     102           0 :             entry.second /= total;
     103             :         }
     104             : 
     105           0 :         std::transform(
     106             :             class_probs.begin(),
     107             :             class_probs.end(),
     108             :             class_log_prior.begin(),
     109           0 :             [total](const auto &entry) { return log(entry.second); });
     110             :     }
     111           0 : }
     112             : 
     113             : std::string
     114           0 : gpmp::ml::BayesClf::predict(const std::vector<double> &new_data) const {
     115           0 :     double max_prob = -std::numeric_limits<double>::infinity();
     116           0 :     std::string predicted_class;
     117             : 
     118           0 :     for (const auto &entry : class_probs) {
     119           0 :         const std::string &label = entry.first;
     120           0 :         double probability = log(entry.second);
     121             : 
     122           0 :         for (size_t j = 0; j < new_data.size(); ++j) {
     123           0 :             probability += new_data[j] * log(feature_probs.at(label).at(j));
     124             :         }
     125             : 
     126           0 :         if (probability > max_prob) {
     127           0 :             max_prob = probability;
     128           0 :             predicted_class = label;
     129             :         }
     130             :     }
     131             : 
     132           0 :     return predicted_class;
     133           0 : }
     134             : 
     135           0 : void gpmp::ml::BayesClf::display() const {
     136           0 :     std::cout << "Class Probabilities:\n";
     137           0 :     for (const auto &entry : class_probs) {
     138           0 :         std::cout << entry.first << ": " << entry.second << "\n";
     139             :     }
     140             : 
     141           0 :     std::cout << "\nFeature Probabilities:\n";
     142           0 :     for (const auto &class_entry : feature_probs) {
     143           0 :         std::cout << class_entry.first << ":\n";
     144           0 :         for (size_t j = 0; j < class_entry.second.size(); ++j) {
     145           0 :             std::cout << "  Feature " << j << ": " << class_entry.second[j]
     146           0 :                       << "\n";
     147             :         }
     148             :     }
     149             : 
     150           0 :     std::cout << "\nClass Log Priors:\n";
     151           0 :     for (const auto &logPrior : class_log_prior) {
     152           0 :         std::cout << logPrior << "\n";
     153             :     }
     154           0 : }
     155           0 : void gpmp::ml::BayesBernoulli::train(
     156             :     const std::vector<std::vector<size_t>> &data,
     157             :     const std::vector<std::string> &labels) {
     158           0 :     size_t numInstances = data.size();
     159           0 :     size_t num_feats = data[0].size();
     160             : 
     161           0 :     for (size_t i = 0; i < numInstances; ++i) {
     162           0 :         std::string classLabel = labels[i];
     163             : 
     164             :         // update class probabilities
     165           0 :         class_probs[classLabel] += 1.0;
     166             : 
     167             :         // update feature probabilities
     168           0 :         for (size_t j = 0; j < num_feats; ++j) {
     169           0 :             feat_probs[classLabel][j] += data[i][j];
     170             :         }
     171           0 :     }
     172             : 
     173             :     // laplace smoothing
     174           0 :     double smoothing_factor = alpha * 2.0;
     175           0 :     for (auto &entry : class_probs) {
     176           0 :         entry.second =
     177           0 :             (entry.second + alpha) / (numInstances + smoothing_factor);
     178             :     }
     179             : 
     180           0 :     for (auto &class_entry : feat_probs) {
     181           0 :         for (auto &feat_entry : class_entry.second) {
     182           0 :             feat_entry.second =
     183           0 :                 (feat_entry.second + alpha) /
     184           0 :                 (class_probs[class_entry.first] + smoothing_factor);
     185             :         }
     186             :     }
     187           0 : }
     188             : 
     189             : // predict the class of a new data point
     190             : std::string
     191           0 : gpmp::ml::BayesBernoulli::predict(const std::vector<size_t> &new_data) const {
     192           0 :     double max_prob = -std::numeric_limits<double>::infinity();
     193           0 :     std::string predicted_class;
     194             : 
     195           0 :     for (const auto &class_entry : class_probs) {
     196           0 :         double probability = log(class_entry.second);
     197             : 
     198           0 :         for (size_t i = 0; i < new_data.size(); ++i) {
     199           0 :             probability +=
     200           0 :                 new_data[i] * log(feat_probs.at(class_entry.first).at(i));
     201             :         }
     202             : 
     203           0 :         if (probability > max_prob) {
     204           0 :             max_prob = probability;
     205           0 :             predicted_class = class_entry.first;
     206             :         }
     207             :     }
     208             : 
     209           0 :     return predicted_class;
     210           0 : }
     211             : 
     212           0 : void gpmp::ml::BayesBernoulli::display() const {
     213           0 :     std::cout << "Class Probabilities:\n";
     214           0 :     for (const auto &entry : class_probs) {
     215           0 :         std::cout << entry.first << ": " << entry.second << "\n";
     216             :     }
     217             : 
     218           0 :     std::cout << "\nFeature Probabilities:\n";
     219           0 :     for (const auto &class_entry : feat_probs) {
     220           0 :         std::cout << class_entry.first << ":\n";
     221           0 :         for (const auto &feat_entry : class_entry.second) {
     222           0 :             std::cout << "  Feature " << feat_entry.first << ": "
     223           0 :                       << feat_entry.second << "\n";
     224             :         }
     225             :     }
     226           0 : }
     227             : 
     228           0 : void gpmp::ml::BayesGauss::train(const std::vector<std::vector<double>> &data,
     229             :                                  const std::vector<std::string> &labels) {
     230             :     // calculate class occurrences
     231           0 :     for (const auto &label : labels) {
     232           0 :         class_probs[label] += 1.0;
     233             :     }
     234             : 
     235             :     // calculate mean and variance for each feature in each class
     236           0 :     mean_and_var(data, labels);
     237             : 
     238             :     // calculate class probabilities
     239           0 :     double total_instances = static_cast<double>(data.size());
     240           0 :     for (auto &entry : class_probs) {
     241           0 :         entry.second /= total_instances;
     242             :     }
     243           0 : }
     244             : 
     245           0 : void gpmp::ml::BayesGauss::mean_and_var(
     246             :     const std::vector<std::vector<double>> &data,
     247             :     const std::vector<std::string> &labels) {
     248           0 :     size_t num_features = data[0].size();
     249             : 
     250           0 :     for (size_t i = 0; i < data.size(); ++i) {
     251           0 :         const std::string &label = labels[i];
     252           0 :         const std::vector<double> &features = data[i];
     253             : 
     254           0 :         class_probs[label] += 1.0;
     255             : 
     256             :         // initialize mean[label] and variance[label] if not present
     257           0 :         if (mean.find(label) == mean.end()) {
     258           0 :             mean[label] = std::vector<double>(num_features, 0.0);
     259           0 :             variance[label] = std::vector<double>(num_features, 0.0);
     260             :         }
     261             : 
     262             :         // update mean
     263           0 :         for (size_t j = 0; j < num_features; ++j) {
     264           0 :             mean[label][j] += features[j];
     265             :         }
     266             :     }
     267             : 
     268             :     // calculate mean
     269           0 :     for (auto &entry : mean) {
     270           0 :         const std::string &label = entry.first;
     271           0 :         double class_count = class_probs[label];
     272             : 
     273           0 :         for (size_t j = 0; j < num_features; ++j) {
     274           0 :             entry.second[j] /= class_count;
     275             :         }
     276             :     }
     277             : 
     278             :     // calculate variance
     279           0 :     for (size_t i = 0; i < data.size(); ++i) {
     280           0 :         const std::string &label = labels[i];
     281           0 :         const std::vector<double> &features = data[i];
     282             : 
     283           0 :         for (size_t j = 0; j < num_features; ++j) {
     284           0 :             variance[label][j] += std::pow(features[j] - mean[label][j], 2);
     285             :         }
     286             :     }
     287             : 
     288           0 :     for (auto &entry : variance) {
     289           0 :         const std::string &label = entry.first;
     290           0 :         double class_count = class_probs[label];
     291             : 
     292           0 :         for (size_t j = 0; j < num_features; ++j) {
     293           0 :             entry.second[j] /= class_count;
     294             :         }
     295             :     }
     296           0 : }
     297             : 
     298             : std::string
     299           0 : gpmp::ml::BayesGauss::predict(const std::vector<double> &newData) const {
     300           0 :     double max_prob = -std::numeric_limits<double>::infinity();
     301           0 :     std::string predicted_class;
     302             : 
     303           0 :     for (const auto &entry : class_probs) {
     304           0 :         const std::string &label = entry.first;
     305           0 :         double probability = log(entry.second);
     306             : 
     307           0 :         for (size_t j = 0; j < newData.size(); ++j) {
     308           0 :             probability -=
     309           0 :                 0.5 * (std::log(2 * M_PI * variance.at(label).at(j)) +
     310           0 :                        std::pow(newData[j] - mean.at(label).at(j), 2) /
     311           0 :                            (2 * variance.at(label).at(j)));
     312             :         }
     313             : 
     314           0 :         if (probability > max_prob) {
     315           0 :             max_prob = probability;
     316           0 :             predicted_class = label;
     317             :         }
     318             :     }
     319             : 
     320           0 :     return predicted_class;
     321           0 : }
     322             : 
     323           0 : void gpmp::ml::BayesGauss::display() const {
     324           0 :     std::cout << "Class Probabilities:\n";
     325           0 :     for (const auto &entry : class_probs) {
     326           0 :         std::cout << entry.first << ": " << entry.second << "\n";
     327             :     }
     328             : 
     329           0 :     std::cout << "\nMean and Variance:\n";
     330           0 :     for (const auto &class_entry : mean) {
     331           0 :         std::cout << class_entry.first << ":\n";
     332           0 :         std::cout << "  Mean: ";
     333           0 :         for (size_t j = 0; j < class_entry.second.size(); ++j) {
     334           0 :             std::cout << class_entry.second[j] << " ";
     335             :         }
     336           0 :         std::cout << "\n  Variance: ";
     337           0 :         for (size_t j = 0; j < variance.at(class_entry.first).size(); ++j) {
     338           0 :             std::cout << variance.at(class_entry.first).at(j) << " ";
     339             :         }
     340           0 :         std::cout << "\n";
     341             :     }
     342           0 : }
     343             : 
// Constructs a multinomial naive Bayes classifier.
// alpha_param     : Laplace smoothing strength applied during train().
// fit_prior_param : when true, train() re-derives the class log priors from
//                   the training labels; when false the supplied priors stand.
// class_prior     : optional user-supplied priors, copied verbatim into
//                   class_log_prior. NOTE(review): stored as-is despite the
//                   member's "log" name — train() only takes logarithms when
//                   fit_prior is set; confirm callers pass log-probabilities.
gpmp::ml::BayesMultiNom::BayesMultiNom(double alpha_param,
                                       bool fit_prior_param,
                                       const std::vector<double> &class_prior)
    : alpha(alpha_param), fit_prior(fit_prior_param),
      class_log_prior(class_prior.begin(), class_prior.end()) {
}
     350             : 
// Destructor: no explicit cleanup needed, all members are standard containers.
gpmp::ml::BayesMultiNom::~BayesMultiNom() {
}
     353             : 
     354           0 : void gpmp::ml::BayesMultiNom::train(
     355             :     const std::vector<std::vector<size_t>> &data,
     356             :     const std::vector<std::string> &labels) {
     357           0 :     size_t num_instances = data.size();
     358           0 :     size_t num_features = data[0].size();
     359             : 
     360             :     // count class occurrences
     361           0 :     for (const auto &label : labels) {
     362           0 :         class_probs[label] += 1.0;
     363             :     }
     364             : 
     365             :     // count feature occurrences for each class
     366           0 :     for (size_t i = 0; i < num_instances; ++i) {
     367           0 :         const std::string &label = labels[i];
     368           0 :         const std::vector<size_t> &features = data[i];
     369             : 
     370           0 :         class_probs[label] += 1.0;
     371             : 
     372             :         // Initialize feature_probs[label] if not present
     373           0 :         if (feature_probs.find(label) == feature_probs.end()) {
     374           0 :             feature_probs[label] = std::vector<double>(num_features, 0.0);
     375             :         }
     376             : 
     377           0 :         for (size_t j = 0; j < num_features; ++j) {
     378           0 :             feature_probs[label][j] += features[j];
     379             :         }
     380             :     }
     381             : 
     382             :     // calculate class probabilities and feature probabilities
     383           0 :     double smoothing_factor = alpha * num_features;
     384           0 :     for (const auto &entry : class_probs) {
     385           0 :         const std::string &label = entry.first;
     386           0 :         double class_count = entry.second;
     387             : 
     388             :         // calculate class probability
     389           0 :         class_probs[label] =
     390           0 :             (class_count + alpha) / (num_instances + smoothing_factor);
     391             : 
     392             :         // calculate feature probabilities
     393           0 :         for (size_t j = 0; j < feature_probs[label].size(); ++j) {
     394           0 :             feature_probs[label][j] = (feature_probs[label][j] + alpha) /
     395           0 :                                       (class_count + smoothing_factor);
     396             :         }
     397             :     }
     398             : 
     399             :     // calculate class log priors
     400           0 :     if (fit_prior) {
     401           0 :         double total = std::accumulate(
     402             :             class_probs.begin(),
     403             :             class_probs.end(),
     404             :             0.0,
     405           0 :             [](double sum, const auto &entry) { return sum + entry.second; });
     406             : 
     407           0 :         for (auto &entry : class_probs) {
     408           0 :             entry.second /= total;
     409             :         }
     410             : 
     411           0 :         std::transform(
     412             :             class_probs.begin(),
     413             :             class_probs.end(),
     414             :             class_log_prior.begin(),
     415           0 :             [total](const auto &entry) { return log(entry.second); });
     416             :     }
     417           0 : }
     418             : 
     419             : std::string
     420           0 : gpmp::ml::BayesMultiNom::predict(const std::vector<size_t> &new_data) const {
     421           0 :     double max_prob = -std::numeric_limits<double>::infinity();
     422           0 :     std::string predicted_class;
     423             : 
     424           0 :     for (const auto &entry : class_probs) {
     425           0 :         const std::string &label = entry.first;
     426           0 :         double probability = log(entry.second);
     427             : 
     428           0 :         for (size_t j = 0; j < new_data.size(); ++j) {
     429           0 :             probability += new_data[j] * log(feature_probs.at(label).at(j));
     430             :         }
     431             : 
     432           0 :         if (probability > max_prob) {
     433           0 :             max_prob = probability;
     434           0 :             predicted_class = label;
     435             :         }
     436             :     }
     437             : 
     438           0 :     return predicted_class;
     439           0 : }
     440             : 
     441           0 : void gpmp::ml::BayesMultiNom::display() const {
     442           0 :     std::cout << "Class Probabilities:\n";
     443           0 :     for (const auto &entry : class_probs) {
     444           0 :         std::cout << entry.first << ": " << entry.second << "\n";
     445             :     }
     446             : 
     447           0 :     std::cout << "\nFeature Probabilities:\n";
     448           0 :     for (const auto &class_entry : feature_probs) {
     449           0 :         std::cout << class_entry.first << ":\n";
     450           0 :         for (size_t j = 0; j < class_entry.second.size(); ++j) {
     451           0 :             std::cout << "  Feature " << j << ": " << class_entry.second[j]
     452           0 :                       << "\n";
     453             :         }
     454             :     }
     455             : 
     456           0 :     std::cout << "\nClass Log Priors:\n";
     457           0 :     for (const auto &log_prior : class_log_prior) {
     458           0 :         std::cout << log_prior << "\n";
     459             :     }
     460           0 : }

Generated by: LCOV version 1.14