LCOV - code coverage report
Current view: top level - modules/ml - linreg.cpp (source / functions) Hit Total Coverage
Test: lcov.info Lines: 0 226 0.0 %
Date: 2024-05-13 05:06:06 Functions: 0 20 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*************************************************************************
       2             :  *
       3             :  *  Project
       4             :  *                         _____ _____  __  __ _____
       5             :  *                        / ____|  __ \|  \/  |  __ \
       6             :  *  ___  _ __   ___ _ __ | |  __| |__) | \  / | |__) |
       7             :  * / _ \| '_ \ / _ \ '_ \| | |_ |  ___/| |\/| |  ___/
       8             :  *| (_) | |_) |  __/ | | | |__| | |    | |  | | |
       9             :  * \___/| .__/ \___|_| |_|\_____|_|    |_|  |_|_|
      10             :  *      | |
      11             :  *      |_|
      12             :  *
      13             :  * Copyright (C) Akiel Aries, <akiel@akiel.org>, et al.
      14             :  *
      15             :  * This software is licensed as described in the file LICENSE, which
      16             :  * you should have received as part of this distribution. The terms
      17             :  * among other details are referenced in the official documentation
      18             :  * seen here : https://akielaries.github.io/openGPMP/ along with
      19             :  * important files seen in this project.
      20             :  *
      21             :  * You may opt to use, copy, modify, merge, publish, distribute
      22             :  * and/or sell copies of the Software, and permit persons to whom
      23             :  * the Software is furnished to do so, under the terms of the
      24             :  * LICENSE file. As this is an Open Source effort, all implementations
      25             :  * must be of the same methodology.
      26             :  *
      27             :  *
      28             :  *
      29             :  * This software is distributed on an AS IS basis, WITHOUT
      30             :  * WARRANTY OF ANY KIND, either express or implied.
      31             :  *
      32             :  ************************************************************************/
      33             : 
      34             : /**
      35             :  * openGPMP implementation of Linear Regression
      36             :  */
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <fstream>
#include <iostream>
#include <limits>
#include <numeric>
#include <openGPMP/core/datatable.hpp>
#include <openGPMP/core/utils.hpp>
#include <openGPMP/ml/linreg.hpp>
#include <random>
#include <stdio.h>
#include <string>
#include <vector>
      46             : 
      47             : /** Logger class object*/
      48             : static gpmp::core::Logger _log_;
      49             : 
      50             : /*
      51             :  * Constructor to provide the default values to all the terms in the
      52             :  * object of class regression
      53             :  */
      54           0 : gpmp::ml::LinearRegression::LinearRegression() {
      55           0 :     coeff = 0;
      56           0 :     constant = 0;
      57           0 :     sum_y = 0;
      58           0 :     sum_y_square = 0;
      59           0 :     sum_x_square = 0;
      60           0 :     sum_x = 0;
      61           0 :     sum_xy = 0;
      62           0 : }
      63             : 
      64             : // Function that calculate the coefficient/slope of the best fitting
      65             : // line
      66           0 : void gpmp::ml::LinearRegression::calculate_coeffecient() {
      67             :     // get number of datapoints
      68           0 :     long double N = x.size();
      69             :     // calculate numerator and denominator
      70           0 :     long double numerator = (N * sum_xy - sum_x * sum_y);
      71           0 :     long double denominator = (N * sum_x_square - sum_x * sum_x);
      72             :     // calculate the coeffecient
      73           0 :     coeff = numerator / denominator;
      74           0 : }
      75             : 
      76             : /*
      77             :  * Member function that will calculate the constant term of the best
      78             :  * fitting line
      79             :  */
      80           0 : void gpmp::ml::LinearRegression::calculate_constant() {
      81           0 :     long double N = x.size();
      82           0 :     long double numerator = (sum_y * sum_x_square - sum_x * sum_xy);
      83           0 :     long double denominator = (N * sum_x_square - sum_x * sum_x);
      84             :     // calculate constant
      85           0 :     constant = numerator / denominator;
      86           0 : }
      87             : 
      88             : // Function that return the number of entries (xi, yi) in the data set
      89           0 : int64_t gpmp::ml::LinearRegression::data_size() {
      90           0 :     return x.size();
      91             : }
      92             : 
      93             : // Function that return the coefficient/slope of the best fitting line
      94           0 : long double gpmp::ml::LinearRegression::return_coeffecient() {
      95           0 :     if (fabs(coeff - 0.0f) < std::numeric_limits<double>::epsilon()) {
      96           0 :         calculate_coeffecient();
      97             :     }
      98           0 :     return coeff;
      99             : }
     100             : 
     101             : // Function that return the constant term of the best fitting line
     102           0 : long double gpmp::ml::LinearRegression::return_constant() {
     103           0 :     if (fabs(constant - 0.0f) < std::numeric_limits<double>::epsilon()) {
     104           0 :         calculate_constant();
     105             :     }
     106           0 :     return constant;
     107             : }
     108             : 
     109             : // Function to calculate and display the best fitting line
     110           0 : void gpmp::ml::LinearRegression::best_fit() {
     111           0 :     if (x_train.empty() || y_train.empty()) {
     112             :         // Check if training data is empty
     113           0 :         _log_.log(WARNING, "Training data is empty.");
     114             : 
     115           0 :         if (fabs(coeff - 0.0f) < std::numeric_limits<double>::epsilon() &&
     116           0 :             fabs(constant - 0.0f) < std::numeric_limits<double>::epsilon()) {
     117             :             // If coefficients are not calculated, calculate them
     118           0 :             calculate_coeffecient();
     119           0 :             calculate_constant();
     120             :         }
     121             :         // Display the best fitting line equation
     122           0 :         _log_.log(INFO,
     123           0 :                   "Best fitting line : y = " + std::to_string(coeff) + "x + " +
     124           0 :                       std::to_string(constant));
     125           0 :         return;
     126             :     }
     127             : 
     128             :     // Calculate the coefficients using the training data
     129           0 :     long double N = x_train.size();
     130           0 :     long double sum_xy_train = 0;
     131           0 :     long double sum_x_train = 0;
     132           0 :     long double sum_y_train = 0;
     133           0 :     long double sum_x_square_train = 0;
     134             : 
     135           0 :     for (size_t i = 0; i < N; i++) {
     136             :         // Calculate sums for the training data
     137           0 :         sum_xy_train += x_train[i] * y_train[i];
     138           0 :         sum_x_train += x_train[i];
     139           0 :         sum_y_train += y_train[i];
     140           0 :         sum_x_square_train += x_train[i] * x_train[i];
     141             :     }
     142             : 
     143           0 :     long double numerator = (N * sum_xy_train - sum_x_train * sum_y_train);
     144           0 :     long double denominator =
     145           0 :         (N * sum_x_square_train - sum_x_train * sum_x_train);
     146             : 
     147             :     // Calculate the coefficients of the best fitting line
     148           0 :     coeff = numerator / denominator;
     149           0 :     constant = (sum_y_train - coeff * sum_x_train) / N;
     150             :     // Display the best fitting line equation
     151           0 :     _log_.log(INFO,
     152           0 :               "Best fitting line : y = " + std::to_string(coeff) + "x + " +
     153           0 :                   std::to_string(constant));
     154             : }
     155             : 
     156             : // Function to accept input data in the form of two vectors
     157           0 : void gpmp::ml::LinearRegression::get_input(
     158             :     const std::vector<long double> &x_data,
     159             :     const std::vector<long double> &y_data) {
     160             :     // Clear existing data from x and y vectors
     161           0 :     x.clear();
     162           0 :     y.clear();
     163             :     // Initialize LinearRegression class variables
     164           0 :     sum_xy = 0;       /* Set x*y sum */
     165           0 :     sum_x = 0;        /* Set sum of x */
     166           0 :     sum_y = 0;        /* Set sum of y */
     167           0 :     sum_x_square = 0; /* Set sum of x squares */
     168           0 :     sum_y_square = 0; /* Set sum of y squares */
     169             : 
     170           0 :     if (x_data.size() != y_data.size()) {
     171             :         // Check if input vectors have the same size
     172           0 :         _log_.log(ERROR, "Input vectors must have the same size");
     173           0 :         return;
     174             :     }
     175             : 
     176           0 :     for (size_t i = 0; i < x_data.size(); i++) {
     177             :         // Append x and y values to x and y vectors
     178           0 :         x.push_back(x_data[i]);
     179           0 :         y.push_back(y_data[i]);
     180             :         // Update sum of (x * y)
     181           0 :         sum_xy += x_data[i] * y_data[i];
     182             :         // Update sum of x and y
     183           0 :         sum_x += x_data[i];
     184           0 :         sum_y += y_data[i];
     185             :         // Update sum of x squares and y squares
     186           0 :         sum_x_square += x_data[i] * x_data[i];
     187           0 :         sum_y_square += y_data[i] * y_data[i];
     188             :     }
     189             : }
     190             : 
     191             : // Function to accept input data from a DataTableStr
     192           0 : void gpmp::ml::LinearRegression::get_input(
     193             :     const gpmp::core::DataTableStr &data,
     194             :     const std::vector<std::string> &columns) {
     195             :     // Clear any existing data from x and y vectors
     196           0 :     x.clear();
     197           0 :     y.clear();
     198           0 :     sum_xy = 0;
     199           0 :     sum_x = 0;
     200           0 :     sum_y = 0;
     201           0 :     sum_x_square = 0;
     202           0 :     sum_y_square = 0;
     203             : 
     204             :     // Ensure that columns is not empty and has at least 2 elements
     205           0 :     if (columns.size() < 2) {
     206           0 :         _log_.log(ERROR, "Input vectors must have at least 2 column names.");
     207           0 :         return;
     208             :     }
     209             : 
     210             :     // Find the column indices for the specified column names
     211           0 :     std::vector<size_t> column_indices;
     212           0 :     for (const auto &column_name : columns) {
     213           0 :         bool found = false;
     214           0 :         for (size_t i = 0; i < data.first.size(); ++i) {
     215           0 :             if (data.first[i] == column_name) {
     216           0 :                 column_indices.push_back(i);
     217           0 :                 found = true;
     218           0 :                 break;
     219             :             }
     220             :         }
     221           0 :         if (!found) {
     222           0 :             _log_.log(ERROR,
     223           0 :                       "Column '" + column_name +
     224             :                           "' not found in DataTableStr.");
     225           0 :             return;
     226             :         }
     227             :     }
     228             : 
     229           0 :     for (const auto &row : data.second) {
     230             :         try {
     231           0 :             long double xi = std::stold(row[column_indices[0]]);
     232           0 :             long double yi = std::stold(row[column_indices[1]]);
     233             :             // Append x and y values to x and y vectors
     234           0 :             x.push_back(xi);
     235           0 :             y.push_back(yi);
     236             :             // Update sum of (x * y)
     237           0 :             sum_xy += xi * yi;
     238             :             // Update sum of x and y
     239           0 :             sum_x += xi;
     240           0 :             sum_y += yi;
     241             :             // Update sum of x squares and y squares
     242           0 :             sum_x_square += xi * xi;
     243           0 :             sum_y_square += yi * yi;
     244           0 :         } catch (const std::exception &e) {
     245             :             // Handle parsing errors here
     246           0 :             _log_.log(ERROR, "Error parsing data: " + std::string(e.what()));
     247           0 :             continue;
     248           0 :         }
     249             :     }
     250           0 : }
     251             : 
     252             : // Function to accept input data from a file
     253           0 : void gpmp::ml::LinearRegression::get_input(const char *file) {
     254           0 :     int n = num_rows(file);
     255           0 :     for (int64_t i = 0; i < n; i++) {
     256             :         /*
     257             :          * In a csv file, all the values of xi and yi are separated by
     258             :          * commas
     259             :          */
     260             :         char comma;
     261             :         long double xi;
     262             :         long double yi;
     263           0 :         std::cin >> xi >> comma >> yi;
     264             :         // Update sum of (x * y)
     265           0 :         sum_xy += xi * yi;
     266             :         // Update sum of x and y
     267           0 :         sum_x += xi;
     268           0 :         sum_y += yi;
     269             :         // Update sum of x squares and y squares
     270           0 :         sum_x_square += xi * xi;
     271           0 :         sum_y_square += yi * yi;
     272             :         // Append x and y values to x and y vectors
     273           0 :         x.push_back(xi);
     274           0 :         y.push_back(yi);
     275             :     }
     276           0 : }
     277             : 
     278             : // Function to split data into training and testing sets
     279           0 : void gpmp::ml::LinearRegression::split_data(double test_size,
     280             :                                             unsigned int seed,
     281             :                                             bool shuffle) {
     282           0 :     if (x.empty() || y.empty()) {
     283           0 :         _log_.log(ERROR, "Training data is empty.");
     284           0 :         return;
     285             :     }
     286             : 
     287           0 :     if (test_size <= 0 || test_size >= 1) {
     288           0 :         _log_.log(ERROR, "Invalid `test_size`. Must be between 0 - 1.");
     289           0 :         return;
     290             :     }
     291             : 
     292           0 :     size_t data_size = x.size();
     293           0 :     size_t test_data_size = static_cast<size_t>(test_size * data_size);
     294             : 
     295             :     // Shuffle the data randomly if specified
     296           0 :     if (shuffle == true) {
     297             :         // Create vector of indices
     298           0 :         std::vector<size_t> indices(data_size);
     299             :         // Fill vector sequentially w/ `iota`
     300           0 :         std::iota(indices.begin(), indices.end(), 0);
     301             :         // Randomly shuffle vector indices from start to end
     302           0 :         std::shuffle(indices.begin(),
     303             :                      indices.end(),
     304           0 :                      std::default_random_engine(seed));
     305             : 
     306             :         // Clear training and testing data vectors
     307           0 :         x_train.clear();
     308           0 :         y_train.clear();
     309           0 :         x_test.clear();
     310           0 :         y_test.clear();
     311             : 
     312             :         // Split the data into training and testing sets based on shuffled
     313             :         // indices
     314           0 :         for (size_t i = 0; i < data_size; ++i) {
     315           0 :             if (i < test_data_size) {
     316             :                 // Append x test vector with shuffled x value
     317           0 :                 x_test.push_back(x[indices[i]]);
     318           0 :                 y_test.push_back(y[indices[i]]);
     319             :             } else {
     320             :                 // Append x training vector with suffled x value
     321           0 :                 x_train.push_back(x[indices[i]]);
     322           0 :                 y_train.push_back(y[indices[i]]);
     323             :             }
     324             :         }
     325           0 :     } else {
     326             :         // Without shuffling, split the data without changing its order by
     327             :         // assigning the first training element to the training set
     328           0 :         x_train.assign(x.begin(), x.begin() + data_size - test_data_size);
     329           0 :         y_train.assign(y.begin(), y.begin() + data_size - test_data_size);
     330             :         // Assign the 1+ testing elements to the testing test
     331           0 :         x_test.assign(x.begin() + data_size - test_data_size, x.end());
     332           0 :         y_test.assign(y.begin() + data_size - test_data_size, y.end());
     333             :     }
     334             : }
     335             : 
     336             : // Function to display the dataset
     337           0 : void gpmp::ml::LinearRegression::show_data() {
     338           0 :     for (int64_t i = 0; i < 62; i++) {
     339           0 :         printf("_");
     340             :     }
     341           0 :     printf("\n\n");
     342           0 :     printf("|%15s%5s %15s%5s%20s\n", "X", "", "Y", "", "|");
     343             : 
     344             :     // Display each data point in the dataset
     345           0 :     for (int64_t i = 0; uint64_t(i) < x.size(); i++) {
     346           0 :         printf("|%20Lf %20Lf%20s\n", x[i], y[i], "|");
     347             :     }
     348             : 
     349           0 :     for (int64_t i = 0; i < 62; i++) {
     350           0 :         printf("_");
     351             :     }
     352           0 :     printf("\n");
     353           0 : }
     354             : 
     355             : // Function to predict a value based on input x
     356           0 : long double gpmp::ml::LinearRegression::predict(long double _x) const {
     357           0 :     return coeff * _x + constant;
     358             : }
     359             : 
     360             : // Function to predict a value based on input x and a dataset
     361             : long double
     362           0 : gpmp::ml::LinearRegression::predict(long double _x,
     363             :                                     const std::vector<long double> &x_data) {
     364             :     // Calculate the coefficient if not already calculated
     365           0 :     long double _coeff = return_coeffecient();
     366             :     // Calculate the constant if not already calculated
     367           0 :     long double _constant = return_constant();
     368             :     // TODO FIXME unused variable
     369           0 :     return _coeff * _x + _constant + x_data[0];
     370             : }
     371             : 
     372             : // Function to calculate the error for a given value
     373           0 : long double gpmp::ml::LinearRegression::error_in(long double num) {
     374           0 :     for (int64_t i = 0; uint64_t(i) < x.size(); i++) {
     375           0 :         if (fabs(num - x[i]) < std::numeric_limits<double>::epsilon()) {
     376           0 :             return (y[i] - predict(x[i]));
     377             :         }
     378             :     }
     379           0 :     return 0;
     380             : }
     381             : 
     382             : // Function to calculate the error for a given value and a dataset
     383             : long double
     384           0 : gpmp::ml::LinearRegression::error_in(long double num,
     385             :                                      const std::vector<long double> &x_data,
     386             :                                      const std::vector<long double> &y_data) {
     387           0 :     long double prediction = predict(num, x_data);
     388             :     // Find the corresponding y value for the input x
     389           0 :     for (size_t i = 0; i < x_data.size(); i++) {
     390           0 :         if (fabs(num - x_data[i]) < std::numeric_limits<double>::epsilon()) {
     391           0 :             return y_data[i] - prediction;
     392             :         }
     393             :     }
     394           0 :     return 0;
     395             : }
     396             : 
     397             : // Function that returns the overall sum of the square of errors
     398           0 : long double gpmp::ml::LinearRegression::error_square() {
     399           0 :     long double ans = 0;
     400             : 
     401             :     // Iterate through each data point
     402           0 :     for (int64_t i = 0; uint64_t(i) < x.size(); i++) {
     403             :         // Calculate the square of the error (difference between predicted and
     404             :         // actual values)
     405           0 :         ans += ((predict(x[i]) - y[i]) * (predict(x[i]) - y[i]));
     406             :     }
     407           0 :     return ans; // Return the sum of squared errors
     408             : }
     409             : 
     410             : // Find the Mean Squared Error (MSE) of the given dataset
     411             : long double
     412           0 : gpmp::ml::LinearRegression::mse(const std::vector<long double> &x_data,
     413             :                                 const std::vector<long double> &y_data) const {
     414             :     // Check if input vectors have the same size
     415           0 :     if (x_data.size() != y_data.size()) {
     416           0 :         return -1; // Return an error value
     417             :     }
     418             : 
     419           0 :     long double mse = 0.0;
     420           0 :     int64_t n = x_data.size();
     421             : 
     422             :     // Iterate through each data point
     423           0 :     for (int64_t i = 0; i < n; i++) {
     424             :         // Calculate the predicted value using the linear regression model
     425           0 :         long double predicted = predict(x_data[i]);
     426             :         // Calculate the error (difference between predicted and actual values)
     427           0 :         long double error = predicted - y_data[i];
     428             :         // Accumulate the squared error
     429           0 :         mse += error * error;
     430             :     }
     431             : 
     432             :     // Calculate the Mean Squared Error by dividing the accumulated squared
     433             :     // error by the number of data points
     434           0 :     return mse / static_cast<long double>(n);
     435             : }
     436             : 
     437             : // Determine an R^2 score value
     438           0 : long double gpmp::ml::LinearRegression::r_sqrd(
     439             :     const std::vector<long double> &x_data,
     440             :     const std::vector<long double> &y_data) const {
     441             :     // Check if input vectors have the same size
     442           0 :     if (x_data.size() != y_data.size()) {
     443           0 :         _log_.log(ERROR, "Input vectors must have the same size.");
     444           0 :         return -1; // Return an error value
     445             :     }
     446             : 
     447           0 :     long double total_sum_of_squares = 0.0;
     448           0 :     long double sum_of_squared_errors = 0.0;
     449           0 :     int64_t n = x_data.size();
     450             : 
     451           0 :     long double y_mean = 0.0;
     452           0 :     for (int64_t i = 0; i < n; i++) {
     453             :         // Calculate the mean of the dependent variable (Y)
     454           0 :         y_mean += y_data[i];
     455             :     }
     456           0 :     y_mean /= static_cast<long double>(n);
     457             : 
     458             :     // Iterate through each data point
     459           0 :     for (int64_t i = 0; i < n; i++) {
     460             :         // Calculate the predicted value using the linear regression model
     461           0 :         long double predicted = predict(x_data[i]);
     462             :         // Calculate the error (difference between predicted and actual values)
     463           0 :         long double error = predicted - y_data[i];
     464             :         // Calculate the total sum of squares and sum of squared errors
     465           0 :         total_sum_of_squares += (y_data[i] - y_mean) * (y_data[i] - y_mean);
     466           0 :         sum_of_squared_errors += error * error;
     467             :     }
     468             : 
     469             :     // Calculate the R-squared value using the formula 1 - (SSE / SST)
     470           0 :     return 1.0 - (sum_of_squared_errors / total_sum_of_squares);
     471             : }
     472             : 
     473             : // Determine number of rows in given dataset
     474           0 : int64_t gpmp::ml::LinearRegression::num_rows(const char *input) {
     475           0 :     int64_t num = 0;
     476           0 :     std::string row;
     477             : 
     478             :     // create input file stream
     479           0 :     std::ifstream file(input);
     480             : 
     481           0 :     while (getline(file, row)) {
     482           0 :         num++;
     483             :     }
     484             : 
     485           0 :     return num;
     486           0 : }

Generated by: LCOV version 1.14