openGPMP
Open Source Mathematics Package
linreg.cpp
Go to the documentation of this file.
1 /*************************************************************************
2  *
3  * Project
4  * _____ _____ __ __ _____
5  * / ____| __ \| \/ | __ \
6  * ___ _ __ ___ _ __ | | __| |__) | \ / | |__) |
7  * / _ \| '_ \ / _ \ '_ \| | |_ | ___/| |\/| | ___/
8  *| (_) | |_) | __/ | | | |__| | | | | | | |
9  * \___/| .__/ \___|_| |_|\_____|_| |_| |_|_|
10  * | |
11  * |_|
12  *
13  * Copyright (C) Akiel Aries, <akiel@akiel.org>, et al.
14  *
15  * This software is licensed as described in the file LICENSE, which
16  * you should have received as part of this distribution. The terms
17  * among other details are referenced in the official documentation
18  * seen here : https://akielaries.github.io/openGPMP/ along with
19  * important files seen in this project.
20  *
21  * You may opt to use, copy, modify, merge, publish, distribute
22  * and/or sell copies of the Software, and permit persons to whom
23  * the Software is furnished to do so, under the terms of the
24  * LICENSE file. As this is an Open Source effort, all implementations
25  * must be of the same methodology.
26  *
27  *
28  *
29  * This software is distributed on an AS IS basis, WITHOUT
30  * WARRANTY OF ANY KIND, either express or implied.
31  *
32  ************************************************************************/
33 
#include <algorithm>
#include <cmath>
#include <fstream>
#include <iostream>
#include <limits>
#include <numeric>
#include <openGPMP/core/utils.hpp>
#include <openGPMP/ml/linreg.hpp>
#include <random>
#include <stdio.h>
#include <string>
#include <vector>
46 
49 
50 /*
51  * Constructor to provide the default values to all the terms in the
52  * object of class regression
53  */
55  coeff = 0;
56  constant = 0;
57  sum_y = 0;
58  sum_y_square = 0;
59  sum_x_square = 0;
60  sum_x = 0;
61  sum_xy = 0;
62 }
63 
64 // Function that calculate the coefficient/slope of the best fitting
65 // line
67  // get number of datapoints
68  long double N = x.size();
69  // calculate numerator and denominator
70  long double numerator = (N * sum_xy - sum_x * sum_y);
71  long double denominator = (N * sum_x_square - sum_x * sum_x);
72  // calculate the coeffecient
73  coeff = numerator / denominator;
74 }
75 
76 /*
77  * Member function that will calculate the constant term of the best
78  * fitting line
79  */
81  long double N = x.size();
82  long double numerator = (sum_y * sum_x_square - sum_x * sum_xy);
83  long double denominator = (N * sum_x_square - sum_x * sum_x);
84  // calculate constant
85  constant = numerator / denominator;
86 }
87 
88 // Function that return the number of entries (xi, yi) in the data set
90  return x.size();
91 }
92 
93 // Function that return the coefficient/slope of the best fitting line
95  if (fabs(coeff - 0.0f) < std::numeric_limits<double>::epsilon()) {
96  calculate_coeffecient();
97  }
98  return coeff;
99 }
100 
101 // Function that return the constant term of the best fitting line
103  if (fabs(constant - 0.0f) < std::numeric_limits<double>::epsilon()) {
104  calculate_constant();
105  }
106  return constant;
107 }
108 
109 // Function to calculate and display the best fitting line
111  if (x_train.empty() || y_train.empty()) {
112  // Check if training data is empty
113  _log_.log(WARNING, "Training data is empty.");
114 
115  if (fabs(coeff - 0.0f) < std::numeric_limits<double>::epsilon() &&
116  fabs(constant - 0.0f) < std::numeric_limits<double>::epsilon()) {
117  // If coefficients are not calculated, calculate them
118  calculate_coeffecient();
119  calculate_constant();
120  }
121  // Display the best fitting line equation
122  _log_.log(INFO,
123  "Best fitting line : y = " + std::to_string(coeff) + "x + " +
124  std::to_string(constant));
125  return;
126  }
127 
128  // Calculate the coefficients using the training data
129  long double N = x_train.size();
130  long double sum_xy_train = 0;
131  long double sum_x_train = 0;
132  long double sum_y_train = 0;
133  long double sum_x_square_train = 0;
134 
135  for (size_t i = 0; i < N; i++) {
136  // Calculate sums for the training data
137  sum_xy_train += x_train[i] * y_train[i];
138  sum_x_train += x_train[i];
139  sum_y_train += y_train[i];
140  sum_x_square_train += x_train[i] * x_train[i];
141  }
142 
143  long double numerator = (N * sum_xy_train - sum_x_train * sum_y_train);
144  long double denominator =
145  (N * sum_x_square_train - sum_x_train * sum_x_train);
146 
147  // Calculate the coefficients of the best fitting line
148  coeff = numerator / denominator;
149  constant = (sum_y_train - coeff * sum_x_train) / N;
150  // Display the best fitting line equation
151  _log_.log(INFO,
152  "Best fitting line : y = " + std::to_string(coeff) + "x + " +
153  std::to_string(constant));
154 }
155 
156 // Function to accept input data in the form of two vectors
158  const std::vector<long double> &x_data,
159  const std::vector<long double> &y_data) {
160  // Clear existing data from x and y vectors
161  x.clear();
162  y.clear();
163  // Initialize LinearRegression class variables
164  sum_xy = 0; /* Set x*y sum */
165  sum_x = 0; /* Set sum of x */
166  sum_y = 0; /* Set sum of y */
167  sum_x_square = 0; /* Set sum of x squares */
168  sum_y_square = 0; /* Set sum of y squares */
169 
170  if (x_data.size() != y_data.size()) {
171  // Check if input vectors have the same size
172  _log_.log(ERROR, "Input vectors must have the same size");
173  return;
174  }
175 
176  for (size_t i = 0; i < x_data.size(); i++) {
177  // Append x and y values to x and y vectors
178  x.push_back(x_data[i]);
179  y.push_back(y_data[i]);
180  // Update sum of (x * y)
181  sum_xy += x_data[i] * y_data[i];
182  // Update sum of x and y
183  sum_x += x_data[i];
184  sum_y += y_data[i];
185  // Update sum of x squares and y squares
186  sum_x_square += x_data[i] * x_data[i];
187  sum_y_square += y_data[i] * y_data[i];
188  }
189 }
190 
191 // Function to accept input data from a DataTableStr
193  const gpmp::core::DataTableStr &data,
194  const std::vector<std::string> &columns) {
195  // Clear any existing data from x and y vectors
196  x.clear();
197  y.clear();
198  sum_xy = 0;
199  sum_x = 0;
200  sum_y = 0;
201  sum_x_square = 0;
202  sum_y_square = 0;
203 
204  // Ensure that columns is not empty and has at least 2 elements
205  if (columns.size() < 2) {
206  _log_.log(ERROR, "Input vectors must have at least 2 column names.");
207  return;
208  }
209 
210  // Find the column indices for the specified column names
211  std::vector<size_t> column_indices;
212  for (const auto &column_name : columns) {
213  bool found = false;
214  for (size_t i = 0; i < data.first.size(); ++i) {
215  if (data.first[i] == column_name) {
216  column_indices.push_back(i);
217  found = true;
218  break;
219  }
220  }
221  if (!found) {
222  _log_.log(ERROR,
223  "Column '" + column_name +
224  "' not found in DataTableStr.");
225  return;
226  }
227  }
228 
229  for (const auto &row : data.second) {
230  try {
231  long double xi = std::stold(row[column_indices[0]]);
232  long double yi = std::stold(row[column_indices[1]]);
233  // Append x and y values to x and y vectors
234  x.push_back(xi);
235  y.push_back(yi);
236  // Update sum of (x * y)
237  sum_xy += xi * yi;
238  // Update sum of x and y
239  sum_x += xi;
240  sum_y += yi;
241  // Update sum of x squares and y squares
242  sum_x_square += xi * xi;
243  sum_y_square += yi * yi;
244  } catch (const std::exception &e) {
245  // Handle parsing errors here
246  _log_.log(ERROR, "Error parsing data: " + std::string(e.what()));
247  continue;
248  }
249  }
250 }
251 
252 // Function to accept input data from a file
254  int n = num_rows(file);
255  for (int64_t i = 0; i < n; i++) {
256  /*
257  * In a csv file, all the values of xi and yi are separated by
258  * commas
259  */
260  char comma;
261  long double xi;
262  long double yi;
263  std::cin >> xi >> comma >> yi;
264  // Update sum of (x * y)
265  sum_xy += xi * yi;
266  // Update sum of x and y
267  sum_x += xi;
268  sum_y += yi;
269  // Update sum of x squares and y squares
270  sum_x_square += xi * xi;
271  sum_y_square += yi * yi;
272  // Append x and y values to x and y vectors
273  x.push_back(xi);
274  y.push_back(yi);
275  }
276 }
277 
278 // Function to split data into training and testing sets
280  unsigned int seed,
281  bool shuffle) {
282  if (x.empty() || y.empty()) {
283  _log_.log(ERROR, "Training data is empty.");
284  return;
285  }
286 
287  if (test_size <= 0 || test_size >= 1) {
288  _log_.log(ERROR, "Invalid `test_size`. Must be between 0 - 1.");
289  return;
290  }
291 
292  size_t data_size = x.size();
293  size_t test_data_size = static_cast<size_t>(test_size * data_size);
294 
295  // Shuffle the data randomly if specified
296  if (shuffle == true) {
297  // Create vector of indices
298  std::vector<size_t> indices(data_size);
299  // Fill vector sequentially w/ `iota`
300  std::iota(indices.begin(), indices.end(), 0);
301  // Randomly shuffle vector indices from start to end
302  std::shuffle(indices.begin(),
303  indices.end(),
304  std::default_random_engine(seed));
305 
306  // Clear training and testing data vectors
307  x_train.clear();
308  y_train.clear();
309  x_test.clear();
310  y_test.clear();
311 
312  // Split the data into training and testing sets based on shuffled
313  // indices
314  for (size_t i = 0; i < data_size; ++i) {
315  if (i < test_data_size) {
316  // Append x test vector with shuffled x value
317  x_test.push_back(x[indices[i]]);
318  y_test.push_back(y[indices[i]]);
319  } else {
320  // Append x training vector with suffled x value
321  x_train.push_back(x[indices[i]]);
322  y_train.push_back(y[indices[i]]);
323  }
324  }
325  } else {
326  // Without shuffling, split the data without changing its order by
327  // assigning the first training element to the training set
328  x_train.assign(x.begin(), x.begin() + data_size - test_data_size);
329  y_train.assign(y.begin(), y.begin() + data_size - test_data_size);
330  // Assign the 1+ testing elements to the testing test
331  x_test.assign(x.begin() + data_size - test_data_size, x.end());
332  y_test.assign(y.begin() + data_size - test_data_size, y.end());
333  }
334 }
335 
336 // Function to display the dataset
338  for (int64_t i = 0; i < 62; i++) {
339  printf("_");
340  }
341  printf("\n\n");
342  printf("|%15s%5s %15s%5s%20s\n", "X", "", "Y", "", "|");
343 
344  // Display each data point in the dataset
345  for (int64_t i = 0; uint64_t(i) < x.size(); i++) {
346  printf("|%20Lf %20Lf%20s\n", x[i], y[i], "|");
347  }
348 
349  for (int64_t i = 0; i < 62; i++) {
350  printf("_");
351  }
352  printf("\n");
353 }
354 
355 // Function to predict a value based on input x
356 long double gpmp::ml::LinearRegression::predict(long double _x) const {
357  return coeff * _x + constant;
358 }
359 
360 // Function to predict a value based on input x and a dataset
361 long double
363  const std::vector<long double> &x_data) {
364  // Calculate the coefficient if not already calculated
365  long double _coeff = return_coeffecient();
366  // Calculate the constant if not already calculated
367  long double _constant = return_constant();
368  // TODO FIXME unused variable
369  return _coeff * _x + _constant + x_data[0];
370 }
371 
372 // Function to calculate the error for a given value
373 long double gpmp::ml::LinearRegression::error_in(long double num) {
374  for (int64_t i = 0; uint64_t(i) < x.size(); i++) {
375  if (fabs(num - x[i]) < std::numeric_limits<double>::epsilon()) {
376  return (y[i] - predict(x[i]));
377  }
378  }
379  return 0;
380 }
381 
382 // Function to calculate the error for a given value and a dataset
383 long double
385  const std::vector<long double> &x_data,
386  const std::vector<long double> &y_data) {
387  long double prediction = predict(num, x_data);
388  // Find the corresponding y value for the input x
389  for (size_t i = 0; i < x_data.size(); i++) {
390  if (fabs(num - x_data[i]) < std::numeric_limits<double>::epsilon()) {
391  return y_data[i] - prediction;
392  }
393  }
394  return 0;
395 }
396 
397 // Function that returns the overall sum of the square of errors
399  long double ans = 0;
400 
401  // Iterate through each data point
402  for (int64_t i = 0; uint64_t(i) < x.size(); i++) {
403  // Calculate the square of the error (difference between predicted and
404  // actual values)
405  ans += ((predict(x[i]) - y[i]) * (predict(x[i]) - y[i]));
406  }
407  return ans; // Return the sum of squared errors
408 }
409 
410 // Find the Mean Squared Error (MSE) of the given dataset
411 long double
412 gpmp::ml::LinearRegression::mse(const std::vector<long double> &x_data,
413  const std::vector<long double> &y_data) const {
414  // Check if input vectors have the same size
415  if (x_data.size() != y_data.size()) {
416  return -1; // Return an error value
417  }
418 
419  long double mse = 0.0;
420  int64_t n = x_data.size();
421 
422  // Iterate through each data point
423  for (int64_t i = 0; i < n; i++) {
424  // Calculate the predicted value using the linear regression model
425  long double predicted = predict(x_data[i]);
426  // Calculate the error (difference between predicted and actual values)
427  long double error = predicted - y_data[i];
428  // Accumulate the squared error
429  mse += error * error;
430  }
431 
432  // Calculate the Mean Squared Error by dividing the accumulated squared
433  // error by the number of data points
434  return mse / static_cast<long double>(n);
435 }
436 
437 // Determine an R^2 score value
439  const std::vector<long double> &x_data,
440  const std::vector<long double> &y_data) const {
441  // Check if input vectors have the same size
442  if (x_data.size() != y_data.size()) {
443  _log_.log(ERROR, "Input vectors must have the same size.");
444  return -1; // Return an error value
445  }
446 
447  long double total_sum_of_squares = 0.0;
448  long double sum_of_squared_errors = 0.0;
449  int64_t n = x_data.size();
450 
451  long double y_mean = 0.0;
452  for (int64_t i = 0; i < n; i++) {
453  // Calculate the mean of the dependent variable (Y)
454  y_mean += y_data[i];
455  }
456  y_mean /= static_cast<long double>(n);
457 
458  // Iterate through each data point
459  for (int64_t i = 0; i < n; i++) {
460  // Calculate the predicted value using the linear regression model
461  long double predicted = predict(x_data[i]);
462  // Calculate the error (difference between predicted and actual values)
463  long double error = predicted - y_data[i];
464  // Calculate the total sum of squares and sum of squared errors
465  total_sum_of_squares += (y_data[i] - y_mean) * (y_data[i] - y_mean);
466  sum_of_squared_errors += error * error;
467  }
468 
469  // Calculate the R-squared value using the formula 1 - (SSE / SST)
470  return 1.0 - (sum_of_squared_errors / total_sum_of_squares);
471 }
472 
473 // Determine number of rows in given dataset
474 int64_t gpmp::ml::LinearRegression::num_rows(const char *input) {
475  int64_t num = 0;
476  std::string row;
477 
478  // create input file stream
479  std::ifstream file(input);
480 
481  while (getline(file, row)) {
482  num++;
483  }
484 
485  return num;
486 }
const int N
void log(LogLevel level, const std::string &message)
Logs a message with the specified log level.
Definition: utils.cpp:77
void best_fit()
Calculates and displays the best fitting line based on training data.
Definition: linreg.cpp:110
LinearRegression()
Constructor for LinearRegression.
Definition: linreg.cpp:54
void show_data()
Display the data set.
Definition: linreg.cpp:337
long double mse(const std::vector< long double > &x_data, const std::vector< long double > &y_data) const
Calculates the Mean Squared Error (MSE) for a dataset.
Definition: linreg.cpp:412
long double error_in(long double num)
Calculates the error (residual) for a given independent variable value.
Definition: linreg.cpp:373
long double predict(long double _x) const
Predict a value based on the input.
Definition: linreg.cpp:356
long double return_coeffecient()
Get the coefficient/slope of the best fitting line.
Definition: linreg.cpp:94
long double sum_x_square
Definition: linreg.hpp:70
int64_t data_size()
Get the number of entries (xi, yi) in the data set.
Definition: linreg.cpp:89
long double error_square()
Calculates the sum of squared errors for the entire dataset.
Definition: linreg.cpp:398
void split_data(double test_size, unsigned int seed, bool shuffle)
Splits the data into training and testing sets.
Definition: linreg.cpp:279
long double r_sqrd(const std::vector< long double > &x_data, const std::vector< long double > &y_data) const
Calculate the coefficient of determination (R-squared).
Definition: linreg.cpp:438
long double sum_y_square
Definition: linreg.hpp:72
void calculate_coeffecient()
Calculates the coefficient/slope of the best fitting line.
Definition: linreg.cpp:66
long double return_constant()
Get the constant term of the best fitting line.
Definition: linreg.cpp:102
int64_t num_rows(const char *input)
Calculate the number of rows in a file.
Definition: linreg.cpp:474
void get_input(const std::vector< long double > &x_data, const std::vector< long double > &y_data)
Sets the input data for the LinearRegression class from two vectors.
Definition: linreg.cpp:157
void calculate_constant()
Calculate the constant term of the best fitting line.
Definition: linreg.cpp:80
static gpmp::core::Logger _log_
Definition: linreg.cpp:48
std::pair< std::vector< std::string >, std::vector< std::vector< std::string > > > DataTableStr
Definition: datatable.hpp:65
Miscellaneous utilities methods related to openGPMP.
@ ERROR
Definition: utils.hpp:48
@ INFO
Definition: utils.hpp:48
@ WARNING
Definition: utils.hpp:48