Line data Source code
1 : /*************************************************************************
2 : *
3 : * Project
4 : * _____ _____ __ __ _____
5 : * / ____| __ \| \/ | __ \
6 : * ___ _ __ ___ _ __ | | __| |__) | \ / | |__) |
7 : * / _ \| '_ \ / _ \ '_ \| | |_ | ___/| |\/| | ___/
8 : *| (_) | |_) | __/ | | | |__| | | | | | | |
9 : * \___/| .__/ \___|_| |_|\_____|_| |_| |_|_|
10 : * | |
11 : * |_|
12 : *
13 : * Copyright (C) Akiel Aries, <akiel@akiel.org>, et al.
14 : *
15 : * This software is licensed as described in the file LICENSE, which
16 : * you should have received as part of this distribution. The terms
17 : * among other details are referenced in the official documentation
18 : * seen here : https://akielaries.github.io/openGPMP/ along with
19 : * important files seen in this project.
20 : *
21 : * You may opt to use, copy, modify, merge, publish, distribute
22 : * and/or sell copies of the Software, and permit persons to whom
23 : * the Software is furnished to do so, under the terms of the
24 : * LICENSE file. As this is an Open Source effort, all implementations
25 : * must be of the same methodology.
26 : *
27 : *
28 : *
29 : * This software is distributed on an AS IS basis, WITHOUT
30 : * WARRANTY OF ANY KIND, either express or implied.
31 : *
32 : ************************************************************************/
33 :
34 : /**
35 : * openGPMP implementation of Linear Regression
36 : */
37 : #include <fstream>
38 : #include <iostream>
39 : #include <openGPMP/core/datatable.hpp>
40 : #include <openGPMP/core/utils.hpp>
41 : #include <openGPMP/ml/linreg.hpp>
42 : #include <random>
43 : #include <stdio.h>
44 : #include <string>
45 : #include <vector>
46 :
47 : /** Logger class object*/
48 : static gpmp::core::Logger _log_;
49 :
50 : /*
51 : * Constructor to provide the default values to all the terms in the
52 : * object of class regression
53 : */
54 0 : gpmp::ml::LinearRegression::LinearRegression() {
55 0 : coeff = 0;
56 0 : constant = 0;
57 0 : sum_y = 0;
58 0 : sum_y_square = 0;
59 0 : sum_x_square = 0;
60 0 : sum_x = 0;
61 0 : sum_xy = 0;
62 0 : }
63 :
64 : // Function that calculate the coefficient/slope of the best fitting
65 : // line
66 0 : void gpmp::ml::LinearRegression::calculate_coeffecient() {
67 : // get number of datapoints
68 0 : long double N = x.size();
69 : // calculate numerator and denominator
70 0 : long double numerator = (N * sum_xy - sum_x * sum_y);
71 0 : long double denominator = (N * sum_x_square - sum_x * sum_x);
72 : // calculate the coeffecient
73 0 : coeff = numerator / denominator;
74 0 : }
75 :
76 : /*
77 : * Member function that will calculate the constant term of the best
78 : * fitting line
79 : */
80 0 : void gpmp::ml::LinearRegression::calculate_constant() {
81 0 : long double N = x.size();
82 0 : long double numerator = (sum_y * sum_x_square - sum_x * sum_xy);
83 0 : long double denominator = (N * sum_x_square - sum_x * sum_x);
84 : // calculate constant
85 0 : constant = numerator / denominator;
86 0 : }
87 :
88 : // Function that return the number of entries (xi, yi) in the data set
89 0 : int64_t gpmp::ml::LinearRegression::data_size() {
90 0 : return x.size();
91 : }
92 :
93 : // Function that return the coefficient/slope of the best fitting line
94 0 : long double gpmp::ml::LinearRegression::return_coeffecient() {
95 0 : if (fabs(coeff - 0.0f) < std::numeric_limits<double>::epsilon()) {
96 0 : calculate_coeffecient();
97 : }
98 0 : return coeff;
99 : }
100 :
101 : // Function that return the constant term of the best fitting line
102 0 : long double gpmp::ml::LinearRegression::return_constant() {
103 0 : if (fabs(constant - 0.0f) < std::numeric_limits<double>::epsilon()) {
104 0 : calculate_constant();
105 : }
106 0 : return constant;
107 : }
108 :
// Computes and logs the least-squares best fitting line
// "y = coeff * x + constant".
//
// When a train/test split exists (see split_data) the parameters are
// fitted from x_train / y_train. Otherwise the method warns and falls
// back to the sums accumulated over the full dataset by get_input,
// reusing any previously computed coeff/constant.
void gpmp::ml::LinearRegression::best_fit() {
    if (x_train.empty() || y_train.empty()) {
        // No training split available: warn and fall back to the
        // whole-dataset accumulators.
        _log_.log(WARNING, "Training data is empty.");

        if (fabs(coeff - 0.0f) < std::numeric_limits<double>::epsilon() &&
            fabs(constant - 0.0f) < std::numeric_limits<double>::epsilon()) {
            // Both parameters (near) zero means "not yet fitted":
            // derive them from the accumulated sums.
            calculate_coeffecient();
            calculate_constant();
        }
        // Log the resulting line equation and stop.
        _log_.log(INFO,
                  "Best fitting line : y = " + std::to_string(coeff) + "x + " +
                      std::to_string(constant));
        return;
    }

    // Fit from the training split: accumulate the sums required by the
    // closed-form least-squares solution.
    long double N = x_train.size();
    long double sum_xy_train = 0;
    long double sum_x_train = 0;
    long double sum_y_train = 0;
    long double sum_x_square_train = 0;

    // NOTE(review): size_t loop index compared against a long double
    // bound; works for realistic sizes but mixes integral and
    // floating-point types.
    for (size_t i = 0; i < N; i++) {
        // Per-point contributions to the training sums.
        sum_xy_train += x_train[i] * y_train[i];
        sum_x_train += x_train[i];
        sum_y_train += y_train[i];
        sum_x_square_train += x_train[i] * x_train[i];
    }

    long double numerator = (N * sum_xy_train - sum_x_train * sum_y_train);
    long double denominator =
        (N * sum_x_square_train - sum_x_train * sum_x_train);

    // Slope and intercept of the fitted line.
    // NOTE(review): denominator is zero when every x_train value is
    // identical, yielding NaN/inf here — confirm upstream guarantees.
    coeff = numerator / denominator;
    constant = (sum_y_train - coeff * sum_x_train) / N;
    // Log the fitted line equation.
    _log_.log(INFO,
              "Best fitting line : y = " + std::to_string(coeff) + "x + " +
                  std::to_string(constant));
}
155 :
156 : // Function to accept input data in the form of two vectors
157 0 : void gpmp::ml::LinearRegression::get_input(
158 : const std::vector<long double> &x_data,
159 : const std::vector<long double> &y_data) {
160 : // Clear existing data from x and y vectors
161 0 : x.clear();
162 0 : y.clear();
163 : // Initialize LinearRegression class variables
164 0 : sum_xy = 0; /* Set x*y sum */
165 0 : sum_x = 0; /* Set sum of x */
166 0 : sum_y = 0; /* Set sum of y */
167 0 : sum_x_square = 0; /* Set sum of x squares */
168 0 : sum_y_square = 0; /* Set sum of y squares */
169 :
170 0 : if (x_data.size() != y_data.size()) {
171 : // Check if input vectors have the same size
172 0 : _log_.log(ERROR, "Input vectors must have the same size");
173 0 : return;
174 : }
175 :
176 0 : for (size_t i = 0; i < x_data.size(); i++) {
177 : // Append x and y values to x and y vectors
178 0 : x.push_back(x_data[i]);
179 0 : y.push_back(y_data[i]);
180 : // Update sum of (x * y)
181 0 : sum_xy += x_data[i] * y_data[i];
182 : // Update sum of x and y
183 0 : sum_x += x_data[i];
184 0 : sum_y += y_data[i];
185 : // Update sum of x squares and y squares
186 0 : sum_x_square += x_data[i] * x_data[i];
187 0 : sum_y_square += y_data[i] * y_data[i];
188 : }
189 : }
190 :
// Loads a dataset from a DataTableStr. `columns` names the columns to
// use: columns[0] supplies x and columns[1] supplies y; cell values
// are parsed from strings with std::stold. Rows that fail to parse are
// logged and skipped rather than aborting the load.
void gpmp::ml::LinearRegression::get_input(
    const gpmp::core::DataTableStr &data,
    const std::vector<std::string> &columns) {
    // Discard any previously loaded dataset and reset accumulators.
    x.clear();
    y.clear();
    sum_xy = 0;
    sum_x = 0;
    sum_y = 0;
    sum_x_square = 0;
    sum_y_square = 0;

    // Need at least an x column name and a y column name.
    if (columns.size() < 2) {
        _log_.log(ERROR, "Input vectors must have at least 2 column names.");
        return;
    }

    // Resolve each requested column name to its index in the header
    // row (data.first); abort the whole load if any name is missing.
    std::vector<size_t> column_indices;
    for (const auto &column_name : columns) {
        bool found = false;
        for (size_t i = 0; i < data.first.size(); ++i) {
            if (data.first[i] == column_name) {
                column_indices.push_back(i);
                found = true;
                break;
            }
        }
        if (!found) {
            _log_.log(ERROR,
                      "Column '" + column_name +
                          "' not found in DataTableStr.");
            return;
        }
    }

    // Parse every data row (data.second) and update the accumulators.
    for (const auto &row : data.second) {
        try {
            long double xi = std::stold(row[column_indices[0]]);
            long double yi = std::stold(row[column_indices[1]]);
            // Append x and y values to x and y vectors
            x.push_back(xi);
            y.push_back(yi);
            // Update sum of (x * y)
            sum_xy += xi * yi;
            // Update sum of x and y
            sum_x += xi;
            sum_y += yi;
            // Update sum of x squares and y squares
            sum_x_square += xi * xi;
            sum_y_square += yi * yi;
        } catch (const std::exception &e) {
            // std::stold throws invalid_argument/out_of_range on bad
            // cells; log and continue with the next row.
            _log_.log(ERROR, "Error parsing data: " + std::string(e.what()));
            continue;
        }
    }
}
251 :
252 : // Function to accept input data from a file
253 0 : void gpmp::ml::LinearRegression::get_input(const char *file) {
254 0 : int n = num_rows(file);
255 0 : for (int64_t i = 0; i < n; i++) {
256 : /*
257 : * In a csv file, all the values of xi and yi are separated by
258 : * commas
259 : */
260 : char comma;
261 : long double xi;
262 : long double yi;
263 0 : std::cin >> xi >> comma >> yi;
264 : // Update sum of (x * y)
265 0 : sum_xy += xi * yi;
266 : // Update sum of x and y
267 0 : sum_x += xi;
268 0 : sum_y += yi;
269 : // Update sum of x squares and y squares
270 0 : sum_x_square += xi * xi;
271 0 : sum_y_square += yi * yi;
272 : // Append x and y values to x and y vectors
273 0 : x.push_back(xi);
274 0 : y.push_back(yi);
275 : }
276 0 : }
277 :
278 : // Function to split data into training and testing sets
279 0 : void gpmp::ml::LinearRegression::split_data(double test_size,
280 : unsigned int seed,
281 : bool shuffle) {
282 0 : if (x.empty() || y.empty()) {
283 0 : _log_.log(ERROR, "Training data is empty.");
284 0 : return;
285 : }
286 :
287 0 : if (test_size <= 0 || test_size >= 1) {
288 0 : _log_.log(ERROR, "Invalid `test_size`. Must be between 0 - 1.");
289 0 : return;
290 : }
291 :
292 0 : size_t data_size = x.size();
293 0 : size_t test_data_size = static_cast<size_t>(test_size * data_size);
294 :
295 : // Shuffle the data randomly if specified
296 0 : if (shuffle == true) {
297 : // Create vector of indices
298 0 : std::vector<size_t> indices(data_size);
299 : // Fill vector sequentially w/ `iota`
300 0 : std::iota(indices.begin(), indices.end(), 0);
301 : // Randomly shuffle vector indices from start to end
302 0 : std::shuffle(indices.begin(),
303 : indices.end(),
304 0 : std::default_random_engine(seed));
305 :
306 : // Clear training and testing data vectors
307 0 : x_train.clear();
308 0 : y_train.clear();
309 0 : x_test.clear();
310 0 : y_test.clear();
311 :
312 : // Split the data into training and testing sets based on shuffled
313 : // indices
314 0 : for (size_t i = 0; i < data_size; ++i) {
315 0 : if (i < test_data_size) {
316 : // Append x test vector with shuffled x value
317 0 : x_test.push_back(x[indices[i]]);
318 0 : y_test.push_back(y[indices[i]]);
319 : } else {
320 : // Append x training vector with suffled x value
321 0 : x_train.push_back(x[indices[i]]);
322 0 : y_train.push_back(y[indices[i]]);
323 : }
324 : }
325 0 : } else {
326 : // Without shuffling, split the data without changing its order by
327 : // assigning the first training element to the training set
328 0 : x_train.assign(x.begin(), x.begin() + data_size - test_data_size);
329 0 : y_train.assign(y.begin(), y.begin() + data_size - test_data_size);
330 : // Assign the 1+ testing elements to the testing test
331 0 : x_test.assign(x.begin() + data_size - test_data_size, x.end());
332 0 : y_test.assign(y.begin() + data_size - test_data_size, y.end());
333 : }
334 : }
335 :
336 : // Function to display the dataset
337 0 : void gpmp::ml::LinearRegression::show_data() {
338 0 : for (int64_t i = 0; i < 62; i++) {
339 0 : printf("_");
340 : }
341 0 : printf("\n\n");
342 0 : printf("|%15s%5s %15s%5s%20s\n", "X", "", "Y", "", "|");
343 :
344 : // Display each data point in the dataset
345 0 : for (int64_t i = 0; uint64_t(i) < x.size(); i++) {
346 0 : printf("|%20Lf %20Lf%20s\n", x[i], y[i], "|");
347 : }
348 :
349 0 : for (int64_t i = 0; i < 62; i++) {
350 0 : printf("_");
351 : }
352 0 : printf("\n");
353 0 : }
354 :
355 : // Function to predict a value based on input x
356 0 : long double gpmp::ml::LinearRegression::predict(long double _x) const {
357 0 : return coeff * _x + constant;
358 : }
359 :
360 : // Function to predict a value based on input x and a dataset
361 : long double
362 0 : gpmp::ml::LinearRegression::predict(long double _x,
363 : const std::vector<long double> &x_data) {
364 : // Calculate the coefficient if not already calculated
365 0 : long double _coeff = return_coeffecient();
366 : // Calculate the constant if not already calculated
367 0 : long double _constant = return_constant();
368 : // TODO FIXME unused variable
369 0 : return _coeff * _x + _constant + x_data[0];
370 : }
371 :
372 : // Function to calculate the error for a given value
373 0 : long double gpmp::ml::LinearRegression::error_in(long double num) {
374 0 : for (int64_t i = 0; uint64_t(i) < x.size(); i++) {
375 0 : if (fabs(num - x[i]) < std::numeric_limits<double>::epsilon()) {
376 0 : return (y[i] - predict(x[i]));
377 : }
378 : }
379 0 : return 0;
380 : }
381 :
382 : // Function to calculate the error for a given value and a dataset
383 : long double
384 0 : gpmp::ml::LinearRegression::error_in(long double num,
385 : const std::vector<long double> &x_data,
386 : const std::vector<long double> &y_data) {
387 0 : long double prediction = predict(num, x_data);
388 : // Find the corresponding y value for the input x
389 0 : for (size_t i = 0; i < x_data.size(); i++) {
390 0 : if (fabs(num - x_data[i]) < std::numeric_limits<double>::epsilon()) {
391 0 : return y_data[i] - prediction;
392 : }
393 : }
394 0 : return 0;
395 : }
396 :
397 : // Function that returns the overall sum of the square of errors
398 0 : long double gpmp::ml::LinearRegression::error_square() {
399 0 : long double ans = 0;
400 :
401 : // Iterate through each data point
402 0 : for (int64_t i = 0; uint64_t(i) < x.size(); i++) {
403 : // Calculate the square of the error (difference between predicted and
404 : // actual values)
405 0 : ans += ((predict(x[i]) - y[i]) * (predict(x[i]) - y[i]));
406 : }
407 0 : return ans; // Return the sum of squared errors
408 : }
409 :
410 : // Find the Mean Squared Error (MSE) of the given dataset
411 : long double
412 0 : gpmp::ml::LinearRegression::mse(const std::vector<long double> &x_data,
413 : const std::vector<long double> &y_data) const {
414 : // Check if input vectors have the same size
415 0 : if (x_data.size() != y_data.size()) {
416 0 : return -1; // Return an error value
417 : }
418 :
419 0 : long double mse = 0.0;
420 0 : int64_t n = x_data.size();
421 :
422 : // Iterate through each data point
423 0 : for (int64_t i = 0; i < n; i++) {
424 : // Calculate the predicted value using the linear regression model
425 0 : long double predicted = predict(x_data[i]);
426 : // Calculate the error (difference between predicted and actual values)
427 0 : long double error = predicted - y_data[i];
428 : // Accumulate the squared error
429 0 : mse += error * error;
430 : }
431 :
432 : // Calculate the Mean Squared Error by dividing the accumulated squared
433 : // error by the number of data points
434 0 : return mse / static_cast<long double>(n);
435 : }
436 :
437 : // Determine an R^2 score value
438 0 : long double gpmp::ml::LinearRegression::r_sqrd(
439 : const std::vector<long double> &x_data,
440 : const std::vector<long double> &y_data) const {
441 : // Check if input vectors have the same size
442 0 : if (x_data.size() != y_data.size()) {
443 0 : _log_.log(ERROR, "Input vectors must have the same size.");
444 0 : return -1; // Return an error value
445 : }
446 :
447 0 : long double total_sum_of_squares = 0.0;
448 0 : long double sum_of_squared_errors = 0.0;
449 0 : int64_t n = x_data.size();
450 :
451 0 : long double y_mean = 0.0;
452 0 : for (int64_t i = 0; i < n; i++) {
453 : // Calculate the mean of the dependent variable (Y)
454 0 : y_mean += y_data[i];
455 : }
456 0 : y_mean /= static_cast<long double>(n);
457 :
458 : // Iterate through each data point
459 0 : for (int64_t i = 0; i < n; i++) {
460 : // Calculate the predicted value using the linear regression model
461 0 : long double predicted = predict(x_data[i]);
462 : // Calculate the error (difference between predicted and actual values)
463 0 : long double error = predicted - y_data[i];
464 : // Calculate the total sum of squares and sum of squared errors
465 0 : total_sum_of_squares += (y_data[i] - y_mean) * (y_data[i] - y_mean);
466 0 : sum_of_squared_errors += error * error;
467 : }
468 :
469 : // Calculate the R-squared value using the formula 1 - (SSE / SST)
470 0 : return 1.0 - (sum_of_squared_errors / total_sum_of_squares);
471 : }
472 :
473 : // Determine number of rows in given dataset
474 0 : int64_t gpmp::ml::LinearRegression::num_rows(const char *input) {
475 0 : int64_t num = 0;
476 0 : std::string row;
477 :
478 : // create input file stream
479 0 : std::ifstream file(input);
480 :
481 0 : while (getline(file, row)) {
482 0 : num++;
483 : }
484 :
485 0 : return num;
486 0 : }
|