openGPMP
Open Source Mathematics Package
linreg.cpp
Go to the documentation of this file.
1 
10 #include <iostream>
11 #include <stdio.h>
12 // include all of the openGPMP headers
13 #include <openGPMP/gpmp.h>
14 
17 
18 void test_train() {
19  _log_.log(INFO, "OpenGPMP LinearRegression class");
20 
22 
24  dt.csv_read("../../data/school_scores.csv", {"Year", "GPA"});
25 
26  // Create a LinearRegression object
28 
29  std::vector<std::string> columns = {"Year", "GPA"};
30 
31  // Load data into the LinearRegression object
32  reg.get_input(result, columns);
33 
34  _log_.log(INFO, "Before splitting");
35 
36  // Printing the best fitting line before splitting
37  reg.best_fit();
38 
39  int v1 = 2007;
40  double v1_v = reg.predict(v1, reg.x);
41  double v1_e = reg.error_in(v1, reg.x, reg.y);
42  _log_.log(INFO,
43  "Predicted value at " + std::to_string(v1) + " = " +
44  std::to_string(v1_v));
45 
46  _log_.log(INFO,
47  "Error value at " + std::to_string(v1) + " = " +
48  std::to_string(v1_e));
49 
50  // Now, perform data splitting
51  reg.split_data(0.3, 42, false); // 70% for training, 30% for testing
52 
53  _log_.log(INFO, "After splitting");
54 
55  // Printing the best fitting line after splitting
56  reg.best_fit();
57 
58  double v1_v_after_split = reg.predict(v1, reg.x_test);
59  double v1_e_after_split = reg.error_in(v1, reg.x_test, reg.y_test);
60  printf("Predicted value at %d = %f\n", v1, v1_v_after_split);
61  printf("Error value at %d = %f\n\n", v1, v1_e_after_split);
62  _log_.log(INFO,
63  "Predicted value at " + std::to_string(v1) + " = " +
64  std::to_string(v1_v_after_split));
65 
66  _log_.log(INFO,
67  "Error value at " + std::to_string(v1) + " = " +
68  std::to_string(v1_e_after_split));
69 
70  // Calculate MSE and R2 score after splitting
71  double mse_after_split = reg.mse(reg.x_test, reg.y_test);
72  double r_squared_after_split = reg.r_sqrd(reg.x_test, reg.y_test);
73 
74  _log_.log(INFO, "MSE = " + std::to_string(mse_after_split));
75  _log_.log(INFO, "R2 score = " + std::to_string(r_squared_after_split));
76 }
77 
78 int main() {
79  test_train();
80  // gpmp::ml::LinearRegression reg;
81  // gpmp::core::DataTable dt;
82  // gpmp::core::DataTableStr result =
83  // dt.csv_read("../../data/forestfires.csv",
84  // {"month", "day", "temp",
85  // "wind"});
86 
87  /*gpmp::core::DataTableStr result =
88  dt.csv_read("../../data/school_scores.csv", {"Year", "GPA"});
89 
90  dt.display(result);
91  std::vector<std::string> columns = {"Year", "GPA"};
92  // pass in the datatable (result) and specify x , y columns
93  reg.get_input(result, columns);
94 
95  // split data into 75% training and 25% testing
96  reg.split_data(0.25);
97 
98  // declare Regression class object
99 
100  printf("LINEAR REGRESSION EXAMPLE ON YEAR/GPA DATA IN "
101  "MATHEMATICS\n");
102 
103  // Printing the best fitting line
104  reg.best_fit();
105 
106  int v1 = 1995;
107  double v1_v = reg.predict(v1);
108  double v1_e = reg.error_in(v1);
109  printf("Predicted value at %d = %f\n", v1, v1_v);
110  printf("Error value at %d = %f\n\n", v1, v1_e);
111 
112  int v2 = 1997;
113  double v2_v = reg.predict(v2);
114  double v2_e = reg.error_in(v2);
115  printf("Predicted value at %d = %f\n", v2, v2_v);
116  printf("Error value at %d = %f\n\n", v2, v2_e);
117 
118  int v3 = 1999;
119  double v3_v = reg.predict(v3);
120  double v3_e = reg.error_in(v3);
121  printf("Predicted value at %d = %f\n", v3, v3_v);
122  printf("Error value at %d = %f\n\n", v3, v3_e);
123 
124  int v4 = 2001;
125  double v4_v = reg.predict(v4);
126  double v4_e = reg.error_in(v4);
127  printf("Predicted value at %d = %f\n", v4, v4_v);
128  printf("Error value at %d = %f\n\n", v4, v4_e);
129 
130  int v5 = 2003;
131  double v5_v = reg.predict(v5);
132  double v5_e = reg.error_in(v5);
133  printf("Predicted value at %d = %f\n", v5, v5_v);
134  printf("Error value at %d = %f\n\n", v5, v5_e);
135 
136  int v6 = 2005;
137  double v6_v = reg.predict(v6);
138  double v6_e = reg.error_in(v6);
139  printf("Predicted value at %d = %f\n", v6, v6_v);
140  printf("Error value at %d = %f\n\n", v6, v6_e);
141 
142  int v7 = 2006;
143  double v7_v = reg.predict(v7);
144  double v7_e = reg.error_in(v7);
145  printf("Predicted value at %d = %f\n", v7, v7_v);
146  printf("Error value at %d = %f\n\n", v7, v7_e);
147 
148  int v8 = 2007;
149  double v8_v = reg.predict(v8);
150  double v8_e = reg.error_in(v8);
151  printf("Predicted value at %d = %f\n", v8, v8_v);
152  printf("Error value at %d = %f\n\n", v8, v8_e);
153 
154  int v10 = 2016;
155  double v10_v = reg.predict(v10);
156  double v10_e = reg.error_in(v10);
157  printf("Predicted value at %d = %f\n", v10, v10_v);
158  printf("Error value at %d = %f\n\n", v10, v10_e);
159 
160  int v11 = 2017;
161  double v11_v = reg.predict(v11);
162  double v11_e = reg.error_in(v11);
163  printf("Predicted value at %d = %f\n", v11, v11_v);
164  printf("Error value at %d = %f\n\n", v11, v11_e);
165  */
166  return 0;
167 }
DataTableStr csv_read(std::string filename, std::vector< std::string > columns={})
Reads a CSV file and returns a DataTableStr. The file is parsed and all of its data is stored as strings.
Definition: datatable.cpp:57
void log(LogLevel level, const std::string &message)
Logs a message with the specified log level.
Definition: utils.cpp:77
void best_fit()
Calculates and displays the best fitting line based on training data.
Definition: linreg.cpp:110
long double mse(const std::vector< long double > &x_data, const std::vector< long double > &y_data) const
Calculates the Mean Squared Error (MSE) for a dataset.
Definition: linreg.cpp:412
long double error_in(long double num)
Calculates the error (residual) for a given independent variable value.
Definition: linreg.cpp:373
long double predict(long double _x) const
Predict a value based on the input.
Definition: linreg.cpp:356
std::vector< long double > y
Definition: linreg.hpp:58
void split_data(double test_size, unsigned int seed, bool shuffle)
Splits the data into training and testing sets.
Definition: linreg.cpp:279
std::vector< long double > x_test
Definition: linreg.hpp:78
long double r_sqrd(const std::vector< long double > &x_data, const std::vector< long double > &y_data) const
Calculate the coefficient of determination (R-squared).
Definition: linreg.cpp:438
std::vector< long double > y_test
Definition: linreg.hpp:80
std::vector< long double > x
Definition: linreg.hpp:56
void get_input(const std::vector< long double > &x_data, const std::vector< long double > &y_data)
Sets the input data for the LinearRegression class from two vectors.
Definition: linreg.cpp:157
std::pair< std::vector< std::string >, std::vector< std::vector< std::string > > > DataTableStr
Definition: datatable.hpp:65
void test_train()
Definition: linreg.cpp:18
static gpmp::core::Logger _log_
Definition: linreg.cpp:16
int main()
Definition: linreg.cpp:78
@ INFO
Definition: utils.hpp:48