68 long double N = x.size();
70 long double numerator = (
N * sum_xy - sum_x * sum_y);
71 long double denominator = (
N * sum_x_square - sum_x * sum_x);
73 coeff = numerator / denominator;
81 long double N = x.size();
82 long double numerator = (sum_y * sum_x_square - sum_x * sum_xy);
83 long double denominator = (
N * sum_x_square - sum_x * sum_x);
85 constant = numerator / denominator;
95 if (fabs(coeff - 0.0f) < std::numeric_limits<double>::epsilon()) {
96 calculate_coeffecient();
103 if (fabs(constant - 0.0f) < std::numeric_limits<double>::epsilon()) {
104 calculate_constant();
111 if (x_train.empty() || y_train.empty()) {
115 if (fabs(coeff - 0.0f) < std::numeric_limits<double>::epsilon() &&
116 fabs(constant - 0.0f) < std::numeric_limits<double>::epsilon()) {
118 calculate_coeffecient();
119 calculate_constant();
123 "Best fitting line : y = " + std::to_string(coeff) +
"x + " +
124 std::to_string(constant));
129 long double N = x_train.size();
130 long double sum_xy_train = 0;
131 long double sum_x_train = 0;
132 long double sum_y_train = 0;
133 long double sum_x_square_train = 0;
135 for (
size_t i = 0; i <
N; i++) {
137 sum_xy_train += x_train[i] * y_train[i];
138 sum_x_train += x_train[i];
139 sum_y_train += y_train[i];
140 sum_x_square_train += x_train[i] * x_train[i];
143 long double numerator = (
N * sum_xy_train - sum_x_train * sum_y_train);
144 long double denominator =
145 (
N * sum_x_square_train - sum_x_train * sum_x_train);
148 coeff = numerator / denominator;
149 constant = (sum_y_train - coeff * sum_x_train) /
N;
152 "Best fitting line : y = " + std::to_string(coeff) +
"x + " +
153 std::to_string(constant));
158 const std::vector<long double> &x_data,
159 const std::vector<long double> &y_data) {
170 if (x_data.size() != y_data.size()) {
176 for (
size_t i = 0; i < x_data.size(); i++) {
178 x.push_back(x_data[i]);
179 y.push_back(y_data[i]);
181 sum_xy += x_data[i] * y_data[i];
186 sum_x_square += x_data[i] * x_data[i];
187 sum_y_square += y_data[i] * y_data[i];
194 const std::vector<std::string> &columns) {
205 if (columns.size() < 2) {
206 _log_.
log(
ERROR,
"Input vectors must have at least 2 column names.");
211 std::vector<size_t> column_indices;
212 for (
const auto &column_name : columns) {
214 for (
size_t i = 0; i < data.first.size(); ++i) {
215 if (data.first[i] == column_name) {
216 column_indices.push_back(i);
223 "Column '" + column_name +
224 "' not found in DataTableStr.");
229 for (
const auto &row : data.second) {
231 long double xi = std::stold(row[column_indices[0]]);
232 long double yi = std::stold(row[column_indices[1]]);
242 sum_x_square += xi * xi;
243 sum_y_square += yi * yi;
244 }
catch (
const std::exception &e) {
246 _log_.
log(
ERROR,
"Error parsing data: " + std::string(e.what()));
254 int n = num_rows(file);
255 for (int64_t i = 0; i < n; i++) {
263 std::cin >> xi >> comma >> yi;
270 sum_x_square += xi * xi;
271 sum_y_square += yi * yi;
282 if (x.empty() || y.empty()) {
287 if (test_size <= 0 || test_size >= 1) {
288 _log_.
log(
ERROR,
"Invalid `test_size`. Must be between 0 - 1.");
292 size_t data_size = x.size();
293 size_t test_data_size =
static_cast<size_t>(test_size * data_size);
296 if (shuffle ==
true) {
298 std::vector<size_t> indices(data_size);
300 std::iota(indices.begin(), indices.end(), 0);
302 std::shuffle(indices.begin(),
304 std::default_random_engine(seed));
314 for (
size_t i = 0; i < data_size; ++i) {
315 if (i < test_data_size) {
317 x_test.push_back(x[indices[i]]);
318 y_test.push_back(y[indices[i]]);
321 x_train.push_back(x[indices[i]]);
322 y_train.push_back(y[indices[i]]);
328 x_train.assign(x.begin(), x.begin() + data_size - test_data_size);
329 y_train.assign(y.begin(), y.begin() + data_size - test_data_size);
331 x_test.assign(x.begin() + data_size - test_data_size, x.end());
332 y_test.assign(y.begin() + data_size - test_data_size, y.end());
338 for (int64_t i = 0; i < 62; i++) {
342 printf(
"|%15s%5s %15s%5s%20s\n",
"X",
"",
"Y",
"",
"|");
345 for (int64_t i = 0; uint64_t(i) < x.size(); i++) {
346 printf(
"|%20Lf %20Lf%20s\n", x[i], y[i],
"|");
349 for (int64_t i = 0; i < 62; i++) {
357 return coeff * _x + constant;
363 const std::vector<long double> &x_data) {
365 long double _coeff = return_coeffecient();
367 long double _constant = return_constant();
369 return _coeff * _x + _constant + x_data[0];
374 for (int64_t i = 0; uint64_t(i) < x.size(); i++) {
375 if (fabs(num - x[i]) < std::numeric_limits<double>::epsilon()) {
376 return (y[i] - predict(x[i]));
385 const std::vector<long double> &x_data,
386 const std::vector<long double> &y_data) {
387 long double prediction = predict(num, x_data);
389 for (
size_t i = 0; i < x_data.size(); i++) {
390 if (fabs(num - x_data[i]) < std::numeric_limits<double>::epsilon()) {
391 return y_data[i] - prediction;
402 for (int64_t i = 0; uint64_t(i) < x.size(); i++) {
405 ans += ((predict(x[i]) - y[i]) * (predict(x[i]) - y[i]));
413 const std::vector<long double> &y_data)
const {
415 if (x_data.size() != y_data.size()) {
419 long double mse = 0.0;
420 int64_t n = x_data.size();
423 for (int64_t i = 0; i < n; i++) {
425 long double predicted = predict(x_data[i]);
427 long double error = predicted - y_data[i];
429 mse += error * error;
434 return mse /
static_cast<long double>(n);
439 const std::vector<long double> &x_data,
440 const std::vector<long double> &y_data)
const {
442 if (x_data.size() != y_data.size()) {
447 long double total_sum_of_squares = 0.0;
448 long double sum_of_squared_errors = 0.0;
449 int64_t n = x_data.size();
451 long double y_mean = 0.0;
452 for (int64_t i = 0; i < n; i++) {
456 y_mean /=
static_cast<long double>(n);
459 for (int64_t i = 0; i < n; i++) {
461 long double predicted = predict(x_data[i]);
463 long double error = predicted - y_data[i];
465 total_sum_of_squares += (y_data[i] - y_mean) * (y_data[i] - y_mean);
466 sum_of_squared_errors += error * error;
470 return 1.0 - (sum_of_squared_errors / total_sum_of_squares);
479 std::ifstream file(input);
481 while (getline(file, row)) {
void log(LogLevel level, const std::string &message)
Logs a message with the specified log level.
void best_fit()
Calculates and displays the best fitting line based on training data.
LinearRegression()
Constructor for LinearRegression.
void show_data()
Display the data set.
long double mse(const std::vector< long double > &x_data, const std::vector< long double > &y_data) const
Calculates the Mean Squared Error (MSE) for a dataset.
long double error_in(long double num)
Calculates the error (residual) for a given independent variable value.
long double predict(long double _x) const
Predict a value based on the input.
long double return_coeffecient()
Get the coefficient/slope of the best fitting line.
int64_t data_size()
Get the number of entries (xi, yi) in the data set.
long double error_square()
Calculates the sum of squared errors for the entire dataset.
void split_data(double test_size, unsigned int seed, bool shuffle)
Splits the data into training and testing sets.
long double r_sqrd(const std::vector< long double > &x_data, const std::vector< long double > &y_data) const
Calculate the coefficient of determination (R-squared).
void calculate_coeffecient()
Calculates the coefficient/slope of the best fitting line.
long double return_constant()
Get the constant term of the best fitting line.
int64_t num_rows(const char *input)
Calculate the number of rows in a file.
void get_input(const std::vector< long double > &x_data, const std::vector< long double > &y_data)
Sets the input data for the LinearRegression class from two vectors.
void calculate_constant()
Calculate the constant term of the best fitting line.
static gpmp::core::Logger _log_
std::pair< std::vector< std::string >, std::vector< std::vector< std::string > > > DataTableStr
Miscellaneous utility methods related to openGPMP.