openGPMP
Open Source Mathematics Package
datatable1.cpp
Go to the documentation of this file.
1 /*************************************************************************
2  *
3  * Project
4  * _____ _____ __ __ _____
5  * / ____| __ \| \/ | __ \
6  * ___ _ __ ___ _ __ | | __| |__) | \ / | |__) |
7  * / _ \| '_ \ / _ \ '_ \| | |_ | ___/| |\/| | ___/
8  *| (_) | |_) | __/ | | | |__| | | | | | | |
9  * \___/| .__/ \___|_| |_|\_____|_| |_| |_|_|
10  * | |
11  * |_|
12  *
13  * Copyright (C) Akiel Aries, <akiel@akiel.org>, et al.
14  *
15  * This software is licensed as described in the file LICENSE, which
16  * you should have received as part of this distribution. The terms
17  * among other details are referenced in the official documentation
18  * seen here : https://akielaries.github.io/openGPMP/ along with
19  * important files seen in this project.
20  *
21  * You may opt to use, copy, modify, merge, publish, distribute
22  * and/or sell copies of the Software, and permit persons to whom
23  * the Software is furnished to do so, under the terms of the
24  * LICENSE file.
25  *
26  *
27  *
28  * This software is distributed on an AS IS basis, WITHOUT
29  * WARRANTY OF ANY KIND, either express or implied.
30  *
31  ************************************************************************/
32 #include <algorithm>
33 #include <fstream>
34 #include <iomanip>
35 #include <iostream>
36 #include <map>
37 #include <numeric>
39 #include <openGPMP/core/utils.hpp>
40 #include <regex>
41 #include <sstream>
42 #include <string>
43 #include <typeinfo>
44 #include <unordered_map>
45 #include <variant>
46 #include <vector>
47 
50 
51 // create method to create datatable from scratch? insert, drop, etc?
52 
53 // TODO : optimize these methods, CSV reader using threads? loop unrolling?,
54 // etc? conversion functions to be quicker,
56 gpmp::core::DataTable::csv_read(std::string filename,
57  std::vector<std::string> columns) {
58  std::ifstream file(filename);
59 
60  if (!file.is_open()) {
61  _log_.log(ERROR, "Unable to open file: " + filename + ".");
62  exit(EXIT_FAILURE);
63  }
64 
65  std::vector<std::vector<std::string>> data;
66  std::string line;
67 
68  // Get the header line and parse the column names
69  getline(file, line);
70  std::stringstream header(line);
71  std::vector<std::string> header_cols;
72  std::string columnName;
73 
74  while (getline(header, columnName, ',')) {
75  header_cols.push_back(columnName);
76  }
77 
78  // If no columns are specified, read in all columns
79  if (columns.empty()) {
80  columns = header_cols;
81  }
82 
83  // Check if specified columns exist in the header
84  for (const auto &column : columns) {
85  if (find(header_cols.begin(), header_cols.end(), column) ==
86  header_cols.end()) {
87  _log_.log(ERROR, "Column: " + column + " not found");
88  exit(EXIT_FAILURE);
89  }
90  }
91 
92  // Read in the data rows
93  while (getline(file, line)) {
94  std::vector<std::string> row;
95  std::stringstream rowStream(line);
96  std::string value;
97  int columnIndex = 0;
98 
99  while (getline(rowStream, value, ',')) {
100  // If column is specified, only read in specified columns
101  if (find(columns.begin(),
102  columns.end(),
103  header_cols[columnIndex]) != columns.end()) {
104  row.push_back(value);
105  }
106 
107  columnIndex++;
108  }
109 
110  if (row.size() > 0) {
111  data.push_back(row);
112  }
113  }
114  // populate headers_ class variable
115  headers_ = columns;
116  // populate data_ class variable
117  data_ = data;
118 
119  file.close();
120  return make_pair(columns, data);
121 }
122 
/**
 * @brief Checks whether a string is a base-10 integer literal.
 *
 * Accepts an optional leading '-' followed by one or more digits and nothing
 * else; a leading '+', whitespace, a decimal point or an empty string are
 * all rejected.
 *
 * @param str text to test
 * @return true when the whole string matches, false otherwise
 */
bool is_int(const std::string &str) {
    // TODO : determine type of int based on length of largest val?
    // Compiled once: this predicate runs for every cell of every column
    static const std::regex int_pattern(R"(-?\d+)");
    return std::regex_match(str, int_pattern);
}
128 
/**
 * @brief Checks whether a string is a plain decimal literal.
 *
 * Accepts an optional leading '-', one or more digits, a '.', and one or
 * more digits. Integers without a '.', forms such as ".5" or "1.", and
 * exponent notation are rejected.
 *
 * @param str text to test
 * @return true when the whole string matches, false otherwise
 */
bool is_double(const std::string &str) {
    // Compiled once: this predicate runs for every cell of every column
    static const std::regex double_pattern(R"(-?\d+\.\d+)");
    return std::regex_match(str, double_pattern);
}
133 
135 gpmp::core::DataTable::inferType(const std::vector<std::string> &column) {
136  int integer_count = 0;
137  int double_count = 0;
138  int string_count = 0;
139 
140  for (const std::string &cell : column) {
141  if (is_int(cell)) {
142  integer_count++;
143  } else if (is_double(cell)) {
144  double_count++;
145  } else {
146  string_count++;
147  }
148  }
149 
150  _log_.log(INFO,
151  "int/double/str: " + std::to_string(integer_count) + "/" +
152  std::to_string(double_count) + "/" +
153  std::to_string(string_count));
154 
155  if (integer_count > double_count) {
156  return DataType::dt_int32;
157  } else if (double_count > integer_count) {
158  return DataType::dt_double;
159  } else {
160  return DataType::dt_str;
161  }
162 }
163 std::string dt_to_str(gpmp::core::DataType type) {
164  switch (type) {
166  return "int64";
168  return "long double";
170  return "std::string";
171  // TODO : Add more cases if needed
172  default:
173  return "Unknown";
174  }
175 }
177  const std::vector<std::string> &skip_columns) {
178  gpmp::core::TableType mixed_data;
179 
180  // Include all column headers in mixed_data (including skipped ones)
181  mixed_data.first = headers_;
182 
183  std::vector<gpmp::core::DataType> column_data_types;
184 
185  // Determine data types for each column (skip_columns remain as strings)
186  for (size_t col = 0; col < headers_.size(); ++col) {
187  // Check if this column should be skipped
188  if (std::find(skip_columns.begin(),
189  skip_columns.end(),
190  headers_[col]) != skip_columns.end()) {
191  column_data_types.push_back(gpmp::core::DataType::dt_str);
192  _log_.log(INFO, "Skipping column: " + headers_[col]);
193  } else {
194  std::vector<std::string> column_data;
195  for (const std::vector<std::string> &rowData : data_) {
196  column_data.push_back(rowData[col]);
197  }
198  gpmp::core::DataType column_type = inferType(column_data);
199  column_data_types.push_back(column_type);
200 
201  _log_.log(INFO,
202  "Column " + headers_[col] +
203  " using type: " + dt_to_str(column_type));
204  }
205  }
206 
207  // Traverse rows and convert based on the determined data types
208  for (const std::vector<std::string> &row : data_) {
209  std::vector<std::variant<int64_t, long double, std::string>> mixed_row;
210 
211  for (size_t col = 0; col < headers_.size(); ++col) {
212  const std::string &cell = row[col];
213  gpmp::core::DataType column_type = column_data_types[col];
214 
215  if (column_type == gpmp::core::DataType::dt_int32) {
216 
217  mixed_row.push_back(std::stoi(cell));
218  } else if (column_type == gpmp::core::DataType::dt_double) {
219  mixed_row.push_back(std::stold(cell));
220  } else {
221  mixed_row.push_back(cell); // Keep as a string
222  }
223  }
224 
225  mixed_data.second.push_back(mixed_row);
226  }
227 
228  std::cout << "Mixed Data:" << std::endl;
229  for (const std::string &header : mixed_data.first) {
230  std::cout << header << " ";
231  }
232  std::cout << std::endl;
233 
234  for (const auto &row : mixed_data.second) {
235  for (const auto &cell : row) {
236  if (std::holds_alternative<int64_t>(cell)) {
237  std::cout << std::get<int64_t>(cell) << " ";
238  } else if (std::holds_alternative<long double>(cell)) {
239  std::cout << std::get<long double>(cell) << " ";
240  } else if (std::holds_alternative<std::string>(cell)) {
241  std::cout << std::get<std::string>(cell) << " ";
242  }
243  }
244  std::cout << std::endl;
245  }
246 
247  return mixed_data;
248 }
249 
250 /*
251 gpmp::core::TableType gpmp::core::DataTable::native_type(
252  const std::vector<std::string> &skip_columns) {
253  gpmp::core::TableType mixed_data;
254 
255  std::cout << "HEADERS:" << headers_.size() << std::endl;
256  std::cout << "ROWS:" << data_.size() << std::endl;
257 
258  // Traverse column headers, skipping specified columns
259  for (const std::string &header : headers_) {
260 
261  if (std::find(skip_columns.begin(), skip_columns.end(), header) !=
262  skip_columns.end()) {
263  continue; // Skip this column
264  }
265 
266  std::cout << header << " ";
267  // push column headers into mixed_data var
268  mixed_data.first.push_back(header);
269  }
270 
271  std::cout << std::endl;
272 
273  // Traverse rows, skip rows of the specified columns
274  for (size_t col = 0; col < headers_.size(); ++col) {
275  if (std::find(skip_columns.begin(),
276  skip_columns.end(),
277  headers_[col]) != skip_columns.end()) {
278  continue; // Skip this column
279  }
280  _log_.log(INFO, "Column: " + headers_[col]);
281 
282  // Collect data for this column
283  std::vector<std::string> column_data;
284  for (const std::vector<std::string> &row : data_) {
285  column_data.push_back(row[col]);
286  }
287 
288  std::vector<std::variant<int64_t, long double, std::string>>
289 converted_data;
290 
291  // Call inferType on the column's data
292  gpmp::core::DataType column_type = inferType(column_data);
293 
294  _log_.log(INFO, "Using type: " + dt_to_str(column_type));
295 
296  // Check type and convert rows
297  if (column_type == gpmp::core::DataType::dt_int32) {
298  std::cout << "INT\n";
299  for (const std::string &cell : column_data) {
300  converted_data.push_back(std::stoi(cell));
301  }
302  }
303  else if (column_type == gpmp::core::DataType::dt_double) {
304  std::cout << "DOUBLE\n";
305  for (const std::string &cell : column_data) {
306  converted_data.push_back(std::stod(cell));
307  }
308  }
309  //else if (column_type == gpmp::core::DataType::dt_str) {
310  else {
311  std::cout << "STRING\n";
312  for (const std::string &cell : column_data) {
313  converted_data.push_back(cell);
314  }
315  }
316  // push rows into the mixed_data var
317  mixed_data.second.push_back(converted_data);
318  }
319 
320  std::cout << "Mixed Data:" << std::endl;
321  for (const std::string &header : mixed_data.first) {
322  std::cout << header << " ";
323  }
324  std::cout << std::endl;
325 
326  for (const auto &row : mixed_data.second) {
327  for (const auto &cell : row) {
328  if (std::holds_alternative<int64_t>(cell)) {
329  std::cout << std::get<int64_t>(cell) << " ";
330  } else if (std::holds_alternative<long double>(cell)) {
331  std::cout << std::get<long double>(cell) << " ";
332  } else if (std::holds_alternative<std::string>(cell)) {
333  std::cout << std::get<std::string>(cell) << " ";
334  }
335  }
336  std::cout << std::endl;
337  }
338 
339  return mixed_data;
340 }*/
341 
342 // Extracts date/time information from given column
343 // TODO: add additional options for detecting/converting date/time columns
344 // to numeric formats
346 gpmp::core::DataTable::datetime(std::string column_name,
347  bool extract_year,
348  bool extract_month,
349  bool extract_time) {
350  // Find the index of the specified column
351  auto column_iter = std::find(headers_.begin(), headers_.end(), column_name);
352  if (column_iter == headers_.end()) {
353  _log_.log(ERROR, "Column: " + column_name + " node found");
354  exit(EXIT_FAILURE);
355  }
356  int column_index = std::distance(headers_.begin(), column_iter);
357 
358  // Extract components from each row
359  std::vector<std::string> new_headers = headers_;
360  std::vector<std::vector<std::string>> new_data;
361 
362  // Iterate and populate the additional columns
363  for (size_t row_index = 0; row_index < data_.size(); ++row_index) {
364  std::vector<std::string> row = data_[row_index];
365  // If column row is not found
366  if (row.size() <= static_cast<size_t>(column_index)) {
367  _log_.log(ERROR, "Column: " + column_name + " not found");
368 
369  exit(EXIT_FAILURE);
370  }
371 
372  std::string timestamp = row[column_index];
373  std::string year, month, time;
374 
375  // Create a new row with extracted components
376  std::vector<std::string> new_row;
377 
378  // Extract year, month, and time components
379  if (extract_year) {
380  year = timestamp.substr(timestamp.find_last_of('/') + 1, 4);
381  new_row.push_back(year);
382  }
383  if (extract_month) {
384  month = timestamp.substr(0, timestamp.find_first_of('/'));
385  new_row.push_back(month);
386  }
387  if (extract_time) {
388  time = timestamp.substr(timestamp.find(' ') + 1);
389  new_row.push_back(time);
390  }
391 
392  // append original row data
393  new_row.insert(new_row.end(), row.begin(), row.end());
394  // add new rows
395  new_data.push_back(new_row);
396  }
397 
398  // Create new headers based on the extracted components
399  if (extract_month)
400  new_headers.insert(new_headers.begin(), "Month");
401  if (extract_year)
402  new_headers.insert(new_headers.begin(), "Year");
403  if (extract_time)
404  new_headers.insert(new_headers.begin(), "Time");
405 
406  // set class car data_ to hold rows/lines
407  data_ = new_data;
408  // set class var modified headers to new headers
409  // new_headers_ = new_headers;
410  headers_ = new_headers;
411 
412  return std::make_pair(new_headers, new_data);
413 }
414 
415 // Sort specified columns, by default in ascending order
416 void gpmp::core::DataTable::sort(const std::vector<std::string> &sort_columns,
417  bool ascending) {
418  // Extract the column indices to be sorted by from the original data
419  std::vector<size_t> column_indices;
420  for (const std::string &column : sort_columns) {
421  auto iter = std::find(headers_.begin(), headers_.end(), column);
422  if (iter != headers_.end()) {
423  size_t index = std::distance(headers_.begin(), iter);
424  column_indices.push_back(index);
425  }
426  }
427 
428  // Sort the data based on the specified columns
429  std::stable_sort(data_.begin(),
430  data_.end(),
431  [&](const std::vector<std::string> &row1,
432  const std::vector<std::string> &row2) {
433  for (size_t index : column_indices) {
434  if (row1[index] != row2[index]) {
435  if (ascending) {
436  return row1[index] < row2[index];
437  } else {
438  return row1[index] > row2[index];
439  }
440  }
441  }
442  // Rows are equal, nothing to sort
443  return false;
444  });
445 }
446 
447 // Group rows by specific columns
448 std::vector<gpmp::core::DataTableStr>
449 gpmp::core::DataTable::group_by(std::vector<std::string> group_by_columns) {
450  // Find the indices of the specified group by columns
451  std::vector<int> group_by_indices;
452 
453  // Traverse group column names
454  for (const std::string &column_name : group_by_columns) {
455  std::cout << "Searching for column: " << column_name << std::endl;
456 
457  // Find start/end and match column name
458  auto column_iter =
459  std::find(headers_.begin(), headers_.end(), column_name);
460 
461  // If no columns
462  if (column_iter == headers_.end()) {
463  _log_.log(ERROR, "Column: " + column_name + " not found");
464  exit(EXIT_FAILURE);
465  }
466  // column index set to distance from start of first col to nexter iter
467  int column_index = std::distance(headers_.begin(), column_iter);
468  // add column index to group
469  group_by_indices.push_back(column_index);
470  }
471 
472  // Group the data based on the specified columns using a vector
473  std::vector<std::pair<std::vector<std::string>, gpmp::core::DataTableStr>>
474  groups;
475 
476  // Traverse row/line data
477  for (const std::vector<std::string> &row : data_) {
478  // store group key for each row
479  std::vector<std::string> group_key;
480  // Fill group key from specified group column names
481  for (int index : group_by_indices) {
482  group_key.push_back(row[index]);
483  }
484 
485  // Check if the group already exists
486  auto group_iter = std::find_if(
487  groups.begin(),
488  groups.end(),
489  [&group_key](const std::pair<std::vector<std::string>,
490  gpmp::core::DataTableStr> &group) {
491  return group.first == group_key;
492  });
493  // If the group DNE create a new one to add to groups vector
494  if (group_iter == groups.end()) {
495  // Create a new group
496  groups.push_back(
497  {group_key, gpmp::core::DataTableStr(headers_, {})});
498  group_iter = groups.end() - 1;
499  }
500  // Add current row to group
501  group_iter->second.second.push_back(row);
502  }
503 
504  // Extract the grouped data into a vector
505  std::vector<gpmp::core::DataTableStr> grouped_data;
506  // Iterate over sorted groups to push onto result vector
507  for (const auto &group : groups) {
508  grouped_data.push_back(group.second);
509  }
510 
511  // Return final DataTableStr type
512  return grouped_data;
513 }
514 
515 // Get first element of each created group
517  const std::vector<gpmp::core::DataTableStr> &groups) const {
518  if (groups.empty()) {
519  // Handle the case when there are no groups
520  return std::make_pair(std::vector<std::string>(),
521  std::vector<std::vector<std::string>>());
522  }
523 
524  std::vector<std::vector<std::string>> first_rows;
525 
526  for (const gpmp::core::DataTableStr &group : groups) {
527  if (!group.second.empty()) {
528  first_rows.push_back(
529  group.second[0]); // Get the first row of each group
530  }
531  }
532 
533  if (!first_rows.empty()) {
534  // Assuming all groups have the same headers as the first group
535  return std::make_pair(groups[0].first, first_rows);
536  } else {
537  // Handle the case when there are no first rows found.
538  return std::make_pair(groups[0].first,
539  std::vector<std::vector<std::string>>());
540  }
541 }
542 
543 // Prints some basic information about a DataTable object
544 
548 
549  for (const auto &v : src.first) {
550  // check if v contains only digits
551  if (std::regex_match(v, std::regex("\\d+"))) {
552  dest.first.push_back(std::stoi(v));
553  }
554  }
555  for (const auto &vv : src.second) {
556  std::vector<int64_t> new_vec;
557  for (const auto &v : vv) {
558  // check if v contains only digits
559  if (std::regex_match(v, std::regex("\\d+"))) {
560  new_vec.push_back(std::stoi(v));
561  }
562  }
563  dest.second.push_back(new_vec);
564  }
565  return dest;
566 }
567 
571 
572  for (const auto &v : src.first) {
573  if (std::regex_match(v, std::regex("[-+]?\\d*\\.?\\d+"))) {
574  dest.first.push_back(std::stold(v));
575  }
576  }
577 
578  for (const auto &vv : src.second) {
579  std::vector<long double> new_vec;
580  for (const auto &v : vv) {
581  if (std::regex_match(v, std::regex("[-+]?\\d*\\.?\\d+"))) {
582  new_vec.push_back(std::stold(v));
583  }
584  }
585  dest.second.push_back(new_vec);
586  }
587 
588  return dest;
589 }
DataType inferType(const std::vector< std::string > &column)
Definition: datatable1.cpp:135
std::vector< std::vector< std::string > > data_
Definition: datatable.hpp:85
std::vector< DataTableStr > group_by(std::vector< std::string > group_by_columns)
Groups the data by specified columns.
Definition: datatable.cpp:250
DataTableStr csv_read(std::string filename, std::vector< std::string > columns={})
Reads a CSV file and returns a DataTableStr parses CSV files and stores all data as strings.
Definition: datatable.cpp:57
TableType native_type(const std::vector< std::string > &skip_columns={})
Converts DataTable column's rows to their native types. Since the existing DataTable read/load relate...
Definition: datatable1.cpp:176
DataTableStr first(const std::vector< gpmp::core::DataTableStr > &groups) const
Gets the first element of each created group.
Definition: datatable.cpp:317
void sort(const std::vector< std::string > &sort_columns, bool ascending=true)
Sorts the rows of the DataTable based on specified columns.
Definition: datatable.cpp:217
std::vector< std::string > headers_
Definition: datatable.hpp:79
DataTableStr datetime(std::string column_name, bool extract_year=true, bool extract_month=true, bool extract_time=false)
Extracts date and time components from a timestamp column.
Definition: datatable.cpp:147
DataTableInt str_to_int(DataTableStr src)
Converts a DataTableStr to a DataTableInt.
Definition: datatable.cpp:347
DataTableDouble str_to_double(DataTableStr src)
Converts a DataTableStr to a DataTableDouble.
Definition: datatable.cpp:370
void log(LogLevel level, const std::string &message)
Logs a message with the specified log level.
Definition: utils.cpp:77
std::string dt_to_str(gpmp::core::DataType type)
Definition: datatable1.cpp:163
bool is_double(const std::string &str)
Definition: datatable1.cpp:130
static gpmp::core::Logger _log_
Definition: datatable1.cpp:49
bool is_int(const std::string &str)
Definition: datatable1.cpp:124
std::pair< std::vector< long double >, std::vector< std::vector< long double > > > DataTableDouble
Definition: datatable.hpp:74
std::pair< std::vector< std::string >, std::vector< std::vector< std::string > > > DataTableStr
Definition: datatable.hpp:65
std::pair< std::vector< std::string >, std::vector< std::vector< std::variant< int64_t, long double, std::string > > > > TableType
DataType
enum for representing different data types
Definition: datatable.hpp:59
std::pair< std::vector< int64_t >, std::vector< std::vector< int64_t > > > DataTableInt
Definition: datatable.hpp:69
static int year
Miscellaneous utilities methods related to openGPMP.
@ ERROR
Definition: utils.hpp:48
@ INFO
Definition: utils.hpp:48