openGPMP
Open Source Mathematics Package
datatable.cpp
Go to the documentation of this file.
1 /*************************************************************************
2  *
3  * Project
4  * _____ _____ __ __ _____
5  * / ____| __ \| \/ | __ \
6  * ___ _ __ ___ _ __ | | __| |__) | \ / | |__) |
7  * / _ \| '_ \ / _ \ '_ \| | |_ | ___/| |\/| | ___/
8  *| (_) | |_) | __/ | | | |__| | | | | | | |
9  * \___/| .__/ \___|_| |_|\_____|_| |_| |_|_|
10  * | |
11  * |_|
12  *
13  * Copyright (C) Akiel Aries, <akiel@akiel.org>, et al.
14  *
15  * This software is licensed as described in the file LICENSE, which
16  * you should have received as part of this distribution. The terms
17  * among other details are referenced in the official documentation
18  * seen here : https://akielaries.github.io/openGPMP/ along with
19  * important files seen in this project.
20  *
21  * You may opt to use, copy, modify, merge, publish, distribute
22  * and/or sell copies of the Software, and permit persons to whom
23  * the Software is furnished to do so, under the terms of the
24  * LICENSE file.
25  *
26  *
27  *
28  * This software is distributed on an AS IS basis, WITHOUT
29  * WARRANTY OF ANY KIND, either express or implied.
30  *
31  ************************************************************************/
32 #include <algorithm>
33 #include <fstream>
34 #include <iomanip>
35 #include <iostream>
36 #include <map>
38 #include <openGPMP/core/utils.hpp>
39 #include <regex>
40 #include <sstream>
41 #include <string>
42 #include <vector>
43 // for memory mapping files
44 #include <fcntl.h>
45 #include <sys/mman.h>
46 #include <sys/stat.h>
47 #include <unistd.h>
48 
51 
52 // create method to create datatable from scratch? insert, drop, etc?
53 
54 // TODO : optimize these methods, CSV reader using threads? loop unrolling?,
55 // etc? conversion functions to be quicker,
57 gpmp::core::DataTable::csv_read(std::string filename,
58  std::vector<std::string> columns) {
59 
60  int fd = open(filename.c_str(), O_RDONLY);
61  if (fd == -1) {
62  // Handle file open error
63  perror("Error opening file");
64  exit(EXIT_FAILURE);
65  }
66 
67  off_t size = lseek(fd, 0, SEEK_END);
68  char *file_data =
69  static_cast<char *>(mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0));
70 
71  if (file_data == MAP_FAILED) {
72  // Handle memory mapping error
73  perror("Error mapping file to memory");
74  close(fd);
75  exit(EXIT_FAILURE);
76  }
77 
78  std::stringstream file_stream(file_data);
79  std::vector<std::vector<std::string>> data;
80  std::string line;
81 
82  // Get the header line and parse the column names
83  getline(file_stream, line);
84  std::stringstream header(line);
85  std::vector<std::string> header_cols;
86  std::string columnName;
87 
88  while (getline(header, columnName, ',')) {
89  header_cols.push_back(columnName);
90  }
91 
92  // If no columns are specified, read in all columns
93  if (columns.empty()) {
94  columns = header_cols;
95  }
96 
97  // Check if specified columns exist in the header
98  for (const auto &column : columns) {
99  if (std::find(header_cols.begin(), header_cols.end(), column) ==
100  header_cols.end()) {
101  // Handle column not found error
102  perror(("Column: " + column + " not found").c_str());
103  munmap(file_data, size);
104  close(fd);
105  exit(EXIT_FAILURE);
106  }
107  }
108 
109  // Read in the data rows
110  while (getline(file_stream, line)) {
111  std::vector<std::string> row;
112  std::stringstream rowStream(line);
113  std::string value;
114  int columnIndex = 0;
115 
116  while (getline(rowStream, value, ',')) {
117  // If column is specified, only read in specified columns
118  if (std::find(columns.begin(),
119  columns.end(),
120  header_cols[columnIndex]) != columns.end()) {
121  row.push_back(value);
122  }
123 
124  columnIndex++;
125  }
126 
127  if (!row.empty()) {
128  data.push_back(row);
129  }
130  }
131 
132  // populate headers_ class variable
133  headers_ = columns;
134  // populate data_ class variable
135  data_ = data;
136 
137  munmap(file_data, size);
138  close(fd);
139 
140  return std::make_pair(headers_, data_);
141 }
142 
143 // Extracts date/time information from given column
144 // TODO: add additional options for detecting/converting date/time columns
145 // to numeric formats
147 gpmp::core::DataTable::datetime(std::string column_name,
148  bool extract_year,
149  bool extract_month,
150  bool extract_time) {
151  // Find the index of the specified column
152  auto column_iter = std::find(headers_.begin(), headers_.end(), column_name);
153  if (column_iter == headers_.end()) {
154  _log_.log(ERROR, "Column: " + column_name + " node found");
155  exit(EXIT_FAILURE);
156  }
157  int column_index = std::distance(headers_.begin(), column_iter);
158 
159  // Extract components from each row
160  std::vector<std::string> new_headers = headers_;
161  std::vector<std::vector<std::string>> new_data;
162 
163  // Iterate and populate the additional columns
164  for (size_t row_index = 0; row_index < data_.size(); ++row_index) {
165  std::vector<std::string> row = data_[row_index];
166  // If column row is not found
167  if (row.size() <= static_cast<size_t>(column_index)) {
168  _log_.log(ERROR, "Column: " + column_name + " not found");
169 
170  exit(EXIT_FAILURE);
171  }
172 
173  std::string timestamp = row[column_index];
174  std::string year, month, time;
175 
176  // Create a new row with extracted components
177  std::vector<std::string> new_row;
178 
179  // Extract year, month, and time components
180  if (extract_year) {
181  year = timestamp.substr(timestamp.find_last_of('/') + 1, 4);
182  new_row.push_back(year);
183  }
184  if (extract_month) {
185  month = timestamp.substr(0, timestamp.find_first_of('/'));
186  new_row.push_back(month);
187  }
188  if (extract_time) {
189  time = timestamp.substr(timestamp.find(' ') + 1);
190  new_row.push_back(time);
191  }
192 
193  // append original row data
194  new_row.insert(new_row.end(), row.begin(), row.end());
195  // add new rows
196  new_data.push_back(new_row);
197  }
198 
199  // Create new headers based on the extracted components
200  if (extract_month)
201  new_headers.insert(new_headers.begin(), "Month");
202  if (extract_year)
203  new_headers.insert(new_headers.begin(), "Year");
204  if (extract_time)
205  new_headers.insert(new_headers.begin(), "Time");
206 
207  // set class car data_ to hold rows/lines
208  data_ = new_data;
209  // set class var modified headers to new headers
210  // new_headers_ = new_headers;
211  headers_ = new_headers;
212 
213  return std::make_pair(new_headers, new_data);
214 }
215 
216 // Sort specified columns, by default in asending order
217 void gpmp::core::DataTable::sort(const std::vector<std::string> &sort_columns,
218  bool ascending) {
219  // Extract the column indices to be sorted by from the original data
220  std::vector<size_t> column_indices;
221  for (const std::string &column : sort_columns) {
222  auto iter = std::find(headers_.begin(), headers_.end(), column);
223  if (iter != headers_.end()) {
224  size_t index = std::distance(headers_.begin(), iter);
225  column_indices.push_back(index);
226  }
227  }
228 
229  // Sort the data based on the specified columns
230  std::stable_sort(data_.begin(),
231  data_.end(),
232  [&](const std::vector<std::string> &row1,
233  const std::vector<std::string> &row2) {
234  for (size_t index : column_indices) {
235  if (row1[index] != row2[index]) {
236  if (ascending) {
237  return row1[index] < row2[index];
238  } else {
239  return row1[index] > row2[index];
240  }
241  }
242  }
243  // Rows are equal, nothing to sort
244  return false;
245  });
246 }
247 
248 // Group rows by specific columns
249 std::vector<gpmp::core::DataTableStr>
250 gpmp::core::DataTable::group_by(std::vector<std::string> group_by_columns) {
251  // Find the indices of the specified group by columns
252  std::vector<int> group_by_indices;
253 
254  // Traverse group column names
255  for (const std::string &column_name : group_by_columns) {
256  std::cout << "Searching for column: " << column_name << std::endl;
257 
258  // Find start/end and match column name
259  auto column_iter =
260  std::find(headers_.begin(), headers_.end(), column_name);
261 
262  // If no columns
263  if (column_iter == headers_.end()) {
264  _log_.log(ERROR, "Column: " + column_name + " not found");
265  exit(EXIT_FAILURE);
266  }
267  // column index set to distance from start of first col to nexter iter
268  int column_index = std::distance(headers_.begin(), column_iter);
269  // add column index to group
270  group_by_indices.push_back(column_index);
271  }
272 
273  // Group the data based on the specified columns using a vector
274  std::vector<std::pair<std::vector<std::string>, gpmp::core::DataTableStr>>
275  groups;
276 
277  // Traverse row/line data
278  for (const std::vector<std::string> &row : data_) {
279  // store group key for each row
280  std::vector<std::string> group_key;
281  // Fill group key from specified group column names
282  for (int index : group_by_indices) {
283  group_key.push_back(row[index]);
284  }
285 
286  // Check if the group already exists
287  auto group_iter = std::find_if(
288  groups.begin(),
289  groups.end(),
290  [&group_key](const std::pair<std::vector<std::string>,
291  gpmp::core::DataTableStr> &group) {
292  return group.first == group_key;
293  });
294  // If the group DNE create a new one to add to groups vector
295  if (group_iter == groups.end()) {
296  // Create a new group
297  groups.push_back(
298  {group_key, gpmp::core::DataTableStr(headers_, {})});
299  group_iter = groups.end() - 1;
300  }
301  // Add current row to group
302  group_iter->second.second.push_back(row);
303  }
304 
305  // Extract the grouped data into a vector
306  std::vector<gpmp::core::DataTableStr> grouped_data;
307  // Iterate over sorted groups to push onto result vector
308  for (const auto &group : groups) {
309  grouped_data.push_back(group.second);
310  }
311 
312  // Return final DataTableStr type
313  return grouped_data;
314 }
315 
316 // Get first element of each created group
318  const std::vector<gpmp::core::DataTableStr> &groups) const {
319  if (groups.empty()) {
320  // Handle the case when there are no groups
321  return std::make_pair(std::vector<std::string>(),
322  std::vector<std::vector<std::string>>());
323  }
324 
325  std::vector<std::vector<std::string>> first_rows;
326 
327  for (const gpmp::core::DataTableStr &group : groups) {
328  if (!group.second.empty()) {
329  first_rows.push_back(
330  group.second[0]); // Get the first row of each group
331  }
332  }
333 
334  if (!first_rows.empty()) {
335  // Assuming all groups have the same headers as the first group
336  return std::make_pair(groups[0].first, first_rows);
337  } else {
338  // Handle the case when there are no first rows found.
339  return std::make_pair(groups[0].first,
340  std::vector<std::vector<std::string>>());
341  }
342 }
343 
344 // Prints some basic information about a DataTable object
345 
349 
350  for (const auto &v : src.first) {
351  // check if v contains only digits
352  if (std::regex_match(v, std::regex("\\d+"))) {
353  dest.first.push_back(std::stoi(v));
354  }
355  }
356  for (const auto &vv : src.second) {
357  std::vector<int64_t> new_vec;
358  for (const auto &v : vv) {
359  // check if v contains only digits
360  if (std::regex_match(v, std::regex("\\d+"))) {
361  new_vec.push_back(std::stoi(v));
362  }
363  }
364  dest.second.push_back(new_vec);
365  }
366  return dest;
367 }
368 
372 
373  for (const auto &v : src.first) {
374  if (std::regex_match(v, std::regex("[-+]?\\d*\\.?\\d+"))) {
375  dest.first.push_back(std::stold(v));
376  }
377  }
378 
379  for (const auto &vv : src.second) {
380  std::vector<long double> new_vec;
381  for (const auto &v : vv) {
382  if (std::regex_match(v, std::regex("[-+]?\\d*\\.?\\d+"))) {
383  new_vec.push_back(std::stold(v));
384  }
385  }
386  dest.second.push_back(new_vec);
387  }
388 
389  return dest;
390 }
std::vector< std::vector< std::string > > data_
Definition: datatable.hpp:85
std::vector< DataTableStr > group_by(std::vector< std::string > group_by_columns)
Groups the data by specified columns.
Definition: datatable.cpp:250
DataTableStr csv_read(std::string filename, std::vector< std::string > columns={})
Reads a CSV file and returns a DataTableStr parses CSV files and stores all data as strings.
Definition: datatable.cpp:57
DataTableStr first(const std::vector< gpmp::core::DataTableStr > &groups) const
Gets the first element of each created group.
Definition: datatable.cpp:317
void sort(const std::vector< std::string > &sort_columns, bool ascending=true)
Sorts the rows of the DataTable based on specified columns.
Definition: datatable.cpp:217
std::vector< std::string > headers_
Definition: datatable.hpp:79
DataTableStr datetime(std::string column_name, bool extract_year=true, bool extract_month=true, bool extract_time=false)
Extracts date and time components from a timestamp column.
Definition: datatable.cpp:147
DataTableInt str_to_int(DataTableStr src)
Converts a DataTableStr to a DataTableInt.
Definition: datatable.cpp:347
DataTableDouble str_to_double(DataTableStr src)
Converts a DataTableStr to a DataTableDouble.
Definition: datatable.cpp:370
void log(LogLevel level, const std::string &message)
Logs a message with the specified log level.
Definition: utils.cpp:77
static gpmp::core::Logger _log_
Definition: datatable.cpp:50
std::pair< std::vector< long double >, std::vector< std::vector< long double > > > DataTableDouble
Definition: datatable.hpp:74
std::pair< std::vector< std::string >, std::vector< std::vector< std::string > > > DataTableStr
Definition: datatable.hpp:65
std::pair< std::vector< int64_t >, std::vector< std::vector< int64_t > > > DataTableInt
Definition: datatable.hpp:69
static int year
Miscellaneous utilities methods related to openGPMP.
@ ERROR
Definition: utils.hpp:48