openGPMP
Open Source Mathematics Package
datatable.hpp
Go to the documentation of this file.
1 /*************************************************************************
2  *
3  * Project
4  * _____ _____ __ __ _____
5  * / ____| __ \| \/ | __ \
6  * ___ _ __ ___ _ __ | | __| |__) | \ / | |__) |
7  * / _ \| '_ \ / _ \ '_ \| | |_ | ___/| |\/| | ___/
8  *| (_) | |_) | __/ | | | |__| | | | | | | |
9  * \___/| .__/ \___|_| |_|\_____|_| |_| |_|_|
10  * | |
11  * |_|
12  *
13  * Copyright (C) Akiel Aries, <akiel@akiel.org>, et al.
14  *
15  * This software is licensed as described in the file LICENSE, which
16  * you should have received as part of this distribution. The terms
17  * among other details are referenced in the official documentation
18  * seen here : https://akielaries.github.io/openGPMP/ along with
19  * important files seen in this project.
20  *
21  * You may opt to use, copy, modify, merge, publish, distribute
22  * and/or sell copies of the Software, and permit persons to whom
23  * the Software is furnished to do so, under the terms of the
24  * LICENSE file. As this is an Open Source effort, all implementations
25  * must be of the same methodology.
26  *
27  *
28  *
29  * This software is distributed on an AS IS basis, WITHOUT
30  * WARRANTY OF ANY KIND, either express or implied.
31  *
32  ************************************************************************/
33 
34 /*
35  * The DataTable class is similar to the pandas DataFrame, R data.table, etc.,
36  * but with many more limitations.
37  */
38 #ifndef DATATABLE_HPP
39 #define DATATABLE_HPP
40 
41 #define MAX_ROWS 30
42 #define SHOW_ROWS 15
43 
44 #include <algorithm>
45 #include <chrono>
46 #include <fstream>
47 #include <iomanip>
48 #include <iostream>
49 #include <sstream>
50 #include <string>
51 #include <vector>
52 
53 namespace gpmp {
54 
55 namespace core {
56 
/**
 * @brief enum for representing different data types
 */
enum class DataType {
    Unknown,
    String,
    Integer,
    Double,
};
60 
/**
 * @brief Table whose cells are strings.
 *
 * `first` holds the column headers and `second` holds the rows, one inner
 * vector per row (this is how DataTable::display() consumes the pair).
 */
typedef std::pair<std::vector<std::string>,
                  std::vector<std::vector<std::string>>>
    DataTableStr;

/**
 * @brief Table whose cells are 64-bit signed integers.
 *
 * Same pair layout as DataTableStr; presumably `first` carries the header
 * values and `second` the rows -- confirm against str_to_int().
 */
typedef std::pair<std::vector<int64_t>, std::vector<std::vector<int64_t>>>
    DataTableInt;

/**
 * @brief Table whose cells are long doubles.
 *
 * Same pair layout as DataTableStr; presumably `first` carries the header
 * values and `second` the rows -- confirm against str_to_double().
 */
typedef std::pair<std::vector<long double>,
                  std::vector<std::vector<long double>>>
    DataTableDouble;
75 
76 class DataTable {
77  private:
78  // original DataTable object headers
79  std::vector<std::string> headers_;
80  // original DataTable object rows
81  std::vector<std::vector<std::string>> rows_;
82  // modified DataTable object headers
83  std::vector<std::string> new_headers_;
84  // vector to hold data
85  std::vector<std::vector<std::string>> data_;
86 
87  // original DataTable data
89 
90  public:
92  // Initialize data_ and headers_ to empty vectors
93  data_ = std::vector<std::vector<std::string>>();
94  headers_ = std::vector<std::string>();
95  }
96 
105  DataTableStr csv_read(std::string filename,
106  std::vector<std::string> columns = {});
107 
111  void csv_write();
112 
113  // TODO: TOML and JSON readers?
122  DataTableStr tsv_read(std::string filename,
123  std::vector<std::string> columns = {});
124 
133  DataTableStr json_read(std::string filename,
134  std::vector<std::string> objs = {});
135 
144  DataTableStr datetime(std::string column_name,
145  bool extract_year = true,
146  bool extract_month = true,
147  bool extract_time = false);
148 
155  void sort(const std::vector<std::string> &sort_columns,
156  bool ascending = true);
157 
158  // customSort(const std::vector<std::string>& columnNames,
159  // gpmp::core::DataTableStr& data);
160 
166  std::vector<DataTableStr>
167  group_by(std::vector<std::string> group_by_columns);
168 
175  first(const std::vector<gpmp::core::DataTableStr> &groups) const;
176 
180  void describe();
181 
192 
201 
215  template <typename T>
216  void display(std::pair<std::vector<T>, std::vector<std::vector<T>>> data,
217  bool display_all = false) {
218  // Get the number of columns and rows in the data
219  int num_columns = data.first.size();
220  int num_rows = data.second.size();
221  int num_omitted_rows = 0;
222 
223  // Initialize max_column_widths with the lengths of column headers
224  std::vector<int> max_column_widths(num_columns, 0);
225 
226  // Calculate the maximum width for each column based on column headers
227  for (int i = 0; i < num_columns; i++) {
228  max_column_widths[i] = data.first[i].length();
229  }
230 
231  // Calculate the maximum width for each column based on data rows
232  for (int i = 0; i < num_columns; i++) {
233  for (const auto &row : data.second) {
234  if (i < static_cast<int>(row.size())) {
235  max_column_widths[i] =
236  std::max(max_column_widths[i],
237  static_cast<int>(row[i].length()));
238  }
239  }
240  }
241 
242  // Set a larger width for the DateTime column (adjust the index as
243  // needed later on)
244  const int dateTimeColumnIndex = 0;
245  // adjust as needed?
246  max_column_widths[dateTimeColumnIndex] =
247  std::max(max_column_widths[dateTimeColumnIndex], 0);
248 
249  // Print headers with right-aligned values
250  std::cout << std::setw(7) << std::right << "Index"
251  << " ";
252 
253  for (int i = 0; i < num_columns; i++) {
254  std::cout << std::setw(max_column_widths[i]) << std::right
255  << data.first[i] << " ";
256  }
257  std::cout << std::endl;
258 
259  int num_elements = data.second.size();
260  if (!display_all && num_elements > MAX_ROWS) {
261  for (int i = 0; i < SHOW_ROWS; i++) {
262  // Prit index
263  std::cout << std::setw(7) << std::right << i << " ";
264  // Print each row with right-aligned values
265  for (int j = 0; j < num_columns; j++) {
266  if (j < static_cast<int>(data.second[i].size())) {
267  std::cout << std::setw(max_column_widths[j])
268  << std::right << data.second[i][j] << " ";
269  }
270  }
271  std::cout << std::endl;
272  }
273  num_omitted_rows = num_elements - MAX_ROWS;
274  std::cout << "...\n";
275  std::cout << "[" << num_omitted_rows << " rows omitted]\n";
276  for (int i = num_elements - SHOW_ROWS; i < num_elements; i++) {
277  std::cout << std::setw(7) << std::right << i << " ";
278  // Print each row with right-aligned values
279  for (int j = 0; j < num_columns; j++) {
280  if (j < static_cast<int>(data.second[i].size())) {
281  std::cout << std::setw(max_column_widths[j])
282  << std::right << data.second[i][j] << " ";
283  }
284  }
285  std::cout << std::endl;
286  }
287  } else {
288  // Print all rows with right-aligned values
289  for (int i = 0; i < num_elements; i++) {
290 
291  // Print index
292  std::cout << std::setw(7) << std::right << i << " ";
293  for (int j = 0; j < num_columns; j++) {
294  if (j < static_cast<int>(data.second[i].size())) {
295 
296  // Print formatted row
297  std::cout << std::setw(max_column_widths[j])
298  << std::right << data.second[i][j] << " ";
299  }
300  }
301  std::cout << std::endl;
302  }
303  }
304 
305  // Print the number of rows and columns
306  std::cout << "[" << num_rows << " rows"
307  << " x " << num_columns << " columns";
308  std::cout << "]\n\n";
309  }
310 
316  void display(bool display_all = false) {
317  display(std::make_pair(headers_, data_), display_all);
318  }
319 };
320 
321 } // namespace core
322 } // namespace gpmp
323 
324 #endif // DATATABLE_HPP
DataTableStr json_read(std::string filename, std::vector< std::string > objs={})
Reads a JSON file and returns a DataTableStr. Parses JSON files and stores all data as strings.
std::vector< std::vector< std::string > > data_
Definition: datatable.hpp:85
std::vector< DataTableStr > group_by(std::vector< std::string > group_by_columns)
Groups the data by specified columns.
Definition: datatable.cpp:250
DataTableStr csv_read(std::string filename, std::vector< std::string > columns={})
Reads a CSV file and returns a DataTableStr. Parses CSV files and stores all data as strings.
Definition: datatable.cpp:57
void describe()
Prints some information about the DataTable.
void csv_write()
Write DataTable to a CSV file.
DataTableStr first(const std::vector< gpmp::core::DataTableStr > &groups) const
Gets the first element of each created group.
Definition: datatable.cpp:317
void sort(const std::vector< std::string > &sort_columns, bool ascending=true)
Sorts the rows of the DataTable based on specified columns.
Definition: datatable.cpp:217
std::vector< std::string > headers_
Definition: datatable.hpp:79
DataTableStr original_data_
Definition: datatable.hpp:88
DataTableStr datetime(std::string column_name, bool extract_year=true, bool extract_month=true, bool extract_time=false)
Extracts date and time components from a timestamp column.
Definition: datatable.cpp:147
std::vector< std::vector< std::string > > rows_
Definition: datatable.hpp:81
void display(std::pair< std::vector< T >, std::vector< std::vector< T >>> data, bool display_all=false)
Displays the given DataTable data (headers and rows) as a right-aligned table with an index column.
Definition: datatable.hpp:216
DataTableInt str_to_int(DataTableStr src)
Converts a DataTableStr to a DataTableInt.
Definition: datatable.cpp:347
std::vector< std::string > new_headers_
Definition: datatable.hpp:83
DataTableStr tsv_read(std::string filename, std::vector< std::string > columns={})
Reads a TSV file and returns a DataTableStr. Parses TSV files and stores all data as strings.
void display(bool display_all=false)
Overload function for display() defaults to displaying what is currently stored in a DataTable object...
Definition: datatable.hpp:316
DataTableDouble str_to_double(DataTableStr src)
Converts a DataTableStr to a DataTableDouble.
Definition: datatable.cpp:370
#define MAX_ROWS
Definition: datatable.hpp:41
#define SHOW_ROWS
Definition: datatable.hpp:42
std::pair< std::vector< long double >, std::vector< std::vector< long double > > > DataTableDouble
Definition: datatable.hpp:74
std::pair< std::vector< std::string >, std::vector< std::vector< std::string > > > DataTableStr
Definition: datatable.hpp:65
DataType
enum for representing different data types
Definition: datatable.hpp:59
std::pair< std::vector< int64_t >, std::vector< std::vector< int64_t > > > DataTableInt
Definition: datatable.hpp:69
The source C++ openGPMP namespace.