44 #include <unordered_map>
57 std::vector<std::string> columns) {
58 std::ifstream file(filename);
60 if (!file.is_open()) {
65 std::vector<std::vector<std::string>> data;
70 std::stringstream header(line);
71 std::vector<std::string> header_cols;
72 std::string columnName;
74 while (getline(header, columnName,
',')) {
75 header_cols.push_back(columnName);
79 if (columns.empty()) {
80 columns = header_cols;
84 for (
const auto &column : columns) {
85 if (find(header_cols.begin(), header_cols.end(), column) ==
93 while (getline(file, line)) {
94 std::vector<std::string> row;
95 std::stringstream rowStream(line);
99 while (getline(rowStream, value,
',')) {
101 if (find(columns.begin(),
103 header_cols[columnIndex]) != columns.end()) {
104 row.push_back(value);
110 if (row.size() > 0) {
120 return make_pair(columns, data);
126 return std::regex_match(str, std::regex(R
"(-?\d+)"));
131 return std::regex_match(str, std::regex(R
"(-?\d+\.\d+)"));
136 int integer_count = 0;
137 int double_count = 0;
138 int string_count = 0;
140 for (
const std::string &cell : column) {
151 "int/double/str: " + std::to_string(integer_count) +
"/" +
152 std::to_string(double_count) +
"/" +
153 std::to_string(string_count));
155 if (integer_count > double_count) {
157 }
else if (double_count > integer_count) {
168 return "long double";
170 return "std::string";
177 const std::vector<std::string> &skip_columns) {
181 mixed_data.first = headers_;
183 std::vector<gpmp::core::DataType> column_data_types;
186 for (
size_t col = 0; col < headers_.size(); ++col) {
188 if (std::find(skip_columns.begin(),
190 headers_[col]) != skip_columns.end()) {
194 std::vector<std::string> column_data;
195 for (
const std::vector<std::string> &rowData : data_) {
196 column_data.push_back(rowData[col]);
199 column_data_types.push_back(column_type);
202 "Column " + headers_[col] +
203 " using type: " +
dt_to_str(column_type));
208 for (
const std::vector<std::string> &row : data_) {
209 std::vector<std::variant<int64_t, long double, std::string>> mixed_row;
211 for (
size_t col = 0; col < headers_.size(); ++col) {
212 const std::string &cell = row[col];
217 mixed_row.push_back(std::stoi(cell));
219 mixed_row.push_back(std::stold(cell));
221 mixed_row.push_back(cell);
225 mixed_data.second.push_back(mixed_row);
228 std::cout <<
"Mixed Data:" << std::endl;
229 for (
const std::string &header : mixed_data.first) {
230 std::cout << header <<
" ";
232 std::cout << std::endl;
234 for (
const auto &row : mixed_data.second) {
235 for (
const auto &cell : row) {
236 if (std::holds_alternative<int64_t>(cell)) {
237 std::cout << std::get<int64_t>(cell) <<
" ";
238 }
else if (std::holds_alternative<long double>(cell)) {
239 std::cout << std::get<long double>(cell) <<
" ";
240 }
else if (std::holds_alternative<std::string>(cell)) {
241 std::cout << std::get<std::string>(cell) <<
" ";
244 std::cout << std::endl;
351 auto column_iter = std::find(headers_.begin(), headers_.end(), column_name);
352 if (column_iter == headers_.end()) {
356 int column_index = std::distance(headers_.begin(), column_iter);
359 std::vector<std::string> new_headers = headers_;
360 std::vector<std::vector<std::string>> new_data;
363 for (
size_t row_index = 0; row_index < data_.size(); ++row_index) {
364 std::vector<std::string> row = data_[row_index];
366 if (row.size() <=
static_cast<size_t>(column_index)) {
372 std::string timestamp = row[column_index];
373 std::string
year, month, time;
376 std::vector<std::string> new_row;
380 year = timestamp.substr(timestamp.find_last_of(
'/') + 1, 4);
381 new_row.push_back(
year);
384 month = timestamp.substr(0, timestamp.find_first_of(
'/'));
385 new_row.push_back(month);
388 time = timestamp.substr(timestamp.find(
' ') + 1);
389 new_row.push_back(time);
393 new_row.insert(new_row.end(), row.begin(), row.end());
395 new_data.push_back(new_row);
400 new_headers.insert(new_headers.begin(),
"Month");
402 new_headers.insert(new_headers.begin(),
"Year");
404 new_headers.insert(new_headers.begin(),
"Time");
410 headers_ = new_headers;
412 return std::make_pair(new_headers, new_data);
419 std::vector<size_t> column_indices;
420 for (
const std::string &column : sort_columns) {
421 auto iter = std::find(headers_.begin(), headers_.end(), column);
422 if (iter != headers_.end()) {
423 size_t index = std::distance(headers_.begin(), iter);
424 column_indices.push_back(index);
429 std::stable_sort(data_.begin(),
431 [&](
const std::vector<std::string> &row1,
432 const std::vector<std::string> &row2) {
433 for (size_t index : column_indices) {
434 if (row1[index] != row2[index]) {
436 return row1[index] < row2[index];
438 return row1[index] > row2[index];
448 std::vector<gpmp::core::DataTableStr>
451 std::vector<int> group_by_indices;
454 for (
const std::string &column_name : group_by_columns) {
455 std::cout <<
"Searching for column: " << column_name << std::endl;
459 std::find(headers_.begin(), headers_.end(), column_name);
462 if (column_iter == headers_.end()) {
467 int column_index = std::distance(headers_.begin(), column_iter);
469 group_by_indices.push_back(column_index);
477 for (
const std::vector<std::string> &row : data_) {
479 std::vector<std::string> group_key;
481 for (
int index : group_by_indices) {
482 group_key.push_back(row[index]);
486 auto group_iter = std::find_if(
489 [&group_key](
const std::pair<std::vector<std::string>,
491 return group.first == group_key;
494 if (group_iter == groups.end()) {
498 group_iter = groups.end() - 1;
501 group_iter->second.second.push_back(row);
505 std::vector<gpmp::core::DataTableStr> grouped_data;
507 for (
const auto &group : groups) {
508 grouped_data.push_back(group.second);
517 const std::vector<gpmp::core::DataTableStr> &groups)
const {
518 if (groups.empty()) {
520 return std::make_pair(std::vector<std::string>(),
521 std::vector<std::vector<std::string>>());
524 std::vector<std::vector<std::string>> first_rows;
527 if (!group.second.empty()) {
528 first_rows.push_back(
533 if (!first_rows.empty()) {
535 return std::make_pair(groups[0].first, first_rows);
538 return std::make_pair(groups[0].first,
539 std::vector<std::vector<std::string>>());
549 for (
const auto &v : src.first) {
551 if (std::regex_match(v, std::regex(
"\\d+"))) {
552 dest.first.push_back(std::stoi(v));
555 for (
const auto &vv : src.second) {
556 std::vector<int64_t> new_vec;
557 for (
const auto &v : vv) {
559 if (std::regex_match(v, std::regex(
"\\d+"))) {
560 new_vec.push_back(std::stoi(v));
563 dest.second.push_back(new_vec);
572 for (
const auto &v : src.first) {
573 if (std::regex_match(v, std::regex(
"[-+]?\\d*\\.?\\d+"))) {
574 dest.first.push_back(std::stold(v));
578 for (
const auto &vv : src.second) {
579 std::vector<long double> new_vec;
580 for (
const auto &v : vv) {
581 if (std::regex_match(v, std::regex(
"[-+]?\\d*\\.?\\d+"))) {
582 new_vec.push_back(std::stold(v));
585 dest.second.push_back(new_vec);
DataType inferType(const std::vector< std::string > &column)
std::vector< std::vector< std::string > > data_
std::vector< DataTableStr > group_by(std::vector< std::string > group_by_columns)
Groups the data by specified columns.
DataTableStr csv_read(std::string filename, std::vector< std::string > columns={})
Reads a CSV file and returns a DataTableStr; parses CSV files and stores all data as strings.
TableType native_type(const std::vector< std::string > &skip_columns={})
Converts DataTable column's rows to their native types. Since the existing DataTable read/load relate...
DataTableStr first(const std::vector< gpmp::core::DataTableStr > &groups) const
Gets the first element of each created group.
void sort(const std::vector< std::string > &sort_columns, bool ascending=true)
Sorts the rows of the DataTable based on specified columns.
std::vector< std::string > headers_
DataTableStr datetime(std::string column_name, bool extract_year=true, bool extract_month=true, bool extract_time=false)
Extracts date and time components from a timestamp column.
DataTableInt str_to_int(DataTableStr src)
Converts a DataTableStr to a DataTableInt.
DataTableDouble str_to_double(DataTableStr src)
Converts a DataTableStr to a DataTableDouble.
void log(LogLevel level, const std::string &message)
Logs a message with the specified log level.
std::string dt_to_str(gpmp::core::DataType type)
bool is_double(const std::string &str)
static gpmp::core::Logger _log_
bool is_int(const std::string &str)
std::pair< std::vector< long double >, std::vector< std::vector< long double > > > DataTableDouble
std::pair< std::vector< std::string >, std::vector< std::vector< std::string > > > DataTableStr
std::pair< std::vector< std::string >, std::vector< std::vector< std::variant< int64_t, long double, std::string > > > > TableType
DataType
enum for representing different data types
std::pair< std::vector< int64_t >, std::vector< std::vector< int64_t > > > DataTableInt
Miscellaneous utility methods related to openGPMP.