49 #include <unordered_set>
59 bool is_int(
const std::string &str) {
61 return std::regex_match(str, std::regex(R
"(-?\d+)"));
66 return std::regex_match(str, std::regex(R
"(-?\d+\.\d+)"));
74 const char *
map_file(
const char *fname,
size_t &length) {
75 int fd = open(fname, O_RDONLY);
81 if (fstat(fd, &sb) == -1)
86 const char *addr =
static_cast<const char *
>(
87 mmap(NULL, length, PROT_READ, MAP_PRIVATE, fd, 0
u));
88 if (addr == MAP_FAILED)
99 std::vector<std::string> columns) {
100 std::ifstream file(filename);
101 file.rdbuf()->pubsetbuf(
nullptr, 0);
103 if (!file.is_open()) {
105 throw std::runtime_error(
"Unable to open file: " + filename +
".");
112 if (!getline(file, line)) {
114 throw std::runtime_error(
"Empty file: " + filename +
".");
117 std::stringstream header(line);
118 std::vector<std::string> header_cols;
119 std::string column_name;
121 while (getline(header, column_name,
',')) {
122 header_cols.emplace_back(column_name);
126 if (columns.empty()) {
127 columns = header_cols;
131 for (
const auto &column : columns) {
132 if (std::find(header_cols.begin(), header_cols.end(), column) ==
135 throw std::runtime_error(
"Column: " + column +
" not found");
140 std::vector<std::variant<int64_t, long double, std::string>> row_vector;
142 while (getline(file, line)) {
143 std::stringstream rowStream(line);
147 while (getline(rowStream, value,
',')) {
148 if (std::find(columns.begin(),
150 header_cols[columnIndex]) != columns.end()) {
152 size_t decimalPointCount =
153 std::count(value.begin(), value.end(),
'.');
154 if (decimalPointCount == 1) {
156 long double double_value = std::stold(value);
157 row_vector.emplace_back(double_value);
158 }
catch (
const std::invalid_argument &) {
159 row_vector.emplace_back(value);
161 }
else if ((value.find_first_not_of(
"0123456789-") ==
162 std::string::npos) &&
163 (std::count(value.begin(), value.end(),
'-') <= 1)) {
165 int64_t int_value = std::stoll(value);
166 row_vector.emplace_back(int_value);
167 }
catch (
const std::invalid_argument &) {
168 row_vector.emplace_back(value);
171 row_vector.emplace_back(value);
177 if (!row_vector.empty()) {
178 data.emplace_back(row_vector);
190 return make_pair(columns, data);
198 int num_columns = data.first.size();
199 int num_rows = data.second.size();
200 int num_omitted_rows = 0;
202 std::vector<int> max_column_widths(num_columns, 0);
205 for (
int i = 0; i < num_columns; i++) {
206 max_column_widths[i] = data.first[i].length();
210 for (
int i = 0; i < num_columns; i++) {
211 for (
const auto &row : data.second) {
212 if (i <
static_cast<int>(row.size())) {
214 [&max_column_widths, &i](
const auto &cellValue) {
215 using T = std::decay_t<decltype(cellValue)>;
216 if constexpr (std::is_same_v<T, std::string>) {
217 max_column_widths[i] =
218 std::max(max_column_widths[i],
219 static_cast<int>(cellValue.length()));
220 }
else if constexpr (std::is_integral_v<T> ||
221 std::is_floating_point_v<T>) {
222 max_column_widths[i] = std::max(
223 max_column_widths[i],
225 std::to_string(cellValue).length()));
233 const int dateTimeColumnIndex = 0;
234 max_column_widths[dateTimeColumnIndex] =
235 std::max(max_column_widths[dateTimeColumnIndex], 0);
238 auto printRow = [&data, &max_column_widths, num_columns](
int row_index) {
239 std::cout << std::setw(7) << std::right << row_index <<
" ";
241 for (
int j = 0; j < num_columns; j++) {
242 if (j <
static_cast<int>(data.second[row_index].size())) {
244 [&max_column_widths, &j](
const auto &cellValue) {
245 using T = std::decay_t<decltype(cellValue)>;
246 if constexpr (std::is_same_v<T, double> ||
247 std::is_same_v<T, long double>) {
250 std::string cellValueStr =
251 std::to_string(cellValue);
253 cellValueStr.find_last_not_of(
'0') + 1,
256 cellValueStr.find_last_not_of(
'.') + 1,
259 std::cout << std::setw(max_column_widths[j])
260 << std::right << cellValueStr <<
" ";
262 std::cout << std::setw(max_column_widths[j])
263 << std::right << cellValue <<
" ";
266 data.second[row_index][j]);
270 std::cout << std::endl;
274 std::cout << std::setw(7) << std::right <<
"Index"
276 for (
int i = 0; i < num_columns; i++) {
277 std::cout << std::setw(max_column_widths[i]) << std::right
278 << data.first[i] <<
" ";
280 std::cout << std::endl;
282 int num_elements = data.second.size();
283 if (!display_all && num_elements >
MAX_ROWS) {
287 num_omitted_rows = num_elements -
MAX_ROWS;
288 std::cout <<
"...\n";
289 std::cout <<
"[" << num_omitted_rows <<
" rows omitted]\n";
290 for (
int i = num_elements -
SHOW_ROWS; i < num_elements; i++) {
295 for (
int i = 0; i < num_elements; i++) {
301 std::cout <<
"[" << num_rows <<
" rows"
302 <<
" x " << num_columns <<
" columns";
303 std::cout <<
"]\n\n";
309 display(std::make_pair(headers_, data_), display_all);
314 int integer_count = 0;
315 int double_count = 0;
316 int string_count = 0;
318 for (
const std::string &cell : column) {
329 "int/double/str: " + std::to_string(integer_count) +
"/" +
330 std::to_string(double_count) +
"/" +
331 std::to_string(string_count));
333 if (integer_count > double_count) {
335 }
else if (double_count > integer_count) {
346 return "long double";
348 return "std::string";
356 const std::vector<std::string> &skip_columns) {
358 std::cout <<
"HEADERS:" << headers_.size() << std::endl;
359 std::cout <<
"ROWS:" << data_.size() << std::endl;
362 for (
const std::string &header : headers_) {
364 if (std::find(skip_columns.begin(), skip_columns.end(), header) !=
365 skip_columns.end()) {
369 std::cout << header <<
" ";
371 mixed_data.first.emplace_back(header);
374 std::cout << std::endl;
377 for (
size_t col = 0; col < headers_.size(); ++col) {
378 if (std::find(skip_columns.begin(),
380 headers_[col]) != skip_columns.end()) {
386 std::vector<std::string> column_data;
387 for (
const std::vector<std::variant<int64_t, long double, std::string>>
389 column_data.emplace_back(
390 std::get<std::string>(row[col]));
393 std::vector<std::variant<int64_t, long double, std::string>>
403 std::cout <<
"INT\n";
404 for (
const std::string &cell : column_data) {
405 converted_data.emplace_back(std::stoi(cell));
408 std::cout <<
"DOUBLE\n";
409 for (
const std::string &cell : column_data) {
410 converted_data.emplace_back(std::stod(cell));
413 std::cout <<
"STRING\n";
414 for (
const std::string &cell : column_data) {
415 converted_data.emplace_back(cell);
419 mixed_data.second.emplace_back(converted_data);
649 std::vector<double> column_memory_usages(headers_.size(), 0.0);
650 std::vector<std::string> column_data_types(headers_.size());
651 double total_memory_usage_kb = 0.0;
654 size_t memory_usage_bytes =
sizeof(headers_);
655 for (
const auto &row : data_) {
656 for (
size_t i = 0; i < row.size(); ++i) {
657 if (std::holds_alternative<int64_t>(row[i])) {
658 memory_usage_bytes +=
sizeof(int64_t);
659 column_memory_usages[i] +=
660 static_cast<double>(
sizeof(int64_t)) / 1024.0;
661 column_data_types[i] =
"int64_t";
662 }
else if (std::holds_alternative<long double>(row[i])) {
663 memory_usage_bytes +=
sizeof(
long double);
664 column_memory_usages[i] +=
665 static_cast<double>(
sizeof(
long double)) / 1024.0;
666 column_data_types[i] =
"long double";
667 }
else if (std::holds_alternative<std::string>(row[i])) {
668 memory_usage_bytes += std::get<std::string>(row[i]).capacity();
669 column_memory_usages[i] +=
671 std::get<std::string>(row[i]).capacity()) /
673 column_data_types[i] =
"std::string";
679 total_memory_usage_kb =
static_cast<double>(memory_usage_bytes) / 1024.0;
682 size_t max_column_name_length = 0;
683 for (
const std::string &column : headers_) {
684 max_column_name_length =
685 std::max(max_column_name_length, column.length());
689 size_t max_data_type_length = 0;
690 for (
const std::string &data_type : column_data_types) {
691 max_data_type_length =
692 std::max(max_data_type_length, data_type.length());
696 int column_width =
static_cast<int>(std::max(max_column_name_length,
697 max_data_type_length)) +
701 std::cout << std::left << std::setw(column_width) <<
"Column"
702 << std::setw(column_width) <<
"Type" << std::setw(column_width)
703 <<
"Memory Usage (KB)" << std::endl;
706 for (
size_t i = 0; i < headers_.size(); ++i) {
707 std::cout << std::left << std::setw(column_width) << headers_[i]
708 << std::setw(column_width) << column_data_types[i]
709 << std::setw(column_width) << std::fixed
710 << std::setprecision(2) << column_memory_usages[i]
715 std::cout <<
"\nTotal Memory Usage: " << std::fixed << std::setprecision(2)
716 << total_memory_usage_kb <<
" KB" << std::endl;
724 for (
const auto &v : src.first) {
726 if (std::regex_match(v, std::regex(
"\\d+"))) {
727 dest.first.emplace_back(std::stoi(v));
730 for (
const auto &vv : src.second) {
731 std::vector<int64_t> new_vec;
732 for (
const auto &v : vv) {
734 if (std::regex_match(v, std::regex(
"\\d+"))) {
735 new_vec.emplace_back(std::stoi(v));
738 dest.second.emplace_back(new_vec);
747 for (
const auto &v : src.first) {
748 if (std::regex_match(v, std::regex(
"[-+]?\\d*\\.?\\d+"))) {
749 dest.first.emplace_back(std::stold(v));
753 for (
const auto &vv : src.second) {
754 std::vector<long double> new_vec;
755 for (
const auto &v : vv) {
756 if (std::regex_match(v, std::regex(
"[-+]?\\d*\\.?\\d+"))) {
757 new_vec.emplace_back(std::stold(v));
760 dest.second.emplace_back(new_vec);
DataType inferType(const std::vector< std::string > &column)
std::vector< std::vector< std::string > > data_
DataTableStr csv_read(std::string filename, std::vector< std::string > columns={})
Reads a CSV file and returns a DataTableStr. Parses CSV files and stores all data as strings.
void info()
Displays data types and null values for each column.
TableType native_type(const std::vector< std::string > &skip_columns={})
Converts DataTable column's rows to their native types. Since the existing DataTable read/load relate...
std::vector< std::string > headers_
void display(std::pair< std::vector< T >, std::vector< std::vector< T >>> data, bool display_all=false)
Sort a DataTable based on a specified column.
DataTableInt str_to_int(DataTableStr src)
Converts a DataTableStr to a DataTableInt.
DataTableDouble str_to_double(DataTableStr src)
Converts a DataTableStr to a DataTableDouble.
void log(LogLevel level, const std::string &message)
Logs a message with the specified log level.
std::string dt_to_str(gpmp::core::DataType type)
const char * map_file(const char *fname, size_t &length)
void handle_error(const char *msg)
bool is_double(const std::string &str)
static gpmp::core::Logger _log_
bool is_int(const std::string &str)
std::pair< std::vector< long double >, std::vector< std::vector< long double > > > DataTableDouble
std::pair< std::vector< std::string >, std::vector< std::vector< std::string > > > DataTableStr
std::pair< std::vector< std::string >, std::vector< std::vector< std::variant< int64_t, long double, std::string > > > > TableType
DataType
enum for representing different data types
std::pair< std::vector< int64_t >, std::vector< std::vector< int64_t > > > DataTableInt
std::vector< std::vector< std::variant< int64_t, long double, std::string > > > MixedType
Miscellaneous utility methods related to openGPMP.