58 std::vector<std::string> columns) {
60 int fd = open(filename.c_str(), O_RDONLY);
63 perror(
"Error opening file");
67 off_t size = lseek(fd, 0, SEEK_END);
69 static_cast<char *
>(mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0));
71 if (file_data == MAP_FAILED) {
73 perror(
"Error mapping file to memory");
78 std::stringstream file_stream(file_data);
79 std::vector<std::vector<std::string>> data;
83 getline(file_stream, line);
84 std::stringstream header(line);
85 std::vector<std::string> header_cols;
86 std::string columnName;
88 while (getline(header, columnName,
',')) {
89 header_cols.push_back(columnName);
93 if (columns.empty()) {
94 columns = header_cols;
98 for (
const auto &column : columns) {
99 if (std::find(header_cols.begin(), header_cols.end(), column) ==
102 perror((
"Column: " + column +
" not found").c_str());
103 munmap(file_data, size);
110 while (getline(file_stream, line)) {
111 std::vector<std::string> row;
112 std::stringstream rowStream(line);
116 while (getline(rowStream, value,
',')) {
118 if (std::find(columns.begin(),
120 header_cols[columnIndex]) != columns.end()) {
121 row.push_back(value);
137 munmap(file_data, size);
152 auto column_iter = std::find(headers_.begin(), headers_.end(), column_name);
153 if (column_iter == headers_.end()) {
157 int column_index = std::distance(headers_.begin(), column_iter);
160 std::vector<std::string> new_headers = headers_;
161 std::vector<std::vector<std::string>> new_data;
164 for (
size_t row_index = 0; row_index < data_.size(); ++row_index) {
165 std::vector<std::string> row = data_[row_index];
167 if (row.size() <=
static_cast<size_t>(column_index)) {
173 std::string timestamp = row[column_index];
174 std::string
year, month, time;
177 std::vector<std::string> new_row;
181 year = timestamp.substr(timestamp.find_last_of(
'/') + 1, 4);
182 new_row.push_back(
year);
185 month = timestamp.substr(0, timestamp.find_first_of(
'/'));
186 new_row.push_back(month);
189 time = timestamp.substr(timestamp.find(
' ') + 1);
190 new_row.push_back(time);
194 new_row.insert(new_row.end(), row.begin(), row.end());
196 new_data.push_back(new_row);
201 new_headers.insert(new_headers.begin(),
"Month");
203 new_headers.insert(new_headers.begin(),
"Year");
205 new_headers.insert(new_headers.begin(),
"Time");
211 headers_ = new_headers;
213 return std::make_pair(new_headers, new_data);
220 std::vector<size_t> column_indices;
221 for (
const std::string &column : sort_columns) {
222 auto iter = std::find(headers_.begin(), headers_.end(), column);
223 if (iter != headers_.end()) {
224 size_t index = std::distance(headers_.begin(), iter);
225 column_indices.push_back(index);
230 std::stable_sort(data_.begin(),
232 [&](
const std::vector<std::string> &row1,
233 const std::vector<std::string> &row2) {
234 for (size_t index : column_indices) {
235 if (row1[index] != row2[index]) {
237 return row1[index] < row2[index];
239 return row1[index] > row2[index];
249 std::vector<gpmp::core::DataTableStr>
252 std::vector<int> group_by_indices;
255 for (
const std::string &column_name : group_by_columns) {
256 std::cout <<
"Searching for column: " << column_name << std::endl;
260 std::find(headers_.begin(), headers_.end(), column_name);
263 if (column_iter == headers_.end()) {
268 int column_index = std::distance(headers_.begin(), column_iter);
270 group_by_indices.push_back(column_index);
278 for (
const std::vector<std::string> &row : data_) {
280 std::vector<std::string> group_key;
282 for (
int index : group_by_indices) {
283 group_key.push_back(row[index]);
287 auto group_iter = std::find_if(
290 [&group_key](
const std::pair<std::vector<std::string>,
292 return group.first == group_key;
295 if (group_iter == groups.end()) {
299 group_iter = groups.end() - 1;
302 group_iter->second.second.push_back(row);
306 std::vector<gpmp::core::DataTableStr> grouped_data;
308 for (
const auto &group : groups) {
309 grouped_data.push_back(group.second);
318 const std::vector<gpmp::core::DataTableStr> &groups)
const {
319 if (groups.empty()) {
321 return std::make_pair(std::vector<std::string>(),
322 std::vector<std::vector<std::string>>());
325 std::vector<std::vector<std::string>> first_rows;
328 if (!group.second.empty()) {
329 first_rows.push_back(
334 if (!first_rows.empty()) {
336 return std::make_pair(groups[0].first, first_rows);
339 return std::make_pair(groups[0].first,
340 std::vector<std::vector<std::string>>());
350 for (
const auto &v : src.first) {
352 if (std::regex_match(v, std::regex(
"\\d+"))) {
353 dest.first.push_back(std::stoi(v));
356 for (
const auto &vv : src.second) {
357 std::vector<int64_t> new_vec;
358 for (
const auto &v : vv) {
360 if (std::regex_match(v, std::regex(
"\\d+"))) {
361 new_vec.push_back(std::stoi(v));
364 dest.second.push_back(new_vec);
373 for (
const auto &v : src.first) {
374 if (std::regex_match(v, std::regex(
"[-+]?\\d*\\.?\\d+"))) {
375 dest.first.push_back(std::stold(v));
379 for (
const auto &vv : src.second) {
380 std::vector<long double> new_vec;
381 for (
const auto &v : vv) {
382 if (std::regex_match(v, std::regex(
"[-+]?\\d*\\.?\\d+"))) {
383 new_vec.push_back(std::stold(v));
386 dest.second.push_back(new_vec);
std::vector< std::vector< std::string > > data_
std::vector< DataTableStr > group_by(std::vector< std::string > group_by_columns)
Groups the data by specified columns.
DataTableStr csv_read(std::string filename, std::vector< std::string > columns={})
Reads a CSV file and returns a DataTableStr; parses the CSV file and stores all data as strings.
DataTableStr first(const std::vector< gpmp::core::DataTableStr > &groups) const
Gets the first element of each created group.
void sort(const std::vector< std::string > &sort_columns, bool ascending=true)
Sorts the rows of the DataTable based on specified columns.
std::vector< std::string > headers_
DataTableStr datetime(std::string column_name, bool extract_year=true, bool extract_month=true, bool extract_time=false)
Extracts date and time components from a timestamp column.
DataTableInt str_to_int(DataTableStr src)
Converts a DataTableStr to a DataTableInt.
DataTableDouble str_to_double(DataTableStr src)
Converts a DataTableStr to a DataTableDouble.
void log(LogLevel level, const std::string &message)
Logs a message with the specified log level.
static gpmp::core::Logger _log_
std::pair< std::vector< long double >, std::vector< std::vector< long double > > > DataTableDouble
std::pair< std::vector< std::string >, std::vector< std::vector< std::string > > > DataTableStr
std::pair< std::vector< int64_t >, std::vector< std::vector< int64_t > > > DataTableInt
Miscellaneous utility methods related to openGPMP.