LCOV - code coverage report
Current view: top level - modules/core - datatable.cpp (source / functions) Hit Total Coverage
Test: lcov.info Lines: 0 174 0.0 %
Date: 2024-05-13 05:06:06 Functions: 0 9 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*************************************************************************
       2             :  *
       3             :  *  Project
       4             :  *                         _____ _____  __  __ _____
       5             :  *                        / ____|  __ \|  \/  |  __ \
       6             :  *  ___  _ __   ___ _ __ | |  __| |__) | \  / | |__) |
       7             :  * / _ \| '_ \ / _ \ '_ \| | |_ |  ___/| |\/| |  ___/
       8             :  *| (_) | |_) |  __/ | | | |__| | |    | |  | | |
       9             :  * \___/| .__/ \___|_| |_|\_____|_|    |_|  |_|_|
      10             :  *      | |
      11             :  *      |_|
      12             :  *
      13             :  * Copyright (C) Akiel Aries, <akiel@akiel.org>, et al.
      14             :  *
      15             :  * This software is licensed as described in the file LICENSE, which
      16             :  * you should have received as part of this distribution. The terms
      17             :  * among other details are referenced in the official documentation
      18             :  * seen here : https://akielaries.github.io/openGPMP/ along with
      19             :  * important files seen in this project.
      20             :  *
      21             :  * You may opt to use, copy, modify, merge, publish, distribute
      22             :  * and/or sell copies of the Software, and permit persons to whom
      23             :  * the Software is furnished to do so, under the terms of the
      24             :  * LICENSE file.
      25             :  *
      26             :  *
      27             :  *
      28             :  * This software is distributed on an AS IS basis, WITHOUT
      29             :  * WARRANTY OF ANY KIND, either express or implied.
      30             :  *
      31             :  ************************************************************************/
      32             : #include <algorithm>
      33             : #include <fstream>
      34             : #include <iomanip>
      35             : #include <iostream>
      36             : #include <map>
      37             : #include <openGPMP/core/datatable.hpp>
      38             : #include <openGPMP/core/utils.hpp>
      39             : #include <regex>
      40             : #include <sstream>
      41             : #include <string>
      42             : #include <vector>
      43             : // for memory mapping files
      44             : #include <fcntl.h>
      45             : #include <sys/mman.h>
      46             : #include <sys/stat.h>
      47             : #include <unistd.h>
      48             : 
      49             : /** Logger class object*/
      50             : static gpmp::core::Logger _log_;
      51             : 
      52             : // create method to create datatable from scratch? insert, drop, etc?
      53             : 
      54             : // TODO : optimize these methods, CSV reader using threads? loop unrolling?,
      55             : // etc? conversion functions to be quicker,
      56             : gpmp::core::DataTableStr
      57           0 : gpmp::core::DataTable::csv_read(std::string filename,
      58             :                                 std::vector<std::string> columns) {
      59             : 
      60           0 :     int fd = open(filename.c_str(), O_RDONLY);
      61           0 :     if (fd == -1) {
      62             :         // Handle file open error
      63           0 :         perror("Error opening file");
      64           0 :         exit(EXIT_FAILURE);
      65             :     }
      66             : 
      67           0 :     off_t size = lseek(fd, 0, SEEK_END);
      68             :     char *file_data =
      69           0 :         static_cast<char *>(mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0));
      70             : 
      71           0 :     if (file_data == MAP_FAILED) {
      72             :         // Handle memory mapping error
      73           0 :         perror("Error mapping file to memory");
      74           0 :         close(fd);
      75           0 :         exit(EXIT_FAILURE);
      76             :     }
      77             : 
      78           0 :     std::stringstream file_stream(file_data);
      79           0 :     std::vector<std::vector<std::string>> data;
      80           0 :     std::string line;
      81             : 
      82             :     // Get the header line and parse the column names
      83           0 :     getline(file_stream, line);
      84           0 :     std::stringstream header(line);
      85           0 :     std::vector<std::string> header_cols;
      86           0 :     std::string columnName;
      87             : 
      88           0 :     while (getline(header, columnName, ',')) {
      89           0 :         header_cols.push_back(columnName);
      90             :     }
      91             : 
      92             :     // If no columns are specified, read in all columns
      93           0 :     if (columns.empty()) {
      94           0 :         columns = header_cols;
      95             :     }
      96             : 
      97             :     // Check if specified columns exist in the header
      98           0 :     for (const auto &column : columns) {
      99           0 :         if (std::find(header_cols.begin(), header_cols.end(), column) ==
     100           0 :             header_cols.end()) {
     101             :             // Handle column not found error
     102           0 :             perror(("Column: " + column + " not found").c_str());
     103           0 :             munmap(file_data, size);
     104           0 :             close(fd);
     105           0 :             exit(EXIT_FAILURE);
     106             :         }
     107             :     }
     108             : 
     109             :     // Read in the data rows
     110           0 :     while (getline(file_stream, line)) {
     111           0 :         std::vector<std::string> row;
     112           0 :         std::stringstream rowStream(line);
     113           0 :         std::string value;
     114           0 :         int columnIndex = 0;
     115             : 
     116           0 :         while (getline(rowStream, value, ',')) {
     117             :             // If column is specified, only read in specified columns
     118           0 :             if (std::find(columns.begin(),
     119             :                           columns.end(),
     120           0 :                           header_cols[columnIndex]) != columns.end()) {
     121           0 :                 row.push_back(value);
     122             :             }
     123             : 
     124           0 :             columnIndex++;
     125             :         }
     126             : 
     127           0 :         if (!row.empty()) {
     128           0 :             data.push_back(row);
     129             :         }
     130           0 :     }
     131             : 
     132             :     // populate headers_ class variable
     133           0 :     headers_ = columns;
     134             :     // populate data_ class variable
     135           0 :     data_ = data;
     136             : 
     137           0 :     munmap(file_data, size);
     138           0 :     close(fd);
     139             : 
     140           0 :     return std::make_pair(headers_, data_);
     141           0 : }
     142             : 
     143             : // Extracts date/time information from given column
     144             : // TODO: add additional options for detecting/converting date/time columns
     145             : // to numeric formats
     146             : gpmp::core::DataTableStr
     147           0 : gpmp::core::DataTable::datetime(std::string column_name,
     148             :                                 bool extract_year,
     149             :                                 bool extract_month,
     150             :                                 bool extract_time) {
     151             :     // Find the index of the specified column
     152           0 :     auto column_iter = std::find(headers_.begin(), headers_.end(), column_name);
     153           0 :     if (column_iter == headers_.end()) {
     154           0 :         _log_.log(ERROR, "Column: " + column_name + " node found");
     155           0 :         exit(EXIT_FAILURE);
     156             :     }
     157           0 :     int column_index = std::distance(headers_.begin(), column_iter);
     158             : 
     159             :     // Extract components from each row
     160           0 :     std::vector<std::string> new_headers = headers_;
     161           0 :     std::vector<std::vector<std::string>> new_data;
     162             : 
     163             :     // Iterate and populate the additional columns
     164           0 :     for (size_t row_index = 0; row_index < data_.size(); ++row_index) {
     165           0 :         std::vector<std::string> row = data_[row_index];
     166             :         // If column row is not found
     167           0 :         if (row.size() <= static_cast<size_t>(column_index)) {
     168           0 :             _log_.log(ERROR, "Column: " + column_name + " not found");
     169             : 
     170           0 :             exit(EXIT_FAILURE);
     171             :         }
     172             : 
     173           0 :         std::string timestamp = row[column_index];
     174           0 :         std::string year, month, time;
     175             : 
     176             :         // Create a new row with extracted components
     177           0 :         std::vector<std::string> new_row;
     178             : 
     179             :         // Extract year, month, and time components
     180           0 :         if (extract_year) {
     181           0 :             year = timestamp.substr(timestamp.find_last_of('/') + 1, 4);
     182           0 :             new_row.push_back(year);
     183             :         }
     184           0 :         if (extract_month) {
     185           0 :             month = timestamp.substr(0, timestamp.find_first_of('/'));
     186           0 :             new_row.push_back(month);
     187             :         }
     188           0 :         if (extract_time) {
     189           0 :             time = timestamp.substr(timestamp.find(' ') + 1);
     190           0 :             new_row.push_back(time);
     191             :         }
     192             : 
     193             :         // append original row data
     194           0 :         new_row.insert(new_row.end(), row.begin(), row.end());
     195             :         // add new rows
     196           0 :         new_data.push_back(new_row);
     197           0 :     }
     198             : 
     199             :     // Create new headers based on the extracted components
     200           0 :     if (extract_month)
     201           0 :         new_headers.insert(new_headers.begin(), "Month");
     202           0 :     if (extract_year)
     203           0 :         new_headers.insert(new_headers.begin(), "Year");
     204           0 :     if (extract_time)
     205           0 :         new_headers.insert(new_headers.begin(), "Time");
     206             : 
     207             :     // set class car data_ to hold rows/lines
     208           0 :     data_ = new_data;
     209             :     // set class var modified headers to new headers
     210             :     // new_headers_ = new_headers;
     211           0 :     headers_ = new_headers;
     212             : 
     213           0 :     return std::make_pair(new_headers, new_data);
     214           0 : }
     215             : 
     216             : // Sort specified columns, by default in asending order
     217           0 : void gpmp::core::DataTable::sort(const std::vector<std::string> &sort_columns,
     218             :                                  bool ascending) {
     219             :     // Extract the column indices to be sorted by from the original data
     220           0 :     std::vector<size_t> column_indices;
     221           0 :     for (const std::string &column : sort_columns) {
     222           0 :         auto iter = std::find(headers_.begin(), headers_.end(), column);
     223           0 :         if (iter != headers_.end()) {
     224           0 :             size_t index = std::distance(headers_.begin(), iter);
     225           0 :             column_indices.push_back(index);
     226             :         }
     227             :     }
     228             : 
     229             :     // Sort the data based on the specified columns
     230           0 :     std::stable_sort(data_.begin(),
     231             :                      data_.end(),
     232           0 :                      [&](const std::vector<std::string> &row1,
     233             :                          const std::vector<std::string> &row2) {
     234           0 :                          for (size_t index : column_indices) {
     235           0 :                              if (row1[index] != row2[index]) {
     236           0 :                                  if (ascending) {
     237           0 :                                      return row1[index] < row2[index];
     238             :                                  } else {
     239           0 :                                      return row1[index] > row2[index];
     240             :                                  }
     241             :                              }
     242             :                          }
     243             :                          // Rows are equal, nothing to sort
     244           0 :                          return false;
     245             :                      });
     246           0 : }
     247             : 
     248             : // Group rows by specific columns
     249             : std::vector<gpmp::core::DataTableStr>
     250           0 : gpmp::core::DataTable::group_by(std::vector<std::string> group_by_columns) {
     251             :     // Find the indices of the specified group by columns
     252           0 :     std::vector<int> group_by_indices;
     253             : 
     254             :     // Traverse group column names
     255           0 :     for (const std::string &column_name : group_by_columns) {
     256           0 :         std::cout << "Searching for column: " << column_name << std::endl;
     257             : 
     258             :         // Find start/end and match column name
     259             :         auto column_iter =
     260           0 :             std::find(headers_.begin(), headers_.end(), column_name);
     261             : 
     262             :         // If no columns
     263           0 :         if (column_iter == headers_.end()) {
     264           0 :             _log_.log(ERROR, "Column: " + column_name + " not found");
     265           0 :             exit(EXIT_FAILURE);
     266             :         }
     267             :         // column index set to distance from start of first col to nexter iter
     268           0 :         int column_index = std::distance(headers_.begin(), column_iter);
     269             :         // add column index to group
     270           0 :         group_by_indices.push_back(column_index);
     271             :     }
     272             : 
     273             :     // Group the data based on the specified columns using a vector
     274             :     std::vector<std::pair<std::vector<std::string>, gpmp::core::DataTableStr>>
     275           0 :         groups;
     276             : 
     277             :     // Traverse row/line data
     278           0 :     for (const std::vector<std::string> &row : data_) {
     279             :         // store group key for each row
     280           0 :         std::vector<std::string> group_key;
     281             :         // Fill group key from specified group column names
     282           0 :         for (int index : group_by_indices) {
     283           0 :             group_key.push_back(row[index]);
     284             :         }
     285             : 
     286             :         // Check if the group already exists
     287           0 :         auto group_iter = std::find_if(
     288             :             groups.begin(),
     289             :             groups.end(),
     290           0 :             [&group_key](const std::pair<std::vector<std::string>,
     291           0 :                                          gpmp::core::DataTableStr> &group) {
     292           0 :                 return group.first == group_key;
     293             :             });
     294             :         // If the group DNE create a new one to add to groups vector
     295           0 :         if (group_iter == groups.end()) {
     296             :             // Create a new group
     297           0 :             groups.push_back(
     298           0 :                 {group_key, gpmp::core::DataTableStr(headers_, {})});
     299           0 :             group_iter = groups.end() - 1;
     300             :         }
     301             :         // Add current row to group
     302           0 :         group_iter->second.second.push_back(row);
     303           0 :     }
     304             : 
     305             :     // Extract the grouped data into a vector
     306           0 :     std::vector<gpmp::core::DataTableStr> grouped_data;
     307             :     // Iterate over sorted groups to push onto result vector
     308           0 :     for (const auto &group : groups) {
     309           0 :         grouped_data.push_back(group.second);
     310             :     }
     311             : 
     312             :     // Return final DataTableStr type
     313           0 :     return grouped_data;
     314           0 : }
     315             : 
     316             : // Get first element of each created group
     317           0 : gpmp::core::DataTableStr gpmp::core::DataTable::first(
     318             :     const std::vector<gpmp::core::DataTableStr> &groups) const {
     319           0 :     if (groups.empty()) {
     320             :         // Handle the case when there are no groups
     321           0 :         return std::make_pair(std::vector<std::string>(),
     322           0 :                               std::vector<std::vector<std::string>>());
     323             :     }
     324             : 
     325           0 :     std::vector<std::vector<std::string>> first_rows;
     326             : 
     327           0 :     for (const gpmp::core::DataTableStr &group : groups) {
     328           0 :         if (!group.second.empty()) {
     329           0 :             first_rows.push_back(
     330           0 :                 group.second[0]); // Get the first row of each group
     331             :         }
     332             :     }
     333             : 
     334           0 :     if (!first_rows.empty()) {
     335             :         // Assuming all groups have the same headers as the first group
     336           0 :         return std::make_pair(groups[0].first, first_rows);
     337             :     } else {
     338             :         // Handle the case when there are no first rows found.
     339           0 :         return std::make_pair(groups[0].first,
     340           0 :                               std::vector<std::vector<std::string>>());
     341             :     }
     342           0 : }
     343             : 
     344             : // Prints some basic information about a DataTable object
     345             : 
     346             : gpmp::core::DataTableInt
     347           0 : gpmp::core::DataTable::str_to_int(gpmp::core::DataTableStr src) {
     348           0 :     gpmp::core::DataTableInt dest;
     349             : 
     350           0 :     for (const auto &v : src.first) {
     351             :         // check if v contains only digits
     352           0 :         if (std::regex_match(v, std::regex("\\d+"))) {
     353           0 :             dest.first.push_back(std::stoi(v));
     354             :         }
     355             :     }
     356           0 :     for (const auto &vv : src.second) {
     357           0 :         std::vector<int64_t> new_vec;
     358           0 :         for (const auto &v : vv) {
     359             :             // check if v contains only digits
     360           0 :             if (std::regex_match(v, std::regex("\\d+"))) {
     361           0 :                 new_vec.push_back(std::stoi(v));
     362             :             }
     363             :         }
     364           0 :         dest.second.push_back(new_vec);
     365           0 :     }
     366           0 :     return dest;
     367           0 : }
     368             : 
     369             : gpmp::core::DataTableDouble
     370           0 : gpmp::core::DataTable::str_to_double(gpmp::core::DataTableStr src) {
     371           0 :     gpmp::core::DataTableDouble dest;
     372             : 
     373           0 :     for (const auto &v : src.first) {
     374           0 :         if (std::regex_match(v, std::regex("[-+]?\\d*\\.?\\d+"))) {
     375           0 :             dest.first.push_back(std::stold(v));
     376             :         }
     377             :     }
     378             : 
     379           0 :     for (const auto &vv : src.second) {
     380           0 :         std::vector<long double> new_vec;
     381           0 :         for (const auto &v : vv) {
     382           0 :             if (std::regex_match(v, std::regex("[-+]?\\d*\\.?\\d+"))) {
     383           0 :                 new_vec.push_back(std::stold(v));
     384             :             }
     385             :         }
     386           0 :         dest.second.push_back(new_vec);
     387           0 :     }
     388             : 
     389           0 :     return dest;
     390           0 : }

Generated by: LCOV version 1.14