Line data Source code
1 : /*************************************************************************
2 : *
3 : * Project
4 : * _____ _____ __ __ _____
5 : * / ____| __ \| \/ | __ \
6 : * ___ _ __ ___ _ __ | | __| |__) | \ / | |__) |
7 : * / _ \| '_ \ / _ \ '_ \| | |_ | ___/| |\/| | ___/
8 : *| (_) | |_) | __/ | | | |__| | | | | | | |
9 : * \___/| .__/ \___|_| |_|\_____|_| |_| |_|_|
10 : * | |
11 : * |_|
12 : *
13 : * Copyright (C) Akiel Aries, <akiel@akiel.org>, et al.
14 : *
15 : * This software is licensed as described in the file LICENSE, which
16 : * you should have received as part of this distribution. The terms
17 : * among other details are referenced in the official documentation
18 : * seen here : https://akielaries.github.io/openGPMP/ along with
19 : * important files seen in this project.
20 : *
21 : * You may opt to use, copy, modify, merge, publish, distribute
22 : * and/or sell copies of the Software, and permit persons to whom
23 : * the Software is furnished to do so, under the terms of the
24 : * LICENSE file.
25 : *
26 : *
27 : *
28 : * This software is distributed on an AS IS basis, WITHOUT
29 : * WARRANTY OF ANY KIND, either express or implied.
30 : *
31 : ************************************************************************/
32 : #include <algorithm>
33 : #include <fstream>
34 : #include <iomanip>
35 : #include <iostream>
36 : #include <map>
37 : #include <openGPMP/core/datatable.hpp>
38 : #include <openGPMP/core/utils.hpp>
39 : #include <regex>
40 : #include <sstream>
41 : #include <string>
42 : #include <vector>
43 : // for memory mapping files
44 : #include <fcntl.h>
45 : #include <sys/mman.h>
46 : #include <sys/stat.h>
47 : #include <unistd.h>
48 :
49 : /** Logger class object*/
50 : static gpmp::core::Logger _log_;
51 :
52 : // create method to create datatable from scratch? insert, drop, etc?
53 :
54 : // TODO : optimize these methods, CSV reader using threads? loop unrolling?,
55 : // etc? conversion functions to be quicker,
56 : gpmp::core::DataTableStr
57 0 : gpmp::core::DataTable::csv_read(std::string filename,
58 : std::vector<std::string> columns) {
59 :
60 0 : int fd = open(filename.c_str(), O_RDONLY);
61 0 : if (fd == -1) {
62 : // Handle file open error
63 0 : perror("Error opening file");
64 0 : exit(EXIT_FAILURE);
65 : }
66 :
67 0 : off_t size = lseek(fd, 0, SEEK_END);
68 : char *file_data =
69 0 : static_cast<char *>(mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0));
70 :
71 0 : if (file_data == MAP_FAILED) {
72 : // Handle memory mapping error
73 0 : perror("Error mapping file to memory");
74 0 : close(fd);
75 0 : exit(EXIT_FAILURE);
76 : }
77 :
78 0 : std::stringstream file_stream(file_data);
79 0 : std::vector<std::vector<std::string>> data;
80 0 : std::string line;
81 :
82 : // Get the header line and parse the column names
83 0 : getline(file_stream, line);
84 0 : std::stringstream header(line);
85 0 : std::vector<std::string> header_cols;
86 0 : std::string columnName;
87 :
88 0 : while (getline(header, columnName, ',')) {
89 0 : header_cols.push_back(columnName);
90 : }
91 :
92 : // If no columns are specified, read in all columns
93 0 : if (columns.empty()) {
94 0 : columns = header_cols;
95 : }
96 :
97 : // Check if specified columns exist in the header
98 0 : for (const auto &column : columns) {
99 0 : if (std::find(header_cols.begin(), header_cols.end(), column) ==
100 0 : header_cols.end()) {
101 : // Handle column not found error
102 0 : perror(("Column: " + column + " not found").c_str());
103 0 : munmap(file_data, size);
104 0 : close(fd);
105 0 : exit(EXIT_FAILURE);
106 : }
107 : }
108 :
109 : // Read in the data rows
110 0 : while (getline(file_stream, line)) {
111 0 : std::vector<std::string> row;
112 0 : std::stringstream rowStream(line);
113 0 : std::string value;
114 0 : int columnIndex = 0;
115 :
116 0 : while (getline(rowStream, value, ',')) {
117 : // If column is specified, only read in specified columns
118 0 : if (std::find(columns.begin(),
119 : columns.end(),
120 0 : header_cols[columnIndex]) != columns.end()) {
121 0 : row.push_back(value);
122 : }
123 :
124 0 : columnIndex++;
125 : }
126 :
127 0 : if (!row.empty()) {
128 0 : data.push_back(row);
129 : }
130 0 : }
131 :
132 : // populate headers_ class variable
133 0 : headers_ = columns;
134 : // populate data_ class variable
135 0 : data_ = data;
136 :
137 0 : munmap(file_data, size);
138 0 : close(fd);
139 :
140 0 : return std::make_pair(headers_, data_);
141 0 : }
142 :
143 : // Extracts date/time information from given column
144 : // TODO: add additional options for detecting/converting date/time columns
145 : // to numeric formats
146 : gpmp::core::DataTableStr
147 0 : gpmp::core::DataTable::datetime(std::string column_name,
148 : bool extract_year,
149 : bool extract_month,
150 : bool extract_time) {
151 : // Find the index of the specified column
152 0 : auto column_iter = std::find(headers_.begin(), headers_.end(), column_name);
153 0 : if (column_iter == headers_.end()) {
154 0 : _log_.log(ERROR, "Column: " + column_name + " node found");
155 0 : exit(EXIT_FAILURE);
156 : }
157 0 : int column_index = std::distance(headers_.begin(), column_iter);
158 :
159 : // Extract components from each row
160 0 : std::vector<std::string> new_headers = headers_;
161 0 : std::vector<std::vector<std::string>> new_data;
162 :
163 : // Iterate and populate the additional columns
164 0 : for (size_t row_index = 0; row_index < data_.size(); ++row_index) {
165 0 : std::vector<std::string> row = data_[row_index];
166 : // If column row is not found
167 0 : if (row.size() <= static_cast<size_t>(column_index)) {
168 0 : _log_.log(ERROR, "Column: " + column_name + " not found");
169 :
170 0 : exit(EXIT_FAILURE);
171 : }
172 :
173 0 : std::string timestamp = row[column_index];
174 0 : std::string year, month, time;
175 :
176 : // Create a new row with extracted components
177 0 : std::vector<std::string> new_row;
178 :
179 : // Extract year, month, and time components
180 0 : if (extract_year) {
181 0 : year = timestamp.substr(timestamp.find_last_of('/') + 1, 4);
182 0 : new_row.push_back(year);
183 : }
184 0 : if (extract_month) {
185 0 : month = timestamp.substr(0, timestamp.find_first_of('/'));
186 0 : new_row.push_back(month);
187 : }
188 0 : if (extract_time) {
189 0 : time = timestamp.substr(timestamp.find(' ') + 1);
190 0 : new_row.push_back(time);
191 : }
192 :
193 : // append original row data
194 0 : new_row.insert(new_row.end(), row.begin(), row.end());
195 : // add new rows
196 0 : new_data.push_back(new_row);
197 0 : }
198 :
199 : // Create new headers based on the extracted components
200 0 : if (extract_month)
201 0 : new_headers.insert(new_headers.begin(), "Month");
202 0 : if (extract_year)
203 0 : new_headers.insert(new_headers.begin(), "Year");
204 0 : if (extract_time)
205 0 : new_headers.insert(new_headers.begin(), "Time");
206 :
207 : // set class car data_ to hold rows/lines
208 0 : data_ = new_data;
209 : // set class var modified headers to new headers
210 : // new_headers_ = new_headers;
211 0 : headers_ = new_headers;
212 :
213 0 : return std::make_pair(new_headers, new_data);
214 0 : }
215 :
216 : // Sort specified columns, by default in asending order
217 0 : void gpmp::core::DataTable::sort(const std::vector<std::string> &sort_columns,
218 : bool ascending) {
219 : // Extract the column indices to be sorted by from the original data
220 0 : std::vector<size_t> column_indices;
221 0 : for (const std::string &column : sort_columns) {
222 0 : auto iter = std::find(headers_.begin(), headers_.end(), column);
223 0 : if (iter != headers_.end()) {
224 0 : size_t index = std::distance(headers_.begin(), iter);
225 0 : column_indices.push_back(index);
226 : }
227 : }
228 :
229 : // Sort the data based on the specified columns
230 0 : std::stable_sort(data_.begin(),
231 : data_.end(),
232 0 : [&](const std::vector<std::string> &row1,
233 : const std::vector<std::string> &row2) {
234 0 : for (size_t index : column_indices) {
235 0 : if (row1[index] != row2[index]) {
236 0 : if (ascending) {
237 0 : return row1[index] < row2[index];
238 : } else {
239 0 : return row1[index] > row2[index];
240 : }
241 : }
242 : }
243 : // Rows are equal, nothing to sort
244 0 : return false;
245 : });
246 0 : }
247 :
248 : // Group rows by specific columns
249 : std::vector<gpmp::core::DataTableStr>
250 0 : gpmp::core::DataTable::group_by(std::vector<std::string> group_by_columns) {
251 : // Find the indices of the specified group by columns
252 0 : std::vector<int> group_by_indices;
253 :
254 : // Traverse group column names
255 0 : for (const std::string &column_name : group_by_columns) {
256 0 : std::cout << "Searching for column: " << column_name << std::endl;
257 :
258 : // Find start/end and match column name
259 : auto column_iter =
260 0 : std::find(headers_.begin(), headers_.end(), column_name);
261 :
262 : // If no columns
263 0 : if (column_iter == headers_.end()) {
264 0 : _log_.log(ERROR, "Column: " + column_name + " not found");
265 0 : exit(EXIT_FAILURE);
266 : }
267 : // column index set to distance from start of first col to nexter iter
268 0 : int column_index = std::distance(headers_.begin(), column_iter);
269 : // add column index to group
270 0 : group_by_indices.push_back(column_index);
271 : }
272 :
273 : // Group the data based on the specified columns using a vector
274 : std::vector<std::pair<std::vector<std::string>, gpmp::core::DataTableStr>>
275 0 : groups;
276 :
277 : // Traverse row/line data
278 0 : for (const std::vector<std::string> &row : data_) {
279 : // store group key for each row
280 0 : std::vector<std::string> group_key;
281 : // Fill group key from specified group column names
282 0 : for (int index : group_by_indices) {
283 0 : group_key.push_back(row[index]);
284 : }
285 :
286 : // Check if the group already exists
287 0 : auto group_iter = std::find_if(
288 : groups.begin(),
289 : groups.end(),
290 0 : [&group_key](const std::pair<std::vector<std::string>,
291 0 : gpmp::core::DataTableStr> &group) {
292 0 : return group.first == group_key;
293 : });
294 : // If the group DNE create a new one to add to groups vector
295 0 : if (group_iter == groups.end()) {
296 : // Create a new group
297 0 : groups.push_back(
298 0 : {group_key, gpmp::core::DataTableStr(headers_, {})});
299 0 : group_iter = groups.end() - 1;
300 : }
301 : // Add current row to group
302 0 : group_iter->second.second.push_back(row);
303 0 : }
304 :
305 : // Extract the grouped data into a vector
306 0 : std::vector<gpmp::core::DataTableStr> grouped_data;
307 : // Iterate over sorted groups to push onto result vector
308 0 : for (const auto &group : groups) {
309 0 : grouped_data.push_back(group.second);
310 : }
311 :
312 : // Return final DataTableStr type
313 0 : return grouped_data;
314 0 : }
315 :
316 : // Get first element of each created group
317 0 : gpmp::core::DataTableStr gpmp::core::DataTable::first(
318 : const std::vector<gpmp::core::DataTableStr> &groups) const {
319 0 : if (groups.empty()) {
320 : // Handle the case when there are no groups
321 0 : return std::make_pair(std::vector<std::string>(),
322 0 : std::vector<std::vector<std::string>>());
323 : }
324 :
325 0 : std::vector<std::vector<std::string>> first_rows;
326 :
327 0 : for (const gpmp::core::DataTableStr &group : groups) {
328 0 : if (!group.second.empty()) {
329 0 : first_rows.push_back(
330 0 : group.second[0]); // Get the first row of each group
331 : }
332 : }
333 :
334 0 : if (!first_rows.empty()) {
335 : // Assuming all groups have the same headers as the first group
336 0 : return std::make_pair(groups[0].first, first_rows);
337 : } else {
338 : // Handle the case when there are no first rows found.
339 0 : return std::make_pair(groups[0].first,
340 0 : std::vector<std::vector<std::string>>());
341 : }
342 0 : }
343 :
344 : // Prints some basic information about a DataTable object
345 :
346 : gpmp::core::DataTableInt
347 0 : gpmp::core::DataTable::str_to_int(gpmp::core::DataTableStr src) {
348 0 : gpmp::core::DataTableInt dest;
349 :
350 0 : for (const auto &v : src.first) {
351 : // check if v contains only digits
352 0 : if (std::regex_match(v, std::regex("\\d+"))) {
353 0 : dest.first.push_back(std::stoi(v));
354 : }
355 : }
356 0 : for (const auto &vv : src.second) {
357 0 : std::vector<int64_t> new_vec;
358 0 : for (const auto &v : vv) {
359 : // check if v contains only digits
360 0 : if (std::regex_match(v, std::regex("\\d+"))) {
361 0 : new_vec.push_back(std::stoi(v));
362 : }
363 : }
364 0 : dest.second.push_back(new_vec);
365 0 : }
366 0 : return dest;
367 0 : }
368 :
369 : gpmp::core::DataTableDouble
370 0 : gpmp::core::DataTable::str_to_double(gpmp::core::DataTableStr src) {
371 0 : gpmp::core::DataTableDouble dest;
372 :
373 0 : for (const auto &v : src.first) {
374 0 : if (std::regex_match(v, std::regex("[-+]?\\d*\\.?\\d+"))) {
375 0 : dest.first.push_back(std::stold(v));
376 : }
377 : }
378 :
379 0 : for (const auto &vv : src.second) {
380 0 : std::vector<long double> new_vec;
381 0 : for (const auto &v : vv) {
382 0 : if (std::regex_match(v, std::regex("[-+]?\\d*\\.?\\d+"))) {
383 0 : new_vec.push_back(std::stold(v));
384 : }
385 : }
386 0 : dest.second.push_back(new_vec);
387 0 : }
388 :
389 0 : return dest;
390 0 : }
|