1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
#include <cerrno>
#include <cstdint>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <limits>
#include <list>
#include <memory>
#include <string>
#include <vector>

#include "parquet/api/reader.h"
24 | |
25 | int main(int argc, char** argv) { |
26 | if (argc > 4 || argc < 1) { |
27 | std::cerr << "Usage: parquet-scan [--batch-size=] [--columns=...] <file>" |
28 | << std::endl; |
29 | return -1; |
30 | } |
31 | |
32 | std::string filename; |
33 | |
34 | // Read command-line options |
35 | int batch_size = 256; |
36 | const std::string COLUMNS_PREFIX = "--columns=" ; |
37 | const std::string BATCH_SIZE_PREFIX = "--batch-size=" ; |
38 | std::vector<int> columns; |
39 | int num_columns = 0; |
40 | |
41 | char *param, *value; |
42 | for (int i = 1; i < argc; i++) { |
43 | if ((param = std::strstr(argv[i], COLUMNS_PREFIX.c_str()))) { |
44 | value = std::strtok(param + COLUMNS_PREFIX.length(), "," ); |
45 | while (value) { |
46 | columns.push_back(std::atoi(value)); |
47 | value = std::strtok(nullptr, "," ); |
48 | num_columns++; |
49 | } |
50 | } else if ((param = std::strstr(argv[i], BATCH_SIZE_PREFIX.c_str()))) { |
51 | value = std::strtok(param + BATCH_SIZE_PREFIX.length(), " " ); |
52 | if (value) { |
53 | batch_size = std::atoi(value); |
54 | } |
55 | } else { |
56 | filename = argv[i]; |
57 | } |
58 | } |
59 | |
60 | try { |
61 | double total_time; |
62 | std::clock_t start_time = std::clock(); |
63 | std::unique_ptr<parquet::ParquetFileReader> reader = |
64 | parquet::ParquetFileReader::OpenFile(filename); |
65 | |
66 | int64_t total_rows = parquet::ScanFileContents(columns, batch_size, reader.get()); |
67 | |
68 | total_time = static_cast<double>(std::clock() - start_time) / |
69 | static_cast<double>(CLOCKS_PER_SEC); |
70 | std::cout << total_rows << " rows scanned in " << total_time << " seconds." |
71 | << std::endl; |
72 | } catch (const std::exception& e) { |
73 | std::cerr << "Parquet error: " << e.what() << std::endl; |
74 | return -1; |
75 | } |
76 | |
77 | return 0; |
78 | } |
79 | |