1 | #pragma once |
---|---|
2 | |
#include <bitset>
#include <cstdint>
#include <cstring>
#include <fstream>
#include <memory>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

#include "parquet/parquet_types.h"
9 | |
10 | namespace miniparquet { |
11 | |
12 | class ParquetColumn { |
13 | public: |
14 | uint64_t id; |
15 | parquet::format::Type::type type; |
16 | std::string name; |
17 | parquet::format::SchemaElement* schema_element; |
18 | }; |
19 | |
/// A 96-bit value held as three consecutive 32-bit unsigned words.
/// NOTE(review): presumably mirrors the Parquet INT96 physical type; the
/// meaning and order of the words is not established in this header.
struct Int96 {
    uint32_t value[3];
};
23 | |
/// Fixed-size lookup table of values of type T, addressed by offset.
template<class T>
class Dictionary {
public:
    /// Backing storage; exposed so callers can fill the entries directly.
    std::vector<T> dict;

    /// Creates a dictionary holding `n_values` value-initialized entries.
    Dictionary(uint64_t n_values) : dict(n_values) {
    }

    /// Returns a mutable reference to the entry at `offset`.
    /// @throws std::runtime_error if `offset` is not a valid index.
    T& get(uint64_t offset) {
        check_offset(offset);
        // Bounds were verified above, so unchecked indexing is sufficient.
        return dict[offset];
    }

    /// Read-only counterpart of get() for const dictionaries.
    /// @throws std::runtime_error if `offset` is not a valid index.
    const T& get(uint64_t offset) const {
        check_offset(offset);
        return dict[offset];
    }

private:
    /// Throws std::runtime_error when `offset` lies past the last entry.
    void check_offset(uint64_t offset) const {
        if (offset >= dict.size()) {
            throw std::runtime_error("Dictionary offset out of bounds");
        }
    }
};
38 | |
39 | // todo move this to impl |
40 | |
/// Growable, heap-backed byte buffer.
///
/// `ptr` and `len` are public views of the storage owned by the private
/// `holder`. The buffer only ever grows: resize() to a smaller or equal size
/// is a no-op. The type is move-only (it owns a std::unique_ptr); moving
/// resets the source to the empty state so that it no longer aliases storage
/// owned by the destination.
class ByteBuffer {
public:
    /// Start of the storage, or nullptr while nothing has been allocated.
    char* ptr = nullptr;
    /// Current capacity in bytes.
    uint64_t len = 0;

    ByteBuffer() = default;
    ByteBuffer(const ByteBuffer&) = delete;
    ByteBuffer& operator=(const ByteBuffer&) = delete;

    /// Takes over the storage of `other` and leaves `other` empty.
    ByteBuffer(ByteBuffer&& other) noexcept
        : ptr(other.ptr), len(other.len), holder(std::move(other.holder)) {
        other.ptr = nullptr;
        other.len = 0;
    }

    /// Releases the current storage, takes over that of `other`, and leaves
    /// `other` empty.
    ByteBuffer& operator=(ByteBuffer&& other) noexcept {
        if (this != &other) {
            holder = std::move(other.holder);
            ptr = other.ptr;
            len = other.len;
            other.ptr = nullptr;
            other.len = 0;
        }
        return *this;
    }

    /// Ensures the buffer can hold at least `new_size` bytes.
    ///
    /// Does nothing when `new_size` does not exceed the current `len`.
    /// Otherwise a new allocation of exactly `new_size` bytes replaces the
    /// old one, and `ptr`/`len` are updated accordingly.
    ///
    /// @param new_size requested capacity in bytes.
    /// @param copy     when true, the first `len` bytes of the old storage
    ///                 are copied into the new one; when false the new
    ///                 storage is left uninitialized.
    void resize(uint64_t new_size, bool copy = true) {
        if (new_size <= len) {
            return;
        }
        // Allocate first: if this throws, the buffer is left untouched.
        std::unique_ptr<char[]> new_holder(new char[new_size]);
        if (copy && holder != nullptr) {
            std::memcpy(new_holder.get(), holder.get(), len);
        }
        holder = std::move(new_holder);
        ptr = holder.get();
        len = new_size;
    }

private:
    /// Owns the allocation that `ptr` points into.
    std::unique_ptr<char[]> holder = nullptr;
};
60 | |
/// Cursor carried between successive scan calls; both counters start at zero.
class ScanState {
public:
    /// Index of the row group the cursor currently refers to.
    uint64_t row_group_idx{0};
    /// Offset within that row group.
    /// NOTE(review): unit (rows vs. bytes) is not visible here -- confirm.
    uint64_t row_group_offset{0};
};
66 | |
67 | struct ResultColumn { |
68 | uint64_t id; |
69 | ByteBuffer data; |
70 | ParquetColumn *col; |
71 | ByteBuffer defined; |
72 | std::vector<std::unique_ptr<char[]>> string_heap_chunks; |
73 | |
74 | }; |
75 | |
76 | struct ResultChunk { |
77 | std::vector<ResultColumn> cols; |
78 | uint64_t nrows; |
79 | }; |
80 | |
81 | class ParquetFile { |
82 | public: |
83 | ParquetFile(std::string filename); |
84 | void initialize_result(ResultChunk& result); |
85 | bool scan(ScanState &s, ResultChunk& result); |
86 | uint64_t nrow; |
87 | std::vector<std::unique_ptr<ParquetColumn>> columns; |
88 | |
89 | private: |
90 | void initialize(std::string filename); |
91 | void initialize_column(ResultColumn& col, uint64_t num_rows); |
92 | void scan_column(ScanState& state, ResultColumn& result_col); |
93 | parquet::format::FileMetaData file_meta_data; |
94 | std::ifstream pfile; |
95 | }; |
96 | |
97 | } |
98 |