| 1 | #pragma once |
|---|---|
| 2 | |
#include <bitset>
#include <cstdint>
#include <cstring>
#include <fstream>
#include <memory>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

#include "parquet/parquet_types.h"
| 9 | |
| 10 | namespace miniparquet { |
| 11 | |
| 12 | class ParquetColumn { |
| 13 | public: |
| 14 | uint64_t id; |
| 15 | parquet::format::Type::type type; |
| 16 | std::string name; |
| 17 | parquet::format::SchemaElement* schema_element; |
| 18 | }; |
| 19 | |
// Three consecutive 32-bit unsigned words, i.e. 96 bits of storage.
// Kept as a plain array with no initialisers or constructors so the struct
// stays trivially copyable.
struct Int96 {
    uint32_t value[3];
};
| 23 | |
// Fixed-size table of values of type T addressed by an integer offset.
template <class T>
class Dictionary {
public:
    // Backing storage. The constructor sizes it; filling it in is left to the
    // caller.
    std::vector<T> dict;

    // Creates a dictionary with n_values value-initialised entries.
    Dictionary(uint64_t n_values) : dict(n_values) {
    }

    // Returns a mutable reference to the entry at `offset`.
    // Throws std::runtime_error when `offset` is not a valid index.
    T& get(uint64_t offset) {
        if (offset >= dict.size()) {
            throw std::runtime_error("Dictionary offset out of bounds");
        }
        // The range was checked just above, so unchecked indexing is enough;
        // going through at() would repeat the same comparison.
        return dict[offset];
    }

    // Read-only counterpart of get() for const dictionaries.
    // Throws std::runtime_error when `offset` is not a valid index.
    const T& get(uint64_t offset) const {
        if (offset >= dict.size()) {
            throw std::runtime_error("Dictionary offset out of bounds");
        }
        return dict[offset];
    }
};
| 38 | |
| 39 | // todo move this to impl |
| 40 | |
// Grow-only byte buffer whose storage is owned through a unique_ptr.
//
// `ptr` and `len` mirror the owned allocation for direct access. The buffer is
// move-only; moving from it resets the source to the empty state so it never
// keeps a pointer into storage it no longer owns.
class ByteBuffer {
public:
    // Start of the current allocation, or nullptr before the first growth.
    char* ptr = nullptr;
    // Size in bytes of the current allocation.
    uint64_t len = 0;

    ByteBuffer() = default;

    // Copying was already impossible because of the unique_ptr member; the
    // deletion is spelled out so the intent is visible.
    ByteBuffer(const ByteBuffer&) = delete;
    ByteBuffer& operator=(const ByteBuffer&) = delete;

    // Takes over the storage of `other` and leaves `other` empty.
    ByteBuffer(ByteBuffer&& other) noexcept
        : ptr(other.ptr), len(other.len), holder(std::move(other.holder)) {
        other.ptr = nullptr;
        other.len = 0;
    }

    // Releases the current storage, takes over that of `other`, and leaves
    // `other` empty.
    ByteBuffer& operator=(ByteBuffer&& other) noexcept {
        if (this != &other) {
            holder = std::move(other.holder);
            ptr = other.ptr;
            len = other.len;
            other.ptr = nullptr;
            other.len = 0;
        }
        return *this;
    }

    // Ensures the buffer holds at least `new_size` bytes.
    //
    // Does nothing when `new_size` is not larger than the current `len`; the
    // buffer never shrinks. Otherwise a fresh allocation of `new_size` bytes
    // replaces the old one. With `copy` set, the first `len` bytes of the old
    // allocation are carried over; the remaining bytes are uninitialised.
    void resize(uint64_t new_size, bool copy = true) {
        if (new_size <= len) {
            return;
        }
        std::unique_ptr<char[]> grown(new char[new_size]);
        if (copy && holder != nullptr) {
            std::memcpy(grown.get(), holder.get(), len);
        }
        holder = std::move(grown);
        ptr = holder.get();
        len = new_size;
    }

private:
    // Owner of the allocation that `ptr` points into.
    std::unique_ptr<char[]> holder = nullptr;
};
| 60 | |
// Pair of counters carried between scan calls; both start at zero.
// Presumably they identify the current row group and a position inside it --
// confirm against the scan implementation.
class ScanState {
public:
    uint64_t row_group_idx{0};
    uint64_t row_group_offset{0};
};
| 66 | |
| 67 | struct ResultColumn { |
| 68 | uint64_t id; |
| 69 | ByteBuffer data; |
| 70 | ParquetColumn *col; |
| 71 | ByteBuffer defined; |
| 72 | std::vector<std::unique_ptr<char[]>> string_heap_chunks; |
| 73 | |
| 74 | }; |
| 75 | |
| 76 | struct ResultChunk { |
| 77 | std::vector<ResultColumn> cols; |
| 78 | uint64_t nrows; |
| 79 | }; |
| 80 | |
| 81 | class ParquetFile { |
| 82 | public: |
| 83 | ParquetFile(std::string filename); |
| 84 | void initialize_result(ResultChunk& result); |
| 85 | bool scan(ScanState &s, ResultChunk& result); |
| 86 | uint64_t nrow; |
| 87 | std::vector<std::unique_ptr<ParquetColumn>> columns; |
| 88 | |
| 89 | private: |
| 90 | void initialize(std::string filename); |
| 91 | void initialize_column(ResultColumn& col, uint64_t num_rows); |
| 92 | void scan_column(ScanState& state, ResultColumn& result_col); |
| 93 | parquet::format::FileMetaData file_meta_data; |
| 94 | std::ifstream pfile; |
| 95 | }; |
| 96 | |
| 97 | } |
| 98 |