1#pragma once
2
#include <bitset>
#include <cstdint>
#include <cstring>
#include <fstream>
#include <memory>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

#include "parquet/parquet_types.h"
9
10namespace miniparquet {
11
12class ParquetColumn {
13public:
14 uint64_t id;
15 parquet::format::Type::type type;
16 std::string name;
17 parquet::format::SchemaElement* schema_element;
18};
19
// 96 bits of storage held as three consecutive 32-bit unsigned words.
// NOTE(review): presumably mirrors the Parquet INT96 physical type and is
// filled from raw page bytes - confirm against the decoding code before
// changing the layout.
struct Int96 {
 uint32_t value[3];
};
23
// Fixed-size lookup table of T values addressed by a zero-based offset.
//
// The table is sized once at construction with value-initialized entries.
// Callers fill it either through the public `dict` vector or through the
// reference returned by get().
template<class T>
class Dictionary {
public:
    // Backing storage; public so entries can be populated directly.
    std::vector<T> dict;

    // Creates a dictionary holding `n_values` value-initialized entries.
    Dictionary(uint64_t n_values) : dict(n_values) {
    }

    // Returns a mutable reference to the entry at `offset`.
    // Throws std::runtime_error if `offset` is not smaller than dict.size().
    T& get(uint64_t offset) {
        check_offset(offset);
        // Bounds were validated above, so the unchecked accessor is enough.
        return dict[offset];
    }

    // Read-only counterpart of get() for const dictionaries.
    // Throws std::runtime_error if `offset` is not smaller than dict.size().
    const T& get(uint64_t offset) const {
        check_offset(offset);
        return dict[offset];
    }

private:
    // Single place that enforces the bounds contract shared by both overloads.
    void check_offset(uint64_t offset) const {
        if (offset >= dict.size()) {
            throw std::runtime_error("Dictionary offset out of bounds");
        }
    }
};
38
// TODO: move this to the implementation file.
40
// Grow-only, move-only byte buffer that exposes its storage as a raw pointer.
//
// `ptr` and `len` are public so callers can read and write the bytes directly;
// the allocation made by resize() is owned by the private `holder`.
// Moving a ByteBuffer transfers the allocation and resets the source to the
// empty state, so a moved-from buffer never keeps a pointer to storage it no
// longer owns.
class ByteBuffer {
public:
    // Start of the usable bytes; nullptr until the first successful resize().
    char* ptr = nullptr;
    // Number of usable bytes at `ptr`.
    uint64_t len = 0;

    ByteBuffer() = default;

    // Copying is disabled: the storage has a single owner.
    ByteBuffer(const ByteBuffer&) = delete;
    ByteBuffer& operator=(const ByteBuffer&) = delete;

    // Takes over `other`'s storage and leaves `other` empty.
    ByteBuffer(ByteBuffer&& other) noexcept
        : ptr(other.ptr), len(other.len), holder(std::move(other.holder)) {
        other.ptr = nullptr;
        other.len = 0;
    }

    // Releases the current storage, takes over `other`'s, and leaves `other` empty.
    ByteBuffer& operator=(ByteBuffer&& other) noexcept {
        if (this != &other) {
            holder = std::move(other.holder);
            ptr = other.ptr;
            len = other.len;
            other.ptr = nullptr;
            other.len = 0;
        }
        return *this;
    }

    // Ensures the buffer holds at least `new_size` bytes.
    //
    // Requests that do not exceed the current `len` are ignored, so the buffer
    // never shrinks. When it grows and `copy` is true, the first `len` bytes of
    // the previously owned allocation are carried over; otherwise the new bytes
    // are left uninitialized. Growing replaces the allocation, so any previously
    // saved `ptr` value must not be used afterwards.
    void resize(uint64_t new_size, bool copy=true) {
        if (new_size <= len) {
            return;
        }
        std::unique_ptr<char[]> new_holder(new char[new_size]);
        // Only copy out of storage this object owns.
        if (copy && holder != nullptr) {
            std::memcpy(new_holder.get(), holder.get(), len);
        }
        holder = std::move(new_holder);
        ptr = holder.get();
        len = new_size;
    }
private:
    // Owner of the allocation that `ptr` refers to after resize().
    std::unique_ptr<char[]> holder = nullptr;
};
60
// Pair of counters describing a position within a scan; both start at zero.
// NOTE(review): presumably `row_group_idx` selects the current row group and
// `row_group_offset` is the position inside it - confirm against the scan code.
class ScanState {
public:
    uint64_t row_group_idx{0};
    uint64_t row_group_offset{0};
};
66
67struct ResultColumn {
68 uint64_t id;
69 ByteBuffer data;
70 ParquetColumn *col;
71 ByteBuffer defined;
72 std::vector<std::unique_ptr<char[]>> string_heap_chunks;
73
74};
75
76struct ResultChunk {
77 std::vector<ResultColumn> cols;
78 uint64_t nrows;
79};
80
// Handle for one Parquet file: holds the input stream, the file metadata and
// the list of column descriptions. All member functions are defined out of
// line, so behavior notes below are limited to what the declarations show.
class ParquetFile {
public:
 // Constructs the handle from a file name.
 // NOTE(review): presumably opens the file and reads its metadata via
 // initialize() - confirm in the implementation.
 ParquetFile(std::string filename);
 // Prepares `result` for use with scan().
 // NOTE(review): presumably creates one ResultColumn per entry in `columns` - confirm.
 void initialize_result(ResultChunk& result);
 // Advances the scan described by `s` and writes output into `result`.
 // NOTE(review): the meaning of the bool result is defined in the
 // implementation; presumably false once there is nothing left to read - confirm.
 bool scan(ScanState &s, ResultChunk& result);
 // Row count; not initialized in this declaration.
 // NOTE(review): presumably the total number of rows in the file, set during
 // initialization - confirm.
 uint64_t nrow;
 // Column descriptions owned by this object.
 std::vector<std::unique_ptr<ParquetColumn>> columns;

private:
 // Helper taking the file name. NOTE(review): presumably does the actual
 // open/metadata work for the constructor - confirm.
 void initialize(std::string filename);
 // Helper that prepares a single result column for `num_rows` rows.
 void initialize_column(ResultColumn& col, uint64_t num_rows);
 // Helper that scans a single column using the given scan state.
 void scan_column(ScanState& state, ResultColumn& result_col);
 // Parquet file metadata structure.
 parquet::format::FileMetaData file_meta_data;
 // Input stream for the file.
 std::ifstream pfile;
};
96
97}
98