1 | //===----------------------------------------------------------------------===// |
2 | // DuckDB |
3 | // |
4 | // duckdb/function/compression_function.hpp |
5 | // |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #pragma once |
10 | |
11 | #include "duckdb/common/common.hpp" |
12 | #include "duckdb/function/function.hpp" |
13 | #include "duckdb/common/enums/compression_type.hpp" |
14 | #include "duckdb/common/map.hpp" |
15 | #include "duckdb/storage/storage_info.hpp" |
16 | #include "duckdb/common/mutex.hpp" |
17 | |
18 | namespace duckdb { |
19 | class DatabaseInstance; |
20 | class ColumnData; |
21 | class ColumnDataCheckpointer; |
22 | class ColumnSegment; |
23 | class SegmentStatistics; |
24 | |
25 | struct ColumnFetchState; |
26 | struct ColumnScanState; |
27 | struct SegmentScanState; |
28 | |
29 | struct AnalyzeState { |
30 | virtual ~AnalyzeState() { |
31 | } |
32 | |
33 | template <class TARGET> |
34 | TARGET &Cast() { |
35 | D_ASSERT(dynamic_cast<TARGET *>(this)); |
36 | return reinterpret_cast<TARGET &>(*this); |
37 | } |
38 | template <class TARGET> |
39 | const TARGET &Cast() const { |
40 | D_ASSERT(dynamic_cast<const TARGET *>(this)); |
41 | return reinterpret_cast<const TARGET &>(*this); |
42 | } |
43 | }; |
44 | |
45 | struct CompressionState { |
46 | virtual ~CompressionState() { |
47 | } |
48 | |
49 | template <class TARGET> |
50 | TARGET &Cast() { |
51 | D_ASSERT(dynamic_cast<TARGET *>(this)); |
52 | return reinterpret_cast<TARGET &>(*this); |
53 | } |
54 | template <class TARGET> |
55 | const TARGET &Cast() const { |
56 | D_ASSERT(dynamic_cast<const TARGET *>(this)); |
57 | return reinterpret_cast<const TARGET &>(*this); |
58 | } |
59 | }; |
60 | |
61 | struct CompressedSegmentState { |
62 | virtual ~CompressedSegmentState() { |
63 | } |
64 | |
65 | template <class TARGET> |
66 | TARGET &Cast() { |
67 | D_ASSERT(dynamic_cast<TARGET *>(this)); |
68 | return reinterpret_cast<TARGET &>(*this); |
69 | } |
70 | template <class TARGET> |
71 | const TARGET &Cast() const { |
72 | D_ASSERT(dynamic_cast<const TARGET *>(this)); |
73 | return reinterpret_cast<const TARGET &>(*this); |
74 | } |
75 | }; |
76 | |
77 | struct CompressionAppendState { |
78 | CompressionAppendState(BufferHandle handle_p) : handle(std::move(handle_p)) { |
79 | } |
80 | virtual ~CompressionAppendState() { |
81 | } |
82 | |
83 | BufferHandle handle; |
84 | |
85 | template <class TARGET> |
86 | TARGET &Cast() { |
87 | D_ASSERT(dynamic_cast<TARGET *>(this)); |
88 | return reinterpret_cast<TARGET &>(*this); |
89 | } |
90 | template <class TARGET> |
91 | const TARGET &Cast() const { |
92 | D_ASSERT(dynamic_cast<const TARGET *>(this)); |
93 | return reinterpret_cast<const TARGET &>(*this); |
94 | } |
95 | }; |
96 | |
97 | //===--------------------------------------------------------------------===// |
98 | // Analyze |
99 | //===--------------------------------------------------------------------===// |
100 | //! The analyze functions are used to determine whether or not to use this compression method |
101 | //! The system first determines the potential compression methods to use based on the physical type of the column |
102 | //! After that the following steps are taken: |
103 | //! 1. The init_analyze is called to initialize the analyze state of every candidate compression method |
104 | //! 2. The analyze method is called with all of the input data in the order in which it must be stored. |
105 | //! analyze can return "false". In that case, the compression method is taken out of consideration early. |
106 | //! 3. The final_analyze method is called, which should return a score for the compression method |
107 | |
108 | //! The system then decides which compression function to use based on the analyzed score (returned from final_analyze) |
109 | typedef unique_ptr<AnalyzeState> (*compression_init_analyze_t)(ColumnData &col_data, PhysicalType type); |
110 | typedef bool (*compression_analyze_t)(AnalyzeState &state, Vector &input, idx_t count); |
111 | typedef idx_t (*compression_final_analyze_t)(AnalyzeState &state); |
112 | |
113 | //===--------------------------------------------------------------------===// |
114 | // Compress |
115 | //===--------------------------------------------------------------------===// |
116 | typedef unique_ptr<CompressionState> (*compression_init_compression_t)(ColumnDataCheckpointer &checkpointer, |
117 | unique_ptr<AnalyzeState> state); |
118 | typedef void (*compression_compress_data_t)(CompressionState &state, Vector &scan_vector, idx_t count); |
119 | typedef void (*compression_compress_finalize_t)(CompressionState &state); |
120 | |
121 | //===--------------------------------------------------------------------===// |
122 | // Uncompress / Scan |
123 | //===--------------------------------------------------------------------===// |
124 | typedef unique_ptr<SegmentScanState> (*compression_init_segment_scan_t)(ColumnSegment &segment); |
125 | |
126 | //! Function prototype used for reading an entire vector (STANDARD_VECTOR_SIZE) |
127 | typedef void (*compression_scan_vector_t)(ColumnSegment &segment, ColumnScanState &state, idx_t scan_count, |
128 | Vector &result); |
129 | //! Function prototype used for reading an arbitrary ('scan_count') number of values |
130 | typedef void (*compression_scan_partial_t)(ColumnSegment &segment, ColumnScanState &state, idx_t scan_count, |
131 | Vector &result, idx_t result_offset); |
132 | //! Function prototype used for reading a single value |
133 | typedef void (*compression_fetch_row_t)(ColumnSegment &segment, ColumnFetchState &state, row_t row_id, Vector &result, |
134 | idx_t result_idx); |
135 | //! Function prototype used for skipping 'skip_count' values, non-trivial if random-access is not supported for the |
136 | //! compressed data. |
137 | typedef void (*compression_skip_t)(ColumnSegment &segment, ColumnScanState &state, idx_t skip_count); |
138 | |
139 | //===--------------------------------------------------------------------===// |
140 | // Append (optional) |
141 | //===--------------------------------------------------------------------===// |
142 | typedef unique_ptr<CompressedSegmentState> (*compression_init_segment_t)(ColumnSegment &segment, block_id_t block_id); |
143 | typedef unique_ptr<CompressionAppendState> (*compression_init_append_t)(ColumnSegment &segment); |
144 | typedef idx_t (*compression_append_t)(CompressionAppendState &append_state, ColumnSegment &segment, |
145 | SegmentStatistics &stats, UnifiedVectorFormat &data, idx_t offset, idx_t count); |
146 | typedef idx_t (*compression_finalize_append_t)(ColumnSegment &segment, SegmentStatistics &stats); |
147 | typedef void (*compression_revert_append_t)(ColumnSegment &segment, idx_t start_row); |
148 | |
149 | class CompressionFunction { |
150 | public: |
151 | CompressionFunction(CompressionType type, PhysicalType data_type, compression_init_analyze_t init_analyze, |
152 | compression_analyze_t analyze, compression_final_analyze_t final_analyze, |
153 | compression_init_compression_t init_compression, compression_compress_data_t compress, |
154 | compression_compress_finalize_t compress_finalize, compression_init_segment_scan_t init_scan, |
155 | compression_scan_vector_t scan_vector, compression_scan_partial_t scan_partial, |
156 | compression_fetch_row_t fetch_row, compression_skip_t skip, |
157 | compression_init_segment_t init_segment = nullptr, |
158 | compression_init_append_t init_append = nullptr, compression_append_t append = nullptr, |
159 | compression_finalize_append_t finalize_append = nullptr, |
160 | compression_revert_append_t revert_append = nullptr) |
161 | : type(type), data_type(data_type), init_analyze(init_analyze), analyze(analyze), final_analyze(final_analyze), |
162 | init_compression(init_compression), compress(compress), compress_finalize(compress_finalize), |
163 | init_scan(init_scan), scan_vector(scan_vector), scan_partial(scan_partial), fetch_row(fetch_row), skip(skip), |
164 | init_segment(init_segment), init_append(init_append), append(append), finalize_append(finalize_append), |
165 | revert_append(revert_append) { |
166 | } |
167 | |
168 | //! Compression type |
169 | CompressionType type; |
170 | //! The data type this function can compress |
171 | PhysicalType data_type; |
172 | |
173 | //! Analyze step: determine which compression function is the most effective |
174 | //! init_analyze is called once to set up the analyze state |
175 | compression_init_analyze_t init_analyze; |
176 | //! analyze is called several times (once per vector in the row group) |
177 | //! analyze should return true, unless compression is no longer possible with this compression method |
178 | //! in that case false should be returned |
179 | compression_analyze_t analyze; |
180 | //! final_analyze should return the score of the compression function |
181 | //! ideally this is the exact number of bytes required to store the data |
182 | //! this is not required/enforced: it can be an estimate as well |
183 | //! also this function can return DConstants::INVALID_INDEX to skip this compression method |
184 | compression_final_analyze_t final_analyze; |
185 | |
186 | //! Compression step: actually compress the data |
187 | //! init_compression is called once to set up the comperssion state |
188 | compression_init_compression_t init_compression; |
189 | //! compress is called several times (once per vector in the row group) |
190 | compression_compress_data_t compress; |
191 | //! compress_finalize is called after |
192 | compression_compress_finalize_t compress_finalize; |
193 | |
194 | //! init_scan is called to set up the scan state |
195 | compression_init_segment_scan_t init_scan; |
196 | //! scan_vector scans an entire vector using the scan state |
197 | compression_scan_vector_t scan_vector; |
198 | //! scan_partial scans a subset of a vector |
199 | //! this can request > vector_size as well |
200 | //! this is used if a vector crosses segment boundaries, or for child columns of lists |
201 | compression_scan_partial_t scan_partial; |
202 | //! fetch an individual row from the compressed vector |
203 | //! used for index lookups |
204 | compression_fetch_row_t fetch_row; |
205 | //! Skip forward in the compressed segment |
206 | compression_skip_t skip; |
207 | |
208 | // Append functions |
209 | //! This only really needs to be defined for uncompressed segments |
210 | |
211 | //! Initialize a compressed segment (optional) |
212 | compression_init_segment_t init_segment; |
213 | //! Initialize the append state (optional) |
214 | compression_init_append_t init_append; |
215 | //! Append to the compressed segment (optional) |
216 | compression_append_t append; |
217 | //! Finalize an append to the segment |
218 | compression_finalize_append_t finalize_append; |
219 | //! Revert append (optional) |
220 | compression_revert_append_t revert_append; |
221 | }; |
222 | |
223 | //! The set of compression functions |
224 | struct CompressionFunctionSet { |
225 | mutex lock; |
226 | map<CompressionType, map<PhysicalType, CompressionFunction>> functions; |
227 | }; |
228 | |
229 | } // namespace duckdb |
230 | |