1//===----------------------------------------------------------------------===//
2// DuckDB
3//
4// duckdb/function/compression_function.hpp
5//
6//
7//===----------------------------------------------------------------------===//
8
9#pragma once
10
11#include "duckdb/common/common.hpp"
12#include "duckdb/function/function.hpp"
13#include "duckdb/common/enums/compression_type.hpp"
14#include "duckdb/common/map.hpp"
15#include "duckdb/storage/storage_info.hpp"
16#include "duckdb/common/mutex.hpp"
17
18namespace duckdb {
19class DatabaseInstance;
20class ColumnData;
21class ColumnDataCheckpointer;
22class ColumnSegment;
23class SegmentStatistics;
24
25struct ColumnFetchState;
26struct ColumnScanState;
27struct SegmentScanState;
28
//! Polymorphic base for the state used by the analyze callbacks
//! (created by compression_init_analyze_t, consumed by compression_analyze_t / compression_final_analyze_t).
struct AnalyzeState {
	virtual ~AnalyzeState() = default;

	//! View this state as TARGET.
	//! The D_ASSERT verifies through dynamic_cast that the object really is a TARGET;
	//! the conversion that is returned is an unchecked reinterpret_cast.
	template <class TARGET>
	TARGET &Cast() {
		D_ASSERT(dynamic_cast<TARGET *>(this));
		return *reinterpret_cast<TARGET *>(this);
	}

	//! Const overload of Cast: same check, same unchecked conversion, const result.
	template <class TARGET>
	const TARGET &Cast() const {
		D_ASSERT(dynamic_cast<const TARGET *>(this));
		return *reinterpret_cast<const TARGET *>(this);
	}
};
44
//! Polymorphic base for the state used by the compress callbacks
//! (created by compression_init_compression_t, consumed by compression_compress_data_t /
//! compression_compress_finalize_t).
struct CompressionState {
	virtual ~CompressionState() = default;

	//! View this state as TARGET.
	//! The D_ASSERT verifies through dynamic_cast that the object really is a TARGET;
	//! the conversion that is returned is an unchecked reinterpret_cast.
	template <class TARGET>
	TARGET &Cast() {
		D_ASSERT(dynamic_cast<TARGET *>(this));
		return *reinterpret_cast<TARGET *>(this);
	}

	//! Const overload of Cast: same check, same unchecked conversion, const result.
	template <class TARGET>
	const TARGET &Cast() const {
		D_ASSERT(dynamic_cast<const TARGET *>(this));
		return *reinterpret_cast<const TARGET *>(this);
	}
};
60
//! Polymorphic base for per-segment state (the type produced by compression_init_segment_t).
struct CompressedSegmentState {
	virtual ~CompressedSegmentState() = default;

	//! View this state as TARGET.
	//! The D_ASSERT verifies through dynamic_cast that the object really is a TARGET;
	//! the conversion that is returned is an unchecked reinterpret_cast.
	template <class TARGET>
	TARGET &Cast() {
		D_ASSERT(dynamic_cast<TARGET *>(this));
		return *reinterpret_cast<TARGET *>(this);
	}

	//! Const overload of Cast: same check, same unchecked conversion, const result.
	template <class TARGET>
	const TARGET &Cast() const {
		D_ASSERT(dynamic_cast<const TARGET *>(this));
		return *reinterpret_cast<const TARGET *>(this);
	}
};
76
77struct CompressionAppendState {
78 CompressionAppendState(BufferHandle handle_p) : handle(std::move(handle_p)) {
79 }
80 virtual ~CompressionAppendState() {
81 }
82
83 BufferHandle handle;
84
85 template <class TARGET>
86 TARGET &Cast() {
87 D_ASSERT(dynamic_cast<TARGET *>(this));
88 return reinterpret_cast<TARGET &>(*this);
89 }
90 template <class TARGET>
91 const TARGET &Cast() const {
92 D_ASSERT(dynamic_cast<const TARGET *>(this));
93 return reinterpret_cast<const TARGET &>(*this);
94 }
95};
96
97//===--------------------------------------------------------------------===//
98// Analyze
99//===--------------------------------------------------------------------===//
100//! The analyze functions are used to determine whether or not to use this compression method
101//! The system first determines the potential compression methods to use based on the physical type of the column
102//! After that the following steps are taken:
103//! 1. The init_analyze is called to initialize the analyze state of every candidate compression method
104//! 2. The analyze method is called with all of the input data in the order in which it must be stored.
105//! analyze can return "false". In that case, the compression method is taken out of consideration early.
106//! 3. The final_analyze method is called, which should return a score for the compression method
107
108//! The system then decides which compression function to use based on the analyzed score (returned from final_analyze)
109typedef unique_ptr<AnalyzeState> (*compression_init_analyze_t)(ColumnData &col_data, PhysicalType type);
110typedef bool (*compression_analyze_t)(AnalyzeState &state, Vector &input, idx_t count);
111typedef idx_t (*compression_final_analyze_t)(AnalyzeState &state);
112
113//===--------------------------------------------------------------------===//
114// Compress
115//===--------------------------------------------------------------------===//
116typedef unique_ptr<CompressionState> (*compression_init_compression_t)(ColumnDataCheckpointer &checkpointer,
117 unique_ptr<AnalyzeState> state);
118typedef void (*compression_compress_data_t)(CompressionState &state, Vector &scan_vector, idx_t count);
119typedef void (*compression_compress_finalize_t)(CompressionState &state);
120
121//===--------------------------------------------------------------------===//
122// Uncompress / Scan
123//===--------------------------------------------------------------------===//
124typedef unique_ptr<SegmentScanState> (*compression_init_segment_scan_t)(ColumnSegment &segment);
125
126//! Function prototype used for reading an entire vector (STANDARD_VECTOR_SIZE)
127typedef void (*compression_scan_vector_t)(ColumnSegment &segment, ColumnScanState &state, idx_t scan_count,
128 Vector &result);
129//! Function prototype used for reading an arbitrary ('scan_count') number of values
130typedef void (*compression_scan_partial_t)(ColumnSegment &segment, ColumnScanState &state, idx_t scan_count,
131 Vector &result, idx_t result_offset);
132//! Function prototype used for reading a single value
133typedef void (*compression_fetch_row_t)(ColumnSegment &segment, ColumnFetchState &state, row_t row_id, Vector &result,
134 idx_t result_idx);
135//! Function prototype used for skipping 'skip_count' values, non-trivial if random-access is not supported for the
136//! compressed data.
137typedef void (*compression_skip_t)(ColumnSegment &segment, ColumnScanState &state, idx_t skip_count);
138
139//===--------------------------------------------------------------------===//
140// Append (optional)
141//===--------------------------------------------------------------------===//
142typedef unique_ptr<CompressedSegmentState> (*compression_init_segment_t)(ColumnSegment &segment, block_id_t block_id);
143typedef unique_ptr<CompressionAppendState> (*compression_init_append_t)(ColumnSegment &segment);
144typedef idx_t (*compression_append_t)(CompressionAppendState &append_state, ColumnSegment &segment,
145 SegmentStatistics &stats, UnifiedVectorFormat &data, idx_t offset, idx_t count);
146typedef idx_t (*compression_finalize_append_t)(ColumnSegment &segment, SegmentStatistics &stats);
147typedef void (*compression_revert_append_t)(ColumnSegment &segment, idx_t start_row);
148
149class CompressionFunction {
150public:
151 CompressionFunction(CompressionType type, PhysicalType data_type, compression_init_analyze_t init_analyze,
152 compression_analyze_t analyze, compression_final_analyze_t final_analyze,
153 compression_init_compression_t init_compression, compression_compress_data_t compress,
154 compression_compress_finalize_t compress_finalize, compression_init_segment_scan_t init_scan,
155 compression_scan_vector_t scan_vector, compression_scan_partial_t scan_partial,
156 compression_fetch_row_t fetch_row, compression_skip_t skip,
157 compression_init_segment_t init_segment = nullptr,
158 compression_init_append_t init_append = nullptr, compression_append_t append = nullptr,
159 compression_finalize_append_t finalize_append = nullptr,
160 compression_revert_append_t revert_append = nullptr)
161 : type(type), data_type(data_type), init_analyze(init_analyze), analyze(analyze), final_analyze(final_analyze),
162 init_compression(init_compression), compress(compress), compress_finalize(compress_finalize),
163 init_scan(init_scan), scan_vector(scan_vector), scan_partial(scan_partial), fetch_row(fetch_row), skip(skip),
164 init_segment(init_segment), init_append(init_append), append(append), finalize_append(finalize_append),
165 revert_append(revert_append) {
166 }
167
168 //! Compression type
169 CompressionType type;
170 //! The data type this function can compress
171 PhysicalType data_type;
172
173 //! Analyze step: determine which compression function is the most effective
174 //! init_analyze is called once to set up the analyze state
175 compression_init_analyze_t init_analyze;
176 //! analyze is called several times (once per vector in the row group)
177 //! analyze should return true, unless compression is no longer possible with this compression method
178 //! in that case false should be returned
179 compression_analyze_t analyze;
180 //! final_analyze should return the score of the compression function
181 //! ideally this is the exact number of bytes required to store the data
182 //! this is not required/enforced: it can be an estimate as well
183 //! also this function can return DConstants::INVALID_INDEX to skip this compression method
184 compression_final_analyze_t final_analyze;
185
186 //! Compression step: actually compress the data
187 //! init_compression is called once to set up the comperssion state
188 compression_init_compression_t init_compression;
189 //! compress is called several times (once per vector in the row group)
190 compression_compress_data_t compress;
191 //! compress_finalize is called after
192 compression_compress_finalize_t compress_finalize;
193
194 //! init_scan is called to set up the scan state
195 compression_init_segment_scan_t init_scan;
196 //! scan_vector scans an entire vector using the scan state
197 compression_scan_vector_t scan_vector;
198 //! scan_partial scans a subset of a vector
199 //! this can request > vector_size as well
200 //! this is used if a vector crosses segment boundaries, or for child columns of lists
201 compression_scan_partial_t scan_partial;
202 //! fetch an individual row from the compressed vector
203 //! used for index lookups
204 compression_fetch_row_t fetch_row;
205 //! Skip forward in the compressed segment
206 compression_skip_t skip;
207
208 // Append functions
209 //! This only really needs to be defined for uncompressed segments
210
211 //! Initialize a compressed segment (optional)
212 compression_init_segment_t init_segment;
213 //! Initialize the append state (optional)
214 compression_init_append_t init_append;
215 //! Append to the compressed segment (optional)
216 compression_append_t append;
217 //! Finalize an append to the segment
218 compression_finalize_append_t finalize_append;
219 //! Revert append (optional)
220 compression_revert_append_t revert_append;
221};
222
223//! The set of compression functions
224struct CompressionFunctionSet {
225 mutex lock;
226 map<CompressionType, map<PhysicalType, CompressionFunction>> functions;
227};
228
229} // namespace duckdb
230