| 1 | //===----------------------------------------------------------------------===// |
| 2 | // DuckDB |
| 3 | // |
| 4 | // duckdb/function/compression_function.hpp |
| 5 | // |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | |
| 9 | #pragma once |
| 10 | |
| 11 | #include "duckdb/common/common.hpp" |
| 12 | #include "duckdb/function/function.hpp" |
| 13 | #include "duckdb/common/enums/compression_type.hpp" |
| 14 | #include "duckdb/common/map.hpp" |
| 15 | #include "duckdb/storage/storage_info.hpp" |
| 16 | #include "duckdb/common/mutex.hpp" |
| 17 | |
| 18 | namespace duckdb { |
| 19 | class DatabaseInstance; |
| 20 | class ColumnData; |
| 21 | class ColumnDataCheckpointer; |
| 22 | class ColumnSegment; |
| 23 | class SegmentStatistics; |
| 24 | |
| 25 | struct ColumnFetchState; |
| 26 | struct ColumnScanState; |
| 27 | struct SegmentScanState; |
| 28 | |
| 29 | struct AnalyzeState { |
| 30 | virtual ~AnalyzeState() { |
| 31 | } |
| 32 | |
| 33 | template <class TARGET> |
| 34 | TARGET &Cast() { |
| 35 | D_ASSERT(dynamic_cast<TARGET *>(this)); |
| 36 | return reinterpret_cast<TARGET &>(*this); |
| 37 | } |
| 38 | template <class TARGET> |
| 39 | const TARGET &Cast() const { |
| 40 | D_ASSERT(dynamic_cast<const TARGET *>(this)); |
| 41 | return reinterpret_cast<const TARGET &>(*this); |
| 42 | } |
| 43 | }; |
| 44 | |
| 45 | struct CompressionState { |
| 46 | virtual ~CompressionState() { |
| 47 | } |
| 48 | |
| 49 | template <class TARGET> |
| 50 | TARGET &Cast() { |
| 51 | D_ASSERT(dynamic_cast<TARGET *>(this)); |
| 52 | return reinterpret_cast<TARGET &>(*this); |
| 53 | } |
| 54 | template <class TARGET> |
| 55 | const TARGET &Cast() const { |
| 56 | D_ASSERT(dynamic_cast<const TARGET *>(this)); |
| 57 | return reinterpret_cast<const TARGET &>(*this); |
| 58 | } |
| 59 | }; |
| 60 | |
| 61 | struct CompressedSegmentState { |
| 62 | virtual ~CompressedSegmentState() { |
| 63 | } |
| 64 | |
| 65 | template <class TARGET> |
| 66 | TARGET &Cast() { |
| 67 | D_ASSERT(dynamic_cast<TARGET *>(this)); |
| 68 | return reinterpret_cast<TARGET &>(*this); |
| 69 | } |
| 70 | template <class TARGET> |
| 71 | const TARGET &Cast() const { |
| 72 | D_ASSERT(dynamic_cast<const TARGET *>(this)); |
| 73 | return reinterpret_cast<const TARGET &>(*this); |
| 74 | } |
| 75 | }; |
| 76 | |
| 77 | struct CompressionAppendState { |
| 78 | CompressionAppendState(BufferHandle handle_p) : handle(std::move(handle_p)) { |
| 79 | } |
| 80 | virtual ~CompressionAppendState() { |
| 81 | } |
| 82 | |
| 83 | BufferHandle handle; |
| 84 | |
| 85 | template <class TARGET> |
| 86 | TARGET &Cast() { |
| 87 | D_ASSERT(dynamic_cast<TARGET *>(this)); |
| 88 | return reinterpret_cast<TARGET &>(*this); |
| 89 | } |
| 90 | template <class TARGET> |
| 91 | const TARGET &Cast() const { |
| 92 | D_ASSERT(dynamic_cast<const TARGET *>(this)); |
| 93 | return reinterpret_cast<const TARGET &>(*this); |
| 94 | } |
| 95 | }; |
| 96 | |
| 97 | //===--------------------------------------------------------------------===// |
| 98 | // Analyze |
| 99 | //===--------------------------------------------------------------------===// |
| 100 | //! The analyze functions are used to determine whether or not to use this compression method |
| 101 | //! The system first determines the potential compression methods to use based on the physical type of the column |
| 102 | //! After that the following steps are taken: |
| 103 | //! 1. The init_analyze is called to initialize the analyze state of every candidate compression method |
| 104 | //! 2. The analyze method is called with all of the input data in the order in which it must be stored. |
| 105 | //! analyze can return "false". In that case, the compression method is taken out of consideration early. |
| 106 | //! 3. The final_analyze method is called, which should return a score for the compression method |
| 107 | |
| 108 | //! The system then decides which compression function to use based on the analyzed score (returned from final_analyze) |
| 109 | typedef unique_ptr<AnalyzeState> (*compression_init_analyze_t)(ColumnData &col_data, PhysicalType type); |
| 110 | typedef bool (*compression_analyze_t)(AnalyzeState &state, Vector &input, idx_t count); |
| 111 | typedef idx_t (*compression_final_analyze_t)(AnalyzeState &state); |
| 112 | |
| 113 | //===--------------------------------------------------------------------===// |
| 114 | // Compress |
| 115 | //===--------------------------------------------------------------------===// |
| 116 | typedef unique_ptr<CompressionState> (*compression_init_compression_t)(ColumnDataCheckpointer &checkpointer, |
| 117 | unique_ptr<AnalyzeState> state); |
| 118 | typedef void (*compression_compress_data_t)(CompressionState &state, Vector &scan_vector, idx_t count); |
| 119 | typedef void (*compression_compress_finalize_t)(CompressionState &state); |
| 120 | |
| 121 | //===--------------------------------------------------------------------===// |
| 122 | // Uncompress / Scan |
| 123 | //===--------------------------------------------------------------------===// |
| 124 | typedef unique_ptr<SegmentScanState> (*compression_init_segment_scan_t)(ColumnSegment &segment); |
| 125 | |
| 126 | //! Function prototype used for reading an entire vector (STANDARD_VECTOR_SIZE) |
| 127 | typedef void (*compression_scan_vector_t)(ColumnSegment &segment, ColumnScanState &state, idx_t scan_count, |
| 128 | Vector &result); |
| 129 | //! Function prototype used for reading an arbitrary ('scan_count') number of values |
| 130 | typedef void (*compression_scan_partial_t)(ColumnSegment &segment, ColumnScanState &state, idx_t scan_count, |
| 131 | Vector &result, idx_t result_offset); |
| 132 | //! Function prototype used for reading a single value |
| 133 | typedef void (*compression_fetch_row_t)(ColumnSegment &segment, ColumnFetchState &state, row_t row_id, Vector &result, |
| 134 | idx_t result_idx); |
| 135 | //! Function prototype used for skipping 'skip_count' values, non-trivial if random-access is not supported for the |
| 136 | //! compressed data. |
| 137 | typedef void (*compression_skip_t)(ColumnSegment &segment, ColumnScanState &state, idx_t skip_count); |
| 138 | |
| 139 | //===--------------------------------------------------------------------===// |
| 140 | // Append (optional) |
| 141 | //===--------------------------------------------------------------------===// |
| 142 | typedef unique_ptr<CompressedSegmentState> (*compression_init_segment_t)(ColumnSegment &segment, block_id_t block_id); |
| 143 | typedef unique_ptr<CompressionAppendState> (*compression_init_append_t)(ColumnSegment &segment); |
| 144 | typedef idx_t (*compression_append_t)(CompressionAppendState &append_state, ColumnSegment &segment, |
| 145 | SegmentStatistics &stats, UnifiedVectorFormat &data, idx_t offset, idx_t count); |
| 146 | typedef idx_t (*compression_finalize_append_t)(ColumnSegment &segment, SegmentStatistics &stats); |
| 147 | typedef void (*compression_revert_append_t)(ColumnSegment &segment, idx_t start_row); |
| 148 | |
| 149 | class CompressionFunction { |
| 150 | public: |
| 151 | CompressionFunction(CompressionType type, PhysicalType data_type, compression_init_analyze_t init_analyze, |
| 152 | compression_analyze_t analyze, compression_final_analyze_t final_analyze, |
| 153 | compression_init_compression_t init_compression, compression_compress_data_t compress, |
| 154 | compression_compress_finalize_t compress_finalize, compression_init_segment_scan_t init_scan, |
| 155 | compression_scan_vector_t scan_vector, compression_scan_partial_t scan_partial, |
| 156 | compression_fetch_row_t fetch_row, compression_skip_t skip, |
| 157 | compression_init_segment_t init_segment = nullptr, |
| 158 | compression_init_append_t init_append = nullptr, compression_append_t append = nullptr, |
| 159 | compression_finalize_append_t finalize_append = nullptr, |
| 160 | compression_revert_append_t revert_append = nullptr) |
| 161 | : type(type), data_type(data_type), init_analyze(init_analyze), analyze(analyze), final_analyze(final_analyze), |
| 162 | init_compression(init_compression), compress(compress), compress_finalize(compress_finalize), |
| 163 | init_scan(init_scan), scan_vector(scan_vector), scan_partial(scan_partial), fetch_row(fetch_row), skip(skip), |
| 164 | init_segment(init_segment), init_append(init_append), append(append), finalize_append(finalize_append), |
| 165 | revert_append(revert_append) { |
| 166 | } |
| 167 | |
| 168 | //! Compression type |
| 169 | CompressionType type; |
| 170 | //! The data type this function can compress |
| 171 | PhysicalType data_type; |
| 172 | |
| 173 | //! Analyze step: determine which compression function is the most effective |
| 174 | //! init_analyze is called once to set up the analyze state |
| 175 | compression_init_analyze_t init_analyze; |
| 176 | //! analyze is called several times (once per vector in the row group) |
| 177 | //! analyze should return true, unless compression is no longer possible with this compression method |
| 178 | //! in that case false should be returned |
| 179 | compression_analyze_t analyze; |
| 180 | //! final_analyze should return the score of the compression function |
| 181 | //! ideally this is the exact number of bytes required to store the data |
| 182 | //! this is not required/enforced: it can be an estimate as well |
| 183 | //! also this function can return DConstants::INVALID_INDEX to skip this compression method |
| 184 | compression_final_analyze_t final_analyze; |
| 185 | |
| 186 | //! Compression step: actually compress the data |
| 187 | //! init_compression is called once to set up the comperssion state |
| 188 | compression_init_compression_t init_compression; |
| 189 | //! compress is called several times (once per vector in the row group) |
| 190 | compression_compress_data_t compress; |
| 191 | //! compress_finalize is called after |
| 192 | compression_compress_finalize_t compress_finalize; |
| 193 | |
| 194 | //! init_scan is called to set up the scan state |
| 195 | compression_init_segment_scan_t init_scan; |
| 196 | //! scan_vector scans an entire vector using the scan state |
| 197 | compression_scan_vector_t scan_vector; |
| 198 | //! scan_partial scans a subset of a vector |
| 199 | //! this can request > vector_size as well |
| 200 | //! this is used if a vector crosses segment boundaries, or for child columns of lists |
| 201 | compression_scan_partial_t scan_partial; |
| 202 | //! fetch an individual row from the compressed vector |
| 203 | //! used for index lookups |
| 204 | compression_fetch_row_t fetch_row; |
| 205 | //! Skip forward in the compressed segment |
| 206 | compression_skip_t skip; |
| 207 | |
| 208 | // Append functions |
| 209 | //! This only really needs to be defined for uncompressed segments |
| 210 | |
| 211 | //! Initialize a compressed segment (optional) |
| 212 | compression_init_segment_t init_segment; |
| 213 | //! Initialize the append state (optional) |
| 214 | compression_init_append_t init_append; |
| 215 | //! Append to the compressed segment (optional) |
| 216 | compression_append_t append; |
| 217 | //! Finalize an append to the segment |
| 218 | compression_finalize_append_t finalize_append; |
| 219 | //! Revert append (optional) |
| 220 | compression_revert_append_t revert_append; |
| 221 | }; |
| 222 | |
| 223 | //! The set of compression functions |
| 224 | struct CompressionFunctionSet { |
| 225 | mutex lock; |
| 226 | map<CompressionType, map<PhysicalType, CompressionFunction>> functions; |
| 227 | }; |
| 228 | |
| 229 | } // namespace duckdb |
| 230 | |