1#pragma once
2
3#include "duckdb/common/types/null_value.hpp"
4#include "duckdb/common/types/vector.hpp"
5#include "duckdb/common/vector_size.hpp"
6#include "duckdb/function/compression_function.hpp"
7#include "duckdb/main/config.hpp"
8#include "duckdb/planner/table_filter.hpp"
9#include "duckdb/storage/buffer_manager.hpp"
10#include "duckdb/storage/checkpoint/string_checkpoint_state.hpp"
11#include "duckdb/storage/segment/uncompressed.hpp"
12#include "duckdb/storage/table/scan_state.hpp"
13#include "duckdb/storage/string_uncompressed.hpp"
14#include "duckdb/storage/table/append_state.hpp"
15#include "duckdb/storage/table/column_segment.hpp"
16#include "duckdb/common/likely.hpp"
17
18namespace duckdb {
19struct StringDictionaryContainer {
20 //! The size of the dictionary
21 uint32_t size;
22 //! The end of the dictionary (typically Storage::BLOCK_SIZE)
23 uint32_t end;
24
25 void Verify() {
26 D_ASSERT(size <= Storage::BLOCK_SIZE);
27 D_ASSERT(end <= Storage::BLOCK_SIZE);
28 D_ASSERT(size <= end);
29 }
30};
31
32struct StringScanState : public SegmentScanState {
33 BufferHandle handle;
34};
35
36struct UncompressedStringStorage {
37public:
38 //! Dictionary header size at the beginning of the string segment (offset + length)
39 static constexpr uint16_t DICTIONARY_HEADER_SIZE = sizeof(uint32_t) + sizeof(uint32_t);
40 //! Marker used in length field to indicate the presence of a big string
41 static constexpr uint16_t BIG_STRING_MARKER = (uint16_t)-1;
42 //! Base size of big string marker (block id + offset)
43 static constexpr idx_t BIG_STRING_MARKER_BASE_SIZE = sizeof(block_id_t) + sizeof(int32_t);
44 //! The marker size of the big string
45 static constexpr idx_t BIG_STRING_MARKER_SIZE = BIG_STRING_MARKER_BASE_SIZE;
46 //! The size below which the segment is compacted on flushing
47 static constexpr size_t COMPACTION_FLUSH_LIMIT = (size_t)Storage::BLOCK_SIZE / 5 * 4;
48
49public:
50 static unique_ptr<AnalyzeState> StringInitAnalyze(ColumnData &col_data, PhysicalType type);
51 static bool StringAnalyze(AnalyzeState &state_p, Vector &input, idx_t count);
52 static idx_t StringFinalAnalyze(AnalyzeState &state_p);
53 static unique_ptr<SegmentScanState> StringInitScan(ColumnSegment &segment);
54 static void StringScanPartial(ColumnSegment &segment, ColumnScanState &state, idx_t scan_count, Vector &result,
55 idx_t result_offset);
56 static void StringScan(ColumnSegment &segment, ColumnScanState &state, idx_t scan_count, Vector &result);
57 static void StringFetchRow(ColumnSegment &segment, ColumnFetchState &state, row_t row_id, Vector &result,
58 idx_t result_idx);
59 static unique_ptr<CompressedSegmentState> StringInitSegment(ColumnSegment &segment, block_id_t block_id);
60
61 static unique_ptr<CompressionAppendState> StringInitAppend(ColumnSegment &segment) {
62 auto &buffer_manager = BufferManager::GetBufferManager(db&: segment.db);
63 auto handle = buffer_manager.Pin(handle&: segment.block);
64 return make_uniq<CompressionAppendState>(args: std::move(handle));
65 }
66
67 static idx_t StringAppend(CompressionAppendState &append_state, ColumnSegment &segment, SegmentStatistics &stats,
68 UnifiedVectorFormat &data, idx_t offset, idx_t count) {
69 return StringAppendBase(handle&: append_state.handle, segment, stats, data, offset, count);
70 }
71
72 static idx_t StringAppendBase(ColumnSegment &segment, SegmentStatistics &stats, UnifiedVectorFormat &data,
73 idx_t offset, idx_t count) {
74 auto &buffer_manager = BufferManager::GetBufferManager(db&: segment.db);
75 auto handle = buffer_manager.Pin(handle&: segment.block);
76 return StringAppendBase(handle, segment, stats, data, offset, count);
77 }
78
79 static idx_t StringAppendBase(BufferHandle &handle, ColumnSegment &segment, SegmentStatistics &stats,
80 UnifiedVectorFormat &data, idx_t offset, idx_t count) {
81 D_ASSERT(segment.GetBlockOffset() == 0);
82 auto handle_ptr = handle.Ptr();
83 auto source_data = UnifiedVectorFormat::GetData<string_t>(format: data);
84 auto result_data = (int32_t *)(handle_ptr + DICTIONARY_HEADER_SIZE);
85 uint32_t *dictionary_size = (uint32_t *)handle_ptr;
86 uint32_t *dictionary_end = (uint32_t *)(handle_ptr + sizeof(uint32_t));
87
88 idx_t remaining_space = RemainingSpace(segment, handle);
89 auto base_count = segment.count.load();
90 for (idx_t i = 0; i < count; i++) {
91 auto source_idx = data.sel->get_index(idx: offset + i);
92 auto target_idx = base_count + i;
93 if (remaining_space < sizeof(int32_t)) {
94 // string index does not fit in the block at all
95 segment.count += i;
96 return i;
97 }
98 remaining_space -= sizeof(int32_t);
99 if (!data.validity.RowIsValid(row_idx: source_idx)) {
100 // null value is stored as a copy of the last value, this is done to be able to efficiently do the
101 // string_length calculation
102 if (target_idx > 0) {
103 result_data[target_idx] = result_data[target_idx - 1];
104 } else {
105 result_data[target_idx] = 0;
106 }
107 continue;
108 }
109 auto end = handle.Ptr() + *dictionary_end;
110
111#ifdef DEBUG
112 GetDictionary(segment, handle).Verify();
113#endif
114 // Unknown string, continue
115 // non-null value, check if we can fit it within the block
116 idx_t string_length = source_data[source_idx].GetSize();
117
118 // determine whether or not we have space in the block for this string
119 bool use_overflow_block = false;
120 idx_t required_space = string_length;
121 if (DUCKDB_UNLIKELY(required_space >= StringUncompressed::STRING_BLOCK_LIMIT)) {
122 // string exceeds block limit, store in overflow block and only write a marker here
123 required_space = BIG_STRING_MARKER_SIZE;
124 use_overflow_block = true;
125 }
126 if (DUCKDB_UNLIKELY(required_space > remaining_space)) {
127 // no space remaining: return how many tuples we ended up writing
128 segment.count += i;
129 return i;
130 }
131
132 // we have space: write the string
133 UpdateStringStats(stats, new_value: source_data[source_idx]);
134
135 if (DUCKDB_UNLIKELY(use_overflow_block)) {
136 // write to overflow blocks
137 block_id_t block;
138 int32_t offset;
139 // write the string into the current string block
140 WriteString(segment, string: source_data[source_idx], result_block&: block, result_offset&: offset);
141 *dictionary_size += BIG_STRING_MARKER_SIZE;
142 remaining_space -= BIG_STRING_MARKER_SIZE;
143 auto dict_pos = end - *dictionary_size;
144
145 // write a big string marker into the dictionary
146 WriteStringMarker(target: dict_pos, block_id: block, offset);
147
148 // place the dictionary offset into the set of vectors
149 // note: for overflow strings we write negative value
150 result_data[target_idx] = -(*dictionary_size);
151 } else {
152 // string fits in block, append to dictionary and increment dictionary position
153 D_ASSERT(string_length < NumericLimits<uint16_t>::Maximum());
154 *dictionary_size += required_space;
155 remaining_space -= required_space;
156 auto dict_pos = end - *dictionary_size;
157 // now write the actual string data into the dictionary
158 memcpy(dest: dict_pos, src: source_data[source_idx].GetData(), n: string_length);
159
160 // place the dictionary offset into the set of vectors
161 result_data[target_idx] = *dictionary_size;
162 }
163 D_ASSERT(RemainingSpace(segment, handle) <= Storage::BLOCK_SIZE);
164#ifdef DEBUG
165 GetDictionary(segment, handle).Verify();
166#endif
167 }
168 segment.count += count;
169 return count;
170 }
171
172 static idx_t FinalizeAppend(ColumnSegment &segment, SegmentStatistics &stats);
173
174public:
175 static inline void UpdateStringStats(SegmentStatistics &stats, const string_t &new_value) {
176 StringStats::Update(stats&: stats.statistics, value: new_value);
177 }
178
179 static void SetDictionary(ColumnSegment &segment, BufferHandle &handle, StringDictionaryContainer dict);
180 static StringDictionaryContainer GetDictionary(ColumnSegment &segment, BufferHandle &handle);
181 static idx_t RemainingSpace(ColumnSegment &segment, BufferHandle &handle);
182 static void WriteString(ColumnSegment &segment, string_t string, block_id_t &result_block, int32_t &result_offset);
183 static void WriteStringMemory(ColumnSegment &segment, string_t string, block_id_t &result_block,
184 int32_t &result_offset);
185 static string_t ReadOverflowString(ColumnSegment &segment, Vector &result, block_id_t block, int32_t offset);
186 static string_t ReadString(data_ptr_t target, int32_t offset, uint32_t string_length);
187 static string_t ReadStringWithLength(data_ptr_t target, int32_t offset);
188 static void WriteStringMarker(data_ptr_t target, block_id_t block_id, int32_t offset);
189 static void ReadStringMarker(data_ptr_t target, block_id_t &block_id, int32_t &offset);
190
191 static string_location_t FetchStringLocation(StringDictionaryContainer dict, data_ptr_t baseptr,
192 int32_t dict_offset);
193 static string_t FetchStringFromDict(ColumnSegment &segment, StringDictionaryContainer dict, Vector &result,
194 data_ptr_t baseptr, int32_t dict_offset, uint32_t string_length);
195 static string_t FetchString(ColumnSegment &segment, StringDictionaryContainer dict, Vector &result,
196 data_ptr_t baseptr, string_location_t location, uint32_t string_length);
197};
198} // namespace duckdb
199