1 | #include "duckdb/storage/statistics/string_stats.hpp" |
2 | #include "duckdb/storage/statistics/base_statistics.hpp" |
3 | #include "duckdb/common/field_writer.hpp" |
4 | #include "utf8proc_wrapper.hpp" |
5 | #include "duckdb/common/string_util.hpp" |
6 | #include "duckdb/common/types/vector.hpp" |
7 | #include "duckdb/main/error_manager.hpp" |
8 | |
9 | namespace duckdb { |
10 | |
11 | BaseStatistics StringStats::CreateUnknown(LogicalType type) { |
12 | BaseStatistics result(std::move(type)); |
13 | result.InitializeUnknown(); |
14 | auto &string_data = StringStats::GetDataUnsafe(stats&: result); |
15 | for (idx_t i = 0; i < StringStatsData::MAX_STRING_MINMAX_SIZE; i++) { |
16 | string_data.min[i] = 0; |
17 | string_data.max[i] = 0xFF; |
18 | } |
19 | string_data.max_string_length = 0; |
20 | string_data.has_max_string_length = false; |
21 | string_data.has_unicode = true; |
22 | return result; |
23 | } |
24 | |
25 | BaseStatistics StringStats::CreateEmpty(LogicalType type) { |
26 | BaseStatistics result(std::move(type)); |
27 | result.InitializeEmpty(); |
28 | auto &string_data = StringStats::GetDataUnsafe(stats&: result); |
29 | for (idx_t i = 0; i < StringStatsData::MAX_STRING_MINMAX_SIZE; i++) { |
30 | string_data.min[i] = 0xFF; |
31 | string_data.max[i] = 0; |
32 | } |
33 | string_data.max_string_length = 0; |
34 | string_data.has_max_string_length = true; |
35 | string_data.has_unicode = false; |
36 | return result; |
37 | } |
38 | |
39 | StringStatsData &StringStats::GetDataUnsafe(BaseStatistics &stats) { |
40 | D_ASSERT(stats.GetStatsType() == StatisticsType::STRING_STATS); |
41 | return stats.stats_union.string_data; |
42 | } |
43 | |
44 | const StringStatsData &StringStats::GetDataUnsafe(const BaseStatistics &stats) { |
45 | D_ASSERT(stats.GetStatsType() == StatisticsType::STRING_STATS); |
46 | return stats.stats_union.string_data; |
47 | } |
48 | |
49 | bool StringStats::HasMaxStringLength(const BaseStatistics &stats) { |
50 | if (stats.GetType().id() == LogicalTypeId::SQLNULL) { |
51 | return false; |
52 | } |
53 | return StringStats::GetDataUnsafe(stats).has_max_string_length; |
54 | } |
55 | |
56 | uint32_t StringStats::MaxStringLength(const BaseStatistics &stats) { |
57 | if (!HasMaxStringLength(stats)) { |
58 | throw InternalException("MaxStringLength called on statistics that does not have a max string length" ); |
59 | } |
60 | return StringStats::GetDataUnsafe(stats).max_string_length; |
61 | } |
62 | |
63 | bool StringStats::CanContainUnicode(const BaseStatistics &stats) { |
64 | if (stats.GetType().id() == LogicalTypeId::SQLNULL) { |
65 | return true; |
66 | } |
67 | return StringStats::GetDataUnsafe(stats).has_unicode; |
68 | } |
69 | |
70 | void StringStats::ResetMaxStringLength(BaseStatistics &stats) { |
71 | StringStats::GetDataUnsafe(stats).has_max_string_length = false; |
72 | } |
73 | |
74 | void StringStats::SetContainsUnicode(BaseStatistics &stats) { |
75 | StringStats::GetDataUnsafe(stats).has_unicode = true; |
76 | } |
77 | |
78 | void StringStats::Serialize(const BaseStatistics &stats, FieldWriter &writer) { |
79 | auto &string_data = StringStats::GetDataUnsafe(stats); |
80 | writer.WriteBlob(val: string_data.min, len: StringStatsData::MAX_STRING_MINMAX_SIZE); |
81 | writer.WriteBlob(val: string_data.max, len: StringStatsData::MAX_STRING_MINMAX_SIZE); |
82 | writer.WriteField<bool>(element: string_data.has_unicode); |
83 | writer.WriteField<bool>(element: string_data.has_max_string_length); |
84 | writer.WriteField<uint32_t>(element: string_data.max_string_length); |
85 | } |
86 | |
87 | BaseStatistics StringStats::Deserialize(FieldReader &reader, LogicalType type) { |
88 | BaseStatistics result(std::move(type)); |
89 | auto &string_data = StringStats::GetDataUnsafe(stats&: result); |
90 | reader.ReadBlob(result: string_data.min, read_size: StringStatsData::MAX_STRING_MINMAX_SIZE); |
91 | reader.ReadBlob(result: string_data.max, read_size: StringStatsData::MAX_STRING_MINMAX_SIZE); |
92 | string_data.has_unicode = reader.ReadRequired<bool>(); |
93 | string_data.has_max_string_length = reader.ReadRequired<bool>(); |
94 | string_data.max_string_length = reader.ReadRequired<uint32_t>(); |
95 | return result; |
96 | } |
97 | |
98 | static int StringValueComparison(const_data_ptr_t data, idx_t len, const_data_ptr_t comparison) { |
99 | D_ASSERT(len <= StringStatsData::MAX_STRING_MINMAX_SIZE); |
100 | for (idx_t i = 0; i < len; i++) { |
101 | if (data[i] < comparison[i]) { |
102 | return -1; |
103 | } else if (data[i] > comparison[i]) { |
104 | return 1; |
105 | } |
106 | } |
107 | return 0; |
108 | } |
109 | |
110 | static void ConstructValue(const_data_ptr_t data, idx_t size, data_t target[]) { |
111 | idx_t value_size = size > StringStatsData::MAX_STRING_MINMAX_SIZE ? StringStatsData::MAX_STRING_MINMAX_SIZE : size; |
112 | memcpy(dest: target, src: data, n: value_size); |
113 | for (idx_t i = value_size; i < StringStatsData::MAX_STRING_MINMAX_SIZE; i++) { |
114 | target[i] = '\0'; |
115 | } |
116 | } |
117 | |
118 | void StringStats::Update(BaseStatistics &stats, const string_t &value) { |
119 | auto data = const_data_ptr_cast(src: value.GetData()); |
120 | auto size = value.GetSize(); |
121 | |
122 | //! we can only fit 8 bytes, so we might need to trim our string |
123 | // construct the value |
124 | data_t target[StringStatsData::MAX_STRING_MINMAX_SIZE]; |
125 | ConstructValue(data, size, target); |
126 | |
127 | // update the min and max |
128 | auto &string_data = StringStats::GetDataUnsafe(stats); |
129 | if (StringValueComparison(data: target, len: StringStatsData::MAX_STRING_MINMAX_SIZE, comparison: string_data.min) < 0) { |
130 | memcpy(dest: string_data.min, src: target, n: StringStatsData::MAX_STRING_MINMAX_SIZE); |
131 | } |
132 | if (StringValueComparison(data: target, len: StringStatsData::MAX_STRING_MINMAX_SIZE, comparison: string_data.max) > 0) { |
133 | memcpy(dest: string_data.max, src: target, n: StringStatsData::MAX_STRING_MINMAX_SIZE); |
134 | } |
135 | if (size > string_data.max_string_length) { |
136 | string_data.max_string_length = size; |
137 | } |
138 | if (stats.GetType().id() == LogicalTypeId::VARCHAR && !string_data.has_unicode) { |
139 | auto unicode = Utf8Proc::Analyze(s: const_char_ptr_cast(src: data), len: size); |
140 | if (unicode == UnicodeType::UNICODE) { |
141 | string_data.has_unicode = true; |
142 | } else if (unicode == UnicodeType::INVALID) { |
143 | throw InvalidInputException(ErrorManager::InvalidUnicodeError(input: string(const_char_ptr_cast(src: data), size), |
144 | context: "segment statistics update" )); |
145 | } |
146 | } |
147 | } |
148 | |
149 | void StringStats::Merge(BaseStatistics &stats, const BaseStatistics &other) { |
150 | if (other.GetType().id() == LogicalTypeId::VALIDITY) { |
151 | return; |
152 | } |
153 | auto &string_data = StringStats::GetDataUnsafe(stats); |
154 | auto &other_data = StringStats::GetDataUnsafe(stats: other); |
155 | if (StringValueComparison(data: other_data.min, len: StringStatsData::MAX_STRING_MINMAX_SIZE, comparison: string_data.min) < 0) { |
156 | memcpy(dest: string_data.min, src: other_data.min, n: StringStatsData::MAX_STRING_MINMAX_SIZE); |
157 | } |
158 | if (StringValueComparison(data: other_data.max, len: StringStatsData::MAX_STRING_MINMAX_SIZE, comparison: string_data.max) > 0) { |
159 | memcpy(dest: string_data.max, src: other_data.max, n: StringStatsData::MAX_STRING_MINMAX_SIZE); |
160 | } |
161 | string_data.has_unicode = string_data.has_unicode || other_data.has_unicode; |
162 | string_data.has_max_string_length = string_data.has_max_string_length && other_data.has_max_string_length; |
163 | string_data.max_string_length = MaxValue<uint32_t>(a: string_data.max_string_length, b: other_data.max_string_length); |
164 | } |
165 | |
166 | FilterPropagateResult StringStats::CheckZonemap(const BaseStatistics &stats, ExpressionType comparison_type, |
167 | const string &constant) { |
168 | auto &string_data = StringStats::GetDataUnsafe(stats); |
169 | auto data = const_data_ptr_cast(src: constant.c_str()); |
170 | auto size = constant.size(); |
171 | |
172 | idx_t value_size = size > StringStatsData::MAX_STRING_MINMAX_SIZE ? StringStatsData::MAX_STRING_MINMAX_SIZE : size; |
173 | int min_comp = StringValueComparison(data, len: value_size, comparison: string_data.min); |
174 | int max_comp = StringValueComparison(data, len: value_size, comparison: string_data.max); |
175 | switch (comparison_type) { |
176 | case ExpressionType::COMPARE_EQUAL: |
177 | if (min_comp >= 0 && max_comp <= 0) { |
178 | return FilterPropagateResult::NO_PRUNING_POSSIBLE; |
179 | } else { |
180 | return FilterPropagateResult::FILTER_ALWAYS_FALSE; |
181 | } |
182 | case ExpressionType::COMPARE_NOTEQUAL: |
183 | if (min_comp < 0 || max_comp > 0) { |
184 | return FilterPropagateResult::FILTER_ALWAYS_TRUE; |
185 | } |
186 | return FilterPropagateResult::NO_PRUNING_POSSIBLE; |
187 | case ExpressionType::COMPARE_GREATERTHANOREQUALTO: |
188 | case ExpressionType::COMPARE_GREATERTHAN: |
189 | if (max_comp <= 0) { |
190 | return FilterPropagateResult::NO_PRUNING_POSSIBLE; |
191 | } else { |
192 | return FilterPropagateResult::FILTER_ALWAYS_FALSE; |
193 | } |
194 | case ExpressionType::COMPARE_LESSTHAN: |
195 | case ExpressionType::COMPARE_LESSTHANOREQUALTO: |
196 | if (min_comp >= 0) { |
197 | return FilterPropagateResult::NO_PRUNING_POSSIBLE; |
198 | } else { |
199 | return FilterPropagateResult::FILTER_ALWAYS_FALSE; |
200 | } |
201 | default: |
202 | throw InternalException("Expression type not implemented for string statistics zone map" ); |
203 | } |
204 | } |
205 | |
206 | static idx_t GetValidMinMaxSubstring(const_data_ptr_t data) { |
207 | for (idx_t i = 0; i < StringStatsData::MAX_STRING_MINMAX_SIZE; i++) { |
208 | if (data[i] == '\0') { |
209 | return i; |
210 | } |
211 | if ((data[i] & 0x80) != 0) { |
212 | return i; |
213 | } |
214 | } |
215 | return StringStatsData::MAX_STRING_MINMAX_SIZE; |
216 | } |
217 | |
218 | string StringStats::ToString(const BaseStatistics &stats) { |
219 | auto &string_data = StringStats::GetDataUnsafe(stats); |
220 | idx_t min_len = GetValidMinMaxSubstring(data: string_data.min); |
221 | idx_t max_len = GetValidMinMaxSubstring(data: string_data.max); |
222 | return StringUtil::Format(fmt_str: "[Min: %s, Max: %s, Has Unicode: %s, Max String Length: %s]" , |
223 | params: string(const_char_ptr_cast(src: string_data.min), min_len), |
224 | params: string(const_char_ptr_cast(src: string_data.max), max_len), |
225 | params: string_data.has_unicode ? "true" : "false" , |
226 | params: string_data.has_max_string_length ? to_string(val: string_data.max_string_length) : "?" ); |
227 | } |
228 | |
229 | void StringStats::Verify(const BaseStatistics &stats, Vector &vector, const SelectionVector &sel, idx_t count) { |
230 | auto &string_data = StringStats::GetDataUnsafe(stats); |
231 | |
232 | UnifiedVectorFormat vdata; |
233 | vector.ToUnifiedFormat(count, data&: vdata); |
234 | auto data = UnifiedVectorFormat::GetData<string_t>(format: vdata); |
235 | for (idx_t i = 0; i < count; i++) { |
236 | auto idx = sel.get_index(idx: i); |
237 | auto index = vdata.sel->get_index(idx); |
238 | if (!vdata.validity.RowIsValid(row_idx: index)) { |
239 | continue; |
240 | } |
241 | auto value = data[index]; |
242 | auto data = value.GetData(); |
243 | auto len = value.GetSize(); |
244 | // LCOV_EXCL_START |
245 | if (string_data.has_max_string_length && len > string_data.max_string_length) { |
246 | throw InternalException( |
247 | "Statistics mismatch: string value exceeds maximum string length.\nStatistics: %s\nVector: %s" , |
248 | stats.ToString(), vector.ToString(count)); |
249 | } |
250 | if (stats.GetType().id() == LogicalTypeId::VARCHAR && !string_data.has_unicode) { |
251 | auto unicode = Utf8Proc::Analyze(s: data, len); |
252 | if (unicode == UnicodeType::UNICODE) { |
253 | throw InternalException("Statistics mismatch: string value contains unicode, but statistics says it " |
254 | "shouldn't.\nStatistics: %s\nVector: %s" , |
255 | stats.ToString(), vector.ToString(count)); |
256 | } else if (unicode == UnicodeType::INVALID) { |
257 | throw InternalException("Invalid unicode detected in vector: %s" , vector.ToString(count)); |
258 | } |
259 | } |
260 | if (StringValueComparison(data: const_data_ptr_cast(src: data), |
261 | len: MinValue<idx_t>(a: len, b: StringStatsData::MAX_STRING_MINMAX_SIZE), comparison: string_data.min) < 0) { |
262 | throw InternalException("Statistics mismatch: value is smaller than min.\nStatistics: %s\nVector: %s" , |
263 | stats.ToString(), vector.ToString(count)); |
264 | } |
265 | if (StringValueComparison(data: const_data_ptr_cast(src: data), |
266 | len: MinValue<idx_t>(a: len, b: StringStatsData::MAX_STRING_MINMAX_SIZE), comparison: string_data.max) > 0) { |
267 | throw InternalException("Statistics mismatch: value is bigger than max.\nStatistics: %s\nVector: %s" , |
268 | stats.ToString(), vector.ToString(count)); |
269 | } |
270 | // LCOV_EXCL_STOP |
271 | } |
272 | } |
273 | |
274 | } // namespace duckdb |
275 | |