1#include "duckdb/storage/statistics/string_stats.hpp"
2#include "duckdb/storage/statistics/base_statistics.hpp"
3#include "duckdb/common/field_writer.hpp"
4#include "utf8proc_wrapper.hpp"
5#include "duckdb/common/string_util.hpp"
6#include "duckdb/common/types/vector.hpp"
7#include "duckdb/main/error_manager.hpp"
8
9namespace duckdb {
10
11BaseStatistics StringStats::CreateUnknown(LogicalType type) {
12 BaseStatistics result(std::move(type));
13 result.InitializeUnknown();
14 auto &string_data = StringStats::GetDataUnsafe(stats&: result);
15 for (idx_t i = 0; i < StringStatsData::MAX_STRING_MINMAX_SIZE; i++) {
16 string_data.min[i] = 0;
17 string_data.max[i] = 0xFF;
18 }
19 string_data.max_string_length = 0;
20 string_data.has_max_string_length = false;
21 string_data.has_unicode = true;
22 return result;
23}
24
25BaseStatistics StringStats::CreateEmpty(LogicalType type) {
26 BaseStatistics result(std::move(type));
27 result.InitializeEmpty();
28 auto &string_data = StringStats::GetDataUnsafe(stats&: result);
29 for (idx_t i = 0; i < StringStatsData::MAX_STRING_MINMAX_SIZE; i++) {
30 string_data.min[i] = 0xFF;
31 string_data.max[i] = 0;
32 }
33 string_data.max_string_length = 0;
34 string_data.has_max_string_length = true;
35 string_data.has_unicode = false;
36 return result;
37}
38
39StringStatsData &StringStats::GetDataUnsafe(BaseStatistics &stats) {
40 D_ASSERT(stats.GetStatsType() == StatisticsType::STRING_STATS);
41 return stats.stats_union.string_data;
42}
43
44const StringStatsData &StringStats::GetDataUnsafe(const BaseStatistics &stats) {
45 D_ASSERT(stats.GetStatsType() == StatisticsType::STRING_STATS);
46 return stats.stats_union.string_data;
47}
48
49bool StringStats::HasMaxStringLength(const BaseStatistics &stats) {
50 if (stats.GetType().id() == LogicalTypeId::SQLNULL) {
51 return false;
52 }
53 return StringStats::GetDataUnsafe(stats).has_max_string_length;
54}
55
56uint32_t StringStats::MaxStringLength(const BaseStatistics &stats) {
57 if (!HasMaxStringLength(stats)) {
58 throw InternalException("MaxStringLength called on statistics that does not have a max string length");
59 }
60 return StringStats::GetDataUnsafe(stats).max_string_length;
61}
62
63bool StringStats::CanContainUnicode(const BaseStatistics &stats) {
64 if (stats.GetType().id() == LogicalTypeId::SQLNULL) {
65 return true;
66 }
67 return StringStats::GetDataUnsafe(stats).has_unicode;
68}
69
70void StringStats::ResetMaxStringLength(BaseStatistics &stats) {
71 StringStats::GetDataUnsafe(stats).has_max_string_length = false;
72}
73
74void StringStats::SetContainsUnicode(BaseStatistics &stats) {
75 StringStats::GetDataUnsafe(stats).has_unicode = true;
76}
77
78void StringStats::Serialize(const BaseStatistics &stats, FieldWriter &writer) {
79 auto &string_data = StringStats::GetDataUnsafe(stats);
80 writer.WriteBlob(val: string_data.min, len: StringStatsData::MAX_STRING_MINMAX_SIZE);
81 writer.WriteBlob(val: string_data.max, len: StringStatsData::MAX_STRING_MINMAX_SIZE);
82 writer.WriteField<bool>(element: string_data.has_unicode);
83 writer.WriteField<bool>(element: string_data.has_max_string_length);
84 writer.WriteField<uint32_t>(element: string_data.max_string_length);
85}
86
87BaseStatistics StringStats::Deserialize(FieldReader &reader, LogicalType type) {
88 BaseStatistics result(std::move(type));
89 auto &string_data = StringStats::GetDataUnsafe(stats&: result);
90 reader.ReadBlob(result: string_data.min, read_size: StringStatsData::MAX_STRING_MINMAX_SIZE);
91 reader.ReadBlob(result: string_data.max, read_size: StringStatsData::MAX_STRING_MINMAX_SIZE);
92 string_data.has_unicode = reader.ReadRequired<bool>();
93 string_data.has_max_string_length = reader.ReadRequired<bool>();
94 string_data.max_string_length = reader.ReadRequired<uint32_t>();
95 return result;
96}
97
98static int StringValueComparison(const_data_ptr_t data, idx_t len, const_data_ptr_t comparison) {
99 D_ASSERT(len <= StringStatsData::MAX_STRING_MINMAX_SIZE);
100 for (idx_t i = 0; i < len; i++) {
101 if (data[i] < comparison[i]) {
102 return -1;
103 } else if (data[i] > comparison[i]) {
104 return 1;
105 }
106 }
107 return 0;
108}
109
110static void ConstructValue(const_data_ptr_t data, idx_t size, data_t target[]) {
111 idx_t value_size = size > StringStatsData::MAX_STRING_MINMAX_SIZE ? StringStatsData::MAX_STRING_MINMAX_SIZE : size;
112 memcpy(dest: target, src: data, n: value_size);
113 for (idx_t i = value_size; i < StringStatsData::MAX_STRING_MINMAX_SIZE; i++) {
114 target[i] = '\0';
115 }
116}
117
118void StringStats::Update(BaseStatistics &stats, const string_t &value) {
119 auto data = const_data_ptr_cast(src: value.GetData());
120 auto size = value.GetSize();
121
122 //! we can only fit 8 bytes, so we might need to trim our string
123 // construct the value
124 data_t target[StringStatsData::MAX_STRING_MINMAX_SIZE];
125 ConstructValue(data, size, target);
126
127 // update the min and max
128 auto &string_data = StringStats::GetDataUnsafe(stats);
129 if (StringValueComparison(data: target, len: StringStatsData::MAX_STRING_MINMAX_SIZE, comparison: string_data.min) < 0) {
130 memcpy(dest: string_data.min, src: target, n: StringStatsData::MAX_STRING_MINMAX_SIZE);
131 }
132 if (StringValueComparison(data: target, len: StringStatsData::MAX_STRING_MINMAX_SIZE, comparison: string_data.max) > 0) {
133 memcpy(dest: string_data.max, src: target, n: StringStatsData::MAX_STRING_MINMAX_SIZE);
134 }
135 if (size > string_data.max_string_length) {
136 string_data.max_string_length = size;
137 }
138 if (stats.GetType().id() == LogicalTypeId::VARCHAR && !string_data.has_unicode) {
139 auto unicode = Utf8Proc::Analyze(s: const_char_ptr_cast(src: data), len: size);
140 if (unicode == UnicodeType::UNICODE) {
141 string_data.has_unicode = true;
142 } else if (unicode == UnicodeType::INVALID) {
143 throw InvalidInputException(ErrorManager::InvalidUnicodeError(input: string(const_char_ptr_cast(src: data), size),
144 context: "segment statistics update"));
145 }
146 }
147}
148
149void StringStats::Merge(BaseStatistics &stats, const BaseStatistics &other) {
150 if (other.GetType().id() == LogicalTypeId::VALIDITY) {
151 return;
152 }
153 auto &string_data = StringStats::GetDataUnsafe(stats);
154 auto &other_data = StringStats::GetDataUnsafe(stats: other);
155 if (StringValueComparison(data: other_data.min, len: StringStatsData::MAX_STRING_MINMAX_SIZE, comparison: string_data.min) < 0) {
156 memcpy(dest: string_data.min, src: other_data.min, n: StringStatsData::MAX_STRING_MINMAX_SIZE);
157 }
158 if (StringValueComparison(data: other_data.max, len: StringStatsData::MAX_STRING_MINMAX_SIZE, comparison: string_data.max) > 0) {
159 memcpy(dest: string_data.max, src: other_data.max, n: StringStatsData::MAX_STRING_MINMAX_SIZE);
160 }
161 string_data.has_unicode = string_data.has_unicode || other_data.has_unicode;
162 string_data.has_max_string_length = string_data.has_max_string_length && other_data.has_max_string_length;
163 string_data.max_string_length = MaxValue<uint32_t>(a: string_data.max_string_length, b: other_data.max_string_length);
164}
165
166FilterPropagateResult StringStats::CheckZonemap(const BaseStatistics &stats, ExpressionType comparison_type,
167 const string &constant) {
168 auto &string_data = StringStats::GetDataUnsafe(stats);
169 auto data = const_data_ptr_cast(src: constant.c_str());
170 auto size = constant.size();
171
172 idx_t value_size = size > StringStatsData::MAX_STRING_MINMAX_SIZE ? StringStatsData::MAX_STRING_MINMAX_SIZE : size;
173 int min_comp = StringValueComparison(data, len: value_size, comparison: string_data.min);
174 int max_comp = StringValueComparison(data, len: value_size, comparison: string_data.max);
175 switch (comparison_type) {
176 case ExpressionType::COMPARE_EQUAL:
177 if (min_comp >= 0 && max_comp <= 0) {
178 return FilterPropagateResult::NO_PRUNING_POSSIBLE;
179 } else {
180 return FilterPropagateResult::FILTER_ALWAYS_FALSE;
181 }
182 case ExpressionType::COMPARE_NOTEQUAL:
183 if (min_comp < 0 || max_comp > 0) {
184 return FilterPropagateResult::FILTER_ALWAYS_TRUE;
185 }
186 return FilterPropagateResult::NO_PRUNING_POSSIBLE;
187 case ExpressionType::COMPARE_GREATERTHANOREQUALTO:
188 case ExpressionType::COMPARE_GREATERTHAN:
189 if (max_comp <= 0) {
190 return FilterPropagateResult::NO_PRUNING_POSSIBLE;
191 } else {
192 return FilterPropagateResult::FILTER_ALWAYS_FALSE;
193 }
194 case ExpressionType::COMPARE_LESSTHAN:
195 case ExpressionType::COMPARE_LESSTHANOREQUALTO:
196 if (min_comp >= 0) {
197 return FilterPropagateResult::NO_PRUNING_POSSIBLE;
198 } else {
199 return FilterPropagateResult::FILTER_ALWAYS_FALSE;
200 }
201 default:
202 throw InternalException("Expression type not implemented for string statistics zone map");
203 }
204}
205
206static idx_t GetValidMinMaxSubstring(const_data_ptr_t data) {
207 for (idx_t i = 0; i < StringStatsData::MAX_STRING_MINMAX_SIZE; i++) {
208 if (data[i] == '\0') {
209 return i;
210 }
211 if ((data[i] & 0x80) != 0) {
212 return i;
213 }
214 }
215 return StringStatsData::MAX_STRING_MINMAX_SIZE;
216}
217
218string StringStats::ToString(const BaseStatistics &stats) {
219 auto &string_data = StringStats::GetDataUnsafe(stats);
220 idx_t min_len = GetValidMinMaxSubstring(data: string_data.min);
221 idx_t max_len = GetValidMinMaxSubstring(data: string_data.max);
222 return StringUtil::Format(fmt_str: "[Min: %s, Max: %s, Has Unicode: %s, Max String Length: %s]",
223 params: string(const_char_ptr_cast(src: string_data.min), min_len),
224 params: string(const_char_ptr_cast(src: string_data.max), max_len),
225 params: string_data.has_unicode ? "true" : "false",
226 params: string_data.has_max_string_length ? to_string(val: string_data.max_string_length) : "?");
227}
228
229void StringStats::Verify(const BaseStatistics &stats, Vector &vector, const SelectionVector &sel, idx_t count) {
230 auto &string_data = StringStats::GetDataUnsafe(stats);
231
232 UnifiedVectorFormat vdata;
233 vector.ToUnifiedFormat(count, data&: vdata);
234 auto data = UnifiedVectorFormat::GetData<string_t>(format: vdata);
235 for (idx_t i = 0; i < count; i++) {
236 auto idx = sel.get_index(idx: i);
237 auto index = vdata.sel->get_index(idx);
238 if (!vdata.validity.RowIsValid(row_idx: index)) {
239 continue;
240 }
241 auto value = data[index];
242 auto data = value.GetData();
243 auto len = value.GetSize();
244 // LCOV_EXCL_START
245 if (string_data.has_max_string_length && len > string_data.max_string_length) {
246 throw InternalException(
247 "Statistics mismatch: string value exceeds maximum string length.\nStatistics: %s\nVector: %s",
248 stats.ToString(), vector.ToString(count));
249 }
250 if (stats.GetType().id() == LogicalTypeId::VARCHAR && !string_data.has_unicode) {
251 auto unicode = Utf8Proc::Analyze(s: data, len);
252 if (unicode == UnicodeType::UNICODE) {
253 throw InternalException("Statistics mismatch: string value contains unicode, but statistics says it "
254 "shouldn't.\nStatistics: %s\nVector: %s",
255 stats.ToString(), vector.ToString(count));
256 } else if (unicode == UnicodeType::INVALID) {
257 throw InternalException("Invalid unicode detected in vector: %s", vector.ToString(count));
258 }
259 }
260 if (StringValueComparison(data: const_data_ptr_cast(src: data),
261 len: MinValue<idx_t>(a: len, b: StringStatsData::MAX_STRING_MINMAX_SIZE), comparison: string_data.min) < 0) {
262 throw InternalException("Statistics mismatch: value is smaller than min.\nStatistics: %s\nVector: %s",
263 stats.ToString(), vector.ToString(count));
264 }
265 if (StringValueComparison(data: const_data_ptr_cast(src: data),
266 len: MinValue<idx_t>(a: len, b: StringStatsData::MAX_STRING_MINMAX_SIZE), comparison: string_data.max) > 0) {
267 throw InternalException("Statistics mismatch: value is bigger than max.\nStatistics: %s\nVector: %s",
268 stats.ToString(), vector.ToString(count));
269 }
270 // LCOV_EXCL_STOP
271 }
272}
273
274} // namespace duckdb
275