| 1 | #include "duckdb/storage/statistics/string_stats.hpp" |
| 2 | #include "duckdb/storage/statistics/base_statistics.hpp" |
| 3 | #include "duckdb/common/field_writer.hpp" |
| 4 | #include "utf8proc_wrapper.hpp" |
| 5 | #include "duckdb/common/string_util.hpp" |
| 6 | #include "duckdb/common/types/vector.hpp" |
| 7 | #include "duckdb/main/error_manager.hpp" |
| 8 | |
| 9 | namespace duckdb { |
| 10 | |
| 11 | BaseStatistics StringStats::CreateUnknown(LogicalType type) { |
| 12 | BaseStatistics result(std::move(type)); |
| 13 | result.InitializeUnknown(); |
| 14 | auto &string_data = StringStats::GetDataUnsafe(stats&: result); |
| 15 | for (idx_t i = 0; i < StringStatsData::MAX_STRING_MINMAX_SIZE; i++) { |
| 16 | string_data.min[i] = 0; |
| 17 | string_data.max[i] = 0xFF; |
| 18 | } |
| 19 | string_data.max_string_length = 0; |
| 20 | string_data.has_max_string_length = false; |
| 21 | string_data.has_unicode = true; |
| 22 | return result; |
| 23 | } |
| 24 | |
| 25 | BaseStatistics StringStats::CreateEmpty(LogicalType type) { |
| 26 | BaseStatistics result(std::move(type)); |
| 27 | result.InitializeEmpty(); |
| 28 | auto &string_data = StringStats::GetDataUnsafe(stats&: result); |
| 29 | for (idx_t i = 0; i < StringStatsData::MAX_STRING_MINMAX_SIZE; i++) { |
| 30 | string_data.min[i] = 0xFF; |
| 31 | string_data.max[i] = 0; |
| 32 | } |
| 33 | string_data.max_string_length = 0; |
| 34 | string_data.has_max_string_length = true; |
| 35 | string_data.has_unicode = false; |
| 36 | return result; |
| 37 | } |
| 38 | |
| 39 | StringStatsData &StringStats::GetDataUnsafe(BaseStatistics &stats) { |
| 40 | D_ASSERT(stats.GetStatsType() == StatisticsType::STRING_STATS); |
| 41 | return stats.stats_union.string_data; |
| 42 | } |
| 43 | |
| 44 | const StringStatsData &StringStats::GetDataUnsafe(const BaseStatistics &stats) { |
| 45 | D_ASSERT(stats.GetStatsType() == StatisticsType::STRING_STATS); |
| 46 | return stats.stats_union.string_data; |
| 47 | } |
| 48 | |
| 49 | bool StringStats::HasMaxStringLength(const BaseStatistics &stats) { |
| 50 | if (stats.GetType().id() == LogicalTypeId::SQLNULL) { |
| 51 | return false; |
| 52 | } |
| 53 | return StringStats::GetDataUnsafe(stats).has_max_string_length; |
| 54 | } |
| 55 | |
| 56 | uint32_t StringStats::MaxStringLength(const BaseStatistics &stats) { |
| 57 | if (!HasMaxStringLength(stats)) { |
| 58 | throw InternalException("MaxStringLength called on statistics that does not have a max string length" ); |
| 59 | } |
| 60 | return StringStats::GetDataUnsafe(stats).max_string_length; |
| 61 | } |
| 62 | |
| 63 | bool StringStats::CanContainUnicode(const BaseStatistics &stats) { |
| 64 | if (stats.GetType().id() == LogicalTypeId::SQLNULL) { |
| 65 | return true; |
| 66 | } |
| 67 | return StringStats::GetDataUnsafe(stats).has_unicode; |
| 68 | } |
| 69 | |
| 70 | void StringStats::ResetMaxStringLength(BaseStatistics &stats) { |
| 71 | StringStats::GetDataUnsafe(stats).has_max_string_length = false; |
| 72 | } |
| 73 | |
| 74 | void StringStats::SetContainsUnicode(BaseStatistics &stats) { |
| 75 | StringStats::GetDataUnsafe(stats).has_unicode = true; |
| 76 | } |
| 77 | |
| 78 | void StringStats::Serialize(const BaseStatistics &stats, FieldWriter &writer) { |
| 79 | auto &string_data = StringStats::GetDataUnsafe(stats); |
| 80 | writer.WriteBlob(val: string_data.min, len: StringStatsData::MAX_STRING_MINMAX_SIZE); |
| 81 | writer.WriteBlob(val: string_data.max, len: StringStatsData::MAX_STRING_MINMAX_SIZE); |
| 82 | writer.WriteField<bool>(element: string_data.has_unicode); |
| 83 | writer.WriteField<bool>(element: string_data.has_max_string_length); |
| 84 | writer.WriteField<uint32_t>(element: string_data.max_string_length); |
| 85 | } |
| 86 | |
| 87 | BaseStatistics StringStats::Deserialize(FieldReader &reader, LogicalType type) { |
| 88 | BaseStatistics result(std::move(type)); |
| 89 | auto &string_data = StringStats::GetDataUnsafe(stats&: result); |
| 90 | reader.ReadBlob(result: string_data.min, read_size: StringStatsData::MAX_STRING_MINMAX_SIZE); |
| 91 | reader.ReadBlob(result: string_data.max, read_size: StringStatsData::MAX_STRING_MINMAX_SIZE); |
| 92 | string_data.has_unicode = reader.ReadRequired<bool>(); |
| 93 | string_data.has_max_string_length = reader.ReadRequired<bool>(); |
| 94 | string_data.max_string_length = reader.ReadRequired<uint32_t>(); |
| 95 | return result; |
| 96 | } |
| 97 | |
| 98 | static int StringValueComparison(const_data_ptr_t data, idx_t len, const_data_ptr_t comparison) { |
| 99 | D_ASSERT(len <= StringStatsData::MAX_STRING_MINMAX_SIZE); |
| 100 | for (idx_t i = 0; i < len; i++) { |
| 101 | if (data[i] < comparison[i]) { |
| 102 | return -1; |
| 103 | } else if (data[i] > comparison[i]) { |
| 104 | return 1; |
| 105 | } |
| 106 | } |
| 107 | return 0; |
| 108 | } |
| 109 | |
| 110 | static void ConstructValue(const_data_ptr_t data, idx_t size, data_t target[]) { |
| 111 | idx_t value_size = size > StringStatsData::MAX_STRING_MINMAX_SIZE ? StringStatsData::MAX_STRING_MINMAX_SIZE : size; |
| 112 | memcpy(dest: target, src: data, n: value_size); |
| 113 | for (idx_t i = value_size; i < StringStatsData::MAX_STRING_MINMAX_SIZE; i++) { |
| 114 | target[i] = '\0'; |
| 115 | } |
| 116 | } |
| 117 | |
| 118 | void StringStats::Update(BaseStatistics &stats, const string_t &value) { |
| 119 | auto data = const_data_ptr_cast(src: value.GetData()); |
| 120 | auto size = value.GetSize(); |
| 121 | |
| 122 | //! we can only fit 8 bytes, so we might need to trim our string |
| 123 | // construct the value |
| 124 | data_t target[StringStatsData::MAX_STRING_MINMAX_SIZE]; |
| 125 | ConstructValue(data, size, target); |
| 126 | |
| 127 | // update the min and max |
| 128 | auto &string_data = StringStats::GetDataUnsafe(stats); |
| 129 | if (StringValueComparison(data: target, len: StringStatsData::MAX_STRING_MINMAX_SIZE, comparison: string_data.min) < 0) { |
| 130 | memcpy(dest: string_data.min, src: target, n: StringStatsData::MAX_STRING_MINMAX_SIZE); |
| 131 | } |
| 132 | if (StringValueComparison(data: target, len: StringStatsData::MAX_STRING_MINMAX_SIZE, comparison: string_data.max) > 0) { |
| 133 | memcpy(dest: string_data.max, src: target, n: StringStatsData::MAX_STRING_MINMAX_SIZE); |
| 134 | } |
| 135 | if (size > string_data.max_string_length) { |
| 136 | string_data.max_string_length = size; |
| 137 | } |
| 138 | if (stats.GetType().id() == LogicalTypeId::VARCHAR && !string_data.has_unicode) { |
| 139 | auto unicode = Utf8Proc::Analyze(s: const_char_ptr_cast(src: data), len: size); |
| 140 | if (unicode == UnicodeType::UNICODE) { |
| 141 | string_data.has_unicode = true; |
| 142 | } else if (unicode == UnicodeType::INVALID) { |
| 143 | throw InvalidInputException(ErrorManager::InvalidUnicodeError(input: string(const_char_ptr_cast(src: data), size), |
| 144 | context: "segment statistics update" )); |
| 145 | } |
| 146 | } |
| 147 | } |
| 148 | |
| 149 | void StringStats::Merge(BaseStatistics &stats, const BaseStatistics &other) { |
| 150 | if (other.GetType().id() == LogicalTypeId::VALIDITY) { |
| 151 | return; |
| 152 | } |
| 153 | auto &string_data = StringStats::GetDataUnsafe(stats); |
| 154 | auto &other_data = StringStats::GetDataUnsafe(stats: other); |
| 155 | if (StringValueComparison(data: other_data.min, len: StringStatsData::MAX_STRING_MINMAX_SIZE, comparison: string_data.min) < 0) { |
| 156 | memcpy(dest: string_data.min, src: other_data.min, n: StringStatsData::MAX_STRING_MINMAX_SIZE); |
| 157 | } |
| 158 | if (StringValueComparison(data: other_data.max, len: StringStatsData::MAX_STRING_MINMAX_SIZE, comparison: string_data.max) > 0) { |
| 159 | memcpy(dest: string_data.max, src: other_data.max, n: StringStatsData::MAX_STRING_MINMAX_SIZE); |
| 160 | } |
| 161 | string_data.has_unicode = string_data.has_unicode || other_data.has_unicode; |
| 162 | string_data.has_max_string_length = string_data.has_max_string_length && other_data.has_max_string_length; |
| 163 | string_data.max_string_length = MaxValue<uint32_t>(a: string_data.max_string_length, b: other_data.max_string_length); |
| 164 | } |
| 165 | |
| 166 | FilterPropagateResult StringStats::CheckZonemap(const BaseStatistics &stats, ExpressionType comparison_type, |
| 167 | const string &constant) { |
| 168 | auto &string_data = StringStats::GetDataUnsafe(stats); |
| 169 | auto data = const_data_ptr_cast(src: constant.c_str()); |
| 170 | auto size = constant.size(); |
| 171 | |
| 172 | idx_t value_size = size > StringStatsData::MAX_STRING_MINMAX_SIZE ? StringStatsData::MAX_STRING_MINMAX_SIZE : size; |
| 173 | int min_comp = StringValueComparison(data, len: value_size, comparison: string_data.min); |
| 174 | int max_comp = StringValueComparison(data, len: value_size, comparison: string_data.max); |
| 175 | switch (comparison_type) { |
| 176 | case ExpressionType::COMPARE_EQUAL: |
| 177 | if (min_comp >= 0 && max_comp <= 0) { |
| 178 | return FilterPropagateResult::NO_PRUNING_POSSIBLE; |
| 179 | } else { |
| 180 | return FilterPropagateResult::FILTER_ALWAYS_FALSE; |
| 181 | } |
| 182 | case ExpressionType::COMPARE_NOTEQUAL: |
| 183 | if (min_comp < 0 || max_comp > 0) { |
| 184 | return FilterPropagateResult::FILTER_ALWAYS_TRUE; |
| 185 | } |
| 186 | return FilterPropagateResult::NO_PRUNING_POSSIBLE; |
| 187 | case ExpressionType::COMPARE_GREATERTHANOREQUALTO: |
| 188 | case ExpressionType::COMPARE_GREATERTHAN: |
| 189 | if (max_comp <= 0) { |
| 190 | return FilterPropagateResult::NO_PRUNING_POSSIBLE; |
| 191 | } else { |
| 192 | return FilterPropagateResult::FILTER_ALWAYS_FALSE; |
| 193 | } |
| 194 | case ExpressionType::COMPARE_LESSTHAN: |
| 195 | case ExpressionType::COMPARE_LESSTHANOREQUALTO: |
| 196 | if (min_comp >= 0) { |
| 197 | return FilterPropagateResult::NO_PRUNING_POSSIBLE; |
| 198 | } else { |
| 199 | return FilterPropagateResult::FILTER_ALWAYS_FALSE; |
| 200 | } |
| 201 | default: |
| 202 | throw InternalException("Expression type not implemented for string statistics zone map" ); |
| 203 | } |
| 204 | } |
| 205 | |
| 206 | static idx_t GetValidMinMaxSubstring(const_data_ptr_t data) { |
| 207 | for (idx_t i = 0; i < StringStatsData::MAX_STRING_MINMAX_SIZE; i++) { |
| 208 | if (data[i] == '\0') { |
| 209 | return i; |
| 210 | } |
| 211 | if ((data[i] & 0x80) != 0) { |
| 212 | return i; |
| 213 | } |
| 214 | } |
| 215 | return StringStatsData::MAX_STRING_MINMAX_SIZE; |
| 216 | } |
| 217 | |
| 218 | string StringStats::ToString(const BaseStatistics &stats) { |
| 219 | auto &string_data = StringStats::GetDataUnsafe(stats); |
| 220 | idx_t min_len = GetValidMinMaxSubstring(data: string_data.min); |
| 221 | idx_t max_len = GetValidMinMaxSubstring(data: string_data.max); |
| 222 | return StringUtil::Format(fmt_str: "[Min: %s, Max: %s, Has Unicode: %s, Max String Length: %s]" , |
| 223 | params: string(const_char_ptr_cast(src: string_data.min), min_len), |
| 224 | params: string(const_char_ptr_cast(src: string_data.max), max_len), |
| 225 | params: string_data.has_unicode ? "true" : "false" , |
| 226 | params: string_data.has_max_string_length ? to_string(val: string_data.max_string_length) : "?" ); |
| 227 | } |
| 228 | |
| 229 | void StringStats::Verify(const BaseStatistics &stats, Vector &vector, const SelectionVector &sel, idx_t count) { |
| 230 | auto &string_data = StringStats::GetDataUnsafe(stats); |
| 231 | |
| 232 | UnifiedVectorFormat vdata; |
| 233 | vector.ToUnifiedFormat(count, data&: vdata); |
| 234 | auto data = UnifiedVectorFormat::GetData<string_t>(format: vdata); |
| 235 | for (idx_t i = 0; i < count; i++) { |
| 236 | auto idx = sel.get_index(idx: i); |
| 237 | auto index = vdata.sel->get_index(idx); |
| 238 | if (!vdata.validity.RowIsValid(row_idx: index)) { |
| 239 | continue; |
| 240 | } |
| 241 | auto value = data[index]; |
| 242 | auto data = value.GetData(); |
| 243 | auto len = value.GetSize(); |
| 244 | // LCOV_EXCL_START |
| 245 | if (string_data.has_max_string_length && len > string_data.max_string_length) { |
| 246 | throw InternalException( |
| 247 | "Statistics mismatch: string value exceeds maximum string length.\nStatistics: %s\nVector: %s" , |
| 248 | stats.ToString(), vector.ToString(count)); |
| 249 | } |
| 250 | if (stats.GetType().id() == LogicalTypeId::VARCHAR && !string_data.has_unicode) { |
| 251 | auto unicode = Utf8Proc::Analyze(s: data, len); |
| 252 | if (unicode == UnicodeType::UNICODE) { |
| 253 | throw InternalException("Statistics mismatch: string value contains unicode, but statistics says it " |
| 254 | "shouldn't.\nStatistics: %s\nVector: %s" , |
| 255 | stats.ToString(), vector.ToString(count)); |
| 256 | } else if (unicode == UnicodeType::INVALID) { |
| 257 | throw InternalException("Invalid unicode detected in vector: %s" , vector.ToString(count)); |
| 258 | } |
| 259 | } |
| 260 | if (StringValueComparison(data: const_data_ptr_cast(src: data), |
| 261 | len: MinValue<idx_t>(a: len, b: StringStatsData::MAX_STRING_MINMAX_SIZE), comparison: string_data.min) < 0) { |
| 262 | throw InternalException("Statistics mismatch: value is smaller than min.\nStatistics: %s\nVector: %s" , |
| 263 | stats.ToString(), vector.ToString(count)); |
| 264 | } |
| 265 | if (StringValueComparison(data: const_data_ptr_cast(src: data), |
| 266 | len: MinValue<idx_t>(a: len, b: StringStatsData::MAX_STRING_MINMAX_SIZE), comparison: string_data.max) > 0) { |
| 267 | throw InternalException("Statistics mismatch: value is bigger than max.\nStatistics: %s\nVector: %s" , |
| 268 | stats.ToString(), vector.ToString(count)); |
| 269 | } |
| 270 | // LCOV_EXCL_STOP |
| 271 | } |
| 272 | } |
| 273 | |
| 274 | } // namespace duckdb |
| 275 | |