1 | #include "duckdb/common/exception.hpp" |
2 | #include "duckdb/common/vector_operations/vector_operations.hpp" |
3 | #include "duckdb/function/scalar/string_functions.hpp" |
4 | #include "duckdb/planner/expression/bound_function_expression.hpp" |
5 | |
6 | #include "duckdb/execution/expression_executor.hpp" |
7 | |
8 | namespace duckdb { |
9 | |
10 | struct StandardCharacterReader { |
11 | static char Operation(const char *data, idx_t pos) { |
12 | return data[pos]; |
13 | } |
14 | }; |
15 | |
16 | struct ASCIILCaseReader { |
17 | static char Operation(const char *data, idx_t pos) { |
18 | return (char)LowerFun::ascii_to_lower_map[(uint8_t)data[pos]]; |
19 | } |
20 | }; |
21 | |
22 | template <char PERCENTAGE, char UNDERSCORE, bool HAS_ESCAPE, class READER = StandardCharacterReader> |
23 | bool TemplatedLikeOperator(const char *sdata, idx_t slen, const char *pdata, idx_t plen, char escape) { |
24 | idx_t pidx = 0; |
25 | idx_t sidx = 0; |
26 | for (; pidx < plen && sidx < slen; pidx++) { |
27 | char pchar = READER::Operation(pdata, pidx); |
28 | char schar = READER::Operation(sdata, sidx); |
29 | if (HAS_ESCAPE && pchar == escape) { |
30 | pidx++; |
31 | if (pidx == plen) { |
32 | throw SyntaxException("Like pattern must not end with escape character!" ); |
33 | } |
34 | if (pdata[pidx] != schar) { |
35 | return false; |
36 | } |
37 | sidx++; |
38 | } else if (pchar == UNDERSCORE) { |
39 | sidx++; |
40 | } else if (pchar == PERCENTAGE) { |
41 | pidx++; |
42 | while (pidx < plen && pdata[pidx] == PERCENTAGE) { |
43 | pidx++; |
44 | } |
45 | if (pidx == plen) { |
46 | return true; /* tail is acceptable */ |
47 | } |
48 | for (; sidx < slen; sidx++) { |
49 | if (TemplatedLikeOperator<PERCENTAGE, UNDERSCORE, HAS_ESCAPE, READER>( |
50 | sdata + sidx, slen - sidx, pdata + pidx, plen - pidx, escape)) { |
51 | return true; |
52 | } |
53 | } |
54 | return false; |
55 | } else if (pchar == schar) { |
56 | sidx++; |
57 | } else { |
58 | return false; |
59 | } |
60 | } |
61 | while (pidx < plen && pdata[pidx] == PERCENTAGE) { |
62 | pidx++; |
63 | } |
64 | return pidx == plen && sidx == slen; |
65 | } |
66 | |
67 | struct LikeSegment { |
68 | explicit LikeSegment(string pattern) : pattern(std::move(pattern)) { |
69 | } |
70 | |
71 | string pattern; |
72 | }; |
73 | |
74 | struct LikeMatcher : public FunctionData { |
75 | LikeMatcher(string like_pattern_p, vector<LikeSegment> segments, bool has_start_percentage, bool has_end_percentage) |
76 | : like_pattern(std::move(like_pattern_p)), segments(std::move(segments)), |
77 | has_start_percentage(has_start_percentage), has_end_percentage(has_end_percentage) { |
78 | } |
79 | |
80 | bool Match(string_t &str) { |
81 | auto str_data = const_uchar_ptr_cast(src: str.GetData()); |
82 | auto str_len = str.GetSize(); |
83 | idx_t segment_idx = 0; |
84 | idx_t end_idx = segments.size() - 1; |
85 | if (!has_start_percentage) { |
86 | // no start sample_size: match the first part of the string directly |
87 | auto &segment = segments[0]; |
88 | if (str_len < segment.pattern.size()) { |
89 | return false; |
90 | } |
91 | if (memcmp(s1: str_data, s2: segment.pattern.c_str(), n: segment.pattern.size()) != 0) { |
92 | return false; |
93 | } |
94 | str_data += segment.pattern.size(); |
95 | str_len -= segment.pattern.size(); |
96 | segment_idx++; |
97 | if (segments.size() == 1) { |
98 | // only one segment, and it matches |
99 | // we have a match if there is an end sample_size, OR if the memcmp was an exact match (remaining str is |
100 | // empty) |
101 | return has_end_percentage || str_len == 0; |
102 | } |
103 | } |
104 | // main match loop: for every segment in the middle, use Contains to find the needle in the haystack |
105 | for (; segment_idx < end_idx; segment_idx++) { |
106 | auto &segment = segments[segment_idx]; |
107 | // find the pattern of the current segment |
108 | idx_t next_offset = ContainsFun::Find(haystack: str_data, haystack_size: str_len, needle: const_uchar_ptr_cast(src: segment.pattern.c_str()), |
109 | needle_size: segment.pattern.size()); |
110 | if (next_offset == DConstants::INVALID_INDEX) { |
111 | // could not find this pattern in the string: no match |
112 | return false; |
113 | } |
114 | idx_t offset = next_offset + segment.pattern.size(); |
115 | str_data += offset; |
116 | str_len -= offset; |
117 | } |
118 | if (!has_end_percentage) { |
119 | end_idx--; |
120 | // no end sample_size: match the final segment now |
121 | auto &segment = segments.back(); |
122 | if (str_len < segment.pattern.size()) { |
123 | return false; |
124 | } |
125 | if (memcmp(s1: str_data + str_len - segment.pattern.size(), s2: segment.pattern.c_str(), n: segment.pattern.size()) != |
126 | 0) { |
127 | return false; |
128 | } |
129 | return true; |
130 | } else { |
131 | auto &segment = segments.back(); |
132 | // find the pattern of the current segment |
133 | idx_t next_offset = ContainsFun::Find(haystack: str_data, haystack_size: str_len, needle: const_uchar_ptr_cast(src: segment.pattern.c_str()), |
134 | needle_size: segment.pattern.size()); |
135 | return next_offset != DConstants::INVALID_INDEX; |
136 | } |
137 | } |
138 | |
139 | static unique_ptr<LikeMatcher> CreateLikeMatcher(string like_pattern, char escape = '\0') { |
140 | vector<LikeSegment> segments; |
141 | idx_t last_non_pattern = 0; |
142 | bool has_start_percentage = false; |
143 | bool has_end_percentage = false; |
144 | for (idx_t i = 0; i < like_pattern.size(); i++) { |
145 | auto ch = like_pattern[i]; |
146 | if (ch == escape || ch == '%' || ch == '_') { |
147 | // special character, push a constant pattern |
148 | if (i > last_non_pattern) { |
149 | segments.emplace_back(args: like_pattern.substr(pos: last_non_pattern, n: i - last_non_pattern)); |
150 | } |
151 | last_non_pattern = i + 1; |
152 | if (ch == escape || ch == '_') { |
153 | // escape or underscore: could not create efficient like matcher |
154 | // FIXME: we could handle escaped percentages here |
155 | return nullptr; |
156 | } else { |
157 | // sample_size |
158 | if (i == 0) { |
159 | has_start_percentage = true; |
160 | } |
161 | if (i + 1 == like_pattern.size()) { |
162 | has_end_percentage = true; |
163 | } |
164 | } |
165 | } |
166 | } |
167 | if (last_non_pattern < like_pattern.size()) { |
168 | segments.emplace_back(args: like_pattern.substr(pos: last_non_pattern, n: like_pattern.size() - last_non_pattern)); |
169 | } |
170 | if (segments.empty()) { |
171 | return nullptr; |
172 | } |
173 | return make_uniq<LikeMatcher>(args: std::move(like_pattern), args: std::move(segments), args&: has_start_percentage, |
174 | args&: has_end_percentage); |
175 | } |
176 | |
177 | unique_ptr<FunctionData> Copy() const override { |
178 | return make_uniq<LikeMatcher>(args: like_pattern, args: segments, args: has_start_percentage, args: has_end_percentage); |
179 | } |
180 | |
181 | bool Equals(const FunctionData &other_p) const override { |
182 | auto &other = other_p.Cast<LikeMatcher>(); |
183 | return like_pattern == other.like_pattern; |
184 | } |
185 | |
186 | private: |
187 | string like_pattern; |
188 | vector<LikeSegment> segments; |
189 | bool has_start_percentage; |
190 | bool has_end_percentage; |
191 | }; |
192 | |
193 | static unique_ptr<FunctionData> LikeBindFunction(ClientContext &context, ScalarFunction &bound_function, |
194 | vector<unique_ptr<Expression>> &arguments) { |
195 | // pattern is the second argument. If its constant, we can already prepare the pattern and store it for later. |
196 | D_ASSERT(arguments.size() == 2 || arguments.size() == 3); |
197 | if (arguments[1]->IsFoldable()) { |
198 | Value pattern_str = ExpressionExecutor::EvaluateScalar(context, expr: *arguments[1]); |
199 | if (pattern_str.IsNull()) { |
200 | return nullptr; |
201 | } |
202 | return LikeMatcher::CreateLikeMatcher(like_pattern: pattern_str.ToString()); |
203 | } |
204 | return nullptr; |
205 | } |
206 | |
207 | bool LikeOperatorFunction(const char *s, idx_t slen, const char *pattern, idx_t plen, char escape) { |
208 | return TemplatedLikeOperator<'%', '_', true>(sdata: s, slen, pdata: pattern, plen, escape); |
209 | } |
210 | |
211 | bool LikeOperatorFunction(const char *s, idx_t slen, const char *pattern, idx_t plen) { |
212 | return TemplatedLikeOperator<'%', '_', false>(sdata: s, slen, pdata: pattern, plen, escape: '\0'); |
213 | } |
214 | |
215 | bool LikeOperatorFunction(string_t &s, string_t &pat) { |
216 | return LikeOperatorFunction(s: s.GetData(), slen: s.GetSize(), pattern: pat.GetData(), plen: pat.GetSize()); |
217 | } |
218 | |
219 | bool LikeOperatorFunction(string_t &s, string_t &pat, char escape) { |
220 | return LikeOperatorFunction(s: s.GetData(), slen: s.GetSize(), pattern: pat.GetData(), plen: pat.GetSize(), escape); |
221 | } |
222 | |
223 | bool LikeFun::Glob(const char *string, idx_t slen, const char *pattern, idx_t plen, bool allow_question_mark) { |
224 | idx_t sidx = 0; |
225 | idx_t pidx = 0; |
226 | main_loop : { |
227 | // main matching loop |
228 | while (sidx < slen && pidx < plen) { |
229 | char s = string[sidx]; |
230 | char p = pattern[pidx]; |
231 | switch (p) { |
232 | case '*': { |
233 | // asterisk: match any set of characters |
234 | // skip any subsequent asterisks |
235 | pidx++; |
236 | while (pidx < plen && pattern[pidx] == '*') { |
237 | pidx++; |
238 | } |
239 | // if the asterisk is the last character, the pattern always matches |
240 | if (pidx == plen) { |
241 | return true; |
242 | } |
243 | // recursively match the remainder of the pattern |
244 | for (; sidx < slen; sidx++) { |
245 | if (LikeFun::Glob(string: string + sidx, slen: slen - sidx, pattern: pattern + pidx, plen: plen - pidx)) { |
246 | return true; |
247 | } |
248 | } |
249 | return false; |
250 | } |
251 | case '?': |
252 | // when enabled: matches anything but null |
253 | if (allow_question_mark) { |
254 | break; |
255 | } |
256 | DUCKDB_EXPLICIT_FALLTHROUGH; |
257 | case '[': |
258 | pidx++; |
259 | goto parse_bracket; |
260 | case '\\': |
261 | // escape character, next character needs to match literally |
262 | pidx++; |
263 | // check that we still have a character remaining |
264 | if (pidx == plen) { |
265 | return false; |
266 | } |
267 | p = pattern[pidx]; |
268 | if (s != p) { |
269 | return false; |
270 | } |
271 | break; |
272 | default: |
273 | // not a control character: characters need to match literally |
274 | if (s != p) { |
275 | return false; |
276 | } |
277 | break; |
278 | } |
279 | sidx++; |
280 | pidx++; |
281 | } |
282 | while (pidx < plen && pattern[pidx] == '*') { |
283 | pidx++; |
284 | } |
285 | // we are finished only if we have consumed the full pattern |
286 | return pidx == plen && sidx == slen; |
287 | } |
288 | parse_bracket : { |
289 | // inside a bracket |
290 | if (pidx == plen) { |
291 | return false; |
292 | } |
293 | // check the first character |
294 | // if it is an exclamation mark we need to invert our logic |
295 | char p = pattern[pidx]; |
296 | char s = string[sidx]; |
297 | bool invert = false; |
298 | if (p == '!') { |
299 | invert = true; |
300 | pidx++; |
301 | } |
302 | bool found_match = invert; |
303 | idx_t start_pos = pidx; |
304 | bool found_closing_bracket = false; |
305 | // now check the remainder of the pattern |
306 | while (pidx < plen) { |
307 | p = pattern[pidx]; |
308 | // if the first character is a closing bracket, we match it literally |
309 | // otherwise it indicates an end of bracket |
310 | if (p == ']' && pidx > start_pos) { |
311 | // end of bracket found: we are done |
312 | found_closing_bracket = true; |
313 | pidx++; |
314 | break; |
315 | } |
316 | // we either match a range (a-b) or a single character (a) |
317 | // check if the next character is a dash |
318 | if (pidx + 1 == plen) { |
319 | // no next character! |
320 | break; |
321 | } |
322 | bool matches; |
323 | if (pattern[pidx + 1] == '-') { |
324 | // range! find the next character in the range |
325 | if (pidx + 2 == plen) { |
326 | break; |
327 | } |
328 | char next_char = pattern[pidx + 2]; |
329 | // check if the current character is within the range |
330 | matches = s >= p && s <= next_char; |
331 | // shift the pattern forward past the range |
332 | pidx += 3; |
333 | } else { |
334 | // no range! perform a direct match |
335 | matches = p == s; |
336 | // shift the pattern forward past the character |
337 | pidx++; |
338 | } |
339 | if (found_match == invert && matches) { |
340 | // found a match! set the found_matches flag |
341 | // we keep on pattern matching after this until we reach the end bracket |
342 | // however, we don't need to update the found_match flag anymore |
343 | found_match = !invert; |
344 | } |
345 | } |
346 | if (!found_closing_bracket) { |
347 | // no end of bracket: invalid pattern |
348 | return false; |
349 | } |
350 | if (!found_match) { |
351 | // did not match the bracket: return false; |
352 | return false; |
353 | } |
354 | // finished the bracket matching: move forward |
355 | sidx++; |
356 | goto main_loop; |
357 | } |
358 | } |
359 | |
360 | static char GetEscapeChar(string_t escape) { |
361 | // Only one escape character should be allowed |
362 | if (escape.GetSize() > 1) { |
363 | throw SyntaxException("Invalid escape string. Escape string must be empty or one character." ); |
364 | } |
365 | return escape.GetSize() == 0 ? '\0' : *escape.GetData(); |
366 | } |
367 | |
368 | struct LikeEscapeOperator { |
369 | template <class TA, class TB, class TC> |
370 | static inline bool Operation(TA str, TB pattern, TC escape) { |
371 | char escape_char = GetEscapeChar(escape); |
372 | return LikeOperatorFunction(str.GetData(), str.GetSize(), pattern.GetData(), pattern.GetSize(), escape_char); |
373 | } |
374 | }; |
375 | |
376 | struct NotLikeEscapeOperator { |
377 | template <class TA, class TB, class TC> |
378 | static inline bool Operation(TA str, TB pattern, TC escape) { |
379 | return !LikeEscapeOperator::Operation(str, pattern, escape); |
380 | } |
381 | }; |
382 | |
383 | struct LikeOperator { |
384 | template <class TA, class TB, class TR> |
385 | static inline TR Operation(TA str, TB pattern) { |
386 | return LikeOperatorFunction(str, pattern); |
387 | } |
388 | }; |
389 | |
390 | bool ILikeOperatorFunction(string_t &str, string_t &pattern, char escape = '\0') { |
391 | auto str_data = str.GetData(); |
392 | auto str_size = str.GetSize(); |
393 | auto pat_data = pattern.GetData(); |
394 | auto pat_size = pattern.GetSize(); |
395 | |
396 | // lowercase both the str and the pattern |
397 | idx_t str_llength = LowerFun::LowerLength(input_data: str_data, input_length: str_size); |
398 | auto str_ldata = make_unsafe_uniq_array<char>(n: str_llength); |
399 | LowerFun::LowerCase(input_data: str_data, input_length: str_size, result_data: str_ldata.get()); |
400 | |
401 | idx_t pat_llength = LowerFun::LowerLength(input_data: pat_data, input_length: pat_size); |
402 | auto pat_ldata = make_unsafe_uniq_array<char>(n: pat_llength); |
403 | LowerFun::LowerCase(input_data: pat_data, input_length: pat_size, result_data: pat_ldata.get()); |
404 | string_t str_lcase(str_ldata.get(), str_llength); |
405 | string_t pat_lcase(pat_ldata.get(), pat_llength); |
406 | return LikeOperatorFunction(s&: str_lcase, pat&: pat_lcase, escape); |
407 | } |
408 | |
409 | struct ILikeEscapeOperator { |
410 | template <class TA, class TB, class TC> |
411 | static inline bool Operation(TA str, TB pattern, TC escape) { |
412 | char escape_char = GetEscapeChar(escape); |
413 | return ILikeOperatorFunction(str, pattern, escape_char); |
414 | } |
415 | }; |
416 | |
417 | struct NotILikeEscapeOperator { |
418 | template <class TA, class TB, class TC> |
419 | static inline bool Operation(TA str, TB pattern, TC escape) { |
420 | return !ILikeEscapeOperator::Operation(str, pattern, escape); |
421 | } |
422 | }; |
423 | |
424 | struct ILikeOperator { |
425 | template <class TA, class TB, class TR> |
426 | static inline TR Operation(TA str, TB pattern) { |
427 | return ILikeOperatorFunction(str, pattern); |
428 | } |
429 | }; |
430 | |
431 | struct NotLikeOperator { |
432 | template <class TA, class TB, class TR> |
433 | static inline TR Operation(TA str, TB pattern) { |
434 | return !LikeOperatorFunction(str, pattern); |
435 | } |
436 | }; |
437 | |
438 | struct NotILikeOperator { |
439 | template <class TA, class TB, class TR> |
440 | static inline TR Operation(TA str, TB pattern) { |
441 | return !ILikeOperator::Operation<TA, TB, TR>(str, pattern); |
442 | } |
443 | }; |
444 | |
445 | struct ILikeOperatorASCII { |
446 | template <class TA, class TB, class TR> |
447 | static inline TR Operation(TA str, TB pattern) { |
448 | return TemplatedLikeOperator<'%', '_', false, ASCIILCaseReader>(str.GetData(), str.GetSize(), pattern.GetData(), |
449 | pattern.GetSize(), '\0'); |
450 | } |
451 | }; |
452 | |
453 | struct NotILikeOperatorASCII { |
454 | template <class TA, class TB, class TR> |
455 | static inline TR Operation(TA str, TB pattern) { |
456 | return !ILikeOperatorASCII::Operation<TA, TB, TR>(str, pattern); |
457 | } |
458 | }; |
459 | |
460 | struct GlobOperator { |
461 | template <class TA, class TB, class TR> |
462 | static inline TR Operation(TA str, TB pattern) { |
463 | return LikeFun::Glob(string: str.GetData(), slen: str.GetSize(), pattern: pattern.GetData(), plen: pattern.GetSize()); |
464 | } |
465 | }; |
466 | |
467 | // This can be moved to the scalar_function class |
468 | template <typename FUNC> |
469 | static void LikeEscapeFunction(DataChunk &args, ExpressionState &state, Vector &result) { |
470 | auto &str = args.data[0]; |
471 | auto &pattern = args.data[1]; |
472 | auto &escape = args.data[2]; |
473 | |
474 | TernaryExecutor::Execute<string_t, string_t, string_t, bool>( |
475 | str, pattern, escape, result, args.size(), FUNC::template Operation<string_t, string_t, string_t>); |
476 | } |
477 | |
478 | template <class ASCII_OP> |
479 | static unique_ptr<BaseStatistics> ILikePropagateStats(ClientContext &context, FunctionStatisticsInput &input) { |
480 | auto &child_stats = input.child_stats; |
481 | auto &expr = input.expr; |
482 | D_ASSERT(child_stats.size() >= 1); |
483 | // can only propagate stats if the children have stats |
484 | if (!StringStats::CanContainUnicode(stats: child_stats[0])) { |
485 | expr.function.function = ScalarFunction::BinaryFunction<string_t, string_t, bool, ASCII_OP>; |
486 | } |
487 | return nullptr; |
488 | } |
489 | |
490 | template <class OP, bool INVERT> |
491 | static void RegularLikeFunction(DataChunk &input, ExpressionState &state, Vector &result) { |
492 | auto &func_expr = state.expr.Cast<BoundFunctionExpression>(); |
493 | if (func_expr.bind_info) { |
494 | auto &matcher = func_expr.bind_info->Cast<LikeMatcher>(); |
495 | // use fast like matcher |
496 | UnaryExecutor::Execute<string_t, bool>(input.data[0], result, input.size(), [&](string_t input) { |
497 | return INVERT ? !matcher.Match(str&: input) : matcher.Match(str&: input); |
498 | }); |
499 | } else { |
500 | // use generic like matcher |
501 | BinaryExecutor::ExecuteStandard<string_t, string_t, bool, OP>(input.data[0], input.data[1], result, |
502 | input.size()); |
503 | } |
504 | } |
505 | void LikeFun::RegisterFunction(BuiltinFunctions &set) { |
506 | // like |
507 | set.AddFunction(function: ScalarFunction("~~" , {LogicalType::VARCHAR, LogicalType::VARCHAR}, LogicalType::BOOLEAN, |
508 | RegularLikeFunction<LikeOperator, false>, LikeBindFunction)); |
509 | // not like |
510 | set.AddFunction(function: ScalarFunction("!~~" , {LogicalType::VARCHAR, LogicalType::VARCHAR}, LogicalType::BOOLEAN, |
511 | RegularLikeFunction<NotLikeOperator, true>, LikeBindFunction)); |
512 | // glob |
513 | set.AddFunction(function: ScalarFunction("~~~" , {LogicalType::VARCHAR, LogicalType::VARCHAR}, LogicalType::BOOLEAN, |
514 | ScalarFunction::BinaryFunction<string_t, string_t, bool, GlobOperator>)); |
515 | // ilike |
516 | set.AddFunction(function: ScalarFunction("~~*" , {LogicalType::VARCHAR, LogicalType::VARCHAR}, LogicalType::BOOLEAN, |
517 | ScalarFunction::BinaryFunction<string_t, string_t, bool, ILikeOperator>, nullptr, |
518 | nullptr, ILikePropagateStats<ILikeOperatorASCII>)); |
519 | // not ilike |
520 | set.AddFunction(function: ScalarFunction("!~~*" , {LogicalType::VARCHAR, LogicalType::VARCHAR}, LogicalType::BOOLEAN, |
521 | ScalarFunction::BinaryFunction<string_t, string_t, bool, NotILikeOperator>, nullptr, |
522 | nullptr, ILikePropagateStats<NotILikeOperatorASCII>)); |
523 | } |
524 | |
525 | void LikeEscapeFun::RegisterFunction(BuiltinFunctions &set) { |
526 | set.AddFunction(names: {"like_escape" }, function: ScalarFunction({LogicalType::VARCHAR, LogicalType::VARCHAR, LogicalType::VARCHAR}, |
527 | LogicalType::BOOLEAN, LikeEscapeFunction<LikeEscapeOperator>)); |
528 | set.AddFunction(names: {"not_like_escape" }, |
529 | function: ScalarFunction({LogicalType::VARCHAR, LogicalType::VARCHAR, LogicalType::VARCHAR}, |
530 | LogicalType::BOOLEAN, LikeEscapeFunction<NotLikeEscapeOperator>)); |
531 | |
532 | set.AddFunction(names: {"ilike_escape" }, function: ScalarFunction({LogicalType::VARCHAR, LogicalType::VARCHAR, LogicalType::VARCHAR}, |
533 | LogicalType::BOOLEAN, LikeEscapeFunction<ILikeEscapeOperator>)); |
534 | set.AddFunction(names: {"not_ilike_escape" }, |
535 | function: ScalarFunction({LogicalType::VARCHAR, LogicalType::VARCHAR, LogicalType::VARCHAR}, |
536 | LogicalType::BOOLEAN, LikeEscapeFunction<NotILikeEscapeOperator>)); |
537 | } |
538 | } // namespace duckdb |
539 | |