1#include "duckdb/common/exception.hpp"
2#include "duckdb/common/vector_operations/vector_operations.hpp"
3#include "duckdb/function/scalar/string_functions.hpp"
4#include "duckdb/planner/expression/bound_function_expression.hpp"
5
6#include "duckdb/execution/expression_executor.hpp"
7
8namespace duckdb {
9
10struct StandardCharacterReader {
11 static char Operation(const char *data, idx_t pos) {
12 return data[pos];
13 }
14};
15
16struct ASCIILCaseReader {
17 static char Operation(const char *data, idx_t pos) {
18 return (char)LowerFun::ascii_to_lower_map[(uint8_t)data[pos]];
19 }
20};
21
22template <char PERCENTAGE, char UNDERSCORE, bool HAS_ESCAPE, class READER = StandardCharacterReader>
23bool TemplatedLikeOperator(const char *sdata, idx_t slen, const char *pdata, idx_t plen, char escape) {
24 idx_t pidx = 0;
25 idx_t sidx = 0;
26 for (; pidx < plen && sidx < slen; pidx++) {
27 char pchar = READER::Operation(pdata, pidx);
28 char schar = READER::Operation(sdata, sidx);
29 if (HAS_ESCAPE && pchar == escape) {
30 pidx++;
31 if (pidx == plen) {
32 throw SyntaxException("Like pattern must not end with escape character!");
33 }
34 if (pdata[pidx] != schar) {
35 return false;
36 }
37 sidx++;
38 } else if (pchar == UNDERSCORE) {
39 sidx++;
40 } else if (pchar == PERCENTAGE) {
41 pidx++;
42 while (pidx < plen && pdata[pidx] == PERCENTAGE) {
43 pidx++;
44 }
45 if (pidx == plen) {
46 return true; /* tail is acceptable */
47 }
48 for (; sidx < slen; sidx++) {
49 if (TemplatedLikeOperator<PERCENTAGE, UNDERSCORE, HAS_ESCAPE, READER>(
50 sdata + sidx, slen - sidx, pdata + pidx, plen - pidx, escape)) {
51 return true;
52 }
53 }
54 return false;
55 } else if (pchar == schar) {
56 sidx++;
57 } else {
58 return false;
59 }
60 }
61 while (pidx < plen && pdata[pidx] == PERCENTAGE) {
62 pidx++;
63 }
64 return pidx == plen && sidx == slen;
65}
66
67struct LikeSegment {
68 explicit LikeSegment(string pattern) : pattern(std::move(pattern)) {
69 }
70
71 string pattern;
72};
73
74struct LikeMatcher : public FunctionData {
75 LikeMatcher(string like_pattern_p, vector<LikeSegment> segments, bool has_start_percentage, bool has_end_percentage)
76 : like_pattern(std::move(like_pattern_p)), segments(std::move(segments)),
77 has_start_percentage(has_start_percentage), has_end_percentage(has_end_percentage) {
78 }
79
80 bool Match(string_t &str) {
81 auto str_data = const_uchar_ptr_cast(src: str.GetData());
82 auto str_len = str.GetSize();
83 idx_t segment_idx = 0;
84 idx_t end_idx = segments.size() - 1;
85 if (!has_start_percentage) {
86 // no start sample_size: match the first part of the string directly
87 auto &segment = segments[0];
88 if (str_len < segment.pattern.size()) {
89 return false;
90 }
91 if (memcmp(s1: str_data, s2: segment.pattern.c_str(), n: segment.pattern.size()) != 0) {
92 return false;
93 }
94 str_data += segment.pattern.size();
95 str_len -= segment.pattern.size();
96 segment_idx++;
97 if (segments.size() == 1) {
98 // only one segment, and it matches
99 // we have a match if there is an end sample_size, OR if the memcmp was an exact match (remaining str is
100 // empty)
101 return has_end_percentage || str_len == 0;
102 }
103 }
104 // main match loop: for every segment in the middle, use Contains to find the needle in the haystack
105 for (; segment_idx < end_idx; segment_idx++) {
106 auto &segment = segments[segment_idx];
107 // find the pattern of the current segment
108 idx_t next_offset = ContainsFun::Find(haystack: str_data, haystack_size: str_len, needle: const_uchar_ptr_cast(src: segment.pattern.c_str()),
109 needle_size: segment.pattern.size());
110 if (next_offset == DConstants::INVALID_INDEX) {
111 // could not find this pattern in the string: no match
112 return false;
113 }
114 idx_t offset = next_offset + segment.pattern.size();
115 str_data += offset;
116 str_len -= offset;
117 }
118 if (!has_end_percentage) {
119 end_idx--;
120 // no end sample_size: match the final segment now
121 auto &segment = segments.back();
122 if (str_len < segment.pattern.size()) {
123 return false;
124 }
125 if (memcmp(s1: str_data + str_len - segment.pattern.size(), s2: segment.pattern.c_str(), n: segment.pattern.size()) !=
126 0) {
127 return false;
128 }
129 return true;
130 } else {
131 auto &segment = segments.back();
132 // find the pattern of the current segment
133 idx_t next_offset = ContainsFun::Find(haystack: str_data, haystack_size: str_len, needle: const_uchar_ptr_cast(src: segment.pattern.c_str()),
134 needle_size: segment.pattern.size());
135 return next_offset != DConstants::INVALID_INDEX;
136 }
137 }
138
139 static unique_ptr<LikeMatcher> CreateLikeMatcher(string like_pattern, char escape = '\0') {
140 vector<LikeSegment> segments;
141 idx_t last_non_pattern = 0;
142 bool has_start_percentage = false;
143 bool has_end_percentage = false;
144 for (idx_t i = 0; i < like_pattern.size(); i++) {
145 auto ch = like_pattern[i];
146 if (ch == escape || ch == '%' || ch == '_') {
147 // special character, push a constant pattern
148 if (i > last_non_pattern) {
149 segments.emplace_back(args: like_pattern.substr(pos: last_non_pattern, n: i - last_non_pattern));
150 }
151 last_non_pattern = i + 1;
152 if (ch == escape || ch == '_') {
153 // escape or underscore: could not create efficient like matcher
154 // FIXME: we could handle escaped percentages here
155 return nullptr;
156 } else {
157 // sample_size
158 if (i == 0) {
159 has_start_percentage = true;
160 }
161 if (i + 1 == like_pattern.size()) {
162 has_end_percentage = true;
163 }
164 }
165 }
166 }
167 if (last_non_pattern < like_pattern.size()) {
168 segments.emplace_back(args: like_pattern.substr(pos: last_non_pattern, n: like_pattern.size() - last_non_pattern));
169 }
170 if (segments.empty()) {
171 return nullptr;
172 }
173 return make_uniq<LikeMatcher>(args: std::move(like_pattern), args: std::move(segments), args&: has_start_percentage,
174 args&: has_end_percentage);
175 }
176
177 unique_ptr<FunctionData> Copy() const override {
178 return make_uniq<LikeMatcher>(args: like_pattern, args: segments, args: has_start_percentage, args: has_end_percentage);
179 }
180
181 bool Equals(const FunctionData &other_p) const override {
182 auto &other = other_p.Cast<LikeMatcher>();
183 return like_pattern == other.like_pattern;
184 }
185
186private:
187 string like_pattern;
188 vector<LikeSegment> segments;
189 bool has_start_percentage;
190 bool has_end_percentage;
191};
192
193static unique_ptr<FunctionData> LikeBindFunction(ClientContext &context, ScalarFunction &bound_function,
194 vector<unique_ptr<Expression>> &arguments) {
195 // pattern is the second argument. If its constant, we can already prepare the pattern and store it for later.
196 D_ASSERT(arguments.size() == 2 || arguments.size() == 3);
197 if (arguments[1]->IsFoldable()) {
198 Value pattern_str = ExpressionExecutor::EvaluateScalar(context, expr: *arguments[1]);
199 if (pattern_str.IsNull()) {
200 return nullptr;
201 }
202 return LikeMatcher::CreateLikeMatcher(like_pattern: pattern_str.ToString());
203 }
204 return nullptr;
205}
206
207bool LikeOperatorFunction(const char *s, idx_t slen, const char *pattern, idx_t plen, char escape) {
208 return TemplatedLikeOperator<'%', '_', true>(sdata: s, slen, pdata: pattern, plen, escape);
209}
210
211bool LikeOperatorFunction(const char *s, idx_t slen, const char *pattern, idx_t plen) {
212 return TemplatedLikeOperator<'%', '_', false>(sdata: s, slen, pdata: pattern, plen, escape: '\0');
213}
214
215bool LikeOperatorFunction(string_t &s, string_t &pat) {
216 return LikeOperatorFunction(s: s.GetData(), slen: s.GetSize(), pattern: pat.GetData(), plen: pat.GetSize());
217}
218
219bool LikeOperatorFunction(string_t &s, string_t &pat, char escape) {
220 return LikeOperatorFunction(s: s.GetData(), slen: s.GetSize(), pattern: pat.GetData(), plen: pat.GetSize(), escape);
221}
222
223bool LikeFun::Glob(const char *string, idx_t slen, const char *pattern, idx_t plen, bool allow_question_mark) {
224 idx_t sidx = 0;
225 idx_t pidx = 0;
226main_loop : {
227 // main matching loop
228 while (sidx < slen && pidx < plen) {
229 char s = string[sidx];
230 char p = pattern[pidx];
231 switch (p) {
232 case '*': {
233 // asterisk: match any set of characters
234 // skip any subsequent asterisks
235 pidx++;
236 while (pidx < plen && pattern[pidx] == '*') {
237 pidx++;
238 }
239 // if the asterisk is the last character, the pattern always matches
240 if (pidx == plen) {
241 return true;
242 }
243 // recursively match the remainder of the pattern
244 for (; sidx < slen; sidx++) {
245 if (LikeFun::Glob(string: string + sidx, slen: slen - sidx, pattern: pattern + pidx, plen: plen - pidx)) {
246 return true;
247 }
248 }
249 return false;
250 }
251 case '?':
252 // when enabled: matches anything but null
253 if (allow_question_mark) {
254 break;
255 }
256 DUCKDB_EXPLICIT_FALLTHROUGH;
257 case '[':
258 pidx++;
259 goto parse_bracket;
260 case '\\':
261 // escape character, next character needs to match literally
262 pidx++;
263 // check that we still have a character remaining
264 if (pidx == plen) {
265 return false;
266 }
267 p = pattern[pidx];
268 if (s != p) {
269 return false;
270 }
271 break;
272 default:
273 // not a control character: characters need to match literally
274 if (s != p) {
275 return false;
276 }
277 break;
278 }
279 sidx++;
280 pidx++;
281 }
282 while (pidx < plen && pattern[pidx] == '*') {
283 pidx++;
284 }
285 // we are finished only if we have consumed the full pattern
286 return pidx == plen && sidx == slen;
287}
288parse_bracket : {
289 // inside a bracket
290 if (pidx == plen) {
291 return false;
292 }
293 // check the first character
294 // if it is an exclamation mark we need to invert our logic
295 char p = pattern[pidx];
296 char s = string[sidx];
297 bool invert = false;
298 if (p == '!') {
299 invert = true;
300 pidx++;
301 }
302 bool found_match = invert;
303 idx_t start_pos = pidx;
304 bool found_closing_bracket = false;
305 // now check the remainder of the pattern
306 while (pidx < plen) {
307 p = pattern[pidx];
308 // if the first character is a closing bracket, we match it literally
309 // otherwise it indicates an end of bracket
310 if (p == ']' && pidx > start_pos) {
311 // end of bracket found: we are done
312 found_closing_bracket = true;
313 pidx++;
314 break;
315 }
316 // we either match a range (a-b) or a single character (a)
317 // check if the next character is a dash
318 if (pidx + 1 == plen) {
319 // no next character!
320 break;
321 }
322 bool matches;
323 if (pattern[pidx + 1] == '-') {
324 // range! find the next character in the range
325 if (pidx + 2 == plen) {
326 break;
327 }
328 char next_char = pattern[pidx + 2];
329 // check if the current character is within the range
330 matches = s >= p && s <= next_char;
331 // shift the pattern forward past the range
332 pidx += 3;
333 } else {
334 // no range! perform a direct match
335 matches = p == s;
336 // shift the pattern forward past the character
337 pidx++;
338 }
339 if (found_match == invert && matches) {
340 // found a match! set the found_matches flag
341 // we keep on pattern matching after this until we reach the end bracket
342 // however, we don't need to update the found_match flag anymore
343 found_match = !invert;
344 }
345 }
346 if (!found_closing_bracket) {
347 // no end of bracket: invalid pattern
348 return false;
349 }
350 if (!found_match) {
351 // did not match the bracket: return false;
352 return false;
353 }
354 // finished the bracket matching: move forward
355 sidx++;
356 goto main_loop;
357}
358}
359
360static char GetEscapeChar(string_t escape) {
361 // Only one escape character should be allowed
362 if (escape.GetSize() > 1) {
363 throw SyntaxException("Invalid escape string. Escape string must be empty or one character.");
364 }
365 return escape.GetSize() == 0 ? '\0' : *escape.GetData();
366}
367
368struct LikeEscapeOperator {
369 template <class TA, class TB, class TC>
370 static inline bool Operation(TA str, TB pattern, TC escape) {
371 char escape_char = GetEscapeChar(escape);
372 return LikeOperatorFunction(str.GetData(), str.GetSize(), pattern.GetData(), pattern.GetSize(), escape_char);
373 }
374};
375
376struct NotLikeEscapeOperator {
377 template <class TA, class TB, class TC>
378 static inline bool Operation(TA str, TB pattern, TC escape) {
379 return !LikeEscapeOperator::Operation(str, pattern, escape);
380 }
381};
382
383struct LikeOperator {
384 template <class TA, class TB, class TR>
385 static inline TR Operation(TA str, TB pattern) {
386 return LikeOperatorFunction(str, pattern);
387 }
388};
389
390bool ILikeOperatorFunction(string_t &str, string_t &pattern, char escape = '\0') {
391 auto str_data = str.GetData();
392 auto str_size = str.GetSize();
393 auto pat_data = pattern.GetData();
394 auto pat_size = pattern.GetSize();
395
396 // lowercase both the str and the pattern
397 idx_t str_llength = LowerFun::LowerLength(input_data: str_data, input_length: str_size);
398 auto str_ldata = make_unsafe_uniq_array<char>(n: str_llength);
399 LowerFun::LowerCase(input_data: str_data, input_length: str_size, result_data: str_ldata.get());
400
401 idx_t pat_llength = LowerFun::LowerLength(input_data: pat_data, input_length: pat_size);
402 auto pat_ldata = make_unsafe_uniq_array<char>(n: pat_llength);
403 LowerFun::LowerCase(input_data: pat_data, input_length: pat_size, result_data: pat_ldata.get());
404 string_t str_lcase(str_ldata.get(), str_llength);
405 string_t pat_lcase(pat_ldata.get(), pat_llength);
406 return LikeOperatorFunction(s&: str_lcase, pat&: pat_lcase, escape);
407}
408
409struct ILikeEscapeOperator {
410 template <class TA, class TB, class TC>
411 static inline bool Operation(TA str, TB pattern, TC escape) {
412 char escape_char = GetEscapeChar(escape);
413 return ILikeOperatorFunction(str, pattern, escape_char);
414 }
415};
416
417struct NotILikeEscapeOperator {
418 template <class TA, class TB, class TC>
419 static inline bool Operation(TA str, TB pattern, TC escape) {
420 return !ILikeEscapeOperator::Operation(str, pattern, escape);
421 }
422};
423
424struct ILikeOperator {
425 template <class TA, class TB, class TR>
426 static inline TR Operation(TA str, TB pattern) {
427 return ILikeOperatorFunction(str, pattern);
428 }
429};
430
431struct NotLikeOperator {
432 template <class TA, class TB, class TR>
433 static inline TR Operation(TA str, TB pattern) {
434 return !LikeOperatorFunction(str, pattern);
435 }
436};
437
438struct NotILikeOperator {
439 template <class TA, class TB, class TR>
440 static inline TR Operation(TA str, TB pattern) {
441 return !ILikeOperator::Operation<TA, TB, TR>(str, pattern);
442 }
443};
444
445struct ILikeOperatorASCII {
446 template <class TA, class TB, class TR>
447 static inline TR Operation(TA str, TB pattern) {
448 return TemplatedLikeOperator<'%', '_', false, ASCIILCaseReader>(str.GetData(), str.GetSize(), pattern.GetData(),
449 pattern.GetSize(), '\0');
450 }
451};
452
453struct NotILikeOperatorASCII {
454 template <class TA, class TB, class TR>
455 static inline TR Operation(TA str, TB pattern) {
456 return !ILikeOperatorASCII::Operation<TA, TB, TR>(str, pattern);
457 }
458};
459
460struct GlobOperator {
461 template <class TA, class TB, class TR>
462 static inline TR Operation(TA str, TB pattern) {
463 return LikeFun::Glob(string: str.GetData(), slen: str.GetSize(), pattern: pattern.GetData(), plen: pattern.GetSize());
464 }
465};
466
467// This can be moved to the scalar_function class
468template <typename FUNC>
469static void LikeEscapeFunction(DataChunk &args, ExpressionState &state, Vector &result) {
470 auto &str = args.data[0];
471 auto &pattern = args.data[1];
472 auto &escape = args.data[2];
473
474 TernaryExecutor::Execute<string_t, string_t, string_t, bool>(
475 str, pattern, escape, result, args.size(), FUNC::template Operation<string_t, string_t, string_t>);
476}
477
478template <class ASCII_OP>
479static unique_ptr<BaseStatistics> ILikePropagateStats(ClientContext &context, FunctionStatisticsInput &input) {
480 auto &child_stats = input.child_stats;
481 auto &expr = input.expr;
482 D_ASSERT(child_stats.size() >= 1);
483 // can only propagate stats if the children have stats
484 if (!StringStats::CanContainUnicode(stats: child_stats[0])) {
485 expr.function.function = ScalarFunction::BinaryFunction<string_t, string_t, bool, ASCII_OP>;
486 }
487 return nullptr;
488}
489
490template <class OP, bool INVERT>
491static void RegularLikeFunction(DataChunk &input, ExpressionState &state, Vector &result) {
492 auto &func_expr = state.expr.Cast<BoundFunctionExpression>();
493 if (func_expr.bind_info) {
494 auto &matcher = func_expr.bind_info->Cast<LikeMatcher>();
495 // use fast like matcher
496 UnaryExecutor::Execute<string_t, bool>(input.data[0], result, input.size(), [&](string_t input) {
497 return INVERT ? !matcher.Match(str&: input) : matcher.Match(str&: input);
498 });
499 } else {
500 // use generic like matcher
501 BinaryExecutor::ExecuteStandard<string_t, string_t, bool, OP>(input.data[0], input.data[1], result,
502 input.size());
503 }
504}
505void LikeFun::RegisterFunction(BuiltinFunctions &set) {
506 // like
507 set.AddFunction(function: ScalarFunction("~~", {LogicalType::VARCHAR, LogicalType::VARCHAR}, LogicalType::BOOLEAN,
508 RegularLikeFunction<LikeOperator, false>, LikeBindFunction));
509 // not like
510 set.AddFunction(function: ScalarFunction("!~~", {LogicalType::VARCHAR, LogicalType::VARCHAR}, LogicalType::BOOLEAN,
511 RegularLikeFunction<NotLikeOperator, true>, LikeBindFunction));
512 // glob
513 set.AddFunction(function: ScalarFunction("~~~", {LogicalType::VARCHAR, LogicalType::VARCHAR}, LogicalType::BOOLEAN,
514 ScalarFunction::BinaryFunction<string_t, string_t, bool, GlobOperator>));
515 // ilike
516 set.AddFunction(function: ScalarFunction("~~*", {LogicalType::VARCHAR, LogicalType::VARCHAR}, LogicalType::BOOLEAN,
517 ScalarFunction::BinaryFunction<string_t, string_t, bool, ILikeOperator>, nullptr,
518 nullptr, ILikePropagateStats<ILikeOperatorASCII>));
519 // not ilike
520 set.AddFunction(function: ScalarFunction("!~~*", {LogicalType::VARCHAR, LogicalType::VARCHAR}, LogicalType::BOOLEAN,
521 ScalarFunction::BinaryFunction<string_t, string_t, bool, NotILikeOperator>, nullptr,
522 nullptr, ILikePropagateStats<NotILikeOperatorASCII>));
523}
524
525void LikeEscapeFun::RegisterFunction(BuiltinFunctions &set) {
526 set.AddFunction(names: {"like_escape"}, function: ScalarFunction({LogicalType::VARCHAR, LogicalType::VARCHAR, LogicalType::VARCHAR},
527 LogicalType::BOOLEAN, LikeEscapeFunction<LikeEscapeOperator>));
528 set.AddFunction(names: {"not_like_escape"},
529 function: ScalarFunction({LogicalType::VARCHAR, LogicalType::VARCHAR, LogicalType::VARCHAR},
530 LogicalType::BOOLEAN, LikeEscapeFunction<NotLikeEscapeOperator>));
531
532 set.AddFunction(names: {"ilike_escape"}, function: ScalarFunction({LogicalType::VARCHAR, LogicalType::VARCHAR, LogicalType::VARCHAR},
533 LogicalType::BOOLEAN, LikeEscapeFunction<ILikeEscapeOperator>));
534 set.AddFunction(names: {"not_ilike_escape"},
535 function: ScalarFunction({LogicalType::VARCHAR, LogicalType::VARCHAR, LogicalType::VARCHAR},
536 LogicalType::BOOLEAN, LikeEscapeFunction<NotILikeEscapeOperator>));
537}
538} // namespace duckdb
539