1 | /*------------------------------------------------------------------------- |
2 | * |
3 | * like_match.c |
4 | * LIKE pattern matching internal code. |
5 | * |
6 | * This file is included by like.c four times, to provide matching code for |
7 | * (1) single-byte encodings, (2) UTF8, (3) other multi-byte encodings, |
8 | * and (4) case insensitive matches in single-byte encodings. |
9 | * (UTF8 is a special case because we can use a much more efficient version |
10 | * of NextChar than can be used for general multi-byte encodings.) |
11 | * |
12 | * Before the inclusion, we need to define the following macros: |
13 | * |
14 | * NextChar |
15 | * MatchText - to name of function wanted |
16 | * do_like_escape - name of function if wanted - needs CHAREQ and CopyAdvChar |
17 | * MATCH_LOWER - define for case (4) to specify case folding for 1-byte chars |
18 | * |
19 | * Copyright (c) 1996-2019, PostgreSQL Global Development Group |
20 | * |
21 | * IDENTIFICATION |
22 | * src/backend/utils/adt/like_match.c |
23 | * |
24 | *------------------------------------------------------------------------- |
25 | */ |
26 | |
27 | /* |
28 | * Originally written by Rich $alz, mirror!rs, Wed Nov 26 19:03:17 EST 1986. |
29 | * Rich $alz is now <rsalz@bbn.com>. |
30 | * Special thanks to Lars Mathiesen <thorinn@diku.dk> for the LABORT code. |
31 | * |
32 | * This code was shamelessly stolen from the "pql" code by myself and |
33 | * slightly modified :) |
34 | * |
35 | * All references to the word "star" were replaced by "percent" |
36 | * All references to the word "wild" were replaced by "like" |
37 | * |
38 | * All the nice shell RE matching stuff was replaced by just "_" and "%" |
39 | * |
40 | * As I don't have a copy of the SQL standard handy I wasn't sure whether |
41 | * to leave in the '\' escape character handling. |
42 | * |
43 | * Keith Parks. <keith@mtcc.demon.co.uk> |
44 | * |
45 | * SQL lets you specify the escape character by saying |
46 | * LIKE <pattern> ESCAPE <escape character>. We are a small operation |
47 | * so we force you to use '\'. - ay 7/95 |
48 | * |
49 | * Now we have the like_escape() function that converts patterns with |
50 | * any specified escape character (or none at all) to the internal |
51 | * default escape character, which is still '\'. - tgl 9/2000 |
52 | * |
53 | * The code is rewritten to avoid requiring null-terminated strings, |
54 | * which in turn allows us to leave out some memcpy() operations. |
55 | * This code should be faster and take less memory, but no promises... |
56 | * - thomas 2000-08-06 |
57 | */ |
58 | |
59 | |
60 | /*-------------------- |
61 | * Match text and pattern, return LIKE_TRUE, LIKE_FALSE, or LIKE_ABORT. |
62 | * |
63 | * LIKE_TRUE: they match |
64 | * LIKE_FALSE: they don't match |
65 | * LIKE_ABORT: not only don't they match, but the text is too short. |
66 | * |
67 | * If LIKE_ABORT is returned, then no suffix of the text can match the |
68 | * pattern either, so an upper-level % scan can stop scanning now. |
69 | *-------------------- |
70 | */ |
71 | |
72 | #ifdef MATCH_LOWER |
73 | #define GETCHAR(t) MATCH_LOWER(t) |
74 | #else |
75 | #define GETCHAR(t) (t) |
76 | #endif |
77 | |
78 | static int |
79 | MatchText(const char *t, int tlen, const char *p, int plen, |
80 | pg_locale_t locale, bool locale_is_c) |
81 | { |
82 | /* Fast path for match-everything pattern */ |
83 | if (plen == 1 && *p == '%') |
84 | return LIKE_TRUE; |
85 | |
86 | /* Since this function recurses, it could be driven to stack overflow */ |
87 | check_stack_depth(); |
88 | |
89 | /* |
90 | * In this loop, we advance by char when matching wildcards (and thus on |
91 | * recursive entry to this function we are properly char-synced). On other |
92 | * occasions it is safe to advance by byte, as the text and pattern will |
93 | * be in lockstep. This allows us to perform all comparisons between the |
94 | * text and pattern on a byte by byte basis, even for multi-byte |
95 | * encodings. |
96 | */ |
97 | while (tlen > 0 && plen > 0) |
98 | { |
99 | if (*p == '\\') |
100 | { |
101 | /* Next pattern byte must match literally, whatever it is */ |
102 | NextByte(p, plen); |
103 | /* ... and there had better be one, per SQL standard */ |
104 | if (plen <= 0) |
105 | ereport(ERROR, |
106 | (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE), |
107 | errmsg("LIKE pattern must not end with escape character" ))); |
108 | if (GETCHAR(*p) != GETCHAR(*t)) |
109 | return LIKE_FALSE; |
110 | } |
111 | else if (*p == '%') |
112 | { |
113 | char firstpat; |
114 | |
115 | /* |
116 | * % processing is essentially a search for a text position at |
117 | * which the remainder of the text matches the remainder of the |
118 | * pattern, using a recursive call to check each potential match. |
119 | * |
120 | * If there are wildcards immediately following the %, we can skip |
121 | * over them first, using the idea that any sequence of N _'s and |
122 | * one or more %'s is equivalent to N _'s and one % (ie, it will |
123 | * match any sequence of at least N text characters). In this way |
124 | * we will always run the recursive search loop using a pattern |
125 | * fragment that begins with a literal character-to-match, thereby |
126 | * not recursing more than we have to. |
127 | */ |
128 | NextByte(p, plen); |
129 | |
130 | while (plen > 0) |
131 | { |
132 | if (*p == '%') |
133 | NextByte(p, plen); |
134 | else if (*p == '_') |
135 | { |
136 | /* If not enough text left to match the pattern, ABORT */ |
137 | if (tlen <= 0) |
138 | return LIKE_ABORT; |
139 | NextChar(t, tlen); |
140 | NextByte(p, plen); |
141 | } |
142 | else |
143 | break; /* Reached a non-wildcard pattern char */ |
144 | } |
145 | |
146 | /* |
147 | * If we're at end of pattern, match: we have a trailing % which |
148 | * matches any remaining text string. |
149 | */ |
150 | if (plen <= 0) |
151 | return LIKE_TRUE; |
152 | |
153 | /* |
154 | * Otherwise, scan for a text position at which we can match the |
155 | * rest of the pattern. The first remaining pattern char is known |
156 | * to be a regular or escaped literal character, so we can compare |
157 | * the first pattern byte to each text byte to avoid recursing |
158 | * more than we have to. This fact also guarantees that we don't |
159 | * have to consider a match to the zero-length substring at the |
160 | * end of the text. |
161 | */ |
162 | if (*p == '\\') |
163 | { |
164 | if (plen < 2) |
165 | ereport(ERROR, |
166 | (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE), |
167 | errmsg("LIKE pattern must not end with escape character" ))); |
168 | firstpat = GETCHAR(p[1]); |
169 | } |
170 | else |
171 | firstpat = GETCHAR(*p); |
172 | |
173 | while (tlen > 0) |
174 | { |
175 | if (GETCHAR(*t) == firstpat) |
176 | { |
177 | int matched = MatchText(t, tlen, p, plen, |
178 | locale, locale_is_c); |
179 | |
180 | if (matched != LIKE_FALSE) |
181 | return matched; /* TRUE or ABORT */ |
182 | } |
183 | |
184 | NextChar(t, tlen); |
185 | } |
186 | |
187 | /* |
188 | * End of text with no match, so no point in trying later places |
189 | * to start matching this pattern. |
190 | */ |
191 | return LIKE_ABORT; |
192 | } |
193 | else if (*p == '_') |
194 | { |
195 | /* _ matches any single character, and we know there is one */ |
196 | NextChar(t, tlen); |
197 | NextByte(p, plen); |
198 | continue; |
199 | } |
200 | else if (GETCHAR(*p) != GETCHAR(*t)) |
201 | { |
202 | /* non-wildcard pattern char fails to match text char */ |
203 | return LIKE_FALSE; |
204 | } |
205 | |
206 | /* |
207 | * Pattern and text match, so advance. |
208 | * |
209 | * It is safe to use NextByte instead of NextChar here, even for |
210 | * multi-byte character sets, because we are not following immediately |
211 | * after a wildcard character. If we are in the middle of a multibyte |
212 | * character, we must already have matched at least one byte of the |
213 | * character from both text and pattern; so we cannot get out-of-sync |
214 | * on character boundaries. And we know that no backend-legal |
215 | * encoding allows ASCII characters such as '%' to appear as non-first |
216 | * bytes of characters, so we won't mistakenly detect a new wildcard. |
217 | */ |
218 | NextByte(t, tlen); |
219 | NextByte(p, plen); |
220 | } |
221 | |
222 | if (tlen > 0) |
223 | return LIKE_FALSE; /* end of pattern, but not of text */ |
224 | |
225 | /* |
226 | * End of text, but perhaps not of pattern. Match iff the remaining |
227 | * pattern can match a zero-length string, ie, it's zero or more %'s. |
228 | */ |
229 | while (plen > 0 && *p == '%') |
230 | NextByte(p, plen); |
231 | if (plen <= 0) |
232 | return LIKE_TRUE; |
233 | |
234 | /* |
235 | * End of text with no match, so no point in trying later places to start |
236 | * matching this pattern. |
237 | */ |
238 | return LIKE_ABORT; |
239 | } /* MatchText() */ |
240 | |
241 | /* |
242 | * like_escape() --- given a pattern and an ESCAPE string, |
243 | * convert the pattern to use Postgres' standard backslash escape convention. |
244 | */ |
245 | #ifdef do_like_escape |
246 | |
247 | static text * |
248 | do_like_escape(text *pat, text *esc) |
249 | { |
250 | text *result; |
251 | char *p, |
252 | *e, |
253 | *r; |
254 | int plen, |
255 | elen; |
256 | bool afterescape; |
257 | |
258 | p = VARDATA_ANY(pat); |
259 | plen = VARSIZE_ANY_EXHDR(pat); |
260 | e = VARDATA_ANY(esc); |
261 | elen = VARSIZE_ANY_EXHDR(esc); |
262 | |
263 | /* |
264 | * Worst-case pattern growth is 2x --- unlikely, but it's hardly worth |
265 | * trying to calculate the size more accurately than that. |
266 | */ |
267 | result = (text *) palloc(plen * 2 + VARHDRSZ); |
268 | r = VARDATA(result); |
269 | |
270 | if (elen == 0) |
271 | { |
272 | /* |
273 | * No escape character is wanted. Double any backslashes in the |
274 | * pattern to make them act like ordinary characters. |
275 | */ |
276 | while (plen > 0) |
277 | { |
278 | if (*p == '\\') |
279 | *r++ = '\\'; |
280 | CopyAdvChar(r, p, plen); |
281 | } |
282 | } |
283 | else |
284 | { |
285 | /* |
286 | * The specified escape must be only a single character. |
287 | */ |
288 | NextChar(e, elen); |
289 | if (elen != 0) |
290 | ereport(ERROR, |
291 | (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE), |
292 | errmsg("invalid escape string" ), |
293 | errhint("Escape string must be empty or one character." ))); |
294 | |
295 | e = VARDATA_ANY(esc); |
296 | |
297 | /* |
298 | * If specified escape is '\', just copy the pattern as-is. |
299 | */ |
300 | if (*e == '\\') |
301 | { |
302 | memcpy(result, pat, VARSIZE_ANY(pat)); |
303 | return result; |
304 | } |
305 | |
306 | /* |
307 | * Otherwise, convert occurrences of the specified escape character to |
308 | * '\', and double occurrences of '\' --- unless they immediately |
309 | * follow an escape character! |
310 | */ |
311 | afterescape = false; |
312 | while (plen > 0) |
313 | { |
314 | if (CHAREQ(p, e) && !afterescape) |
315 | { |
316 | *r++ = '\\'; |
317 | NextChar(p, plen); |
318 | afterescape = true; |
319 | } |
320 | else if (*p == '\\') |
321 | { |
322 | *r++ = '\\'; |
323 | if (!afterescape) |
324 | *r++ = '\\'; |
325 | NextChar(p, plen); |
326 | afterescape = false; |
327 | } |
328 | else |
329 | { |
330 | CopyAdvChar(r, p, plen); |
331 | afterescape = false; |
332 | } |
333 | } |
334 | } |
335 | |
336 | SET_VARSIZE(result, r - ((char *) result)); |
337 | |
338 | return result; |
339 | } |
340 | #endif /* do_like_escape */ |
341 | |
342 | #ifdef CHAREQ |
343 | #undef CHAREQ |
344 | #endif |
345 | |
346 | #undef NextChar |
347 | #undef CopyAdvChar |
348 | #undef MatchText |
349 | |
350 | #ifdef do_like_escape |
351 | #undef do_like_escape |
352 | #endif |
353 | |
354 | #undef GETCHAR |
355 | |
356 | #ifdef MATCH_LOWER |
357 | #undef MATCH_LOWER |
358 | |
359 | #endif |
360 | |