like_match.c source code [PostgreSQL/src/backend/utils/adt/like_match.c]

1	/-------------------------------------------------------------------------*
2	*
3	* like_match.c
4	* LIKE pattern matching internal code.
5	*
6	* This file is included by like.c four times, to provide matching code for
7	* (1) single-byte encodings, (2) UTF8, (3) other multi-byte encodings,
8	* and (4) case insensitive matches in single-byte encodings.
9	* (UTF8 is a special case because we can use a much more efficient version
10	* of NextChar than can be used for general multi-byte encodings.)
11	*
12	* Before the inclusion, we need to define the following macros:
13	*
14	* NextChar
15	* MatchText - to name of function wanted
16	* do_like_escape - name of function if wanted - needs CHAREQ and CopyAdvChar
17	* MATCH_LOWER - define for case (4) to specify case folding for 1-byte chars
18	*
19	* Copyright (c) 1996-2019, PostgreSQL Global Development Group
20	*
21	* IDENTIFICATION
22	* src/backend/utils/adt/like_match.c
23	*
24	*-------------------------------------------------------------------------
25	*/
26
27	/*
28	* Originally written by Rich $alz, mirror!rs, Wed Nov 26 19:03:17 EST 1986.
29	* Rich $alz is now <rsalz@bbn.com>.
30	* Special thanks to Lars Mathiesen <thorinn@diku.dk> for the LABORT code.
31	*
32	* This code was shamelessly stolen from the "pql" code by myself and
33	* slightly modified :)
34	*
35	* All references to the word "star" were replaced by "percent"
36	* All references to the word "wild" were replaced by "like"
37	*
38	* All the nice shell RE matching stuff was replaced by just "_" and "%"
39	*
40	* As I don't have a copy of the SQL standard handy I wasn't sure whether
41	* to leave in the '\' escape character handling.
42	*
43	* Keith Parks. <keith@mtcc.demon.co.uk>
44	*
45	* SQL lets you specify the escape character by saying
46	* LIKE <pattern> ESCAPE <escape character>. We are a small operation
47	* so we force you to use '\'. - ay 7/95
48	*
49	* Now we have the like_escape() function that converts patterns with
50	* any specified escape character (or none at all) to the internal
51	* default escape character, which is still '\'. - tgl 9/2000
52	*
53	* The code is rewritten to avoid requiring null-terminated strings,
54	* which in turn allows us to leave out some memcpy() operations.
55	* This code should be faster and take less memory, but no promises...
56	* - thomas 2000-08-06
57	*/
58
59
60	/--------------------*
61	* Match text and pattern, return LIKE_TRUE, LIKE_FALSE, or LIKE_ABORT.
62	*
63	* LIKE_TRUE: they match
64	* LIKE_FALSE: they don't match
65	* LIKE_ABORT: not only don't they match, but the text is too short.
66	*
67	* If LIKE_ABORT is returned, then no suffix of the text can match the
68	* pattern either, so an upper-level % scan can stop scanning now.
69	*--------------------
70	*/
71
72	#ifdef MATCH_LOWER
73	#define GETCHAR(t) MATCH_LOWER(t)
74	#else
75	#define GETCHAR(t) (t)
76	#endif
77
78	static int
79	MatchText(const char t, int* tlen, const char p, int* plen,
80	pg_locale_t locale, bool locale_is_c)
81	{
82	/ Fast path for match-everything pattern /
83	if (plen == `1` && *p == `'%'`)
84	return LIKE_TRUE;
85
86	/ Since this function recurses, it could be driven to stack overflow /
87	check_stack_depth();
88
89	/*
90	* In this loop, we advance by char when matching wildcards (and thus on
91	* recursive entry to this function we are properly char-synced). On other
92	* occasions it is safe to advance by byte, as the text and pattern will
93	* be in lockstep. This allows us to perform all comparisons between the
94	* text and pattern on a byte by byte basis, even for multi-byte
95	* encodings.
96	*/
97	while (tlen > `0` && plen > `0`)
98	{
99	if (*p == `'\\'`)
100	{
101	/ Next pattern byte must match literally, whatever it is /
102	NextByte(p, plen);
103	/ ... and there had better be one, per SQL standard /
104	if (plen <= `0`)
105	ereport(ERROR,
106	(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
107	errmsg("LIKE pattern must not end with escape character")));
108	if (GETCHAR(p) != GETCHAR(t))
109	return LIKE_FALSE;
110	}
111	else if (*p == `'%'`)
112	{
113	char firstpat;
114
115	/*
116	* % processing is essentially a search for a text position at
117	* which the remainder of the text matches the remainder of the
118	* pattern, using a recursive call to check each potential match.
119	*
120	* If there are wildcards immediately following the %, we can skip
121	* over them first, using the idea that any sequence of N _'s and
122	* one or more %'s is equivalent to N _'s and one % (ie, it will
123	* match any sequence of at least N text characters). In this way
124	* we will always run the recursive search loop using a pattern
125	* fragment that begins with a literal character-to-match, thereby
126	* not recursing more than we have to.
127	*/
128	NextByte(p, plen);
129
130	while (plen > `0`)
131	{
132	if (*p == `'%'`)
133	NextByte(p, plen);
134	else if (*p == `'_'`)
135	{
136	/ If not enough text left to match the pattern, ABORT /
137	if (tlen <= `0`)
138	return LIKE_ABORT;
139	NextChar(t, tlen);
140	NextByte(p, plen);
141	}
142	else
143	break; / Reached a non-wildcard pattern char /
144	}
145
146	/*
147	* If we're at end of pattern, match: we have a trailing % which
148	* matches any remaining text string.
149	*/
150	if (plen <= `0`)
151	return LIKE_TRUE;
152
153	/*
154	* Otherwise, scan for a text position at which we can match the
155	* rest of the pattern. The first remaining pattern char is known
156	* to be a regular or escaped literal character, so we can compare
157	* the first pattern byte to each text byte to avoid recursing
158	* more than we have to. This fact also guarantees that we don't
159	* have to consider a match to the zero-length substring at the
160	* end of the text.
161	*/
162	if (*p == `'\\'`)
163	{
164	if (plen < `2`)
165	ereport(ERROR,
166	(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
167	errmsg("LIKE pattern must not end with escape character")));
168	firstpat = GETCHAR(p[`1`]);
169	}
170	else
171	firstpat = GETCHAR(*p);
172
173	while (tlen > `0`)
174	{
175	if (GETCHAR(*t) == firstpat)
176	{
177	int matched = MatchText(t, tlen, p, plen,
178	locale, locale_is_c);
179
180	if (matched != LIKE_FALSE)
181	return matched; / TRUE or ABORT /
182	}
183
184	NextChar(t, tlen);
185	}
186
187	/*
188	* End of text with no match, so no point in trying later places
189	* to start matching this pattern.
190	*/
191	return LIKE_ABORT;
192	}
193	else if (*p == `'_'`)
194	{
195	/ _ matches any single character, and we know there is one /
196	NextChar(t, tlen);
197	NextByte(p, plen);
198	continue;
199	}
200	else if (GETCHAR(p) != GETCHAR(t))
201	{
202	/ non-wildcard pattern char fails to match text char /
203	return LIKE_FALSE;
204	}
205
206	/*
207	* Pattern and text match, so advance.
208	*
209	* It is safe to use NextByte instead of NextChar here, even for
210	* multi-byte character sets, because we are not following immediately
211	* after a wildcard character. If we are in the middle of a multibyte
212	* character, we must already have matched at least one byte of the
213	* character from both text and pattern; so we cannot get out-of-sync
214	* on character boundaries. And we know that no backend-legal
215	* encoding allows ASCII characters such as '%' to appear as non-first
216	* bytes of characters, so we won't mistakenly detect a new wildcard.
217	*/
218	NextByte(t, tlen);
219	NextByte(p, plen);
220	}
221
222	if (tlen > `0`)
223	return LIKE_FALSE; / end of pattern, but not of text /
224
225	/*
226	* End of text, but perhaps not of pattern. Match iff the remaining
227	* pattern can match a zero-length string, ie, it's zero or more %'s.
228	*/
229	while (plen > `0` && *p == `'%'`)
230	NextByte(p, plen);
231	if (plen <= `0`)
232	return LIKE_TRUE;
233
234	/*
235	* End of text with no match, so no point in trying later places to start
236	* matching this pattern.
237	*/
238	return LIKE_ABORT;
239	} / MatchText() /
240
241	/*
242	* like_escape() --- given a pattern and an ESCAPE string,
243	* convert the pattern to use Postgres' standard backslash escape convention.
244	*/
245	#ifdef do_like_escape
246
247	static text *
248	do_like_escape(text pat, text esc)
249	{
250	text *result;
251	char *p,
252	*e,
253	*r;
254	int plen,
255	elen;
256	bool afterescape;
257
258	p = VARDATA_ANY(pat);
259	plen = VARSIZE_ANY_EXHDR(pat);
260	e = VARDATA_ANY(esc);
261	elen = VARSIZE_ANY_EXHDR(esc);
262
263	/*
264	* Worst-case pattern growth is 2x --- unlikely, but it's hardly worth
265	* trying to calculate the size more accurately than that.
266	*/
267	result = (text ) palloc(plen `2` + VARHDRSZ);
268	r = VARDATA(result);
269
270	if (elen == `0`)
271	{
272	/*
273	* No escape character is wanted. Double any backslashes in the
274	* pattern to make them act like ordinary characters.
275	*/
276	while (plen > `0`)
277	{
278	if (*p == `'\\'`)
279	*r++ = `'\\'`;
280	CopyAdvChar(r, p, plen);
281	}
282	}
283	else
284	{
285	/*
286	* The specified escape must be only a single character.
287	*/
288	NextChar(e, elen);
289	if (elen != `0`)
290	ereport(ERROR,
291	(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
292	errmsg("invalid escape string"),
293	errhint("Escape string must be empty or one character.")));
294
295	e = VARDATA_ANY(esc);
296
297	/*
298	* If specified escape is '\', just copy the pattern as-is.
299	*/
300	if (*e == `'\\'`)
301	{
302	memcpy(result, pat, VARSIZE_ANY(pat));
303	return result;
304	}
305
306	/*
307	* Otherwise, convert occurrences of the specified escape character to
308	* '\', and double occurrences of '\' --- unless they immediately
309	* follow an escape character!
310	*/
311	afterescape = false;
312	while (plen > `0`)
313	{
314	if (CHAREQ(p, e) && !afterescape)
315	{
316	*r++ = `'\\'`;
317	NextChar(p, plen);
318	afterescape = true;
319	}
320	else if (*p == `'\\'`)
321	{
322	*r++ = `'\\'`;
323	if (!afterescape)
324	*r++ = `'\\'`;
325	NextChar(p, plen);
326	afterescape = false;
327	}
328	else
329	{
330	CopyAdvChar(r, p, plen);
331	afterescape = false;
332	}
333	}
334	}
335
336	SET_VARSIZE(result, r - ((char *) result));
337
338	return result;
339	}
340	#endif /* do_like_escape */
341
342	#ifdef CHAREQ
343	#undef CHAREQ
344	#endif
345
346	#undef NextChar
347	#undef CopyAdvChar
348	#undef MatchText
349
350	#ifdef do_like_escape
351	#undef do_like_escape
352	#endif
353
354	#undef GETCHAR
355
356	#ifdef MATCH_LOWER
357	#undef MATCH_LOWER
358
359	#endif
360

Browse the source code of PostgreSQL/src/backend/utils/adt/like_match.c