strcoll.ic source code [MariaDB/strings/strcoll.ic]

1	/*
2	Copyright (c) 2015, MariaDB Foundation
3
4	This program is free software; you can redistribute it and/or modify
5	it under the terms of the GNU General Public License as published by
6	the Free Software Foundation; version 2 of the License.
7
8	This program is distributed in the hope that it will be useful,
9	but WITHOUT ANY WARRANTY; without even the implied warranty of
10	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11	GNU General Public License for more details.
12
13	You should have received a copy of the GNU General Public License
14	along with this program; if not, write to the Free Software
15	Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
16	*/
17
18
19	#ifndef MY_FUNCTION_NAME
20	#error MY_FUNCTION_NAME is not defined
21	#endif
22
23
/*
  The weight for automatically padded spaces when comparing strings with
  the PAD SPACE property.
  Should normally be equal to the weight of a regular space.
  The including file may define its own value before inclusion;
  otherwise the default below is used.
*/
#ifndef WEIGHT_PAD_SPACE
#define WEIGHT_PAD_SPACE (' ')
#endif
32
33
/*
  Weight of an illegal byte, must follow these rules:
  1. Must be greater than weight of any normal character in the collation.
  2. Two different bad bytes must have different weights and must be
     compared in their binary order.

  Depends on mbmaxlen of the character set, as well as how the collation
  sorts various single-byte and multi-byte character blocks.

  The macro below is the default definition, it is suitable for mbmaxlen=2
  character sets that sort all multi-byte characters after all single-byte
  characters: big5, euckr, gb2312, gbk.

  All mbmaxlen>2 character sets must provide their own definitions.
  All collations that have a more complex order (than just MB1 followed by MB2)
  must also provide their own definitions (see definitions for
  cp932_japanese_ci and sjis_japanese_ci as examples of a more complex order).
*/
#ifndef WEIGHT_ILSEQ
#define WEIGHT_ILSEQ(x)   (0xFF00 + (x))
#endif
55
56
57	/**
58	Scan a valid character, or a bad byte, or an auto-padded space
59	from a string and calculate the weight of the scanned sequence.
60
61	@param [OUT] weight - the weight is returned here
62	@param str - the string
63	@param end - the end of the string
64	@return - the number of bytes scanned
65
66	The including source file must define the following macros:
67	IS_MB1_CHAR(b0) - for character sets that have MB1 characters
68	IS_MB1_MB2HEAD_GAP(b0) - optional, for better performance
69	IS_MB2_CHAR(b0,b1) - for character sets that have MB2 characters
70	IS_MB3_CHAR(b0,b1,b2) - for character sets that have MB3 characters
71	IS_MB4_CHAR(b0,b1,b2,b3) - for character sets with have MB4 characters
72	WEIGHT_PAD_SPACE
73	WEIGHT_MB1(b0) - for character sets that have MB1 characters
74	WEIGHT_MB2(b0,b1) - for character sets that have MB2 characters
75	WEIGHT_MB3(b0,b1,b2) - for character sets that have MB3 characters
76	WEIGHT_MB4(b0,b1,b2,b3) - for character sets that have MB4 characters
77	WEIGHT_ILSEQ(x)
78	*/
79	static inline uint
80	MY_FUNCTION_NAME(scan_weight)(int weight, const* uchar str, const* uchar *end)
81	{
82	if (str >= end)
83	{
84	*weight= WEIGHT_PAD_SPACE;
85	return `0`;
86	}
87
88	#ifdef IS_MB1_CHAR
89	if (IS_MB1_CHAR(*str))
90	{
91	weight= WEIGHT_MB1(str); / A valid single byte character/
92	return `1`;
93	}
94	#endif
95
96	#ifdef IS_MB1_MBHEAD_UNUSED_GAP
97	/*
98	Quickly filter out unused bytes that are neither MB1 nor MBHEAD.
99	E.g. [0x80..0xC1] in utf8. This allows using simplified conditions
100	in IS_MB2_CHAR(), IS_MB3_CHAR(), etc.
101	*/
102	if (IS_MB1_MBHEAD_UNUSED_GAP(*str))
103	goto bad;
104	#endif
105
106	#ifdef IS_MB2_CHAR
107	if (str + `2` > end) / The string ended unexpectedly /
108	goto bad; / Treat as a bad byte /
109
110	if (IS_MB2_CHAR(str[`0`], str[`1`]))
111	{
112	*weight= WEIGHT_MB2(str[`0`], str[`1`]);
113	return `2`; / A valid two-byte character /
114	}
115	#endif
116
117	#ifdef IS_MB3_CHAR
118	if (str + `3` > end) / Incomplete three-byte character /
119	goto bad;
120
121	if (IS_MB3_CHAR(str[`0`], str[`1`], str[`2`]))
122	{
123	*weight= WEIGHT_MB3(str[`0`], str[`1`], str[`2`]);
124	return `3`; / A valid three-byte character /
125	}
126	#endif
127
128	#ifdef IS_MB4_CHAR
129	if (str + `4` > end) / Incomplete four-byte character /
130	goto bad;
131
132	if (IS_MB4_CHAR(str[`0`], str[`1`], str[`2`], str[`3`]))
133	{
134	*weight= WEIGHT_MB4(str[`0`], str[`1`], str[`2`], str[`3`]);
135	return `4`; / A valid four-byte character /
136	}
137
138	#endif
139
140	bad:
141	weight= WEIGHT_ILSEQ(str[`0`]); /* Bad byte /
142	return `1`;
143	}
144
145
146	/**
147	Compare two strings according to the collation,
148	without handling the PAD SPACE property.
149
150	Note, cs->coll->strnncoll() is usually used to compare identifiers.
151	Perhaps we should eventually (in 10.2?) create a new collation
152	my_charset_utf8_general_ci_no_pad and have only one comparison function
153	in MY_COLLATION_HANDLER.
154
155	@param cs - the character set and collation
156	@param a - the left string
157	@param a_length - the length of the left string
158	@param b - the right string
159	@param b_length - the length of the right string
160	@param b_is_prefix - if the caller wants to check if "b" is a prefix of "a"
161	@return - the comparison result
162	*/
163	static int
164	MY_FUNCTION_NAME(strnncoll)(CHARSET_INFO cs __attribute__*((unused)),
165	const uchar *a, size_t a_length,
166	const uchar *b, size_t b_length,
167	my_bool b_is_prefix)
168	{
169	const uchar *a_end= a + a_length;
170	const uchar *b_end= b + b_length;
171	for ( ; ; )
172	{
173	int a_weight, b_weight, res;
174	uint a_wlen= MY_FUNCTION_NAME(scan_weight)(&a_weight, a, a_end);
175	uint b_wlen= MY_FUNCTION_NAME(scan_weight)(&b_weight, b, b_end);
176	/*
177	a_wlen b_wlen Comment
178	------ ------ -------
179	0 0 Strings ended simultaneously, "a" and "b" are equal.
180	0 >0 "a" is a prefix of "b", so "a" is smaller.
181	>0 0 "b" is a prefix of "a", check b_is_prefix.
182	>0 >0 Two weights were scanned, check weight difference.
183	*/
184	if (!a_wlen)
185	return b_wlen ? -b_weight : `0`;
186
187	if (!b_wlen)
188	return b_is_prefix ? `0` : a_weight;
189
190	if ((res= (a_weight - b_weight)))
191	return res;
192	/*
193	None of the strings has ended yet.
194	*/
195	DBUG_ASSERT(a < a_end);
196	DBUG_ASSERT(b < b_end);
197	a+= a_wlen;
198	b+= b_wlen;
199	}
200	DBUG_ASSERT(`0`);
201	return `0`;
202	}
203
204
205	#ifdef DEFINE_STRNNCOLLSP_NOPAD
206
207	/**
208	Compare two strings according to the collation, with NO PAD handling.
209
210	@param cs - the character set and collation
211	@param a - the left string
212	@param a_length - the length of the left string
213	@param b - the right string
214	@param b_length - the length of the right string
215	@return - the comparison result
216	*/
217	static int
218	MY_FUNCTION_NAME(strnncollsp)(CHARSET_INFO cs __attribute__*((unused)),
219	const uchar *a, size_t a_length,
220	const uchar *b, size_t b_length)
221	{
222	return MY_FUNCTION_NAME(strnncoll)(cs, a, a_length, b, b_length, FALSE);
223	}
224	#else
225	/**
226	Compare two strings according to the collation, with PAD SPACE handling.
227
228	@param cs - the character set and collation
229	@param a - the left string
230	@param a_length - the length of the left string
231	@param b - the right string
232	@param b_length - the length of the right string
233	@return - the comparison result
234	*/
235	static int
236	MY_FUNCTION_NAME(strnncollsp)(CHARSET_INFO cs __attribute__*((unused)),
237	const uchar *a, size_t a_length,
238	const uchar *b, size_t b_length)
239	{
240	const uchar *a_end= a + a_length;
241	const uchar *b_end= b + b_length;
242	for ( ; ; )
243	{
244	int a_weight, b_weight, res;
245	uint a_wlen= MY_FUNCTION_NAME(scan_weight)(&a_weight, a, a_end);
246	uint b_wlen= MY_FUNCTION_NAME(scan_weight)(&b_weight, b, b_end);
247	if ((res= (a_weight - b_weight)))
248	{
249	/*
250	Got two different weights. Each weight can be generated by either of:
251	- a real character
252	- a bad byte sequence or an incomplete byte sequence
253	- an auto-generated trailing space (PAD SPACE)
254	It does not matter how exactly each weight was generated.
255	Just return the weight difference.
256	*/
257	return res;
258	}
259	if (!a_wlen && !b_wlen)
260	{
261	/*
262	Got two auto-generated trailing spaces, i.e.
263	both strings have now ended, so they are equal.
264	*/
265	DBUG_ASSERT(a == a_end);
266	DBUG_ASSERT(b == b_end);
267	return `0`;
268	}
269	/*
270	At least one of the strings has not ended yet, continue comparison.
271	*/
272	DBUG_ASSERT(a < a_end \|\| b < b_end);
273	a+= a_wlen;
274	b+= b_wlen;
275	}
276	DBUG_ASSERT(`0`);
277	return `0`;
278	}
279	#endif
280
281
#ifdef DEFINE_STRNXFRM
/*
  Weight used by strnxfrm() for a two-byte character.
  Defaults to the comparison weight unless the includer overrides it.
*/
#ifndef WEIGHT_MB2_FRM
#define WEIGHT_MB2_FRM(x,y)  WEIGHT_MB2(x,y)
#endif

/**
  Transform a string into its sort key.

  Every source character consumes one of "nweights". A multi-byte character
  produces a 16-bit weight written high byte first; a single-byte character
  produces one byte, mapped through cs->sort_order when that table is set.

  NOTE(review): the multi-byte branch always consumes exactly two source
  bytes, so this presumably is only enabled for mbmaxlen=2 character
  sets - confirm against the including files.

  @param cs        - the character set and collation
  @param dst       - the destination buffer for the key
  @param dstlen    - the size of the destination buffer, in bytes
  @param nweights  - the maximum number of characters to transform
  @param src       - the source string
  @param srclen    - the length of the source string, in bytes
  @param flags     - passed through to the padding helper
  @return          - the value returned by
                     my_strxfrm_pad_desc_and_reverse[_nopad]()
*/
static size_t
MY_FUNCTION_NAME(strnxfrm)(CHARSET_INFO *cs,
                           uchar *dst, size_t dstlen, uint nweights,
                           const uchar *src, size_t srclen, uint flags)
{
  uchar *d0= dst;
  uchar *de= dst + dstlen;
  const uchar *se= src + srclen;
  const uchar *sort_order= cs->sort_order;

  for (; dst < de && src < se && nweights; nweights--)
  {
    if (my_charlen(cs, (const char *) src, (const char *) se) > 1)
    {
      /*
        Note, it is safe not to check (src < se)
        in the code below, because my_charlen() would
        not return 2 if src was too short
      */
      uint16 e= WEIGHT_MB2_FRM(src[0], src[1]);
      *dst++= (uchar) (e >> 8);
      /* The low byte is dropped if the destination is already full */
      if (dst < de)
        *dst++= (uchar) (e & 0xFF);
      src+= 2;
    }
    else
      *dst++= sort_order ? sort_order[*src++] : *src++;
  }
#ifdef DEFINE_STRNNCOLLSP_NOPAD
  return my_strxfrm_pad_desc_and_reverse_nopad(cs, d0, dst, de,
                                               nweights, flags, 0);
#else
  return my_strxfrm_pad_desc_and_reverse(cs, d0, dst, de, nweights, flags, 0);
#endif
}
#endif /* DEFINE_STRNXFRM */
323
324
/*
  We usually include this file at least two times from the same source file,
  for the _ci and the _bin collations. Prepare for the second inclusion.

  Only the macros listed below are reset; the IS_MBn_CHAR() macros are
  left defined and stay in effect for the next inclusion.
*/
#undef MY_FUNCTION_NAME
#undef WEIGHT_ILSEQ
#undef WEIGHT_MB1
#undef WEIGHT_MB2
#undef WEIGHT_MB3
#undef WEIGHT_MB4
#undef WEIGHT_PAD_SPACE
#undef WEIGHT_MB2_FRM
#undef DEFINE_STRNXFRM
#undef DEFINE_STRNNCOLLSP_NOPAD
339

Browse the source code of MariaDB/strings/strcoll.ic