1/*
2 Copyright (c) 2015, MariaDB Foundation
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; version 2 of the License.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program; if not, write to the Free Software
15 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
16*/
17
18
19#ifndef MY_FUNCTION_NAME
20#error MY_FUNCTION_NAME is not defined
21#endif
22
23
/*
  Default weight of the implicit trailing spaces that are generated
  when comparing strings with the PAD SPACE property.
  The including file may define WEIGHT_PAD_SPACE beforehand to override
  this default; normally it should be equal to the weight of a regular
  space character.
*/
#if !defined(WEIGHT_PAD_SPACE)
#define WEIGHT_PAD_SPACE (' ')
#endif
32
33
/*
  Weight of an illegal (bad) byte. Any definition must obey these rules:
  1. The weight must be greater than the weight of any normal character
     in the collation.
  2. Two different bad bytes must get different weights, and these
     weights must compare in the binary order of the bytes.

  A suitable definition depends on mbmaxlen of the character set and on
  how the collation orders its single-byte and multi-byte character blocks.

  The fallback below is meant for mbmaxlen=2 character sets that sort all
  multi-byte characters after all single-byte characters:
  big5, euckr, gb2312, gbk.

  Character sets with mbmaxlen>2 must supply their own definition.
  So must collations with a more complex order than "MB1 followed by MB2"
  (see the definitions for cp932_japanese_ci and sjis_japanese_ci as
  examples of a more complex order).
*/
#if !defined(WEIGHT_ILSEQ)
#define WEIGHT_ILSEQ(x) ((x) + 0xFF00)
#endif
55
56
57/**
58 Scan a valid character, or a bad byte, or an auto-padded space
59 from a string and calculate the weight of the scanned sequence.
60
61 @param [OUT] weight - the weight is returned here
62 @param str - the string
63 @param end - the end of the string
64 @return - the number of bytes scanned
65
66 The including source file must define the following macros:
67 IS_MB1_CHAR(b0) - for character sets that have MB1 characters
68 IS_MB1_MB2HEAD_GAP(b0) - optional, for better performance
69 IS_MB2_CHAR(b0,b1) - for character sets that have MB2 characters
70 IS_MB3_CHAR(b0,b1,b2) - for character sets that have MB3 characters
71 IS_MB4_CHAR(b0,b1,b2,b3) - for character sets with have MB4 characters
72 WEIGHT_PAD_SPACE
73 WEIGHT_MB1(b0) - for character sets that have MB1 characters
74 WEIGHT_MB2(b0,b1) - for character sets that have MB2 characters
75 WEIGHT_MB3(b0,b1,b2) - for character sets that have MB3 characters
76 WEIGHT_MB4(b0,b1,b2,b3) - for character sets that have MB4 characters
77 WEIGHT_ILSEQ(x)
78*/
79static inline uint
80MY_FUNCTION_NAME(scan_weight)(int *weight, const uchar *str, const uchar *end)
81{
82 if (str >= end)
83 {
84 *weight= WEIGHT_PAD_SPACE;
85 return 0;
86 }
87
88#ifdef IS_MB1_CHAR
89 if (IS_MB1_CHAR(*str))
90 {
91 *weight= WEIGHT_MB1(*str); /* A valid single byte character*/
92 return 1;
93 }
94#endif
95
96#ifdef IS_MB1_MBHEAD_UNUSED_GAP
97 /*
98 Quickly filter out unused bytes that are neither MB1 nor MBHEAD.
99 E.g. [0x80..0xC1] in utf8. This allows using simplified conditions
100 in IS_MB2_CHAR(), IS_MB3_CHAR(), etc.
101 */
102 if (IS_MB1_MBHEAD_UNUSED_GAP(*str))
103 goto bad;
104#endif
105
106#ifdef IS_MB2_CHAR
107 if (str + 2 > end) /* The string ended unexpectedly */
108 goto bad; /* Treat as a bad byte */
109
110 if (IS_MB2_CHAR(str[0], str[1]))
111 {
112 *weight= WEIGHT_MB2(str[0], str[1]);
113 return 2; /* A valid two-byte character */
114 }
115#endif
116
117#ifdef IS_MB3_CHAR
118 if (str + 3 > end) /* Incomplete three-byte character */
119 goto bad;
120
121 if (IS_MB3_CHAR(str[0], str[1], str[2]))
122 {
123 *weight= WEIGHT_MB3(str[0], str[1], str[2]);
124 return 3; /* A valid three-byte character */
125 }
126#endif
127
128#ifdef IS_MB4_CHAR
129 if (str + 4 > end) /* Incomplete four-byte character */
130 goto bad;
131
132 if (IS_MB4_CHAR(str[0], str[1], str[2], str[3]))
133 {
134 *weight= WEIGHT_MB4(str[0], str[1], str[2], str[3]);
135 return 4; /* A valid four-byte character */
136 }
137
138#endif
139
140bad:
141 *weight= WEIGHT_ILSEQ(str[0]); /* Bad byte */
142 return 1;
143}
144
145
146/**
147 Compare two strings according to the collation,
148 without handling the PAD SPACE property.
149
150 Note, cs->coll->strnncoll() is usually used to compare identifiers.
151 Perhaps we should eventually (in 10.2?) create a new collation
152 my_charset_utf8_general_ci_no_pad and have only one comparison function
153 in MY_COLLATION_HANDLER.
154
155 @param cs - the character set and collation
156 @param a - the left string
157 @param a_length - the length of the left string
158 @param b - the right string
159 @param b_length - the length of the right string
160 @param b_is_prefix - if the caller wants to check if "b" is a prefix of "a"
161 @return - the comparison result
162*/
163static int
164MY_FUNCTION_NAME(strnncoll)(CHARSET_INFO *cs __attribute__((unused)),
165 const uchar *a, size_t a_length,
166 const uchar *b, size_t b_length,
167 my_bool b_is_prefix)
168{
169 const uchar *a_end= a + a_length;
170 const uchar *b_end= b + b_length;
171 for ( ; ; )
172 {
173 int a_weight, b_weight, res;
174 uint a_wlen= MY_FUNCTION_NAME(scan_weight)(&a_weight, a, a_end);
175 uint b_wlen= MY_FUNCTION_NAME(scan_weight)(&b_weight, b, b_end);
176 /*
177 a_wlen b_wlen Comment
178 ------ ------ -------
179 0 0 Strings ended simultaneously, "a" and "b" are equal.
180 0 >0 "a" is a prefix of "b", so "a" is smaller.
181 >0 0 "b" is a prefix of "a", check b_is_prefix.
182 >0 >0 Two weights were scanned, check weight difference.
183 */
184 if (!a_wlen)
185 return b_wlen ? -b_weight : 0;
186
187 if (!b_wlen)
188 return b_is_prefix ? 0 : a_weight;
189
190 if ((res= (a_weight - b_weight)))
191 return res;
192 /*
193 None of the strings has ended yet.
194 */
195 DBUG_ASSERT(a < a_end);
196 DBUG_ASSERT(b < b_end);
197 a+= a_wlen;
198 b+= b_wlen;
199 }
200 DBUG_ASSERT(0);
201 return 0;
202}
203
204
205#ifdef DEFINE_STRNNCOLLSP_NOPAD
206
207/**
208 Compare two strings according to the collation, with NO PAD handling.
209
210 @param cs - the character set and collation
211 @param a - the left string
212 @param a_length - the length of the left string
213 @param b - the right string
214 @param b_length - the length of the right string
215 @return - the comparison result
216*/
217static int
218MY_FUNCTION_NAME(strnncollsp)(CHARSET_INFO *cs __attribute__((unused)),
219 const uchar *a, size_t a_length,
220 const uchar *b, size_t b_length)
221{
222 return MY_FUNCTION_NAME(strnncoll)(cs, a, a_length, b, b_length, FALSE);
223}
224#else
225/**
226 Compare two strings according to the collation, with PAD SPACE handling.
227
228 @param cs - the character set and collation
229 @param a - the left string
230 @param a_length - the length of the left string
231 @param b - the right string
232 @param b_length - the length of the right string
233 @return - the comparison result
234*/
235static int
236MY_FUNCTION_NAME(strnncollsp)(CHARSET_INFO *cs __attribute__((unused)),
237 const uchar *a, size_t a_length,
238 const uchar *b, size_t b_length)
239{
240 const uchar *a_end= a + a_length;
241 const uchar *b_end= b + b_length;
242 for ( ; ; )
243 {
244 int a_weight, b_weight, res;
245 uint a_wlen= MY_FUNCTION_NAME(scan_weight)(&a_weight, a, a_end);
246 uint b_wlen= MY_FUNCTION_NAME(scan_weight)(&b_weight, b, b_end);
247 if ((res= (a_weight - b_weight)))
248 {
249 /*
250 Got two different weights. Each weight can be generated by either of:
251 - a real character
252 - a bad byte sequence or an incomplete byte sequence
253 - an auto-generated trailing space (PAD SPACE)
254 It does not matter how exactly each weight was generated.
255 Just return the weight difference.
256 */
257 return res;
258 }
259 if (!a_wlen && !b_wlen)
260 {
261 /*
262 Got two auto-generated trailing spaces, i.e.
263 both strings have now ended, so they are equal.
264 */
265 DBUG_ASSERT(a == a_end);
266 DBUG_ASSERT(b == b_end);
267 return 0;
268 }
269 /*
270 At least one of the strings has not ended yet, continue comparison.
271 */
272 DBUG_ASSERT(a < a_end || b < b_end);
273 a+= a_wlen;
274 b+= b_wlen;
275 }
276 DBUG_ASSERT(0);
277 return 0;
278}
279#endif
280
281
#ifdef DEFINE_STRNXFRM
/*
  Weight of a two-byte character as written by strnxfrm().
  Falls back to WEIGHT_MB2() unless the including file overrides it.
*/
#if !defined(WEIGHT_MB2_FRM)
#define WEIGHT_MB2_FRM(x,y) WEIGHT_MB2(x,y)
#endif

/**
  Write the weights of the source string into the destination buffer.

  Characters for which my_charlen() reports more than one byte are written
  as a two-byte weight, high byte first; all other bytes are mapped through
  cs->sort_order when it is set, or copied as is otherwise.

  @param cs        - the character set and collation
  @param dst       - the destination buffer
  @param dstlen    - the size of the destination buffer, in bytes
  @param nweights  - the maximum number of weights to produce
  @param src       - the source string
  @param srclen    - the length of the source string, in bytes
  @param flags     - passed through to the padding helper
  @return          - the value returned by the padding helper
                     (presumably the resulting length - verify against
                     my_strxfrm_pad_desc_and_reverse())
*/
static size_t
MY_FUNCTION_NAME(strnxfrm)(CHARSET_INFO *cs,
                           uchar *dst, size_t dstlen, uint nweights,
                           const uchar *src, size_t srclen, uint flags)
{
  uchar *const dst_begin= dst;
  uchar *const dst_end= dst + dstlen;
  const uchar *const src_end= src + srclen;
  const uchar *const sort_order= cs->sort_order;

  while (dst < dst_end && src < src_end && nweights)
  {
    if (my_charlen(cs, (const char *) src, (const char *) src_end) > 1)
    {
      /*
        Note, it is safe not to check (src < src_end)
        in the code below, because my_charlen() would
        not return 2 if src was too short
      */
      const uint16 mb_weight= WEIGHT_MB2_FRM(src[0], src[1]);
      *dst++= (uchar) (mb_weight >> 8);
      if (dst < dst_end)             /* The low byte is dropped if no room */
        *dst++= (uchar) (mb_weight & 0xFF);
      src+= 2;
    }
    else
    {
      const uchar byte= *src++;
      *dst++= sort_order ? sort_order[byte] : byte;
    }
    nweights--;
  }
#ifdef DEFINE_STRNNCOLLSP_NOPAD
  return my_strxfrm_pad_desc_and_reverse_nopad(cs, dst_begin, dst, dst_end,
                                               nweights, flags, 0);
#else
  return my_strxfrm_pad_desc_and_reverse(cs, dst_begin, dst, dst_end,
                                         nweights, flags, 0);
#endif
}
#endif /* DEFINE_STRNXFRM */
323
324
/*
  This file is usually included at least two times from the same source
  file, for the _ci and the _bin collations. Drop the per-inclusion macros
  here, so the next inclusion can define them afresh.
*/

/* Name generator and optional function switches */
#undef MY_FUNCTION_NAME
#undef DEFINE_STRNXFRM
#undef DEFINE_STRNNCOLLSP_NOPAD

/* Weight macros */
#undef WEIGHT_PAD_SPACE
#undef WEIGHT_ILSEQ
#undef WEIGHT_MB1
#undef WEIGHT_MB2
#undef WEIGHT_MB2_FRM
#undef WEIGHT_MB3
#undef WEIGHT_MB4
339