1 | /* |
2 | Copyright (c) 2015, MariaDB Foundation |
3 | |
4 | This program is free software; you can redistribute it and/or modify |
5 | it under the terms of the GNU General Public License as published by |
6 | the Free Software Foundation; version 2 of the License. |
7 | |
8 | This program is distributed in the hope that it will be useful, |
9 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11 | GNU General Public License for more details. |
12 | |
13 | You should have received a copy of the GNU General Public License |
14 | along with this program; if not, write to the Free Software |
15 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA |
16 | */ |
17 | |
18 | |
19 | #ifndef MY_FUNCTION_NAME |
20 | #error MY_FUNCTION_NAME is not defined |
21 | #endif |
22 | |
23 | |
24 | /* |
25 | The weight for automatically padded spaces when comparing strings with |
26 | the PAD SPACE property. |
27 | Should normally be equal to the weight of a regular space. |
28 | */ |
29 | #ifndef WEIGHT_PAD_SPACE |
30 | #define WEIGHT_PAD_SPACE (' ') |
31 | #endif |
32 | |
33 | |
34 | /* |
35 | Weight of an illegal byte, must follow these rules: |
36 | 1. Must be greater than weight of any normal character in the collation. |
37 | 2. Two different bad bytes must have different weights and must be |
38 | compared in their binary order. |
39 | |
40 | Depends on mbmaxlen of the character set, as well as how the collation |
41 | sorts various single-byte and multi-byte character blocks. |
42 | |
43 | The macro below is the default definition, it is suitable for mbmaxlen=2 |
44 | character sets that sort all multi-byte characters after all single-byte |
45 | characters: big5, euckr, gb2312, gbk. |
46 | |
47 | All mbmaxlen>2 character sets must provide their own definitions. |
48 | All collations that have a more complex order (than just MB1 followed by MB2) |
49 | must also provide their own definitions (see definitions for |
50 | cp932_japanese_ci and sjis_japanese_ci as examples of a more complex order). |
51 | */ |
52 | #ifndef WEIGHT_ILSEQ |
53 | #define WEIGHT_ILSEQ(x) (0xFF00 + (x)) |
54 | #endif |
55 | |
56 | |
57 | /** |
58 | Scan a valid character, or a bad byte, or an auto-padded space |
59 | from a string and calculate the weight of the scanned sequence. |
60 | |
61 | @param [OUT] weight - the weight is returned here |
62 | @param str - the string |
63 | @param end - the end of the string |
64 | @return - the number of bytes scanned |
65 | |
66 | The including source file must define the following macros: |
67 | IS_MB1_CHAR(b0) - for character sets that have MB1 characters |
68 | IS_MB1_MB2HEAD_GAP(b0) - optional, for better performance |
69 | IS_MB2_CHAR(b0,b1) - for character sets that have MB2 characters |
70 | IS_MB3_CHAR(b0,b1,b2) - for character sets that have MB3 characters |
71 | IS_MB4_CHAR(b0,b1,b2,b3) - for character sets with have MB4 characters |
72 | WEIGHT_PAD_SPACE |
73 | WEIGHT_MB1(b0) - for character sets that have MB1 characters |
74 | WEIGHT_MB2(b0,b1) - for character sets that have MB2 characters |
75 | WEIGHT_MB3(b0,b1,b2) - for character sets that have MB3 characters |
76 | WEIGHT_MB4(b0,b1,b2,b3) - for character sets that have MB4 characters |
77 | WEIGHT_ILSEQ(x) |
78 | */ |
79 | static inline uint |
80 | MY_FUNCTION_NAME(scan_weight)(int *weight, const uchar *str, const uchar *end) |
81 | { |
82 | if (str >= end) |
83 | { |
84 | *weight= WEIGHT_PAD_SPACE; |
85 | return 0; |
86 | } |
87 | |
88 | #ifdef IS_MB1_CHAR |
89 | if (IS_MB1_CHAR(*str)) |
90 | { |
91 | *weight= WEIGHT_MB1(*str); /* A valid single byte character*/ |
92 | return 1; |
93 | } |
94 | #endif |
95 | |
96 | #ifdef IS_MB1_MBHEAD_UNUSED_GAP |
97 | /* |
98 | Quickly filter out unused bytes that are neither MB1 nor MBHEAD. |
99 | E.g. [0x80..0xC1] in utf8. This allows using simplified conditions |
100 | in IS_MB2_CHAR(), IS_MB3_CHAR(), etc. |
101 | */ |
102 | if (IS_MB1_MBHEAD_UNUSED_GAP(*str)) |
103 | goto bad; |
104 | #endif |
105 | |
106 | #ifdef IS_MB2_CHAR |
107 | if (str + 2 > end) /* The string ended unexpectedly */ |
108 | goto bad; /* Treat as a bad byte */ |
109 | |
110 | if (IS_MB2_CHAR(str[0], str[1])) |
111 | { |
112 | *weight= WEIGHT_MB2(str[0], str[1]); |
113 | return 2; /* A valid two-byte character */ |
114 | } |
115 | #endif |
116 | |
117 | #ifdef IS_MB3_CHAR |
118 | if (str + 3 > end) /* Incomplete three-byte character */ |
119 | goto bad; |
120 | |
121 | if (IS_MB3_CHAR(str[0], str[1], str[2])) |
122 | { |
123 | *weight= WEIGHT_MB3(str[0], str[1], str[2]); |
124 | return 3; /* A valid three-byte character */ |
125 | } |
126 | #endif |
127 | |
128 | #ifdef IS_MB4_CHAR |
129 | if (str + 4 > end) /* Incomplete four-byte character */ |
130 | goto bad; |
131 | |
132 | if (IS_MB4_CHAR(str[0], str[1], str[2], str[3])) |
133 | { |
134 | *weight= WEIGHT_MB4(str[0], str[1], str[2], str[3]); |
135 | return 4; /* A valid four-byte character */ |
136 | } |
137 | |
138 | #endif |
139 | |
140 | bad: |
141 | *weight= WEIGHT_ILSEQ(str[0]); /* Bad byte */ |
142 | return 1; |
143 | } |
144 | |
145 | |
146 | /** |
147 | Compare two strings according to the collation, |
148 | without handling the PAD SPACE property. |
149 | |
150 | Note, cs->coll->strnncoll() is usually used to compare identifiers. |
151 | Perhaps we should eventually (in 10.2?) create a new collation |
152 | my_charset_utf8_general_ci_no_pad and have only one comparison function |
153 | in MY_COLLATION_HANDLER. |
154 | |
155 | @param cs - the character set and collation |
156 | @param a - the left string |
157 | @param a_length - the length of the left string |
158 | @param b - the right string |
159 | @param b_length - the length of the right string |
160 | @param b_is_prefix - if the caller wants to check if "b" is a prefix of "a" |
161 | @return - the comparison result |
162 | */ |
163 | static int |
164 | MY_FUNCTION_NAME(strnncoll)(CHARSET_INFO *cs __attribute__((unused)), |
165 | const uchar *a, size_t a_length, |
166 | const uchar *b, size_t b_length, |
167 | my_bool b_is_prefix) |
168 | { |
169 | const uchar *a_end= a + a_length; |
170 | const uchar *b_end= b + b_length; |
171 | for ( ; ; ) |
172 | { |
173 | int a_weight, b_weight, res; |
174 | uint a_wlen= MY_FUNCTION_NAME(scan_weight)(&a_weight, a, a_end); |
175 | uint b_wlen= MY_FUNCTION_NAME(scan_weight)(&b_weight, b, b_end); |
176 | /* |
177 | a_wlen b_wlen Comment |
178 | ------ ------ ------- |
179 | 0 0 Strings ended simultaneously, "a" and "b" are equal. |
180 | 0 >0 "a" is a prefix of "b", so "a" is smaller. |
181 | >0 0 "b" is a prefix of "a", check b_is_prefix. |
182 | >0 >0 Two weights were scanned, check weight difference. |
183 | */ |
184 | if (!a_wlen) |
185 | return b_wlen ? -b_weight : 0; |
186 | |
187 | if (!b_wlen) |
188 | return b_is_prefix ? 0 : a_weight; |
189 | |
190 | if ((res= (a_weight - b_weight))) |
191 | return res; |
192 | /* |
193 | None of the strings has ended yet. |
194 | */ |
195 | DBUG_ASSERT(a < a_end); |
196 | DBUG_ASSERT(b < b_end); |
197 | a+= a_wlen; |
198 | b+= b_wlen; |
199 | } |
200 | DBUG_ASSERT(0); |
201 | return 0; |
202 | } |
203 | |
204 | |
205 | #ifdef DEFINE_STRNNCOLLSP_NOPAD |
206 | |
207 | /** |
208 | Compare two strings according to the collation, with NO PAD handling. |
209 | |
210 | @param cs - the character set and collation |
211 | @param a - the left string |
212 | @param a_length - the length of the left string |
213 | @param b - the right string |
214 | @param b_length - the length of the right string |
215 | @return - the comparison result |
216 | */ |
217 | static int |
218 | MY_FUNCTION_NAME(strnncollsp)(CHARSET_INFO *cs __attribute__((unused)), |
219 | const uchar *a, size_t a_length, |
220 | const uchar *b, size_t b_length) |
221 | { |
222 | return MY_FUNCTION_NAME(strnncoll)(cs, a, a_length, b, b_length, FALSE); |
223 | } |
224 | #else |
225 | /** |
226 | Compare two strings according to the collation, with PAD SPACE handling. |
227 | |
228 | @param cs - the character set and collation |
229 | @param a - the left string |
230 | @param a_length - the length of the left string |
231 | @param b - the right string |
232 | @param b_length - the length of the right string |
233 | @return - the comparison result |
234 | */ |
235 | static int |
236 | MY_FUNCTION_NAME(strnncollsp)(CHARSET_INFO *cs __attribute__((unused)), |
237 | const uchar *a, size_t a_length, |
238 | const uchar *b, size_t b_length) |
239 | { |
240 | const uchar *a_end= a + a_length; |
241 | const uchar *b_end= b + b_length; |
242 | for ( ; ; ) |
243 | { |
244 | int a_weight, b_weight, res; |
245 | uint a_wlen= MY_FUNCTION_NAME(scan_weight)(&a_weight, a, a_end); |
246 | uint b_wlen= MY_FUNCTION_NAME(scan_weight)(&b_weight, b, b_end); |
247 | if ((res= (a_weight - b_weight))) |
248 | { |
249 | /* |
250 | Got two different weights. Each weight can be generated by either of: |
251 | - a real character |
252 | - a bad byte sequence or an incomplete byte sequence |
253 | - an auto-generated trailing space (PAD SPACE) |
254 | It does not matter how exactly each weight was generated. |
255 | Just return the weight difference. |
256 | */ |
257 | return res; |
258 | } |
259 | if (!a_wlen && !b_wlen) |
260 | { |
261 | /* |
262 | Got two auto-generated trailing spaces, i.e. |
263 | both strings have now ended, so they are equal. |
264 | */ |
265 | DBUG_ASSERT(a == a_end); |
266 | DBUG_ASSERT(b == b_end); |
267 | return 0; |
268 | } |
269 | /* |
270 | At least one of the strings has not ended yet, continue comparison. |
271 | */ |
272 | DBUG_ASSERT(a < a_end || b < b_end); |
273 | a+= a_wlen; |
274 | b+= b_wlen; |
275 | } |
276 | DBUG_ASSERT(0); |
277 | return 0; |
278 | } |
279 | #endif |
280 | |
281 | |
#ifdef DEFINE_STRNXFRM
/* Unless overridden, strnxfrm() uses the same MB2 weights as comparison */
#ifndef WEIGHT_MB2_FRM
#define WEIGHT_MB2_FRM(x,y)  WEIGHT_MB2(x,y)
#endif

/**
  Write the weights of the characters of "src" into "dst".

  A character for which my_charlen() reports more than one byte is handled
  as a two-byte character: its WEIGHT_MB2_FRM() value is stored as two bytes,
  high byte first (the low byte is dropped if "dst" is full).
  Any other byte is stored as is, or mapped through cs->sort_order
  when that table is present.

  @param cs       - the character set and collation
  @param dst      - the destination buffer
  @param dstlen   - the size of the destination buffer
  @param nweights - the maximum number of characters to convert
  @param src      - the source string
  @param srclen   - the length of the source string
  @param flags    - passed through to the padding helper
  @return         - the value returned by the padding helper
                    (presumably the length of the result - to be confirmed
                    against my_strxfrm_pad_desc_and_reverse())
*/
static size_t
MY_FUNCTION_NAME(strnxfrm)(CHARSET_INFO *cs,
                           uchar *dst, size_t dstlen, uint nweights,
                           const uchar *src, size_t srclen, uint flags)
{
  uchar *const dst_begin= dst;
  uchar *const dst_end= dst + dstlen;
  const uchar *const src_end= src + srclen;
  const uchar *const map= cs->sort_order;

  while (dst < dst_end && src < src_end && nweights)
  {
    if (my_charlen(cs, (const char *) src, (const char *) src_end) <= 1)
    {
      /* A single byte: copy it, translated if a sort order table exists */
      const uchar ch= *src++;
      *dst++= map ? map[ch] : ch;
    }
    else
    {
      /*
        Reading src[1] needs no extra bounds check: my_charlen()
        would not report a multi-byte character if src was too short.
      */
      const uint16 w= WEIGHT_MB2_FRM(src[0], src[1]);
      src+= 2;
      *dst++= (uchar) (w >> 8);
      if (dst < dst_end)
        *dst++= (uchar) (w & 0xFF);
    }
    nweights--;
  }

#ifdef DEFINE_STRNNCOLLSP_NOPAD
  return my_strxfrm_pad_desc_and_reverse_nopad(cs, dst_begin, dst, dst_end,
                                               nweights, flags, 0);
#else
  return my_strxfrm_pad_desc_and_reverse(cs, dst_begin, dst, dst_end,
                                         nweights, flags, 0);
#endif
}
#endif /* DEFINE_STRNXFRM */
323 | |
324 | |
325 | /* |
326 | We usually include this file at least two times from the same source file, |
327 | for the _ci and the _bin collations. Prepare for the second inclusion. |
328 | */ |
329 | #undef MY_FUNCTION_NAME |
330 | #undef WEIGHT_ILSEQ |
331 | #undef WEIGHT_MB1 |
332 | #undef WEIGHT_MB2 |
333 | #undef WEIGHT_MB3 |
334 | #undef WEIGHT_MB4 |
335 | #undef WEIGHT_PAD_SPACE |
336 | #undef WEIGHT_MB2_FRM |
337 | #undef DEFINE_STRNXFRM |
338 | #undef DEFINE_STRNNCOLLSP_NOPAD |
339 | |