| 1 | /* |
| 2 | Copyright (c) 2015, MariaDB Foundation |
| 3 | |
| 4 | This program is free software; you can redistribute it and/or modify |
| 5 | it under the terms of the GNU General Public License as published by |
| 6 | the Free Software Foundation; version 2 of the License. |
| 7 | |
| 8 | This program is distributed in the hope that it will be useful, |
| 9 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 10 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 11 | GNU General Public License for more details. |
| 12 | |
| 13 | You should have received a copy of the GNU General Public License |
| 14 | along with this program; if not, write to the Free Software |
| 15 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA |
| 16 | */ |
| 17 | |
| 18 | |
/*
  MY_FUNCTION_NAME(x) must be defined by the including source file
  before this file is included: it is used below to produce the name
  of every function defined in this file.
*/
#ifndef MY_FUNCTION_NAME
#error MY_FUNCTION_NAME is not defined
#endif
| 22 | |
| 23 | |
/*
  The weight used for automatically padded spaces when comparing strings
  with the PAD SPACE property, i.e. the weight reported for a string
  that has already ended.
  Should normally be equal to the weight of a regular space.

  This is only the default: the including source file may define
  WEIGHT_PAD_SPACE itself before including this file.
*/
#ifndef WEIGHT_PAD_SPACE
#define WEIGHT_PAD_SPACE (' ')
#endif
| 32 | |
| 33 | |
/*
  Weight of an illegal byte. It must follow these rules:
  1. It must be greater than the weight of any normal character
     in the collation.
  2. Two different bad bytes must have different weights and must
     compare in their binary order.

  The right definition depends on mbmaxlen of the character set, as well
  as on how the collation sorts its single-byte and multi-byte
  character blocks.

  The macro below is the default definition. It is suitable for mbmaxlen=2
  character sets that sort all multi-byte characters after all single-byte
  characters: big5, euckr, gb2312, gbk.

  All mbmaxlen>2 character sets must provide their own definitions.
  All collations that have a more complex order (than just MB1 followed
  by MB2) must also provide their own definitions (see the definitions for
  cp932_japanese_ci and sjis_japanese_ci as examples of a more complex order).
*/
#ifndef WEIGHT_ILSEQ
#define WEIGHT_ILSEQ(x) (0xFF00 + (x))
#endif
| 55 | |
| 56 | |
| 57 | /** |
| 58 | Scan a valid character, or a bad byte, or an auto-padded space |
| 59 | from a string and calculate the weight of the scanned sequence. |
| 60 | |
| 61 | @param [OUT] weight - the weight is returned here |
| 62 | @param str - the string |
| 63 | @param end - the end of the string |
| 64 | @return - the number of bytes scanned |
| 65 | |
| 66 | The including source file must define the following macros: |
| 67 | IS_MB1_CHAR(b0) - for character sets that have MB1 characters |
| 68 | IS_MB1_MB2HEAD_GAP(b0) - optional, for better performance |
| 69 | IS_MB2_CHAR(b0,b1) - for character sets that have MB2 characters |
| 70 | IS_MB3_CHAR(b0,b1,b2) - for character sets that have MB3 characters |
| 71 | IS_MB4_CHAR(b0,b1,b2,b3) - for character sets with have MB4 characters |
| 72 | WEIGHT_PAD_SPACE |
| 73 | WEIGHT_MB1(b0) - for character sets that have MB1 characters |
| 74 | WEIGHT_MB2(b0,b1) - for character sets that have MB2 characters |
| 75 | WEIGHT_MB3(b0,b1,b2) - for character sets that have MB3 characters |
| 76 | WEIGHT_MB4(b0,b1,b2,b3) - for character sets that have MB4 characters |
| 77 | WEIGHT_ILSEQ(x) |
| 78 | */ |
| 79 | static inline uint |
| 80 | MY_FUNCTION_NAME(scan_weight)(int *weight, const uchar *str, const uchar *end) |
| 81 | { |
| 82 | if (str >= end) |
| 83 | { |
| 84 | *weight= WEIGHT_PAD_SPACE; |
| 85 | return 0; |
| 86 | } |
| 87 | |
| 88 | #ifdef IS_MB1_CHAR |
| 89 | if (IS_MB1_CHAR(*str)) |
| 90 | { |
| 91 | *weight= WEIGHT_MB1(*str); /* A valid single byte character*/ |
| 92 | return 1; |
| 93 | } |
| 94 | #endif |
| 95 | |
| 96 | #ifdef IS_MB1_MBHEAD_UNUSED_GAP |
| 97 | /* |
| 98 | Quickly filter out unused bytes that are neither MB1 nor MBHEAD. |
| 99 | E.g. [0x80..0xC1] in utf8. This allows using simplified conditions |
| 100 | in IS_MB2_CHAR(), IS_MB3_CHAR(), etc. |
| 101 | */ |
| 102 | if (IS_MB1_MBHEAD_UNUSED_GAP(*str)) |
| 103 | goto bad; |
| 104 | #endif |
| 105 | |
| 106 | #ifdef IS_MB2_CHAR |
| 107 | if (str + 2 > end) /* The string ended unexpectedly */ |
| 108 | goto bad; /* Treat as a bad byte */ |
| 109 | |
| 110 | if (IS_MB2_CHAR(str[0], str[1])) |
| 111 | { |
| 112 | *weight= WEIGHT_MB2(str[0], str[1]); |
| 113 | return 2; /* A valid two-byte character */ |
| 114 | } |
| 115 | #endif |
| 116 | |
| 117 | #ifdef IS_MB3_CHAR |
| 118 | if (str + 3 > end) /* Incomplete three-byte character */ |
| 119 | goto bad; |
| 120 | |
| 121 | if (IS_MB3_CHAR(str[0], str[1], str[2])) |
| 122 | { |
| 123 | *weight= WEIGHT_MB3(str[0], str[1], str[2]); |
| 124 | return 3; /* A valid three-byte character */ |
| 125 | } |
| 126 | #endif |
| 127 | |
| 128 | #ifdef IS_MB4_CHAR |
| 129 | if (str + 4 > end) /* Incomplete four-byte character */ |
| 130 | goto bad; |
| 131 | |
| 132 | if (IS_MB4_CHAR(str[0], str[1], str[2], str[3])) |
| 133 | { |
| 134 | *weight= WEIGHT_MB4(str[0], str[1], str[2], str[3]); |
| 135 | return 4; /* A valid four-byte character */ |
| 136 | } |
| 137 | |
| 138 | #endif |
| 139 | |
| 140 | bad: |
| 141 | *weight= WEIGHT_ILSEQ(str[0]); /* Bad byte */ |
| 142 | return 1; |
| 143 | } |
| 144 | |
| 145 | |
| 146 | /** |
| 147 | Compare two strings according to the collation, |
| 148 | without handling the PAD SPACE property. |
| 149 | |
| 150 | Note, cs->coll->strnncoll() is usually used to compare identifiers. |
| 151 | Perhaps we should eventually (in 10.2?) create a new collation |
| 152 | my_charset_utf8_general_ci_no_pad and have only one comparison function |
| 153 | in MY_COLLATION_HANDLER. |
| 154 | |
| 155 | @param cs - the character set and collation |
| 156 | @param a - the left string |
| 157 | @param a_length - the length of the left string |
| 158 | @param b - the right string |
| 159 | @param b_length - the length of the right string |
| 160 | @param b_is_prefix - if the caller wants to check if "b" is a prefix of "a" |
| 161 | @return - the comparison result |
| 162 | */ |
| 163 | static int |
| 164 | MY_FUNCTION_NAME(strnncoll)(CHARSET_INFO *cs __attribute__((unused)), |
| 165 | const uchar *a, size_t a_length, |
| 166 | const uchar *b, size_t b_length, |
| 167 | my_bool b_is_prefix) |
| 168 | { |
| 169 | const uchar *a_end= a + a_length; |
| 170 | const uchar *b_end= b + b_length; |
| 171 | for ( ; ; ) |
| 172 | { |
| 173 | int a_weight, b_weight, res; |
| 174 | uint a_wlen= MY_FUNCTION_NAME(scan_weight)(&a_weight, a, a_end); |
| 175 | uint b_wlen= MY_FUNCTION_NAME(scan_weight)(&b_weight, b, b_end); |
| 176 | /* |
| 177 | a_wlen b_wlen Comment |
| 178 | ------ ------ ------- |
| 179 | 0 0 Strings ended simultaneously, "a" and "b" are equal. |
| 180 | 0 >0 "a" is a prefix of "b", so "a" is smaller. |
| 181 | >0 0 "b" is a prefix of "a", check b_is_prefix. |
| 182 | >0 >0 Two weights were scanned, check weight difference. |
| 183 | */ |
| 184 | if (!a_wlen) |
| 185 | return b_wlen ? -b_weight : 0; |
| 186 | |
| 187 | if (!b_wlen) |
| 188 | return b_is_prefix ? 0 : a_weight; |
| 189 | |
| 190 | if ((res= (a_weight - b_weight))) |
| 191 | return res; |
| 192 | /* |
| 193 | None of the strings has ended yet. |
| 194 | */ |
| 195 | DBUG_ASSERT(a < a_end); |
| 196 | DBUG_ASSERT(b < b_end); |
| 197 | a+= a_wlen; |
| 198 | b+= b_wlen; |
| 199 | } |
| 200 | DBUG_ASSERT(0); |
| 201 | return 0; |
| 202 | } |
| 203 | |
| 204 | |
| 205 | #ifdef DEFINE_STRNNCOLLSP_NOPAD |
| 206 | |
| 207 | /** |
| 208 | Compare two strings according to the collation, with NO PAD handling. |
| 209 | |
| 210 | @param cs - the character set and collation |
| 211 | @param a - the left string |
| 212 | @param a_length - the length of the left string |
| 213 | @param b - the right string |
| 214 | @param b_length - the length of the right string |
| 215 | @return - the comparison result |
| 216 | */ |
| 217 | static int |
| 218 | MY_FUNCTION_NAME(strnncollsp)(CHARSET_INFO *cs __attribute__((unused)), |
| 219 | const uchar *a, size_t a_length, |
| 220 | const uchar *b, size_t b_length) |
| 221 | { |
| 222 | return MY_FUNCTION_NAME(strnncoll)(cs, a, a_length, b, b_length, FALSE); |
| 223 | } |
| 224 | #else |
| 225 | /** |
| 226 | Compare two strings according to the collation, with PAD SPACE handling. |
| 227 | |
| 228 | @param cs - the character set and collation |
| 229 | @param a - the left string |
| 230 | @param a_length - the length of the left string |
| 231 | @param b - the right string |
| 232 | @param b_length - the length of the right string |
| 233 | @return - the comparison result |
| 234 | */ |
| 235 | static int |
| 236 | MY_FUNCTION_NAME(strnncollsp)(CHARSET_INFO *cs __attribute__((unused)), |
| 237 | const uchar *a, size_t a_length, |
| 238 | const uchar *b, size_t b_length) |
| 239 | { |
| 240 | const uchar *a_end= a + a_length; |
| 241 | const uchar *b_end= b + b_length; |
| 242 | for ( ; ; ) |
| 243 | { |
| 244 | int a_weight, b_weight, res; |
| 245 | uint a_wlen= MY_FUNCTION_NAME(scan_weight)(&a_weight, a, a_end); |
| 246 | uint b_wlen= MY_FUNCTION_NAME(scan_weight)(&b_weight, b, b_end); |
| 247 | if ((res= (a_weight - b_weight))) |
| 248 | { |
| 249 | /* |
| 250 | Got two different weights. Each weight can be generated by either of: |
| 251 | - a real character |
| 252 | - a bad byte sequence or an incomplete byte sequence |
| 253 | - an auto-generated trailing space (PAD SPACE) |
| 254 | It does not matter how exactly each weight was generated. |
| 255 | Just return the weight difference. |
| 256 | */ |
| 257 | return res; |
| 258 | } |
| 259 | if (!a_wlen && !b_wlen) |
| 260 | { |
| 261 | /* |
| 262 | Got two auto-generated trailing spaces, i.e. |
| 263 | both strings have now ended, so they are equal. |
| 264 | */ |
| 265 | DBUG_ASSERT(a == a_end); |
| 266 | DBUG_ASSERT(b == b_end); |
| 267 | return 0; |
| 268 | } |
| 269 | /* |
| 270 | At least one of the strings has not ended yet, continue comparison. |
| 271 | */ |
| 272 | DBUG_ASSERT(a < a_end || b < b_end); |
| 273 | a+= a_wlen; |
| 274 | b+= b_wlen; |
| 275 | } |
| 276 | DBUG_ASSERT(0); |
| 277 | return 0; |
| 278 | } |
| 279 | #endif |
| 280 | |
| 281 | |
| 282 | #ifdef DEFINE_STRNXFRM |
| 283 | #ifndef WEIGHT_MB2_FRM |
| 284 | #define WEIGHT_MB2_FRM(x,y) WEIGHT_MB2(x,y) |
| 285 | #endif |
| 286 | |
| 287 | static size_t |
| 288 | MY_FUNCTION_NAME(strnxfrm)(CHARSET_INFO *cs, |
| 289 | uchar *dst, size_t dstlen, uint nweights, |
| 290 | const uchar *src, size_t srclen, uint flags) |
| 291 | { |
| 292 | uchar *d0= dst; |
| 293 | uchar *de= dst + dstlen; |
| 294 | const uchar *se= src + srclen; |
| 295 | const uchar *sort_order= cs->sort_order; |
| 296 | |
| 297 | for (; dst < de && src < se && nweights; nweights--) |
| 298 | { |
| 299 | if (my_charlen(cs, (const char *) src, (const char *) se) > 1) |
| 300 | { |
| 301 | /* |
| 302 | Note, it is safe not to check (src < se) |
| 303 | in the code below, because my_charlen() would |
| 304 | not return 2 if src was too short |
| 305 | */ |
| 306 | uint16 e= WEIGHT_MB2_FRM(src[0], src[1]); |
| 307 | *dst++= (uchar) (e >> 8); |
| 308 | if (dst < de) |
| 309 | *dst++= (uchar) (e & 0xFF); |
| 310 | src+= 2; |
| 311 | } |
| 312 | else |
| 313 | *dst++= sort_order ? sort_order[*src++] : *src++; |
| 314 | } |
| 315 | #ifdef DEFINE_STRNNCOLLSP_NOPAD |
| 316 | return my_strxfrm_pad_desc_and_reverse_nopad(cs, d0, dst, de, |
| 317 | nweights, flags, 0); |
| 318 | #else |
| 319 | return my_strxfrm_pad_desc_and_reverse(cs, d0, dst, de, nweights, flags, 0); |
| 320 | #endif |
| 321 | } |
| 322 | #endif /* DEFINE_STRNXFRM */ |
| 323 | |
| 324 | |
/*
  We usually include this file at least two times from the same source file,
  for the _ci and the _bin collations. Prepare for the second inclusion:
  drop the function name macro, the weight macros and the DEFINE_* feature
  switches, so that the next inclusion starts from a clean state and
  can supply its own definitions (or fall back to the defaults above).
*/
#undef MY_FUNCTION_NAME
#undef WEIGHT_ILSEQ
#undef WEIGHT_MB1
#undef WEIGHT_MB2
#undef WEIGHT_MB3
#undef WEIGHT_MB4
#undef WEIGHT_PAD_SPACE
#undef WEIGHT_MB2_FRM
#undef DEFINE_STRNXFRM
#undef DEFINE_STRNNCOLLSP_NOPAD
| 339 | |