| 1 | // Scintilla source code edit control |
| 2 | // Encoding: UTF-8 |
| 3 | /** @file CaseConvert.cxx |
| 4 | ** Case fold characters and convert them to upper or lower case. |
| 5 | ** Tables automatically regenerated by scripts/GenerateCaseConvert.py |
| 6 | ** Should only be rarely regenerated for new versions of Unicode. |
| 7 | **/ |
| 8 | // Copyright 2013 by Neil Hodgson <neilh@scintilla.org> |
| 9 | // The License.txt file describes the conditions under which this software may be distributed. |
| 10 | |
| 11 | #include <cassert> |
| 12 | #include <cstring> |
| 13 | |
| 14 | #include <stdexcept> |
| 15 | #include <string> |
| 16 | #include <string_view> |
| 17 | #include <vector> |
| 18 | #include <algorithm> |
| 19 | |
| 20 | #include "CaseConvert.h" |
| 21 | #include "UniConversion.h" |
| 22 | |
| 23 | using namespace Scintilla::Internal; |
| 24 | |
| 25 | namespace { |
| 26 | // Use an unnamed namespace to protect the declarations from name conflicts |
| 27 | |
// Unicode code points are ordered in groups that follow regular patterns.
// Each row of this table describes one run of symmetric lower/upper pairs:
//   lower  - first lower case code point of the run
//   upper  - first upper case code point of the run
//   length - number of pairs in the run
//   pitch  - step between successive pairs
// With pitch==1 the letters of an alphabet are contiguous and their upper case
// forms are a fixed distance away. With pitch==2 upper and lower case forms
// alternate, each lower case letter being preceded by its upper case form.
// The table is only ever read, so it is constexpr to stop accidental writes.

constexpr int symmetricCaseConversionRanges[] = {
//lower, upper, range length, range pitch
//++Autogenerated -- start of section automatically generated
//**\(\*\n\)
97,65,26,1,
224,192,23,1,
248,216,7,1,
257,256,24,2,
314,313,8,2,
331,330,23,2,
462,461,8,2,
479,478,9,2,
505,504,20,2,
547,546,9,2,
583,582,5,2,
945,913,17,1,
963,931,9,1,
985,984,12,2,
1072,1040,32,1,
1104,1024,16,1,
1121,1120,17,2,
1163,1162,27,2,
1218,1217,7,2,
1233,1232,48,2,
1377,1329,38,1,
4304,7312,43,1,
7681,7680,75,2,
7841,7840,48,2,
7936,7944,8,1,
7952,7960,6,1,
7968,7976,8,1,
7984,7992,8,1,
8000,8008,6,1,
8032,8040,8,1,
8560,8544,16,1,
9424,9398,26,1,
11312,11264,47,1,
11393,11392,50,2,
11520,4256,38,1,
42561,42560,23,2,
42625,42624,14,2,
42787,42786,7,2,
42803,42802,31,2,
42879,42878,5,2,
42903,42902,10,2,
42933,42932,6,2,
65345,65313,26,1,
66600,66560,40,1,
66776,66736,36,1,
68800,68736,51,1,
71872,71840,32,1,
93792,93760,32,1,
125218,125184,34,1,

//--Autogenerated -- end of section automatically generated
};
| 90 | |
// Code points that are symmetric but don't fit into a range of similar
// characters are listed here, one (lower, upper) pair per row.
// The table is only ever read, so it is constexpr to stop accidental writes.

constexpr int symmetricCaseConversions[] = {
//lower, upper
//++Autogenerated -- start of section automatically generated
//**1 \(\*\n\)
255,376,
307,306,
309,308,
311,310,
378,377,
380,379,
382,381,
384,579,
387,386,
389,388,
392,391,
396,395,
402,401,
405,502,
409,408,
410,573,
414,544,
417,416,
419,418,
421,420,
424,423,
429,428,
432,431,
436,435,
438,437,
441,440,
445,444,
447,503,
454,452,
457,455,
460,458,
477,398,
499,497,
501,500,
572,571,
575,11390,
576,11391,
578,577,
592,11375,
593,11373,
594,11376,
595,385,
596,390,
598,393,
599,394,
601,399,
603,400,
604,42923,
608,403,
609,42924,
611,404,
613,42893,
614,42922,
616,407,
617,406,
618,42926,
619,11362,
620,42925,
623,412,
625,11374,
626,413,
629,415,
637,11364,
640,422,
642,42949,
643,425,
647,42929,
648,430,
649,580,
650,433,
651,434,
652,581,
658,439,
669,42930,
670,42928,
881,880,
883,882,
887,886,
891,1021,
892,1022,
893,1023,
940,902,
941,904,
942,905,
943,906,
972,908,
973,910,
974,911,
983,975,
1010,1017,
1011,895,
1016,1015,
1019,1018,
1231,1216,
4349,7357,
4350,7358,
4351,7359,
7545,42877,
7549,11363,
7566,42950,
8017,8025,
8019,8027,
8021,8029,
8023,8031,
8048,8122,
8049,8123,
8050,8136,
8051,8137,
8052,8138,
8053,8139,
8054,8154,
8055,8155,
8056,8184,
8057,8185,
8058,8170,
8059,8171,
8060,8186,
8061,8187,
8112,8120,
8113,8121,
8144,8152,
8145,8153,
8160,8168,
8161,8169,
8165,8172,
8526,8498,
8580,8579,
11361,11360,
11365,570,
11366,574,
11368,11367,
11370,11369,
11372,11371,
11379,11378,
11382,11381,
11500,11499,
11502,11501,
11507,11506,
11559,4295,
11565,4301,
42874,42873,
42876,42875,
42892,42891,
42897,42896,
42899,42898,
42900,42948,
42947,42946,
42952,42951,
42954,42953,
42998,42997,
43859,42931,

//--Autogenerated -- end of section automatically generated
};
| 252 | |
// Characters that have complex case conversions are listed here.
// This includes cases where more than one character is needed for a conversion,
// folding is different to lowering, or (as appropriate) upper(lower(x)) != x or
// lower(upper(x)) != x.
// The data is one long string of records, each made of four UTF-8 fields that
// are all terminated by '|': the original character followed by its folded,
// upper case and lower case forms. An empty field means there is no entry for
// that conversion.
// Both the pointer and the text are constant: the table is only ever read.

constexpr const char *complexCaseConversions =
// Original | Folded | Upper | Lower |
//++Autogenerated -- start of section automatically generated
//**2 \(\*\n\)
"\xc2\xb5|\xce\xbc|\xce\x9c||"
"\xc3\x9f|ss|SS||"
"\xc4\xb0|i\xcc\x87||i\xcc\x87|"
"\xc4\xb1||I||"
"\xc5\x89|\xca\xbcn|\xca\xbcN||"
"\xc5\xbf|s|S||"
"\xc7\x85|\xc7\x86|\xc7\x84|\xc7\x86|"
"\xc7\x88|\xc7\x89|\xc7\x87|\xc7\x89|"
"\xc7\x8b|\xc7\x8c|\xc7\x8a|\xc7\x8c|"
"\xc7\xb0|j\xcc\x8c|J\xcc\x8c||"
"\xc7\xb2|\xc7\xb3|\xc7\xb1|\xc7\xb3|"
"\xcd\x85|\xce\xb9|\xce\x99||"
"\xce\x90|\xce\xb9\xcc\x88\xcc\x81|\xce\x99\xcc\x88\xcc\x81||"
"\xce\xb0|\xcf\x85\xcc\x88\xcc\x81|\xce\xa5\xcc\x88\xcc\x81||"
"\xcf\x82|\xcf\x83|\xce\xa3||"
"\xcf\x90|\xce\xb2|\xce\x92||"
"\xcf\x91|\xce\xb8|\xce\x98||"
"\xcf\x95|\xcf\x86|\xce\xa6||"
"\xcf\x96|\xcf\x80|\xce\xa0||"
"\xcf\xb0|\xce\xba|\xce\x9a||"
"\xcf\xb1|\xcf\x81|\xce\xa1||"
"\xcf\xb4|\xce\xb8||\xce\xb8|"
"\xcf\xb5|\xce\xb5|\xce\x95||"
"\xd6\x87|\xd5\xa5\xd6\x82|\xd4\xb5\xd5\x92||"
"\xe1\x8e\xa0|||\xea\xad\xb0|"
"\xe1\x8e\xa1|||\xea\xad\xb1|"
"\xe1\x8e\xa2|||\xea\xad\xb2|"
"\xe1\x8e\xa3|||\xea\xad\xb3|"
"\xe1\x8e\xa4|||\xea\xad\xb4|"
"\xe1\x8e\xa5|||\xea\xad\xb5|"
"\xe1\x8e\xa6|||\xea\xad\xb6|"
"\xe1\x8e\xa7|||\xea\xad\xb7|"
"\xe1\x8e\xa8|||\xea\xad\xb8|"
"\xe1\x8e\xa9|||\xea\xad\xb9|"
"\xe1\x8e\xaa|||\xea\xad\xba|"
"\xe1\x8e\xab|||\xea\xad\xbb|"
"\xe1\x8e\xac|||\xea\xad\xbc|"
"\xe1\x8e\xad|||\xea\xad\xbd|"
"\xe1\x8e\xae|||\xea\xad\xbe|"
"\xe1\x8e\xaf|||\xea\xad\xbf|"
"\xe1\x8e\xb0|||\xea\xae\x80|"
"\xe1\x8e\xb1|||\xea\xae\x81|"
"\xe1\x8e\xb2|||\xea\xae\x82|"
"\xe1\x8e\xb3|||\xea\xae\x83|"
"\xe1\x8e\xb4|||\xea\xae\x84|"
"\xe1\x8e\xb5|||\xea\xae\x85|"
"\xe1\x8e\xb6|||\xea\xae\x86|"
"\xe1\x8e\xb7|||\xea\xae\x87|"
"\xe1\x8e\xb8|||\xea\xae\x88|"
"\xe1\x8e\xb9|||\xea\xae\x89|"
"\xe1\x8e\xba|||\xea\xae\x8a|"
"\xe1\x8e\xbb|||\xea\xae\x8b|"
"\xe1\x8e\xbc|||\xea\xae\x8c|"
"\xe1\x8e\xbd|||\xea\xae\x8d|"
"\xe1\x8e\xbe|||\xea\xae\x8e|"
"\xe1\x8e\xbf|||\xea\xae\x8f|"
"\xe1\x8f\x80|||\xea\xae\x90|"
"\xe1\x8f\x81|||\xea\xae\x91|"
"\xe1\x8f\x82|||\xea\xae\x92|"
"\xe1\x8f\x83|||\xea\xae\x93|"
"\xe1\x8f\x84|||\xea\xae\x94|"
"\xe1\x8f\x85|||\xea\xae\x95|"
"\xe1\x8f\x86|||\xea\xae\x96|"
"\xe1\x8f\x87|||\xea\xae\x97|"
"\xe1\x8f\x88|||\xea\xae\x98|"
"\xe1\x8f\x89|||\xea\xae\x99|"
"\xe1\x8f\x8a|||\xea\xae\x9a|"
"\xe1\x8f\x8b|||\xea\xae\x9b|"
"\xe1\x8f\x8c|||\xea\xae\x9c|"
"\xe1\x8f\x8d|||\xea\xae\x9d|"
"\xe1\x8f\x8e|||\xea\xae\x9e|"
"\xe1\x8f\x8f|||\xea\xae\x9f|"
"\xe1\x8f\x90|||\xea\xae\xa0|"
"\xe1\x8f\x91|||\xea\xae\xa1|"
"\xe1\x8f\x92|||\xea\xae\xa2|"
"\xe1\x8f\x93|||\xea\xae\xa3|"
"\xe1\x8f\x94|||\xea\xae\xa4|"
"\xe1\x8f\x95|||\xea\xae\xa5|"
"\xe1\x8f\x96|||\xea\xae\xa6|"
"\xe1\x8f\x97|||\xea\xae\xa7|"
"\xe1\x8f\x98|||\xea\xae\xa8|"
"\xe1\x8f\x99|||\xea\xae\xa9|"
"\xe1\x8f\x9a|||\xea\xae\xaa|"
"\xe1\x8f\x9b|||\xea\xae\xab|"
"\xe1\x8f\x9c|||\xea\xae\xac|"
"\xe1\x8f\x9d|||\xea\xae\xad|"
"\xe1\x8f\x9e|||\xea\xae\xae|"
"\xe1\x8f\x9f|||\xea\xae\xaf|"
"\xe1\x8f\xa0|||\xea\xae\xb0|"
"\xe1\x8f\xa1|||\xea\xae\xb1|"
"\xe1\x8f\xa2|||\xea\xae\xb2|"
"\xe1\x8f\xa3|||\xea\xae\xb3|"
"\xe1\x8f\xa4|||\xea\xae\xb4|"
"\xe1\x8f\xa5|||\xea\xae\xb5|"
"\xe1\x8f\xa6|||\xea\xae\xb6|"
"\xe1\x8f\xa7|||\xea\xae\xb7|"
"\xe1\x8f\xa8|||\xea\xae\xb8|"
"\xe1\x8f\xa9|||\xea\xae\xb9|"
"\xe1\x8f\xaa|||\xea\xae\xba|"
"\xe1\x8f\xab|||\xea\xae\xbb|"
"\xe1\x8f\xac|||\xea\xae\xbc|"
"\xe1\x8f\xad|||\xea\xae\xbd|"
"\xe1\x8f\xae|||\xea\xae\xbe|"
"\xe1\x8f\xaf|||\xea\xae\xbf|"
"\xe1\x8f\xb0|||\xe1\x8f\xb8|"
"\xe1\x8f\xb1|||\xe1\x8f\xb9|"
"\xe1\x8f\xb2|||\xe1\x8f\xba|"
"\xe1\x8f\xb3|||\xe1\x8f\xbb|"
"\xe1\x8f\xb4|||\xe1\x8f\xbc|"
"\xe1\x8f\xb5|||\xe1\x8f\xbd|"
"\xe1\x8f\xb8|\xe1\x8f\xb0|\xe1\x8f\xb0||"
"\xe1\x8f\xb9|\xe1\x8f\xb1|\xe1\x8f\xb1||"
"\xe1\x8f\xba|\xe1\x8f\xb2|\xe1\x8f\xb2||"
"\xe1\x8f\xbb|\xe1\x8f\xb3|\xe1\x8f\xb3||"
"\xe1\x8f\xbc|\xe1\x8f\xb4|\xe1\x8f\xb4||"
"\xe1\x8f\xbd|\xe1\x8f\xb5|\xe1\x8f\xb5||"
"\xe1\xb2\x80|\xd0\xb2|\xd0\x92||"
"\xe1\xb2\x81|\xd0\xb4|\xd0\x94||"
"\xe1\xb2\x82|\xd0\xbe|\xd0\x9e||"
"\xe1\xb2\x83|\xd1\x81|\xd0\xa1||"
"\xe1\xb2\x84|\xd1\x82|\xd0\xa2||"
"\xe1\xb2\x85|\xd1\x82|\xd0\xa2||"
"\xe1\xb2\x86|\xd1\x8a|\xd0\xaa||"
"\xe1\xb2\x87|\xd1\xa3|\xd1\xa2||"
"\xe1\xb2\x88|\xea\x99\x8b|\xea\x99\x8a||"
"\xe1\xba\x96|h\xcc\xb1|H\xcc\xb1||"
"\xe1\xba\x97|t\xcc\x88|T\xcc\x88||"
"\xe1\xba\x98|w\xcc\x8a|W\xcc\x8a||"
"\xe1\xba\x99|y\xcc\x8a|Y\xcc\x8a||"
"\xe1\xba\x9a|a\xca\xbe|A\xca\xbe||"
"\xe1\xba\x9b|\xe1\xb9\xa1|\xe1\xb9\xa0||"
"\xe1\xba\x9e|ss||\xc3\x9f|"
"\xe1\xbd\x90|\xcf\x85\xcc\x93|\xce\xa5\xcc\x93||"
"\xe1\xbd\x92|\xcf\x85\xcc\x93\xcc\x80|\xce\xa5\xcc\x93\xcc\x80||"
"\xe1\xbd\x94|\xcf\x85\xcc\x93\xcc\x81|\xce\xa5\xcc\x93\xcc\x81||"
"\xe1\xbd\x96|\xcf\x85\xcc\x93\xcd\x82|\xce\xa5\xcc\x93\xcd\x82||"
"\xe1\xbe\x80|\xe1\xbc\x80\xce\xb9|\xe1\xbc\x88\xce\x99||"
"\xe1\xbe\x81|\xe1\xbc\x81\xce\xb9|\xe1\xbc\x89\xce\x99||"
"\xe1\xbe\x82|\xe1\xbc\x82\xce\xb9|\xe1\xbc\x8a\xce\x99||"
"\xe1\xbe\x83|\xe1\xbc\x83\xce\xb9|\xe1\xbc\x8b\xce\x99||"
"\xe1\xbe\x84|\xe1\xbc\x84\xce\xb9|\xe1\xbc\x8c\xce\x99||"
"\xe1\xbe\x85|\xe1\xbc\x85\xce\xb9|\xe1\xbc\x8d\xce\x99||"
"\xe1\xbe\x86|\xe1\xbc\x86\xce\xb9|\xe1\xbc\x8e\xce\x99||"
"\xe1\xbe\x87|\xe1\xbc\x87\xce\xb9|\xe1\xbc\x8f\xce\x99||"
"\xe1\xbe\x88|\xe1\xbc\x80\xce\xb9|\xe1\xbc\x88\xce\x99|\xe1\xbe\x80|"
"\xe1\xbe\x89|\xe1\xbc\x81\xce\xb9|\xe1\xbc\x89\xce\x99|\xe1\xbe\x81|"
"\xe1\xbe\x8a|\xe1\xbc\x82\xce\xb9|\xe1\xbc\x8a\xce\x99|\xe1\xbe\x82|"
"\xe1\xbe\x8b|\xe1\xbc\x83\xce\xb9|\xe1\xbc\x8b\xce\x99|\xe1\xbe\x83|"
"\xe1\xbe\x8c|\xe1\xbc\x84\xce\xb9|\xe1\xbc\x8c\xce\x99|\xe1\xbe\x84|"
"\xe1\xbe\x8d|\xe1\xbc\x85\xce\xb9|\xe1\xbc\x8d\xce\x99|\xe1\xbe\x85|"
"\xe1\xbe\x8e|\xe1\xbc\x86\xce\xb9|\xe1\xbc\x8e\xce\x99|\xe1\xbe\x86|"
"\xe1\xbe\x8f|\xe1\xbc\x87\xce\xb9|\xe1\xbc\x8f\xce\x99|\xe1\xbe\x87|"
"\xe1\xbe\x90|\xe1\xbc\xa0\xce\xb9|\xe1\xbc\xa8\xce\x99||"
"\xe1\xbe\x91|\xe1\xbc\xa1\xce\xb9|\xe1\xbc\xa9\xce\x99||"
"\xe1\xbe\x92|\xe1\xbc\xa2\xce\xb9|\xe1\xbc\xaa\xce\x99||"
"\xe1\xbe\x93|\xe1\xbc\xa3\xce\xb9|\xe1\xbc\xab\xce\x99||"
"\xe1\xbe\x94|\xe1\xbc\xa4\xce\xb9|\xe1\xbc\xac\xce\x99||"
"\xe1\xbe\x95|\xe1\xbc\xa5\xce\xb9|\xe1\xbc\xad\xce\x99||"
"\xe1\xbe\x96|\xe1\xbc\xa6\xce\xb9|\xe1\xbc\xae\xce\x99||"
"\xe1\xbe\x97|\xe1\xbc\xa7\xce\xb9|\xe1\xbc\xaf\xce\x99||"
"\xe1\xbe\x98|\xe1\xbc\xa0\xce\xb9|\xe1\xbc\xa8\xce\x99|\xe1\xbe\x90|"
"\xe1\xbe\x99|\xe1\xbc\xa1\xce\xb9|\xe1\xbc\xa9\xce\x99|\xe1\xbe\x91|"
"\xe1\xbe\x9a|\xe1\xbc\xa2\xce\xb9|\xe1\xbc\xaa\xce\x99|\xe1\xbe\x92|"
"\xe1\xbe\x9b|\xe1\xbc\xa3\xce\xb9|\xe1\xbc\xab\xce\x99|\xe1\xbe\x93|"
"\xe1\xbe\x9c|\xe1\xbc\xa4\xce\xb9|\xe1\xbc\xac\xce\x99|\xe1\xbe\x94|"
"\xe1\xbe\x9d|\xe1\xbc\xa5\xce\xb9|\xe1\xbc\xad\xce\x99|\xe1\xbe\x95|"
"\xe1\xbe\x9e|\xe1\xbc\xa6\xce\xb9|\xe1\xbc\xae\xce\x99|\xe1\xbe\x96|"
"\xe1\xbe\x9f|\xe1\xbc\xa7\xce\xb9|\xe1\xbc\xaf\xce\x99|\xe1\xbe\x97|"
"\xe1\xbe\xa0|\xe1\xbd\xa0\xce\xb9|\xe1\xbd\xa8\xce\x99||"
"\xe1\xbe\xa1|\xe1\xbd\xa1\xce\xb9|\xe1\xbd\xa9\xce\x99||"
"\xe1\xbe\xa2|\xe1\xbd\xa2\xce\xb9|\xe1\xbd\xaa\xce\x99||"
"\xe1\xbe\xa3|\xe1\xbd\xa3\xce\xb9|\xe1\xbd\xab\xce\x99||"
"\xe1\xbe\xa4|\xe1\xbd\xa4\xce\xb9|\xe1\xbd\xac\xce\x99||"
"\xe1\xbe\xa5|\xe1\xbd\xa5\xce\xb9|\xe1\xbd\xad\xce\x99||"
"\xe1\xbe\xa6|\xe1\xbd\xa6\xce\xb9|\xe1\xbd\xae\xce\x99||"
"\xe1\xbe\xa7|\xe1\xbd\xa7\xce\xb9|\xe1\xbd\xaf\xce\x99||"
"\xe1\xbe\xa8|\xe1\xbd\xa0\xce\xb9|\xe1\xbd\xa8\xce\x99|\xe1\xbe\xa0|"
"\xe1\xbe\xa9|\xe1\xbd\xa1\xce\xb9|\xe1\xbd\xa9\xce\x99|\xe1\xbe\xa1|"
"\xe1\xbe\xaa|\xe1\xbd\xa2\xce\xb9|\xe1\xbd\xaa\xce\x99|\xe1\xbe\xa2|"
"\xe1\xbe\xab|\xe1\xbd\xa3\xce\xb9|\xe1\xbd\xab\xce\x99|\xe1\xbe\xa3|"
"\xe1\xbe\xac|\xe1\xbd\xa4\xce\xb9|\xe1\xbd\xac\xce\x99|\xe1\xbe\xa4|"
"\xe1\xbe\xad|\xe1\xbd\xa5\xce\xb9|\xe1\xbd\xad\xce\x99|\xe1\xbe\xa5|"
"\xe1\xbe\xae|\xe1\xbd\xa6\xce\xb9|\xe1\xbd\xae\xce\x99|\xe1\xbe\xa6|"
"\xe1\xbe\xaf|\xe1\xbd\xa7\xce\xb9|\xe1\xbd\xaf\xce\x99|\xe1\xbe\xa7|"
"\xe1\xbe\xb2|\xe1\xbd\xb0\xce\xb9|\xe1\xbe\xba\xce\x99||"
"\xe1\xbe\xb3|\xce\xb1\xce\xb9|\xce\x91\xce\x99||"
"\xe1\xbe\xb4|\xce\xac\xce\xb9|\xce\x86\xce\x99||"
"\xe1\xbe\xb6|\xce\xb1\xcd\x82|\xce\x91\xcd\x82||"
"\xe1\xbe\xb7|\xce\xb1\xcd\x82\xce\xb9|\xce\x91\xcd\x82\xce\x99||"
"\xe1\xbe\xbc|\xce\xb1\xce\xb9|\xce\x91\xce\x99|\xe1\xbe\xb3|"
"\xe1\xbe\xbe|\xce\xb9|\xce\x99||"
"\xe1\xbf\x82|\xe1\xbd\xb4\xce\xb9|\xe1\xbf\x8a\xce\x99||"
"\xe1\xbf\x83|\xce\xb7\xce\xb9|\xce\x97\xce\x99||"
"\xe1\xbf\x84|\xce\xae\xce\xb9|\xce\x89\xce\x99||"
"\xe1\xbf\x86|\xce\xb7\xcd\x82|\xce\x97\xcd\x82||"
"\xe1\xbf\x87|\xce\xb7\xcd\x82\xce\xb9|\xce\x97\xcd\x82\xce\x99||"
"\xe1\xbf\x8c|\xce\xb7\xce\xb9|\xce\x97\xce\x99|\xe1\xbf\x83|"
"\xe1\xbf\x92|\xce\xb9\xcc\x88\xcc\x80|\xce\x99\xcc\x88\xcc\x80||"
"\xe1\xbf\x93|\xce\xb9\xcc\x88\xcc\x81|\xce\x99\xcc\x88\xcc\x81||"
"\xe1\xbf\x96|\xce\xb9\xcd\x82|\xce\x99\xcd\x82||"
"\xe1\xbf\x97|\xce\xb9\xcc\x88\xcd\x82|\xce\x99\xcc\x88\xcd\x82||"
"\xe1\xbf\xa2|\xcf\x85\xcc\x88\xcc\x80|\xce\xa5\xcc\x88\xcc\x80||"
"\xe1\xbf\xa3|\xcf\x85\xcc\x88\xcc\x81|\xce\xa5\xcc\x88\xcc\x81||"
"\xe1\xbf\xa4|\xcf\x81\xcc\x93|\xce\xa1\xcc\x93||"
"\xe1\xbf\xa6|\xcf\x85\xcd\x82|\xce\xa5\xcd\x82||"
"\xe1\xbf\xa7|\xcf\x85\xcc\x88\xcd\x82|\xce\xa5\xcc\x88\xcd\x82||"
"\xe1\xbf\xb2|\xe1\xbd\xbc\xce\xb9|\xe1\xbf\xba\xce\x99||"
"\xe1\xbf\xb3|\xcf\x89\xce\xb9|\xce\xa9\xce\x99||"
"\xe1\xbf\xb4|\xcf\x8e\xce\xb9|\xce\x8f\xce\x99||"
"\xe1\xbf\xb6|\xcf\x89\xcd\x82|\xce\xa9\xcd\x82||"
"\xe1\xbf\xb7|\xcf\x89\xcd\x82\xce\xb9|\xce\xa9\xcd\x82\xce\x99||"
"\xe1\xbf\xbc|\xcf\x89\xce\xb9|\xce\xa9\xce\x99|\xe1\xbf\xb3|"
"\xe2\x84\xa6|\xcf\x89||\xcf\x89|"
"\xe2\x84\xaa|k||k|"
"\xe2\x84\xab|\xc3\xa5||\xc3\xa5|"
"\xea\xad\xb0|\xe1\x8e\xa0|\xe1\x8e\xa0||"
"\xea\xad\xb1|\xe1\x8e\xa1|\xe1\x8e\xa1||"
"\xea\xad\xb2|\xe1\x8e\xa2|\xe1\x8e\xa2||"
"\xea\xad\xb3|\xe1\x8e\xa3|\xe1\x8e\xa3||"
"\xea\xad\xb4|\xe1\x8e\xa4|\xe1\x8e\xa4||"
"\xea\xad\xb5|\xe1\x8e\xa5|\xe1\x8e\xa5||"
"\xea\xad\xb6|\xe1\x8e\xa6|\xe1\x8e\xa6||"
"\xea\xad\xb7|\xe1\x8e\xa7|\xe1\x8e\xa7||"
"\xea\xad\xb8|\xe1\x8e\xa8|\xe1\x8e\xa8||"
"\xea\xad\xb9|\xe1\x8e\xa9|\xe1\x8e\xa9||"
"\xea\xad\xba|\xe1\x8e\xaa|\xe1\x8e\xaa||"
"\xea\xad\xbb|\xe1\x8e\xab|\xe1\x8e\xab||"
"\xea\xad\xbc|\xe1\x8e\xac|\xe1\x8e\xac||"
"\xea\xad\xbd|\xe1\x8e\xad|\xe1\x8e\xad||"
"\xea\xad\xbe|\xe1\x8e\xae|\xe1\x8e\xae||"
"\xea\xad\xbf|\xe1\x8e\xaf|\xe1\x8e\xaf||"
"\xea\xae\x80|\xe1\x8e\xb0|\xe1\x8e\xb0||"
"\xea\xae\x81|\xe1\x8e\xb1|\xe1\x8e\xb1||"
"\xea\xae\x82|\xe1\x8e\xb2|\xe1\x8e\xb2||"
"\xea\xae\x83|\xe1\x8e\xb3|\xe1\x8e\xb3||"
"\xea\xae\x84|\xe1\x8e\xb4|\xe1\x8e\xb4||"
"\xea\xae\x85|\xe1\x8e\xb5|\xe1\x8e\xb5||"
"\xea\xae\x86|\xe1\x8e\xb6|\xe1\x8e\xb6||"
"\xea\xae\x87|\xe1\x8e\xb7|\xe1\x8e\xb7||"
"\xea\xae\x88|\xe1\x8e\xb8|\xe1\x8e\xb8||"
"\xea\xae\x89|\xe1\x8e\xb9|\xe1\x8e\xb9||"
"\xea\xae\x8a|\xe1\x8e\xba|\xe1\x8e\xba||"
"\xea\xae\x8b|\xe1\x8e\xbb|\xe1\x8e\xbb||"
"\xea\xae\x8c|\xe1\x8e\xbc|\xe1\x8e\xbc||"
"\xea\xae\x8d|\xe1\x8e\xbd|\xe1\x8e\xbd||"
"\xea\xae\x8e|\xe1\x8e\xbe|\xe1\x8e\xbe||"
"\xea\xae\x8f|\xe1\x8e\xbf|\xe1\x8e\xbf||"
"\xea\xae\x90|\xe1\x8f\x80|\xe1\x8f\x80||"
"\xea\xae\x91|\xe1\x8f\x81|\xe1\x8f\x81||"
"\xea\xae\x92|\xe1\x8f\x82|\xe1\x8f\x82||"
"\xea\xae\x93|\xe1\x8f\x83|\xe1\x8f\x83||"
"\xea\xae\x94|\xe1\x8f\x84|\xe1\x8f\x84||"
"\xea\xae\x95|\xe1\x8f\x85|\xe1\x8f\x85||"
"\xea\xae\x96|\xe1\x8f\x86|\xe1\x8f\x86||"
"\xea\xae\x97|\xe1\x8f\x87|\xe1\x8f\x87||"
"\xea\xae\x98|\xe1\x8f\x88|\xe1\x8f\x88||"
"\xea\xae\x99|\xe1\x8f\x89|\xe1\x8f\x89||"
"\xea\xae\x9a|\xe1\x8f\x8a|\xe1\x8f\x8a||"
"\xea\xae\x9b|\xe1\x8f\x8b|\xe1\x8f\x8b||"
"\xea\xae\x9c|\xe1\x8f\x8c|\xe1\x8f\x8c||"
"\xea\xae\x9d|\xe1\x8f\x8d|\xe1\x8f\x8d||"
"\xea\xae\x9e|\xe1\x8f\x8e|\xe1\x8f\x8e||"
"\xea\xae\x9f|\xe1\x8f\x8f|\xe1\x8f\x8f||"
"\xea\xae\xa0|\xe1\x8f\x90|\xe1\x8f\x90||"
"\xea\xae\xa1|\xe1\x8f\x91|\xe1\x8f\x91||"
"\xea\xae\xa2|\xe1\x8f\x92|\xe1\x8f\x92||"
"\xea\xae\xa3|\xe1\x8f\x93|\xe1\x8f\x93||"
"\xea\xae\xa4|\xe1\x8f\x94|\xe1\x8f\x94||"
"\xea\xae\xa5|\xe1\x8f\x95|\xe1\x8f\x95||"
"\xea\xae\xa6|\xe1\x8f\x96|\xe1\x8f\x96||"
"\xea\xae\xa7|\xe1\x8f\x97|\xe1\x8f\x97||"
"\xea\xae\xa8|\xe1\x8f\x98|\xe1\x8f\x98||"
"\xea\xae\xa9|\xe1\x8f\x99|\xe1\x8f\x99||"
"\xea\xae\xaa|\xe1\x8f\x9a|\xe1\x8f\x9a||"
"\xea\xae\xab|\xe1\x8f\x9b|\xe1\x8f\x9b||"
"\xea\xae\xac|\xe1\x8f\x9c|\xe1\x8f\x9c||"
"\xea\xae\xad|\xe1\x8f\x9d|\xe1\x8f\x9d||"
"\xea\xae\xae|\xe1\x8f\x9e|\xe1\x8f\x9e||"
"\xea\xae\xaf|\xe1\x8f\x9f|\xe1\x8f\x9f||"
"\xea\xae\xb0|\xe1\x8f\xa0|\xe1\x8f\xa0||"
"\xea\xae\xb1|\xe1\x8f\xa1|\xe1\x8f\xa1||"
"\xea\xae\xb2|\xe1\x8f\xa2|\xe1\x8f\xa2||"
"\xea\xae\xb3|\xe1\x8f\xa3|\xe1\x8f\xa3||"
"\xea\xae\xb4|\xe1\x8f\xa4|\xe1\x8f\xa4||"
"\xea\xae\xb5|\xe1\x8f\xa5|\xe1\x8f\xa5||"
"\xea\xae\xb6|\xe1\x8f\xa6|\xe1\x8f\xa6||"
"\xea\xae\xb7|\xe1\x8f\xa7|\xe1\x8f\xa7||"
"\xea\xae\xb8|\xe1\x8f\xa8|\xe1\x8f\xa8||"
"\xea\xae\xb9|\xe1\x8f\xa9|\xe1\x8f\xa9||"
"\xea\xae\xba|\xe1\x8f\xaa|\xe1\x8f\xaa||"
"\xea\xae\xbb|\xe1\x8f\xab|\xe1\x8f\xab||"
"\xea\xae\xbc|\xe1\x8f\xac|\xe1\x8f\xac||"
"\xea\xae\xbd|\xe1\x8f\xad|\xe1\x8f\xad||"
"\xea\xae\xbe|\xe1\x8f\xae|\xe1\x8f\xae||"
"\xea\xae\xbf|\xe1\x8f\xaf|\xe1\x8f\xaf||"
"\xef\xac\x80|ff|FF||"
"\xef\xac\x81|fi|FI||"
"\xef\xac\x82|fl|FL||"
"\xef\xac\x83|ffi|FFI||"
"\xef\xac\x84|ffl|FFL||"
"\xef\xac\x85|st|ST||"
"\xef\xac\x86|st|ST||"
"\xef\xac\x93|\xd5\xb4\xd5\xb6|\xd5\x84\xd5\x86||"
"\xef\xac\x94|\xd5\xb4\xd5\xa5|\xd5\x84\xd4\xb5||"
"\xef\xac\x95|\xd5\xb4\xd5\xab|\xd5\x84\xd4\xbb||"
"\xef\xac\x96|\xd5\xbe\xd5\xb6|\xd5\x8e\xd5\x86||"
"\xef\xac\x97|\xd5\xb4\xd5\xad|\xd5\x84\xd4\xbd||"

//--Autogenerated -- end of section automatically generated
;
| 572 | |
| 573 | class CaseConverter : public ICaseConverter { |
| 574 | // Maximum length of a case conversion result is 6 bytes in UTF-8 |
| 575 | enum { maxConversionLength=6 }; |
| 576 | struct ConversionString { |
| 577 | char conversion[maxConversionLength+1]; |
| 578 | ConversionString() noexcept : conversion{} { |
| 579 | } |
| 580 | }; |
| 581 | // Conversions are initially store in a vector of structs but then decomposed into |
| 582 | // parallel arrays as that is about 10% faster to search. |
| 583 | struct CharacterConversion { |
| 584 | int character; |
| 585 | ConversionString conversion; |
| 586 | CharacterConversion() noexcept : character(0) { |
| 587 | // Empty case: NUL -> "". |
| 588 | } |
| 589 | CharacterConversion(int character_, std::string_view conversion_) noexcept : character(character_) { |
| 590 | assert(conversion_.length() <= maxConversionLength); |
| 591 | try { |
| 592 | // This can never fail as std::string_view::copy should only throw |
| 593 | // std::out_of_range if pos > size() and pos == 0 here |
| 594 | conversion_.copy(conversion.conversion, conversion_.length()); |
| 595 | } catch (...) { |
| 596 | // Ignore any exception |
| 597 | } |
| 598 | } |
| 599 | bool operator<(const CharacterConversion &other) const noexcept { |
| 600 | return character < other.character; |
| 601 | } |
| 602 | }; |
| 603 | typedef std::vector<CharacterConversion> CharacterToConversion; |
| 604 | CharacterToConversion characterToConversion; |
| 605 | // The parallel arrays |
| 606 | std::vector<int> characters; |
| 607 | std::vector<ConversionString> conversions; |
| 608 | |
| 609 | public: |
| 610 | CaseConverter() = default; |
| 611 | // Deleted so CaseConverter objects can not be copied. |
| 612 | CaseConverter(const CaseConverter &) = delete; |
| 613 | CaseConverter(CaseConverter &&) = delete; |
| 614 | CaseConverter &operator=(const CaseConverter &) = delete; |
| 615 | CaseConverter &operator=(CaseConverter &&) = delete; |
| 616 | virtual ~CaseConverter() noexcept = default; |
| 617 | bool Initialised() const noexcept { |
| 618 | return !characters.empty(); |
| 619 | } |
| 620 | void Add(int character, const char *conversion) { |
| 621 | characterToConversion.emplace_back(character, conversion); |
| 622 | } |
| 623 | const char *Find(int character) { |
| 624 | const std::vector<int>::iterator it = std::lower_bound(characters.begin(), characters.end(), character); |
| 625 | if (it == characters.end()) |
| 626 | return nullptr; |
| 627 | else if (*it == character) |
| 628 | return conversions[it - characters.begin()].conversion; |
| 629 | else |
| 630 | return nullptr; |
| 631 | } |
| 632 | size_t CaseConvertString(char *converted, size_t sizeConverted, const char *mixed, size_t lenMixed) override { |
| 633 | size_t lenConverted = 0; |
| 634 | size_t mixedPos = 0; |
| 635 | unsigned char bytes[UTF8MaxBytes + 1]{}; |
| 636 | while (mixedPos < lenMixed) { |
| 637 | const unsigned char leadByte = mixed[mixedPos]; |
| 638 | const char *caseConverted = nullptr; |
| 639 | size_t lenMixedChar = 1; |
| 640 | if (UTF8IsAscii(leadByte)) { |
| 641 | caseConverted = Find(leadByte); |
| 642 | } else { |
| 643 | bytes[0] = leadByte; |
| 644 | const int widthCharBytes = UTF8BytesOfLead[leadByte]; |
| 645 | for (int b=1; b<widthCharBytes; b++) { |
| 646 | bytes[b] = (mixedPos+b < lenMixed) ? mixed[mixedPos+b] : 0; |
| 647 | } |
| 648 | const int classified = UTF8Classify(bytes, widthCharBytes); |
| 649 | if (!(classified & UTF8MaskInvalid)) { |
| 650 | // valid UTF-8 |
| 651 | lenMixedChar = classified & UTF8MaskWidth; |
| 652 | const int character = UnicodeFromUTF8(bytes); |
| 653 | caseConverted = Find(character); |
| 654 | } |
| 655 | } |
| 656 | if (caseConverted) { |
| 657 | // Character has a conversion so copy that conversion in |
| 658 | while (*caseConverted) { |
| 659 | converted[lenConverted++] = *caseConverted++; |
| 660 | if (lenConverted >= sizeConverted) |
| 661 | return 0; |
| 662 | } |
| 663 | } else { |
| 664 | // Character has no conversion so copy the input to output |
| 665 | for (size_t i=0; i<lenMixedChar; i++) { |
| 666 | converted[lenConverted++] = mixed[mixedPos+i]; |
| 667 | if (lenConverted >= sizeConverted) |
| 668 | return 0; |
| 669 | } |
| 670 | } |
| 671 | mixedPos += lenMixedChar; |
| 672 | } |
| 673 | return lenConverted; |
| 674 | } |
| 675 | void FinishedAdding() { |
| 676 | std::sort(characterToConversion.begin(), characterToConversion.end()); |
| 677 | characters.reserve(characterToConversion.size()); |
| 678 | conversions.reserve(characterToConversion.size()); |
| 679 | for (const CharacterConversion &chConv : characterToConversion) { |
| 680 | characters.push_back(chConv.character); |
| 681 | conversions.push_back(chConv.conversion); |
| 682 | } |
| 683 | // Empty the original calculated data completely |
| 684 | CharacterToConversion().swap(characterToConversion); |
| 685 | } |
| 686 | }; |
| 687 | |
// One converter per kind of conversion. Each starts empty and is filled by
// SetupConversions the first time that kind of conversion is requested.
CaseConverter caseConvFold;
CaseConverter caseConvUp;
CaseConverter caseConvLow;
| 691 | |
| 692 | void AddSymmetric(CaseConversion conversion, int lower,int upper) { |
| 693 | char lowerUTF8[UTF8MaxBytes+1]; |
| 694 | UTF8FromUTF32Character(lower, lowerUTF8); |
| 695 | char upperUTF8[UTF8MaxBytes+1]; |
| 696 | UTF8FromUTF32Character(upper, upperUTF8); |
| 697 | |
| 698 | switch (conversion) { |
| 699 | case CaseConversion::fold: |
| 700 | caseConvFold.Add(upper, lowerUTF8); |
| 701 | break; |
| 702 | case CaseConversion::upper: |
| 703 | caseConvUp.Add(lower, upperUTF8); |
| 704 | break; |
| 705 | case CaseConversion::lower: |
| 706 | caseConvLow.Add(upper, lowerUTF8); |
| 707 | break; |
| 708 | } |
| 709 | } |
| 710 | |
| 711 | void SetupConversions(CaseConversion conversion) { |
| 712 | // First initialize for the symmetric ranges |
| 713 | for (size_t i=0; i<std::size(symmetricCaseConversionRanges);) { |
| 714 | const int lower = symmetricCaseConversionRanges[i++]; |
| 715 | const int upper = symmetricCaseConversionRanges[i++]; |
| 716 | const int length = symmetricCaseConversionRanges[i++]; |
| 717 | const int pitch = symmetricCaseConversionRanges[i++]; |
| 718 | for (int j=0; j<length*pitch; j+=pitch) { |
| 719 | AddSymmetric(conversion, lower+j, upper+j); |
| 720 | } |
| 721 | } |
| 722 | // Add the symmetric singletons |
| 723 | for (size_t i=0; i<std::size(symmetricCaseConversions);) { |
| 724 | const int lower = symmetricCaseConversions[i++]; |
| 725 | const int upper = symmetricCaseConversions[i++]; |
| 726 | AddSymmetric(conversion, lower, upper); |
| 727 | } |
| 728 | // Add the complex cases |
| 729 | const char *sComplex = complexCaseConversions; |
| 730 | while (*sComplex) { |
| 731 | // Longest ligature is 3 character so 5 for safety |
| 732 | constexpr size_t lenUTF8 = 5*UTF8MaxBytes+1; |
| 733 | unsigned char originUTF8[lenUTF8]{}; |
| 734 | char foldedUTF8[lenUTF8]{}; |
| 735 | char lowerUTF8[lenUTF8]{}; |
| 736 | char upperUTF8[lenUTF8]{}; |
| 737 | size_t i = 0; |
| 738 | while (*sComplex && *sComplex != '|') { |
| 739 | originUTF8[i++] = *sComplex; |
| 740 | sComplex++; |
| 741 | } |
| 742 | sComplex++; |
| 743 | originUTF8[i] = 0; |
| 744 | i = 0; |
| 745 | while (*sComplex && *sComplex != '|') { |
| 746 | foldedUTF8[i++] = *sComplex; |
| 747 | sComplex++; |
| 748 | } |
| 749 | sComplex++; |
| 750 | foldedUTF8[i] = 0; |
| 751 | i = 0; |
| 752 | while (*sComplex && *sComplex != '|') { |
| 753 | upperUTF8[i++] = *sComplex; |
| 754 | sComplex++; |
| 755 | } |
| 756 | sComplex++; |
| 757 | upperUTF8[i] = 0; |
| 758 | i = 0; |
| 759 | while (*sComplex && *sComplex != '|') { |
| 760 | lowerUTF8[i++] = *sComplex; |
| 761 | sComplex++; |
| 762 | } |
| 763 | sComplex++; |
| 764 | lowerUTF8[i] = 0; |
| 765 | |
| 766 | const int character = UnicodeFromUTF8(originUTF8); |
| 767 | |
| 768 | if (conversion == CaseConversion::fold && foldedUTF8[0]) { |
| 769 | caseConvFold.Add(character, foldedUTF8); |
| 770 | } |
| 771 | |
| 772 | if (conversion == CaseConversion::upper && upperUTF8[0]) { |
| 773 | caseConvUp.Add(character, upperUTF8); |
| 774 | } |
| 775 | |
| 776 | if (conversion == CaseConversion::lower && lowerUTF8[0]) { |
| 777 | caseConvLow.Add(character, lowerUTF8); |
| 778 | } |
| 779 | } |
| 780 | |
| 781 | switch (conversion) { |
| 782 | case CaseConversion::fold: |
| 783 | caseConvFold.FinishedAdding(); |
| 784 | break; |
| 785 | case CaseConversion::upper: |
| 786 | caseConvUp.FinishedAdding(); |
| 787 | break; |
| 788 | case CaseConversion::lower: |
| 789 | caseConvLow.FinishedAdding(); |
| 790 | break; |
| 791 | } |
| 792 | } |
| 793 | |
| 794 | CaseConverter *ConverterForConversion(CaseConversion conversion) noexcept { |
| 795 | switch (conversion) { |
| 796 | case CaseConversion::fold: |
| 797 | return &caseConvFold; |
| 798 | case CaseConversion::upper: |
| 799 | return &caseConvUp; |
| 800 | case CaseConversion::lower: |
| 801 | return &caseConvLow; |
| 802 | } |
| 803 | return nullptr; |
| 804 | } |
| 805 | |
| 806 | } |
| 807 | |
| 808 | namespace Scintilla::Internal { |
| 809 | |
| 810 | ICaseConverter *ConverterFor(CaseConversion conversion) { |
| 811 | CaseConverter *pCaseConv = ConverterForConversion(conversion); |
| 812 | if (!pCaseConv->Initialised()) |
| 813 | SetupConversions(conversion); |
| 814 | return pCaseConv; |
| 815 | } |
| 816 | |
| 817 | const char *CaseConvert(int character, CaseConversion conversion) { |
| 818 | CaseConverter *pCaseConv = ConverterForConversion(conversion); |
| 819 | if (!pCaseConv->Initialised()) |
| 820 | SetupConversions(conversion); |
| 821 | return pCaseConv->Find(character); |
| 822 | } |
| 823 | |
| 824 | size_t CaseConvertString(char *converted, size_t sizeConverted, const char *mixed, size_t lenMixed, CaseConversion conversion) { |
| 825 | CaseConverter *pCaseConv = ConverterForConversion(conversion); |
| 826 | if (!pCaseConv->Initialised()) |
| 827 | SetupConversions(conversion); |
| 828 | return pCaseConv->CaseConvertString(converted, sizeConverted, mixed, lenMixed); |
| 829 | } |
| 830 | |
| 831 | std::string CaseConvertString(const std::string &s, CaseConversion conversion) { |
| 832 | std::string retMapped(s.length() * maxExpansionCaseConversion, 0); |
| 833 | const size_t lenMapped = CaseConvertString(&retMapped[0], retMapped.length(), s.c_str(), s.length(), |
| 834 | conversion); |
| 835 | retMapped.resize(lenMapped); |
| 836 | return retMapped; |
| 837 | } |
| 838 | |
| 839 | } |
| 840 | |