| 1 | /* |
| 2 | * regc_locale.c -- |
| 3 | * |
| 4 | * This file contains locale-specific regexp routines. |
| 5 | * This file is #included by regcomp.c. |
| 6 | * |
| 7 | * Copyright (c) 1998 by Scriptics Corporation. |
| 8 | * |
| 9 | * This software is copyrighted by the Regents of the University of |
| 10 | * California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState |
| 11 | * Corporation and other parties. The following terms apply to all files |
| 12 | * associated with the software unless explicitly disclaimed in |
| 13 | * individual files. |
| 14 | * |
| 15 | * The authors hereby grant permission to use, copy, modify, distribute, |
| 16 | * and license this software and its documentation for any purpose, provided |
| 17 | * that existing copyright notices are retained in all copies and that this |
| 18 | * notice is included verbatim in any distributions. No written agreement, |
| 19 | * license, or royalty fee is required for any of the authorized uses. |
| 20 | * Modifications to this software may be copyrighted by their authors |
| 21 | * and need not follow the licensing terms described here, provided that |
| 22 | * the new terms are clearly indicated on the first page of each file where |
| 23 | * they apply. |
| 24 | * |
| 25 | * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY |
| 26 | * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES |
| 27 | * ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY |
| 28 | * DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE |
| 29 | * POSSIBILITY OF SUCH DAMAGE. |
| 30 | * |
| 31 | * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES, |
| 32 | * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, |
| 33 | * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE |
| 34 | * IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE |
| 35 | * NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR |
| 36 | * MODIFICATIONS. |
| 37 | * |
| 38 | * GOVERNMENT USE: If you are acquiring this software on behalf of the |
| 39 | * U.S. government, the Government shall have only "Restricted Rights" |
| 40 | * in the software and related documentation as defined in the Federal |
| 41 | * Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you |
| 42 | * are acquiring the software on behalf of the Department of Defense, the |
| 43 | * software shall be classified as "Commercial Computer Software" and the |
| 44 | * Government shall have only "Restricted Rights" as defined in Clause |
| 45 | * 252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the |
| 46 | * authors grant the U.S. Government and others acting in its behalf |
| 47 | * permission to use and distribute the software in accordance with the |
| 48 | * terms specified in this license. |
| 49 | * |
| 50 | * src/backend/regex/regc_locale.c |
| 51 | */ |
| 52 | |
/*
 * ASCII character-name table.
 *
 * Each entry pairs a symbolic collating-element name with the single ASCII
 * character it stands for.  Some characters are reachable under more than
 * one name.  The table is terminated by an entry whose name is NULL.
 */
static const struct cname
{
	const char *name;			/* symbolic name; NULL marks end of table */
	const char	code;			/* the character that the name denotes */
}			cnames[] =
{
	/* control characters */
	{"NUL", '\0'},
	{"SOH", '\001'},
	{"STX", '\002'},
	{"ETX", '\003'},
	{"EOT", '\004'},
	{"ENQ", '\005'},
	{"ACK", '\006'},
	{"BEL", '\007'},
	{"alert", '\007'},
	{"BS", '\010'},
	{"backspace", '\b'},
	{"HT", '\011'},
	{"tab", '\t'},
	{"LF", '\012'},
	{"newline", '\n'},
	{"VT", '\013'},
	{"vertical-tab", '\v'},
	{"FF", '\014'},
	{"form-feed", '\f'},
	{"CR", '\015'},
	{"carriage-return", '\r'},
	{"SO", '\016'},
	{"SI", '\017'},
	{"DLE", '\020'},
	{"DC1", '\021'},
	{"DC2", '\022'},
	{"DC3", '\023'},
	{"DC4", '\024'},
	{"NAK", '\025'},
	{"SYN", '\026'},
	{"ETB", '\027'},
	{"CAN", '\030'},
	{"EM", '\031'},
	{"SUB", '\032'},
	{"ESC", '\033'},
	{"IS4", '\034'},
	{"FS", '\034'},
	{"IS3", '\035'},
	{"GS", '\035'},
	{"IS2", '\036'},
	{"RS", '\036'},
	{"IS1", '\037'},
	{"US", '\037'},

	/* space and punctuation preceding the digits */
	{"space", ' '},
	{"exclamation-mark", '!'},
	{"quotation-mark", '"'},
	{"number-sign", '#'},
	{"dollar-sign", '$'},
	{"percent-sign", '%'},
	{"ampersand", '&'},
	{"apostrophe", '\''},
	{"left-parenthesis", '('},
	{"right-parenthesis", ')'},
	{"asterisk", '*'},
	{"plus-sign", '+'},
	{"comma", ','},
	{"hyphen", '-'},
	{"hyphen-minus", '-'},
	{"period", '.'},
	{"full-stop", '.'},
	{"slash", '/'},
	{"solidus", '/'},

	/* digits */
	{"zero", '0'},
	{"one", '1'},
	{"two", '2'},
	{"three", '3'},
	{"four", '4'},
	{"five", '5'},
	{"six", '6'},
	{"seven", '7'},
	{"eight", '8'},
	{"nine", '9'},

	/* remaining punctuation */
	{"colon", ':'},
	{"semicolon", ';'},
	{"less-than-sign", '<'},
	{"equals-sign", '='},
	{"greater-than-sign", '>'},
	{"question-mark", '?'},
	{"commercial-at", '@'},
	{"left-square-bracket", '['},
	{"backslash", '\\'},
	{"reverse-solidus", '\\'},
	{"right-square-bracket", ']'},
	{"circumflex", '^'},
	{"circumflex-accent", '^'},
	{"underscore", '_'},
	{"low-line", '_'},
	{"grave-accent", '`'},
	{"left-brace", '{'},
	{"left-curly-bracket", '{'},
	{"vertical-line", '|'},
	{"right-brace", '}'},
	{"right-curly-bracket", '}'},
	{"tilde", '~'},
	{"DEL", '\177'},

	/* end-of-table marker */
	{NULL, 0}
};
| 351 | |
| 352 | /* |
| 353 | * The following arrays define the valid character class names. |
| 354 | */ |
| 355 | static const char *const classNames[NUM_CCLASSES + 1] = { |
| 356 | "alnum" , "alpha" , "ascii" , "blank" , "cntrl" , "digit" , "graph" , |
| 357 | "lower" , "print" , "punct" , "space" , "upper" , "xdigit" , NULL |
| 358 | }; |
| 359 | |
/*
 * Enumerated character classes.  The numeric value of each member is the
 * index of the corresponding name in classNames[].
 */
enum classes
{
	CC_ALNUM,
	CC_ALPHA,
	CC_ASCII,
	CC_BLANK,
	CC_CNTRL,
	CC_DIGIT,
	CC_GRAPH,
	CC_LOWER,
	CC_PRINT,
	CC_PUNCT,
	CC_SPACE,
	CC_UPPER,
	CC_XDIGIT
};
| 365 | |
| 366 | /* |
| 367 | * We do not use the hard-wired Unicode classification tables that Tcl does. |
| 368 | * This is because (a) we need to deal with other encodings besides Unicode, |
| 369 | * and (b) we want to track the behavior of the libc locale routines as |
| 370 | * closely as possible. For example, it wouldn't be unreasonable for a |
| 371 | * locale to not consider every Unicode letter as a letter. So we build |
| 372 | * character classification cvecs by asking libc, even for Unicode. |
| 373 | */ |
| 374 | |
| 375 | |
| 376 | /* |
| 377 | * element - map collating-element name to chr |
| 378 | */ |
| 379 | static chr |
| 380 | element(struct vars *v, /* context */ |
| 381 | const chr *startp, /* points to start of name */ |
| 382 | const chr *endp) /* points just past end of name */ |
| 383 | { |
| 384 | const struct cname *cn; |
| 385 | size_t len; |
| 386 | |
| 387 | /* generic: one-chr names stand for themselves */ |
| 388 | assert(startp < endp); |
| 389 | len = endp - startp; |
| 390 | if (len == 1) |
| 391 | return *startp; |
| 392 | |
| 393 | NOTE(REG_ULOCALE); |
| 394 | |
| 395 | /* search table */ |
| 396 | for (cn = cnames; cn->name != NULL; cn++) |
| 397 | { |
| 398 | if (strlen(cn->name) == len && |
| 399 | pg_char_and_wchar_strncmp(cn->name, startp, len) == 0) |
| 400 | { |
| 401 | break; /* NOTE BREAK OUT */ |
| 402 | } |
| 403 | } |
| 404 | if (cn->name != NULL) |
| 405 | return CHR(cn->code); |
| 406 | |
| 407 | /* couldn't find it */ |
| 408 | ERR(REG_ECOLLATE); |
| 409 | return 0; |
| 410 | } |
| 411 | |
| 412 | /* |
| 413 | * range - supply cvec for a range, including legality check |
| 414 | */ |
| 415 | static struct cvec * |
| 416 | range(struct vars *v, /* context */ |
| 417 | chr a, /* range start */ |
| 418 | chr b, /* range end, might equal a */ |
| 419 | int cases) /* case-independent? */ |
| 420 | { |
| 421 | int nchrs; |
| 422 | struct cvec *cv; |
| 423 | chr c, |
| 424 | cc; |
| 425 | |
| 426 | if (a != b && !before(a, b)) |
| 427 | { |
| 428 | ERR(REG_ERANGE); |
| 429 | return NULL; |
| 430 | } |
| 431 | |
| 432 | if (!cases) |
| 433 | { /* easy version */ |
| 434 | cv = getcvec(v, 0, 1); |
| 435 | NOERRN(); |
| 436 | addrange(cv, a, b); |
| 437 | return cv; |
| 438 | } |
| 439 | |
| 440 | /* |
| 441 | * When case-independent, it's hard to decide when cvec ranges are usable, |
| 442 | * so for now at least, we won't try. We use a range for the originally |
| 443 | * specified chrs and then add on any case-equivalents that are outside |
| 444 | * that range as individual chrs. |
| 445 | * |
| 446 | * To ensure sane behavior if someone specifies a very large range, limit |
| 447 | * the allocation size to 100000 chrs (arbitrary) and check for overrun |
| 448 | * inside the loop below. |
| 449 | */ |
| 450 | nchrs = b - a + 1; |
| 451 | if (nchrs <= 0 || nchrs > 100000) |
| 452 | nchrs = 100000; |
| 453 | |
| 454 | cv = getcvec(v, nchrs, 1); |
| 455 | NOERRN(); |
| 456 | addrange(cv, a, b); |
| 457 | |
| 458 | for (c = a; c <= b; c++) |
| 459 | { |
| 460 | cc = pg_wc_tolower(c); |
| 461 | if (cc != c && |
| 462 | (before(cc, a) || before(b, cc))) |
| 463 | { |
| 464 | if (cv->nchrs >= cv->chrspace) |
| 465 | { |
| 466 | ERR(REG_ETOOBIG); |
| 467 | return NULL; |
| 468 | } |
| 469 | addchr(cv, cc); |
| 470 | } |
| 471 | cc = pg_wc_toupper(c); |
| 472 | if (cc != c && |
| 473 | (before(cc, a) || before(b, cc))) |
| 474 | { |
| 475 | if (cv->nchrs >= cv->chrspace) |
| 476 | { |
| 477 | ERR(REG_ETOOBIG); |
| 478 | return NULL; |
| 479 | } |
| 480 | addchr(cv, cc); |
| 481 | } |
| 482 | if (CANCEL_REQUESTED(v->re)) |
| 483 | { |
| 484 | ERR(REG_CANCEL); |
| 485 | return NULL; |
| 486 | } |
| 487 | } |
| 488 | |
| 489 | return cv; |
| 490 | } |
| 491 | |
| 492 | /* |
| 493 | * before - is chr x before chr y, for purposes of range legality? |
| 494 | */ |
| 495 | static int /* predicate */ |
| 496 | before(chr x, chr y) |
| 497 | { |
| 498 | if (x < y) |
| 499 | return 1; |
| 500 | return 0; |
| 501 | } |
| 502 | |
| 503 | /* |
| 504 | * eclass - supply cvec for an equivalence class |
| 505 | * Must include case counterparts on request. |
| 506 | */ |
| 507 | static struct cvec * |
| 508 | eclass(struct vars *v, /* context */ |
| 509 | chr c, /* Collating element representing the |
| 510 | * equivalence class. */ |
| 511 | int cases) /* all cases? */ |
| 512 | { |
| 513 | struct cvec *cv; |
| 514 | |
| 515 | /* crude fake equivalence class for testing */ |
| 516 | if ((v->cflags & REG_FAKE) && c == 'x') |
| 517 | { |
| 518 | cv = getcvec(v, 4, 0); |
| 519 | addchr(cv, CHR('x')); |
| 520 | addchr(cv, CHR('y')); |
| 521 | if (cases) |
| 522 | { |
| 523 | addchr(cv, CHR('X')); |
| 524 | addchr(cv, CHR('Y')); |
| 525 | } |
| 526 | return cv; |
| 527 | } |
| 528 | |
| 529 | /* otherwise, none */ |
| 530 | if (cases) |
| 531 | return allcases(v, c); |
| 532 | cv = getcvec(v, 1, 0); |
| 533 | assert(cv != NULL); |
| 534 | addchr(cv, c); |
| 535 | return cv; |
| 536 | } |
| 537 | |
| 538 | /* |
| 539 | * cclass - supply cvec for a character class |
| 540 | * |
| 541 | * Must include case counterparts if "cases" is true. |
| 542 | * |
| 543 | * The returned cvec might be either a transient cvec gotten from getcvec(), |
| 544 | * or a permanently cached one from pg_ctype_get_cache(). This is okay |
| 545 | * because callers are not supposed to explicitly free the result either way. |
| 546 | */ |
| 547 | static struct cvec * |
| 548 | cclass(struct vars *v, /* context */ |
| 549 | const chr *startp, /* where the name starts */ |
| 550 | const chr *endp, /* just past the end of the name */ |
| 551 | int cases) /* case-independent? */ |
| 552 | { |
| 553 | size_t len; |
| 554 | struct cvec *cv = NULL; |
| 555 | const char *const *namePtr; |
| 556 | int i, |
| 557 | index; |
| 558 | |
| 559 | /* |
| 560 | * Map the name to the corresponding enumerated value. |
| 561 | */ |
| 562 | len = endp - startp; |
| 563 | index = -1; |
| 564 | for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++) |
| 565 | { |
| 566 | if (strlen(*namePtr) == len && |
| 567 | pg_char_and_wchar_strncmp(*namePtr, startp, len) == 0) |
| 568 | { |
| 569 | index = i; |
| 570 | break; |
| 571 | } |
| 572 | } |
| 573 | if (index == -1) |
| 574 | { |
| 575 | ERR(REG_ECTYPE); |
| 576 | return NULL; |
| 577 | } |
| 578 | |
| 579 | /* |
| 580 | * Remap lower and upper to alpha if the match is case insensitive. |
| 581 | */ |
| 582 | |
| 583 | if (cases && |
| 584 | ((enum classes) index == CC_LOWER || |
| 585 | (enum classes) index == CC_UPPER)) |
| 586 | index = (int) CC_ALPHA; |
| 587 | |
| 588 | /* |
| 589 | * Now compute the character class contents. For classes that are based |
| 590 | * on the behavior of a <wctype.h> or <ctype.h> function, we use |
| 591 | * pg_ctype_get_cache so that we can cache the results. Other classes |
| 592 | * have definitions that are hard-wired here, and for those we just |
| 593 | * construct a transient cvec on the fly. |
| 594 | * |
| 595 | * NB: keep this code in sync with cclass_column_index(), below. |
| 596 | */ |
| 597 | |
| 598 | switch ((enum classes) index) |
| 599 | { |
| 600 | case CC_PRINT: |
| 601 | cv = pg_ctype_get_cache(pg_wc_isprint, index); |
| 602 | break; |
| 603 | case CC_ALNUM: |
| 604 | cv = pg_ctype_get_cache(pg_wc_isalnum, index); |
| 605 | break; |
| 606 | case CC_ALPHA: |
| 607 | cv = pg_ctype_get_cache(pg_wc_isalpha, index); |
| 608 | break; |
| 609 | case CC_ASCII: |
| 610 | /* hard-wired meaning */ |
| 611 | cv = getcvec(v, 0, 1); |
| 612 | if (cv) |
| 613 | addrange(cv, 0, 0x7f); |
| 614 | break; |
| 615 | case CC_BLANK: |
| 616 | /* hard-wired meaning */ |
| 617 | cv = getcvec(v, 2, 0); |
| 618 | addchr(cv, '\t'); |
| 619 | addchr(cv, ' '); |
| 620 | break; |
| 621 | case CC_CNTRL: |
| 622 | /* hard-wired meaning */ |
| 623 | cv = getcvec(v, 0, 2); |
| 624 | addrange(cv, 0x0, 0x1f); |
| 625 | addrange(cv, 0x7f, 0x9f); |
| 626 | break; |
| 627 | case CC_DIGIT: |
| 628 | cv = pg_ctype_get_cache(pg_wc_isdigit, index); |
| 629 | break; |
| 630 | case CC_PUNCT: |
| 631 | cv = pg_ctype_get_cache(pg_wc_ispunct, index); |
| 632 | break; |
| 633 | case CC_XDIGIT: |
| 634 | |
| 635 | /* |
| 636 | * It's not clear how to define this in non-western locales, and |
| 637 | * even less clear that there's any particular use in trying. So |
| 638 | * just hard-wire the meaning. |
| 639 | */ |
| 640 | cv = getcvec(v, 0, 3); |
| 641 | if (cv) |
| 642 | { |
| 643 | addrange(cv, '0', '9'); |
| 644 | addrange(cv, 'a', 'f'); |
| 645 | addrange(cv, 'A', 'F'); |
| 646 | } |
| 647 | break; |
| 648 | case CC_SPACE: |
| 649 | cv = pg_ctype_get_cache(pg_wc_isspace, index); |
| 650 | break; |
| 651 | case CC_LOWER: |
| 652 | cv = pg_ctype_get_cache(pg_wc_islower, index); |
| 653 | break; |
| 654 | case CC_UPPER: |
| 655 | cv = pg_ctype_get_cache(pg_wc_isupper, index); |
| 656 | break; |
| 657 | case CC_GRAPH: |
| 658 | cv = pg_ctype_get_cache(pg_wc_isgraph, index); |
| 659 | break; |
| 660 | } |
| 661 | |
| 662 | /* If cv is NULL now, the reason must be "out of memory" */ |
| 663 | if (cv == NULL) |
| 664 | ERR(REG_ESPACE); |
| 665 | return cv; |
| 666 | } |
| 667 | |
| 668 | /* |
| 669 | * cclass_column_index - get appropriate high colormap column index for chr |
| 670 | */ |
| 671 | static int |
| 672 | cclass_column_index(struct colormap *cm, chr c) |
| 673 | { |
| 674 | int colnum = 0; |
| 675 | |
| 676 | /* Shouldn't go through all these pushups for simple chrs */ |
| 677 | assert(c > MAX_SIMPLE_CHR); |
| 678 | |
| 679 | /* |
| 680 | * Note: we should not see requests to consider cclasses that are not |
| 681 | * treated as locale-specific by cclass(), above. |
| 682 | */ |
| 683 | if (cm->classbits[CC_PRINT] && pg_wc_isprint(c)) |
| 684 | colnum |= cm->classbits[CC_PRINT]; |
| 685 | if (cm->classbits[CC_ALNUM] && pg_wc_isalnum(c)) |
| 686 | colnum |= cm->classbits[CC_ALNUM]; |
| 687 | if (cm->classbits[CC_ALPHA] && pg_wc_isalpha(c)) |
| 688 | colnum |= cm->classbits[CC_ALPHA]; |
| 689 | assert(cm->classbits[CC_ASCII] == 0); |
| 690 | assert(cm->classbits[CC_BLANK] == 0); |
| 691 | assert(cm->classbits[CC_CNTRL] == 0); |
| 692 | if (cm->classbits[CC_DIGIT] && pg_wc_isdigit(c)) |
| 693 | colnum |= cm->classbits[CC_DIGIT]; |
| 694 | if (cm->classbits[CC_PUNCT] && pg_wc_ispunct(c)) |
| 695 | colnum |= cm->classbits[CC_PUNCT]; |
| 696 | assert(cm->classbits[CC_XDIGIT] == 0); |
| 697 | if (cm->classbits[CC_SPACE] && pg_wc_isspace(c)) |
| 698 | colnum |= cm->classbits[CC_SPACE]; |
| 699 | if (cm->classbits[CC_LOWER] && pg_wc_islower(c)) |
| 700 | colnum |= cm->classbits[CC_LOWER]; |
| 701 | if (cm->classbits[CC_UPPER] && pg_wc_isupper(c)) |
| 702 | colnum |= cm->classbits[CC_UPPER]; |
| 703 | if (cm->classbits[CC_GRAPH] && pg_wc_isgraph(c)) |
| 704 | colnum |= cm->classbits[CC_GRAPH]; |
| 705 | |
| 706 | return colnum; |
| 707 | } |
| 708 | |
| 709 | /* |
| 710 | * allcases - supply cvec for all case counterparts of a chr (including itself) |
| 711 | * |
| 712 | * This is a shortcut, preferably an efficient one, for simple characters; |
| 713 | * messy cases are done via range(). |
| 714 | */ |
| 715 | static struct cvec * |
| 716 | allcases(struct vars *v, /* context */ |
| 717 | chr c) /* character to get case equivs of */ |
| 718 | { |
| 719 | struct cvec *cv; |
| 720 | chr lc, |
| 721 | uc; |
| 722 | |
| 723 | lc = pg_wc_tolower(c); |
| 724 | uc = pg_wc_toupper(c); |
| 725 | |
| 726 | cv = getcvec(v, 2, 0); |
| 727 | addchr(cv, lc); |
| 728 | if (lc != uc) |
| 729 | addchr(cv, uc); |
| 730 | return cv; |
| 731 | } |
| 732 | |
| 733 | /* |
| 734 | * cmp - chr-substring compare |
| 735 | * |
| 736 | * Backrefs need this. It should preferably be efficient. |
| 737 | * Note that it does not need to report anything except equal/unequal. |
| 738 | * Note also that the length is exact, and the comparison should not |
| 739 | * stop at embedded NULs! |
| 740 | */ |
| 741 | static int /* 0 for equal, nonzero for unequal */ |
| 742 | cmp(const chr *x, const chr *y, /* strings to compare */ |
| 743 | size_t len) /* exact length of comparison */ |
| 744 | { |
| 745 | return memcmp(VS(x), VS(y), len * sizeof(chr)); |
| 746 | } |
| 747 | |
| 748 | /* |
| 749 | * casecmp - case-independent chr-substring compare |
| 750 | * |
| 751 | * REG_ICASE backrefs need this. It should preferably be efficient. |
| 752 | * Note that it does not need to report anything except equal/unequal. |
| 753 | * Note also that the length is exact, and the comparison should not |
| 754 | * stop at embedded NULs! |
| 755 | */ |
| 756 | static int /* 0 for equal, nonzero for unequal */ |
| 757 | casecmp(const chr *x, const chr *y, /* strings to compare */ |
| 758 | size_t len) /* exact length of comparison */ |
| 759 | { |
| 760 | for (; len > 0; len--, x++, y++) |
| 761 | { |
| 762 | if ((*x != *y) && (pg_wc_tolower(*x) != pg_wc_tolower(*y))) |
| 763 | return 1; |
| 764 | } |
| 765 | return 0; |
| 766 | } |
| 767 | |