| 1 | /************************************************* | 
|---|
| 2 | *      Perl-Compatible Regular Expressions       * | 
|---|
| 3 | *************************************************/ | 
|---|
| 4 |  | 
|---|
| 5 | /* PCRE is a library of functions to support regular expressions whose syntax | 
|---|
| 6 | and semantics are as close as possible to those of the Perl 5 language. | 
|---|
| 7 |  | 
|---|
| 8 | Written by Philip Hazel | 
|---|
| 9 | Original API code Copyright (c) 1997-2012 University of Cambridge | 
|---|
| 10 | New API code Copyright (c) 2016-2021 University of Cambridge | 
|---|
| 11 |  | 
|---|
| 12 | ----------------------------------------------------------------------------- | 
|---|
| 13 | Redistribution and use in source and binary forms, with or without | 
|---|
| 14 | modification, are permitted provided that the following conditions are met: | 
|---|
| 15 |  | 
|---|
| 16 | * Redistributions of source code must retain the above copyright notice, | 
|---|
| 17 | this list of conditions and the following disclaimer. | 
|---|
| 18 |  | 
|---|
| 19 | * Redistributions in binary form must reproduce the above copyright | 
|---|
| 20 | notice, this list of conditions and the following disclaimer in the | 
|---|
| 21 | documentation and/or other materials provided with the distribution. | 
|---|
| 22 |  | 
|---|
| 23 | * Neither the name of the University of Cambridge nor the names of its | 
|---|
| 24 | contributors may be used to endorse or promote products derived from | 
|---|
| 25 | this software without specific prior written permission. | 
|---|
| 26 |  | 
|---|
| 27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | 
|---|
| 28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | 
|---|
| 29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | 
|---|
| 30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | 
|---|
| 31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | 
|---|
| 32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | 
|---|
| 33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | 
|---|
| 34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | 
|---|
| 35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | 
|---|
| 36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | 
|---|
| 37 | POSSIBILITY OF SUCH DAMAGE. | 
|---|
| 38 | ----------------------------------------------------------------------------- | 
|---|
| 39 | */ | 
|---|
| 40 |  | 
|---|
| 41 | /* This module contains some fixed tables that are used by more than one of the | 
|---|
| 42 | PCRE2 code modules. The tables are also #included by the pcre2test program, | 
|---|
| 43 | which uses macros to change their names from _pcre2_xxx to xxx, thereby | 
|---|
| 44 | avoiding name clashes with the library. In this case, PCRE2_PCRE2TEST is | 
|---|
| 45 | defined. */ | 
|---|
| 46 |  | 
|---|
| 47 | #ifndef PCRE2_PCRE2TEST           /* We're compiling the library */ | 
|---|
| 48 | #ifdef HAVE_CONFIG_H | 
|---|
| 49 | #include "config.h" | 
|---|
| 50 | #endif | 
|---|
| 51 | #include "pcre2_internal.h" | 
|---|
| 52 | #endif /* PCRE2_PCRE2TEST */ | 
|---|
| 53 |  | 
|---|
| 54 | /* Table of sizes for the fixed-length opcodes. It's defined in a macro so that | 
|---|
| 55 | the definition is next to the definition of the opcodes in pcre2_internal.h. | 
|---|
| 56 | This is mode-dependent, so it is skipped when this file is included by | 
|---|
| 57 | pcre2test. */ | 
|---|
| 58 |  | 
|---|
| 59 | #ifndef PCRE2_PCRE2TEST | 
|---|
| 60 | const uint8_t PRIV(OP_lengths)[] = { OP_LENGTHS }; | 
|---|
| 61 | #endif | 
|---|
| 62 |  | 
|---|
| 63 | /* Tables of horizontal and vertical whitespace characters, suitable for | 
|---|
| 64 | adding to classes. */ | 
|---|
| 65 |  | 
|---|
| 66 | const uint32_t PRIV(hspace_list)[] = { HSPACE_LIST }; | 
|---|
| 67 | const uint32_t PRIV(vspace_list)[] = { VSPACE_LIST }; | 
|---|
| 68 |  | 
|---|
| 69 | /* These tables are the pairs of delimiters that are valid for callout string | 
|---|
| 70 | arguments. For each starting delimiter there must be a matching ending | 
|---|
| 71 | delimiter, which in fact is different only for bracket-like delimiters. */ | 
|---|
| 72 |  | 
|---|
| 73 | const uint32_t PRIV(callout_start_delims)[] = { | 
|---|
| 74 | CHAR_GRAVE_ACCENT, CHAR_APOSTROPHE, CHAR_QUOTATION_MARK, | 
|---|
| 75 | CHAR_CIRCUMFLEX_ACCENT, CHAR_PERCENT_SIGN, CHAR_NUMBER_SIGN, | 
|---|
| 76 | CHAR_DOLLAR_SIGN, CHAR_LEFT_CURLY_BRACKET, 0 }; | 
|---|
| 77 |  | 
|---|
| 78 | const uint32_t PRIV(callout_end_delims[]) = { | 
|---|
| 79 | CHAR_GRAVE_ACCENT, CHAR_APOSTROPHE, CHAR_QUOTATION_MARK, | 
|---|
| 80 | CHAR_CIRCUMFLEX_ACCENT, CHAR_PERCENT_SIGN, CHAR_NUMBER_SIGN, | 
|---|
| 81 | CHAR_DOLLAR_SIGN, CHAR_RIGHT_CURLY_BRACKET, 0 }; | 
|---|
| 82 |  | 
|---|
| 83 |  | 
|---|
| 84 | /************************************************* | 
|---|
| 85 | *           Tables for UTF-8 support             * | 
|---|
| 86 | *************************************************/ | 
|---|
| 87 |  | 
|---|
| 88 | /* These tables are required by pcre2test in 16- or 32-bit mode, as well | 
|---|
| 89 | as for the library in 8-bit mode, because pcre2test uses UTF-8 internally for | 
|---|
| 90 | handling wide characters. */ | 
|---|
| 91 |  | 
|---|
| 92 | #if defined PCRE2_PCRE2TEST || \ | 
|---|
| 93 | (defined SUPPORT_UNICODE && \ | 
|---|
| 94 | defined PCRE2_CODE_UNIT_WIDTH && \ | 
|---|
| 95 | PCRE2_CODE_UNIT_WIDTH == 8) | 
|---|
| 96 |  | 
|---|
| 97 | /* These are the breakpoints for different numbers of bytes in a UTF-8 | 
|---|
| 98 | character. */ | 
|---|
| 99 |  | 
|---|
| 100 | const int PRIV(utf8_table1)[] = | 
|---|
| 101 | { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff}; | 
|---|
| 102 |  | 
|---|
| 103 | const int PRIV(utf8_table1_size) = sizeof(PRIV(utf8_table1)) / sizeof(int); | 
|---|
| 104 |  | 
|---|
| 105 | /* These are the indicator bits and the mask for the data bits to set in the | 
|---|
| 106 | first byte of a character, indexed by the number of additional bytes. */ | 
|---|
| 107 |  | 
|---|
| 108 | const int PRIV(utf8_table2)[] = { 0,    0xc0, 0xe0, 0xf0, 0xf8, 0xfc}; | 
|---|
| 109 | const int PRIV(utf8_table3)[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01}; | 
|---|
| 110 |  | 
|---|
| 111 | /* Table of the number of extra bytes, indexed by the first byte masked with | 
|---|
| 112 | 0x3f. The highest number for a valid UTF-8 first byte is in fact 0x3d. */ | 
|---|
| 113 |  | 
|---|
| 114 | const uint8_t PRIV(utf8_table4)[] = { | 
|---|
| 115 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, | 
|---|
| 116 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, | 
|---|
| 117 | 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, | 
|---|
| 118 | 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 }; | 
|---|
| 119 |  | 
|---|
| 120 | #endif /* UTF-8 support needed */ | 
|---|
| 121 |  | 
|---|
| 122 | /* Tables concerned with Unicode properties are relevant only when Unicode | 
|---|
| 123 | support is enabled. See also the pcre2_ucptables.c file, which is generated by | 
|---|
| 124 | a Python script from Unicode data files. */ | 
|---|
| 125 |  | 
|---|
| 126 | #ifdef SUPPORT_UNICODE | 
|---|
| 127 |  | 
|---|
| 128 | /* Table to translate from particular type value to the general value. */ | 
|---|
| 129 |  | 
|---|
| 130 | const uint32_t PRIV(ucp_gentype)[] = { | 
|---|
| 131 | ucp_C, ucp_C, ucp_C, ucp_C, ucp_C,  /* Cc, Cf, Cn, Co, Cs */ | 
|---|
| 132 | ucp_L, ucp_L, ucp_L, ucp_L, ucp_L,  /* Ll, Lu, Lm, Lo, Lt */ | 
|---|
| 133 | ucp_M, ucp_M, ucp_M,                /* Mc, Me, Mn */ | 
|---|
| 134 | ucp_N, ucp_N, ucp_N,                /* Nd, Nl, No */ | 
|---|
| 135 | ucp_P, ucp_P, ucp_P, ucp_P, ucp_P,  /* Pc, Pd, Pe, Pf, Pi */ | 
|---|
| 136 | ucp_P, ucp_P,                       /* Ps, Po */ | 
|---|
| 137 | ucp_S, ucp_S, ucp_S, ucp_S,         /* Sc, Sk, Sm, So */ | 
|---|
| 138 | ucp_Z, ucp_Z, ucp_Z                 /* Zl, Zp, Zs */ | 
|---|
| 139 | }; | 
|---|
| 140 |  | 
|---|
| 141 | /* This table encodes the rules for finding the end of an extended grapheme | 
|---|
| 142 | cluster. Every code point has a grapheme break property which is one of the | 
|---|
| 143 | ucp_gbXX values defined in pcre2_ucp.h. These changed between Unicode versions | 
|---|
| 144 | 10 and 11. The 2-dimensional table is indexed by the properties of two adjacent | 
|---|
| 145 | code points. The left property selects a word from the table, and the right | 
|---|
| 146 | property selects a bit from that word like this: | 
|---|
| 147 |  | 
|---|
| 148 | PRIV(ucp_gbtable)[left-property] & (1u << right-property) | 
|---|
| 149 |  | 
|---|
| 150 | The value is non-zero if a grapheme break is NOT permitted between the relevant | 
|---|
| 151 | two code points. The breaking rules are as follows: | 
|---|
| 152 |  | 
|---|
| 153 | 1. Break at the start and end of text (pretty obviously). | 
|---|
| 154 |  | 
|---|
| 155 | 2. Do not break between a CR and LF; otherwise, break before and after | 
|---|
| 156 | controls. | 
|---|
| 157 |  | 
|---|
| 158 | 3. Do not break Hangul syllable sequences, the rules for which are: | 
|---|
| 159 |  | 
|---|
| 160 | L may be followed by L, V, LV or LVT | 
|---|
| 161 | LV or V may be followed by V or T | 
|---|
| 162 | LVT or T may be followed by T | 
|---|
| 163 |  | 
|---|
| 164 | 4. Do not break before extending characters or zero-width-joiner (ZWJ). | 
|---|
| 165 |  | 
|---|
| 166 | The following rules are only for extended grapheme clusters (but that's what we | 
|---|
| 167 | are implementing). | 
|---|
| 168 |  | 
|---|
| 169 | 5. Do not break before SpacingMarks. | 
|---|
| 170 |  | 
|---|
| 171 | 6. Do not break after Prepend characters. | 
|---|
| 172 |  | 
|---|
| 173 | 7. Do not break within emoji modifier sequences or emoji zwj sequences. That | 
|---|
| 174 | is, do not break between characters with the Extended_Pictographic property. | 
|---|
| 175 | Extend and ZWJ characters are allowed between the characters; this cannot be | 
|---|
| 176 | represented in this table, the code has to deal with it. | 
|---|
| 177 |  | 
|---|
| 178 | 8. Do not break within emoji flag sequences. That is, do not break between | 
|---|
| 179 | regional indicator (RI) symbols if there are an odd number of RI characters | 
|---|
| 180 | before the break point. This table encodes "join RI characters"; the code | 
|---|
| 181 | has to deal with checking for previous adjoining RIs. | 
|---|
| 182 |  | 
|---|
| 183 | 9. Otherwise, break everywhere. | 
|---|
| 184 | */ | 
|---|
| 185 |  | 
|---|
| 186 | #define ESZ (1<<ucp_gbExtend)|(1<<ucp_gbSpacingMark)|(1<<ucp_gbZWJ) | 
|---|
| 187 |  | 
|---|
| 188 | const uint32_t PRIV(ucp_gbtable)[] = { | 
|---|
| 189 | (1u<<ucp_gbLF),                                      /*  0 CR */ | 
|---|
| 190 | 0,                                                   /*  1 LF */ | 
|---|
| 191 | 0,                                                   /*  2 Control */ | 
|---|
| 192 | ESZ,                                                 /*  3 Extend */ | 
|---|
| 193 | ESZ|(1u<<ucp_gbPrepend)|                             /*  4 Prepend */ | 
|---|
| 194 | (1u<<ucp_gbL)|(1u<<ucp_gbV)|(1u<<ucp_gbT)| | 
|---|
| 195 | (1u<<ucp_gbLV)|(1u<<ucp_gbLVT)|(1u<<ucp_gbOther)| | 
|---|
| 196 | (1u<<ucp_gbRegional_Indicator), | 
|---|
| 197 | ESZ,                                                 /*  5 SpacingMark */ | 
|---|
| 198 | ESZ|(1u<<ucp_gbL)|(1u<<ucp_gbV)|(1u<<ucp_gbLV)|      /*  6 L */ | 
|---|
| 199 | (1u<<ucp_gbLVT), | 
|---|
| 200 | ESZ|(1u<<ucp_gbV)|(1u<<ucp_gbT),                     /*  7 V */ | 
|---|
| 201 | ESZ|(1u<<ucp_gbT),                                   /*  8 T */ | 
|---|
| 202 | ESZ|(1u<<ucp_gbV)|(1u<<ucp_gbT),                     /*  9 LV */ | 
|---|
| 203 | ESZ|(1u<<ucp_gbT),                                   /* 10 LVT */ | 
|---|
| 204 | (1u<<ucp_gbRegional_Indicator),                      /* 11 Regional Indicator */ | 
|---|
| 205 | ESZ,                                                 /* 12 Other */ | 
|---|
| 206 | ESZ,                                                 /* 13 ZWJ */ | 
|---|
| 207 | ESZ|(1u<<ucp_gbExtended_Pictographic)                /* 14 Extended Pictographic */ | 
|---|
| 208 | }; | 
|---|
| 209 |  | 
|---|
| 210 | #undef ESZ | 
|---|
| 211 |  | 
|---|
#if defined SUPPORT_JIT
/* The reverse of PRIV(ucp_gentype), provided so that the cost of a memory
load can be saved. The entries are laid out as pairs, one pair for each general
type, in the same order as the groups in PRIV(ucp_gentype). Presumably each
pair holds the first and last particular type of its general type - confirm
against the ucp_ definitions. */

const int PRIV(ucp_typerange)[] = {
  ucp_Cc, ucp_Cs,   /* C */
  ucp_Ll, ucp_Lu,   /* L */
  ucp_Mc, ucp_Mn,   /* M */
  ucp_Nd, ucp_No,   /* N */
  ucp_Pc, ucp_Ps,   /* P */
  ucp_Sc, ucp_So,   /* S */
  ucp_Zl, ucp_Zs,   /* Z */
};
#endif  /* SUPPORT_JIT */
|---|
| 226 |  | 
|---|
| 227 | /* Finally, include the tables that are auto-generated from the Unicode data | 
|---|
| 228 | files. */ | 
|---|
| 229 |  | 
|---|
| 230 | #include "pcre2_ucptables.c" | 
|---|
| 231 |  | 
|---|
| 232 | #endif /* SUPPORT_UNICODE */ | 
|---|
| 233 |  | 
|---|
| 234 | /* End of pcre2_tables.c */ | 
|---|
| 235 |  | 
|---|