| 1 | // © 2016 and later: Unicode, Inc. and others. | 
|---|
| 2 | // License & terms of use: http://www.unicode.org/copyright.html | 
|---|
| 3 | // | 
|---|
| 4 | //   Copyright (C) 2002-2015 International Business Machines Corporation | 
|---|
| 5 | //   and others. All rights reserved. | 
|---|
| 6 | // | 
|---|
| 7 | //   file:  regeximp.h | 
|---|
| 8 | // | 
|---|
| 9 | //           ICU Regular Expressions, | 
|---|
| 10 | //               Definitions of constant values used in the compiled form of | 
|---|
| 11 | //               a regular expression pattern. | 
|---|
| 12 | // | 
|---|
| 13 |  | 
|---|
| 14 | #ifndef _REGEXIMP_H | 
|---|
| 15 | #define _REGEXIMP_H | 
|---|
| 16 |  | 
|---|
| 17 | #include "unicode/utypes.h" | 
|---|
| 18 | #include "unicode/uobject.h" | 
|---|
| 19 | #include "unicode/uniset.h" | 
|---|
| 20 | #include "unicode/utext.h" | 
|---|
| 21 |  | 
|---|
| 22 | #include "cmemory.h" | 
|---|
| 23 | #include "ucase.h" | 
|---|
| 24 |  | 
|---|
| 25 | U_NAMESPACE_BEGIN | 
|---|
| 26 |  | 
|---|
| 27 | // For debugging, define REGEX_DEBUG | 
|---|
| 28 | // To define with configure, | 
|---|
| 29 | //   CPPFLAGS="-DREGEX_DEBUG" ./runConfigureICU --enable-debug --disable-release Linux | 
|---|
| 30 |  | 
|---|
| 31 | #ifdef REGEX_DEBUG | 
|---|
| 32 | // | 
|---|
| 33 | //  debugging options.  Enable one or more of the three #defines immediately following | 
|---|
| 34 | // | 
|---|
| 35 |  | 
|---|
| 36 | //#define REGEX_SCAN_DEBUG | 
|---|
| 37 | #define REGEX_DUMP_DEBUG | 
|---|
| 38 | #define REGEX_RUN_DEBUG | 
|---|
| 39 |  | 
|---|
| 40 | //  End of #defines inteded to be directly set. | 
|---|
| 41 |  | 
|---|
| 42 | #include <stdio.h> | 
|---|
| 43 | #endif | 
|---|
| 44 |  | 
|---|
| 45 | #ifdef REGEX_SCAN_DEBUG | 
|---|
| 46 | #define REGEX_SCAN_DEBUG_PRINTF(a) printf a | 
|---|
| 47 | #else | 
|---|
| 48 | #define REGEX_SCAN_DEBUG_PRINTF(a) | 
|---|
| 49 | #endif | 
|---|
| 50 |  | 
|---|
| 51 |  | 
|---|
| 52 | // | 
|---|
| 53 | //  Opcode types     In the compiled form of the regexp, these are the type, or opcodes, | 
|---|
| 54 | //                   of the entries. | 
|---|
| 55 | // | 
|---|
| 56 | enum { | 
|---|
| 57 | URX_RESERVED_OP   = 0,    // For multi-operand ops, most non-first words. | 
|---|
| 58 | URX_RESERVED_OP_N = 255,  // For multi-operand ops, negative operand values. | 
|---|
| 59 | URX_BACKTRACK     = 1,    // Force a backtrack, as if a match test had failed. | 
|---|
| 60 | URX_END           = 2, | 
|---|
| 61 | URX_ONECHAR       = 3,    // Value field is the 21 bit unicode char to match | 
|---|
| 62 | URX_STRING        = 4,    // Value field is index of string start | 
|---|
| 63 | URX_STRING_LEN    = 5,    // Value field is string length (code units) | 
|---|
| 64 | URX_STATE_SAVE    = 6,    // Value field is pattern position to push | 
|---|
| 65 | URX_NOP           = 7, | 
|---|
| 66 | URX_START_CAPTURE = 8,    // Value field is capture group number. | 
|---|
| 67 | URX_END_CAPTURE   = 9,    // Value field is capture group number | 
|---|
| 68 | URX_STATIC_SETREF = 10,   // Value field is index of set in array of sets. | 
|---|
| 69 | URX_SETREF        = 11,   // Value field is index of set in array of sets. | 
|---|
| 70 | URX_DOTANY        = 12, | 
|---|
| 71 | URX_JMP           = 13,   // Value field is destination position in | 
|---|
| 72 | //   the pattern. | 
|---|
| 73 | URX_FAIL          = 14,   // Stop match operation,  No match. | 
|---|
| 74 |  | 
|---|
| 75 | URX_JMP_SAV       = 15,   // Operand:  JMP destination location | 
|---|
| 76 | URX_BACKSLASH_B   = 16,   // Value field:  0:  \b    1:  \B | 
|---|
| 77 | URX_BACKSLASH_G   = 17, | 
|---|
| 78 | URX_JMP_SAV_X     = 18,   // Conditional JMP_SAV, | 
|---|
| 79 | //    Used in (x)+, breaks loop on zero length match. | 
|---|
| 80 | //    Operand:  Jmp destination. | 
|---|
| 81 | URX_BACKSLASH_X   = 19, | 
|---|
| 82 | URX_BACKSLASH_Z   = 20,   // \z   Unconditional end of line. | 
|---|
| 83 |  | 
|---|
| 84 | URX_DOTANY_ALL    = 21,   // ., in the . matches any mode. | 
|---|
| 85 | URX_BACKSLASH_D   = 22,   // Value field:  0:  \d    1:  \D | 
|---|
| 86 | URX_CARET         = 23,   // Value field:  1:  multi-line mode. | 
|---|
| 87 | URX_DOLLAR        = 24,  // Also for \Z | 
|---|
| 88 |  | 
|---|
| 89 | URX_CTR_INIT      = 25,   // Counter Inits for {Interval} loops. | 
|---|
| 90 | URX_CTR_INIT_NG   = 26,   //   2 kinds, normal and non-greedy. | 
|---|
| 91 | //   These are 4 word opcodes.  See description. | 
|---|
| 92 | //    First Operand:  Data loc of counter variable | 
|---|
| 93 | //    2nd   Operand:  Pat loc of the URX_CTR_LOOPx | 
|---|
| 94 | //                    at the end of the loop. | 
|---|
| 95 | //    3rd   Operand:  Minimum count. | 
|---|
| 96 | //    4th   Operand:  Max count, -1 for unbounded. | 
|---|
| 97 |  | 
|---|
| 98 | URX_DOTANY_UNIX   = 27,   // '.' operator in UNIX_LINES mode, only \n marks end of line. | 
|---|
| 99 |  | 
|---|
| 100 | URX_CTR_LOOP      = 28,   // Loop Ops for {interval} loops. | 
|---|
| 101 | URX_CTR_LOOP_NG   = 29,   //   Also in three flavors. | 
|---|
| 102 | //   Operand is loc of corresponding CTR_INIT. | 
|---|
| 103 |  | 
|---|
| 104 | URX_CARET_M_UNIX  = 30,   // '^' operator, test for start of line in multi-line | 
|---|
| 105 | //      plus UNIX_LINES mode. | 
|---|
| 106 |  | 
|---|
| 107 | URX_RELOC_OPRND   = 31,   // Operand value in multi-operand ops that refers | 
|---|
| 108 | //   back into compiled pattern code, and thus must | 
|---|
| 109 | //   be relocated when inserting/deleting ops in code. | 
|---|
| 110 |  | 
|---|
| 111 | URX_STO_SP        = 32,   // Store the stack ptr.  Operand is location within | 
|---|
| 112 | //   matcher data (not stack data) to store it. | 
|---|
| 113 | URX_LD_SP         = 33,   // Load the stack pointer.  Operand is location | 
|---|
| 114 | //   to load from. | 
|---|
| 115 | URX_BACKREF       = 34,   // Back Reference.  Parameter is the index of the | 
|---|
| 116 | //   capture group variables in the state stack frame. | 
|---|
| 117 | URX_STO_INP_LOC   = 35,   // Store the input location.  Operand is location | 
|---|
| 118 | //   within the matcher stack frame. | 
|---|
| 119 | URX_JMPX          = 36,  // Conditional JMP. | 
|---|
| 120 | //   First Operand:  JMP target location. | 
|---|
| 121 | //   Second Operand:  Data location containing an | 
|---|
| 122 | //     input position.  If current input position == | 
|---|
| 123 | //     saved input position, FAIL rather than taking | 
|---|
| 124 | //     the JMP | 
|---|
| 125 | URX_LA_START      = 37,   // Starting a LookAround expression. | 
|---|
| 126 | //   Save InputPos, SP and active region in static data. | 
|---|
| 127 | //   Operand:  Static data offset for the save | 
|---|
| 128 | URX_LA_END        = 38,   // Ending a Lookaround expression. | 
|---|
| 129 | //   Restore InputPos and Stack to saved values. | 
|---|
| 130 | //   Operand:  Static data offset for saved data. | 
|---|
| 131 | URX_ONECHAR_I     = 39,   // Test for case-insensitive match of a literal character. | 
|---|
| 132 | //   Operand:  the literal char. | 
|---|
| 133 | URX_STRING_I      = 40,   // Case insensitive string compare. | 
|---|
| 134 | //   First Operand:  Index of start of string in string literals | 
|---|
| 135 | //   Second Operand (next word in compiled code): | 
|---|
| 136 | //     the length of the string. | 
|---|
| 137 | URX_BACKREF_I     = 41,   // Case insensitive back reference. | 
|---|
| 138 | //   Parameter is the index of the | 
|---|
| 139 | //   capture group variables in the state stack frame. | 
|---|
| 140 | URX_DOLLAR_M      = 42,   // $ in multi-line mode. | 
|---|
| 141 | URX_CARET_M       = 43,   // ^ in multi-line mode. | 
|---|
| 142 | URX_LB_START      = 44,   // LookBehind Start. | 
|---|
| 143 | //   Paramater is data location | 
|---|
| 144 | URX_LB_CONT       = 45,   // LookBehind Continue. | 
|---|
| 145 | //   Param 0:  the data location | 
|---|
| 146 | //   Param 1:  The minimum length of the look-behind match | 
|---|
| 147 | //   Param 2:  The max length of the look-behind match | 
|---|
| 148 | URX_LB_END        = 46,   // LookBehind End. | 
|---|
| 149 | //   Parameter is the data location. | 
|---|
| 150 | //     Check that match ended at the right spot, | 
|---|
| 151 | //     Restore original input string len. | 
|---|
| 152 | URX_LBN_CONT      = 47,   // Negative LookBehind Continue | 
|---|
| 153 | //   Param 0:  the data location | 
|---|
| 154 | //   Param 1:  The minimum length of the look-behind match | 
|---|
| 155 | //   Param 2:  The max     length of the look-behind match | 
|---|
| 156 | //   Param 3:  The pattern loc following the look-behind block. | 
|---|
| 157 | URX_LBN_END       = 48,   // Negative LookBehind end | 
|---|
| 158 | //   Parameter is the data location. | 
|---|
| 159 | //   Check that the match ended at the right spot. | 
|---|
| 160 | URX_STAT_SETREF_N = 49,   // Reference to a prebuilt set (e.g. \w), negated | 
|---|
| 161 | //   Operand is index of set in array of sets. | 
|---|
| 162 | URX_LOOP_SR_I     = 50,   // Init a [set]* loop. | 
|---|
| 163 | //   Operand is the sets index in array of user sets. | 
|---|
| 164 | URX_LOOP_C        = 51,   // Continue a [set]* or OneChar* loop. | 
|---|
| 165 | //   Operand is a matcher static data location. | 
|---|
| 166 | //   Must always immediately follow  LOOP_x_I instruction. | 
|---|
| 167 | URX_LOOP_DOT_I    = 52,   // .*, initialization of the optimized loop. | 
|---|
| 168 | //   Operand value: | 
|---|
| 169 | //      bit 0: | 
|---|
| 170 | //         0:  Normal (. doesn't match new-line) mode. | 
|---|
| 171 | //         1:  . matches new-line mode. | 
|---|
| 172 | //      bit 1:  controls what new-lines are recognized by this operation. | 
|---|
| 173 | //         0:  All Unicode New-lines | 
|---|
| 174 | //         1:  UNIX_LINES, \u000a only. | 
|---|
| 175 | URX_BACKSLASH_BU  = 53,   // \b or \B in UREGEX_UWORD mode, using Unicode style | 
|---|
| 176 | //   word boundaries. | 
|---|
| 177 | URX_DOLLAR_D      = 54,   // $ end of input test, in UNIX_LINES mode. | 
|---|
| 178 | URX_DOLLAR_MD     = 55,   // $ end of input test, in MULTI_LINE and UNIX_LINES mode. | 
|---|
| 179 | URX_BACKSLASH_H   = 56,   // Value field:  0:  \h    1:  \H | 
|---|
| 180 | URX_BACKSLASH_R   = 57,   // Any line break sequence. | 
|---|
| 181 | URX_BACKSLASH_V   = 58    // Value field:  0:  \v    1:  \V | 
|---|
| 182 |  | 
|---|
| 183 | }; | 
|---|
| 184 |  | 
|---|
| 185 | // Keep this list of opcode names in sync with the above enum | 
|---|
| 186 | //   Used for debug printing only. | 
|---|
| 187 | #define URX_OPCODE_NAMES       \ | 
|---|
| 188 | "               ",     \ | 
|---|
| 189 | "BACKTRACK",           \ | 
|---|
| 190 | "END",                 \ | 
|---|
| 191 | "ONECHAR",             \ | 
|---|
| 192 | "STRING",              \ | 
|---|
| 193 | "STRING_LEN",          \ | 
|---|
| 194 | "STATE_SAVE",          \ | 
|---|
| 195 | "NOP",                 \ | 
|---|
| 196 | "START_CAPTURE",       \ | 
|---|
| 197 | "END_CAPTURE",         \ | 
|---|
| 198 | "URX_STATIC_SETREF",   \ | 
|---|
| 199 | "SETREF",              \ | 
|---|
| 200 | "DOTANY",              \ | 
|---|
| 201 | "JMP",                 \ | 
|---|
| 202 | "FAIL",                \ | 
|---|
| 203 | "JMP_SAV",             \ | 
|---|
| 204 | "BACKSLASH_B",         \ | 
|---|
| 205 | "BACKSLASH_G",         \ | 
|---|
| 206 | "JMP_SAV_X",           \ | 
|---|
| 207 | "BACKSLASH_X",         \ | 
|---|
| 208 | "BACKSLASH_Z",         \ | 
|---|
| 209 | "DOTANY_ALL",          \ | 
|---|
| 210 | "BACKSLASH_D",         \ | 
|---|
| 211 | "CARET",               \ | 
|---|
| 212 | "DOLLAR",              \ | 
|---|
| 213 | "CTR_INIT",            \ | 
|---|
| 214 | "CTR_INIT_NG",         \ | 
|---|
| 215 | "DOTANY_UNIX",         \ | 
|---|
| 216 | "CTR_LOOP",            \ | 
|---|
| 217 | "CTR_LOOP_NG",         \ | 
|---|
| 218 | "URX_CARET_M_UNIX",    \ | 
|---|
| 219 | "RELOC_OPRND",         \ | 
|---|
| 220 | "STO_SP",              \ | 
|---|
| 221 | "LD_SP",               \ | 
|---|
| 222 | "BACKREF",             \ | 
|---|
| 223 | "STO_INP_LOC",         \ | 
|---|
| 224 | "JMPX",                \ | 
|---|
| 225 | "LA_START",            \ | 
|---|
| 226 | "LA_END",              \ | 
|---|
| 227 | "ONECHAR_I",           \ | 
|---|
| 228 | "STRING_I",            \ | 
|---|
| 229 | "BACKREF_I",           \ | 
|---|
| 230 | "DOLLAR_M",            \ | 
|---|
| 231 | "CARET_M",             \ | 
|---|
| 232 | "LB_START",            \ | 
|---|
| 233 | "LB_CONT",             \ | 
|---|
| 234 | "LB_END",              \ | 
|---|
| 235 | "LBN_CONT",            \ | 
|---|
| 236 | "LBN_END",             \ | 
|---|
| 237 | "STAT_SETREF_N",       \ | 
|---|
| 238 | "LOOP_SR_I",           \ | 
|---|
| 239 | "LOOP_C",              \ | 
|---|
| 240 | "LOOP_DOT_I",          \ | 
|---|
| 241 | "BACKSLASH_BU",        \ | 
|---|
| 242 | "DOLLAR_D",            \ | 
|---|
| 243 | "DOLLAR_MD",           \ | 
|---|
| 244 | "URX_BACKSLASH_H",     \ | 
|---|
| 245 | "URX_BACKSLASH_R",     \ | 
|---|
| 246 | "URX_BACKSLASH_V" | 
|---|
| 247 |  | 
|---|
| 248 |  | 
|---|
| 249 | // | 
|---|
| 250 | //  Convenience macros for assembling and disassembling a compiled operation. | 
|---|
| 251 | // | 
|---|
| 252 | #define URX_TYPE(x)          ((uint32_t)(x) >> 24) | 
|---|
| 253 | #define URX_VAL(x)           ((x) & 0xffffff) | 
|---|
| 254 |  | 
|---|
| 255 |  | 
|---|
| 256 | // | 
|---|
| 257 | //  Access to Unicode Sets composite character properties | 
|---|
| 258 | //     The sets are accessed by the match engine for things like \w (word boundary) | 
|---|
| 259 | // | 
|---|
| 260 | enum { | 
|---|
| 261 | URX_ISWORD_SET  = 1, | 
|---|
| 262 | URX_ISALNUM_SET = 2, | 
|---|
| 263 | URX_ISALPHA_SET = 3, | 
|---|
| 264 | URX_ISSPACE_SET = 4, | 
|---|
| 265 |  | 
|---|
| 266 | URX_GC_NORMAL,          // Sets for finding grapheme cluster boundaries. | 
|---|
| 267 | URX_GC_EXTEND, | 
|---|
| 268 | URX_GC_CONTROL, | 
|---|
| 269 | URX_GC_L, | 
|---|
| 270 | URX_GC_LV, | 
|---|
| 271 | URX_GC_LVT, | 
|---|
| 272 | URX_GC_V, | 
|---|
| 273 | URX_GC_T, | 
|---|
| 274 |  | 
|---|
| 275 | URX_LAST_SET, | 
|---|
| 276 |  | 
|---|
| 277 | URX_NEG_SET     = 0x800000          // Flag bit to reverse sense of set | 
|---|
| 278 | //   membership test. | 
|---|
| 279 | }; | 
|---|
| 280 |  | 
|---|
| 281 |  | 
|---|
| 282 | // | 
|---|
| 283 | //  Match Engine State Stack Frame Layout. | 
|---|
| 284 | // | 
|---|
| 285 | struct REStackFrame { | 
|---|
| 286 | // Header | 
|---|
| 287 | int64_t            fInputIdx;        // Position of next character in the input string | 
|---|
| 288 | int64_t            fPatIdx;          // Position of next Op in the compiled pattern | 
|---|
| 289 | // (int64_t for UVector64, values fit in an int32_t) | 
|---|
| 290 | // Remainder | 
|---|
| 291 | int64_t            [1];        // Extra state, for capture group start/ends | 
|---|
| 292 | //   atomic parentheses, repeat counts, etc. | 
|---|
| 293 | //   Locations assigned at pattern compile time. | 
|---|
| 294 | //   Variable-length array. | 
|---|
| 295 | }; | 
|---|
| 296 | // number of UVector elements in the header | 
|---|
| 297 | #define RESTACKFRAME_HDRCOUNT 2 | 
|---|
| 298 |  | 
|---|
| 299 | // | 
|---|
| 300 | //  Start-Of-Match type.  Used by find() to quickly scan to positions where a | 
|---|
| 301 | //                        match might start before firing up the full match engine. | 
|---|
| 302 | // | 
|---|
| 303 | enum StartOfMatch { | 
|---|
| 304 | START_NO_INFO,             // No hint available. | 
|---|
| 305 | START_CHAR,                // Match starts with a literal code point. | 
|---|
| 306 | START_SET,                 // Match starts with something matching a set. | 
|---|
| 307 | START_START,               // Match starts at start of buffer only (^ or \A) | 
|---|
| 308 | START_LINE,                // Match starts with ^ in multi-line mode. | 
|---|
| 309 | START_STRING               // Match starts with a literal string. | 
|---|
| 310 | }; | 
|---|
| 311 |  | 
|---|
| 312 | #define START_OF_MATCH_STR(v) ((v)==START_NO_INFO? "START_NO_INFO" : \ | 
|---|
| 313 | (v)==START_CHAR?    "START_CHAR"    : \ | 
|---|
| 314 | (v)==START_SET?     "START_SET"     : \ | 
|---|
| 315 | (v)==START_START?   "START_START"   : \ | 
|---|
| 316 | (v)==START_LINE?    "START_LINE"    : \ | 
|---|
| 317 | (v)==START_STRING?  "START_STRING"  : \ | 
|---|
| 318 | "ILLEGAL") | 
|---|
| 319 |  | 
|---|
| 320 | // | 
|---|
| 321 | //  8 bit set, to fast-path latin-1 set membership tests. | 
|---|
| 322 | // | 
|---|
| 323 | struct Regex8BitSet : public UMemory { | 
|---|
| 324 | inline Regex8BitSet(); | 
|---|
| 325 | inline void operator = (const Regex8BitSet &s); | 
|---|
| 326 | inline void init(const UnicodeSet *src); | 
|---|
| 327 | inline UBool contains(UChar32 c); | 
|---|
| 328 | inline void  add(UChar32 c); | 
|---|
| 329 | int8_t d[32]; | 
|---|
| 330 | }; | 
|---|
| 331 |  | 
|---|
| 332 | inline Regex8BitSet::Regex8BitSet() { | 
|---|
| 333 | uprv_memset(d, 0, sizeof(d)); | 
|---|
| 334 | } | 
|---|
| 335 |  | 
|---|
| 336 | inline UBool Regex8BitSet::contains(UChar32 c) { | 
|---|
| 337 | // No bounds checking!  This is deliberate. | 
|---|
| 338 | return ((d[c>>3] & 1 <<(c&7)) != 0); | 
|---|
| 339 | } | 
|---|
| 340 |  | 
|---|
| 341 | inline void  Regex8BitSet::add(UChar32 c) { | 
|---|
| 342 | d[c>>3] |= 1 << (c&7); | 
|---|
| 343 | } | 
|---|
| 344 |  | 
|---|
| 345 | inline void Regex8BitSet::init(const UnicodeSet *s) { | 
|---|
| 346 | if (s != NULL) { | 
|---|
| 347 | for (int32_t i=0; i<=255; i++) { | 
|---|
| 348 | if (s->contains(i)) { | 
|---|
| 349 | this->add(i); | 
|---|
| 350 | } | 
|---|
| 351 | } | 
|---|
| 352 | } | 
|---|
| 353 | } | 
|---|
| 354 |  | 
|---|
| 355 | inline void Regex8BitSet::operator = (const Regex8BitSet &s) { | 
|---|
| 356 | uprv_memcpy(d, s.d, sizeof(d)); | 
|---|
| 357 | } | 
|---|
| 358 |  | 
|---|
| 359 |  | 
|---|
| 360 | //  Case folded UText Iterator helper class. | 
|---|
| 361 | //  Wraps a UText, provides a case-folded enumeration over its contents. | 
|---|
| 362 | //  Used in implementing case insensitive matching constructs. | 
|---|
| 363 | //  Implementation in rematch.cpp | 
|---|
| 364 |  | 
|---|
| 365 | class CaseFoldingUTextIterator: public UMemory { | 
|---|
| 366 | public: | 
|---|
| 367 | CaseFoldingUTextIterator(UText &text); | 
|---|
| 368 | ~CaseFoldingUTextIterator(); | 
|---|
| 369 |  | 
|---|
| 370 | UChar32 next();           // Next case folded character | 
|---|
| 371 |  | 
|---|
| 372 | UBool   inExpansion();    // True if last char returned from next() and the | 
|---|
| 373 | //  next to be returned both originated from a string | 
|---|
| 374 | //  folding of the same code point from the orignal UText. | 
|---|
| 375 | private: | 
|---|
| 376 | UText             &fUText; | 
|---|
| 377 | const  UChar      *fFoldChars; | 
|---|
| 378 | int32_t            fFoldLength; | 
|---|
| 379 | int32_t            fFoldIndex; | 
|---|
| 380 |  | 
|---|
| 381 | }; | 
|---|
| 382 |  | 
|---|
| 383 |  | 
|---|
| 384 | // Case folded UChar * string iterator. | 
|---|
| 385 | //  Wraps a UChar  *, provides a case-folded enumeration over its contents. | 
|---|
| 386 | //  Used in implementing case insensitive matching constructs. | 
|---|
| 387 | //  Implementation in rematch.cpp | 
|---|
| 388 |  | 
|---|
| 389 | class CaseFoldingUCharIterator: public UMemory { | 
|---|
| 390 | public: | 
|---|
| 391 | CaseFoldingUCharIterator(const UChar *chars, int64_t start, int64_t limit); | 
|---|
| 392 | ~CaseFoldingUCharIterator(); | 
|---|
| 393 |  | 
|---|
| 394 | UChar32 next();           // Next case folded character | 
|---|
| 395 |  | 
|---|
| 396 | UBool   inExpansion();    // True if last char returned from next() and the | 
|---|
| 397 | //  next to be returned both originated from a string | 
|---|
| 398 | //  folding of the same code point from the orignal UText. | 
|---|
| 399 |  | 
|---|
| 400 | int64_t  getIndex();      // Return the current input buffer index. | 
|---|
| 401 |  | 
|---|
| 402 | private: | 
|---|
| 403 | const  UChar      *fChars; | 
|---|
| 404 | int64_t            fIndex; | 
|---|
| 405 | int64_t            fLimit; | 
|---|
| 406 | const  UChar      *fFoldChars; | 
|---|
| 407 | int32_t            fFoldLength; | 
|---|
| 408 | int32_t            fFoldIndex; | 
|---|
| 409 |  | 
|---|
| 410 | }; | 
|---|
| 411 |  | 
|---|
| 412 | U_NAMESPACE_END | 
|---|
| 413 | #endif | 
|---|
| 414 |  | 
|---|
| 415 |  | 
|---|