| 1 | // © 2016 and later: Unicode, Inc. and others. | 
|---|
| 2 | // License & terms of use: http://www.unicode.org/copyright.html | 
|---|
| 3 | /* | 
|---|
| 4 | ********************************************************************** | 
|---|
| 5 | * Copyright (C) 1999-2011, International Business Machines Corporation | 
|---|
| 6 | * and others. All Rights Reserved. | 
|---|
| 7 | ********************************************************************** | 
|---|
| 8 | *   Date        Name        Description | 
|---|
| 9 | *   11/17/99    aliu        Creation. | 
|---|
| 10 | ********************************************************************** | 
|---|
| 11 | */ | 
|---|
| 12 | #ifndef RBT_PARS_H | 
|---|
| 13 | #define RBT_PARS_H | 
|---|
| 14 |  | 
|---|
| 15 | #include "unicode/utypes.h" | 
|---|
| 16 |  | 
|---|
| 17 | #if !UCONFIG_NO_TRANSLITERATION | 
|---|
| 18 | #ifdef __cplusplus | 
|---|
| 19 |  | 
|---|
| 20 | #include "unicode/uobject.h" | 
|---|
| 21 | #include "unicode/parseerr.h" | 
|---|
| 22 | #include "unicode/unorm.h" | 
|---|
| 23 | #include "rbt.h" | 
|---|
| 24 | #include "hash.h" | 
|---|
| 25 | #include "uvector.h" | 
|---|
| 26 |  | 
|---|
| 27 | U_NAMESPACE_BEGIN | 
|---|
| 28 |  | 
|---|
| 29 | class TransliterationRuleData; | 
|---|
| 30 | class UnicodeFunctor; | 
|---|
| 31 | class ParseData; | 
|---|
| 32 | class RuleHalf; | 
|---|
| 33 | class ParsePosition; | 
|---|
| 34 | class StringMatcher; | 
|---|
| 35 |  | 
|---|
| 36 | class TransliteratorParser : public UMemory { | 
|---|
| 37 | // Parser for transliterator rules: call parse(), then read the results from the public data members. | 
|---|
| 38 | public: | 
|---|
| 39 |  | 
|---|
| 40 | /** | 
|---|
| 41 | * A Vector of TransliterationRuleData objects, one for each discrete group | 
|---|
| 42 | * of rules in the rule set | 
|---|
| 43 | */ | 
|---|
| 44 | UVector dataVector; | 
|---|
| 45 |  | 
|---|
| 46 | /** | 
|---|
| 47 | * PUBLIC data member. | 
|---|
| 48 | * A Vector of UnicodeStrings containing all of the ID blocks in the rule set | 
|---|
| 49 | */ | 
|---|
| 50 | UVector idBlockVector; | 
|---|
| 51 |  | 
|---|
| 52 | /** | 
|---|
| 53 | * PUBLIC data member containing the parsed compound filter, if any. | 
|---|
| 54 | */ | 
|---|
| 55 | UnicodeSet* compoundFilter; | 
|---|
| 56 |  | 
|---|
| 57 | private: | 
|---|
| 58 |  | 
|---|
| 59 | /** | 
|---|
| 60 | * The current data object for which we are parsing rules | 
|---|
| 61 | */ | 
|---|
| 62 | TransliterationRuleData* curData; | 
|---|
| 63 | // Rule direction; presumably the direction passed to parse() -- TODO confirm against the implementation. | 
|---|
| 64 | UTransDirection direction; | 
|---|
| 65 |  | 
|---|
| 66 | /** | 
|---|
| 67 | * Parse error information. | 
|---|
| 68 | */ | 
|---|
| 69 | UParseError parseError; | 
|---|
| 70 |  | 
|---|
| 71 | /** | 
|---|
| 72 | * Temporary symbol table used during parsing. | 
|---|
| 73 | */ | 
|---|
| 74 | ParseData* parseData; | 
|---|
| 75 |  | 
|---|
| 76 | /** | 
|---|
| 77 | * Temporary vector of matcher variables.  When parsing is complete, this | 
|---|
| 78 | * is copied into the array data.variables.  As with data.variables, | 
|---|
| 79 | * element 0 corresponds to character data.variablesBase. | 
|---|
| 80 | */ | 
|---|
| 81 | UVector variablesVector; | 
|---|
| 82 |  | 
|---|
| 83 | /** | 
|---|
| 84 | * Temporary table of variable names.  When parsing is complete, this is | 
|---|
| 85 | * copied into data.variableNames. | 
|---|
| 86 | */ | 
|---|
| 87 | Hashtable variableNames; | 
|---|
| 88 |  | 
|---|
| 89 | /** | 
|---|
| 90 | * String of standins for segments.  Used during the parsing of a single | 
|---|
| 91 | * rule.  segmentStandins.charAt(0) is the standin for "$1" and corresponds | 
|---|
| 92 | * to StringMatcher object segmentObjects.elementAt(0), etc. | 
|---|
| 93 | */ | 
|---|
| 94 | UnicodeString segmentStandins; | 
|---|
| 95 |  | 
|---|
| 96 | /** | 
|---|
| 97 | * Vector of StringMatcher objects for segments.  Used during the | 
|---|
| 98 | * parsing of a single rule. | 
|---|
| 99 | * segmentStandins.charAt(0) is the standin for "$1" and corresponds | 
|---|
| 100 | * to StringMatcher object segmentObjects.elementAt(0), etc. | 
|---|
| 101 | */ | 
|---|
| 102 | UVector segmentObjects; | 
|---|
| 103 |  | 
|---|
| 104 | /** | 
|---|
| 105 | * The next available stand-in for variables.  This starts at some point in | 
|---|
| 106 | * the private use area (discovered dynamically) and increments up toward | 
|---|
| 107 | * <code>variableLimit</code>.  At any point during parsing, available | 
|---|
| 108 | * variables are <code>variableNext..variableLimit-1</code>. | 
|---|
| 109 | */ | 
|---|
| 110 | UChar variableNext; | 
|---|
| 111 |  | 
|---|
| 112 | /** | 
|---|
| 113 | * One past the last available stand-in for variables.  This is discovered | 
|---|
| 114 | * dynamically.  At any point during parsing, available variables are | 
|---|
| 115 | * <code>variableNext..variableLimit-1</code>. | 
|---|
| 116 | */ | 
|---|
| 117 | UChar variableLimit; | 
|---|
| 118 |  | 
|---|
| 119 | /** | 
|---|
| 120 | * When we encounter an undefined variable, we do not immediately signal | 
|---|
| 121 | * an error, in case we are defining this variable, e.g., "$a = [a-z];". | 
|---|
| 122 | * Instead, we save the name of the undefined variable, and substitute | 
|---|
| 123 | * in the placeholder char variableLimit - 1, and decrement | 
|---|
| 124 | * variableLimit. | 
|---|
| 125 | */ | 
|---|
| 126 | UnicodeString undefinedVariableName; | 
|---|
| 127 |  | 
|---|
| 128 | /** | 
|---|
| 129 | * The stand-in character for the 'dot' set, represented by '.' in | 
|---|
| 130 | * patterns.  This is allocated the first time it is needed, and | 
|---|
| 131 | * reused thereafter. | 
|---|
| 132 | */ | 
|---|
| 133 | UChar dotStandIn; | 
|---|
| 134 |  | 
|---|
| 135 | public: | 
|---|
| 136 |  | 
|---|
| 137 | /** | 
|---|
| 138 | * Constructor. | 
|---|
| 139 | */ | 
|---|
| 140 | TransliteratorParser(UErrorCode &statusReturn); | 
|---|
| 141 |  | 
|---|
| 142 | /** | 
|---|
| 143 | * Destructor. | 
|---|
| 144 | */ | 
|---|
| 145 | ~TransliteratorParser(); | 
|---|
| 146 |  | 
|---|
| 147 | /** | 
|---|
| 148 | * Parse the given string as a sequence of rules, separated by newline | 
|---|
| 149 | * characters ('\n'), and cause this object to implement those rules.  Any | 
|---|
| 150 | * previous rules are discarded.  Typically this method is called exactly | 
|---|
| 151 | * once after construction. | 
|---|
| 152 | * | 
|---|
| 153 | * Parse the given rules, in the given direction.  After this call | 
|---|
| 154 | * returns, query the public data members for results.  The caller | 
|---|
| 155 | * owns the 'data' and 'compoundFilter' data members after this | 
|---|
| 156 | * call returns. | 
|---|
| 157 | * @param rules      rules, separated by ';' | 
|---|
| 158 | * @param direction  either FORWARD or REVERSE. | 
|---|
| 159 | * @param pe         Struct to receive information on position | 
|---|
| 160 | *                   of error if an error is encountered | 
|---|
| 161 | * @param ec         Output param set to success/failure code. | 
|---|
| 162 | */ | 
|---|
| 163 | void parse(const UnicodeString& rules, | 
|---|
| 164 | UTransDirection direction, | 
|---|
| 165 | UParseError& pe, | 
|---|
| 166 | UErrorCode& ec); | 
|---|
| 167 |  | 
|---|
| 168 | /** | 
|---|
| 169 | * Return the compound filter parsed by parse().  Caller owns result. | 
|---|
| 170 | * @return the compound filter parsed by parse(). | 
|---|
| 171 | */ | 
|---|
| 172 | UnicodeSet* orphanCompoundFilter(); | 
|---|
| 173 |  | 
|---|
| 174 | private: | 
|---|
| 175 |  | 
|---|
| 176 | /** | 
|---|
| 177 | * Parse the given rules, in the given direction. | 
|---|
| 178 | * @param rules      the rules to parse (read-only input). | 
|---|
| 179 | * @param direction  either FORWARD or REVERSE. | 
|---|
| 180 | */ | 
|---|
| 181 | void parseRules(const UnicodeString& rules, | 
|---|
| 182 | UTransDirection direction, | 
|---|
| 183 | UErrorCode& status); | 
|---|
| 184 |  | 
|---|
| 185 | /** | 
|---|
| 186 | * MAIN PARSER.  Parse the next rule in the given rule string, starting | 
|---|
| 187 | * at pos.  Return the index after the last character parsed.  Do not | 
|---|
| 188 | * parse characters at or after limit. | 
|---|
| 189 | * | 
|---|
| 190 | * Important:  The character at pos must be a non-whitespace character | 
|---|
| 191 | * that is not the comment character. | 
|---|
| 192 | * | 
|---|
| 193 | * This method handles quoting, escaping, and whitespace removal.  It | 
|---|
| 194 | * parses the end-of-rule character.  It recognizes context and cursor | 
|---|
| 195 | * indicators.  Once it does a lexical breakdown of the rule at pos, it | 
|---|
| 196 | * creates a rule object and adds it to our rule list. | 
|---|
| 197 | * @param rule       the rule string being parsed. | 
|---|
| 198 | * @param pos        the starting position. | 
|---|
| 199 | * @param limit      index past the last character of the rule. | 
|---|
| 200 | * @return           the index after the last character parsed. | 
|---|
| 201 | */ | 
|---|
| 202 | int32_t parseRule(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status); | 
|---|
| 203 |  | 
|---|
| 204 | /** | 
|---|
| 205 | * Set the variable range to [start, end] (inclusive). | 
|---|
| 206 | * @param start    the start value of the range. | 
|---|
| 207 | * @param end      the end value of the range. | 
|---|
| 208 | */ | 
|---|
| 209 | void setVariableRange(int32_t start, int32_t end, UErrorCode& status); | 
|---|
| 210 |  | 
|---|
| 211 | /** | 
|---|
| 212 | * Assert that the given character is NOT within the variable range. | 
|---|
| 213 | * If it is, return FALSE.  This is necessary to ensure that the | 
|---|
| 214 | * variable range does not overlap characters used in a rule. | 
|---|
| 215 | * @param ch     the given character. | 
|---|
| 216 | * @return       True, if the given character is NOT within the variable range. | 
|---|
| 217 | */ | 
|---|
| 218 | UBool checkVariableRange(UChar32 ch) const; | 
|---|
| 219 |  | 
|---|
| 220 | /** | 
|---|
| 221 | * Set the maximum backup to 'backup', in response to a pragma | 
|---|
| 222 | * statement. | 
|---|
| 223 | * @param backup    the new value to be set. | 
|---|
| 224 | */ | 
|---|
| 225 | void pragmaMaximumBackup(int32_t backup); | 
|---|
| 226 |  | 
|---|
| 227 | /** | 
|---|
| 228 | * Begin normalizing all rules using the given mode, in response | 
|---|
| 229 | * to a pragma statement. | 
|---|
| 230 | * @param mode    the given mode. | 
|---|
| 231 | */ | 
|---|
| 232 | void pragmaNormalizeRules(UNormalizationMode mode); | 
|---|
| 233 |  | 
|---|
| 234 | /** | 
|---|
| 235 | * Return true if the given rule looks like a pragma. | 
|---|
| 236 | * @param pos offset to the first non-whitespace character | 
|---|
| 237 | * of the rule. | 
|---|
| 238 | * @param limit index past the last character of the rule. | 
|---|
| 239 | * @return true if the given rule looks like a pragma. | 
|---|
| 240 | */ | 
|---|
| 241 | static UBool resemblesPragma(const UnicodeString& rule, int32_t pos, int32_t limit); | 
|---|
| 242 |  | 
|---|
| 243 | /** | 
|---|
| 244 | * Parse a pragma.  This method assumes resemblesPragma() has | 
|---|
| 245 | * already returned true. | 
|---|
| 246 | * @param pos offset to the first non-whitespace character | 
|---|
| 247 | * of the rule. | 
|---|
| 248 | * @param limit index past the last character of the rule. | 
|---|
| 249 | * @return the position index after the final ';' of the pragma, | 
|---|
| 250 | * or -1 on failure. | 
|---|
| 251 | */ | 
|---|
| 252 | int32_t parsePragma(const UnicodeString& rule, int32_t pos, int32_t limit, UErrorCode& status); | 
|---|
| 253 |  | 
|---|
| 254 | /** | 
|---|
| 255 | * Called by main parser upon syntax error.  Search the rule string | 
|---|
| 256 | * for the probable end of the rule.  Of course, if the error is that | 
|---|
| 257 | * the end of rule marker is missing, then the rule end will not be found. | 
|---|
| 258 | * In any case the rule start will be correctly reported. | 
|---|
| 259 | * @param parseErrorCode error code. | 
|---|
| 260 | * @param msg error description. | 
|---|
| 261 | * @param start position of first character of current rule. | 
|---|
| 262 | * @return start position of first character of current rule. | 
|---|
| 263 | */ | 
|---|
| 264 | int32_t syntaxError(UErrorCode parseErrorCode, const UnicodeString&, int32_t start, | 
|---|
| 265 | UErrorCode& status); | 
|---|
| 266 |  | 
|---|
| 267 | /** | 
|---|
| 268 | * Parse a UnicodeSet out, store it, and return the stand-in character | 
|---|
| 269 | * used to represent it. | 
|---|
| 270 | * | 
|---|
| 271 | * @param rule    the rule for UnicodeSet. | 
|---|
| 272 | * @param pos     the position in pattern at which to start parsing. | 
|---|
| 273 | * @return        the stand-in character used to represent it. | 
|---|
| 274 | */ | 
|---|
| 275 | UChar parseSet(const UnicodeString& rule, | 
|---|
| 276 | ParsePosition& pos, | 
|---|
| 277 | UErrorCode& status); | 
|---|
| 278 |  | 
|---|
| 279 | /** | 
|---|
| 280 | * Generate and return a stand-in for a new UnicodeFunctor.  Store | 
|---|
| 281 | * the matcher (adopt it). | 
|---|
| 282 | * @param adopted the UnicodeFunctor to be adopted. | 
|---|
| 283 | * @return        a stand-in for a new UnicodeFunctor. | 
|---|
| 284 | */ | 
|---|
| 285 | UChar generateStandInFor(UnicodeFunctor* adopted, UErrorCode& status); | 
|---|
| 286 |  | 
|---|
| 287 | /** | 
|---|
| 288 | * Return the standin for segment seg (1-based). | 
|---|
| 289 | * @param seg    the given segment. | 
|---|
| 290 | * @return       the standIn character for the given segment. | 
|---|
| 291 | */ | 
|---|
| 292 | UChar getSegmentStandin(int32_t seg, UErrorCode& status); | 
|---|
| 293 |  | 
|---|
| 294 | /** | 
|---|
| 295 | * Set the object for segment seg (1-based). | 
|---|
| 296 | * @param seg      the given segment. | 
|---|
| 297 | * @param adopted  the StringMatcher to be adopted. | 
|---|
| 298 | */ | 
|---|
| 299 | void setSegmentObject(int32_t seg, StringMatcher* adopted, UErrorCode& status); | 
|---|
| 300 |  | 
|---|
| 301 | /** | 
|---|
| 302 | * Return the stand-in for the dot set.  It is allocated the first | 
|---|
| 303 | * time and reused thereafter. | 
|---|
| 304 | * @return    the stand-in for the dot set. | 
|---|
| 305 | */ | 
|---|
| 306 | UChar getDotStandIn(UErrorCode& status); | 
|---|
| 307 |  | 
|---|
| 308 | /** | 
|---|
| 309 | * Append the value of the given variable name to the given | 
|---|
| 310 | * UnicodeString. | 
|---|
| 311 | * @param name    the name of the variable whose value is appended. | 
|---|
| 312 | * @param buf     the given UnicodeString to append to. | 
|---|
| 313 | */ | 
|---|
| 314 | void appendVariableDef(const UnicodeString& name, | 
|---|
| 315 | UnicodeString& buf, | 
|---|
| 316 | UErrorCode& status); | 
|---|
| 317 |  | 
|---|
| 318 | /** | 
|---|
| 319 | * Glue method to get around access restrictions in C++. | 
|---|
| 320 | */ | 
|---|
| 321 | /*static Transliterator* createBasicInstance(const UnicodeString& id, | 
|---|
| 322 | const UnicodeString* canonID);*/ | 
|---|
| 323 |  | 
|---|
| 324 | friend class RuleHalf; | 
|---|
| 325 |  | 
|---|
| 326 | // Disallowed methods; no impl. | 
|---|
| 327 | /** | 
|---|
| 328 | * Copy constructor | 
|---|
| 329 | */ | 
|---|
| 330 | TransliteratorParser(const TransliteratorParser&); | 
|---|
| 331 |  | 
|---|
| 332 | /** | 
|---|
| 333 | * Assignment operator | 
|---|
| 334 | */ | 
|---|
| 335 | TransliteratorParser& operator=(const TransliteratorParser&); | 
|---|
| 336 | }; | 
|---|
| 337 |  | 
|---|
| 338 | U_NAMESPACE_END | 
|---|
| 339 |  | 
|---|
| 340 | #endif /* #ifdef __cplusplus */ | 
|---|
| 341 |  | 
|---|
| 342 | /** | 
|---|
| 343 | * Strip/convert the following from the transliterator rules: | 
|---|
| 344 | * comments | 
|---|
| 345 | * newlines | 
|---|
| 346 | * white space at the beginning and end of a line | 
|---|
| 347 | * unescape \u notation | 
|---|
| 348 | * | 
|---|
| 349 | * The target must be equal in size to the source. | 
|---|
| 350 | * @internal | 
|---|
| 351 | */ | 
|---|
| 352 | U_CAPI int32_t | 
|---|
| 353 | utrans_stripRules(const UChar *source, int32_t sourceLen, UChar *target, UErrorCode *status); | 
|---|
| 354 |  | 
|---|
| 355 | #endif /* #if !UCONFIG_NO_TRANSLITERATION */ | 
|---|
| 356 |  | 
|---|
| 357 | #endif | 
|---|
| 358 |  | 
|---|