| 1 | // © 2016 and later: Unicode, Inc. and others. | 
|---|
| 2 | // License & terms of use: http://www.unicode.org/copyright.html | 
|---|
| 3 | /* | 
|---|
| 4 | * Copyright (C) {1999-2001}, International Business Machines Corporation and others. All Rights Reserved. | 
|---|
| 5 | ********************************************************************** | 
|---|
| 6 | *   Date        Name        Description | 
|---|
| 7 | *   11/17/99    aliu        Creation. | 
|---|
| 8 | ********************************************************************** | 
|---|
| 9 | */ | 
|---|
| 10 | #ifndef RBT_RULE_H | 
|---|
| 11 | #define RBT_RULE_H | 
|---|
| 12 |  | 
|---|
| 13 | #include "unicode/utypes.h" | 
|---|
| 14 |  | 
|---|
| 15 | #if !UCONFIG_NO_TRANSLITERATION | 
|---|
| 16 |  | 
|---|
| 17 | #include "unicode/uobject.h" | 
|---|
| 18 | #include "unicode/unistr.h" | 
|---|
| 19 | #include "unicode/utrans.h" | 
|---|
| 20 | #include "unicode/unimatch.h" | 
|---|
| 21 |  | 
|---|
| 22 | U_NAMESPACE_BEGIN | 
|---|
| 23 |  | 
|---|
| 24 | class Replaceable; | 
|---|
| 25 | class TransliterationRuleData; | 
|---|
| 26 | class StringMatcher; | 
|---|
| 27 | class UnicodeFunctor; | 
|---|
| 28 |  | 
|---|
| 29 | /** | 
|---|
| 30 | * A transliteration rule used by | 
|---|
| 31 | * <code>RuleBasedTransliterator</code>. | 
|---|
| 32 | * <code>TransliterationRule</code> is an immutable object. | 
|---|
| 33 | * | 
|---|
| 34 | * <p>A rule consists of an input pattern and an output string.  When | 
|---|
| 35 | * the input pattern is matched, the output string is emitted.  The | 
|---|
| 36 | * input pattern consists of zero or more characters which are matched | 
|---|
| 37 | * exactly (the key) and optional context.  Context must match if it | 
|---|
| 38 | * is specified.  Context may be specified before the key, after the | 
|---|
| 39 | * key, or both.  The key, preceding context, and following context | 
|---|
| 40 | * may contain variables.  Variables represent a set of Unicode | 
|---|
| 41 | * characters, such as the letters <i>a</i> through <i>z</i>. | 
|---|
| 42 | * Variables are detected by looking up each character in a supplied | 
|---|
| 43 | * variable list to see if it has been so defined. | 
|---|
| 44 | * | 
|---|
| 45 | * <p>A rule may contain segments in its input string and segment | 
|---|
| 46 | * references in its output string.  A segment is a substring of the | 
|---|
| 47 | * input pattern, indicated by an offset and limit.  The segment may | 
|---|
| 48 | * be in the preceding or following context.  It may not span a | 
|---|
| 49 | * context boundary.  A segment reference is a special character in | 
|---|
| 50 | * the output string that causes a segment of the input string (not | 
|---|
| 51 | * the input pattern) to be copied to the output string.  The range of | 
|---|
| 52 | * special characters that represent segment references is defined by | 
|---|
| 53 | * RuleBasedTransliterator.Data. | 
|---|
| 54 | * | 
|---|
| 55 | * @author Alan Liu | 
|---|
| 56 | */ | 
|---|
| 57 | class TransliterationRule : public UMemory { | 
|---|
| 58 |  | 
|---|
| 59 | private: | 
|---|
| 60 |  | 
|---|
| 61 | // TODO Eliminate the pattern and keyLength data members.  They | 
|---|
| 62 | // are used only by masks() and getIndexValue() which are called | 
|---|
| 63 | // only during build time, not during run-time.  Perhaps these | 
|---|
| 64 | // methods and pattern/keyLength can be isolated into a separate | 
|---|
| 65 | // object. | 
|---|
| 66 |  | 
|---|
| 67 | /** | 
|---|
| 68 | * The match that must occur before the key, or null if there is no | 
|---|
| 69 | * preceding context. | 
|---|
| 70 | */ | 
|---|
| 71 | StringMatcher *anteContext; | 
|---|
| 72 |  | 
|---|
| 73 | /** | 
|---|
| 74 | * The matcher object for the key.  If null, then the key is empty. | 
|---|
| 75 | */ | 
|---|
| 76 | StringMatcher *key; | 
|---|
| 77 |  | 
|---|
| 78 | /** | 
|---|
| 79 | * The match that must occur after the key, or null if there is no | 
|---|
| 80 | * following context. | 
|---|
| 81 | */ | 
|---|
| 82 | StringMatcher *postContext; | 
|---|
| 83 |  | 
|---|
| 84 | /** | 
|---|
| 85 | * The object that performs the replacement if the key, | 
|---|
| 86 | * anteContext, and postContext are matched.  Never null. | 
|---|
| 87 | */ | 
|---|
| 88 | UnicodeFunctor* output; | 
|---|
| 89 |  | 
|---|
| 90 | /** | 
|---|
| 91 | * The string that must be matched, consisting of the anteContext, key, | 
|---|
| 92 | * and postContext, concatenated together, in that order.  Some components | 
|---|
| 93 | * may be empty (zero length). | 
|---|
| 94 | * @see anteContextLength | 
|---|
| 95 | * @see keyLength | 
|---|
| 96 | */ | 
|---|
| 97 | UnicodeString pattern; | 
|---|
| 98 |  | 
|---|
| 99 | /** | 
|---|
| 100 | * An array of matcher objects corresponding to the input pattern | 
|---|
| 101 | * segments.  If there are no segments this is null.  N.B. This is | 
|---|
| 102 | * a UnicodeFunctor for generality, but in practice it is always a | 
|---|
| 103 | * StringMatcher.  In the future we may generalize this, but for | 
|---|
| 104 | * now we sometimes cast down to StringMatcher. | 
|---|
| 105 | * | 
|---|
| 106 | * The array is owned, but the pointers within it are not. | 
|---|
| 107 | */ | 
|---|
| 108 | UnicodeFunctor** segments; | 
|---|
| 109 |  | 
|---|
| 110 | /** | 
|---|
| 111 | * The number of elements in segments[] or zero if segments is NULL. | 
|---|
| 112 | */ | 
|---|
| 113 | int32_t segmentsCount; | 
|---|
| 114 |  | 
|---|
| 115 | /** | 
|---|
| 116 | * The length of the string that must match before the key.  If | 
|---|
| 117 | * zero, then there is no matching requirement before the key. | 
|---|
| 118 | * Substring [0,anteContextLength) of pattern is the anteContext. | 
|---|
| 119 | */ | 
|---|
| 120 | int32_t anteContextLength; | 
|---|
| 121 |  | 
|---|
| 122 | /** | 
|---|
| 123 | * The length of the key.  Substring [anteContextLength, | 
|---|
| 124 | * anteContextLength + keyLength) of pattern is the key. | 
|---|
| 125 |  | 
|---|
| 126 | */ | 
|---|
| 127 | int32_t keyLength; | 
|---|
| 128 |  | 
|---|
| 129 | /** | 
|---|
| 130 | * Miscellaneous attributes (presumably a combination of the flag values declared below -- confirm against the implementation). | 
|---|
| 131 | */ | 
|---|
| 132 | int8_t flags; | 
|---|
| 133 |  | 
|---|
| 134 | /** | 
|---|
| 135 | * Flag attributes. | 
|---|
| 136 | */ | 
|---|
| 137 | enum { | 
|---|
| 138 | ANCHOR_START = 1, | 
|---|
| 139 | ANCHOR_END   = 2 | 
|---|
| 140 | }; | 
|---|
| 141 |  | 
|---|
| 142 | /** | 
|---|
| 143 | * An alias pointer to the data for this rule.  The data provides | 
|---|
| 144 | * lookup services for matchers and segments. | 
|---|
| 145 | */ | 
|---|
| 146 | const TransliterationRuleData* data; | 
|---|
| 147 |  | 
|---|
| 148 | public: | 
|---|
| 149 |  | 
|---|
| 150 | /** | 
|---|
| 151 | * Construct a new rule with the given input, output text, and other | 
|---|
| 152 | * attributes.  A cursor position may be specified for the output text. | 
|---|
| 153 | * @param input          input string, including key and optional ante and | 
|---|
| 154 | *                       post context. | 
|---|
| 155 | * @param anteContextPos offset into input to end of ante context, or -1 if | 
|---|
| 156 | *                       none.  Must be <= input.length() if not -1. | 
|---|
| 157 | * @param postContextPos offset into input to start of post context, or -1 | 
|---|
| 158 | *                       if none.  Must be <= input.length() if not -1, and must be >= | 
|---|
| 159 | *                       anteContextPos. | 
|---|
| 160 | * @param outputStr      output string. | 
|---|
| 161 | * @param cursorPosition offset into output at which cursor is located, or -1 if | 
|---|
| 162 | *                       none.  If less than zero, then the cursor is placed after the | 
|---|
| 163 | *                       <code>output</code>; that is, -1 is equivalent to | 
|---|
| 164 | *                       <code>output.length()</code>.  If greater than | 
|---|
| 165 | *                       <code>output.length()</code> then an exception is thrown (NOTE(review): presumably an error reported through status here -- confirm). | 
|---|
| 166 | * @param cursorOffset   an offset to be added to cursorPosition to position the | 
|---|
| 167 | *                       cursor either in the ante context, if < 0, or in the post context, if > | 
|---|
| 168 | *                       0.  For example, the rule "abc{def} > | @@@ xyz;" changes "def" to | 
|---|
| 169 | *                       "xyz" and moves the cursor to before "a".  It would have a cursorOffset | 
|---|
| 170 | *                       of -3. | 
|---|
| 171 | * @param segs           array of UnicodeFunctor corresponding to input pattern | 
|---|
| 172 | *                       segments, or null if there are none.  The array itself is adopted, | 
|---|
| 173 | *                       but the pointers within it are not. | 
|---|
| 174 | * @param segsCount      number of elements in segs[]. | 
|---|
| 175 | * @param anchorStart    TRUE if the rule is anchored on the left to | 
|---|
| 176 | *                       the context start. | 
|---|
| 177 | * @param anchorEnd      TRUE if the rule is anchored on the right to the | 
|---|
| 178 | *                       context limit. | 
|---|
| 179 | * @param data           the rule data. | 
|---|
| 180 | * @param status         Output parameter filled in with success or failure status. | 
|---|
| 181 | */ | 
|---|
| 182 | TransliterationRule(const UnicodeString& input, | 
|---|
| 183 | int32_t anteContextPos, int32_t postContextPos, | 
|---|
| 184 | const UnicodeString& outputStr, | 
|---|
| 185 | int32_t cursorPosition, int32_t cursorOffset, | 
|---|
| 186 | UnicodeFunctor** segs, | 
|---|
| 187 | int32_t segsCount, | 
|---|
| 188 | UBool anchorStart, UBool anchorEnd, | 
|---|
| 189 | const TransliterationRuleData* data, | 
|---|
| 190 | UErrorCode& status); | 
|---|
| 191 |  | 
|---|
| 192 | /** | 
|---|
| 193 | * Copy constructor. | 
|---|
| 194 | * @param other    the object to be copied (taken by non-const reference). | 
|---|
| 195 | */ | 
|---|
| 196 | TransliterationRule(TransliterationRule& other); | 
|---|
| 197 |  | 
|---|
| 198 | /** | 
|---|
| 199 | * Destructor. | 
|---|
| 200 | */ | 
|---|
| 201 | virtual ~TransliterationRule(); | 
|---|
| 202 |  | 
|---|
| 203 | /** | 
|---|
| 204 | * Change the data object that this rule belongs to.  Used | 
|---|
| 205 | * internally by the TransliterationRuleData copy constructor. | 
|---|
| 206 | * @param data    the new data value to be set. | 
|---|
| 207 | */ | 
|---|
| 208 | void setData(const TransliterationRuleData* data); | 
|---|
| 209 |  | 
|---|
| 210 | /** | 
|---|
| 211 | * Return the preceding context length.  This method is needed to | 
|---|
| 212 | * support the <code>Transliterator</code> method | 
|---|
| 213 | * <code>getMaximumContextLength()</code>.  Internally, this is | 
|---|
| 214 | * implemented as the anteContextLength, optionally plus one if | 
|---|
| 215 | * there is a start anchor.  The one character anchor gap is | 
|---|
| 216 | * needed to make repeated incremental transliteration with | 
|---|
| 217 | * anchors work. | 
|---|
| 218 | * @return    the preceding context length. | 
|---|
| 219 | */ | 
|---|
| 220 | virtual int32_t getContextLength(void) const; | 
|---|
| 221 |  | 
|---|
| 222 | /** | 
|---|
| 223 | * Internal method.  Returns 8-bit index value for this rule. | 
|---|
| 224 | * This is the low byte of the first character of the key, | 
|---|
| 225 | * unless the first character of the key is a set.  If it's a | 
|---|
| 226 | * set, or otherwise can match multiple keys, the index value is -1. | 
|---|
| 227 | * @return    8-bit index value for this rule, or -1 if it can match multiple keys. | 
|---|
| 228 | */ | 
|---|
| 229 | int16_t getIndexValue() const; | 
|---|
| 230 |  | 
|---|
| 231 | /** | 
|---|
| 232 | * Internal method.  Returns true if this rule matches the given | 
|---|
| 233 | * index value.  The index value is an 8-bit integer, 0..255, | 
|---|
| 234 | * representing the low byte of the first character of the key. | 
|---|
| 235 | * It matches this rule if it matches the first character of the | 
|---|
| 236 | * key, or if the first character of the key is a set, and the set | 
|---|
| 237 | * contains any character with a low byte equal to the index | 
|---|
| 238 | * value.  If the rule contains only ante context, as in foo)>bar, | 
|---|
| 239 | * then it will match any key. | 
|---|
| 240 | * @param v    the given index value. | 
|---|
| 241 | * @return     true if this rule matches the given index value. | 
|---|
| 242 | */ | 
|---|
| 243 | UBool matchesIndexValue(uint8_t v) const; | 
|---|
| 244 |  | 
|---|
| 245 | /** | 
|---|
| 246 | * Return true if this rule masks another rule.  If r1 masks r2 then | 
|---|
| 247 | * r1 matches any input string that r2 matches.  If r1 masks r2 and r2 masks | 
|---|
| 248 | * r1 then r1 == r2.  Examples: "a>x" masks "ab>y".  "a>x" masks "a[b]>y". | 
|---|
| 249 | * "[c]a>x" masks "[dc]a>y". | 
|---|
| 250 | * @param r2  the given rule to be compared with. | 
|---|
| 251 | * @return    true if this rule masks 'r2' | 
|---|
| 252 | */ | 
|---|
| 253 | virtual UBool masks(const TransliterationRule& r2) const; | 
|---|
| 254 |  | 
|---|
| 255 | /** | 
|---|
| 256 | * Attempt a match and replacement at the given position.  Return | 
|---|
| 257 | * the degree of match between this rule and the given text.  The | 
|---|
| 258 | * degree of match may be mismatch, a partial match, or a full | 
|---|
| 259 | * match.  A mismatch means at least one character of the text | 
|---|
| 260 | * does not match the context or key.  A partial match means some | 
|---|
| 261 | * context and key characters match, but the text is not long | 
|---|
| 262 | * enough to match all of them.  A full match means all context | 
|---|
| 263 | * and key characters match. | 
|---|
| 264 | * | 
|---|
| 265 | * If a full match is obtained, perform a replacement, update pos, | 
|---|
| 266 | * and return U_MATCH.  Otherwise both text and pos are unchanged. | 
|---|
| 267 | * | 
|---|
| 268 | * @param text the text | 
|---|
| 269 | * @param pos the position indices | 
|---|
| 270 | * @param incremental if TRUE, test for partial matches that may | 
|---|
| 271 | * be completed by additional text inserted at pos.limit. | 
|---|
| 272 | * @return one of <code>U_MISMATCH</code>, | 
|---|
| 273 | * <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>.  If | 
|---|
| 274 | * incremental is FALSE then U_PARTIAL_MATCH will not be returned. | 
|---|
| 275 | */ | 
|---|
| 276 | UMatchDegree matchAndReplace(Replaceable& text, | 
|---|
| 277 | UTransPosition& pos, | 
|---|
| 278 | UBool incremental) const; | 
|---|
| 279 |  | 
|---|
| 280 | /** | 
|---|
| 281 | * Create a rule string that represents this rule object.  Append | 
|---|
| 282 | * it to the given string (pat). | 
|---|
| 283 | */ | 
|---|
| 284 | virtual UnicodeString& toRule(UnicodeString& pat, | 
|---|
| 285 | UBool escapeUnprintable) const; | 
|---|
| 286 |  | 
|---|
| 287 | /** | 
|---|
| 288 | * Union the set of all characters that may be modified by this rule | 
|---|
| 289 | * into the given set (toUnionTo). | 
|---|
| 290 | */ | 
|---|
| 291 | void addSourceSetTo(UnicodeSet& toUnionTo) const; | 
|---|
| 292 |  | 
|---|
| 293 | /** | 
|---|
| 294 | * Union the set of all characters that may be emitted by this rule | 
|---|
| 295 | * into the given set (toUnionTo). | 
|---|
| 296 | */ | 
|---|
| 297 | void addTargetSetTo(UnicodeSet& toUnionTo) const; | 
|---|
| 298 |  | 
|---|
| 299 | private: | 
|---|
| 300 |  | 
|---|
| 301 | friend class StringMatcher; | 
|---|
| 302 |  | 
|---|
| 303 | TransliterationRule &operator=(const TransliterationRule &other); // declared private to forbid assignment of this class | 
|---|
| 304 | }; | 
|---|
| 305 |  | 
|---|
| 306 | U_NAMESPACE_END | 
|---|
| 307 |  | 
|---|
| 308 | #endif /* #if !UCONFIG_NO_TRANSLITERATION */ | 
|---|
| 309 |  | 
|---|
| 310 | #endif | 
|---|
| 311 |  | 
|---|