| 1 | // © 2016 and later: Unicode, Inc. and others. | 
|---|
| 2 | // License & terms of use: http://www.unicode.org/copyright.html | 
|---|
| 3 | /* | 
|---|
| 4 | * Copyright (C) 2001-2011, International Business Machines Corporation | 
|---|
| 5 | * and others. All Rights Reserved. | 
|---|
| 6 | ********************************************************************** | 
|---|
| 7 | *   Date        Name        Description | 
|---|
| 8 | *   07/23/01    aliu        Creation. | 
|---|
| 9 | ********************************************************************** | 
|---|
| 10 | */ | 
|---|
| 11 | #ifndef STRMATCH_H | 
|---|
| 12 | #define STRMATCH_H | 
|---|
| 13 |  | 
|---|
| 14 | #include "unicode/utypes.h" | 
|---|
| 15 |  | 
|---|
| 16 | #if !UCONFIG_NO_TRANSLITERATION | 
|---|
| 17 |  | 
|---|
| 18 | #include "unicode/unistr.h" | 
|---|
| 19 | #include "unicode/unifunct.h" | 
|---|
| 20 | #include "unicode/unimatch.h" | 
|---|
| 21 | #include "unicode/unirepl.h" | 
|---|
| 22 |  | 
|---|
| 23 | U_NAMESPACE_BEGIN | 
|---|
| 24 |  | 
|---|
| 25 | class TransliterationRuleData; | 
|---|
| 26 |  | 
|---|
| 27 | /** | 
|---|
| 28 | * An object that matches a fixed input string, implementing the | 
|---|
| 29 | * UnicodeMatcher API.  This object also implements the | 
|---|
| 30 | * UnicodeReplacer API, allowing it to emit the matched text as | 
|---|
| 31 | * output.  Since the match text may contain flexible match elements, | 
|---|
| 32 | * such as UnicodeSets, the emitted text is not the match pattern, but | 
|---|
| 33 | * instead a substring of the actual matched text.  Following | 
|---|
| 34 | * convention, the output text is the leftmost match seen up to this | 
|---|
| 35 | * point. | 
|---|
| 36 | * | 
|---|
| 37 | * A StringMatcher may represent a segment, in which case it has a | 
|---|
| 38 | * positive segment number.  This affects how the matcher converts | 
|---|
| 39 | * itself to a pattern but does not otherwise affect its function. | 
|---|
| 40 | * | 
|---|
| 41 | * A StringMatcher that is not a segment should not be used as a | 
|---|
| 42 | * UnicodeReplacer. | 
|---|
| 43 | */ | 
|---|
| 44 | class StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public UnicodeReplacer { | 
|---|
| 45 |  | 
|---|
| 46 | public: | 
|---|
| 47 |  | 
|---|
| 48 | /** | 
|---|
| 49 | * Construct a matcher that matches the given pattern string. | 
|---|
| 50 | * @param string the pattern to be matched, possibly containing | 
|---|
| 51 | * stand-ins that represent nested UnicodeMatcher objects. | 
|---|
| 52 | * @param start inclusive start index.  NOTE(review): the original wording here, "of text to be replaced", is the same as in replace(); presumably this indexes into 'string' -- confirm against the constructor definition. | 
|---|
| 53 | * @param limit exclusive end index (same caveat as for start); | 
|---|
| 54 | * must be greater than or equal to start | 
|---|
| 55 | * @param segmentNum the segment number from 1..n, or 0 if this is | 
|---|
| 56 | * not a segment. | 
|---|
| 57 | * @param data context object mapping stand-ins to | 
|---|
| 58 | * UnicodeMatcher objects. | 
|---|
| 59 | */ | 
|---|
| 60 | StringMatcher(const UnicodeString& string, | 
|---|
| 61 | int32_t start, | 
|---|
| 62 | int32_t limit, | 
|---|
| 63 | int32_t segmentNum, | 
|---|
| 64 | const TransliterationRuleData& data); | 
|---|
| 65 |  | 
|---|
| 66 | /** | 
|---|
| 67 | * Copy constructor | 
|---|
| 68 | * @param o  the object to be copied. | 
|---|
| 69 | */ | 
|---|
| 70 | StringMatcher(const StringMatcher& o); | 
|---|
| 71 |  | 
|---|
| 72 | /** | 
|---|
| 73 | * Destructor | 
|---|
| 74 | */ | 
|---|
| 75 | virtual ~StringMatcher(); | 
|---|
| 76 |  | 
|---|
| 77 | /** | 
|---|
| 78 | * Implement UnicodeFunctor: produce a copy of this object. | 
|---|
| 79 | * @return a copy of the object. | 
|---|
| 80 | */ | 
|---|
| 81 | virtual StringMatcher* clone() const; | 
|---|
| 82 |  | 
|---|
| 83 | /** | 
|---|
| 84 | * UnicodeFunctor API.  Cast 'this' to a UnicodeMatcher* pointer | 
|---|
| 85 | * and return the pointer. | 
|---|
| 86 | * @return the UnicodeMatcher pointer. | 
|---|
| 87 | */ | 
|---|
| 88 | virtual UnicodeMatcher* toMatcher() const; | 
|---|
| 89 |  | 
|---|
| 90 | /** | 
|---|
| 91 | * UnicodeFunctor API.  Cast 'this' to a UnicodeReplacer* pointer | 
|---|
| 92 | * and return the pointer. | 
|---|
| 93 | * @return the UnicodeReplacer pointer. | 
|---|
| 94 | */ | 
|---|
| 95 | virtual UnicodeReplacer* toReplacer() const; | 
|---|
| 96 |  | 
|---|
| 97 | /** | 
|---|
| 98 | * Implement UnicodeMatcher: test 'text' against this matcher starting at 'offset'. | 
|---|
| 99 | * @param text the text to be matched | 
|---|
| 100 | * @param offset on input, the index into text at which to begin | 
|---|
| 101 | * matching.  On output, the limit of the matched text.  The | 
|---|
| 102 | * number of matched characters is the output value of offset | 
|---|
| 103 | * minus the input value.  Offset should always point to the | 
|---|
| 104 | * HIGH SURROGATE (leading code unit) of a pair of surrogates, | 
|---|
| 105 | * both on entry and upon return. | 
|---|
| 106 | * @param limit the limit index of text to be matched.  Greater | 
|---|
| 107 | * than offset for a forward direction match, less than offset for | 
|---|
| 108 | * a backward direction match.  The last character to be | 
|---|
| 109 | * considered for matching will be text.charAt(limit-1) in the | 
|---|
| 110 | * forward direction or text.charAt(limit+1) in the backward | 
|---|
| 111 | * direction. | 
|---|
| 112 | * @param incremental  if TRUE, then assume further characters may | 
|---|
| 113 | * be inserted at limit and check for partial matching.  Otherwise | 
|---|
| 114 | * assume the text as given is complete. | 
|---|
| 115 | * @return a match degree value indicating a full match, a partial | 
|---|
| 116 | * match, or a mismatch.  If incremental is FALSE then | 
|---|
| 117 | * U_PARTIAL_MATCH should never be returned. | 
|---|
| 118 | */ | 
|---|
| 119 | virtual UMatchDegree matches(const Replaceable& text, | 
|---|
| 120 | int32_t& offset, | 
|---|
| 121 | int32_t limit, | 
|---|
| 122 | UBool incremental); | 
|---|
| 123 |  | 
|---|
| 124 | /** | 
|---|
| 125 | * Implement UnicodeMatcher: write the pattern for this matcher into 'result'. | 
|---|
| 126 | * @param result            Output param to receive the pattern. | 
|---|
| 127 | * @param escapeUnprintable if TRUE then escape the unprintable characters; defaults to FALSE. | 
|---|
| 128 | * @return                  A reference to 'result'. | 
|---|
| 129 | */ | 
|---|
| 130 | virtual UnicodeString& toPattern(UnicodeString& result, | 
|---|
| 131 | UBool escapeUnprintable = FALSE) const; | 
|---|
| 132 |  | 
|---|
| 133 | /** | 
|---|
| 134 | * Implement UnicodeMatcher | 
|---|
| 135 | * Returns TRUE if this matcher will match a character c, where c | 
|---|
| 136 | * & 0xFF == v, at offset, in the forward direction (with limit > | 
|---|
| 137 | * offset).  This is used by <tt>RuleBasedTransliterator</tt> for | 
|---|
| 138 | * indexing. | 
|---|
| 139 | * @param v    the given value | 
|---|
| 140 | * @return     TRUE if this matcher will match a character c, | 
|---|
| 141 | *             where c & 0xFF == v | 
|---|
| 142 | */ | 
|---|
| 143 | virtual UBool matchesIndexValue(uint8_t v) const; | 
|---|
| 144 |  | 
|---|
| 145 | /** | 
|---|
| 146 | * Implement UnicodeMatcher.  NOTE(review): by analogy with addReplacementSetTo(), presumably unions the characters this object can match into toUnionTo -- confirm against the definition. | 
|---|
| 147 | */ | 
|---|
| 148 | virtual void addMatchSetTo(UnicodeSet& toUnionTo) const; | 
|---|
| 149 |  | 
|---|
| 150 | /** | 
|---|
| 151 | * Implement UnicodeFunctor.  NOTE(review): presumably updates the 'data' context pointer held by this object -- confirm against the definition. | 
|---|
| 152 | */ | 
|---|
| 153 | virtual void setData(const TransliterationRuleData*); | 
|---|
| 154 |  | 
|---|
| 155 | /** | 
|---|
| 156 | * Replace characters in 'text' from 'start' to 'limit' with the | 
|---|
| 157 | * output text of this object.  Update the 'cursor' parameter to | 
|---|
| 158 | * give the cursor position and return the length of the | 
|---|
| 159 | * replacement text. | 
|---|
| 160 | * | 
|---|
| 161 | * @param text the text in which characters are replaced | 
|---|
| 162 | * @param start inclusive start index of text to be replaced | 
|---|
| 163 | * @param limit exclusive end index of text to be replaced; | 
|---|
| 164 | * must be greater than or equal to start | 
|---|
| 165 | * @param cursor output parameter for the cursor position. | 
|---|
| 166 | * Not all replacer objects will update this, but in a complete | 
|---|
| 167 | * tree of replacer objects, representing the entire output side | 
|---|
| 168 | * of a transliteration rule, at least one must update it. | 
|---|
| 169 | * @return the number of 16-bit code units in the text replacing | 
|---|
| 170 | * the characters at offsets start..(limit-1) in text | 
|---|
| 171 | */ | 
|---|
| 172 | virtual int32_t replace(Replaceable& text, | 
|---|
| 173 | int32_t start, | 
|---|
| 174 | int32_t limit, | 
|---|
| 175 | int32_t& cursor); | 
|---|
| 176 |  | 
|---|
| 177 | /** | 
|---|
| 178 | * Returns a string representation of this replacer.  If the | 
|---|
| 179 | * result of calling this function is passed to the appropriate | 
|---|
| 180 | * parser, typically TransliteratorParser, it will produce another | 
|---|
| 181 | * replacer that is equal to this one. | 
|---|
| 182 | * @param result the string to receive the pattern.  Previous | 
|---|
| 183 | * contents will be deleted. | 
|---|
| 184 | * @param escapeUnprintable if TRUE then convert unprintable | 
|---|
| 185 | * characters to their hex escape representations, \\uxxxx or | 
|---|
| 186 | * \\Uxxxxxxxx.  Unprintable characters are defined by | 
|---|
| 187 | * Utility.isUnprintable(). | 
|---|
| 188 | * @return a reference to 'result'. | 
|---|
| 189 | */ | 
|---|
| 190 | virtual UnicodeString& toReplacerPattern(UnicodeString& result, | 
|---|
| 191 | UBool escapeUnprintable) const; | 
|---|
| 192 |  | 
|---|
| 193 | /** | 
|---|
| 194 | * Remove any match data.  This must be called before performing a | 
|---|
| 195 | * set of matches with this segment. | 
|---|
| 196 | */ | 
|---|
| 197 | void resetMatch(); | 
|---|
| 198 |  | 
|---|
| 199 | /** | 
|---|
| 200 | * ICU "poor man's RTTI", returns a UClassID for the actual class. | 
|---|
| 201 | */ | 
|---|
| 202 | virtual UClassID getDynamicClassID() const; | 
|---|
| 203 |  | 
|---|
| 204 | /** | 
|---|
| 205 | * ICU "poor man's RTTI", returns a UClassID for this class. | 
|---|
| 206 | */ | 
|---|
| 207 | static UClassID U_EXPORT2 getStaticClassID(); | 
|---|
| 208 |  | 
|---|
| 209 | /** | 
|---|
| 210 | * Union the set of all characters that may be output by this object | 
|---|
| 211 | * into the given set. | 
|---|
| 212 | * @param toUnionTo the set into which to union the output characters | 
|---|
| 213 | */ | 
|---|
| 214 | virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const; | 
|---|
| 215 |  | 
|---|
| 216 | private: | 
|---|
| 217 |  | 
|---|
| 218 | /** | 
|---|
| 219 | * The text to be matched. | 
|---|
| 220 | */ | 
|---|
| 221 | UnicodeString pattern; | 
|---|
| 222 |  | 
|---|
| 223 | /** | 
|---|
| 224 | * Context object that maps stand-ins to matcher and replacer | 
|---|
| 225 | * objects. | 
|---|
| 226 | */ | 
|---|
| 227 | const TransliterationRuleData* data; | 
|---|
| 228 |  | 
|---|
| 229 | /** | 
|---|
| 230 | * The segment number, 1-based, or 0 if not a segment. | 
|---|
| 231 | */ | 
|---|
| 232 | int32_t segmentNumber; | 
|---|
| 233 |  | 
|---|
| 234 | /** | 
|---|
| 235 | * Start offset, in the match text, of the <em>rightmost</em> | 
|---|
| 236 | * match.  NOTE(review): the class comment describes the emitted text as the "leftmost match seen", while this field and matchLimit say "rightmost" -- reconcile against the implementation. | 
|---|
| 237 | */ | 
|---|
| 238 | int32_t matchStart; | 
|---|
| 239 |  | 
|---|
| 240 | /** | 
|---|
| 241 | * Limit offset, in the match text, of the <em>rightmost</em> | 
|---|
| 242 | * match. | 
|---|
| 243 | */ | 
|---|
| 244 | int32_t matchLimit; | 
|---|
| 245 |  | 
|---|
| 246 | }; | 
|---|
| 247 |  | 
|---|
| 248 | U_NAMESPACE_END | 
|---|
| 249 |  | 
|---|
| 250 | #endif /* #if !UCONFIG_NO_TRANSLITERATION */ | 
|---|
| 251 |  | 
|---|
| 252 | #endif | 
|---|
| 253 |  | 
|---|