| 1 | // © 2016 and later: Unicode, Inc. and others. |
| 2 | // License & terms of use: http://www.unicode.org/copyright.html |
| 3 | // |
| 4 | // rbbirb.h |
| 5 | // |
| 6 | // Copyright (C) 2002-2008, International Business Machines Corporation and others. |
| 7 | // All Rights Reserved. |
| 8 | // |
| 9 | // This file contains declarations for several classes from the |
| 10 | // Rule Based Break Iterator rule builder. |
| 11 | // |
| 12 | |
| 13 | |
| 14 | #ifndef RBBIRB_H |
| 15 | #define RBBIRB_H |
| 16 | |
| 17 | #include "unicode/utypes.h" |
| 18 | |
| 19 | #if !UCONFIG_NO_BREAK_ITERATION |
| 20 | |
#include <utility>

#include "unicode/uobject.h"
#include "unicode/rbbi.h"
#include "unicode/uniset.h"
#include "unicode/parseerr.h"
#include "uhash.h"
#include "uvector.h"
#include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that
                             //    looks up references to $variables within a set.

// Debug-only dependency of the RBBIDebugPrintf / RBBIDebugPuts macros defined
// near the end of this header.  It is included here, after the project headers
// (any of which may define RBBI_DEBUG) and before U_NAMESPACE_BEGIN, so that the
// C library is never declared inside the ICU namespace.
#ifdef RBBI_DEBUG
#include <stdio.h>
#endif
| 31 | |
| 32 | |
| 33 | U_NAMESPACE_BEGIN |
| 34 | |
| 35 | class RBBIRuleScanner; |
| 36 | struct RBBIRuleTableEl; |
| 37 | class RBBISetBuilder; |
| 38 | class RBBINode; |
| 39 | class RBBITableBuilder; |
| 40 | |
| 41 | |
| 42 | |
| 43 | //-------------------------------------------------------------------------------- |
| 44 | // |
| 45 | // RBBISymbolTable. Implements SymbolTable interface that is used by the |
| 46 | // UnicodeSet parser to resolve references to $variables. |
| 47 | // |
| 48 | //-------------------------------------------------------------------------------- |
| 49 | class RBBISymbolTableEntry : public UMemory { // The symbol table hash table contains one |
| 50 | public: // of these structs for each entry. |
| 51 | RBBISymbolTableEntry(); |
| 52 | UnicodeString key; |
| 53 | RBBINode *val; |
| 54 | ~RBBISymbolTableEntry(); |
| 55 | |
| 56 | private: |
| 57 | RBBISymbolTableEntry(const RBBISymbolTableEntry &other); // forbid copying of this class |
| 58 | RBBISymbolTableEntry &operator=(const RBBISymbolTableEntry &other); // forbid copying of this class |
| 59 | }; |
| 60 | |
| 61 | |
| 62 | class RBBISymbolTable : public UMemory, public SymbolTable { |
| 63 | private: |
| 64 | const UnicodeString &fRules; |
| 65 | UHashtable *fHashTable; |
| 66 | RBBIRuleScanner *fRuleScanner; |
| 67 | |
| 68 | // These next two fields are part of the mechanism for passing references to |
| 69 | // already-constructed UnicodeSets back to the UnicodeSet constructor |
| 70 | // when the pattern includes $variable references. |
| 71 | const UnicodeString ffffString; // = "/uffff" |
| 72 | UnicodeSet *fCachedSetLookup; |
| 73 | |
| 74 | public: |
| 75 | // API inherited from class SymbolTable |
| 76 | virtual const UnicodeString* lookup(const UnicodeString& s) const; |
| 77 | virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const; |
| 78 | virtual UnicodeString parseReference(const UnicodeString& text, |
| 79 | ParsePosition& pos, int32_t limit) const; |
| 80 | |
| 81 | // Additional Functions |
| 82 | RBBISymbolTable(RBBIRuleScanner *, const UnicodeString &fRules, UErrorCode &status); |
| 83 | virtual ~RBBISymbolTable(); |
| 84 | |
| 85 | virtual RBBINode *lookupNode(const UnicodeString &key) const; |
| 86 | virtual void addEntry (const UnicodeString &key, RBBINode *val, UErrorCode &err); |
| 87 | |
| 88 | #ifdef RBBI_DEBUG |
| 89 | virtual void rbbiSymtablePrint() const; |
| 90 | #else |
| 91 | // A do-nothing inline function for non-debug builds. Member funcs can't be empty |
| 92 | // or the call sites won't compile. |
| 93 | int32_t fFakeField; |
| 94 | #define rbbiSymtablePrint() fFakeField=0; |
| 95 | #endif |
| 96 | |
| 97 | private: |
| 98 | RBBISymbolTable(const RBBISymbolTable &other); // forbid copying of this class |
| 99 | RBBISymbolTable &operator=(const RBBISymbolTable &other); // forbid copying of this class |
| 100 | }; |
| 101 | |
| 102 | |
| 103 | //-------------------------------------------------------------------------------- |
| 104 | // |
| 105 | // class RBBIRuleBuilder The top-level class handling RBBI rule compiling. |
| 106 | // |
| 107 | //-------------------------------------------------------------------------------- |
| 108 | class RBBIRuleBuilder : public UMemory { |
| 109 | public: |
| 110 | |
| 111 | // Create a rule based break iterator from a set of rules. |
| 112 | // This function is the main entry point into the rule builder. The |
| 113 | // public ICU API for creating RBBIs uses this function to do the actual work. |
| 114 | // |
| 115 | static BreakIterator * createRuleBasedBreakIterator( const UnicodeString &rules, |
| 116 | UParseError *parseError, |
| 117 | UErrorCode &status); |
| 118 | |
| 119 | public: |
| 120 | // The "public" functions and data members that appear below are accessed |
| 121 | // (and shared) by the various parts that make up the rule builder. They |
| 122 | // are NOT intended to be accessed by anything outside of the |
| 123 | // rule builder implementation. |
| 124 | RBBIRuleBuilder(const UnicodeString &rules, |
| 125 | UParseError *parseErr, |
| 126 | UErrorCode &status |
| 127 | ); |
| 128 | |
| 129 | virtual ~RBBIRuleBuilder(); |
| 130 | |
| 131 | /** |
| 132 | * Build the state tables and char class Trie from the source rules. |
| 133 | */ |
| 134 | RBBIDataHeader *build(UErrorCode &status); |
| 135 | |
| 136 | |
| 137 | /** |
| 138 | * Fold together redundant character classes (table columns) and |
| 139 | * redundant states (table rows). Done after initial table generation, |
| 140 | * before serializing the result. |
| 141 | */ |
| 142 | void optimizeTables(); |
| 143 | |
| 144 | char *fDebugEnv; // controls debug trace output |
| 145 | UErrorCode *fStatus; // Error reporting. Keeping status |
| 146 | UParseError *fParseError; // here avoids passing it everywhere. |
| 147 | const UnicodeString &fRules; // The rule string that we are compiling |
| 148 | UnicodeString fStrippedRules; // The rule string, with comments stripped. |
| 149 | |
| 150 | RBBIRuleScanner *fScanner; // The scanner. |
| 151 | RBBINode *fForwardTree; // The parse trees, generated by the scanner, |
| 152 | RBBINode *fReverseTree; // then manipulated by subsequent steps. |
| 153 | RBBINode *fSafeFwdTree; |
| 154 | RBBINode *fSafeRevTree; |
| 155 | |
| 156 | RBBINode **fDefaultTree; // For rules not qualified with a ! |
| 157 | // the tree to which they belong to. |
| 158 | |
| 159 | UBool fChainRules; // True for chained Unicode TR style rules. |
| 160 | // False for traditional regexp rules. |
| 161 | |
| 162 | UBool fLBCMNoChain; // True: suppress chaining of rules on |
| 163 | // chars with LineBreak property == CM. |
| 164 | |
| 165 | UBool fLookAheadHardBreak; // True: Look ahead matches cause an |
| 166 | // immediate break, no continuing for the |
| 167 | // longest match. |
| 168 | |
| 169 | RBBISetBuilder *fSetBuilder; // Set and Character Category builder. |
| 170 | UVector *fUSetNodes; // Vector of all uset nodes. |
| 171 | |
| 172 | RBBITableBuilder *fForwardTable; // State transition table, build time form. |
| 173 | |
| 174 | UVector *fRuleStatusVals; // The values that can be returned |
| 175 | // from getRuleStatus(). |
| 176 | |
| 177 | RBBIDataHeader *flattenData(); // Create the flattened (runtime format) |
| 178 | // data tables.. |
| 179 | private: |
| 180 | RBBIRuleBuilder(const RBBIRuleBuilder &other); // forbid copying of this class |
| 181 | RBBIRuleBuilder &operator=(const RBBIRuleBuilder &other); // forbid copying of this class |
| 182 | }; |
| 183 | |
| 184 | |
| 185 | |
| 186 | |
| 187 | //---------------------------------------------------------------------------- |
| 188 | // |
| 189 | // RBBISetTableEl is an entry in the hash table of UnicodeSets that have |
| 190 | // been encountered. The val Node will be of nodetype uset |
| 191 | // and contain pointers to the actual UnicodeSets. |
| 192 | // The Key is the source string for initializing the set. |
| 193 | // |
| 194 | // The hash table is used to avoid creating duplicate |
| 195 | // unnamed (not $var references) UnicodeSets. |
| 196 | // |
| 197 | // Memory Management: |
| 198 | // The Hash Table owns these RBBISetTableEl structs and |
| 199 | // the key strings. It does NOT own the val nodes. |
| 200 | // |
| 201 | //---------------------------------------------------------------------------- |
| 202 | struct RBBISetTableEl { |
| 203 | UnicodeString *key; |
| 204 | RBBINode *val; |
| 205 | }; |
| 206 | |
/**
 *  Two int32_t values bundled together; used for pairs of states
 *  or pairs of character classes.
 */
using IntPair = std::pair<int32_t, int32_t>;
| 211 | |
| 212 | |
| 213 | //---------------------------------------------------------------------------- |
| 214 | // |
| 215 | // RBBIDebugPrintf Printf equivalent, for debugging output. |
| 216 | // Conditional compilation of the implementation lets us |
| 217 | // get rid of the stdio dependency in environments where it |
| 218 | // is unavailable. |
| 219 | // |
| 220 | //---------------------------------------------------------------------------- |
#ifdef RBBI_DEBUG
// Debug builds: forward straight to the C library.  <stdio.h> is included with
// the other headers at the top of this file rather than here, because this
// point lies between U_NAMESPACE_BEGIN and U_NAMESPACE_END and a first inclusion
// here would declare printf/puts inside that namespace.
#define RBBIDebugPrintf printf
#define RBBIDebugPuts puts
#else
// Non-debug builds: RBBIDebugPuts(...) expands to nothing (its argument is not
// evaluated).  RBBIDebugPrintf is deliberately left undefined, so any use of it
// must itself be guarded by #ifdef RBBI_DEBUG.
#undef RBBIDebugPrintf
#define RBBIDebugPuts(arg)
#endif
| 229 | |
| 230 | U_NAMESPACE_END |
| 231 | |
| 232 | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |
| 233 | |
| 234 | #endif |
| 235 | |
| 236 | |
| 237 | |
| 238 | |