| 1 | // © 2016 and later: Unicode, Inc. and others. | 
|---|
| 2 | // License & terms of use: http://www.unicode.org/copyright.html | 
|---|
| 3 | /* | 
|---|
| 4 | ******************************************************************************* | 
|---|
| 5 | * | 
|---|
| 6 | *   Copyright (C) 1999-2014 International Business Machines | 
|---|
| 7 | *   Corporation and others.  All Rights Reserved. | 
|---|
| 8 | * | 
|---|
| 9 | ******************************************************************************* | 
|---|
| 10 | *   file name:  rbbidata.h | 
|---|
| 11 | *   encoding:   UTF-8 | 
|---|
| 12 | *   tab size:   8 (not used) | 
|---|
| 13 | *   indentation:4 | 
|---|
| 14 | * | 
|---|
| 15 | *   RBBI data formats  Includes | 
|---|
| 16 | * | 
|---|
| 17 | *                          Structs that describe the format of the binary RBBI data, | 
|---|
| 18 | *                          as it is stored in ICU's data file. | 
|---|
| 19 | * | 
|---|
| 20 | *      RBBIDataWrapper  -  Instances of this class sit between the | 
|---|
| 21 | *                          raw data structs and the RulesBasedBreakIterator objects | 
|---|
| 22 | *                          that are created by applications.  The wrapper class | 
|---|
| 23 | *                          provides reference counting for the underlying data, | 
|---|
| 24 | *                          and direct pointers to data that would not otherwise | 
|---|
| 25 | *                          be accessible without ugly pointer arithmetic.  The | 
|---|
| 26 | *                          wrapper does not attempt to provide any higher level | 
|---|
| 27 | *                          abstractions for the data itself. | 
|---|
| 28 | * | 
|---|
| 29 | *                          There will be only one instance of RBBIDataWrapper for any | 
|---|
| 30 | *                          set of RBBI run time data being shared by instances | 
|---|
| 31 | *                          (clones) of RulesBasedBreakIterator. | 
|---|
| 32 | */ | 
|---|
| 33 |  | 
|---|
| 34 | #ifndef __RBBIDATA_H__ | 
|---|
| 35 | #define __RBBIDATA_H__ | 
|---|
| 36 |  | 
|---|
| 37 | #include "unicode/utypes.h" | 
|---|
| 38 | #include "unicode/udata.h" | 
|---|
| 39 | #include "udataswp.h" | 
|---|
| 40 |  | 
|---|
| 41 | /** | 
|---|
| 42 | * Swap RBBI data. See udataswp.h (included above) for the UDataSwapper interface; parameter semantics are not spelled out in this header. | 
|---|
| 43 | * @internal  Declared before the C++-only part of this header. | 
|---|
| 44 | */ | 
|---|
| 45 | U_CAPI int32_t U_EXPORT2 | 
|---|
| 46 | ubrk_swap(const UDataSwapper *ds, | 
|---|
| 47 | const void *inData, int32_t length, void *outData, | 
|---|
| 48 | UErrorCode *pErrorCode); | 
|---|
| 49 |  | 
|---|
| 50 | #ifdef __cplusplus | 
|---|
| 51 |  | 
|---|
| 52 | #include "unicode/ucptrie.h" | 
|---|
| 53 | #include "unicode/uobject.h" | 
|---|
| 54 | #include "unicode/unistr.h" | 
|---|
| 55 | #include "unicode/uversion.h" | 
|---|
| 56 | #include "umutex.h" | 
|---|
| 57 |  | 
|---|
| 58 |  | 
|---|
| 59 | U_NAMESPACE_BEGIN | 
|---|
| 60 |  | 
|---|
| 61 | // The current RBBI data format version: four bytes, {6, 0, 0, 0}. | 
|---|
| 62 | static const uint8_t RBBI_DATA_FORMAT_VERSION[] = {6, 0, 0, 0}; | 
|---|
| 63 |  | 
|---|
| 64 | /* | 
|---|
| 65 | *   The following structs map exactly onto the raw data from the ICU common data file. | 
|---|
| 66 | */ | 
|---|
| 67 | struct RBBIDataHeader {   /* Header of the binary RBBI data; the section offsets below are relative to its start. */ | 
|---|
| 68 | uint32_t         fMagic;           /*  == 0xb1a0                                               */ | 
|---|
| 69 | UVersionInfo     fFormatVersion;   /* Data Format.  Same as the value in struct UDataInfo      */ | 
|---|
| 70 | /*   if there is one associated with this data.             */ | 
|---|
| 71 | /*     (version originates in rbbi, is copied to UDataInfo) */ | 
|---|
| 72 | uint32_t         fLength;          /*  Total length in bytes of this RBBI Data,                */ | 
|---|
| 73 | /*      including all sections, not just the header.        */ | 
|---|
| 74 | uint32_t         fCatCount;        /*  Number of character categories.                         */ | 
|---|
| 75 |  | 
|---|
| 76 | /*                                                                        */ | 
|---|
| 77 | /*  Offsets and sizes of each of the subsections within the RBBI data.    */ | 
|---|
| 78 | /*  All offsets are bytes from the start of the RBBIDataHeader.           */ | 
|---|
| 79 | /*  All sizes are in bytes.                                               */ | 
|---|
| 80 | /*                                                                        */ | 
|---|
| 81 | uint32_t         fFTable;         /*  Offset to the forward state transition table. */ | 
|---|
| 82 | uint32_t         fFTableLen;      /*  Size of the forward state transition table. */ | 
|---|
| 83 | uint32_t         fRTable;         /*  Offset to the reverse state transition table. */ | 
|---|
| 84 | uint32_t         fRTableLen;      /*  Size of the reverse state transition table. */ | 
|---|
| 85 | uint32_t         fTrie;           /*  Offset to Trie data for character categories */ | 
|---|
| 86 | uint32_t         fTrieLen;        /*  Size of the Trie data. */ | 
|---|
| 87 | uint32_t         fRuleSource;     /*  Offset to the source for the break rules. */ | 
|---|
| 88 | uint32_t         fRuleSourceLen;  /*  Size of the rule source.  Noted as stored char16_t *.  NOTE(review): RBBIDataWrapper::fRuleSource is const char * -- confirm encoding. */ | 
|---|
| 89 | uint32_t         fStatusTable;    /* Offset to the table of rule status values */ | 
|---|
| 90 | uint32_t         fStatusTableLen; /* Size of the table of rule status values. */ | 
|---|
| 91 |  | 
|---|
| 92 | uint32_t         fReserved[6];    /*  Reserved for expansion */ | 
|---|
| 93 |  | 
|---|
| 94 | }; | 
|---|
| 95 |  | 
|---|
| 96 |  | 
|---|
| 97 |  | 
|---|
| 98 | template <typename T> | 
|---|
| 99 | struct RBBIStateTableRowT {   /* One state table row; T is the cell type (instantiated below for uint8_t and uint16_t). */ | 
|---|
| 100 | T               fAccepting;    //  Non-zero if this row is for an accepting state. | 
|---|
| 101 | //  Value 0: not an accepting state. | 
|---|
| 102 | //        1: (ACCEPTING_UNCONDITIONAL) Unconditional Accepting state. | 
|---|
| 103 | //       >1: Look-ahead match has completed. | 
|---|
| 104 | //           Actual boundary position happened earlier. | 
|---|
| 105 | //           Value here == fLookAhead in earlier | 
|---|
| 106 | //           state, at actual boundary pos. | 
|---|
| 107 | T               fLookAhead;    //  Non-zero if this row is for a state that | 
|---|
| 108 | //    corresponds to a '/' in the rule source. | 
|---|
| 109 | //    Value is the same as the fAccepting | 
|---|
| 110 | //    value for the rule (which will appear | 
|---|
| 111 | //    in a different state). | 
|---|
| 112 | T               fTagsIdx;      //  Non-zero if this row covers a {tagged} position | 
|---|
| 113 | //    from a rule.  Value is the index in the | 
|---|
| 114 | //    StatusTable of the set of matching | 
|---|
| 115 | //    tags (rule status values) | 
|---|
| 116 | T               fNextState[1]; //  Next State, indexed by char category. | 
|---|
| 117 | //    Variable-length array declared with length 1 | 
|---|
| 118 | //    to disable bounds checkers. | 
|---|
| 119 | //    Array Size is actually fData->fHeader->fCatCount | 
|---|
| 120 | //    CAUTION:  see RBBITableBuilder::getTableSize() | 
|---|
| 121 | //              before changing anything here. | 
|---|
| 122 | }; | 
|---|
| 123 |  | 
|---|
| 124 | using RBBIStateTableRow8 = RBBIStateTableRowT<uint8_t>;     /* State table row with 8-bit cells. */ | 
|---|
| 125 | using RBBIStateTableRow16 = RBBIStateTableRowT<uint16_t>;   /* State table row with 16-bit cells. */ | 
|---|
| 126 |  | 
|---|
| 127 | constexpr uint16_t ACCEPTING_UNCONDITIONAL = 1;   // Value of RBBIStateTableRowT::fAccepting that marks an unconditional accepting state. | 
|---|
| 128 |  | 
|---|
| 129 | union RBBIStateTableRow {   /* One state table row, viewed with either 16-bit or 8-bit cells. */ | 
|---|
| 130 | RBBIStateTableRow16 r16;   /* Row layout with uint16_t cells. */ | 
|---|
| 131 | RBBIStateTableRow8 r8;   /* Row layout with uint8_t cells.  NOTE(review): presumably chosen when RBBI_8BITS_ROWS is set -- confirm. */ | 
|---|
| 132 | }; | 
|---|
| 133 |  | 
|---|
| 134 | struct RBBIStateTable {   /* Header of one state table; the rows start at fTableData. */ | 
|---|
| 135 | uint32_t         fNumStates;            // Number of states. | 
|---|
| 136 | uint32_t         fRowLen;               // Length of a state table row, in bytes. | 
|---|
| 137 | uint32_t         fDictCategoriesStart;  // Char category number of the first dictionary | 
|---|
| 138 | //   char class, or the largest category number + 1 | 
|---|
| 139 | //   if there are no dictionary categories. | 
|---|
| 140 | uint32_t         fLookAheadResultsSize; // Size of run-time array required for holding | 
|---|
| 141 | //   look-ahead results. Indexed by row.fLookAhead. | 
|---|
| 142 | uint32_t         fFlags;                // Option Flags for this state table. | 
|---|
| 143 | char             fTableData[1];         // First RBBIStateTableRow begins here. | 
|---|
| 144 | //   Variable-length array declared with length 1 | 
|---|
| 145 | //   to disable bounds checkers. | 
|---|
| 146 | //   (Making it char[] simplifies ugly address | 
|---|
| 147 | //   arithmetic for indexing variable-length rows.) | 
|---|
| 148 | }; | 
|---|
| 149 |  | 
|---|
| 150 | constexpr uint32_t RBBI_LOOKAHEAD_HARD_BREAK = 1;   /* Bit 0.  NOTE(review): these three look like bit values for RBBIStateTable::fFlags -- confirm. */ | 
|---|
| 151 | constexpr uint32_t RBBI_BOF_REQUIRED = 2;   /* Bit 1. */ | 
|---|
| 152 | constexpr uint32_t RBBI_8BITS_ROWS = 4;   /* Bit 2.  Name suggests rows with 8-bit cells (RBBIStateTableRow8) -- confirm. */ | 
|---|
| 153 |  | 
|---|
| 154 |  | 
|---|
| 155 | /*                                        */ | 
|---|
| 156 | /*   The reference counting wrapper class */ | 
|---|
| 157 | /*                                        */ | 
|---|
| 158 | class RBBIDataWrapper : public UMemory {   /* Reference-counted wrapper around one set of RBBI run time data. */ | 
|---|
| 159 | public: | 
|---|
| 160 | enum EDontAdopt {   /* Tag type selecting the constructor overload that takes it. */ | 
|---|
| 161 | kDontAdopt | 
|---|
| 162 | }; | 
|---|
| 163 | RBBIDataWrapper(const RBBIDataHeader *data, UErrorCode &status);   /* Wrap raw header data. */ | 
|---|
| 164 | RBBIDataWrapper(const RBBIDataHeader *data, enum EDontAdopt dontAdopt, UErrorCode &status);   /* NOTE(review): presumably leaves ownership of data with the caller (see fDontFreeData) -- confirm. */ | 
|---|
| 165 | RBBIDataWrapper(UDataMemory* udm, UErrorCode &status);   /* Wrap data supplied as a UDataMemory. */ | 
|---|
| 166 | ~RBBIDataWrapper(); | 
|---|
| 167 |  | 
|---|
| 168 | static UBool          isDataVersionAcceptable(const UVersionInfo version); | 
|---|
| 169 |  | 
|---|
| 170 | void                  init0(); | 
|---|
| 171 | void                  init(const RBBIDataHeader *data, UErrorCode &status); | 
|---|
| 172 | RBBIDataWrapper      *addReference(); | 
|---|
| 173 | void                  removeReference(); | 
|---|
| 174 | bool                  operator ==(const RBBIDataWrapper &other) const; | 
|---|
| 175 | int32_t               hashCode(); | 
|---|
| 176 | const UnicodeString  &getRuleSourceString() const; | 
|---|
| 177 | void                  printData(); | 
|---|
| 178 | void                  printTable(const char *heading, const RBBIStateTable *table); | 
|---|
| 179 |  | 
|---|
| 180 | /*                                     */ | 
|---|
| 181 | /*   Pointers to items within the data */ | 
|---|
| 182 | /*                                     */ | 
|---|
| 183 | const RBBIDataHeader     *fHeader; | 
|---|
| 184 | const RBBIStateTable     *fForwardTable; | 
|---|
| 185 | const RBBIStateTable     *fReverseTable; | 
|---|
| 186 | const char               *fRuleSource; | 
|---|
| 187 | const int32_t            *fRuleStatusTable; | 
|---|
| 188 |  | 
|---|
| 189 | /* Number of int32_t values in the rule status table.  Used to sanity check indexing. */ | 
|---|
| 190 | int32_t             fStatusMaxIdx; | 
|---|
| 191 |  | 
|---|
| 192 | UCPTrie             *fTrie; | 
|---|
| 193 |  | 
|---|
| 194 | private: | 
|---|
| 195 | u_atomic_int32_t    fRefCount;   /* NOTE(review): presumably the count changed by addReference()/removeReference() -- confirm. */ | 
|---|
| 196 | UDataMemory        *fUDataMem; | 
|---|
| 197 | UnicodeString       fRuleString; | 
|---|
| 198 | UBool               fDontFreeData; | 
|---|
| 199 |  | 
|---|
| 200 | RBBIDataWrapper(const RBBIDataWrapper &other) = delete; /*  forbid copying of this class */ | 
|---|
| 201 | RBBIDataWrapper &operator=(const RBBIDataWrapper &other) = delete; /*  forbid copying of this class */ | 
|---|
| 202 | }; | 
|---|
| 203 |  | 
|---|
| 204 |  | 
|---|
| 205 |  | 
|---|
| 206 | U_NAMESPACE_END | 
|---|
| 207 |  | 
|---|
| 208 | U_CFUNC UBool rbbi_cleanup();   /* Declared after U_NAMESPACE_END; defined elsewhere.  NOTE(review): presumably the RBBI cleanup hook -- confirm. */ | 
|---|
| 209 |  | 
|---|
| 210 | #endif /* C++ */ | 
|---|
| 211 |  | 
|---|
| 212 | #endif | 
|---|
| 213 |  | 
|---|