1 | // © 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | /* |
4 | ******************************************************************************* |
5 | * |
6 | * Copyright (C) 1999-2014 International Business Machines |
7 | * Corporation and others. All Rights Reserved. |
8 | * |
9 | ******************************************************************************* |
10 | * file name: rbbidata.h |
11 | * encoding: UTF-8 |
12 | * tab size: 8 (not used) |
13 | * indentation:4 |
14 | * |
15 | * RBBI data formats Includes |
16 | * |
17 | * Structs that describe the format of the Binary RBBI data, |
18 | * as it is stored in ICU's data file. |
19 | * |
20 | * RBBIDataWrapper - Instances of this class sit between the |
21 | * raw data structs and the RulesBasedBreakIterator objects |
22 | * that are created by applications. The wrapper class |
23 | * provides reference counting for the underlying data, |
24 | * and direct pointers to data that would not otherwise |
25 | * be accessible without ugly pointer arithmetic. The |
26 | * wrapper does not attempt to provide any higher level |
27 | * abstractions for the data itself. |
28 | * |
29 | * There will be only one instance of RBBIDataWrapper for any |
30 | * set of RBBI run time data being shared by instances |
31 | * (clones) of RulesBasedBreakIterator. |
32 | */ |
33 | |
34 | #ifndef __RBBIDATA_H__ |
35 | #define __RBBIDATA_H__ |
36 | |
37 | #include "unicode/utypes.h" |
38 | #include "unicode/udata.h" |
39 | #include "udataswp.h" |
40 | |
41 | /** |
42 | * Swap RBBI data. See udataswp.h. |
43 | * @internal |
44 | */ |
45 | U_CAPI int32_t U_EXPORT2 |
46 | ubrk_swap(const UDataSwapper *ds, |
47 | const void *inData, int32_t length, void *outData, |
48 | UErrorCode *pErrorCode); |
49 | |
50 | #ifdef __cplusplus |
51 | |
52 | #include "unicode/ucptrie.h" |
53 | #include "unicode/uobject.h" |
54 | #include "unicode/unistr.h" |
55 | #include "unicode/uversion.h" |
56 | #include "umutex.h" |
57 | |
58 | |
59 | U_NAMESPACE_BEGIN |
60 | |
// Version number of the RBBI data format currently in use,
// stored as four bytes.
static const uint8_t RBBI_DATA_FORMAT_VERSION[4] = {6, 0, 0, 0};
63 | |
64 | /* |
65 | * The following structs map exactly onto the raw data from ICU common data file. |
66 | */ |
67 | struct { |
68 | uint32_t ; /* == 0xbla0 */ |
69 | UVersionInfo ; /* Data Format. Same as the value in struct UDataInfo */ |
70 | /* if there is one associated with this data. */ |
71 | /* (version originates in rbbi, is copied to UDataInfo) */ |
72 | uint32_t ; /* Total length in bytes of this RBBI Data, */ |
73 | /* including all sections, not just the header. */ |
74 | uint32_t ; /* Number of character categories. */ |
75 | |
76 | /* */ |
77 | /* Offsets and sizes of each of the subsections within the RBBI data. */ |
78 | /* All offsets are bytes from the start of the RBBIDataHeader. */ |
79 | /* All sizes are in bytes. */ |
80 | /* */ |
81 | uint32_t ; /* forward state transition table. */ |
82 | uint32_t ; |
83 | uint32_t ; /* Offset to the reverse state transition table. */ |
84 | uint32_t ; |
85 | uint32_t ; /* Offset to Trie data for character categories */ |
86 | uint32_t ; |
87 | uint32_t ; /* Offset to the source for for the break */ |
88 | uint32_t ; /* rules. Stored char16_t *. */ |
89 | uint32_t ; /* Offset to the table of rule status values */ |
90 | uint32_t ; |
91 | |
92 | uint32_t [6]; /* Reserved for expansion */ |
93 | |
94 | }; |
95 | |
96 | |
97 | |
/*
 * One row of a state table; T is the integer type of each cell.
 *
 * CAUTION: see RBBITableBuilder::getTableSize() before changing anything
 *          in this struct.
 */
template <class T>
struct RBBIStateTableRowT {
    // fAccepting: non-zero when this row is for an accepting state.
    //     0   not an accepting state.
    //     1   (ACCEPTING_UNCONDITIONAL) unconditional accepting state.
    //    >1   a look-ahead match has completed; the actual boundary
    //         position happened earlier. The value equals fLookAhead in
    //         the earlier state at the actual boundary position.
    T fAccepting;

    // fLookAhead: non-zero when this row is for a state that corresponds
    //    to a '/' in the rule source. The value is the same as the
    //    fAccepting value for the rule (which will appear in a different
    //    state).
    T fLookAhead;

    // fTagsIdx: non-zero when this row covers a {tagged} position from a
    //    rule. The value is the index in the StatusTable of the set of
    //    matching tags (rule status values).
    T fTagsIdx;

    // fNextState: next state, indexed by char category.
    //    This is a variable-length array; it is declared with length 1
    //    to disable bounds checkers. The real array size is
    //    fData->fHeader->fCatCount.
    T fNextState[1];
};
123 | |
124 | typedef RBBIStateTableRowT<uint8_t> RBBIStateTableRow8; |
125 | typedef RBBIStateTableRowT<uint16_t> RBBIStateTableRow16; |
126 | |
// Value of RBBIStateTableRow::fAccepting that marks an unconditional
// accepting state.
constexpr uint16_t ACCEPTING_UNCONDITIONAL = 1;
128 | |
129 | union RBBIStateTableRow { |
130 | RBBIStateTableRow16 r16; |
131 | RBBIStateTableRow8 r8; |
132 | }; |
133 | |
/*
 * Fixed-size header of a state table, followed immediately by the rows.
 */
struct RBBIStateTable {
    // Number of states.
    uint32_t fNumStates;

    // Length of a state table row, in bytes.
    uint32_t fRowLen;

    // Char category number of the first dictionary char class, or the
    // largest category number + 1 if there are no dictionary categories.
    uint32_t fDictCategoriesStart;

    // Size of the run-time array required for holding look-ahead results.
    // That array is indexed by row.fLookAhead.
    uint32_t fLookAheadResultsSize;

    // Option flags for this state table.
    uint32_t fFlags;

    // The first RBBIStateTableRow begins here.
    // This is a variable-length array, declared with length 1 to disable
    // bounds checkers. Declaring it as char[] simplifies the address
    // arithmetic needed for indexing variable-length rows.
    char fTableData[1];
};
149 | |
// Single-bit option values.
// NOTE(review): presumably these are the bits stored in
// RBBIStateTable::fFlags - confirm against the code that reads fFlags.
constexpr uint32_t RBBI_LOOKAHEAD_HARD_BREAK = 1u << 0;
constexpr uint32_t RBBI_BOF_REQUIRED         = 1u << 1;
constexpr uint32_t RBBI_8BITS_ROWS           = 1u << 2;
153 | |
154 | |
155 | /* */ |
156 | /* The reference counting wrapper class */ |
157 | /* */ |
158 | class RBBIDataWrapper : public UMemory { |
159 | public: |
160 | enum EDontAdopt { |
161 | kDontAdopt |
162 | }; |
163 | (const RBBIDataHeader *data, UErrorCode &status); |
164 | (const RBBIDataHeader *data, enum EDontAdopt dontAdopt, UErrorCode &status); |
165 | RBBIDataWrapper(UDataMemory* udm, UErrorCode &status); |
166 | ~RBBIDataWrapper(); |
167 | |
168 | static UBool isDataVersionAcceptable(const UVersionInfo version); |
169 | |
170 | void init0(); |
171 | void (const RBBIDataHeader *data, UErrorCode &status); |
172 | RBBIDataWrapper *addReference(); |
173 | void removeReference(); |
174 | bool operator ==(const RBBIDataWrapper &other) const; |
175 | int32_t hashCode(); |
176 | const UnicodeString &getRuleSourceString() const; |
177 | void printData(); |
178 | void printTable(const char *heading, const RBBIStateTable *table); |
179 | |
180 | /* */ |
181 | /* Pointers to items within the data */ |
182 | /* */ |
183 | const RBBIDataHeader *; |
184 | const RBBIStateTable *fForwardTable; |
185 | const RBBIStateTable *fReverseTable; |
186 | const char *fRuleSource; |
187 | const int32_t *fRuleStatusTable; |
188 | |
189 | /* number of int32_t values in the rule status table. Used to sanity check indexing */ |
190 | int32_t fStatusMaxIdx; |
191 | |
192 | UCPTrie *fTrie; |
193 | |
194 | private: |
195 | u_atomic_int32_t fRefCount; |
196 | UDataMemory *fUDataMem; |
197 | UnicodeString fRuleString; |
198 | UBool fDontFreeData; |
199 | |
200 | RBBIDataWrapper(const RBBIDataWrapper &other) = delete; /* forbid copying of this class */ |
201 | RBBIDataWrapper &operator=(const RBBIDataWrapper &other) = delete; /* forbid copying of this class */ |
202 | }; |
203 | |
204 | |
205 | |
206 | U_NAMESPACE_END |
207 | |
208 | U_CFUNC UBool rbbi_cleanup(); |
209 | |
210 | #endif /* C++ */ |
211 | |
212 | #endif |
213 | |