1 | // Copyright (C) 2016 and later: Unicode, Inc. and others. |
2 | // License & terms of use: http://www.unicode.org/copyright.html |
3 | |
4 | // file: rbbi_cache.h |
5 | // |
6 | #ifndef RBBI_CACHE_H |
7 | #define RBBI_CACHE_H |
8 | |
9 | #include "unicode/utypes.h" |
10 | |
11 | #if !UCONFIG_NO_BREAK_ITERATION |
12 | |
13 | #include "unicode/rbbi.h" |
14 | #include "unicode/uobject.h" |
15 | |
16 | #include "uvectr32.h" |
17 | |
18 | U_NAMESPACE_BEGIN |
19 | |
/* DictionaryCache stores the boundaries obtained from a run of dictionary characters.
 * Dictionary boundaries are moved first to this cache, then from here
 * to the main BreakCache, where they may interleave with non-dictionary
 * boundaries. The public BreakIterator API always fetches directly
 * from the main BreakCache, not from here.
 *
 * In common situations, the number of boundaries in a single dictionary run
 * should be quite small, because the run will be terminated by punctuation,
 * spaces, or any other non-dictionary characters. The main BreakCache may end
 * up with boundaries from multiple dictionary-based runs.
 *
 * The boundaries are stored in a simple vector (UVector32), with the
 * assumption that they will be accessed sequentially.
 */
34 | class RuleBasedBreakIterator::DictionaryCache: public UMemory { |
35 | public: |
36 | DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status); |
37 | ~DictionaryCache(); |
38 | |
39 | void reset(); |
40 | |
41 | UBool following(int32_t fromPos, int32_t *pos, int32_t *statusIndex); |
42 | UBool preceding(int32_t fromPos, int32_t *pos, int32_t *statusIndex); |
43 | |
44 | /** |
45 | * Populate the cache with the dictionary based boundaries within a region of text. |
46 | * @param startPos The start position of a range of text |
47 | * @param endPos The end position of a range of text |
48 | * @param firstRuleStatus The rule status index that applies to the break at startPos |
49 | * @param otherRuleStatus The rule status index that applies to boundaries other than startPos |
50 | * @internal |
51 | */ |
52 | void populateDictionary(int32_t startPos, int32_t endPos, |
53 | int32_t firstRuleStatus, int32_t otherRuleStatus); |
54 | |
55 | |
56 | |
57 | RuleBasedBreakIterator *fBI; |
58 | |
59 | UVector32 fBreaks; // A vector containing the boundaries. |
60 | int32_t fPositionInCache; // Index in fBreaks of last boundary returned by following() |
61 | // or preceding(). Optimizes sequential access. |
62 | int32_t fStart; // Text position of first boundary in cache. |
63 | int32_t fLimit; // Last boundary in cache. Which is the limit of the |
64 | // text segment being handled by the dictionary. |
65 | int32_t fFirstRuleStatusIndex; // Rule status info for first boundary. |
66 | int32_t fOtherRuleStatusIndex; // Rule status info for 2nd through last boundaries. |
67 | }; |
68 | |
69 | |
70 | /* |
71 | * class BreakCache |
72 | * |
73 | * Cache of break boundary positions and rule status values. |
74 | * Break iterator API functions, next(), previous(), etc., will use cached results |
75 | * when possible, and otherwise cache new results as they are obtained. |
76 | * |
77 | * Uniformly caches both dictionary and rule based (non-dictionary) boundaries. |
78 | * |
79 | * The cache is implemented as a single circular buffer. |
80 | */ |
81 | |
/*
 * The size of the circular cache buffer is BreakCache::CACHE_SIZE, declared
 * inside the class below. It must be a power of two.
 */
85 | |
86 | class RuleBasedBreakIterator::BreakCache: public UMemory { |
87 | public: |
88 | BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status); |
89 | virtual ~BreakCache(); |
90 | void reset(int32_t pos = 0, int32_t ruleStatus = 0); |
91 | void next() { if (fBufIdx == fEndBufIdx) { |
92 | nextOL(); |
93 | } else { |
94 | fBufIdx = modChunkSize(fBufIdx + 1); |
95 | fTextIdx = fBI->fPosition = fBoundaries[fBufIdx]; |
96 | fBI->fRuleStatusIndex = fStatuses[fBufIdx]; |
97 | } |
98 | } |
99 | |
100 | |
101 | void nextOL(); |
102 | void previous(UErrorCode &status); |
103 | |
104 | // Move the iteration state to the position following the startPosition. |
105 | // Input position must be pinned to the input length. |
106 | void following(int32_t startPosition, UErrorCode &status); |
107 | |
108 | void preceding(int32_t startPosition, UErrorCode &status); |
109 | |
110 | /* |
111 | * Update the state of the public BreakIterator (fBI) to reflect the |
112 | * current state of the break iterator cache (this). |
113 | */ |
114 | int32_t current(); |
115 | |
116 | /** |
117 | * Add boundaries to the cache near the specified position. |
118 | * The given position need not be a boundary itself. |
119 | * The input position must be within the range of the text, and |
120 | * on a code point boundary. |
121 | * If the requested position is a break boundary, leave the iteration |
122 | * position on it. |
123 | * If the requested position is not a boundary, leave the iteration |
124 | * position on the preceding boundary and include both the |
125 | * preceding and following boundaries in the cache. |
126 | * Additional boundaries, either preceding or following, may be added |
127 | * to the cache as a side effect. |
128 | * |
129 | * Return false if the operation failed. |
130 | */ |
131 | UBool populateNear(int32_t position, UErrorCode &status); |
132 | |
133 | /** |
134 | * Add boundary(s) to the cache following the current last boundary. |
135 | * Return false if at the end of the text, and no more boundaries can be added. |
136 | * Leave iteration position at the first newly added boundary, or unchanged if no boundary was added. |
137 | */ |
138 | UBool populateFollowing(); |
139 | |
140 | /** |
141 | * Add one or more boundaries to the cache preceding the first currently cached boundary. |
142 | * Leave the iteration position on the first added boundary. |
143 | * Return false if no boundaries could be added (if at the start of the text.) |
144 | */ |
145 | UBool populatePreceding(UErrorCode &status); |
146 | |
147 | enum UpdatePositionValues { |
148 | RetainCachePosition = 0, |
149 | UpdateCachePosition = 1 |
150 | }; |
151 | |
152 | /* |
153 | * Add the boundary following the current position. |
154 | * The current position can be left as it was, or changed to the newly added boundary, |
155 | * as specified by the update parameter. |
156 | */ |
157 | void addFollowing(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update); |
158 | |
159 | |
160 | /* |
161 | * Add the boundary preceding the current position. |
162 | * The current position can be left as it was, or changed to the newly added boundary, |
163 | * as specified by the update parameter. |
164 | */ |
165 | bool addPreceding(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update); |
166 | |
167 | /** |
168 | * Set the cache position to the specified position, or, if the position |
169 | * falls between to cached boundaries, to the preceding boundary. |
170 | * Fails if the requested position is outside of the range of boundaries currently held by the cache. |
171 | * The startPosition must be on a code point boundary. |
172 | * |
173 | * Return true if successful, false if the specified position is after |
174 | * the last cached boundary or before the first. |
175 | */ |
176 | UBool seek(int32_t startPosition); |
177 | |
178 | void dumpCache(); |
179 | |
180 | private: |
181 | static inline int32_t modChunkSize(int index) { return index & (CACHE_SIZE - 1); } |
182 | |
183 | static constexpr int32_t CACHE_SIZE = 128; |
184 | static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two." ); |
185 | |
186 | RuleBasedBreakIterator *fBI; |
187 | int32_t fStartBufIdx; |
188 | int32_t fEndBufIdx; // inclusive |
189 | |
190 | int32_t fTextIdx; |
191 | int32_t fBufIdx; |
192 | |
193 | int32_t fBoundaries[CACHE_SIZE]; |
194 | uint16_t fStatuses[CACHE_SIZE]; |
195 | |
196 | UVector32 fSideBuffer; |
197 | }; |
198 | |
199 | U_NAMESPACE_END |
200 | |
201 | #endif // #if !UCONFIG_NO_BREAK_ITERATION |
202 | |
203 | #endif // RBBI_CACHE_H |
204 | |