| 1 | // Copyright (C) 2016 and later: Unicode, Inc. and others. | 
|---|
| 2 | // License & terms of use: http://www.unicode.org/copyright.html | 
|---|
| 3 |  | 
|---|
| 4 | // file: rbbi_cache.h | 
|---|
| 5 | // | 
|---|
| 6 | #ifndef RBBI_CACHE_H | 
|---|
| 7 | #define RBBI_CACHE_H | 
|---|
| 8 |  | 
|---|
| 9 | #include "unicode/utypes.h" | 
|---|
| 10 |  | 
|---|
| 11 | #if !UCONFIG_NO_BREAK_ITERATION | 
|---|
| 12 |  | 
|---|
| 13 | #include "unicode/rbbi.h" | 
|---|
| 14 | #include "unicode/uobject.h" | 
|---|
| 15 |  | 
|---|
| 16 | #include "uvectr32.h" | 
|---|
| 17 |  | 
|---|
| 18 | U_NAMESPACE_BEGIN | 
|---|
| 19 |  | 
|---|
| 20 | /* DictionaryCache  stores the boundaries obtained from a run of dictionary characters. | 
|---|
| 21 | *                 Dictionary boundaries are moved first to this cache, then from here | 
|---|
| 22 | *                 to the main BreakCache, where they may inter-leave with non-dictionary | 
|---|
| 23 | *                 boundaries. The public BreakIterator API always fetches directly | 
|---|
| 24 | *                 from the main BreakCache, not from here. | 
|---|
| 25 | * | 
|---|
| 26 | *                 In common situations, the number of boundaries in a single dictionary run | 
|---|
| 27 | *                 should be quite small, it will be terminated by punctuation, spaces, | 
|---|
| 28 | *                 or any other non-dictionary characters. The main BreakCache may end | 
|---|
| 29 | *                 up with boundaries from multiple dictionary based runs. | 
|---|
| 30 | * | 
|---|
| 31 | *                 The boundaries are stored in a simple ArrayList (vector), with the | 
|---|
| 32 | *                 assumption that they will be accessed sequentially. | 
|---|
| 33 | */ | 
|---|
| 34 | class RuleBasedBreakIterator::DictionaryCache: public UMemory { | 
|---|
| 35 | public: | 
|---|
| 36 | DictionaryCache(RuleBasedBreakIterator *bi, UErrorCode &status); | 
|---|
| 37 | ~DictionaryCache(); | 
|---|
| 38 |  | 
|---|
| 39 | void reset(); | 
|---|
| 40 |  | 
|---|
| 41 | UBool following(int32_t fromPos, int32_t *pos, int32_t *statusIndex); | 
|---|
| 42 | UBool preceding(int32_t fromPos, int32_t *pos, int32_t *statusIndex); | 
|---|
| 43 |  | 
|---|
| 44 | /** | 
|---|
| 45 | * Populate the cache with the dictionary based boundaries within a region of text. | 
|---|
| 46 | * @param startPos  The start position of a range of text | 
|---|
| 47 | * @param endPos    The end position of a range of text | 
|---|
| 48 | * @param firstRuleStatus The rule status index that applies to the break at startPos | 
|---|
| 49 | * @param otherRuleStatus The rule status index that applies to boundaries other than startPos | 
|---|
| 50 | * @internal | 
|---|
| 51 | */ | 
|---|
| 52 | void populateDictionary(int32_t startPos, int32_t endPos, | 
|---|
| 53 | int32_t firstRuleStatus, int32_t otherRuleStatus); | 
|---|
| 54 |  | 
|---|
| 55 |  | 
|---|
| 56 |  | 
|---|
| 57 | RuleBasedBreakIterator *fBI; | 
|---|
| 58 |  | 
|---|
| 59 | UVector32           fBreaks;                // A vector containing the boundaries. | 
|---|
| 60 | int32_t             fPositionInCache;       // Index in fBreaks of last boundary returned by following() | 
|---|
| 61 | //    or preceding(). Optimizes sequential access. | 
|---|
| 62 | int32_t             fStart;                 // Text position of first boundary in cache. | 
|---|
| 63 | int32_t             fLimit;                 // Last boundary in cache. Which is the limit of the | 
|---|
| 64 | //    text segment being handled by the dictionary. | 
|---|
| 65 | int32_t             fFirstRuleStatusIndex;  // Rule status info for first boundary. | 
|---|
| 66 | int32_t             fOtherRuleStatusIndex;  // Rule status info for 2nd through last boundaries. | 
|---|
| 67 | }; | 
|---|
| 68 |  | 
|---|
| 69 |  | 
|---|
| 70 | /* | 
|---|
| 71 | * class BreakCache | 
|---|
| 72 | * | 
|---|
| 73 | * Cache of break boundary positions and rule status values. | 
|---|
| 74 | * Break iterator API functions, next(), previous(), etc., will use cached results | 
|---|
| 75 | * when possible, and otherwise cache new results as they are obtained. | 
|---|
| 76 | * | 
|---|
| 77 | * Uniformly caches both dictionary and rule based (non-dictionary) boundaries. | 
|---|
| 78 | * | 
|---|
| 79 | * The cache is implemented as a single circular buffer. | 
|---|
| 80 | */ | 
|---|
| 81 |  | 
|---|
| 82 | /* | 
|---|
| 83 | * size of the circular cache buffer. | 
|---|
| 84 | */ | 
|---|
| 85 |  | 
|---|
| 86 | class RuleBasedBreakIterator::BreakCache: public UMemory { | 
|---|
| 87 | public: | 
|---|
| 88 | BreakCache(RuleBasedBreakIterator *bi, UErrorCode &status); | 
|---|
| 89 | virtual     ~BreakCache(); | 
|---|
| 90 | void        reset(int32_t pos = 0, int32_t ruleStatus = 0); | 
|---|
| 91 | void        next() {    if (fBufIdx == fEndBufIdx) { | 
|---|
| 92 | nextOL(); | 
|---|
| 93 | } else { | 
|---|
| 94 | fBufIdx = modChunkSize(fBufIdx + 1); | 
|---|
| 95 | fTextIdx = fBI->fPosition = fBoundaries[fBufIdx]; | 
|---|
| 96 | fBI->fRuleStatusIndex = fStatuses[fBufIdx]; | 
|---|
| 97 | } | 
|---|
| 98 | } | 
|---|
| 99 |  | 
|---|
| 100 |  | 
|---|
| 101 | void        nextOL(); | 
|---|
| 102 | void        previous(UErrorCode &status); | 
|---|
| 103 |  | 
|---|
| 104 | // Move the iteration state to the position following the startPosition. | 
|---|
| 105 | // Input position must be pinned to the input length. | 
|---|
| 106 | void        following(int32_t startPosition, UErrorCode &status); | 
|---|
| 107 |  | 
|---|
| 108 | void        preceding(int32_t startPosition, UErrorCode &status); | 
|---|
| 109 |  | 
|---|
| 110 | /* | 
|---|
| 111 | * Update the state of the public BreakIterator (fBI) to reflect the | 
|---|
| 112 | * current state of the break iterator cache (this). | 
|---|
| 113 | */ | 
|---|
| 114 | int32_t     current(); | 
|---|
| 115 |  | 
|---|
| 116 | /** | 
|---|
| 117 | * Add boundaries to the cache near the specified position. | 
|---|
| 118 | * The given position need not be a boundary itself. | 
|---|
| 119 | * The input position must be within the range of the text, and | 
|---|
| 120 | * on a code point boundary. | 
|---|
| 121 | * If the requested position is a break boundary, leave the iteration | 
|---|
| 122 | * position on it. | 
|---|
| 123 | * If the requested position is not a boundary, leave the iteration | 
|---|
| 124 | * position on the preceding boundary and include both the | 
|---|
| 125 | * preceding and following boundaries in the cache. | 
|---|
| 126 | * Additional boundaries, either preceding or following, may be added | 
|---|
| 127 | * to the cache as a side effect. | 
|---|
| 128 | * | 
|---|
| 129 | * Return FALSE if the operation failed. | 
|---|
| 130 | */ | 
|---|
| 131 | UBool populateNear(int32_t position, UErrorCode &status); | 
|---|
| 132 |  | 
|---|
| 133 | /** | 
|---|
| 134 | *  Add boundary(s) to the cache following the current last boundary. | 
|---|
| 135 | *  Return FALSE if at the end of the text, and no more boundaries can be added. | 
|---|
| 136 | *  Leave iteration position at the first newly added boundary, or unchanged if no boundary was added. | 
|---|
| 137 | */ | 
|---|
| 138 | UBool populateFollowing(); | 
|---|
| 139 |  | 
|---|
| 140 | /** | 
|---|
| 141 | *  Add one or more boundaries to the cache preceding the first currently cached boundary. | 
|---|
| 142 | *  Leave the iteration position on the first added boundary. | 
|---|
| 143 | *  Return false if no boundaries could be added (if at the start of the text.) | 
|---|
| 144 | */ | 
|---|
| 145 | UBool populatePreceding(UErrorCode &status); | 
|---|
| 146 |  | 
|---|
| 147 | enum UpdatePositionValues { | 
|---|
| 148 | RetainCachePosition = 0, | 
|---|
| 149 | UpdateCachePosition = 1 | 
|---|
| 150 | }; | 
|---|
| 151 |  | 
|---|
| 152 | /* | 
|---|
| 153 | * Add the boundary following the current position. | 
|---|
| 154 | * The current position can be left as it was, or changed to the newly added boundary, | 
|---|
| 155 | * as specified by the update parameter. | 
|---|
| 156 | */ | 
|---|
| 157 | void addFollowing(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update); | 
|---|
| 158 |  | 
|---|
| 159 |  | 
|---|
| 160 | /* | 
|---|
| 161 | * Add the boundary preceding the current position. | 
|---|
| 162 | * The current position can be left as it was, or changed to the newly added boundary, | 
|---|
| 163 | * as specified by the update parameter. | 
|---|
| 164 | */ | 
|---|
| 165 | bool addPreceding(int32_t position, int32_t ruleStatusIdx, UpdatePositionValues update); | 
|---|
| 166 |  | 
|---|
| 167 | /** | 
|---|
| 168 | *  Set the cache position to the specified position, or, if the position | 
|---|
| 169 | *  falls between to cached boundaries, to the preceding boundary. | 
|---|
| 170 | *  Fails if the requested position is outside of the range of boundaries currently held by the cache. | 
|---|
| 171 | *  The startPosition must be on a code point boundary. | 
|---|
| 172 | * | 
|---|
| 173 | *  Return TRUE if successful, FALSE if the specified position is after | 
|---|
| 174 | *  the last cached boundary or before the first. | 
|---|
| 175 | */ | 
|---|
| 176 | UBool                   seek(int32_t startPosition); | 
|---|
| 177 |  | 
|---|
| 178 | void dumpCache(); | 
|---|
| 179 |  | 
|---|
| 180 | private: | 
|---|
| 181 | static inline int32_t   modChunkSize(int index) { return index & (CACHE_SIZE - 1); } | 
|---|
| 182 |  | 
|---|
| 183 | static constexpr int32_t CACHE_SIZE = 128; | 
|---|
| 184 | static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two."); | 
|---|
| 185 |  | 
|---|
| 186 | RuleBasedBreakIterator *fBI; | 
|---|
| 187 | int32_t                 fStartBufIdx; | 
|---|
| 188 | int32_t                 fEndBufIdx;    // inclusive | 
|---|
| 189 |  | 
|---|
| 190 | int32_t                 fTextIdx; | 
|---|
| 191 | int32_t                 fBufIdx; | 
|---|
| 192 |  | 
|---|
| 193 | int32_t                 fBoundaries[CACHE_SIZE]; | 
|---|
| 194 | uint16_t                fStatuses[CACHE_SIZE]; | 
|---|
| 195 |  | 
|---|
| 196 | UVector32               fSideBuffer; | 
|---|
| 197 | }; | 
|---|
| 198 |  | 
|---|
| 199 | U_NAMESPACE_END | 
|---|
| 200 |  | 
|---|
| 201 | #endif // #if !UCONFIG_NO_BREAK_ITERATION | 
|---|
| 202 |  | 
|---|
| 203 | #endif // RBBI_CACHE_H | 
|---|
| 204 |  | 
|---|