| 1 | // © 2016 and later: Unicode, Inc. and others. | 
|---|
| 2 | // License & terms of use: http://www.unicode.org/copyright.html | 
|---|
| 3 | /* | 
|---|
| 4 | ******************************************************************************* | 
|---|
| 5 | * Copyright (C) 2014-2016, International Business Machines | 
|---|
| 6 | * Corporation and others.  All Rights Reserved. | 
|---|
| 7 | ******************************************************************************* | 
|---|
| 8 | * dictionarydata.h | 
|---|
| 9 | * | 
|---|
| 10 | * created on: 2012may31 | 
|---|
| 11 | * created by: Markus W. Scherer & Maxime Serrano | 
|---|
| 12 | */ | 
|---|
| 13 |  | 
|---|
| 14 | #include "dictionarydata.h" | 
|---|
| 15 | #include "unicode/ucharstrie.h" | 
|---|
| 16 | #include "unicode/bytestrie.h" | 
|---|
| 17 | #include "unicode/udata.h" | 
|---|
| 18 | #include "cmemory.h" | 
|---|
| 19 |  | 
|---|
| 20 | #if !UCONFIG_NO_BREAK_ITERATION | 
|---|
| 21 |  | 
|---|
| 22 | U_NAMESPACE_BEGIN | 
|---|
| 23 |  | 
|---|
| 24 | const int32_t  DictionaryData::TRIE_TYPE_BYTES = 0; | 
|---|
| 25 | const int32_t  DictionaryData::TRIE_TYPE_UCHARS = 1; | 
|---|
| 26 | const int32_t  DictionaryData::TRIE_TYPE_MASK = 7; | 
|---|
| 27 | const int32_t  DictionaryData::TRIE_HAS_VALUES = 8; | 
|---|
| 28 |  | 
|---|
| 29 | const int32_t  DictionaryData::TRANSFORM_NONE = 0; | 
|---|
| 30 | const int32_t  DictionaryData::TRANSFORM_TYPE_OFFSET = 0x1000000; | 
|---|
| 31 | const int32_t  DictionaryData::TRANSFORM_TYPE_MASK = 0x7f000000; | 
|---|
| 32 | const int32_t  DictionaryData::TRANSFORM_OFFSET_MASK = 0x1fffff; | 
|---|
| 33 |  | 
|---|
| 34 | DictionaryMatcher::~DictionaryMatcher() { | 
|---|
| 35 | } | 
|---|
| 36 |  | 
|---|
| 37 | UCharsDictionaryMatcher::~UCharsDictionaryMatcher() { | 
|---|
| 38 | udata_close(file); | 
|---|
| 39 | } | 
|---|
| 40 |  | 
|---|
| 41 | int32_t UCharsDictionaryMatcher::getType() const { | 
|---|
| 42 | return DictionaryData::TRIE_TYPE_UCHARS; | 
|---|
| 43 | } | 
|---|
| 44 |  | 
|---|
| 45 | int32_t UCharsDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit, | 
|---|
| 46 | int32_t *lengths, int32_t *cpLengths, int32_t *values, | 
|---|
| 47 | int32_t *prefix) const { | 
|---|
| 48 |  | 
|---|
| 49 | UCharsTrie uct(characters); | 
|---|
| 50 | int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text); | 
|---|
| 51 | int32_t wordCount = 0; | 
|---|
| 52 | int32_t codePointsMatched = 0; | 
|---|
| 53 |  | 
|---|
| 54 | for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) { | 
|---|
| 55 | UStringTrieResult result = (codePointsMatched == 0) ? uct.first(c) : uct.next(c); | 
|---|
| 56 | int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex; | 
|---|
| 57 | codePointsMatched += 1; | 
|---|
| 58 | if (USTRINGTRIE_HAS_VALUE(result)) { | 
|---|
| 59 | if (wordCount < limit) { | 
|---|
| 60 | if (values != NULL) { | 
|---|
| 61 | values[wordCount] = uct.getValue(); | 
|---|
| 62 | } | 
|---|
| 63 | if (lengths != NULL) { | 
|---|
| 64 | lengths[wordCount] = lengthMatched; | 
|---|
| 65 | } | 
|---|
| 66 | if (cpLengths != NULL) { | 
|---|
| 67 | cpLengths[wordCount] = codePointsMatched; | 
|---|
| 68 | } | 
|---|
| 69 | ++wordCount; | 
|---|
| 70 | } | 
|---|
| 71 | if (result == USTRINGTRIE_FINAL_VALUE) { | 
|---|
| 72 | break; | 
|---|
| 73 | } | 
|---|
| 74 | } | 
|---|
| 75 | else if (result == USTRINGTRIE_NO_MATCH) { | 
|---|
| 76 | break; | 
|---|
| 77 | } | 
|---|
| 78 | if (lengthMatched >= maxLength) { | 
|---|
| 79 | break; | 
|---|
| 80 | } | 
|---|
| 81 | } | 
|---|
| 82 |  | 
|---|
| 83 | if (prefix != NULL) { | 
|---|
| 84 | *prefix = codePointsMatched; | 
|---|
| 85 | } | 
|---|
| 86 | return wordCount; | 
|---|
| 87 | } | 
|---|
| 88 |  | 
|---|
| 89 | BytesDictionaryMatcher::~BytesDictionaryMatcher() { | 
|---|
| 90 | udata_close(file); | 
|---|
| 91 | } | 
|---|
| 92 |  | 
|---|
| 93 | UChar32 BytesDictionaryMatcher::transform(UChar32 c) const { | 
|---|
| 94 | if ((transformConstant & DictionaryData::TRANSFORM_TYPE_MASK) == DictionaryData::TRANSFORM_TYPE_OFFSET) { | 
|---|
| 95 | if (c == 0x200D) { | 
|---|
| 96 | return 0xFF; | 
|---|
| 97 | } else if (c == 0x200C) { | 
|---|
| 98 | return 0xFE; | 
|---|
| 99 | } | 
|---|
| 100 | int32_t delta = c - (transformConstant & DictionaryData::TRANSFORM_OFFSET_MASK); | 
|---|
| 101 | if (delta < 0 || 0xFD < delta) { | 
|---|
| 102 | return U_SENTINEL; | 
|---|
| 103 | } | 
|---|
| 104 | return (UChar32)delta; | 
|---|
| 105 | } | 
|---|
| 106 | return c; | 
|---|
| 107 | } | 
|---|
| 108 |  | 
|---|
| 109 | int32_t BytesDictionaryMatcher::getType() const { | 
|---|
| 110 | return DictionaryData::TRIE_TYPE_BYTES; | 
|---|
| 111 | } | 
|---|
| 112 |  | 
|---|
| 113 | int32_t BytesDictionaryMatcher::matches(UText *text, int32_t maxLength, int32_t limit, | 
|---|
| 114 | int32_t *lengths, int32_t *cpLengths, int32_t *values, | 
|---|
| 115 | int32_t *prefix) const { | 
|---|
| 116 | BytesTrie bt(characters); | 
|---|
| 117 | int32_t startingTextIndex = (int32_t)utext_getNativeIndex(text); | 
|---|
| 118 | int32_t wordCount = 0; | 
|---|
| 119 | int32_t codePointsMatched = 0; | 
|---|
| 120 |  | 
|---|
| 121 | for (UChar32 c = utext_next32(text); c >= 0; c=utext_next32(text)) { | 
|---|
| 122 | UStringTrieResult result = (codePointsMatched == 0) ? bt.first(transform(c)) : bt.next(transform(c)); | 
|---|
| 123 | int32_t lengthMatched = (int32_t)utext_getNativeIndex(text) - startingTextIndex; | 
|---|
| 124 | codePointsMatched += 1; | 
|---|
| 125 | if (USTRINGTRIE_HAS_VALUE(result)) { | 
|---|
| 126 | if (wordCount < limit) { | 
|---|
| 127 | if (values != NULL) { | 
|---|
| 128 | values[wordCount] = bt.getValue(); | 
|---|
| 129 | } | 
|---|
| 130 | if (lengths != NULL) { | 
|---|
| 131 | lengths[wordCount] = lengthMatched; | 
|---|
| 132 | } | 
|---|
| 133 | if (cpLengths != NULL) { | 
|---|
| 134 | cpLengths[wordCount] = codePointsMatched; | 
|---|
| 135 | } | 
|---|
| 136 | ++wordCount; | 
|---|
| 137 | } | 
|---|
| 138 | if (result == USTRINGTRIE_FINAL_VALUE) { | 
|---|
| 139 | break; | 
|---|
| 140 | } | 
|---|
| 141 | } | 
|---|
| 142 | else if (result == USTRINGTRIE_NO_MATCH) { | 
|---|
| 143 | break; | 
|---|
| 144 | } | 
|---|
| 145 | if (lengthMatched >= maxLength) { | 
|---|
| 146 | break; | 
|---|
| 147 | } | 
|---|
| 148 | } | 
|---|
| 149 |  | 
|---|
| 150 | if (prefix != NULL) { | 
|---|
| 151 | *prefix = codePointsMatched; | 
|---|
| 152 | } | 
|---|
| 153 | return wordCount; | 
|---|
| 154 | } | 
|---|
| 155 |  | 
|---|
| 156 |  | 
|---|
| 157 | U_NAMESPACE_END | 
|---|
| 158 |  | 
|---|
| 159 | U_NAMESPACE_USE | 
|---|
| 160 |  | 
|---|
| 161 | U_CAPI int32_t U_EXPORT2 | 
|---|
| 162 | udict_swap(const UDataSwapper *ds, const void *inData, int32_t length, | 
|---|
| 163 | void *outData, UErrorCode *pErrorCode) { | 
|---|
| 164 | const UDataInfo *pInfo; | 
|---|
| 165 | int32_t ; | 
|---|
| 166 | const uint8_t *inBytes; | 
|---|
| 167 | uint8_t *outBytes; | 
|---|
| 168 | const int32_t *inIndexes; | 
|---|
| 169 | int32_t indexes[DictionaryData::IX_COUNT]; | 
|---|
| 170 | int32_t i, offset, size; | 
|---|
| 171 |  | 
|---|
| 172 | headerSize = udata_swapDataHeader(ds, inData, length, outData, pErrorCode); | 
|---|
| 173 | if (pErrorCode == NULL || U_FAILURE(*pErrorCode)) return 0; | 
|---|
| 174 | pInfo = (const UDataInfo *)((const char *)inData + 4); | 
|---|
| 175 | if (!(pInfo->dataFormat[0] == 0x44 && | 
|---|
| 176 | pInfo->dataFormat[1] == 0x69 && | 
|---|
| 177 | pInfo->dataFormat[2] == 0x63 && | 
|---|
| 178 | pInfo->dataFormat[3] == 0x74 && | 
|---|
| 179 | pInfo->formatVersion[0] == 1)) { | 
|---|
| 180 | udata_printError(ds, "udict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as dictionary data\n", | 
|---|
| 181 | pInfo->dataFormat[0], pInfo->dataFormat[1], pInfo->dataFormat[2], pInfo->dataFormat[3], pInfo->formatVersion[0]); | 
|---|
| 182 | *pErrorCode = U_UNSUPPORTED_ERROR; | 
|---|
| 183 | return 0; | 
|---|
| 184 | } | 
|---|
| 185 |  | 
|---|
| 186 | inBytes = (const uint8_t *)inData + headerSize; | 
|---|
| 187 | outBytes = (uint8_t *)outData + headerSize; | 
|---|
| 188 |  | 
|---|
| 189 | inIndexes = (const int32_t *)inBytes; | 
|---|
| 190 | if (length >= 0) { | 
|---|
| 191 | length -= headerSize; | 
|---|
| 192 | if (length < (int32_t)(sizeof(indexes))) { | 
|---|
| 193 | udata_printError(ds, "udict_swap(): too few bytes (%d after header) for dictionary data\n", length); | 
|---|
| 194 | *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; | 
|---|
| 195 | return 0; | 
|---|
| 196 | } | 
|---|
| 197 | } | 
|---|
| 198 |  | 
|---|
| 199 | for (i = 0; i < DictionaryData::IX_COUNT; i++) { | 
|---|
| 200 | indexes[i] = udata_readInt32(ds, inIndexes[i]); | 
|---|
| 201 | } | 
|---|
| 202 |  | 
|---|
| 203 | size = indexes[DictionaryData::IX_TOTAL_SIZE]; | 
|---|
| 204 |  | 
|---|
| 205 | if (length >= 0) { | 
|---|
| 206 | if (length < size) { | 
|---|
| 207 | udata_printError(ds, "udict_swap(): too few bytes (%d after header) for all of dictionary data\n", length); | 
|---|
| 208 | *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR; | 
|---|
| 209 | return 0; | 
|---|
| 210 | } | 
|---|
| 211 |  | 
|---|
| 212 | if (inBytes != outBytes) { | 
|---|
| 213 | uprv_memcpy(outBytes, inBytes, size); | 
|---|
| 214 | } | 
|---|
| 215 |  | 
|---|
| 216 | offset = 0; | 
|---|
| 217 | ds->swapArray32(ds, inBytes, sizeof(indexes), outBytes, pErrorCode); | 
|---|
| 218 | offset = (int32_t)sizeof(indexes); | 
|---|
| 219 | int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK; | 
|---|
| 220 | int32_t nextOffset = indexes[DictionaryData::IX_RESERVED1_OFFSET]; | 
|---|
| 221 |  | 
|---|
| 222 | if (trieType == DictionaryData::TRIE_TYPE_UCHARS) { | 
|---|
| 223 | ds->swapArray16(ds, inBytes + offset, nextOffset - offset, outBytes + offset, pErrorCode); | 
|---|
| 224 | } else if (trieType == DictionaryData::TRIE_TYPE_BYTES) { | 
|---|
| 225 | // nothing to do | 
|---|
| 226 | } else { | 
|---|
| 227 | udata_printError(ds, "udict_swap(): unknown trie type!\n"); | 
|---|
| 228 | *pErrorCode = U_UNSUPPORTED_ERROR; | 
|---|
| 229 | return 0; | 
|---|
| 230 | } | 
|---|
| 231 |  | 
|---|
| 232 | // these next two sections are empty in the current format, | 
|---|
| 233 | // but may be used later. | 
|---|
| 234 | offset = nextOffset; | 
|---|
| 235 | nextOffset = indexes[DictionaryData::IX_RESERVED2_OFFSET]; | 
|---|
| 236 | offset = nextOffset; | 
|---|
| 237 | nextOffset = indexes[DictionaryData::IX_TOTAL_SIZE]; | 
|---|
| 238 | offset = nextOffset; | 
|---|
| 239 | } | 
|---|
| 240 | return headerSize + size; | 
|---|
| 241 | } | 
|---|
| 242 | #endif | 
|---|
| 243 |  | 
|---|