| 1 | // © 2020 and later: Unicode, Inc. and others. |
| 2 | // License & terms of use: http://www.unicode.org/copyright.html |
| 3 | |
| 4 | // uniquecharstr.h |
| 5 | // created: 2020sep01 Frank Yung-Fong Tang |
| 6 | |
| 7 | #ifndef __UNIQUECHARSTR_H__ |
| 8 | #define __UNIQUECHARSTR_H__ |
| 9 | |
| 10 | #include "charstr.h" |
| 11 | #include "uassert.h" |
| 12 | #include "uhash.h" |
| 13 | |
| 14 | U_NAMESPACE_BEGIN |
| 15 | |
| 16 | /** |
| 17 | * Stores NUL-terminated strings with duplicate elimination. |
| 18 | * Checks for unique UTF-16 string pointers and converts to invariant characters. |
| 19 | * |
| 20 | * Intended to be stack-allocated. Add strings, get a unique number for each, |
| 21 | * freeze the object, get a char * pointer for each string, |
| 22 | * call orphanCharStrings() to capture the string storage, and let this object go out of scope. |
| 23 | */ |
| 24 | class UniqueCharStrings { |
| 25 | public: |
| 26 | UniqueCharStrings(UErrorCode &errorCode) : strings(nullptr) { |
| 27 | // Note: We hash on string contents but store stable char16_t * pointers. |
| 28 | // If the strings are stored in resource bundles which should be built with |
| 29 | // duplicate elimination, then we should be able to hash on just the pointer values. |
| 30 | uhash_init(&map, uhash_hashUChars, uhash_compareUChars, uhash_compareLong, &errorCode); |
| 31 | if (U_FAILURE(errorCode)) { return; } |
| 32 | strings = new CharString(); |
| 33 | if (strings == nullptr) { |
| 34 | errorCode = U_MEMORY_ALLOCATION_ERROR; |
| 35 | } |
| 36 | } |
| 37 | ~UniqueCharStrings() { |
| 38 | uhash_close(&map); |
| 39 | delete strings; |
| 40 | } |
| 41 | |
| 42 | /** Returns/orphans the CharString that contains all strings. */ |
| 43 | CharString *orphanCharStrings() { |
| 44 | CharString *result = strings; |
| 45 | strings = nullptr; |
| 46 | return result; |
| 47 | } |
| 48 | |
| 49 | /** |
| 50 | * Adds a string and returns a unique number for it. |
| 51 | * The string's buffer contents must not change, nor move around in memory, |
| 52 | * while this UniqueCharStrings is in use. |
| 53 | * The string contents must be NUL-terminated exactly at s.length(). |
| 54 | * |
| 55 | * Best used with read-only-alias UnicodeString objects that point to |
| 56 | * stable storage, such as strings returned by resource bundle functions. |
| 57 | */ |
| 58 | int32_t add(const UnicodeString &s, UErrorCode &errorCode) { |
| 59 | if (U_FAILURE(errorCode)) { return 0; } |
| 60 | if (isFrozen) { |
| 61 | errorCode = U_NO_WRITE_PERMISSION; |
| 62 | return 0; |
| 63 | } |
| 64 | // The string points into the resource bundle. |
| 65 | const char16_t *p = s.getBuffer(); |
| 66 | int32_t oldIndex = uhash_geti(&map, p); |
| 67 | if (oldIndex != 0) { // found duplicate |
| 68 | return oldIndex; |
| 69 | } |
| 70 | // Explicit NUL terminator for the previous string. |
| 71 | // The strings object is also terminated with one implicit NUL. |
| 72 | strings->append(0, errorCode); |
| 73 | int32_t newIndex = strings->length(); |
| 74 | strings->appendInvariantChars(s, errorCode); |
| 75 | uhash_puti(&map, const_cast<char16_t *>(p), newIndex, &errorCode); |
| 76 | return newIndex; |
| 77 | } |
| 78 | |
| 79 | void freeze() { isFrozen = true; } |
| 80 | |
| 81 | /** |
| 82 | * Returns a string pointer for its unique number, if this object is frozen. |
| 83 | * Otherwise nullptr. |
| 84 | */ |
| 85 | const char *get(int32_t i) const { |
| 86 | U_ASSERT(isFrozen); |
| 87 | return isFrozen && i > 0 ? strings->data() + i : nullptr; |
| 88 | } |
| 89 | |
| 90 | private: |
| 91 | UHashtable map; |
| 92 | CharString *strings; |
| 93 | bool isFrozen = false; |
| 94 | }; |
| 95 | |
| 96 | U_NAMESPACE_END |
| 97 | |
| 98 | #endif // __UNIQUECHARSTR_H__ |
| 99 | |