1// © 2020 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3
4// uniquecharstr.h
5// created: 2020sep01 Frank Yung-Fong Tang
6
7#ifndef __UNIQUECHARSTR_H__
8#define __UNIQUECHARSTR_H__
9
10#include "charstr.h"
11#include "uassert.h"
12#include "uhash.h"
13
14U_NAMESPACE_BEGIN
15
16/**
17 * Stores NUL-terminated strings with duplicate elimination.
18 * Checks for unique UTF-16 string pointers and converts to invariant characters.
19 *
20 * Intended to be stack-allocated. Add strings, get a unique number for each,
21 * freeze the object, get a char * pointer for each string,
22 * call orphanCharStrings() to capture the string storage, and let this object go out of scope.
23 */
24class UniqueCharStrings {
25public:
26 UniqueCharStrings(UErrorCode &errorCode) : strings(nullptr) {
27 // Note: We hash on string contents but store stable char16_t * pointers.
28 // If the strings are stored in resource bundles which should be built with
29 // duplicate elimination, then we should be able to hash on just the pointer values.
30 uhash_init(&map, uhash_hashUChars, uhash_compareUChars, uhash_compareLong, &errorCode);
31 if (U_FAILURE(errorCode)) { return; }
32 strings = new CharString();
33 if (strings == nullptr) {
34 errorCode = U_MEMORY_ALLOCATION_ERROR;
35 }
36 }
37 ~UniqueCharStrings() {
38 uhash_close(&map);
39 delete strings;
40 }
41
42 /** Returns/orphans the CharString that contains all strings. */
43 CharString *orphanCharStrings() {
44 CharString *result = strings;
45 strings = nullptr;
46 return result;
47 }
48
49 /**
50 * Adds a string and returns a unique number for it.
51 * The string's buffer contents must not change, nor move around in memory,
52 * while this UniqueCharStrings is in use.
53 * The string contents must be NUL-terminated exactly at s.length().
54 *
55 * Best used with read-only-alias UnicodeString objects that point to
56 * stable storage, such as strings returned by resource bundle functions.
57 */
58 int32_t add(const UnicodeString &s, UErrorCode &errorCode) {
59 if (U_FAILURE(errorCode)) { return 0; }
60 if (isFrozen) {
61 errorCode = U_NO_WRITE_PERMISSION;
62 return 0;
63 }
64 // The string points into the resource bundle.
65 const char16_t *p = s.getBuffer();
66 int32_t oldIndex = uhash_geti(&map, p);
67 if (oldIndex != 0) { // found duplicate
68 return oldIndex;
69 }
70 // Explicit NUL terminator for the previous string.
71 // The strings object is also terminated with one implicit NUL.
72 strings->append(0, errorCode);
73 int32_t newIndex = strings->length();
74 strings->appendInvariantChars(s, errorCode);
75 uhash_puti(&map, const_cast<char16_t *>(p), newIndex, &errorCode);
76 return newIndex;
77 }
78
79 void freeze() { isFrozen = true; }
80
81 /**
82 * Returns a string pointer for its unique number, if this object is frozen.
83 * Otherwise nullptr.
84 */
85 const char *get(int32_t i) const {
86 U_ASSERT(isFrozen);
87 return isFrozen && i > 0 ? strings->data() + i : nullptr;
88 }
89
90private:
91 UHashtable map;
92 CharString *strings;
93 bool isFrozen = false;
94};
95
96U_NAMESPACE_END
97
98#endif // __UNIQUECHARSTR_H__
99