| 1 | // © 2016 and later: Unicode, Inc. and others. | 
|---|
| 2 | // License & terms of use: http://www.unicode.org/copyright.html | 
|---|
| 3 | /* | 
|---|
| 4 | ****************************************************************************** | 
|---|
| 5 | * | 
|---|
| 6 | *   Copyright (C) 2007, International Business Machines | 
|---|
| 7 | *   Corporation and others.  All Rights Reserved. | 
|---|
| 8 | * | 
|---|
| 9 | ****************************************************************************** | 
|---|
| 10 | *   file name:  unisetspan.h | 
|---|
| 11 | *   encoding:   UTF-8 | 
|---|
| 12 | *   tab size:   8 (not used) | 
|---|
| 13 | *   indentation:4 | 
|---|
| 14 | * | 
|---|
| 15 | *   created on: 2007mar01 | 
|---|
| 16 | *   created by: Markus W. Scherer | 
|---|
| 17 | */ | 
|---|
| 18 |  | 
|---|
| 19 | #ifndef __UNISETSPAN_H__ | 
|---|
| 20 | #define __UNISETSPAN_H__ | 
|---|
| 21 |  | 
|---|
| 22 | #include "unicode/utypes.h" | 
|---|
| 23 | #include "unicode/uniset.h" | 
|---|
| 24 |  | 
|---|
| 25 | U_NAMESPACE_BEGIN | 
|---|
| 26 |  | 
|---|
| 27 | /* | 
|---|
| 28 | * Implement span() etc. for a set with strings. | 
|---|
| 29 | * Avoid recursion because of its exponential complexity. | 
|---|
| 30 | * Instead, try multiple paths at once and track them with an IndexList. | 
|---|
| 31 | */ | 
|---|
| 32 | class UnicodeSetStringSpan : public UMemory { | 
|---|
| 33 | public: | 
|---|
| 34 | /* | 
|---|
| 35 | * Which span() variant will be used? | 
|---|
| 36 | * The object is either built for one variant and used once, | 
|---|
| 37 | * or built for all and may be used many times. | 
|---|
| 38 | */ | 
|---|
| 39 | enum { | 
|---|
| 40 | FWD             = 0x20, | 
|---|
| 41 | BACK            = 0x10, | 
|---|
| 42 | UTF16           = 8, | 
|---|
| 43 | UTF8            = 4, | 
|---|
| 44 | CONTAINED       = 2, | 
|---|
| 45 | NOT_CONTAINED   = 1, | 
|---|
| 46 |  | 
|---|
| 47 | ALL             = 0x3f, | 
|---|
| 48 |  | 
|---|
| 49 | FWD_UTF16_CONTAINED     = FWD  | UTF16 |     CONTAINED, | 
|---|
| 50 | FWD_UTF16_NOT_CONTAINED = FWD  | UTF16 | NOT_CONTAINED, | 
|---|
| 51 | FWD_UTF8_CONTAINED      = FWD  | UTF8  |     CONTAINED, | 
|---|
| 52 | FWD_UTF8_NOT_CONTAINED  = FWD  | UTF8  | NOT_CONTAINED, | 
|---|
| 53 | BACK_UTF16_CONTAINED    = BACK | UTF16 |     CONTAINED, | 
|---|
| 54 | BACK_UTF16_NOT_CONTAINED= BACK | UTF16 | NOT_CONTAINED, | 
|---|
| 55 | BACK_UTF8_CONTAINED     = BACK | UTF8  |     CONTAINED, | 
|---|
| 56 | BACK_UTF8_NOT_CONTAINED = BACK | UTF8  | NOT_CONTAINED | 
|---|
| 57 | }; | 
|---|
| 58 |  | 
|---|
| 59 | UnicodeSetStringSpan(const UnicodeSet &set, const UVector &setStrings, uint32_t which); | 
|---|
| 60 |  | 
|---|
| 61 | // Copy constructor. Assumes which==ALL for a frozen set. | 
|---|
| 62 | UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan, const UVector &newParentSetStrings); | 
|---|
| 63 |  | 
|---|
| 64 | ~UnicodeSetStringSpan(); | 
|---|
| 65 |  | 
|---|
| 66 | /* | 
|---|
| 67 | * Do the strings need to be checked in span() etc.? | 
|---|
| 68 | * @return true if strings need to be checked (call span() here), | 
|---|
| 69 | *         false if not (use a BMPSet for best performance). | 
|---|
| 70 | */ | 
|---|
| 71 | inline UBool needsStringSpanUTF16(); | 
|---|
| 72 | inline UBool needsStringSpanUTF8(); | 
|---|
| 73 |  | 
|---|
| 74 | // For fast UnicodeSet::contains(c). | 
|---|
| 75 | inline UBool contains(UChar32 c) const; | 
|---|
| 76 |  | 
|---|
| 77 | int32_t span(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const; | 
|---|
| 78 |  | 
|---|
| 79 | int32_t spanBack(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const; | 
|---|
| 80 |  | 
|---|
| 81 | int32_t spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const; | 
|---|
| 82 |  | 
|---|
| 83 | int32_t spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const; | 
|---|
| 84 |  | 
|---|
| 85 | private: | 
|---|
| 86 | // Special spanLength byte values. | 
|---|
| 87 | enum { | 
|---|
| 88 | // The spanLength is >=0xfe. | 
|---|
| 89 | LONG_SPAN=0xfe, | 
|---|
| 90 | // All code points in the string are contained in the parent set. | 
|---|
| 91 | ALL_CP_CONTAINED=0xff | 
|---|
| 92 | }; | 
|---|
| 93 |  | 
|---|
| 94 | // Add a starting or ending string character to the spanNotSet | 
|---|
| 95 | // so that a character span ends before any string. | 
|---|
| 96 | void addToSpanNotSet(UChar32 c); | 
|---|
| 97 |  | 
|---|
| 98 | int32_t spanNot(const char16_t *s, int32_t length) const; | 
|---|
| 99 | int32_t spanNotBack(const char16_t *s, int32_t length) const; | 
|---|
| 100 | int32_t spanNotUTF8(const uint8_t *s, int32_t length) const; | 
|---|
| 101 | int32_t spanNotBackUTF8(const uint8_t *s, int32_t length) const; | 
|---|
| 102 |  | 
|---|
| 103 | // Set for span(). Same as parent but without strings. | 
|---|
| 104 | UnicodeSet spanSet; | 
|---|
| 105 |  | 
|---|
| 106 | // Set for span(not contained). | 
|---|
| 107 | // Same as spanSet, plus characters that start or end strings. | 
|---|
| 108 | UnicodeSet *pSpanNotSet; | 
|---|
| 109 |  | 
|---|
| 110 | // The strings of the parent set. | 
|---|
| 111 | const UVector &strings; | 
|---|
| 112 |  | 
|---|
| 113 | // Pointer to the UTF-8 string lengths. | 
|---|
| 114 | // Also pointer to further allocated storage for meta data and | 
|---|
| 115 | // UTF-8 string contents as necessary. | 
|---|
| 116 | int32_t *utf8Lengths; | 
|---|
| 117 |  | 
|---|
| 118 | // Pointer to the part of the (utf8Lengths) memory block that stores | 
|---|
| 119 | // the lengths of span(), spanBack() etc. for each string. | 
|---|
| 120 | uint8_t *spanLengths; | 
|---|
| 121 |  | 
|---|
| 122 | // Pointer to the part of the (utf8Lengths) memory block that stores | 
|---|
| 123 | // the UTF-8 versions of the parent set's strings. | 
|---|
| 124 | uint8_t *utf8; | 
|---|
| 125 |  | 
|---|
| 126 | // Number of bytes for all UTF-8 versions of strings together. | 
|---|
| 127 | int32_t utf8Length; | 
|---|
| 128 |  | 
|---|
| 129 | // Maximum lengths of relevant strings. | 
|---|
| 130 | int32_t maxLength16; | 
|---|
| 131 | int32_t maxLength8; | 
|---|
| 132 |  | 
|---|
| 133 | // Set up for all variants of span()? | 
|---|
| 134 | UBool all; | 
|---|
| 135 |  | 
|---|
| 136 | // Memory for small numbers and lengths of strings. | 
|---|
| 137 | // For example, for 8 strings: | 
|---|
| 138 | // 8 UTF-8 lengths, 8*4 bytes span lengths, 8*2 3-byte UTF-8 characters | 
|---|
| 139 | // = 112 bytes = int32_t[28]. | 
|---|
| 140 | int32_t staticLengths[32]; | 
|---|
| 141 | }; | 
|---|
| 142 |  | 
|---|
| 143 | UBool UnicodeSetStringSpan::needsStringSpanUTF16() { | 
|---|
| 144 | return (UBool)(maxLength16!=0); | 
|---|
| 145 | } | 
|---|
| 146 |  | 
|---|
| 147 | UBool UnicodeSetStringSpan::needsStringSpanUTF8() { | 
|---|
| 148 | return (UBool)(maxLength8!=0); | 
|---|
| 149 | } | 
|---|
| 150 |  | 
|---|
| 151 | UBool UnicodeSetStringSpan::contains(UChar32 c) const { | 
|---|
| 152 | return spanSet.contains(c); | 
|---|
| 153 | } | 
|---|
| 154 |  | 
|---|
| 155 | U_NAMESPACE_END | 
|---|
| 156 |  | 
|---|
| 157 | #endif | 
|---|
| 158 |  | 
|---|