| 1 | // © 2016 and later: Unicode, Inc. and others. |
| 2 | // License & terms of use: http://www.unicode.org/copyright.html |
| 3 | /* |
| 4 | ********************************************************************** |
| 5 | * Copyright (c) 2002-2014, International Business Machines |
| 6 | * Corporation and others. All Rights Reserved. |
| 7 | ********************************************************************** |
| 8 | */ |
| 9 | #ifndef USETITER_H |
| 10 | #define USETITER_H |
| 11 | |
| 12 | #include "unicode/utypes.h" |
| 13 | |
| 14 | #if U_SHOW_CPLUSPLUS_API |
| 15 | |
| 16 | #include "unicode/uobject.h" |
| 17 | #include "unicode/unistr.h" |
| 18 | |
| 19 | /** |
| 20 | * \file |
| 21 | * \brief C++ API: UnicodeSetIterator iterates over the contents of a UnicodeSet. |
| 22 | */ |
| 23 | |
| 24 | U_NAMESPACE_BEGIN |
| 25 | |
| 26 | class UnicodeSet; |
| 27 | class UnicodeString; |
| 28 | |
| 29 | /** |
| 30 | * |
| 31 | * UnicodeSetIterator iterates over the contents of a UnicodeSet. It |
| 32 | * iterates over either code points or code point ranges. After all |
| 33 | * code points or ranges have been returned, it returns the |
| 34 | * multicharacter strings of the UnicodeSet, if any. |
| 35 | * |
| 36 | * This class is not intended for public subclassing. |
| 37 | * |
| 38 | * <p>To iterate over code points and strings, use a loop like this: |
| 39 | * <pre> |
| 40 | * UnicodeSetIterator it(set); |
| 41 | * while (it.next()) { |
| 42 | * processItem(it.getString()); |
| 43 | * } |
| 44 | * </pre> |
| 45 | * <p>Each item in the set is accessed as a string. Set elements |
| 46 | * consisting of single code points are returned as strings containing |
| 47 | * just the one code point. |
| 48 | * |
| 49 | * <p>To iterate over code point ranges, instead of individual code points, |
| 50 | * use a loop like this: |
| 51 | * <pre> |
| 52 | * UnicodeSetIterator it(set); |
| 53 | * while (it.nextRange()) { |
| 54 | * if (it.isString()) { |
| 55 | * processString(it.getString()); |
| 56 | * } else { |
| 57 | * processCodepointRange(it.getCodepoint(), it.getCodepointEnd()); |
| 58 | * } |
| 59 | * } |
| 60 | * </pre> |
| 61 | * |
| 62 | * To iterate over only the strings, start with <code>skipToStrings()</code>. |
| 63 | * |
| 64 | * @author M. Davis |
| 65 | * @stable ICU 2.4 |
| 66 | */ |
| 67 | class U_COMMON_API UnicodeSetIterator final : public UObject { |
| 68 | /** |
| 69 | * Value of <tt>codepoint</tt> if the iterator points to a string. |
| 70 | * If <tt>codepoint == IS_STRING</tt>, then examine |
| 71 | * <tt>string</tt> for the current iteration result. |
| 72 | */ |
| 73 | enum { IS_STRING = -1 }; |
| 74 | |
| 75 | /** |
| 76 | * Current code point, or the special value <tt>IS_STRING</tt>, if |
| 77 | * the iterator points to a string. |
| 78 | */ |
| 79 | UChar32 codepoint; |
| 80 | |
| 81 | /** |
| 82 | * When iterating over ranges using <tt>nextRange()</tt>, |
| 83 | * <tt>codepointEnd</tt> contains the inclusive end of the |
| 84 | * iteration range, if <tt>codepoint != IS_STRING</tt>. If |
| 85 | * iterating over code points using <tt>next()</tt>, or if |
| 86 | * <tt>codepoint == IS_STRING</tt>, then the value of |
| 87 | * <tt>codepointEnd</tt> is undefined. |
| 88 | */ |
| 89 | UChar32 codepointEnd; |
| 90 | |
| 91 | /** |
| 92 | * If <tt>codepoint == IS_STRING</tt>, then <tt>string</tt> points |
| 93 | * to the current string. If <tt>codepoint != IS_STRING</tt>, the |
| 94 | * value of <tt>string</tt> is undefined. |
| 95 | */ |
| 96 | const UnicodeString* string; |
| 97 | |
| 98 | public: |
| 99 | |
| 100 | /** |
| 101 | * Create an iterator over the given set. The iterator is valid |
| 102 | * only so long as <tt>set</tt> is valid. |
| 103 | * @param set set to iterate over |
| 104 | * @stable ICU 2.4 |
| 105 | */ |
| 106 | UnicodeSetIterator(const UnicodeSet& set); |
| 107 | |
| 108 | /** |
| 109 | * Create an iterator over nothing. <tt>next()</tt> and |
| 110 | * <tt>nextRange()</tt> return false. This is a convenience |
| 111 | * constructor allowing the target to be set later. |
| 112 | * @stable ICU 2.4 |
| 113 | */ |
| 114 | UnicodeSetIterator(); |
| 115 | |
| 116 | /** |
| 117 | * Destructor. |
| 118 | * @stable ICU 2.4 |
| 119 | */ |
| 120 | virtual ~UnicodeSetIterator(); |
| 121 | |
| 122 | /** |
| 123 | * Returns true if the current element is a string. If so, the |
| 124 | * caller can retrieve it with <tt>getString()</tt>. If this |
| 125 | * method returns false, the current element is a code point or |
| 126 | * code point range, depending on whether <tt>next()</tt> or |
| 127 | * <tt>nextRange()</tt> was called. |
| 128 | * Elements of types string and codepoint can both be retrieved |
| 129 | * with the function <tt>getString()</tt>. |
| 130 | * Elements of type codepoint can also be retrieved with |
| 131 | * <tt>getCodepoint()</tt>. |
| 132 | * For ranges, <tt>getCodepoint()</tt> returns the starting codepoint |
| 133 | * of the range, and <tt>getCodepointEnd()</tt> returns the end |
| 134 | * of the range. |
| 135 | * @stable ICU 2.4 |
| 136 | */ |
| 137 | inline UBool isString() const; |
| 138 | |
| 139 | /** |
| 140 | * Returns the current code point, if <tt>isString()</tt> returned |
| 141 | * false. Otherwise returns an undefined result. |
| 142 | * @stable ICU 2.4 |
| 143 | */ |
| 144 | inline UChar32 getCodepoint() const; |
| 145 | |
| 146 | /** |
| 147 | * Returns the end of the current code point range, if |
| 148 | * <tt>isString()</tt> returned false and <tt>nextRange()</tt> was |
| 149 | * called. Otherwise returns an undefined result. |
| 150 | * @stable ICU 2.4 |
| 151 | */ |
| 152 | inline UChar32 getCodepointEnd() const; |
| 153 | |
| 154 | /** |
| 155 | * Returns the current string, if <tt>isString()</tt> returned |
| 156 | * true. If the current iteration item is a code point, a UnicodeString |
| 157 | * containing that single code point is returned. |
| 158 | * |
| 159 | * Ownership of the returned string remains with the iterator. |
| 160 | * The string is guaranteed to remain valid only until the iterator is |
| 161 | * advanced to the next item, or until the iterator is deleted. |
| 162 | * |
| 163 | * @stable ICU 2.4 |
| 164 | */ |
| 165 | const UnicodeString& getString(); |
| 166 | |
| 167 | /** |
| 168 | * Skips over the remaining code points/ranges, if any. |
| 169 | * A following call to next() or nextRange() will yield a string, if there is one. |
| 170 | * No-op if next() would return false, or if it would yield a string anyway. |
| 171 | * |
| 172 | * @return *this |
| 173 | * @stable ICU 70 |
| 174 | * @see UnicodeSet#strings() |
| 175 | */ |
| 176 | inline UnicodeSetIterator &skipToStrings() { |
| 177 | // Finish code point/range iteration. |
| 178 | range = endRange; |
| 179 | endElement = -1; |
| 180 | nextElement = 0; |
| 181 | return *this; |
| 182 | } |
| 183 | |
| 184 | /** |
| 185 | * Advances the iteration position to the next element in the set, |
| 186 | * which can be either a single code point or a string. |
| 187 | * If there are no more elements in the set, return false. |
| 188 | * |
| 189 | * <p> |
| 190 | * If <tt>isString() == true</tt>, the value is a |
| 191 | * string, otherwise the value is a |
| 192 | * single code point. Elements of either type can be retrieved |
| 193 | * with the function <tt>getString()</tt>, while elements of |
| 194 | * consisting of a single code point can be retrieved with |
| 195 | * <tt>getCodepoint()</tt> |
| 196 | * |
| 197 | * <p>The order of iteration is all code points in sorted order, |
| 198 | * followed by all strings sorted order. Do not mix |
| 199 | * calls to <tt>next()</tt> and <tt>nextRange()</tt> without |
| 200 | * calling <tt>reset()</tt> between them. The results of doing so |
| 201 | * are undefined. |
| 202 | * |
| 203 | * @return true if there was another element in the set. |
| 204 | * @stable ICU 2.4 |
| 205 | */ |
| 206 | UBool next(); |
| 207 | |
| 208 | /** |
| 209 | * Returns the next element in the set, either a code point range |
| 210 | * or a string. If there are no more elements in the set, return |
| 211 | * false. If <tt>isString() == true</tt>, the value is a |
| 212 | * string and can be accessed with <tt>getString()</tt>. Otherwise the value is a |
| 213 | * range of one or more code points from <tt>getCodepoint()</tt> to |
| 214 | * <tt>getCodepointeEnd()</tt> inclusive. |
| 215 | * |
| 216 | * <p>The order of iteration is all code points ranges in sorted |
| 217 | * order, followed by all strings sorted order. Ranges are |
| 218 | * disjoint and non-contiguous. The value returned from <tt>getString()</tt> |
| 219 | * is undefined unless <tt>isString() == true</tt>. Do not mix calls to |
| 220 | * <tt>next()</tt> and <tt>nextRange()</tt> without calling |
| 221 | * <tt>reset()</tt> between them. The results of doing so are |
| 222 | * undefined. |
| 223 | * |
| 224 | * @return true if there was another element in the set. |
| 225 | * @stable ICU 2.4 |
| 226 | */ |
| 227 | UBool (); |
| 228 | |
| 229 | /** |
| 230 | * Sets this iterator to visit the elements of the given set and |
| 231 | * resets it to the start of that set. The iterator is valid only |
| 232 | * so long as <tt>set</tt> is valid. |
| 233 | * @param set the set to iterate over. |
| 234 | * @stable ICU 2.4 |
| 235 | */ |
| 236 | void reset(const UnicodeSet& set); |
| 237 | |
| 238 | /** |
| 239 | * Resets this iterator to the start of the set. |
| 240 | * @stable ICU 2.4 |
| 241 | */ |
| 242 | void reset(); |
| 243 | |
| 244 | /** |
| 245 | * ICU "poor man's RTTI", returns a UClassID for this class. |
| 246 | * |
| 247 | * @stable ICU 2.4 |
| 248 | */ |
| 249 | static UClassID U_EXPORT2 getStaticClassID(); |
| 250 | |
| 251 | /** |
| 252 | * ICU "poor man's RTTI", returns a UClassID for the actual class. |
| 253 | * |
| 254 | * @stable ICU 2.4 |
| 255 | */ |
| 256 | virtual UClassID getDynamicClassID() const override; |
| 257 | |
| 258 | // ======================= PRIVATES =========================== |
| 259 | |
| 260 | private: |
| 261 | |
| 262 | // endElement and nextElements are really UChar32's, but we keep |
| 263 | // them as signed int32_t's so we can do comparisons with |
| 264 | // endElement set to -1. Leave them as int32_t's. |
| 265 | /** The set |
| 266 | */ |
| 267 | const UnicodeSet* set; |
| 268 | /** End range |
| 269 | */ |
| 270 | int32_t endRange; |
| 271 | /** Range |
| 272 | */ |
| 273 | int32_t range; |
| 274 | /** End element |
| 275 | */ |
| 276 | int32_t endElement; |
| 277 | /** Next element |
| 278 | */ |
| 279 | int32_t nextElement; |
| 280 | /** Next string |
| 281 | */ |
| 282 | int32_t nextString; |
| 283 | /** String count |
| 284 | */ |
| 285 | int32_t stringCount; |
| 286 | |
| 287 | /** |
| 288 | * Points to the string to use when the caller asks for a |
| 289 | * string and the current iteration item is a code point, not a string. |
| 290 | */ |
| 291 | UnicodeString *cpString; |
| 292 | |
| 293 | /** Copy constructor. Disallowed. |
| 294 | */ |
| 295 | UnicodeSetIterator(const UnicodeSetIterator&) = delete; |
| 296 | |
| 297 | /** Assignment operator. Disallowed. |
| 298 | */ |
| 299 | UnicodeSetIterator& operator=(const UnicodeSetIterator&) = delete; |
| 300 | |
| 301 | /** Load range |
| 302 | */ |
| 303 | void loadRange(int32_t range); |
| 304 | }; |
| 305 | |
| 306 | inline UBool UnicodeSetIterator::isString() const { |
| 307 | return codepoint < 0; |
| 308 | } |
| 309 | |
| 310 | inline UChar32 UnicodeSetIterator::getCodepoint() const { |
| 311 | return codepoint; |
| 312 | } |
| 313 | |
| 314 | inline UChar32 UnicodeSetIterator::getCodepointEnd() const { |
| 315 | return codepointEnd; |
| 316 | } |
| 317 | |
| 318 | |
| 319 | U_NAMESPACE_END |
| 320 | |
| 321 | #endif /* U_SHOW_CPLUSPLUS_API */ |
| 322 | |
| 323 | #endif |
| 324 | |