1//
2// Unicode.h
3//
4// Library: Foundation
5// Package: Text
6// Module: Unicode
7//
8// Definition of the Unicode class.
9//
10// Copyright (c) 2007, Applied Informatics Software Engineering GmbH.
11// and Contributors.
12//
13// SPDX-License-Identifier: BSL-1.0
14//
15
16
17#ifndef Foundation_Unicode_INCLUDED
18#define Foundation_Unicode_INCLUDED
19
20
21#include "Poco/Foundation.h"
22
23
24namespace Poco {
25
26
27class Foundation_API Unicode
28 /// This class contains enumerations and static
29 /// utility functions for dealing with Unicode characters
30 /// and their properties.
31 ///
32 /// For more information on Unicode, see <http://www.unicode.org>.
33 ///
34 /// The implementation is based on the Unicode support
35 /// functions in PCRE.
36{
37public:
38 // Implementation note: the following definitions must be kept
39 // in sync with those from ucp.h (PCRE).
40 enum CharacterCategory
41 /// Unicode character categories.
42 {
43 UCP_OTHER,
44 UCP_LETTER,
45 UCP_MARK,
46 UCP_NUMBER,
47 UCP_PUNCTUATION,
48 UCP_SYMBOL,
49 UCP_SEPARATOR
50 };
51
52 enum CharacterType
53 /// Unicode character types.
54 {
55 UCP_CONTROL,
56 UCP_FORMAT,
57 UCP_UNASSIGNED,
58 UCP_PRIVATE_USE,
59 UCP_SURROGATE,
60 UCP_LOWER_CASE_LETTER,
61 UCP_MODIFIER_LETTER,
62 UCP_OTHER_LETTER,
63 UCP_TITLE_CASE_LETTER,
64 UCP_UPPER_CASE_LETTER,
65 UCP_SPACING_MARK,
66 UCP_ENCLOSING_MARK,
67 UCP_NON_SPACING_MARK,
68 UCP_DECIMAL_NUMBER,
69 UCP_LETTER_NUMBER,
70 UCP_OTHER_NUMBER,
71 UCP_CONNECTOR_PUNCTUATION,
72 UCP_DASH_PUNCTUATION,
73 UCP_CLOSE_PUNCTUATION,
74 UCP_FINAL_PUNCTUATION,
75 UCP_INITIAL_PUNCTUATION,
76 UCP_OTHER_PUNCTUATION,
77 UCP_OPEN_PUNCTUATION,
78 UCP_CURRENCY_SYMBOL,
79 UCP_MODIFIER_SYMBOL,
80 UCP_MATHEMATICAL_SYMBOL,
81 UCP_OTHER_SYMBOL,
82 UCP_LINE_SEPARATOR,
83 UCP_PARAGRAPH_SEPARATOR,
84 UCP_SPACE_SEPARATOR
85 };
86
87 enum Script
88 /// Unicode 7.0 script identifiers.
89 {
90 UCP_ARABIC,
91 UCP_ARMENIAN,
92 UCP_BENGALI,
93 UCP_BOPOMOFO,
94 UCP_BRAILLE,
95 UCP_BUGINESE,
96 UCP_BUHID,
97 UCP_CANADIAN_ABORIGINAL,
98 UCP_CHEROKEE,
99 UCP_COMMON,
100 UCP_COPTIC,
101 UCP_CYPRIOT,
102 UCP_CYRILLIC,
103 UCP_DESERET,
104 UCP_DEVANAGARI,
105 UCP_ETHIOPIC,
106 UCP_GEORGIAN,
107 UCP_GLAGOLITIC,
108 UCP_GOTHIC,
109 UCP_GREEK,
110 UCP_GUJARATI,
111 UCP_GURMUKHI,
112 UCP_HAN,
113 UCP_HANGUL,
114 UCP_HANUNOO,
115 UCP_HEBREW,
116 UCP_HIRAGANA,
117 UCP_INHERITED,
118 UCP_KANNADA,
119 UCP_KATAKANA,
120 UCP_KHAROSHTHI,
121 UCP_KHMER,
122 UCP_LAO,
123 UCP_LATIN,
124 UCP_LIMBU,
125 UCP_LINEAR_B,
126 UCP_MALAYALAM,
127 UCP_MONGOLIAN,
128 UCP_MYANMAR,
129 UCP_NEW_TAI_LUE,
130 UCP_OGHAM,
131 UCP_OLD_ITALIC,
132 UCP_OLD_PERSIAN,
133 UCP_ORIYA,
134 UCP_OSMANYA,
135 UCP_RUNIC,
136 UCP_SHAVIAN,
137 UCP_SINHALA,
138 UCP_SYLOTI_NAGRI,
139 UCP_SYRIAC,
140 UCP_TAGALOG,
141 UCP_TAGBANWA,
142 UCP_TAI_LE,
143 UCP_TAMIL,
144 UCP_TELUGU,
145 UCP_THAANA,
146 UCP_THAI,
147 UCP_TIBETAN,
148 UCP_TIFINAGH,
149 UCP_UGARITIC,
150 UCP_YI,
151 // Unicode 5.0
152 UCP_BALINESE,
153 UCP_CUNEIFORM,
154 UCP_NKO,
155 UCP_PHAGS_PA,
156 UCP_PHOENICIAN,
157 // Unicode 5.1
158 UCP_CARIAN,
159 UCP_CHAM,
160 UCP_KAYAH_LI,
161 UCP_LEPCHA,
162 UCP_LYCIAN,
163 UCP_LYDIAN,
164 UCP_OL_CHIKI,
165 UCP_REJANG,
166 UCP_SAURASHTRA,
167 UCP_SUNDANESE,
168 UCP_VAI,
169 // Unicode 5.2
170 UCP_AVESTAN,
171 UCP_BAMUM,
172 UCP_EGYPTIAN_HIEROGLYPHS,
173 UCP_IMPERIAL_ARAMAIC,
174 UCP_INSCRIPTIONAL_PAHLAVI,
175 UCP_INSCRIPTIONAL_PARTHIAN,
176 UCP_JAVANESE,
177 UCP_KAITHI,
178 UCP_LISU,
179 UCP_MEETEI_MAYEK,
180 UCP_OLD_SOUTH_ARABIAN,
181 UCP_OLD_TURKIC,
182 UCP_SAMARITAN,
183 UCP_TAI_THAM,
184 UCP_TAI_VIET,
185 // Unicode 6.0
186 UCP_BATAK,
187 UCP_BRAHMI,
188 UCP_MANDAIC,
189 // Unicode 6.1
190 UCP_CHAKMA,
191 UCP_MEROITIC_CURSIVE,
192 UCP_MEROITIC_HIEROGLYPHS,
193 UCP_MIAO,
194 UCP_SHARADA,
195 UCP_SORA_SOMPENG,
196 UCP_TAKRI,
197 // Unicode 7.0
198 UCP_BASSA_VAH,
199 UCP_CAUCASIAN_ALBANIAN,
200 UCP_DUPLOYAN,
201 UCP_ELBASAN,
202 UCP_GRANTHA,
203 UCP_KHOJKI,
204 UCP_KHUDAWADI,
205 UCP_LINEAR_A,
206 UCP_MAHAJANI,
207 UCP_MANICHAEAN,
208 UCP_MENDE_KIKAKUI,
209 UCP_MODI,
210 UCP_MRO,
211 UCP_NABATAEAN,
212 UCP_OLD_NORTH_ARABIAN,
213 UCP_OLD_PERMIC,
214 UCP_PAHAWH_HMONG,
215 UCP_PALMYRENE,
216 UCP_PSALTER_PAHLAVI,
217 UCP_PAU_CIN_HAU,
218 UCP_SIDDHAM,
219 UCP_TIRHUTA,
220 UCP_WARANG_CITI
221 };
222
223 enum
224 {
225 UCP_MAX_CODEPOINT = 0x10FFFF
226 };
227
228 struct CharacterProperties
229 /// This structure holds the character properties
230 /// of an Unicode character.
231 {
232 CharacterCategory category;
233 CharacterType type;
234 Script script;
235 };
236
237 static void properties(int ch, CharacterProperties& props);
238 /// Return the Unicode character properties for the
239 /// character with the given Unicode value.
240
241 static bool isSpace(int ch);
242 /// Returns true iff the given character is a separator.
243
244 static bool isDigit(int ch);
245 /// Returns true iff the given character is a numeric character.
246
247 static bool isPunct(int ch);
248 /// Returns true iff the given character is a punctuation character.
249
250 static bool isAlpha(int ch);
251 /// Returns true iff the given character is a letter.
252
253 static bool isLower(int ch);
254 /// Returns true iff the given character is a lowercase
255 /// character.
256
257 static bool isUpper(int ch);
258 /// Returns true iff the given character is an uppercase
259 /// character.
260
261 static int toLower(int ch);
262 /// If the given character is an uppercase character,
263 /// return its lowercase counterpart, otherwise return
264 /// the character.
265
266 static int toUpper(int ch);
267 /// If the given character is a lowercase character,
268 /// return its uppercase counterpart, otherwise return
269 /// the character.
270};
271
272
273//
274// inlines
275//
276inline bool Unicode::isSpace(int ch)
277{
278 CharacterProperties props;
279 properties(ch, props);
280 return props.category == UCP_SEPARATOR;
281}
282
283
284inline bool Unicode::isDigit(int ch)
285{
286 CharacterProperties props;
287 properties(ch, props);
288 return props.category == UCP_NUMBER;
289}
290
291
292inline bool Unicode::isPunct(int ch)
293{
294 CharacterProperties props;
295 properties(ch, props);
296 return props.category == UCP_PUNCTUATION;
297}
298
299
300inline bool Unicode::isAlpha(int ch)
301{
302 CharacterProperties props;
303 properties(ch, props);
304 return props.category == UCP_LETTER;
305}
306
307
308inline bool Unicode::isLower(int ch)
309{
310 CharacterProperties props;
311 properties(ch, props);
312 return props.category == UCP_LETTER && props.type == UCP_LOWER_CASE_LETTER;
313}
314
315
316inline bool Unicode::isUpper(int ch)
317{
318 CharacterProperties props;
319 properties(ch, props);
320 return props.category == UCP_LETTER && props.type == UCP_UPPER_CASE_LETTER;
321}
322
323
324} // namespace Poco
325
326
327#endif // Foundation_Unicode_INCLUDED
328