/*
 * Copyright (C) 2012 Grigori Goronzy <greg@kinoho.net>
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#ifndef UCDN_H
#define UCDN_H

/* uint32_t is used by every codepoint-taking prototype below. */
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

/* East Asian width values, as returned by ucdn_get_east_asian_width()
 * (see UAX#11). */
#define UCDN_EAST_ASIAN_F 0
#define UCDN_EAST_ASIAN_H 1
#define UCDN_EAST_ASIAN_W 2
#define UCDN_EAST_ASIAN_NA 3
#define UCDN_EAST_ASIAN_A 4
#define UCDN_EAST_ASIAN_N 5

/* Script values, as returned by ucdn_get_script() (see UAX#24). */
#define UCDN_SCRIPT_COMMON 0
#define UCDN_SCRIPT_LATIN 1
#define UCDN_SCRIPT_GREEK 2
#define UCDN_SCRIPT_CYRILLIC 3
#define UCDN_SCRIPT_ARMENIAN 4
#define UCDN_SCRIPT_HEBREW 5
#define UCDN_SCRIPT_ARABIC 6
#define UCDN_SCRIPT_SYRIAC 7
#define UCDN_SCRIPT_THAANA 8
#define UCDN_SCRIPT_DEVANAGARI 9
#define UCDN_SCRIPT_BENGALI 10
#define UCDN_SCRIPT_GURMUKHI 11
#define UCDN_SCRIPT_GUJARATI 12
#define UCDN_SCRIPT_ORIYA 13
#define UCDN_SCRIPT_TAMIL 14
#define UCDN_SCRIPT_TELUGU 15
#define UCDN_SCRIPT_KANNADA 16
#define UCDN_SCRIPT_MALAYALAM 17
#define UCDN_SCRIPT_SINHALA 18
#define UCDN_SCRIPT_THAI 19
#define UCDN_SCRIPT_LAO 20
#define UCDN_SCRIPT_TIBETAN 21
#define UCDN_SCRIPT_MYANMAR 22
#define UCDN_SCRIPT_GEORGIAN 23
#define UCDN_SCRIPT_HANGUL 24
#define UCDN_SCRIPT_ETHIOPIC 25
#define UCDN_SCRIPT_CHEROKEE 26
#define UCDN_SCRIPT_CANADIAN_ABORIGINAL 27
#define UCDN_SCRIPT_OGHAM 28
#define UCDN_SCRIPT_RUNIC 29
#define UCDN_SCRIPT_KHMER 30
#define UCDN_SCRIPT_MONGOLIAN 31
#define UCDN_SCRIPT_HIRAGANA 32
#define UCDN_SCRIPT_KATAKANA 33
#define UCDN_SCRIPT_BOPOMOFO 34
#define UCDN_SCRIPT_HAN 35
#define UCDN_SCRIPT_YI 36
#define UCDN_SCRIPT_OLD_ITALIC 37
#define UCDN_SCRIPT_GOTHIC 38
#define UCDN_SCRIPT_DESERET 39
#define UCDN_SCRIPT_INHERITED 40
#define UCDN_SCRIPT_TAGALOG 41
#define UCDN_SCRIPT_HANUNOO 42
#define UCDN_SCRIPT_BUHID 43
#define UCDN_SCRIPT_TAGBANWA 44
#define UCDN_SCRIPT_LIMBU 45
#define UCDN_SCRIPT_TAI_LE 46
#define UCDN_SCRIPT_LINEAR_B 47
#define UCDN_SCRIPT_UGARITIC 48
#define UCDN_SCRIPT_SHAVIAN 49
#define UCDN_SCRIPT_OSMANYA 50
#define UCDN_SCRIPT_CYPRIOT 51
#define UCDN_SCRIPT_BRAILLE 52
#define UCDN_SCRIPT_BUGINESE 53
#define UCDN_SCRIPT_COPTIC 54
#define UCDN_SCRIPT_NEW_TAI_LUE 55
#define UCDN_SCRIPT_GLAGOLITIC 56
#define UCDN_SCRIPT_TIFINAGH 57
#define UCDN_SCRIPT_SYLOTI_NAGRI 58
#define UCDN_SCRIPT_OLD_PERSIAN 59
#define UCDN_SCRIPT_KHAROSHTHI 60
#define UCDN_SCRIPT_BALINESE 61
#define UCDN_SCRIPT_CUNEIFORM 62
#define UCDN_SCRIPT_PHOENICIAN 63
#define UCDN_SCRIPT_PHAGS_PA 64
#define UCDN_SCRIPT_NKO 65
#define UCDN_SCRIPT_SUNDANESE 66
#define UCDN_SCRIPT_LEPCHA 67
#define UCDN_SCRIPT_OL_CHIKI 68
#define UCDN_SCRIPT_VAI 69
#define UCDN_SCRIPT_SAURASHTRA 70
#define UCDN_SCRIPT_KAYAH_LI 71
#define UCDN_SCRIPT_REJANG 72
#define UCDN_SCRIPT_LYCIAN 73
#define UCDN_SCRIPT_CARIAN 74
#define UCDN_SCRIPT_LYDIAN 75
#define UCDN_SCRIPT_CHAM 76
#define UCDN_SCRIPT_TAI_THAM 77
#define UCDN_SCRIPT_TAI_VIET 78
#define UCDN_SCRIPT_AVESTAN 79
#define UCDN_SCRIPT_EGYPTIAN_HIEROGLYPHS 80
#define UCDN_SCRIPT_SAMARITAN 81
#define UCDN_SCRIPT_LISU 82
#define UCDN_SCRIPT_BAMUM 83
#define UCDN_SCRIPT_JAVANESE 84
#define UCDN_SCRIPT_MEETEI_MAYEK 85
#define UCDN_SCRIPT_IMPERIAL_ARAMAIC 86
#define UCDN_SCRIPT_OLD_SOUTH_ARABIAN 87
#define UCDN_SCRIPT_INSCRIPTIONAL_PARTHIAN 88
#define UCDN_SCRIPT_INSCRIPTIONAL_PAHLAVI 89
#define UCDN_SCRIPT_OLD_TURKIC 90
#define UCDN_SCRIPT_KAITHI 91
#define UCDN_SCRIPT_BATAK 92
#define UCDN_SCRIPT_BRAHMI 93
#define UCDN_SCRIPT_MANDAIC 94
#define UCDN_SCRIPT_CHAKMA 95
#define UCDN_SCRIPT_MEROITIC_CURSIVE 96
#define UCDN_SCRIPT_MEROITIC_HIEROGLYPHS 97
#define UCDN_SCRIPT_MIAO 98
#define UCDN_SCRIPT_SHARADA 99
#define UCDN_SCRIPT_SORA_SOMPENG 100
#define UCDN_SCRIPT_TAKRI 101
#define UCDN_SCRIPT_UNKNOWN 102
#define UCDN_SCRIPT_BASSA_VAH 103
#define UCDN_SCRIPT_CAUCASIAN_ALBANIAN 104
#define UCDN_SCRIPT_DUPLOYAN 105
#define UCDN_SCRIPT_ELBASAN 106
#define UCDN_SCRIPT_GRANTHA 107
#define UCDN_SCRIPT_KHOJKI 108
#define UCDN_SCRIPT_KHUDAWADI 109
#define UCDN_SCRIPT_LINEAR_A 110
#define UCDN_SCRIPT_MAHAJANI 111
#define UCDN_SCRIPT_MANICHAEAN 112
#define UCDN_SCRIPT_MENDE_KIKAKUI 113
#define UCDN_SCRIPT_MODI 114
#define UCDN_SCRIPT_MRO 115
#define UCDN_SCRIPT_NABATAEAN 116
#define UCDN_SCRIPT_OLD_NORTH_ARABIAN 117
#define UCDN_SCRIPT_OLD_PERMIC 118
#define UCDN_SCRIPT_PAHAWH_HMONG 119
#define UCDN_SCRIPT_PALMYRENE 120
#define UCDN_SCRIPT_PAU_CIN_HAU 121
#define UCDN_SCRIPT_PSALTER_PAHLAVI 122
#define UCDN_SCRIPT_SIDDHAM 123
#define UCDN_SCRIPT_TIRHUTA 124
#define UCDN_SCRIPT_WARANG_CITI 125
#define UCDN_SCRIPT_AHOM 126
#define UCDN_SCRIPT_ANATOLIAN_HIEROGLYPHS 127
#define UCDN_SCRIPT_HATRAN 128
#define UCDN_SCRIPT_MULTANI 129
#define UCDN_SCRIPT_OLD_HUNGARIAN 130
#define UCDN_SCRIPT_SIGNWRITING 131
#define UCDN_SCRIPT_ADLAM 132
#define UCDN_SCRIPT_BHAIKSUKI 133
#define UCDN_SCRIPT_MARCHEN 134
#define UCDN_SCRIPT_NEWA 135
#define UCDN_SCRIPT_OSAGE 136
#define UCDN_SCRIPT_TANGUT 137
#define UCDN_SCRIPT_MASARAM_GONDI 138
#define UCDN_SCRIPT_NUSHU 139
#define UCDN_SCRIPT_SOYOMBO 140
#define UCDN_SCRIPT_ZANABAZAR_SQUARE 141
#define UCDN_SCRIPT_DOGRA 142
#define UCDN_SCRIPT_GUNJALA_GONDI 143
#define UCDN_SCRIPT_HANIFI_ROHINGYA 144
#define UCDN_SCRIPT_MAKASAR 145
#define UCDN_SCRIPT_MEDEFAIDRIN 146
#define UCDN_SCRIPT_OLD_SOGDIAN 147
#define UCDN_SCRIPT_SOGDIAN 148
#define UCDN_SCRIPT_ELYMAIC 149
#define UCDN_SCRIPT_NANDINAGARI 150
#define UCDN_SCRIPT_NYIAKENG_PUACHUE_HMONG 151
#define UCDN_SCRIPT_WANCHO 152
/* Highest UCDN_SCRIPT_* value defined above; keep in sync when adding
 * scripts. */
#define UCDN_LAST_SCRIPT 152

/* Line break classes, as returned by ucdn_get_linebreak_class() and
 * ucdn_get_resolved_linebreak_class() (see UAX#14). */
#define UCDN_LINEBREAK_CLASS_OP 0
#define UCDN_LINEBREAK_CLASS_CL 1
#define UCDN_LINEBREAK_CLASS_CP 2
#define UCDN_LINEBREAK_CLASS_QU 3
#define UCDN_LINEBREAK_CLASS_GL 4
#define UCDN_LINEBREAK_CLASS_NS 5
#define UCDN_LINEBREAK_CLASS_EX 6
#define UCDN_LINEBREAK_CLASS_SY 7
#define UCDN_LINEBREAK_CLASS_IS 8
#define UCDN_LINEBREAK_CLASS_PR 9
#define UCDN_LINEBREAK_CLASS_PO 10
#define UCDN_LINEBREAK_CLASS_NU 11
#define UCDN_LINEBREAK_CLASS_AL 12
#define UCDN_LINEBREAK_CLASS_HL 13
#define UCDN_LINEBREAK_CLASS_ID 14
#define UCDN_LINEBREAK_CLASS_IN 15
#define UCDN_LINEBREAK_CLASS_HY 16
#define UCDN_LINEBREAK_CLASS_BA 17
#define UCDN_LINEBREAK_CLASS_BB 18
#define UCDN_LINEBREAK_CLASS_B2 19
#define UCDN_LINEBREAK_CLASS_ZW 20
#define UCDN_LINEBREAK_CLASS_CM 21
#define UCDN_LINEBREAK_CLASS_WJ 22
#define UCDN_LINEBREAK_CLASS_H2 23
#define UCDN_LINEBREAK_CLASS_H3 24
#define UCDN_LINEBREAK_CLASS_JL 25
#define UCDN_LINEBREAK_CLASS_JV 26
#define UCDN_LINEBREAK_CLASS_JT 27
#define UCDN_LINEBREAK_CLASS_RI 28
#define UCDN_LINEBREAK_CLASS_AI 29
#define UCDN_LINEBREAK_CLASS_BK 30
#define UCDN_LINEBREAK_CLASS_CB 31
#define UCDN_LINEBREAK_CLASS_CJ 32
#define UCDN_LINEBREAK_CLASS_CR 33
#define UCDN_LINEBREAK_CLASS_LF 34
#define UCDN_LINEBREAK_CLASS_NL 35
#define UCDN_LINEBREAK_CLASS_SA 36
#define UCDN_LINEBREAK_CLASS_SG 37
#define UCDN_LINEBREAK_CLASS_SP 38
#define UCDN_LINEBREAK_CLASS_XX 39
#define UCDN_LINEBREAK_CLASS_ZWJ 40
#define UCDN_LINEBREAK_CLASS_EB 41
#define UCDN_LINEBREAK_CLASS_EM 42

/* General categories, as returned by ucdn_get_general_category()
 * (see UAX#44). */
#define UCDN_GENERAL_CATEGORY_CC 0
#define UCDN_GENERAL_CATEGORY_CF 1
#define UCDN_GENERAL_CATEGORY_CN 2
#define UCDN_GENERAL_CATEGORY_CO 3
#define UCDN_GENERAL_CATEGORY_CS 4
#define UCDN_GENERAL_CATEGORY_LL 5
#define UCDN_GENERAL_CATEGORY_LM 6
#define UCDN_GENERAL_CATEGORY_LO 7
#define UCDN_GENERAL_CATEGORY_LT 8
#define UCDN_GENERAL_CATEGORY_LU 9
#define UCDN_GENERAL_CATEGORY_MC 10
#define UCDN_GENERAL_CATEGORY_ME 11
#define UCDN_GENERAL_CATEGORY_MN 12
#define UCDN_GENERAL_CATEGORY_ND 13
#define UCDN_GENERAL_CATEGORY_NL 14
#define UCDN_GENERAL_CATEGORY_NO 15
#define UCDN_GENERAL_CATEGORY_PC 16
#define UCDN_GENERAL_CATEGORY_PD 17
#define UCDN_GENERAL_CATEGORY_PE 18
#define UCDN_GENERAL_CATEGORY_PF 19
#define UCDN_GENERAL_CATEGORY_PI 20
#define UCDN_GENERAL_CATEGORY_PO 21
#define UCDN_GENERAL_CATEGORY_PS 22
#define UCDN_GENERAL_CATEGORY_SC 23
#define UCDN_GENERAL_CATEGORY_SK 24
#define UCDN_GENERAL_CATEGORY_SM 25
#define UCDN_GENERAL_CATEGORY_SO 26
#define UCDN_GENERAL_CATEGORY_ZL 27
#define UCDN_GENERAL_CATEGORY_ZP 28
#define UCDN_GENERAL_CATEGORY_ZS 29

/* Bidirectional classes, as returned by ucdn_get_bidi_class()
 * (see UAX#44). */
#define UCDN_BIDI_CLASS_L 0
#define UCDN_BIDI_CLASS_LRE 1
#define UCDN_BIDI_CLASS_LRO 2
#define UCDN_BIDI_CLASS_R 3
#define UCDN_BIDI_CLASS_AL 4
#define UCDN_BIDI_CLASS_RLE 5
#define UCDN_BIDI_CLASS_RLO 6
#define UCDN_BIDI_CLASS_PDF 7
#define UCDN_BIDI_CLASS_EN 8
#define UCDN_BIDI_CLASS_ES 9
#define UCDN_BIDI_CLASS_ET 10
#define UCDN_BIDI_CLASS_AN 11
#define UCDN_BIDI_CLASS_CS 12
#define UCDN_BIDI_CLASS_NSM 13
#define UCDN_BIDI_CLASS_BN 14
#define UCDN_BIDI_CLASS_B 15
#define UCDN_BIDI_CLASS_S 16
#define UCDN_BIDI_CLASS_WS 17
#define UCDN_BIDI_CLASS_ON 18
#define UCDN_BIDI_CLASS_LRI 19
#define UCDN_BIDI_CLASS_RLI 20
#define UCDN_BIDI_CLASS_FSI 21
#define UCDN_BIDI_CLASS_PDI 22

/* Paired bracket types, as returned by ucdn_paired_bracket_type()
 * (see UAX#9). */
#define UCDN_BIDI_PAIRED_BRACKET_TYPE_OPEN 0
#define UCDN_BIDI_PAIRED_BRACKET_TYPE_CLOSE 1
#define UCDN_BIDI_PAIRED_BRACKET_TYPE_NONE 2

/**
 * Return version of the Unicode database.
 *
 * @return Unicode database version
 */
const char *ucdn_get_unicode_version(void);

/**
 * Get combining class of a codepoint.
 *
 * @param code Unicode codepoint
 * @return combining class value, as defined in UAX#44
 */
int ucdn_get_combining_class(uint32_t code);

/**
 * Get east-asian width of a codepoint.
 *
 * @param code Unicode codepoint
 * @return value according to UCDN_EAST_ASIAN_* and as defined in UAX#11.
 */
int ucdn_get_east_asian_width(uint32_t code);

/**
 * Get general category of a codepoint.
 *
 * @param code Unicode codepoint
 * @return value according to UCDN_GENERAL_CATEGORY_* and as defined in
 * UAX#44.
 */
int ucdn_get_general_category(uint32_t code);

/**
 * Get bidirectional class of a codepoint.
 *
 * @param code Unicode codepoint
 * @return value according to UCDN_BIDI_CLASS_* and as defined in UAX#44.
 */
int ucdn_get_bidi_class(uint32_t code);

/**
 * Get script of a codepoint.
 *
 * @param code Unicode codepoint
 * @return value according to UCDN_SCRIPT_* and as defined in UAX#24.
 */
int ucdn_get_script(uint32_t code);

/**
 * Get unresolved linebreak class of a codepoint. This does not take
 * rule LB1 of UAX#14 into account. See ucdn_get_resolved_linebreak_class()
 * for resolved linebreak classes.
 *
 * @param code Unicode codepoint
 * @return value according to UCDN_LINEBREAK_CLASS_* and as defined in
 * UAX#14.
 */
int ucdn_get_linebreak_class(uint32_t code);

/**
 * Get resolved linebreak class of a codepoint. This resolves characters
 * in the AI, SG, XX, SA and CJ classes according to rule LB1 of UAX#14.
 * In addition the CB class is resolved as the equivalent B2 class and
 * the NL class is resolved as the equivalent BK class.
 *
 * @param code Unicode codepoint
 * @return value according to UCDN_LINEBREAK_CLASS_* and as defined in
 * UAX#14.
 */
int ucdn_get_resolved_linebreak_class(uint32_t code);

/**
 * Check if codepoint can be mirrored.
 *
 * @param code Unicode codepoint
 * @return 1 if mirrored character exists, otherwise 0
 */
int ucdn_get_mirrored(uint32_t code);

/**
 * Mirror a codepoint.
 *
 * @param code Unicode codepoint
 * @return mirrored codepoint or the original codepoint if no
 * mirrored character exists
 */
uint32_t ucdn_mirror(uint32_t code);

/**
 * Get paired bracket for a codepoint.
 *
 * @param code Unicode codepoint
 * @return paired bracket codepoint or the original codepoint if no
 * paired bracket character exists
 */
uint32_t ucdn_paired_bracket(uint32_t code);

/**
 * Get paired bracket type for a codepoint.
 *
 * @param code Unicode codepoint
 * @return value according to UCDN_BIDI_PAIRED_BRACKET_TYPE_* and as defined
 * in UAX#9.
 */
int ucdn_paired_bracket_type(uint32_t code);

/**
 * Pairwise canonical decomposition of a codepoint. This includes
 * Hangul Jamo decomposition (see chapter 3.12 of the Unicode core
 * specification).
 *
 * Hangul is decomposed into L and V jamos for LV forms, and an
 * LV precomposed syllable and a T jamo for LVT forms.
 *
 * @param code Unicode codepoint
 * @param a filled with first codepoint of decomposition
 * @param b filled with second codepoint of decomposition, or 0
 * @return success
 */
int ucdn_decompose(uint32_t code, uint32_t *a, uint32_t *b);

/**
 * Compatibility decomposition of a codepoint.
 *
 * @param code Unicode codepoint
 * @param decomposed filled with decomposition, must be able to hold 18
 * characters
 * @return length of decomposition or 0 in case none exists
 */
int ucdn_compat_decompose(uint32_t code, uint32_t *decomposed);

/**
 * Pairwise canonical composition of two codepoints. This includes
 * Hangul Jamo composition (see chapter 3.12 of the Unicode core
 * specification).
 *
 * Hangul composition expects either L and V jamos, or an LV
 * precomposed syllable and a T jamo. This is exactly the inverse
 * of pairwise Hangul decomposition.
 *
 * @param code filled with composition
 * @param a first codepoint
 * @param b second codepoint
 * @return success
 */
int ucdn_compose(uint32_t *code, uint32_t a, uint32_t b);

#ifdef __cplusplus
}
#endif

#endif /* UCDN_H */