1 | /* |
2 | * Copyright © 1991-2015 Unicode, Inc. All rights reserved. |
3 | * Distributed under the Terms of Use in |
4 | * http://www.unicode.org/copyright.html. |
5 | * |
6 | * Permission is hereby granted, free of charge, to any person obtaining |
7 | * a copy of the Unicode data files and any associated documentation |
8 | * (the "Data Files") or Unicode software and any associated documentation |
9 | * (the "Software") to deal in the Data Files or Software |
10 | * without restriction, including without limitation the rights to use, |
11 | * copy, modify, merge, publish, distribute, and/or sell copies of |
12 | * the Data Files or Software, and to permit persons to whom the Data Files |
13 | * or Software are furnished to do so, provided that |
14 | * (a) this copyright and permission notice appear with all copies |
15 | * of the Data Files or Software, |
16 | * (b) this copyright and permission notice appear in associated |
17 | * documentation, and |
18 | * (c) there is clear notice in each modified Data File or in the Software |
19 | * as well as in the documentation associated with the Data File(s) or |
20 | * Software that the data or software has been modified. |
21 | * |
22 | * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF |
23 | * ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE |
24 | * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
25 | * NONINFRINGEMENT OF THIRD PARTY RIGHTS. |
26 | * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS |
27 | * NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL |
28 | * DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, |
29 | * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER |
30 | * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR |
31 | * PERFORMANCE OF THE DATA FILES OR SOFTWARE. |
32 | * |
33 | * Except as contained in this notice, the name of a copyright holder |
34 | * shall not be used in advertising or otherwise to promote the sale, |
35 | * use or other dealings in these Data Files or Software without prior |
36 | * written authorization of the copyright holder. |
37 | */ |
38 | |
39 | #ifndef COMMON_CONVERT_UTF_H_ |
40 | #define COMMON_CONVERT_UTF_H_ |
41 | |
42 | /* --------------------------------------------------------------------- |
43 | |
44 | Conversions between UTF-32, UTF-16, and UTF-8. Header file. |
45 | |
46 | Several functions are included here, forming a complete set of |
47 | conversions between the three formats. UTF-7 is not included |
48 | here, but is handled in a separate source file. |
49 | |
50 | Each of these routines takes pointers to input buffers and output |
51 | buffers. The input buffers are const. |
52 | |
53 | Each routine converts the text between *sourceStart and sourceEnd, |
54 | putting the result into the buffer between *targetStart and |
55 | targetEnd. Note: the end pointers are *after* the last item: e.g. |
56 | *(sourceEnd - 1) is the last item. |
57 | |
58 | The return result indicates whether the conversion was successful, |
59 | and if not, whether the problem was in the source or target buffers. |
60 | (Only the first encountered problem is indicated.) |
61 | |
62 | After the conversion, *sourceStart and *targetStart are both |
63 | updated to point to the end of the last text successfully converted in |
64 | the respective buffers. |
65 | |
66 | Input parameters: |
67 | sourceStart - pointer to a pointer to the source buffer. |
68 | The contents of this are modified on return so that |
69 | it points at the next thing to be converted. |
70 | targetStart - similarly, pointer to pointer to the target buffer. |
71 | sourceEnd, targetEnd - respectively pointers to the ends of the |
72 | two buffers, for overflow checking only. |
73 | |
74 | These conversion functions take a ConversionFlags argument. When this |
75 | flag is set to strict, both irregular sequences and isolated surrogates |
76 | will cause an error. When the flag is set to lenient, both irregular |
77 | sequences and isolated surrogates are converted. |
78 | |
79 | Whether the flag is strict or lenient, all illegal sequences will cause |
80 | an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>, |
81 | or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code |
82 | must check for illegal sequences. |
83 | |
84 | When the flag is set to lenient, characters over 0x10FFFF are converted |
85 | to the replacement character; otherwise (when the flag is set to strict) |
86 | they constitute an error. |
87 | |
88 | Output parameters: |
89 | The value "sourceIllegal" is returned from some routines if the input |
90 | sequence is malformed. When "sourceIllegal" is returned, the source |
91 | value will point to the illegal value that caused the problem. E.g., |
92 | in UTF-8 when a sequence is malformed, it points to the start of the |
93 | malformed sequence. |
94 | |
95 | Author: Mark E. Davis, 1994. |
96 | Rev History: Rick McGowan, fixes & updates May 2001. |
97 | Fixes & updates, Sept 2001. |
98 | |
99 | ------------------------------------------------------------------------ */ |
100 | |
101 | /* --------------------------------------------------------------------- |
102 | The following 4 definitions are compiler-specific. |
103 | The C standard does not guarantee that wchar_t has at least |
104 | 16 bits, so wchar_t is no less portable than unsigned short! |
105 | All should be unsigned values to avoid sign extension during |
106 | bit mask & shift operations. |
107 | ------------------------------------------------------------------------ */ |
108 | |
namespace google_breakpad {

/* Code-unit types shared by every conversion routine declared below.
   All of them are unsigned so that bit-mask and shift operations never
   sign-extend. */
/* NOTE(review): unsigned long is wider than 32 bits on LP64 targets, so a
   UTF32 buffer is not necessarily an array of exactly-32-bit units. */
typedef unsigned long UTF32;    /* at least 32 bits */
typedef unsigned short UTF16;   /* at least 16 bits */
typedef unsigned char UTF8;     /* typically 8 bits */
typedef unsigned char Boolean;  /* 0 or 1 */

/* Some fundamental constants.
   Macros are not scoped by the enclosing namespace, so every expansion
   names the type as ::google_breakpad::UTF32 (which resolves from any
   scope) and is fully parenthesized so that it always acts as a single
   operand, e.g. as the operand of sizeof. */
#define UNI_REPLACEMENT_CHAR ((::google_breakpad::UTF32)0x0000FFFD)
#define UNI_MAX_BMP ((::google_breakpad::UTF32)0x0000FFFF)
#define UNI_MAX_UTF16 ((::google_breakpad::UTF32)0x0010FFFF)
#define UNI_MAX_UTF32 ((::google_breakpad::UTF32)0x7FFFFFFF)
#define UNI_MAX_LEGAL_UTF32 ((::google_breakpad::UTF32)0x0010FFFF)

/* Outcome of a conversion call.  Only the first problem encountered is
   reported. */
typedef enum {
  conversionOK,     /* conversion successful */
  sourceExhausted,  /* partial character in source, but hit end */
  targetExhausted,  /* insuff. room in target for conversion */
  sourceIllegal     /* source sequence is illegal/malformed */
} ConversionResult;

/* Selects how irregular sequences and isolated surrogates are treated:
   with strictConversion they cause an error, with lenientConversion they
   are converted. */
typedef enum {
  strictConversion = 0,
  lenientConversion
} ConversionFlags;

/* Common contract of the six Convert* routines:
     - the text between *sourceStart and sourceEnd is converted and the
       result is stored between *targetStart and targetEnd;
     - both end pointers are one past the last item and are used for
       overflow checking only;
     - on return *sourceStart and *targetStart have been advanced past the
       last text that was successfully converted;
     - the return value tells whether the conversion succeeded and, if not,
       whether the problem lay in the source or in the target buffer. */

/* UTF-8 -> UTF-16. */
ConversionResult ConvertUTF8toUTF16(const UTF8** sourceStart,
                                    const UTF8* sourceEnd,
                                    UTF16** targetStart,
                                    UTF16* targetEnd,
                                    ConversionFlags flags);

/* UTF-16 -> UTF-8. */
ConversionResult ConvertUTF16toUTF8(const UTF16** sourceStart,
                                    const UTF16* sourceEnd,
                                    UTF8** targetStart,
                                    UTF8* targetEnd,
                                    ConversionFlags flags);

/* UTF-8 -> UTF-32. */
ConversionResult ConvertUTF8toUTF32(const UTF8** sourceStart,
                                    const UTF8* sourceEnd,
                                    UTF32** targetStart,
                                    UTF32* targetEnd,
                                    ConversionFlags flags);

/* UTF-32 -> UTF-8. */
ConversionResult ConvertUTF32toUTF8(const UTF32** sourceStart,
                                    const UTF32* sourceEnd,
                                    UTF8** targetStart,
                                    UTF8* targetEnd,
                                    ConversionFlags flags);

/* UTF-16 -> UTF-32. */
ConversionResult ConvertUTF16toUTF32(const UTF16** sourceStart,
                                     const UTF16* sourceEnd,
                                     UTF32** targetStart,
                                     UTF32* targetEnd,
                                     ConversionFlags flags);

/* UTF-32 -> UTF-16. */
ConversionResult ConvertUTF32toUTF16(const UTF32** sourceStart,
                                     const UTF32* sourceEnd,
                                     UTF16** targetStart,
                                     UTF16* targetEnd,
                                     ConversionFlags flags);

/* NOTE(review): not described in this header; presumably reports (as 0 or 1)
   whether the bytes starting at source, bounded by sourceEnd, form a legal
   UTF-8 sequence -- confirm against the implementation. */
Boolean isLegalUTF8Sequence(const UTF8* source, const UTF8* sourceEnd);

}  // namespace google_breakpad
156 | |
157 | /* --------------------------------------------------------------------- */ |
158 | |
159 | #endif // COMMON_CONVERT_UTF_H_ |
160 | |