1 | /* |
2 | * Copyright (c) 2009-2012 Petri Lehtinen <petri@digip.org> |
3 | * |
4 | * Jansson is free software; you can redistribute it and/or modify |
5 | * it under the terms of the MIT license. See LICENSE for details. |
6 | */ |
7 | |
8 | #include <string.h> |
9 | #include "utf.h" |
10 | |
/*
 * Encode one Unicode code point as a UTF-8 byte sequence.
 *
 * codepoint: the value to encode.
 * buffer:    destination for the encoded bytes. Between 1 and 4 bytes are
 *            written and no NUL terminator is appended. No bounds checking
 *            is done here, so the caller has to supply enough room.
 * size:      receives the number of bytes written on success.
 *
 * Returns 0 on success. Returns -1 without touching buffer or *size when
 * codepoint is negative, greater than 0x10FFFF, or a UTF-16 surrogate half
 * (0xD800..0xDFFF). Surrogates are not encodable in well-formed UTF-8
 * (RFC 3629), and utf8_check_full() refuses to decode them, so emitting
 * them would produce output that this file's own validator rejects.
 */
int utf8_encode(int32_t codepoint, char *buffer, int *size)
{
    if(codepoint < 0 || codepoint > 0x10FFFF)
        return -1;

    /* UTF-16 surrogate halves have no valid UTF-8 representation */
    if(0xD800 <= codepoint && codepoint <= 0xDFFF)
        return -1;

    if(codepoint < 0x80)
    {
        /* 0xxxxxxx */
        buffer[0] = (char)codepoint;
        *size = 1;
    }
    else if(codepoint < 0x800)
    {
        /* 110xxxxx 10xxxxxx */
        buffer[0] = (char)(0xC0 + ((codepoint & 0x7C0) >> 6));
        buffer[1] = (char)(0x80 + ((codepoint & 0x03F)));
        *size = 2;
    }
    else if(codepoint < 0x10000)
    {
        /* 1110xxxx 10xxxxxx 10xxxxxx */
        buffer[0] = (char)(0xE0 + ((codepoint & 0xF000) >> 12));
        buffer[1] = (char)(0x80 + ((codepoint & 0x0FC0) >> 6));
        buffer[2] = (char)(0x80 + ((codepoint & 0x003F)));
        *size = 3;
    }
    else
    {
        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx (codepoint <= 0x10FFFF) */
        buffer[0] = (char)(0xF0 + ((codepoint & 0x1C0000) >> 18));
        buffer[1] = (char)(0x80 + ((codepoint & 0x03F000) >> 12));
        buffer[2] = (char)(0x80 + ((codepoint & 0x000FC0) >> 6));
        buffer[3] = (char)(0x80 + ((codepoint & 0x00003F)));
        *size = 4;
    }

    return 0;
}
46 | |
/*
 * Classify a byte as the possible first byte of a UTF-8 sequence.
 *
 * Returns the total length (1 to 4) of the sequence that the byte would
 * start, or 0 if the byte can never start a valid sequence.
 */
int utf8_check_first(char byte)
{
    const unsigned char lead = (unsigned char)byte;

    /* 0x00..0x7F: single-byte (ASCII) */
    if(lead <= 0x7F)
        return 1;

    /* 0x80..0xBF are continuation bytes; 0xC0 and 0xC1 could only begin
       an overlong encoding of an ASCII byte */
    if(lead <= 0xC1)
        return 0;

    /* 0xC2..0xDF: lead byte of a 2-byte sequence */
    if(lead <= 0xDF)
        return 2;

    /* 0xE0..0xEF: lead byte of a 3-byte sequence */
    if(lead <= 0xEF)
        return 3;

    /* 0xF0..0xF4: lead byte of a 4-byte sequence */
    if(lead <= 0xF4)
        return 4;

    /* 0xF5..0xFF: restricted (would start a 4-, 5- or 6-byte sequence)
       or otherwise invalid in UTF-8 */
    return 0;
}
82 | |
/*
 * Validate and decode one multi-byte UTF-8 sequence.
 *
 * buffer:    points at the first byte of the sequence; size bytes are read.
 * size:      expected sequence length; only 2, 3 and 4 are accepted.
 * codepoint: if non-NULL, receives the decoded value on success.
 *
 * Returns 1 when the sequence is valid, 0 otherwise. A sequence is rejected
 * when size is unsupported, a trailing byte is not a continuation byte, the
 * decoded value exceeds 0x10FFFF, it is a UTF-16 surrogate half, or it is an
 * overlong encoding. The first byte is only masked, not validated, here.
 */
int utf8_check_full(const char *buffer, int size, int32_t *codepoint)
{
    const unsigned char lead = (unsigned char)buffer[0];
    int32_t decoded;
    int32_t smallest; /* lowest value that really needs `size` bytes */
    int pos;

    /* Extract the payload bits of the first byte */
    switch(size)
    {
        case 2:
            decoded = lead & 0x1F;
            smallest = 0x80;
            break;
        case 3:
            decoded = lead & 0xF;
            smallest = 0x800;
            break;
        case 4:
            decoded = lead & 0x7;
            smallest = 0x10000;
            break;
        default:
            return 0;
    }

    /* Fold in six payload bits from each trailing byte */
    for(pos = 1; pos < size; pos++)
    {
        const unsigned char trail = (unsigned char)buffer[pos];

        /* continuation bytes have the form 10xxxxxx */
        if((trail & 0xC0) != 0x80)
            return 0;

        decoded = (decoded << 6) | (trail & 0x3F);
    }

    /* outside the Unicode range */
    if(decoded > 0x10FFFF)
        return 0;

    /* UTF-16 surrogate halves are not valid code points */
    if(decoded >= 0xD800 && decoded <= 0xDFFF)
        return 0;

    /* overlong: the value would have fit in a shorter sequence */
    if(decoded < smallest)
        return 0;

    if(codepoint)
        *codepoint = decoded;

    return 1;
}
138 | |
/*
 * Decode the UTF-8 sequence at the start of a NUL-terminated string.
 *
 * buffer:    position to decode from.
 * codepoint: if non-NULL, receives the decoded value when a sequence is
 *            consumed; it is left untouched at the terminator or on error.
 *
 * Returns a pointer just past the consumed sequence, buffer itself when it
 * already points at the terminating NUL, or NULL when the bytes at buffer
 * are rejected by utf8_check_first() or utf8_check_full().
 */
const char *utf8_iterate(const char *buffer, int32_t *codepoint)
{
    int32_t decoded;
    int length;

    /* End of string: nothing to consume */
    if(buffer[0] == '\0')
        return buffer;

    length = utf8_check_first(buffer[0]);
    if(length <= 0)
        return NULL;

    if(length == 1)
    {
        /* single byte: the byte value is the code point */
        decoded = (unsigned char)buffer[0];
    }
    else if(!utf8_check_full(buffer, length, &decoded))
    {
        return NULL;
    }

    if(codepoint)
        *codepoint = decoded;

    return buffer + length;
}
164 | |
/*
 * Check that a byte string is made up only of valid UTF-8 sequences.
 *
 * string: the bytes to validate.
 * length: number of bytes to examine, or -1 to use strlen(string).
 *
 * Returns 1 if every sequence in the examined range is accepted by
 * utf8_check_first() and utf8_check_full() and no multi-byte sequence runs
 * past the end of the range; returns 0 otherwise.
 */
int utf8_check_string(const char *string, int length)
{
    int pos = 0;

    if(length == -1)
        length = (int)strlen(string);

    while(pos < length)
    {
        const int seq = utf8_check_first(string[pos]);
        int step = 1;

        if(seq == 0)
            return 0;

        if(seq > 1)
        {
            /* the whole sequence has to fit in the remaining bytes */
            if(pos + seq > length)
                return 0;

            if(!utf8_check_full(&string[pos], seq, NULL))
                return 0;

            step = seq;
        }

        pos += step;
    }

    return 1;
}
191 | |