/* utf.c source code [Aerospike/modules/jansson/src/utf.c] */

1	/*
2	* Copyright (c) 2009-2012 Petri Lehtinen <petri@digip.org>
3	*
4	* Jansson is free software; you can redistribute it and/or modify
5	* it under the terms of the MIT license. See LICENSE for details.
6	*/
7
8	#include <string.h>
9	#include "utf.h"
10
/*
 * Encode a single code point as a UTF-8 byte sequence.
 *
 * codepoint: value to encode; accepted range is 0..0x10FFFF.
 * buffer:    destination; up to 4 bytes are written and no NUL
 *            terminator is appended.
 * size:      receives the number of bytes written (1..4).
 *
 * Returns 0 on success. Returns -1 when codepoint is negative or larger
 * than 0x10FFFF; in that case neither buffer nor *size is written.
 *
 * Note: values in the surrogate range 0xD800..0xDFFF are not rejected
 * here; they fall into the 3-byte branch like any other value below
 * 0x10000.
 */
int utf8_encode(int32_t codepoint, char *buffer, int *size)
{
    if(codepoint < 0)
        return -1;
    else if(codepoint < 0x80)
    {
        /* 0xxxxxxx */
        buffer[0] = (char)codepoint;
        *size = 1;
    }
    else if(codepoint < 0x800)
    {
        /* 110xxxxx 10xxxxxx */
        buffer[0] = (char)(0xC0 + ((codepoint & 0x7C0) >> 6));
        buffer[1] = (char)(0x80 + ((codepoint & 0x03F)));
        *size = 2;
    }
    else if(codepoint < 0x10000)
    {
        /* 1110xxxx 10xxxxxx 10xxxxxx */
        buffer[0] = (char)(0xE0 + ((codepoint & 0xF000) >> 12));
        buffer[1] = (char)(0x80 + ((codepoint & 0x0FC0) >> 6));
        buffer[2] = (char)(0x80 + ((codepoint & 0x003F)));
        *size = 3;
    }
    else if(codepoint <= 0x10FFFF)
    {
        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        buffer[0] = (char)(0xF0 + ((codepoint & 0x1C0000) >> 18));
        buffer[1] = (char)(0x80 + ((codepoint & 0x03F000) >> 12));
        buffer[2] = (char)(0x80 + ((codepoint & 0x000FC0) >> 6));
        buffer[3] = (char)(0x80 + ((codepoint & 0x00003F)));
        *size = 4;
    }
    else
        return -1;

    return 0;
}
46
/*
 * Classify a byte as the first byte of a UTF-8 sequence.
 *
 * Returns the total length in bytes (1..4) of the sequence that the
 * byte starts, or 0 if the byte cannot start a valid sequence.
 */
int utf8_check_first(char byte)
{
    /* Work on the unsigned value so that bytes >= 0x80 compare as
       expected regardless of the signedness of plain char. */
    unsigned char u = (unsigned char)byte;

    if(u < 0x80)
        return 1;

    if(0x80 <= u && u <= 0xBF) {
        /* second, third or fourth byte of a multi-byte
           sequence, i.e. a "continuation byte" */
        return 0;
    }
    else if(u == 0xC0 || u == 0xC1) {
        /* overlong encoding of an ASCII byte */
        return 0;
    }
    else if(0xC2 <= u && u <= 0xDF) {
        /* 2-byte sequence */
        return 2;
    }
    else if(0xE0 <= u && u <= 0xEF) {
        /* 3-byte sequence */
        return 3;
    }
    else if(0xF0 <= u && u <= 0xF4) {
        /* 4-byte sequence */
        return 4;
    }
    else { /* u >= 0xF5 */
        /* Restricted (start of 4-, 5- or 6-byte sequence) or invalid
           UTF-8 */
        return 0;
    }
}
82
/*
 * Validate a multi-byte UTF-8 sequence and decode its code point.
 *
 * buffer:    points at the first byte of the sequence; bytes
 *            buffer[0..size-1] are read when size is 2, 3 or 4.
 * size:      length of the sequence in bytes; only 2, 3 and 4 are
 *            accepted.
 * codepoint: if not NULL, receives the decoded value on success.
 *
 * Returns 1 if the sequence is valid, 0 otherwise. The sequence is
 * rejected when size is not 2..4, when a trailing byte is not a
 * continuation byte (0x80..0xBF), when the decoded value exceeds
 * 0x10FFFF, lies in the surrogate range 0xD800..0xDFFF, or could have
 * been encoded in fewer bytes (overlong encoding).
 *
 * The high bits of the first byte are masked off, not verified.
 */
int utf8_check_full(const char *buffer, int size, int32_t *codepoint)
{
    int i;
    int32_t value = 0;
    unsigned char u = (unsigned char)buffer[0];

    /* Extract the payload bits of the lead byte. */
    if(size == 2)
    {
        value = u & 0x1F;
    }
    else if(size == 3)
    {
        value = u & 0xF;
    }
    else if(size == 4)
    {
        value = u & 0x7;
    }
    else
        return 0;

    /* Each continuation byte contributes its low 6 bits. */
    for(i = 1; i < size; i++)
    {
        u = (unsigned char)buffer[i];

        if(u < 0x80 || u > 0xBF) {
            /* not a continuation byte */
            return 0;
        }

        value = (value << 6) + (u & 0x3F);
    }

    if(value > 0x10FFFF) {
        /* not in Unicode range */
        return 0;
    }
    else if(0xD800 <= value && value <= 0xDFFF) {
        /* invalid code point (UTF-16 surrogate halves) */
        return 0;
    }
    else if((size == 2 && value < 0x80) ||
            (size == 3 && value < 0x800) ||
            (size == 4 && value < 0x10000)) {
        /* overlong encoding */
        return 0;
    }

    if(codepoint)
        *codepoint = value;

    return 1;
}
138
/*
 * Decode the UTF-8 sequence at the start of buffer and step past it.
 *
 * buffer:    NUL-terminated input; must not be NULL (it is
 *            dereferenced unconditionally).
 * codepoint: if not NULL, receives the decoded code point.
 *
 * Returns a pointer to the byte following the decoded sequence, or
 * NULL if the bytes at buffer do not form a valid sequence according
 * to utf8_check_first() and utf8_check_full().
 *
 * If buffer points at the terminating NUL byte, buffer itself is
 * returned and *codepoint is left untouched.
 */
const char *utf8_iterate(const char *buffer, int32_t *codepoint)
{
    int count;
    int32_t value;

    if(!*buffer)
        return buffer;

    count = utf8_check_first(buffer[0]);
    if(count <= 0)
        return NULL;

    if(count == 1)
        value = (unsigned char)buffer[0];
    else
    {
        if(!utf8_check_full(buffer, count, &value))
            return NULL;
    }

    if(codepoint)
        *codepoint = value;

    return buffer + count;
}
164
/*
 * Check that a byte string consists only of valid UTF-8 sequences.
 *
 * string: bytes to validate.
 * length: number of bytes to check, or -1 to use strlen(string), in
 *         which case string must be NUL-terminated.
 *
 * Returns 1 if every sequence in the first length bytes is accepted by
 * utf8_check_first() and utf8_check_full(), 0 otherwise. A multi-byte
 * sequence that would extend past length is rejected.
 */
int utf8_check_string(const char *string, int length)
{
    int i;

    if(length == -1)
        length = (int)strlen(string);

    for(i = 0; i < length; i++)
    {
        int count = utf8_check_first(string[i]);
        if(count == 0)
            return 0;
        else if(count > 1)
        {
            /* truncated sequence at the end of the input */
            if(i + count > length)
                return 0;

            if(!utf8_check_full(&string[i], count, NULL))
                return 0;

            /* skip the continuation bytes; the loop's i++ consumes
               the lead byte */
            i += count - 1;
        }
    }

    return 1;
}
191

/* Browse the source code of Aerospike/modules/jansson/src/utf.c */