unicode.c source code [qemu/util/unicode.c]

/*
 * Dealing with Unicode
 *
 * Copyright (C) 2013 Red Hat, Inc.
 *
 * Authors:
 *  Markus Armbruster <armbru@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or
 * later.  See the COPYING file in the top-level directory.
 */
12
13	#include "qemu/osdep.h"
14	#include "qemu/unicode.h"
15
/*
 * Return true if @codepoint may appear in a valid (modified) UTF-8
 * sequence: it must lie within U+0000..U+10FFFF and be neither a
 * noncharacter nor a surrogate code point.
 */
static bool is_valid_codepoint(int codepoint)
{
    if (codepoint < 0 || codepoint > 0x10FFFF) {
        return false;            /* outside the Unicode range */
    }
    /*
     * Noncharacters: U+FDD0..U+FDEF, plus the last two code points of
     * every plane (U+xxFFFE and U+xxFFFF).
     */
    if ((codepoint >= 0xFDD0 && codepoint <= 0xFDEF)
        || (codepoint & 0xFFFE) == 0xFFFE) {
        return false;            /* noncharacter */
    }
    if (codepoint >= 0xD800 && codepoint <= 0xDFFF) {
        return false;            /* surrogate code point */
    }
    return true;
}
30
/**
 * mod_utf8_codepoint:
 * @s: string encoded in modified UTF-8
 * @n: maximum number of bytes to read from @s, if less than 6
 * @end: set to end of sequence on return
 *
 * Convert the modified UTF-8 sequence at the start of @s.  Modified
 * UTF-8 is exactly like UTF-8, except U+0000 is encoded as
 * "\xC0\x80".
 *
 * If @n is zero or @s points to a zero byte, the sequence is invalid,
 * and @end is set to @s.
 *
 * If @s points to an impossible byte (0xFE or 0xFF) or a continuation
 * byte, the sequence is invalid, and @end is set to @s + 1.
 *
 * Else, the first byte determines how many continuation bytes are
 * expected.  If there are fewer, the sequence is invalid, and @end is
 * set to @s + 1 + actual number of continuation bytes.  Else, the
 * sequence is well-formed, and @end is set to @s + 1 + expected
 * number of continuation bytes.
 *
 * A well-formed sequence is valid unless it encodes a codepoint
 * outside the Unicode range U+0000..U+10FFFF, one of Unicode's 66
 * noncharacters, a surrogate codepoint, or is overlong.  Except the
 * overlong sequence "\xC0\x80" is valid.
 *
 * Conversion succeeds if and only if the sequence is valid.
 *
 * Returns: the Unicode codepoint on success, -1 on failure.
 */
int mod_utf8_codepoint(const char *s, size_t n, char **end)
{
    /*
     * Smallest codepoint that genuinely needs a sequence of length
     * 3, 4, 5, 6 (and one past 6), indexed by len - 2.  Anything
     * smaller decoded from a sequence of that length is overlong.
     */
    static const int min_cp[5] = {
        0x80, 0x800, 0x10000, 0x200000, 0x4000000
    };
    const unsigned char *p;
    unsigned byte, mask, len, i;
    int cp;

    if (n == 0 || *s == 0) {
        /* empty sequence */
        *end = (char *)s;
        return -1;
    }

    p = (const unsigned char *)s;
    byte = *p++;
    if (byte < 0x80) {
        cp = byte;              /* one byte sequence */
    } else if (byte >= 0xFE) {
        cp = -1;                /* impossible bytes 0xFE, 0xFF */
    } else if ((byte & 0x40) == 0) {
        cp = -1;                /* unexpected continuation byte */
    } else {
        /* multi-byte sequence: count the leading one bits */
        len = 0;
        for (mask = 0x80; byte & mask; mask >>= 1) {
            len++;
        }
        assert(len > 1 && len < 7);
        /* payload bits of the lead byte are those below the first 0 bit */
        cp = byte & (mask - 1);
        for (i = 1; i < len; i++) {
            /* never read past @n bytes; treat the limit like a NUL */
            byte = i < n ? *p : 0;
            if ((byte & 0xC0) != 0x80) {
                cp = -1;        /* continuation byte missing */
                goto out;
            }
            p++;
            cp <<= 6;
            cp |= byte & 0x3F;
        }
        if (!is_valid_codepoint(cp)) {
            cp = -1;
        } else if (cp < min_cp[len - 2] && !(cp == 0 && len == 2)) {
            cp = -1;            /* overlong, not \xC0\x80 */
        }
    }

out:
    *end = (char *)p;
    return cp;
}
112
/**
 * mod_utf8_encode:
 * @buf: Destination buffer
 * @bufsz: size of @buf, at least 5.
 * @codepoint: Unicode codepoint to encode
 *
 * Convert Unicode codepoint @codepoint to modified UTF-8.  On
 * success, @buf holds the encoded sequence followed by a terminating
 * zero byte.  U+0000 is encoded as the two-byte sequence "\xC0\x80".
 *
 * Returns: the length of the UTF-8 sequence on success (not counting
 * the terminating zero byte), -1 when @codepoint is invalid.
 */
ssize_t mod_utf8_encode(char buf[], size_t bufsz, int codepoint)
{
    /* longest output is four bytes plus the terminating zero byte */
    assert(bufsz >= 5);

    if (!is_valid_codepoint(codepoint)) {
        return -1;
    }

    /* U+0000 is deliberately excluded here so it takes the two-byte form */
    if (codepoint > 0 && codepoint <= 0x7F) {
        buf[0] = codepoint & 0x7F;
        buf[1] = 0;
        return 1;
    }
    if (codepoint <= 0x7FF) {
        buf[0] = 0xC0 | ((codepoint >> 6) & 0x1F);
        buf[1] = 0x80 | (codepoint & 0x3F);
        buf[2] = 0;
        return 2;
    }
    if (codepoint <= 0xFFFF) {
        buf[0] = 0xE0 | ((codepoint >> 12) & 0x0F);
        buf[1] = 0x80 | ((codepoint >> 6) & 0x3F);
        buf[2] = 0x80 | (codepoint & 0x3F);
        buf[3] = 0;
        return 3;
    }
    buf[0] = 0xF0 | ((codepoint >> 18) & 0x07);
    buf[1] = 0x80 | ((codepoint >> 12) & 0x3F);
    buf[2] = 0x80 | ((codepoint >> 6) & 0x3F);
    buf[3] = 0x80 | (codepoint & 0x3F);
    buf[4] = 0;
    return 4;
}
157

Browse the source code of qemu/util/unicode.c