unicode.c source code [MicroPython/py/unicode.c]

1	/*
2	* This file is part of the MicroPython project, http://micropython.org/
3	*
4	* The MIT License (MIT)
5	*
6	* Copyright (c) 2013, 2014 Damien P. George
7	*
8	* Permission is hereby granted, free of charge, to any person obtaining a copy
9	* of this software and associated documentation files (the "Software"), to deal
10	* in the Software without restriction, including without limitation the rights
11	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12	* copies of the Software, and to permit persons to whom the Software is
13	* furnished to do so, subject to the following conditions:
14	*
15	* The above copyright notice and this permission notice shall be included in
16	* all copies or substantial portions of the Software.
17	*
18	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24	* THE SOFTWARE.
25	*/
26
27	#include <stdint.h>
28
29	#include "py/unicode.h"
30
// attribute flags: one bit per character class, combined in the attr[] table
#define FL_PRINT (0x01)
#define FL_SPACE (0x02)
#define FL_DIGIT (0x04)
#define FL_ALPHA (0x08)
#define FL_UPPER (0x10)
#define FL_LOWER (0x20)
#define FL_XDIGIT (0x40)

// shorthand character attributes: the flag combinations used as table entries
#define AT_PR (FL_PRINT)                                    // printable only
#define AT_SP (FL_SPACE | FL_PRINT)                         // whitespace
#define AT_DI (FL_DIGIT | FL_PRINT | FL_XDIGIT)             // decimal digit
#define AT_AL (FL_ALPHA | FL_PRINT)                         // caseless letter
#define AT_UP (FL_UPPER | FL_ALPHA | FL_PRINT)              // upper-case letter
#define AT_LO (FL_LOWER | FL_ALPHA | FL_PRINT)              // lower-case letter
#define AT_UX (FL_UPPER | FL_ALPHA | FL_PRINT | FL_XDIGIT)  // upper-case hex letter
#define AT_LX (FL_LOWER | FL_ALPHA | FL_PRINT | FL_XDIGIT)  // lower-case hex letter
49
50	// table of attributes for ascii characters
51	STATIC const uint8_t attr[] = {
52	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
53	`0`, AT_SP, AT_SP, AT_SP, AT_SP, AT_SP, `0`, `0`,
54	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
55	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
56	AT_SP, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR,
57	AT_PR, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR,
58	AT_DI, AT_DI, AT_DI, AT_DI, AT_DI, AT_DI, AT_DI, AT_DI,
59	AT_DI, AT_DI, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR,
60	AT_PR, AT_UX, AT_UX, AT_UX, AT_UX, AT_UX, AT_UX, AT_UP,
61	AT_UP, AT_UP, AT_UP, AT_UP, AT_UP, AT_UP, AT_UP, AT_UP,
62	AT_UP, AT_UP, AT_UP, AT_UP, AT_UP, AT_UP, AT_UP, AT_UP,
63	AT_UP, AT_UP, AT_UP, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR,
64	AT_PR, AT_LX, AT_LX, AT_LX, AT_LX, AT_LX, AT_LX, AT_LO,
65	AT_LO, AT_LO, AT_LO, AT_LO, AT_LO, AT_LO, AT_LO, AT_LO,
66	AT_LO, AT_LO, AT_LO, AT_LO, AT_LO, AT_LO, AT_LO, AT_LO,
67	AT_LO, AT_LO, AT_LO, AT_PR, AT_PR, AT_PR, AT_PR, `0`
68	};
69
70	#if MICROPY_PY_BUILTINS_STR_UNICODE
71
72	unichar utf8_get_char(const byte *s) {
73	unichar ord = *s++;
74	if (!UTF8_IS_NONASCII(ord)) {
75	return ord;
76	}
77	ord &= `0x7F`;
78	for (unichar mask = `0x40`; ord & mask; mask >>= `1`) {
79	ord &= ~mask;
80	}
81	while (UTF8_IS_CONT(*s)) {
82	ord = (ord << `6`) \| (*s++ & `0x3F`);
83	}
84	return ord;
85	}
86
87	const byte utf8_next_char(const* byte *s) {
88	++s;
89	while (UTF8_IS_CONT(*s)) {
90	++s;
91	}
92	return s;
93	}
94
95	mp_uint_t utf8_ptr_to_index(const byte s, const* byte *ptr) {
96	mp_uint_t i = `0`;
97	while (ptr > s) {
98	if (!UTF8_IS_CONT(*--ptr)) {
99	i++;
100	}
101	}
102
103	return i;
104	}
105
106	size_t utf8_charlen(const byte *str, size_t len) {
107	size_t charlen = `0`;
108	for (const byte *top = str + len; str < top; ++str) {
109	if (!UTF8_IS_CONT(*str)) {
110	++charlen;
111	}
112	}
113	return charlen;
114	}
115
116	#endif
117
// Be aware: These unichar_is* functions are actually ASCII-only!
119	bool unichar_isspace(unichar c) {
120	return c < `128` && (attr[c] & FL_SPACE) != `0`;
121	}
122
123	bool unichar_isalpha(unichar c) {
124	return c < `128` && (attr[c] & FL_ALPHA) != `0`;
125	}
126
/* unused
bool unichar_isprint(unichar c) {
    return c < 128 && (attr[c] & FL_PRINT) != 0;
}
*/
132
133	bool unichar_isdigit(unichar c) {
134	return c < `128` && (attr[c] & FL_DIGIT) != `0`;
135	}
136
137	bool unichar_isxdigit(unichar c) {
138	return c < `128` && (attr[c] & FL_XDIGIT) != `0`;
139	}
140
141	bool unichar_isident(unichar c) {
142	return c < `128` && ((attr[c] & (FL_ALPHA \| FL_DIGIT)) != `0` \|\| c == `'_'`);
143	}
144
145	bool unichar_isalnum(unichar c) {
146	return c < `128` && ((attr[c] & (FL_ALPHA \| FL_DIGIT)) != `0`);
147	}
148
149	bool unichar_isupper(unichar c) {
150	return c < `128` && (attr[c] & FL_UPPER) != `0`;
151	}
152
153	bool unichar_islower(unichar c) {
154	return c < `128` && (attr[c] & FL_LOWER) != `0`;
155	}
156
157	unichar unichar_tolower(unichar c) {
158	if (unichar_isupper(c)) {
159	return c + `0x20`;
160	}
161	return c;
162	}
163
164	unichar unichar_toupper(unichar c) {
165	if (unichar_islower(c)) {
166	return c - `0x20`;
167	}
168	return c;
169	}
170
171	mp_uint_t unichar_xdigit_value(unichar c) {
172	// c is assumed to be hex digit
173	mp_uint_t n = c - `'0'`;
174	if (n > `9`) {
175	n &= ~(`'a'` - `'A'`);
176	n -= (`'A'` - (`'9'` + `1`));
177	}
178	return n;
179	}
180
181	#if MICROPY_PY_BUILTINS_STR_UNICODE
182
183	bool utf8_check(const byte *p, size_t len) {
184	uint8_t need = `0`;
185	const byte *end = p + len;
186	for (; p < end; p++) {
187	byte c = *p;
188	if (need) {
189	if (UTF8_IS_CONT(c)) {
190	need--;
191	} else {
192	// mismatch
193	return `0`;
194	}
195	} else {
196	if (c >= `0xc0`) {
197	if (c >= `0xf8`) {
198	// mismatch
199	return `0`;
200	}
201	need = (`0xe5` >> ((c >> `3`) & `0x6`)) & `3`;
202	} else if (c >= `0x80`) {
203	// mismatch
204	return `0`;
205	}
206	}
207	}
208	return need == `0`; // no pending fragments allowed
209	}
210
211	#endif
212

Browse the source code of MicroPython/py/unicode.c