1 | /* |
2 | * This file is part of the MicroPython project, http://micropython.org/ |
3 | * |
4 | * The MIT License (MIT) |
5 | * |
6 | * Copyright (c) 2013, 2014 Damien P. George |
7 | * Copyright (c) 2014-2016 Paul Sokolovsky |
8 | * |
9 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
10 | * of this software and associated documentation files (the "Software"), to deal |
11 | * in the Software without restriction, including without limitation the rights |
12 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
13 | * copies of the Software, and to permit persons to whom the Software is |
14 | * furnished to do so, subject to the following conditions: |
15 | * |
16 | * The above copyright notice and this permission notice shall be included in |
17 | * all copies or substantial portions of the Software. |
18 | * |
19 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
20 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
21 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
22 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
23 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
24 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
25 | * THE SOFTWARE. |
26 | */ |
27 | |
28 | #include <string.h> |
29 | #include <assert.h> |
30 | |
31 | #include "py/objstr.h" |
32 | #include "py/objlist.h" |
33 | #include "py/runtime.h" |
34 | |
35 | #if MICROPY_PY_BUILTINS_STR_UNICODE |
36 | |
// Forward declaration: the iterator constructor is defined near the end of this
// file but is referenced earlier by the mp_type_str definition (.getiter slot).
STATIC mp_obj_t mp_obj_new_str_iterator(mp_obj_t str, mp_obj_iter_buf_t *iter_buf);
38 | |
39 | /******************************************************************************/ |
40 | /* str */ |
41 | |
42 | STATIC void uni_print_quoted(const mp_print_t *print, const byte *str_data, uint str_len) { |
43 | // this escapes characters, but it will be very slow to print (calling print many times) |
44 | bool has_single_quote = false; |
45 | bool has_double_quote = false; |
46 | for (const byte *s = str_data, *top = str_data + str_len; !has_double_quote && s < top; s++) { |
47 | if (*s == '\'') { |
48 | has_single_quote = true; |
49 | } else if (*s == '"') { |
50 | has_double_quote = true; |
51 | } |
52 | } |
53 | unichar quote_char = '\''; |
54 | if (has_single_quote && !has_double_quote) { |
55 | quote_char = '"'; |
56 | } |
57 | mp_printf(print, "%c" , quote_char); |
58 | const byte *s = str_data, *top = str_data + str_len; |
59 | while (s < top) { |
60 | unichar ch; |
61 | ch = utf8_get_char(s); |
62 | s = utf8_next_char(s); |
63 | if (ch == quote_char) { |
64 | mp_printf(print, "\\%c" , quote_char); |
65 | } else if (ch == '\\') { |
66 | mp_print_str(print, "\\\\" ); |
67 | } else if (32 <= ch && ch <= 126) { |
68 | mp_printf(print, "%c" , ch); |
69 | } else if (ch == '\n') { |
70 | mp_print_str(print, "\\n" ); |
71 | } else if (ch == '\r') { |
72 | mp_print_str(print, "\\r" ); |
73 | } else if (ch == '\t') { |
74 | mp_print_str(print, "\\t" ); |
75 | } else if (ch < 0x100) { |
76 | mp_printf(print, "\\x%02x" , ch); |
77 | } else if (ch < 0x10000) { |
78 | mp_printf(print, "\\u%04x" , ch); |
79 | } else { |
80 | mp_printf(print, "\\U%08x" , ch); |
81 | } |
82 | } |
83 | mp_printf(print, "%c" , quote_char); |
84 | } |
85 | |
86 | STATIC void uni_print(const mp_print_t *print, mp_obj_t self_in, mp_print_kind_t kind) { |
87 | GET_STR_DATA_LEN(self_in, str_data, str_len); |
88 | #if MICROPY_PY_UJSON |
89 | if (kind == PRINT_JSON) { |
90 | mp_str_print_json(print, str_data, str_len); |
91 | return; |
92 | } |
93 | #endif |
94 | if (kind == PRINT_STR) { |
95 | print->print_strn(print->data, (const char *)str_data, str_len); |
96 | } else { |
97 | uni_print_quoted(print, str_data, str_len); |
98 | } |
99 | } |
100 | |
101 | STATIC mp_obj_t uni_unary_op(mp_unary_op_t op, mp_obj_t self_in) { |
102 | GET_STR_DATA_LEN(self_in, str_data, str_len); |
103 | switch (op) { |
104 | case MP_UNARY_OP_BOOL: |
105 | return mp_obj_new_bool(str_len != 0); |
106 | case MP_UNARY_OP_LEN: |
107 | return MP_OBJ_NEW_SMALL_INT(utf8_charlen(str_data, str_len)); |
108 | default: |
109 | return MP_OBJ_NULL; // op not supported |
110 | } |
111 | } |
112 | |
113 | // Convert an index into a pointer to its lead byte. Out of bounds indexing will raise IndexError or |
114 | // be capped to the first/last character of the string, depending on is_slice. |
115 | const byte *str_index_to_ptr(const mp_obj_type_t *type, const byte *self_data, size_t self_len, |
116 | mp_obj_t index, bool is_slice) { |
117 | // All str functions also handle bytes objects, and they call str_index_to_ptr(), |
118 | // so it must handle bytes. |
119 | if (type == &mp_type_bytes) { |
120 | // Taken from objstr.c:str_index_to_ptr() |
121 | size_t index_val = mp_get_index(type, self_len, index, is_slice); |
122 | return self_data + index_val; |
123 | } |
124 | |
125 | mp_int_t i; |
126 | // Copied from mp_get_index; I don't want bounds checking, just give me |
127 | // the integer as-is. (I can't bounds-check without scanning the whole |
128 | // string; an out-of-bounds index will be caught in the loops below.) |
129 | if (mp_obj_is_small_int(index)) { |
130 | i = MP_OBJ_SMALL_INT_VALUE(index); |
131 | } else if (!mp_obj_get_int_maybe(index, &i)) { |
132 | mp_raise_msg_varg(&mp_type_TypeError, MP_ERROR_TEXT("string indices must be integers, not %s" ), mp_obj_get_type_str(index)); |
133 | } |
134 | const byte *s, *top = self_data + self_len; |
135 | if (i < 0) { |
136 | // Negative indexing is performed by counting from the end of the string. |
137 | for (s = top - 1; i; --s) { |
138 | if (s < self_data) { |
139 | if (is_slice) { |
140 | return self_data; |
141 | } |
142 | mp_raise_msg(&mp_type_IndexError, MP_ERROR_TEXT("string index out of range" )); |
143 | } |
144 | if (!UTF8_IS_CONT(*s)) { |
145 | ++i; |
146 | } |
147 | } |
148 | ++s; |
149 | } else { |
150 | // Positive indexing, correspondingly, counts from the start of the string. |
151 | // It's assumed that negative indexing will generally be used with small |
152 | // absolute values (eg str[-1], not str[-1000000]), which means it'll be |
153 | // more efficient this way. |
154 | s = self_data; |
155 | while (1) { |
156 | // First check out-of-bounds |
157 | if (s >= top) { |
158 | if (is_slice) { |
159 | return top; |
160 | } |
161 | mp_raise_msg(&mp_type_IndexError, MP_ERROR_TEXT("string index out of range" )); |
162 | } |
163 | // Then check completion |
164 | if (i-- == 0) { |
165 | break; |
166 | } |
167 | // Then skip UTF-8 char |
168 | ++s; |
169 | while (UTF8_IS_CONT(*s)) { |
170 | ++s; |
171 | } |
172 | } |
173 | } |
174 | return s; |
175 | } |
176 | |
177 | STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) { |
178 | const mp_obj_type_t *type = mp_obj_get_type(self_in); |
179 | assert(type == &mp_type_str); |
180 | GET_STR_DATA_LEN(self_in, self_data, self_len); |
181 | if (value == MP_OBJ_SENTINEL) { |
182 | // load |
183 | #if MICROPY_PY_BUILTINS_SLICE |
184 | if (mp_obj_is_type(index, &mp_type_slice)) { |
185 | mp_obj_t ostart, ostop, ostep; |
186 | mp_obj_slice_t *slice = MP_OBJ_TO_PTR(index); |
187 | ostart = slice->start; |
188 | ostop = slice->stop; |
189 | ostep = slice->step; |
190 | |
191 | if (ostep != mp_const_none && ostep != MP_OBJ_NEW_SMALL_INT(1)) { |
192 | mp_raise_NotImplementedError(MP_ERROR_TEXT("only slices with step=1 (aka None) are supported" )); |
193 | } |
194 | |
195 | const byte *pstart, *pstop; |
196 | if (ostart != mp_const_none) { |
197 | pstart = str_index_to_ptr(type, self_data, self_len, ostart, true); |
198 | } else { |
199 | pstart = self_data; |
200 | } |
201 | if (ostop != mp_const_none) { |
202 | // pstop will point just after the stop character. This depends on |
203 | // the \0 at the end of the string. |
204 | pstop = str_index_to_ptr(type, self_data, self_len, ostop, true); |
205 | } else { |
206 | pstop = self_data + self_len; |
207 | } |
208 | if (pstop < pstart) { |
209 | return MP_OBJ_NEW_QSTR(MP_QSTR_); |
210 | } |
211 | return mp_obj_new_str_of_type(type, (const byte *)pstart, pstop - pstart); |
212 | } |
213 | #endif |
214 | const byte *s = str_index_to_ptr(type, self_data, self_len, index, false); |
215 | int len = 1; |
216 | if (UTF8_IS_NONASCII(*s)) { |
217 | // Count the number of 1 bits (after the first) |
218 | for (char mask = 0x40; *s & mask; mask >>= 1) { |
219 | ++len; |
220 | } |
221 | } |
222 | return mp_obj_new_str_via_qstr((const char *)s, len); // This will create a one-character string |
223 | } else { |
224 | return MP_OBJ_NULL; // op not supported |
225 | } |
226 | } |
227 | |
// Method table for the unicode-aware str type: maps each method name (qstr) to
// its method object.  The str_*_obj objects are not defined in this file
// (presumably they come from py/objstr.h and are shared with the bytes
// implementation -- confirm against that header).  Several entries are only
// present when the corresponding MICROPY_* configuration option is enabled.
STATIC const mp_rom_map_elem_t struni_locals_dict_table[] = {
    #if MICROPY_CPYTHON_COMPAT
    { MP_ROM_QSTR(MP_QSTR_encode), MP_ROM_PTR(&str_encode_obj) },
    #endif
    { MP_ROM_QSTR(MP_QSTR_find), MP_ROM_PTR(&str_find_obj) },
    { MP_ROM_QSTR(MP_QSTR_rfind), MP_ROM_PTR(&str_rfind_obj) },
    { MP_ROM_QSTR(MP_QSTR_index), MP_ROM_PTR(&str_index_obj) },
    { MP_ROM_QSTR(MP_QSTR_rindex), MP_ROM_PTR(&str_rindex_obj) },
    { MP_ROM_QSTR(MP_QSTR_join), MP_ROM_PTR(&str_join_obj) },
    { MP_ROM_QSTR(MP_QSTR_split), MP_ROM_PTR(&str_split_obj) },
    #if MICROPY_PY_BUILTINS_STR_SPLITLINES
    { MP_ROM_QSTR(MP_QSTR_splitlines), MP_ROM_PTR(&str_splitlines_obj) },
    #endif
    { MP_ROM_QSTR(MP_QSTR_rsplit), MP_ROM_PTR(&str_rsplit_obj) },
    { MP_ROM_QSTR(MP_QSTR_startswith), MP_ROM_PTR(&str_startswith_obj) },
    { MP_ROM_QSTR(MP_QSTR_endswith), MP_ROM_PTR(&str_endswith_obj) },
    { MP_ROM_QSTR(MP_QSTR_strip), MP_ROM_PTR(&str_strip_obj) },
    { MP_ROM_QSTR(MP_QSTR_lstrip), MP_ROM_PTR(&str_lstrip_obj) },
    { MP_ROM_QSTR(MP_QSTR_rstrip), MP_ROM_PTR(&str_rstrip_obj) },
    { MP_ROM_QSTR(MP_QSTR_format), MP_ROM_PTR(&str_format_obj) },
    { MP_ROM_QSTR(MP_QSTR_replace), MP_ROM_PTR(&str_replace_obj) },
    #if MICROPY_PY_BUILTINS_STR_COUNT
    { MP_ROM_QSTR(MP_QSTR_count), MP_ROM_PTR(&str_count_obj) },
    #endif
    #if MICROPY_PY_BUILTINS_STR_PARTITION
    { MP_ROM_QSTR(MP_QSTR_partition), MP_ROM_PTR(&str_partition_obj) },
    { MP_ROM_QSTR(MP_QSTR_rpartition), MP_ROM_PTR(&str_rpartition_obj) },
    #endif
    #if MICROPY_PY_BUILTINS_STR_CENTER
    { MP_ROM_QSTR(MP_QSTR_center), MP_ROM_PTR(&str_center_obj) },
    #endif
    { MP_ROM_QSTR(MP_QSTR_lower), MP_ROM_PTR(&str_lower_obj) },
    { MP_ROM_QSTR(MP_QSTR_upper), MP_ROM_PTR(&str_upper_obj) },
    { MP_ROM_QSTR(MP_QSTR_isspace), MP_ROM_PTR(&str_isspace_obj) },
    { MP_ROM_QSTR(MP_QSTR_isalpha), MP_ROM_PTR(&str_isalpha_obj) },
    { MP_ROM_QSTR(MP_QSTR_isdigit), MP_ROM_PTR(&str_isdigit_obj) },
    { MP_ROM_QSTR(MP_QSTR_isupper), MP_ROM_PTR(&str_isupper_obj) },
    { MP_ROM_QSTR(MP_QSTR_islower), MP_ROM_PTR(&str_islower_obj) },
};

// Constant dict wrapping the table above; installed as mp_type_str.locals_dict.
STATIC MP_DEFINE_CONST_DICT(struni_locals_dict, struni_locals_dict_table);
269 | |
270 | const mp_obj_type_t mp_type_str = { |
271 | { &mp_type_type }, |
272 | .name = MP_QSTR_str, |
273 | .print = uni_print, |
274 | .make_new = mp_obj_str_make_new, |
275 | .unary_op = uni_unary_op, |
276 | .binary_op = mp_obj_str_binary_op, |
277 | .subscr = str_subscr, |
278 | .getiter = mp_obj_new_str_iterator, |
279 | .buffer_p = { .get_buffer = mp_obj_str_get_buffer }, |
280 | .locals_dict = (mp_obj_dict_t *)&struni_locals_dict, |
281 | }; |
282 | |
283 | /******************************************************************************/ |
284 | /* str iterator */ |
285 | |
// State of an iterator over the characters of a str.  Instances are placed in
// the caller-supplied mp_obj_iter_buf_t by mp_obj_new_str_iterator, so this
// struct must not be larger than that buffer.
typedef struct _mp_obj_str_it_t {
    mp_obj_base_t base;     // base.type is set to &mp_type_polymorph_iter
    mp_fun_1_t iternext;    // step function; set to str_it_iternext
    mp_obj_t str;           // the string being iterated
    size_t cur;             // byte offset into str of the next character to yield
} mp_obj_str_it_t;
292 | |
293 | STATIC mp_obj_t str_it_iternext(mp_obj_t self_in) { |
294 | mp_obj_str_it_t *self = MP_OBJ_TO_PTR(self_in); |
295 | GET_STR_DATA_LEN(self->str, str, len); |
296 | if (self->cur < len) { |
297 | const byte *cur = str + self->cur; |
298 | const byte *end = utf8_next_char(str + self->cur); |
299 | mp_obj_t o_out = mp_obj_new_str_via_qstr((const char *)cur, end - cur); |
300 | self->cur += end - cur; |
301 | return o_out; |
302 | } else { |
303 | return MP_OBJ_STOP_ITERATION; |
304 | } |
305 | } |
306 | |
307 | STATIC mp_obj_t mp_obj_new_str_iterator(mp_obj_t str, mp_obj_iter_buf_t *iter_buf) { |
308 | assert(sizeof(mp_obj_str_it_t) <= sizeof(mp_obj_iter_buf_t)); |
309 | mp_obj_str_it_t *o = (mp_obj_str_it_t *)iter_buf; |
310 | o->base.type = &mp_type_polymorph_iter; |
311 | o->iternext = str_it_iternext; |
312 | o->str = str; |
313 | o->cur = 0; |
314 | return MP_OBJ_FROM_PTR(o); |
315 | } |
316 | |
317 | #endif // MICROPY_PY_BUILTINS_STR_UNICODE |
318 | |