1 | /* |
2 | * This file is part of the MicroPython project, http://micropython.org/ |
3 | * |
4 | * The MIT License (MIT) |
5 | * |
6 | * Copyright (c) 2013, 2014 Damien P. George |
7 | * Copyright (c) 2014-2018 Paul Sokolovsky |
8 | * |
9 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
10 | * of this software and associated documentation files (the "Software"), to deal |
11 | * in the Software without restriction, including without limitation the rights |
12 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
13 | * copies of the Software, and to permit persons to whom the Software is |
14 | * furnished to do so, subject to the following conditions: |
15 | * |
16 | * The above copyright notice and this permission notice shall be included in |
17 | * all copies or substantial portions of the Software. |
18 | * |
19 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
20 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
21 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
22 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
23 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
24 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
25 | * THE SOFTWARE. |
26 | */ |
27 | |
#include <string.h>
#include <assert.h>
#include <limits.h>

#include "py/unicode.h"
#include "py/objstr.h"
#include "py/objlist.h"
#include "py/runtime.h"
#include "py/stackctrl.h"
36 | |
// Forward declaration of the '%' formatting routine; it is only declared
// (and only called, see mp_obj_str_binary_op) when the str modulo operator
// is enabled in the build configuration.
#if MICROPY_PY_BUILTINS_STR_OP_MODULO
STATIC mp_obj_t str_modulo_format(mp_obj_t pattern, size_t n_args, const mp_obj_t *args, mp_obj_t dict);
#endif

// Forward declarations of file-local helpers that are referenced before
// their definitions (the definitions are not part of this section).
STATIC mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str, mp_obj_iter_buf_t *iter_buf);
STATIC NORETURN void bad_implicit_conversion(mp_obj_t self_in);
43 | |
44 | /******************************************************************************/ |
45 | /* str */ |
46 | |
47 | void mp_str_print_quoted(const mp_print_t *print, const byte *str_data, size_t str_len, bool is_bytes) { |
48 | // this escapes characters, but it will be very slow to print (calling print many times) |
49 | bool has_single_quote = false; |
50 | bool has_double_quote = false; |
51 | for (const byte *s = str_data, *top = str_data + str_len; !has_double_quote && s < top; s++) { |
52 | if (*s == '\'') { |
53 | has_single_quote = true; |
54 | } else if (*s == '"') { |
55 | has_double_quote = true; |
56 | } |
57 | } |
58 | int quote_char = '\''; |
59 | if (has_single_quote && !has_double_quote) { |
60 | quote_char = '"'; |
61 | } |
62 | mp_printf(print, "%c" , quote_char); |
63 | for (const byte *s = str_data, *top = str_data + str_len; s < top; s++) { |
64 | if (*s == quote_char) { |
65 | mp_printf(print, "\\%c" , quote_char); |
66 | } else if (*s == '\\') { |
67 | mp_print_str(print, "\\\\" ); |
68 | } else if (*s >= 0x20 && *s != 0x7f && (!is_bytes || *s < 0x80)) { |
69 | // In strings, anything which is not ascii control character |
70 | // is printed as is, this includes characters in range 0x80-0xff |
71 | // (which can be non-Latin letters, etc.) |
72 | mp_printf(print, "%c" , *s); |
73 | } else if (*s == '\n') { |
74 | mp_print_str(print, "\\n" ); |
75 | } else if (*s == '\r') { |
76 | mp_print_str(print, "\\r" ); |
77 | } else if (*s == '\t') { |
78 | mp_print_str(print, "\\t" ); |
79 | } else { |
80 | mp_printf(print, "\\x%02x" , *s); |
81 | } |
82 | } |
83 | mp_printf(print, "%c" , quote_char); |
84 | } |
85 | |
#if MICROPY_PY_UJSON
// Print the str_len bytes at str_data as a double-quoted JSON string.
// For the JSON spec, see http://www.ietf.org/rfc/rfc4627.txt
// If the input is a valid utf8-encoded string the output conforms to JSON.
void mp_str_print_json(const mp_print_t *print, const byte *str_data, size_t str_len) {
    mp_print_str(print, "\"");
    for (size_t i = 0; i < str_len; i++) {
        const byte ch = str_data[i];
        if (ch == '"' || ch == '\\') {
            // characters that need a backslash in front of them
            mp_printf(print, "\\%c", ch);
        } else if (ch >= 32) {
            // normal characters and the bytes of utf-8 encoded characters
            mp_printf(print, "%c", ch);
        } else {
            // control characters: short escape if there is one, else \uXXXX
            switch (ch) {
                case '\n':
                    mp_print_str(print, "\\n");
                    break;
                case '\r':
                    mp_print_str(print, "\\r");
                    break;
                case '\t':
                    mp_print_str(print, "\\t");
                    break;
                default:
                    mp_printf(print, "\\u%04x", ch);
                    break;
            }
        }
    }
    mp_print_str(print, "\"");
}
#endif
111 | |
112 | STATIC void str_print(const mp_print_t *print, mp_obj_t self_in, mp_print_kind_t kind) { |
113 | GET_STR_DATA_LEN(self_in, str_data, str_len); |
114 | #if MICROPY_PY_UJSON |
115 | if (kind == PRINT_JSON) { |
116 | mp_str_print_json(print, str_data, str_len); |
117 | return; |
118 | } |
119 | #endif |
120 | #if !MICROPY_PY_BUILTINS_STR_UNICODE |
121 | bool is_bytes = mp_obj_is_type(self_in, &mp_type_bytes); |
122 | #else |
123 | bool is_bytes = true; |
124 | #endif |
125 | if (kind == PRINT_RAW || (!MICROPY_PY_BUILTINS_STR_UNICODE && kind == PRINT_STR && !is_bytes)) { |
126 | print->print_strn(print->data, (const char *)str_data, str_len); |
127 | } else { |
128 | if (is_bytes) { |
129 | print->print_strn(print->data, "b" , 1); |
130 | } |
131 | mp_str_print_quoted(print, str_data, str_len, is_bytes); |
132 | } |
133 | } |
134 | |
135 | mp_obj_t mp_obj_str_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_kw, const mp_obj_t *args) { |
136 | #if MICROPY_CPYTHON_COMPAT |
137 | if (n_kw != 0) { |
138 | mp_arg_error_unimpl_kw(); |
139 | } |
140 | #endif |
141 | |
142 | mp_arg_check_num(n_args, n_kw, 0, 3, false); |
143 | |
144 | switch (n_args) { |
145 | case 0: |
146 | return MP_OBJ_NEW_QSTR(MP_QSTR_); |
147 | |
148 | case 1: { |
149 | vstr_t vstr; |
150 | mp_print_t print; |
151 | vstr_init_print(&vstr, 16, &print); |
152 | mp_obj_print_helper(&print, args[0], PRINT_STR); |
153 | return mp_obj_new_str_from_vstr(type, &vstr); |
154 | } |
155 | |
156 | default: // 2 or 3 args |
157 | // TODO: validate 2nd/3rd args |
158 | if (mp_obj_is_type(args[0], &mp_type_bytes)) { |
159 | GET_STR_DATA_LEN(args[0], str_data, str_len); |
160 | GET_STR_HASH(args[0], str_hash); |
161 | if (str_hash == 0) { |
162 | str_hash = qstr_compute_hash(str_data, str_len); |
163 | } |
164 | #if MICROPY_PY_BUILTINS_STR_UNICODE_CHECK |
165 | if (!utf8_check(str_data, str_len)) { |
166 | mp_raise_msg(&mp_type_UnicodeError, NULL); |
167 | } |
168 | #endif |
169 | |
170 | // Check if a qstr with this data already exists |
171 | qstr q = qstr_find_strn((const char *)str_data, str_len); |
172 | if (q != MP_QSTRnull) { |
173 | return MP_OBJ_NEW_QSTR(q); |
174 | } |
175 | |
176 | mp_obj_str_t *o = MP_OBJ_TO_PTR(mp_obj_new_str_copy(type, NULL, str_len)); |
177 | o->data = str_data; |
178 | o->hash = str_hash; |
179 | return MP_OBJ_FROM_PTR(o); |
180 | } else { |
181 | mp_buffer_info_t bufinfo; |
182 | mp_get_buffer_raise(args[0], &bufinfo, MP_BUFFER_READ); |
183 | #if MICROPY_PY_BUILTINS_STR_UNICODE_CHECK |
184 | if (!utf8_check(bufinfo.buf, bufinfo.len)) { |
185 | mp_raise_msg(&mp_type_UnicodeError, NULL); |
186 | } |
187 | #endif |
188 | return mp_obj_new_str(bufinfo.buf, bufinfo.len); |
189 | } |
190 | } |
191 | } |
192 | |
193 | STATIC mp_obj_t bytes_make_new(const mp_obj_type_t *type_in, size_t n_args, size_t n_kw, const mp_obj_t *args) { |
194 | (void)type_in; |
195 | |
196 | #if MICROPY_CPYTHON_COMPAT |
197 | if (n_kw != 0) { |
198 | mp_arg_error_unimpl_kw(); |
199 | } |
200 | #else |
201 | (void)n_kw; |
202 | #endif |
203 | |
204 | if (n_args == 0) { |
205 | return mp_const_empty_bytes; |
206 | } |
207 | |
208 | if (mp_obj_is_type(args[0], &mp_type_bytes)) { |
209 | return args[0]; |
210 | } |
211 | |
212 | if (mp_obj_is_str(args[0])) { |
213 | if (n_args < 2 || n_args > 3) { |
214 | goto wrong_args; |
215 | } |
216 | GET_STR_DATA_LEN(args[0], str_data, str_len); |
217 | GET_STR_HASH(args[0], str_hash); |
218 | if (str_hash == 0) { |
219 | str_hash = qstr_compute_hash(str_data, str_len); |
220 | } |
221 | mp_obj_str_t *o = MP_OBJ_TO_PTR(mp_obj_new_str_copy(&mp_type_bytes, NULL, str_len)); |
222 | o->data = str_data; |
223 | o->hash = str_hash; |
224 | return MP_OBJ_FROM_PTR(o); |
225 | } |
226 | |
227 | if (n_args > 1) { |
228 | goto wrong_args; |
229 | } |
230 | |
231 | if (mp_obj_is_small_int(args[0])) { |
232 | mp_int_t len = MP_OBJ_SMALL_INT_VALUE(args[0]); |
233 | if (len < 0) { |
234 | mp_raise_ValueError(NULL); |
235 | } |
236 | vstr_t vstr; |
237 | vstr_init_len(&vstr, len); |
238 | memset(vstr.buf, 0, len); |
239 | return mp_obj_new_str_from_vstr(&mp_type_bytes, &vstr); |
240 | } |
241 | |
242 | // check if argument has the buffer protocol |
243 | mp_buffer_info_t bufinfo; |
244 | if (mp_get_buffer(args[0], &bufinfo, MP_BUFFER_READ)) { |
245 | return mp_obj_new_bytes(bufinfo.buf, bufinfo.len); |
246 | } |
247 | |
248 | vstr_t vstr; |
249 | // Try to create array of exact len if initializer len is known |
250 | mp_obj_t len_in = mp_obj_len_maybe(args[0]); |
251 | if (len_in == MP_OBJ_NULL) { |
252 | vstr_init(&vstr, 16); |
253 | } else { |
254 | mp_int_t len = MP_OBJ_SMALL_INT_VALUE(len_in); |
255 | vstr_init(&vstr, len); |
256 | } |
257 | |
258 | mp_obj_iter_buf_t iter_buf; |
259 | mp_obj_t iterable = mp_getiter(args[0], &iter_buf); |
260 | mp_obj_t item; |
261 | while ((item = mp_iternext(iterable)) != MP_OBJ_STOP_ITERATION) { |
262 | mp_int_t val = mp_obj_get_int(item); |
263 | #if MICROPY_FULL_CHECKS |
264 | if (val < 0 || val > 255) { |
265 | mp_raise_ValueError(MP_ERROR_TEXT("bytes value out of range" )); |
266 | } |
267 | #endif |
268 | vstr_add_byte(&vstr, val); |
269 | } |
270 | |
271 | return mp_obj_new_str_from_vstr(&mp_type_bytes, &vstr); |
272 | |
273 | wrong_args: |
274 | mp_raise_TypeError(MP_ERROR_TEXT("wrong number of arguments" )); |
275 | } |
276 | |
277 | // like strstr but with specified length and allows \0 bytes |
278 | // TODO replace with something more efficient/standard |
279 | const byte *find_subbytes(const byte *haystack, size_t hlen, const byte *needle, size_t nlen, int direction) { |
280 | if (hlen >= nlen) { |
281 | size_t str_index, str_index_end; |
282 | if (direction > 0) { |
283 | str_index = 0; |
284 | str_index_end = hlen - nlen; |
285 | } else { |
286 | str_index = hlen - nlen; |
287 | str_index_end = 0; |
288 | } |
289 | for (;;) { |
290 | if (memcmp(&haystack[str_index], needle, nlen) == 0) { |
291 | // found |
292 | return haystack + str_index; |
293 | } |
294 | if (str_index == str_index_end) { |
295 | // not found |
296 | break; |
297 | } |
298 | str_index += direction; |
299 | } |
300 | } |
301 | return NULL; |
302 | } |
303 | |
304 | // Note: this function is used to check if an object is a str or bytes, which |
305 | // works because both those types use it as their binary_op method. Revisit |
306 | // mp_obj_is_str_or_bytes if this fact changes. |
307 | mp_obj_t mp_obj_str_binary_op(mp_binary_op_t op, mp_obj_t lhs_in, mp_obj_t rhs_in) { |
308 | // check for modulo |
309 | if (op == MP_BINARY_OP_MODULO) { |
310 | #if MICROPY_PY_BUILTINS_STR_OP_MODULO |
311 | mp_obj_t *args = &rhs_in; |
312 | size_t n_args = 1; |
313 | mp_obj_t dict = MP_OBJ_NULL; |
314 | if (mp_obj_is_type(rhs_in, &mp_type_tuple)) { |
315 | // TODO: Support tuple subclasses? |
316 | mp_obj_tuple_get(rhs_in, &n_args, &args); |
317 | } else if (mp_obj_is_type(rhs_in, &mp_type_dict)) { |
318 | dict = rhs_in; |
319 | } |
320 | return str_modulo_format(lhs_in, n_args, args, dict); |
321 | #else |
322 | return MP_OBJ_NULL; |
323 | #endif |
324 | } |
325 | |
326 | // from now on we need lhs type and data, so extract them |
327 | const mp_obj_type_t *lhs_type = mp_obj_get_type(lhs_in); |
328 | GET_STR_DATA_LEN(lhs_in, lhs_data, lhs_len); |
329 | |
330 | // check for multiply |
331 | if (op == MP_BINARY_OP_MULTIPLY) { |
332 | mp_int_t n; |
333 | if (!mp_obj_get_int_maybe(rhs_in, &n)) { |
334 | return MP_OBJ_NULL; // op not supported |
335 | } |
336 | if (n <= 0) { |
337 | if (lhs_type == &mp_type_str) { |
338 | return MP_OBJ_NEW_QSTR(MP_QSTR_); // empty str |
339 | } else { |
340 | return mp_const_empty_bytes; |
341 | } |
342 | } |
343 | vstr_t vstr; |
344 | vstr_init_len(&vstr, lhs_len * n); |
345 | mp_seq_multiply(lhs_data, sizeof(*lhs_data), lhs_len, n, vstr.buf); |
346 | return mp_obj_new_str_from_vstr(lhs_type, &vstr); |
347 | } |
348 | |
349 | // From now on all operations allow: |
350 | // - str with str |
351 | // - bytes with bytes |
352 | // - bytes with bytearray |
353 | // - bytes with array.array |
354 | // To do this efficiently we use the buffer protocol to extract the raw |
355 | // data for the rhs, but only if the lhs is a bytes object. |
356 | // |
357 | // NOTE: CPython does not allow comparison between bytes ard array.array |
358 | // (even if the array is of type 'b'), even though it allows addition of |
359 | // such types. We are not compatible with this (we do allow comparison |
360 | // of bytes with anything that has the buffer protocol). It would be |
361 | // easy to "fix" this with a bit of extra logic below, but it costs code |
362 | // size and execution time so we don't. |
363 | |
364 | const byte *rhs_data; |
365 | size_t rhs_len; |
366 | if (lhs_type == mp_obj_get_type(rhs_in)) { |
367 | GET_STR_DATA_LEN(rhs_in, rhs_data_, rhs_len_); |
368 | rhs_data = rhs_data_; |
369 | rhs_len = rhs_len_; |
370 | } else if (lhs_type == &mp_type_bytes) { |
371 | mp_buffer_info_t bufinfo; |
372 | if (!mp_get_buffer(rhs_in, &bufinfo, MP_BUFFER_READ)) { |
373 | return MP_OBJ_NULL; // op not supported |
374 | } |
375 | rhs_data = bufinfo.buf; |
376 | rhs_len = bufinfo.len; |
377 | } else { |
378 | // LHS is str and RHS has an incompatible type |
379 | // (except if operation is EQUAL, but that's handled by mp_obj_equal) |
380 | bad_implicit_conversion(rhs_in); |
381 | } |
382 | |
383 | switch (op) { |
384 | case MP_BINARY_OP_ADD: |
385 | case MP_BINARY_OP_INPLACE_ADD: { |
386 | if (lhs_len == 0 && mp_obj_get_type(rhs_in) == lhs_type) { |
387 | return rhs_in; |
388 | } |
389 | if (rhs_len == 0) { |
390 | return lhs_in; |
391 | } |
392 | |
393 | vstr_t vstr; |
394 | vstr_init_len(&vstr, lhs_len + rhs_len); |
395 | memcpy(vstr.buf, lhs_data, lhs_len); |
396 | memcpy(vstr.buf + lhs_len, rhs_data, rhs_len); |
397 | return mp_obj_new_str_from_vstr(lhs_type, &vstr); |
398 | } |
399 | |
400 | case MP_BINARY_OP_CONTAINS: |
401 | return mp_obj_new_bool(find_subbytes(lhs_data, lhs_len, rhs_data, rhs_len, 1) != NULL); |
402 | |
403 | // case MP_BINARY_OP_NOT_EQUAL: // This is never passed here |
404 | case MP_BINARY_OP_EQUAL: // This will be passed only for bytes, str is dealt with in mp_obj_equal() |
405 | case MP_BINARY_OP_LESS: |
406 | case MP_BINARY_OP_LESS_EQUAL: |
407 | case MP_BINARY_OP_MORE: |
408 | case MP_BINARY_OP_MORE_EQUAL: |
409 | return mp_obj_new_bool(mp_seq_cmp_bytes(op, lhs_data, lhs_len, rhs_data, rhs_len)); |
410 | |
411 | default: |
412 | return MP_OBJ_NULL; // op not supported |
413 | } |
414 | } |
415 | |
416 | #if !MICROPY_PY_BUILTINS_STR_UNICODE |
417 | // objstrunicode defines own version |
418 | const byte *str_index_to_ptr(const mp_obj_type_t *type, const byte *self_data, size_t self_len, |
419 | mp_obj_t index, bool is_slice) { |
420 | size_t index_val = mp_get_index(type, self_len, index, is_slice); |
421 | return self_data + index_val; |
422 | } |
423 | #endif |
424 | |
425 | // This is used for both bytes and 8-bit strings. This is not used for unicode strings. |
426 | STATIC mp_obj_t bytes_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) { |
427 | const mp_obj_type_t *type = mp_obj_get_type(self_in); |
428 | GET_STR_DATA_LEN(self_in, self_data, self_len); |
429 | if (value == MP_OBJ_SENTINEL) { |
430 | // load |
431 | #if MICROPY_PY_BUILTINS_SLICE |
432 | if (mp_obj_is_type(index, &mp_type_slice)) { |
433 | mp_bound_slice_t slice; |
434 | if (!mp_seq_get_fast_slice_indexes(self_len, index, &slice)) { |
435 | mp_raise_NotImplementedError(MP_ERROR_TEXT("only slices with step=1 (aka None) are supported" )); |
436 | } |
437 | return mp_obj_new_str_of_type(type, self_data + slice.start, slice.stop - slice.start); |
438 | } |
439 | #endif |
440 | size_t index_val = mp_get_index(type, self_len, index, false); |
441 | // If we have unicode enabled the type will always be bytes, so take the short cut. |
442 | if (MICROPY_PY_BUILTINS_STR_UNICODE || type == &mp_type_bytes) { |
443 | return MP_OBJ_NEW_SMALL_INT(self_data[index_val]); |
444 | } else { |
445 | return mp_obj_new_str_via_qstr((char *)&self_data[index_val], 1); |
446 | } |
447 | } else { |
448 | return MP_OBJ_NULL; // op not supported |
449 | } |
450 | } |
451 | |
452 | STATIC mp_obj_t str_join(mp_obj_t self_in, mp_obj_t arg) { |
453 | mp_check_self(mp_obj_is_str_or_bytes(self_in)); |
454 | const mp_obj_type_t *self_type = mp_obj_get_type(self_in); |
455 | |
456 | // get separation string |
457 | GET_STR_DATA_LEN(self_in, sep_str, sep_len); |
458 | |
459 | // process args |
460 | size_t seq_len; |
461 | mp_obj_t *seq_items; |
462 | |
463 | if (!mp_obj_is_type(arg, &mp_type_list) && !mp_obj_is_type(arg, &mp_type_tuple)) { |
464 | // arg is not a list nor a tuple, try to convert it to a list |
465 | // TODO: Try to optimize? |
466 | arg = mp_type_list.make_new(&mp_type_list, 1, 0, &arg); |
467 | } |
468 | mp_obj_get_array(arg, &seq_len, &seq_items); |
469 | |
470 | // count required length |
471 | size_t required_len = 0; |
472 | for (size_t i = 0; i < seq_len; i++) { |
473 | if (mp_obj_get_type(seq_items[i]) != self_type) { |
474 | mp_raise_TypeError( |
475 | MP_ERROR_TEXT("join expects a list of str/bytes objects consistent with self object" )); |
476 | } |
477 | if (i > 0) { |
478 | required_len += sep_len; |
479 | } |
480 | GET_STR_LEN(seq_items[i], l); |
481 | required_len += l; |
482 | } |
483 | |
484 | // make joined string |
485 | vstr_t vstr; |
486 | vstr_init_len(&vstr, required_len); |
487 | byte *data = (byte *)vstr.buf; |
488 | for (size_t i = 0; i < seq_len; i++) { |
489 | if (i > 0) { |
490 | memcpy(data, sep_str, sep_len); |
491 | data += sep_len; |
492 | } |
493 | GET_STR_DATA_LEN(seq_items[i], s, l); |
494 | memcpy(data, s, l); |
495 | data += l; |
496 | } |
497 | |
498 | // return joined string |
499 | return mp_obj_new_str_from_vstr(self_type, &vstr); |
500 | } |
501 | MP_DEFINE_CONST_FUN_OBJ_2(str_join_obj, str_join); |
502 | |
503 | mp_obj_t mp_obj_str_split(size_t n_args, const mp_obj_t *args) { |
504 | const mp_obj_type_t *self_type = mp_obj_get_type(args[0]); |
505 | mp_int_t splits = -1; |
506 | mp_obj_t sep = mp_const_none; |
507 | if (n_args > 1) { |
508 | sep = args[1]; |
509 | if (n_args > 2) { |
510 | splits = mp_obj_get_int(args[2]); |
511 | } |
512 | } |
513 | |
514 | mp_obj_t res = mp_obj_new_list(0, NULL); |
515 | GET_STR_DATA_LEN(args[0], s, len); |
516 | const byte *top = s + len; |
517 | |
518 | if (sep == mp_const_none) { |
519 | // sep not given, so separate on whitespace |
520 | |
521 | // Initial whitespace is not counted as split, so we pre-do it |
522 | while (s < top && unichar_isspace(*s)) { |
523 | s++; |
524 | } |
525 | while (s < top && splits != 0) { |
526 | const byte *start = s; |
527 | while (s < top && !unichar_isspace(*s)) { |
528 | s++; |
529 | } |
530 | mp_obj_list_append(res, mp_obj_new_str_of_type(self_type, start, s - start)); |
531 | if (s >= top) { |
532 | break; |
533 | } |
534 | while (s < top && unichar_isspace(*s)) { |
535 | s++; |
536 | } |
537 | if (splits > 0) { |
538 | splits--; |
539 | } |
540 | } |
541 | |
542 | if (s < top) { |
543 | mp_obj_list_append(res, mp_obj_new_str_of_type(self_type, s, top - s)); |
544 | } |
545 | |
546 | } else { |
547 | // sep given |
548 | if (mp_obj_get_type(sep) != self_type) { |
549 | bad_implicit_conversion(sep); |
550 | } |
551 | |
552 | size_t sep_len; |
553 | const char *sep_str = mp_obj_str_get_data(sep, &sep_len); |
554 | |
555 | if (sep_len == 0) { |
556 | mp_raise_ValueError(MP_ERROR_TEXT("empty separator" )); |
557 | } |
558 | |
559 | for (;;) { |
560 | const byte *start = s; |
561 | for (;;) { |
562 | if (splits == 0 || s + sep_len > top) { |
563 | s = top; |
564 | break; |
565 | } else if (memcmp(s, sep_str, sep_len) == 0) { |
566 | break; |
567 | } |
568 | s++; |
569 | } |
570 | mp_obj_list_append(res, mp_obj_new_str_of_type(self_type, start, s - start)); |
571 | if (s >= top) { |
572 | break; |
573 | } |
574 | s += sep_len; |
575 | if (splits > 0) { |
576 | splits--; |
577 | } |
578 | } |
579 | } |
580 | |
581 | return res; |
582 | } |
583 | MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_split_obj, 1, 3, mp_obj_str_split); |
584 | |
#if MICROPY_PY_BUILTINS_STR_SPLITLINES
// splitlines(keepends=False): split self (pos_args[0]) at line boundaries
// and return a list of objects of the same type as self.
//
// Recognised line terminators are "\n", "\r" and the pair "\r\n".  The
// terminator is included in each item only when keepends is true.  No empty
// trailing item is produced when the data ends with a terminator.
STATIC mp_obj_t str_splitlines(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
    enum { ARG_keepends };
    static const mp_arg_t allowed_args[] = {
        { MP_QSTR_keepends, MP_ARG_BOOL, {.u_bool = false} },
    };

    // parse args (pos_args[0] is self and is not part of the parsed set)
    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
    mp_arg_parse_all(n_args - 1, pos_args + 1, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);

    const mp_obj_type_t *self_type = mp_obj_get_type(pos_args[0]);
    mp_obj_t res = mp_obj_new_list(0, NULL);

    GET_STR_DATA_LEN(pos_args[0], s, len);
    const byte *top = s + len;

    while (s < top) {
        const byte *start = s;
        // length of the terminator found (0 if the data ended first)
        size_t match = 0;
        while (s < top) {
            if (*s == '\n') {
                match = 1;
                break;
            } else if (*s == '\r') {
                // Only look ahead for the '\n' of a "\r\n" pair if that
                // byte is still inside the data.  This keeps the function
                // from reading past `top`, and guarantees that `s += match`
                // below never moves beyond the end.
                if (s + 1 < top && s[1] == '\n') {
                    match = 2;
                } else {
                    match = 1;
                }
                break;
            }
            s++;
        }
        size_t sub_len = s - start;
        if (args[ARG_keepends].u_bool) {
            sub_len += match;
        }
        mp_obj_list_append(res, mp_obj_new_str_of_type(self_type, start, sub_len));
        s += match;
    }

    return res;
}
MP_DEFINE_CONST_FUN_OBJ_KW(str_splitlines_obj, 1, str_splitlines);
#endif
631 | |
632 | STATIC mp_obj_t str_rsplit(size_t n_args, const mp_obj_t *args) { |
633 | if (n_args < 3) { |
634 | // If we don't have split limit, it doesn't matter from which side |
635 | // we split. |
636 | return mp_obj_str_split(n_args, args); |
637 | } |
638 | const mp_obj_type_t *self_type = mp_obj_get_type(args[0]); |
639 | mp_obj_t sep = args[1]; |
640 | GET_STR_DATA_LEN(args[0], s, len); |
641 | |
642 | mp_int_t splits = mp_obj_get_int(args[2]); |
643 | if (splits < 0) { |
644 | // Negative limit means no limit, so delegate to split(). |
645 | return mp_obj_str_split(n_args, args); |
646 | } |
647 | |
648 | mp_int_t org_splits = splits; |
649 | // Preallocate list to the max expected # of elements, as we |
650 | // will fill it from the end. |
651 | mp_obj_list_t *res = MP_OBJ_TO_PTR(mp_obj_new_list(splits + 1, NULL)); |
652 | mp_int_t idx = splits; |
653 | |
654 | if (sep == mp_const_none) { |
655 | mp_raise_NotImplementedError(MP_ERROR_TEXT("rsplit(None,n)" )); |
656 | } else { |
657 | size_t sep_len; |
658 | const char *sep_str = mp_obj_str_get_data(sep, &sep_len); |
659 | |
660 | if (sep_len == 0) { |
661 | mp_raise_ValueError(MP_ERROR_TEXT("empty separator" )); |
662 | } |
663 | |
664 | const byte *beg = s; |
665 | const byte *last = s + len; |
666 | for (;;) { |
667 | s = last - sep_len; |
668 | for (;;) { |
669 | if (splits == 0 || s < beg) { |
670 | break; |
671 | } else if (memcmp(s, sep_str, sep_len) == 0) { |
672 | break; |
673 | } |
674 | s--; |
675 | } |
676 | if (s < beg || splits == 0) { |
677 | res->items[idx] = mp_obj_new_str_of_type(self_type, beg, last - beg); |
678 | break; |
679 | } |
680 | res->items[idx--] = mp_obj_new_str_of_type(self_type, s + sep_len, last - s - sep_len); |
681 | last = s; |
682 | splits--; |
683 | } |
684 | if (idx != 0) { |
685 | // We split less parts than split limit, now go cleanup surplus |
686 | size_t used = org_splits + 1 - idx; |
687 | memmove(res->items, &res->items[idx], used * sizeof(mp_obj_t)); |
688 | mp_seq_clear(res->items, used, res->alloc, sizeof(*res->items)); |
689 | res->len = used; |
690 | } |
691 | } |
692 | |
693 | return MP_OBJ_FROM_PTR(res); |
694 | } |
695 | MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_rsplit_obj, 1, 3, str_rsplit); |
696 | |
697 | STATIC mp_obj_t str_finder(size_t n_args, const mp_obj_t *args, int direction, bool is_index) { |
698 | const mp_obj_type_t *self_type = mp_obj_get_type(args[0]); |
699 | mp_check_self(mp_obj_is_str_or_bytes(args[0])); |
700 | |
701 | // check argument type |
702 | if (mp_obj_get_type(args[1]) != self_type) { |
703 | bad_implicit_conversion(args[1]); |
704 | } |
705 | |
706 | GET_STR_DATA_LEN(args[0], haystack, haystack_len); |
707 | GET_STR_DATA_LEN(args[1], needle, needle_len); |
708 | |
709 | const byte *start = haystack; |
710 | const byte *end = haystack + haystack_len; |
711 | if (n_args >= 3 && args[2] != mp_const_none) { |
712 | start = str_index_to_ptr(self_type, haystack, haystack_len, args[2], true); |
713 | } |
714 | if (n_args >= 4 && args[3] != mp_const_none) { |
715 | end = str_index_to_ptr(self_type, haystack, haystack_len, args[3], true); |
716 | } |
717 | |
718 | if (end < start) { |
719 | goto out_error; |
720 | } |
721 | |
722 | const byte *p = find_subbytes(start, end - start, needle, needle_len, direction); |
723 | if (p == NULL) { |
724 | out_error: |
725 | // not found |
726 | if (is_index) { |
727 | mp_raise_ValueError(MP_ERROR_TEXT("substring not found" )); |
728 | } else { |
729 | return MP_OBJ_NEW_SMALL_INT(-1); |
730 | } |
731 | } else { |
732 | // found |
733 | #if MICROPY_PY_BUILTINS_STR_UNICODE |
734 | if (self_type == &mp_type_str) { |
735 | return MP_OBJ_NEW_SMALL_INT(utf8_ptr_to_index(haystack, p)); |
736 | } |
737 | #endif |
738 | return MP_OBJ_NEW_SMALL_INT(p - haystack); |
739 | } |
740 | } |
741 | |
742 | STATIC mp_obj_t str_find(size_t n_args, const mp_obj_t *args) { |
743 | return str_finder(n_args, args, 1, false); |
744 | } |
745 | MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_find_obj, 2, 4, str_find); |
746 | |
747 | STATIC mp_obj_t str_rfind(size_t n_args, const mp_obj_t *args) { |
748 | return str_finder(n_args, args, -1, false); |
749 | } |
750 | MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_rfind_obj, 2, 4, str_rfind); |
751 | |
752 | STATIC mp_obj_t str_index(size_t n_args, const mp_obj_t *args) { |
753 | return str_finder(n_args, args, 1, true); |
754 | } |
755 | MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_index_obj, 2, 4, str_index); |
756 | |
757 | STATIC mp_obj_t str_rindex(size_t n_args, const mp_obj_t *args) { |
758 | return str_finder(n_args, args, -1, true); |
759 | } |
760 | MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_rindex_obj, 2, 4, str_rindex); |
761 | |
762 | // TODO: (Much) more variety in args |
763 | STATIC mp_obj_t str_startswith(size_t n_args, const mp_obj_t *args) { |
764 | const mp_obj_type_t *self_type = mp_obj_get_type(args[0]); |
765 | GET_STR_DATA_LEN(args[0], str, str_len); |
766 | size_t prefix_len; |
767 | const char *prefix = mp_obj_str_get_data(args[1], &prefix_len); |
768 | const byte *start = str; |
769 | if (n_args > 2) { |
770 | start = str_index_to_ptr(self_type, str, str_len, args[2], true); |
771 | } |
772 | if (prefix_len + (start - str) > str_len) { |
773 | return mp_const_false; |
774 | } |
775 | return mp_obj_new_bool(memcmp(start, prefix, prefix_len) == 0); |
776 | } |
777 | MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_startswith_obj, 2, 3, str_startswith); |
778 | |
779 | STATIC mp_obj_t str_endswith(size_t n_args, const mp_obj_t *args) { |
780 | GET_STR_DATA_LEN(args[0], str, str_len); |
781 | size_t suffix_len; |
782 | const char *suffix = mp_obj_str_get_data(args[1], &suffix_len); |
783 | if (n_args > 2) { |
784 | mp_raise_NotImplementedError(MP_ERROR_TEXT("start/end indices" )); |
785 | } |
786 | |
787 | if (suffix_len > str_len) { |
788 | return mp_const_false; |
789 | } |
790 | return mp_obj_new_bool(memcmp(str + (str_len - suffix_len), suffix, suffix_len) == 0); |
791 | } |
792 | MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_endswith_obj, 2, 3, str_endswith); |
793 | |
794 | enum { LSTRIP, RSTRIP, STRIP }; |
795 | |
796 | STATIC mp_obj_t str_uni_strip(int type, size_t n_args, const mp_obj_t *args) { |
797 | mp_check_self(mp_obj_is_str_or_bytes(args[0])); |
798 | const mp_obj_type_t *self_type = mp_obj_get_type(args[0]); |
799 | |
800 | const byte *chars_to_del; |
801 | uint chars_to_del_len; |
802 | static const byte whitespace[] = " \t\n\r\v\f" ; |
803 | |
804 | if (n_args == 1) { |
805 | chars_to_del = whitespace; |
806 | chars_to_del_len = sizeof(whitespace) - 1; |
807 | } else { |
808 | if (mp_obj_get_type(args[1]) != self_type) { |
809 | bad_implicit_conversion(args[1]); |
810 | } |
811 | GET_STR_DATA_LEN(args[1], s, l); |
812 | chars_to_del = s; |
813 | chars_to_del_len = l; |
814 | } |
815 | |
816 | GET_STR_DATA_LEN(args[0], orig_str, orig_str_len); |
817 | |
818 | size_t first_good_char_pos = 0; |
819 | bool first_good_char_pos_set = false; |
820 | size_t last_good_char_pos = 0; |
821 | size_t i = 0; |
822 | int delta = 1; |
823 | if (type == RSTRIP) { |
824 | i = orig_str_len - 1; |
825 | delta = -1; |
826 | } |
827 | for (size_t len = orig_str_len; len > 0; len--) { |
828 | if (find_subbytes(chars_to_del, chars_to_del_len, &orig_str[i], 1, 1) == NULL) { |
829 | if (!first_good_char_pos_set) { |
830 | first_good_char_pos_set = true; |
831 | first_good_char_pos = i; |
832 | if (type == LSTRIP) { |
833 | last_good_char_pos = orig_str_len - 1; |
834 | break; |
835 | } else if (type == RSTRIP) { |
836 | first_good_char_pos = 0; |
837 | last_good_char_pos = i; |
838 | break; |
839 | } |
840 | } |
841 | last_good_char_pos = i; |
842 | } |
843 | i += delta; |
844 | } |
845 | |
846 | if (!first_good_char_pos_set) { |
847 | // string is all whitespace, return '' |
848 | if (self_type == &mp_type_str) { |
849 | return MP_OBJ_NEW_QSTR(MP_QSTR_); |
850 | } else { |
851 | return mp_const_empty_bytes; |
852 | } |
853 | } |
854 | |
855 | assert(last_good_char_pos >= first_good_char_pos); |
856 | // +1 to accommodate the last character |
857 | size_t stripped_len = last_good_char_pos - first_good_char_pos + 1; |
858 | if (stripped_len == orig_str_len) { |
859 | // If nothing was stripped, don't bother to dup original string |
860 | // TODO: watch out for this case when we'll get to bytearray.strip() |
861 | assert(first_good_char_pos == 0); |
862 | return args[0]; |
863 | } |
864 | return mp_obj_new_str_of_type(self_type, orig_str + first_good_char_pos, stripped_len); |
865 | } |
866 | |
867 | STATIC mp_obj_t str_strip(size_t n_args, const mp_obj_t *args) { |
868 | return str_uni_strip(STRIP, n_args, args); |
869 | } |
870 | MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_strip_obj, 1, 2, str_strip); |
871 | |
872 | STATIC mp_obj_t str_lstrip(size_t n_args, const mp_obj_t *args) { |
873 | return str_uni_strip(LSTRIP, n_args, args); |
874 | } |
875 | MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_lstrip_obj, 1, 2, str_lstrip); |
876 | |
877 | STATIC mp_obj_t str_rstrip(size_t n_args, const mp_obj_t *args) { |
878 | return str_uni_strip(RSTRIP, n_args, args); |
879 | } |
880 | MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_rstrip_obj, 1, 2, str_rstrip); |
881 | |
#if MICROPY_PY_BUILTINS_STR_CENTER
// center(width): return the data of str_in centred in a field of `width`
// bytes, padded with spaces.  If the padding cannot be split evenly the
// extra space goes to the right.  If width is not larger than the length of
// the data (this includes zero and negative widths), str_in itself is
// returned.
STATIC mp_obj_t str_center(mp_obj_t str_in, mp_obj_t width_in) {
    GET_STR_DATA_LEN(str_in, str, str_len);

    // Keep the width signed for the range check: storing a negative value
    // in an unsigned variable would turn it into a huge field width and
    // lead to an attempt to allocate that much memory.
    mp_int_t width = mp_obj_get_int(width_in);
    if (width <= 0 || str_len >= (size_t)width) {
        return str_in;
    }

    vstr_t vstr;
    vstr_init_len(&vstr, width);
    memset(vstr.buf, ' ', width);
    // size_t so that the offset cannot be truncated for large widths
    size_t left = ((size_t)width - str_len) / 2;
    memcpy(vstr.buf + left, str, str_len);
    return mp_obj_new_str_from_vstr(mp_obj_get_type(str_in), &vstr);
}
MP_DEFINE_CONST_FUN_OBJ_2(str_center_obj, str_center);
#endif
899 | |
// Parses a run of decimal digits from the range [str, top).
// Takes an int arg, but only parses unsigned numbers, and only changes
// *num if at least one digit was parsed.
// A value too large to fit in an int saturates at the largest int value
// instead of overflowing (signed overflow is undefined behaviour); all of the
// digits are still consumed.
// Returns a pointer to the first character that was not consumed.
STATIC const char *str_to_int(const char *str, const char *top, int *num) {
    if (str < top && '0' <= *str && *str <= '9') {
        // Largest int value, computed here so no extra header is needed
        // (equals INT_MAX when int and unsigned int have the same width and
        // no padding bits).
        const unsigned int max_value = ~0u >> 1;
        unsigned int value = 0;
        do {
            unsigned int digit = (unsigned int)(*str - '0');
            if (value > (max_value - digit) / 10) {
                // value * 10 + digit would exceed max_value: clamp
                value = max_value;
            } else {
                value = value * 10 + digit;
            }
            str++;
        } while (str < top && '0' <= *str && *str <= '9');
        *num = (int)value;
    }
    return str;
}
913 | |
// Returns true if ch is one of the alignment characters '<', '>', '=' or '^'.
// The NUL character is never an alignment character.
STATIC bool isalignment(char ch) {
    switch (ch) {
        case '<':
        case '>':
        case '=':
        case '^':
            return true;
        default:
            return false;
    }
}
917 | |
// Returns true if ch is one of the presentation type characters
// "bcdeEfFgGnosxX%".  The NUL character is never a type character.
STATIC bool istype(char ch) {
    switch (ch) {
        case 'b':
        case 'c':
        case 'd':
        case 'e':
        case 'E':
        case 'f':
        case 'F':
        case 'g':
        case 'G':
        case 'n':
        case 'o':
        case 's':
        case 'x':
        case 'X':
        case '%':
            return true;
        default:
            return false;
    }
}
921 | |
922 | STATIC bool arg_looks_integer(mp_obj_t arg) { |
923 | return mp_obj_is_bool(arg) || mp_obj_is_int(arg); |
924 | } |
925 | |
926 | STATIC bool arg_looks_numeric(mp_obj_t arg) { |
927 | return arg_looks_integer(arg) |
928 | #if MICROPY_PY_BUILTINS_FLOAT |
929 | || mp_obj_is_float(arg) |
930 | #endif |
931 | ; |
932 | } |
933 | |
934 | #if MICROPY_PY_BUILTINS_STR_OP_MODULO |
935 | STATIC mp_obj_t arg_as_int(mp_obj_t arg) { |
936 | #if MICROPY_PY_BUILTINS_FLOAT |
937 | if (mp_obj_is_float(arg)) { |
938 | return mp_obj_new_int_from_float(mp_obj_float_get(arg)); |
939 | } |
940 | #endif |
941 | return arg; |
942 | } |
943 | #endif |
944 | |
#if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
// With terse error reporting the formatting code in this file reports its
// errors through this single helper, which raises a ValueError carrying one
// generic message and never returns.
STATIC NORETURN void terse_str_format_value_error(void) {
    mp_raise_ValueError(MP_ERROR_TEXT("bad format string"));
}
#else
// define to nothing to improve coverage
// (with the other error reporting levels any call expands to an empty
// statement)
#define terse_str_format_value_error()
#endif
953 | |
// Expands the "{}"-style template found in [str, top) and returns the result
// as a vstr (by value).
//
//  str, top - start and end (exclusive) of the template text
//  arg_i    - in/out state for automatic field numbering: the index of the
//             next positional argument to use; it is set to -1 as soon as a
//             manually numbered field is seen
//  n_args   - number of entries in args
//  args     - positional arguments; field N is taken from args[N + 1], entry 0
//             is not used as a field value
//  kwargs   - map in which named fields are looked up
//
// Raises ValueError for malformed templates, IndexError/KeyError for fields
// that cannot be resolved and NotImplementedError for attribute/index access
// inside a field name.  The function calls itself to expand replacement fields
// nested inside a format specifier.
STATIC vstr_t mp_obj_str_format_helper(const char *str, const char *top, int *arg_i, size_t n_args, const mp_obj_t *args, mp_map_t *kwargs) {
    vstr_t vstr;
    mp_print_t print;
    vstr_init_print(&vstr, 16, &print);

    for (; str < top; str++) {
        // "}}" is an escaped closing brace; a lone '}' is an error
        if (*str == '}') {
            str++;
            if (str < top && *str == '}') {
                vstr_add_byte(&vstr, '}');
                continue;
            }
            #if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
            terse_str_format_value_error();
            #else
            mp_raise_ValueError(MP_ERROR_TEXT("single '}' encountered in format string"));
            #endif
        }
        // literal text is copied through unchanged
        if (*str != '{') {
            vstr_add_byte(&vstr, *str);
            continue;
        }

        // "{{" is an escaped opening brace
        str++;
        if (str < top && *str == '{') {
            vstr_add_byte(&vstr, '{');
            continue;
        }

        // replacement_field ::= "{" [field_name] ["!" conversion] [":" format_spec] "}"

        const char *field_name = NULL;
        const char *field_name_top = NULL;
        char conversion = '\0';
        const char *format_spec = NULL;

        // field_name runs up to the first '}', '!' or ':'
        if (str < top && *str != '}' && *str != '!' && *str != ':') {
            field_name = (const char *)str;
            while (str < top && *str != '}' && *str != '!' && *str != ':') {
                ++str;
            }
            field_name_top = (const char *)str;
        }

        // conversion ::= "r" | "s"

        if (str < top && *str == '!') {
            str++;
            if (str < top && (*str == 'r' || *str == 's')) {
                conversion = *str++;
            } else {
                #if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
                terse_str_format_value_error();
                #elif MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_NORMAL
                mp_raise_ValueError(MP_ERROR_TEXT("bad conversion specifier"));
                #else
                if (str >= top) {
                    mp_raise_ValueError(
                        MP_ERROR_TEXT("end of format while looking for conversion specifier"));
                } else {
                    mp_raise_msg_varg(&mp_type_ValueError,
                        MP_ERROR_TEXT("unknown conversion specifier %c"), *str);
                }
                #endif
            }
        }

        if (str < top && *str == ':') {
            str++;
            // {:} is the same as {}, which is the same as {!s}
            // This makes a difference when passing in a True or False
            // '{}'.format(True) returns 'True'
            // '{:d}'.format(True) returns '1'
            // So we treat {:} as {} and this later gets treated to be {!s}
            if (*str != '}') {
                format_spec = str;
                // find the '}' that closes this field, skipping over any
                // nested "{...}" pairs inside the format spec
                for (int nest = 1; str < top;) {
                    if (*str == '{') {
                        ++nest;
                    } else if (*str == '}') {
                        if (--nest == 0) {
                            break;
                        }
                    }
                    ++str;
                }
            }
        }
        // at this point str must be on the '}' that closes the field
        if (str >= top) {
            #if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
            terse_str_format_value_error();
            #else
            mp_raise_ValueError(MP_ERROR_TEXT("unmatched '{' in format"));
            #endif
        }
        if (*str != '}') {
            #if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
            terse_str_format_value_error();
            #else
            mp_raise_ValueError(MP_ERROR_TEXT("expected ':' after format specifier"));
            #endif
        }

        // resolve the field to the argument object it refers to
        mp_obj_t arg = mp_const_none;

        if (field_name) {
            int index = 0;
            if (MP_LIKELY(unichar_isdigit(*field_name))) {
                // manually numbered positional field; not allowed once
                // automatic numbering has consumed an argument
                if (*arg_i > 0) {
                    #if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
                    terse_str_format_value_error();
                    #else
                    mp_raise_ValueError(
                        MP_ERROR_TEXT("can't switch from automatic field numbering to manual field specification"));
                    #endif
                }
                field_name = str_to_int(field_name, field_name_top, &index);
                if ((uint)index >= n_args - 1) {
                    mp_raise_msg(&mp_type_IndexError, MP_ERROR_TEXT("tuple index out of range"));
                }
                arg = args[index + 1];
                *arg_i = -1;
            } else {
                // named field: the name ends at the first '.' or '['
                const char *lookup;
                for (lookup = field_name; lookup < field_name_top && *lookup != '.' && *lookup != '['; lookup++) {;
                }
                mp_obj_t field_q = mp_obj_new_str_via_qstr(field_name, lookup - field_name); // should it be via qstr?
                field_name = lookup;
                mp_map_elem_t *key_elem = mp_map_lookup(kwargs, field_q, MP_MAP_LOOKUP);
                if (key_elem == NULL) {
                    nlr_raise(mp_obj_new_exception_arg1(&mp_type_KeyError, field_q));
                }
                arg = key_elem->value;
            }
            // anything left over in the field name is an attribute or index
            // lookup, which is rejected
            if (field_name < field_name_top) {
                mp_raise_NotImplementedError(MP_ERROR_TEXT("attributes not supported yet"));
            }
        } else {
            // automatically numbered field; not allowed after a manually
            // numbered one (which set *arg_i to -1)
            if (*arg_i < 0) {
                #if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
                terse_str_format_value_error();
                #else
                mp_raise_ValueError(
                    MP_ERROR_TEXT("can't switch from manual field specification to automatic field numbering"));
                #endif
            }
            if ((uint)*arg_i >= n_args - 1) {
                mp_raise_msg(&mp_type_IndexError, MP_ERROR_TEXT("tuple index out of range"));
            }
            arg = args[(*arg_i) + 1];
            (*arg_i)++;
        }
        // with neither a format spec nor a conversion, behave like {!s}
        if (!format_spec && !conversion) {
            conversion = 's';
        }
        if (conversion) {
            // replace arg by its str()/repr() text, as a new str object
            mp_print_kind_t print_kind;
            if (conversion == 's') {
                print_kind = PRINT_STR;
            } else {
                assert(conversion == 'r');
                print_kind = PRINT_REPR;
            }
            vstr_t arg_vstr;
            mp_print_t arg_print;
            vstr_init_print(&arg_vstr, 16, &arg_print);
            mp_obj_print_helper(&arg_print, arg, print_kind);
            arg = mp_obj_new_str_from_vstr(&mp_type_str, &arg_vstr);
        }

        // values collected from the format spec; '\0' and -1 mean "not given"
        char fill = '\0';
        char align = '\0';
        int width = -1;
        int precision = -1;
        char type = '\0';
        int flags = 0;

        if (format_spec) {
            // The format specifier (from http://docs.python.org/2/library/string.html#formatspec)
            //
            // [[fill]align][sign][#][0][width][,][.precision][type]
            // fill        ::=  <any character>
            // align       ::=  "<" | ">" | "=" | "^"
            // sign        ::=  "+" | "-" | " "
            // width       ::=  integer
            // precision   ::=  integer
            // type        ::=  "b" | "c" | "d" | "e" | "E" | "f" | "F" | "g" | "G" | "n" | "o" | "s" | "x" | "X" | "%"

            // recursively call the formatter to format any nested specifiers
            MP_STACK_CHECK();
            vstr_t format_spec_vstr = mp_obj_str_format_helper(format_spec, str, arg_i, n_args, args, kwargs);
            const char *s = vstr_null_terminated_str(&format_spec_vstr);
            const char *stop = s + format_spec_vstr.len;
            // either "align" alone or "fill" followed by "align"
            if (isalignment(*s)) {
                align = *s++;
            } else if (*s && isalignment(s[1])) {
                fill = *s++;
                align = *s++;
            }
            // sign: '-' is accepted but sets no flag
            if (*s == '+' || *s == '-' || *s == ' ') {
                if (*s == '+') {
                    flags |= PF_FLAG_SHOW_SIGN;
                } else if (*s == ' ') {
                    flags |= PF_FLAG_SPACE_SIGN;
                }
                s++;
            }
            if (*s == '#') {
                flags |= PF_FLAG_SHOW_PREFIX;
                s++;
            }
            // a leading '0' selects '=' alignment and '0' fill unless they
            // were given explicitly; the '0' itself is then parsed as part of
            // the width below
            if (*s == '0') {
                if (!align) {
                    align = '=';
                }
                if (!fill) {
                    fill = '0';
                }
            }
            s = str_to_int(s, stop, &width);
            if (*s == ',') {
                flags |= PF_FLAG_SHOW_COMMA;
                s++;
            }
            if (*s == '.') {
                s++;
                s = str_to_int(s, stop, &precision);
            }
            if (istype(*s)) {
                type = *s++;
            }
            // anything left unparsed makes the spec invalid
            if (*s) {
                #if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
                terse_str_format_value_error();
                #else
                mp_raise_ValueError(MP_ERROR_TEXT("invalid format specifier"));
                #endif
            }
            vstr_clear(&format_spec_vstr);
        }
        // defaults: numbers are right-aligned, everything else left-aligned,
        // and the fill character is a space
        if (!align) {
            if (arg_looks_numeric(arg)) {
                align = '>';
            } else {
                align = '<';
            }
        }
        if (!fill) {
            fill = ' ';
        }

        // an explicit '+' or ' ' sign is rejected for the 's' and 'c' types
        if (flags & (PF_FLAG_SHOW_SIGN | PF_FLAG_SPACE_SIGN)) {
            if (type == 's') {
                #if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
                terse_str_format_value_error();
                #else
                mp_raise_ValueError(MP_ERROR_TEXT("sign not allowed in string format specifier"));
                #endif
            }
            if (type == 'c') {
                #if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
                terse_str_format_value_error();
                #else
                mp_raise_ValueError(
                    MP_ERROR_TEXT("sign not allowed with integer format specifier 'c'"));
                #endif
            }
        }

        // translate the alignment character into print flags ('>' needs none)
        switch (align) {
            case '<':
                flags |= PF_FLAG_LEFT_ADJUST;
                break;
            case '=':
                flags |= PF_FLAG_PAD_AFTER_SIGN;
                break;
            case '^':
                flags |= PF_FLAG_CENTER_ADJUST;
                break;
        }

        // Integer-like arguments: the integer presentation types are printed
        // here and move on to the next template character with `continue`.
        if (arg_looks_integer(arg)) {
            switch (type) {
                case 'b':
                    mp_print_mp_int(&print, arg, 2, 'a', flags, fill, width, 0);
                    continue;

                case 'c': {
                    char ch = mp_obj_get_int(arg);
                    mp_print_strn(&print, &ch, 1, flags, fill, width);
                    continue;
                }

                case '\0': // No explicit format type implies 'd'
                case 'n': // I don't think we support locales in uPy so use 'd'
                case 'd':
                    mp_print_mp_int(&print, arg, 10, 'a', flags, fill, width, 0);
                    continue;

                case 'o':
                    if (flags & PF_FLAG_SHOW_PREFIX) {
                        flags |= PF_FLAG_SHOW_OCTAL_LETTER;
                    }

                    mp_print_mp_int(&print, arg, 8, 'a', flags, fill, width, 0);
                    continue;

                case 'X':
                case 'x':
                    // 'x' - ('X' - 'A') is 'a' and 'X' - ('X' - 'A') is 'A',
                    // so the argument matches the case of the type character
                    mp_print_mp_int(&print, arg, 16, type - ('X' - 'A'), flags, fill, width, 0);
                    continue;

                case 'e':
                case 'E':
                case 'f':
                case 'F':
                case 'g':
                case 'G':
                case '%':
                    // The floating point formatters all work with anything that
                    // looks like an integer
                    break;

                default:
                    #if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
                    terse_str_format_value_error();
                    #else
                    mp_raise_msg_varg(&mp_type_ValueError,
                        MP_ERROR_TEXT("unknown format code '%c' for object of type '%s'"),
                        type, mp_obj_get_type_str(arg));
                    #endif
            }
        }

        // NOTE: no else here. We need the e, f, g etc formats for integer
        // arguments (from above if) to take this if.
        if (arg_looks_numeric(arg)) {
            if (!type) {

                // Even though the docs say that an unspecified type is the same
                // as 'g', there is one subtle difference, when the exponent
                // is one less than the precision.
                //
                // '{:10.1}'.format(0.0) ==> '0e+00'
                // '{:10.1g}'.format(0.0) ==> '0'
                //
                // TODO: Figure out how to deal with this.
                //
                // A proper solution would involve adding a special flag
                // or something to format_float, and create a format_double
                // to deal with doubles. In order to fix this when using
                // sprintf, we'd need to use the e format and tweak the
                // returned result to strip trailing zeros like the g format
                // does.
                //
                // {:10.3} and {:10.2e} with 1.23e2 both produce 1.23e+02
                // but with 1.e2 you get 1e+02 and 1.00e+02
                //
                // Stripping the trailing 0's (like g) does would make the
                // e format give us the right format.
                //
                // CPython sources say:
                //   Omitted type specifier.  Behaves in the same way as repr(x)
                //   and str(x) if no precision is given, else like 'g', but with
                //   at least one digit after the decimal point. */

                type = 'g';
            }
            // 'n' is treated as 'g' here
            if (type == 'n') {
                type = 'g';
            }

            switch (type) {
                #if MICROPY_PY_BUILTINS_FLOAT
                case 'e':
                case 'E':
                case 'f':
                case 'F':
                case 'g':
                case 'G':
                    mp_print_float(&print, mp_obj_get_float(arg), type, flags, fill, width, precision);
                    break;

                case '%':
                    // printed as the value times 100 in 'f' format, with
                    // PF_FLAG_ADD_PERCENT set
                    flags |= PF_FLAG_ADD_PERCENT;
                    #if MICROPY_FLOAT_IMPL == MICROPY_FLOAT_IMPL_FLOAT
                    #define F100 100.0F
                    #else
                    #define F100 100.0
                    #endif
                    mp_print_float(&print, mp_obj_get_float(arg) * F100, 'f', flags, fill, width, precision);
                    #undef F100
                    break;
                #endif

                default:
                    #if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
                    terse_str_format_value_error();
                    #else
                    mp_raise_msg_varg(&mp_type_ValueError,
                        MP_ERROR_TEXT("unknown format code '%c' for object of type '%s'"),
                        type, mp_obj_get_type_str(arg));
                    #endif
            }
        } else {
            // arg doesn't look like a number

            if (align == '=') {
                #if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
                terse_str_format_value_error();
                #else
                mp_raise_ValueError(
                    MP_ERROR_TEXT("'=' alignment not allowed in string format specifier"));
                #endif
            }

            switch (type) {
                case '\0': // no explicit format type implies 's'
                case 's': {
                    size_t slen;
                    const char *s = mp_obj_str_get_data(arg, &slen);
                    // the precision, when given, limits how much of the
                    // string data is printed
                    if (precision < 0) {
                        precision = slen;
                    }
                    if (slen > (size_t)precision) {
                        slen = precision;
                    }
                    mp_print_strn(&print, s, slen, flags, fill, width);
                    break;
                }

                default:
                    #if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
                    terse_str_format_value_error();
                    #else
                    mp_raise_msg_varg(&mp_type_ValueError,
                        MP_ERROR_TEXT("unknown format code '%c' for object of type '%s'"),
                        type, mp_obj_get_type_str(arg));
                    #endif
            }
        }
    }

    return vstr;
}
1399 | |
1400 | mp_obj_t mp_obj_str_format(size_t n_args, const mp_obj_t *args, mp_map_t *kwargs) { |
1401 | mp_check_self(mp_obj_is_str_or_bytes(args[0])); |
1402 | |
1403 | GET_STR_DATA_LEN(args[0], str, len); |
1404 | int arg_i = 0; |
1405 | vstr_t vstr = mp_obj_str_format_helper((const char *)str, (const char *)str + len, &arg_i, n_args, args, kwargs); |
1406 | return mp_obj_new_str_from_vstr(mp_obj_get_type(args[0]), &vstr); |
1407 | } |
1408 | MP_DEFINE_CONST_FUN_OBJ_KW(str_format_obj, 1, mp_obj_str_format); |
1409 | |
1410 | #if MICROPY_PY_BUILTINS_STR_OP_MODULO |
1411 | STATIC mp_obj_t str_modulo_format(mp_obj_t pattern, size_t n_args, const mp_obj_t *args, mp_obj_t dict) { |
1412 | mp_check_self(mp_obj_is_str_or_bytes(pattern)); |
1413 | |
1414 | GET_STR_DATA_LEN(pattern, str, len); |
1415 | #if MICROPY_ERROR_REPORTING != MICROPY_ERROR_REPORTING_TERSE |
1416 | const byte *start_str = str; |
1417 | #endif |
1418 | bool is_bytes = mp_obj_is_type(pattern, &mp_type_bytes); |
1419 | size_t arg_i = 0; |
1420 | vstr_t vstr; |
1421 | mp_print_t print; |
1422 | vstr_init_print(&vstr, 16, &print); |
1423 | |
1424 | for (const byte *top = str + len; str < top; str++) { |
1425 | mp_obj_t arg = MP_OBJ_NULL; |
1426 | if (*str != '%') { |
1427 | vstr_add_byte(&vstr, *str); |
1428 | continue; |
1429 | } |
1430 | if (++str >= top) { |
1431 | goto incomplete_format; |
1432 | } |
1433 | if (*str == '%') { |
1434 | vstr_add_byte(&vstr, '%'); |
1435 | continue; |
1436 | } |
1437 | |
1438 | // Dictionary value lookup |
1439 | if (*str == '(') { |
1440 | if (dict == MP_OBJ_NULL) { |
1441 | mp_raise_TypeError(MP_ERROR_TEXT("format needs a dict" )); |
1442 | } |
1443 | arg_i = 1; // we used up the single dict argument |
1444 | const byte *key = ++str; |
1445 | while (*str != ')') { |
1446 | if (str >= top) { |
1447 | #if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE |
1448 | terse_str_format_value_error(); |
1449 | #else |
1450 | mp_raise_ValueError(MP_ERROR_TEXT("incomplete format key" )); |
1451 | #endif |
1452 | } |
1453 | ++str; |
1454 | } |
1455 | mp_obj_t k_obj = mp_obj_new_str_via_qstr((const char *)key, str - key); |
1456 | arg = mp_obj_dict_get(dict, k_obj); |
1457 | str++; |
1458 | } |
1459 | |
1460 | int flags = 0; |
1461 | char fill = ' '; |
1462 | int alt = 0; |
1463 | while (str < top) { |
1464 | if (*str == '-') { |
1465 | flags |= PF_FLAG_LEFT_ADJUST; |
1466 | } else if (*str == '+') { |
1467 | flags |= PF_FLAG_SHOW_SIGN; |
1468 | } else if (*str == ' ') { |
1469 | flags |= PF_FLAG_SPACE_SIGN; |
1470 | } else if (*str == '#') { |
1471 | alt = PF_FLAG_SHOW_PREFIX; |
1472 | } else if (*str == '0') { |
1473 | flags |= PF_FLAG_PAD_AFTER_SIGN; |
1474 | fill = '0'; |
1475 | } else { |
1476 | break; |
1477 | } |
1478 | str++; |
1479 | } |
1480 | // parse width, if it exists |
1481 | int width = 0; |
1482 | if (str < top) { |
1483 | if (*str == '*') { |
1484 | if (arg_i >= n_args) { |
1485 | goto not_enough_args; |
1486 | } |
1487 | width = mp_obj_get_int(args[arg_i++]); |
1488 | str++; |
1489 | } else { |
1490 | str = (const byte *)str_to_int((const char *)str, (const char *)top, &width); |
1491 | } |
1492 | } |
1493 | int prec = -1; |
1494 | if (str < top && *str == '.') { |
1495 | if (++str < top) { |
1496 | if (*str == '*') { |
1497 | if (arg_i >= n_args) { |
1498 | goto not_enough_args; |
1499 | } |
1500 | prec = mp_obj_get_int(args[arg_i++]); |
1501 | str++; |
1502 | } else { |
1503 | prec = 0; |
1504 | str = (const byte *)str_to_int((const char *)str, (const char *)top, &prec); |
1505 | } |
1506 | } |
1507 | } |
1508 | |
1509 | if (str >= top) { |
1510 | incomplete_format: |
1511 | #if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE |
1512 | terse_str_format_value_error(); |
1513 | #else |
1514 | mp_raise_ValueError(MP_ERROR_TEXT("incomplete format" )); |
1515 | #endif |
1516 | } |
1517 | |
1518 | // Tuple value lookup |
1519 | if (arg == MP_OBJ_NULL) { |
1520 | if (arg_i >= n_args) { |
1521 | not_enough_args: |
1522 | mp_raise_TypeError(MP_ERROR_TEXT("format string needs more arguments" )); |
1523 | } |
1524 | arg = args[arg_i++]; |
1525 | } |
1526 | switch (*str) { |
1527 | case 'c': |
1528 | if (mp_obj_is_str(arg)) { |
1529 | size_t slen; |
1530 | const char *s = mp_obj_str_get_data(arg, &slen); |
1531 | if (slen != 1) { |
1532 | mp_raise_TypeError(MP_ERROR_TEXT("%c needs int or char" )); |
1533 | } |
1534 | mp_print_strn(&print, s, 1, flags, ' ', width); |
1535 | } else if (arg_looks_integer(arg)) { |
1536 | char ch = mp_obj_get_int(arg); |
1537 | mp_print_strn(&print, &ch, 1, flags, ' ', width); |
1538 | } else { |
1539 | mp_raise_TypeError(MP_ERROR_TEXT("integer needed" )); |
1540 | } |
1541 | break; |
1542 | |
1543 | case 'd': |
1544 | case 'i': |
1545 | case 'u': |
1546 | mp_print_mp_int(&print, arg_as_int(arg), 10, 'a', flags, fill, width, prec); |
1547 | break; |
1548 | |
1549 | #if MICROPY_PY_BUILTINS_FLOAT |
1550 | case 'e': |
1551 | case 'E': |
1552 | case 'f': |
1553 | case 'F': |
1554 | case 'g': |
1555 | case 'G': |
1556 | mp_print_float(&print, mp_obj_get_float(arg), *str, flags, fill, width, prec); |
1557 | break; |
1558 | #endif |
1559 | |
1560 | case 'o': |
1561 | if (alt) { |
1562 | flags |= (PF_FLAG_SHOW_PREFIX | PF_FLAG_SHOW_OCTAL_LETTER); |
1563 | } |
1564 | mp_print_mp_int(&print, arg, 8, 'a', flags, fill, width, prec); |
1565 | break; |
1566 | |
1567 | case 'r': |
1568 | case 's': { |
1569 | vstr_t arg_vstr; |
1570 | mp_print_t arg_print; |
1571 | vstr_init_print(&arg_vstr, 16, &arg_print); |
1572 | mp_print_kind_t print_kind = (*str == 'r' ? PRINT_REPR : PRINT_STR); |
1573 | if (print_kind == PRINT_STR && is_bytes && mp_obj_is_type(arg, &mp_type_bytes)) { |
1574 | // If we have something like b"%s" % b"1", bytes arg should be |
1575 | // printed undecorated. |
1576 | print_kind = PRINT_RAW; |
1577 | } |
1578 | mp_obj_print_helper(&arg_print, arg, print_kind); |
1579 | uint vlen = arg_vstr.len; |
1580 | if (prec < 0) { |
1581 | prec = vlen; |
1582 | } |
1583 | if (vlen > (uint)prec) { |
1584 | vlen = prec; |
1585 | } |
1586 | mp_print_strn(&print, arg_vstr.buf, vlen, flags, ' ', width); |
1587 | vstr_clear(&arg_vstr); |
1588 | break; |
1589 | } |
1590 | |
1591 | case 'X': |
1592 | case 'x': |
1593 | mp_print_mp_int(&print, arg, 16, *str - ('X' - 'A'), flags | alt, fill, width, prec); |
1594 | break; |
1595 | |
1596 | default: |
1597 | #if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE |
1598 | terse_str_format_value_error(); |
1599 | #else |
1600 | mp_raise_msg_varg(&mp_type_ValueError, |
1601 | MP_ERROR_TEXT("unsupported format character '%c' (0x%x) at index %d" ), |
1602 | *str, *str, str - start_str); |
1603 | #endif |
1604 | } |
1605 | } |
1606 | |
1607 | if (arg_i != n_args) { |
1608 | mp_raise_TypeError(MP_ERROR_TEXT("format string didn't convert all arguments" )); |
1609 | } |
1610 | |
1611 | return mp_obj_new_str_from_vstr(is_bytes ? &mp_type_bytes : &mp_type_str, &vstr); |
1612 | } |
1613 | #endif |
1614 | |
1615 | // The implementation is optimized, returning the original string if there's |
1616 | // nothing to replace. |
1617 | STATIC mp_obj_t str_replace(size_t n_args, const mp_obj_t *args) { |
1618 | mp_check_self(mp_obj_is_str_or_bytes(args[0])); |
1619 | |
1620 | mp_int_t max_rep = -1; |
1621 | if (n_args == 4) { |
1622 | max_rep = mp_obj_get_int(args[3]); |
1623 | if (max_rep == 0) { |
1624 | return args[0]; |
1625 | } else if (max_rep < 0) { |
1626 | max_rep = -1; |
1627 | } |
1628 | } |
1629 | |
1630 | // if max_rep is still -1 by this point we will need to do all possible replacements |
1631 | |
1632 | // check argument types |
1633 | |
1634 | const mp_obj_type_t *self_type = mp_obj_get_type(args[0]); |
1635 | |
1636 | if (mp_obj_get_type(args[1]) != self_type) { |
1637 | bad_implicit_conversion(args[1]); |
1638 | } |
1639 | |
1640 | if (mp_obj_get_type(args[2]) != self_type) { |
1641 | bad_implicit_conversion(args[2]); |
1642 | } |
1643 | |
1644 | // extract string data |
1645 | |
1646 | GET_STR_DATA_LEN(args[0], str, str_len); |
1647 | GET_STR_DATA_LEN(args[1], old, old_len); |
1648 | GET_STR_DATA_LEN(args[2], new, new_len); |
1649 | |
1650 | // old won't exist in str if it's longer, so nothing to replace |
1651 | if (old_len > str_len) { |
1652 | return args[0]; |
1653 | } |
1654 | |
1655 | // data for the replaced string |
1656 | byte *data = NULL; |
1657 | vstr_t vstr; |
1658 | |
1659 | // do 2 passes over the string: |
1660 | // first pass computes the required length of the replaced string |
1661 | // second pass does the replacements |
1662 | for (;;) { |
1663 | size_t replaced_str_index = 0; |
1664 | size_t num_replacements_done = 0; |
1665 | const byte *old_occurrence; |
1666 | const byte *offset_ptr = str; |
1667 | size_t str_len_remain = str_len; |
1668 | if (old_len == 0) { |
1669 | // if old_str is empty, copy new_str to start of replaced string |
1670 | // copy the replacement string |
1671 | if (data != NULL) { |
1672 | memcpy(data, new, new_len); |
1673 | } |
1674 | replaced_str_index += new_len; |
1675 | num_replacements_done++; |
1676 | } |
1677 | while (num_replacements_done != (size_t)max_rep && str_len_remain > 0 && (old_occurrence = find_subbytes(offset_ptr, str_len_remain, old, old_len, 1)) != NULL) { |
1678 | if (old_len == 0) { |
1679 | old_occurrence += 1; |
1680 | } |
1681 | // copy from just after end of last occurrence of to-be-replaced string to right before start of next occurrence |
1682 | if (data != NULL) { |
1683 | memcpy(data + replaced_str_index, offset_ptr, old_occurrence - offset_ptr); |
1684 | } |
1685 | replaced_str_index += old_occurrence - offset_ptr; |
1686 | // copy the replacement string |
1687 | if (data != NULL) { |
1688 | memcpy(data + replaced_str_index, new, new_len); |
1689 | } |
1690 | replaced_str_index += new_len; |
1691 | offset_ptr = old_occurrence + old_len; |
1692 | str_len_remain = str + str_len - offset_ptr; |
1693 | num_replacements_done++; |
1694 | } |
1695 | |
1696 | // copy from just after end of last occurrence of to-be-replaced string to end of old string |
1697 | if (data != NULL) { |
1698 | memcpy(data + replaced_str_index, offset_ptr, str_len_remain); |
1699 | } |
1700 | replaced_str_index += str_len_remain; |
1701 | |
1702 | if (data == NULL) { |
1703 | // first pass |
1704 | if (num_replacements_done == 0) { |
1705 | // no substr found, return original string |
1706 | return args[0]; |
1707 | } else { |
1708 | // substr found, allocate new string |
1709 | vstr_init_len(&vstr, replaced_str_index); |
1710 | data = (byte *)vstr.buf; |
1711 | assert(data != NULL); |
1712 | } |
1713 | } else { |
1714 | // second pass, we are done |
1715 | break; |
1716 | } |
1717 | } |
1718 | |
1719 | return mp_obj_new_str_from_vstr(self_type, &vstr); |
1720 | } |
1721 | MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_replace_obj, 3, 4, str_replace); |
1722 | |
1723 | #if MICROPY_PY_BUILTINS_STR_COUNT |
1724 | STATIC mp_obj_t str_count(size_t n_args, const mp_obj_t *args) { |
1725 | const mp_obj_type_t *self_type = mp_obj_get_type(args[0]); |
1726 | mp_check_self(mp_obj_is_str_or_bytes(args[0])); |
1727 | |
1728 | // check argument type |
1729 | if (mp_obj_get_type(args[1]) != self_type) { |
1730 | bad_implicit_conversion(args[1]); |
1731 | } |
1732 | |
1733 | GET_STR_DATA_LEN(args[0], haystack, haystack_len); |
1734 | GET_STR_DATA_LEN(args[1], needle, needle_len); |
1735 | |
1736 | const byte *start = haystack; |
1737 | const byte *end = haystack + haystack_len; |
1738 | if (n_args >= 3 && args[2] != mp_const_none) { |
1739 | start = str_index_to_ptr(self_type, haystack, haystack_len, args[2], true); |
1740 | } |
1741 | if (n_args >= 4 && args[3] != mp_const_none) { |
1742 | end = str_index_to_ptr(self_type, haystack, haystack_len, args[3], true); |
1743 | } |
1744 | |
1745 | // if needle_len is zero then we count each gap between characters as an occurrence |
1746 | if (needle_len == 0) { |
1747 | return MP_OBJ_NEW_SMALL_INT(utf8_charlen(start, end - start) + 1); |
1748 | } |
1749 | |
1750 | // count the occurrences |
1751 | mp_int_t num_occurrences = 0; |
1752 | for (const byte *haystack_ptr = start; haystack_ptr + needle_len <= end;) { |
1753 | if (memcmp(haystack_ptr, needle, needle_len) == 0) { |
1754 | num_occurrences++; |
1755 | haystack_ptr += needle_len; |
1756 | } else { |
1757 | haystack_ptr = utf8_next_char(haystack_ptr); |
1758 | } |
1759 | } |
1760 | |
1761 | return MP_OBJ_NEW_SMALL_INT(num_occurrences); |
1762 | } |
1763 | MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_count_obj, 2, 4, str_count); |
1764 | #endif |
1765 | |
1766 | #if MICROPY_PY_BUILTINS_STR_PARTITION |
1767 | STATIC mp_obj_t str_partitioner(mp_obj_t self_in, mp_obj_t arg, int direction) { |
1768 | mp_check_self(mp_obj_is_str_or_bytes(self_in)); |
1769 | const mp_obj_type_t *self_type = mp_obj_get_type(self_in); |
1770 | if (self_type != mp_obj_get_type(arg)) { |
1771 | bad_implicit_conversion(arg); |
1772 | } |
1773 | |
1774 | GET_STR_DATA_LEN(self_in, str, str_len); |
1775 | GET_STR_DATA_LEN(arg, sep, sep_len); |
1776 | |
1777 | if (sep_len == 0) { |
1778 | mp_raise_ValueError(MP_ERROR_TEXT("empty separator" )); |
1779 | } |
1780 | |
1781 | mp_obj_t result[3]; |
1782 | if (self_type == &mp_type_str) { |
1783 | result[0] = MP_OBJ_NEW_QSTR(MP_QSTR_); |
1784 | result[1] = MP_OBJ_NEW_QSTR(MP_QSTR_); |
1785 | result[2] = MP_OBJ_NEW_QSTR(MP_QSTR_); |
1786 | } else { |
1787 | result[0] = mp_const_empty_bytes; |
1788 | result[1] = mp_const_empty_bytes; |
1789 | result[2] = mp_const_empty_bytes; |
1790 | } |
1791 | |
1792 | if (direction > 0) { |
1793 | result[0] = self_in; |
1794 | } else { |
1795 | result[2] = self_in; |
1796 | } |
1797 | |
1798 | const byte *position_ptr = find_subbytes(str, str_len, sep, sep_len, direction); |
1799 | if (position_ptr != NULL) { |
1800 | size_t position = position_ptr - str; |
1801 | result[0] = mp_obj_new_str_of_type(self_type, str, position); |
1802 | result[1] = arg; |
1803 | result[2] = mp_obj_new_str_of_type(self_type, str + position + sep_len, str_len - position - sep_len); |
1804 | } |
1805 | |
1806 | return mp_obj_new_tuple(3, result); |
1807 | } |
1808 | |
1809 | STATIC mp_obj_t str_partition(mp_obj_t self_in, mp_obj_t arg) { |
1810 | return str_partitioner(self_in, arg, 1); |
1811 | } |
1812 | MP_DEFINE_CONST_FUN_OBJ_2(str_partition_obj, str_partition); |
1813 | |
1814 | STATIC mp_obj_t str_rpartition(mp_obj_t self_in, mp_obj_t arg) { |
1815 | return str_partitioner(self_in, arg, -1); |
1816 | } |
1817 | MP_DEFINE_CONST_FUN_OBJ_2(str_rpartition_obj, str_rpartition); |
1818 | #endif |
1819 | |
1820 | // Supposedly not too critical operations, so optimize for code size |
1821 | STATIC mp_obj_t str_caseconv(unichar (*op)(unichar), mp_obj_t self_in) { |
1822 | GET_STR_DATA_LEN(self_in, self_data, self_len); |
1823 | vstr_t vstr; |
1824 | vstr_init_len(&vstr, self_len); |
1825 | byte *data = (byte *)vstr.buf; |
1826 | for (size_t i = 0; i < self_len; i++) { |
1827 | *data++ = op(*self_data++); |
1828 | } |
1829 | return mp_obj_new_str_from_vstr(mp_obj_get_type(self_in), &vstr); |
1830 | } |
1831 | |
1832 | STATIC mp_obj_t str_lower(mp_obj_t self_in) { |
1833 | return str_caseconv(unichar_tolower, self_in); |
1834 | } |
1835 | MP_DEFINE_CONST_FUN_OBJ_1(str_lower_obj, str_lower); |
1836 | |
1837 | STATIC mp_obj_t str_upper(mp_obj_t self_in) { |
1838 | return str_caseconv(unichar_toupper, self_in); |
1839 | } |
1840 | MP_DEFINE_CONST_FUN_OBJ_1(str_upper_obj, str_upper); |
1841 | |
1842 | STATIC mp_obj_t str_uni_istype(bool (*f)(unichar), mp_obj_t self_in) { |
1843 | GET_STR_DATA_LEN(self_in, self_data, self_len); |
1844 | |
1845 | if (self_len == 0) { |
1846 | return mp_const_false; // default to False for empty str |
1847 | } |
1848 | |
1849 | if (f != unichar_isupper && f != unichar_islower) { |
1850 | for (size_t i = 0; i < self_len; i++) { |
1851 | if (!f(*self_data++)) { |
1852 | return mp_const_false; |
1853 | } |
1854 | } |
1855 | } else { |
1856 | bool contains_alpha = false; |
1857 | |
1858 | for (size_t i = 0; i < self_len; i++) { // only check alphanumeric characters |
1859 | if (unichar_isalpha(*self_data++)) { |
1860 | contains_alpha = true; |
1861 | if (!f(*(self_data - 1))) { // -1 because we already incremented above |
1862 | return mp_const_false; |
1863 | } |
1864 | } |
1865 | } |
1866 | |
1867 | if (!contains_alpha) { |
1868 | return mp_const_false; |
1869 | } |
1870 | } |
1871 | |
1872 | return mp_const_true; |
1873 | } |
1874 | |
// Method exposed as `isspace` in str8_locals_dict_table: returns the result
// of str_uni_istype with unichar_isspace as the character test.
STATIC mp_obj_t str_isspace(mp_obj_t self_in) {
    return str_uni_istype(unichar_isspace, self_in);
}
MP_DEFINE_CONST_FUN_OBJ_1(str_isspace_obj, str_isspace);
1879 | |
// Method exposed as `isalpha` in str8_locals_dict_table: returns the result
// of str_uni_istype with unichar_isalpha as the character test.
STATIC mp_obj_t str_isalpha(mp_obj_t self_in) {
    return str_uni_istype(unichar_isalpha, self_in);
}
MP_DEFINE_CONST_FUN_OBJ_1(str_isalpha_obj, str_isalpha);
1884 | |
// Method exposed as `isdigit` in str8_locals_dict_table: returns the result
// of str_uni_istype with unichar_isdigit as the character test.
STATIC mp_obj_t str_isdigit(mp_obj_t self_in) {
    return str_uni_istype(unichar_isdigit, self_in);
}
MP_DEFINE_CONST_FUN_OBJ_1(str_isdigit_obj, str_isdigit);
1889 | |
// Method exposed as `isupper` in str8_locals_dict_table: returns the result
// of str_uni_istype with unichar_isupper as the character test. That helper
// special-cases unichar_isupper so that only alphabetic bytes are examined.
STATIC mp_obj_t str_isupper(mp_obj_t self_in) {
    return str_uni_istype(unichar_isupper, self_in);
}
MP_DEFINE_CONST_FUN_OBJ_1(str_isupper_obj, str_isupper);
1894 | |
// Method exposed as `islower` in str8_locals_dict_table: returns the result
// of str_uni_istype with unichar_islower as the character test. That helper
// special-cases unichar_islower so that only alphabetic bytes are examined.
STATIC mp_obj_t str_islower(mp_obj_t self_in) {
    return str_uni_istype(unichar_islower, self_in);
}
MP_DEFINE_CONST_FUN_OBJ_1(str_islower_obj, str_islower);
1899 | |
1900 | #if MICROPY_CPYTHON_COMPAT |
1901 | // These methods are superfluous in the presence of str() and bytes() |
1902 | // constructors. |
1903 | // TODO: should accept kwargs too |
1904 | STATIC mp_obj_t bytes_decode(size_t n_args, const mp_obj_t *args) { |
1905 | mp_obj_t new_args[2]; |
1906 | if (n_args == 1) { |
1907 | new_args[0] = args[0]; |
1908 | new_args[1] = MP_OBJ_NEW_QSTR(MP_QSTR_utf_hyphen_8); |
1909 | args = new_args; |
1910 | n_args++; |
1911 | } |
1912 | return mp_obj_str_make_new(&mp_type_str, n_args, 0, args); |
1913 | } |
1914 | MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(bytes_decode_obj, 1, 3, bytes_decode); |
1915 | |
1916 | // TODO: should accept kwargs too |
1917 | STATIC mp_obj_t str_encode(size_t n_args, const mp_obj_t *args) { |
1918 | mp_obj_t new_args[2]; |
1919 | if (n_args == 1) { |
1920 | new_args[0] = args[0]; |
1921 | new_args[1] = MP_OBJ_NEW_QSTR(MP_QSTR_utf_hyphen_8); |
1922 | args = new_args; |
1923 | n_args++; |
1924 | } |
1925 | return bytes_make_new(NULL, n_args, 0, args); |
1926 | } |
1927 | MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_encode_obj, 1, 3, str_encode); |
1928 | #endif |
1929 | |
1930 | mp_int_t mp_obj_str_get_buffer(mp_obj_t self_in, mp_buffer_info_t *bufinfo, mp_uint_t flags) { |
1931 | if (flags == MP_BUFFER_READ) { |
1932 | GET_STR_DATA_LEN(self_in, str_data, str_len); |
1933 | bufinfo->buf = (void *)str_data; |
1934 | bufinfo->len = str_len; |
1935 | bufinfo->typecode = 'B'; // bytes should be unsigned, so should unicode byte-access |
1936 | return 0; |
1937 | } else { |
1938 | // can't write to a string |
1939 | return 1; |
1940 | } |
1941 | } |
1942 | |
// Method table for the 8-bit string types. Each entry maps a method name
// (qstr) to the function object implementing it. The table is turned into the
// constant dict str8_locals_dict below, which both mp_type_bytes and (when
// MICROPY_PY_BUILTINS_STR_UNICODE is disabled) mp_type_str install as their
// locals_dict. Several entries are compiled in only when the corresponding
// MICROPY_* feature option is enabled.
STATIC const mp_rom_map_elem_t str8_locals_dict_table[] = {
    #if MICROPY_CPYTHON_COMPAT
    { MP_ROM_QSTR(MP_QSTR_decode), MP_ROM_PTR(&bytes_decode_obj) },
    #if !MICROPY_PY_BUILTINS_STR_UNICODE
    // If we have separate unicode type, then here we have methods only
    // for bytes type, and it should not have encode() methods. Otherwise,
    // we have non-compliant-but-practical bytestring type, which shares
    // method table with bytes, so they both have encode() and decode()
    // methods (which should do type checking at runtime).
    { MP_ROM_QSTR(MP_QSTR_encode), MP_ROM_PTR(&str_encode_obj) },
    #endif
    #endif
    // Searching, splitting, joining and stripping: always available.
    { MP_ROM_QSTR(MP_QSTR_find), MP_ROM_PTR(&str_find_obj) },
    { MP_ROM_QSTR(MP_QSTR_rfind), MP_ROM_PTR(&str_rfind_obj) },
    { MP_ROM_QSTR(MP_QSTR_index), MP_ROM_PTR(&str_index_obj) },
    { MP_ROM_QSTR(MP_QSTR_rindex), MP_ROM_PTR(&str_rindex_obj) },
    { MP_ROM_QSTR(MP_QSTR_join), MP_ROM_PTR(&str_join_obj) },
    { MP_ROM_QSTR(MP_QSTR_split), MP_ROM_PTR(&str_split_obj) },
    #if MICROPY_PY_BUILTINS_STR_SPLITLINES
    { MP_ROM_QSTR(MP_QSTR_splitlines), MP_ROM_PTR(&str_splitlines_obj) },
    #endif
    { MP_ROM_QSTR(MP_QSTR_rsplit), MP_ROM_PTR(&str_rsplit_obj) },
    { MP_ROM_QSTR(MP_QSTR_startswith), MP_ROM_PTR(&str_startswith_obj) },
    { MP_ROM_QSTR(MP_QSTR_endswith), MP_ROM_PTR(&str_endswith_obj) },
    { MP_ROM_QSTR(MP_QSTR_strip), MP_ROM_PTR(&str_strip_obj) },
    { MP_ROM_QSTR(MP_QSTR_lstrip), MP_ROM_PTR(&str_lstrip_obj) },
    { MP_ROM_QSTR(MP_QSTR_rstrip), MP_ROM_PTR(&str_rstrip_obj) },
    { MP_ROM_QSTR(MP_QSTR_format), MP_ROM_PTR(&str_format_obj) },
    { MP_ROM_QSTR(MP_QSTR_replace), MP_ROM_PTR(&str_replace_obj) },
    #if MICROPY_PY_BUILTINS_STR_COUNT
    { MP_ROM_QSTR(MP_QSTR_count), MP_ROM_PTR(&str_count_obj) },
    #endif
    #if MICROPY_PY_BUILTINS_STR_PARTITION
    { MP_ROM_QSTR(MP_QSTR_partition), MP_ROM_PTR(&str_partition_obj) },
    { MP_ROM_QSTR(MP_QSTR_rpartition), MP_ROM_PTR(&str_rpartition_obj) },
    #endif
    #if MICROPY_PY_BUILTINS_STR_CENTER
    { MP_ROM_QSTR(MP_QSTR_center), MP_ROM_PTR(&str_center_obj) },
    #endif
    // Case conversion and character class predicates: always available.
    { MP_ROM_QSTR(MP_QSTR_lower), MP_ROM_PTR(&str_lower_obj) },
    { MP_ROM_QSTR(MP_QSTR_upper), MP_ROM_PTR(&str_upper_obj) },
    { MP_ROM_QSTR(MP_QSTR_isspace), MP_ROM_PTR(&str_isspace_obj) },
    { MP_ROM_QSTR(MP_QSTR_isalpha), MP_ROM_PTR(&str_isalpha_obj) },
    { MP_ROM_QSTR(MP_QSTR_isdigit), MP_ROM_PTR(&str_isdigit_obj) },
    { MP_ROM_QSTR(MP_QSTR_isupper), MP_ROM_PTR(&str_isupper_obj) },
    { MP_ROM_QSTR(MP_QSTR_islower), MP_ROM_PTR(&str_islower_obj) },
};

// Constant dict object wrapping the table above.
STATIC MP_DEFINE_CONST_DICT(str8_locals_dict, str8_locals_dict_table);
1992 | |
// The 8-bit `str` type object. It is only defined here when
// MICROPY_PY_BUILTINS_STR_UNICODE is disabled (presumably a unicode-aware
// definition is provided elsewhere otherwise -- confirm).
#if !MICROPY_PY_BUILTINS_STR_UNICODE
// Forward declaration: the iterator constructor is defined further down.
STATIC mp_obj_t mp_obj_new_str_iterator(mp_obj_t str, mp_obj_iter_buf_t *iter_buf);

const mp_obj_type_t mp_type_str = {
    { &mp_type_type },
    .name = MP_QSTR_str,
    .print = str_print,
    .make_new = mp_obj_str_make_new,
    .binary_op = mp_obj_str_binary_op,
    // Subscripting is shared with bytes.
    .subscr = bytes_subscr,
    // Iteration yields one-character strings (see str_it_iternext).
    .getiter = mp_obj_new_str_iterator,
    .buffer_p = { .get_buffer = mp_obj_str_get_buffer },
    // Method table shared with bytes.
    .locals_dict = (mp_obj_dict_t *)&str8_locals_dict,
};
#endif
2008 | |
// The `bytes` type object.
// Reuses most of methods from str: printing, binary operators, subscripting,
// the buffer protocol handler and the method table are the same functions
// referenced by the 8-bit mp_type_str; only the constructor and the iterator
// (which yields small ints, see bytes_it_iternext) are specific to bytes.
const mp_obj_type_t mp_type_bytes = {
    { &mp_type_type },
    .name = MP_QSTR_bytes,
    .print = str_print,
    .make_new = bytes_make_new,
    .binary_op = mp_obj_str_binary_op,
    .subscr = bytes_subscr,
    .getiter = mp_obj_new_bytes_iterator,
    .buffer_p = { .get_buffer = mp_obj_str_get_buffer },
    .locals_dict = (mp_obj_dict_t *)&str8_locals_dict,
};
2021 | |
// The zero-length bytes object, with data that includes a null-terminating byte
// (the data pointer refers to the string literal ""). The two zero
// initialisers are presumably the hash and length fields -- confirm against
// the mp_obj_str_t declaration.
const mp_obj_str_t mp_const_empty_bytes_obj = {{&mp_type_bytes}, 0, 0, (const byte *)"" };
2024 | |
2025 | // Create a str/bytes object using the given data. New memory is allocated and |
2026 | // the data is copied across. This function should only be used if the type is bytes, |
2027 | // or if the type is str and the string data is known to be not interned. |
2028 | mp_obj_t mp_obj_new_str_copy(const mp_obj_type_t *type, const byte *data, size_t len) { |
2029 | mp_obj_str_t *o = m_new_obj(mp_obj_str_t); |
2030 | o->base.type = type; |
2031 | o->len = len; |
2032 | if (data) { |
2033 | o->hash = qstr_compute_hash(data, len); |
2034 | byte *p = m_new(byte, len + 1); |
2035 | o->data = p; |
2036 | memcpy(p, data, len * sizeof(byte)); |
2037 | p[len] = '\0'; // for now we add null for compatibility with C ASCIIZ strings |
2038 | } |
2039 | return MP_OBJ_FROM_PTR(o); |
2040 | } |
2041 | |
2042 | // Create a str/bytes object using the given data. If the type is str and the string |
2043 | // data is already interned, then a qstr object is returned. Otherwise new memory is |
2044 | // allocated for the object and the data is copied across. |
2045 | mp_obj_t mp_obj_new_str_of_type(const mp_obj_type_t *type, const byte *data, size_t len) { |
2046 | if (type == &mp_type_str) { |
2047 | return mp_obj_new_str((const char *)data, len); |
2048 | } else { |
2049 | return mp_obj_new_bytes(data, len); |
2050 | } |
2051 | } |
2052 | |
// Create a str using a qstr to store the data; may use existing or new qstr.
// The len bytes at data are passed to qstr_from_strn and the resulting qstr
// is returned wrapped as an object.
mp_obj_t mp_obj_new_str_via_qstr(const char *data, size_t len) {
    return MP_OBJ_NEW_QSTR(qstr_from_strn(data, len));
}
2057 | |
2058 | // Create a str/bytes object from the given vstr. The vstr buffer is resized to |
2059 | // the exact length required and then reused for the str/bytes object. The vstr |
2060 | // is cleared and can safely be passed to vstr_free if it was heap allocated. |
2061 | mp_obj_t mp_obj_new_str_from_vstr(const mp_obj_type_t *type, vstr_t *vstr) { |
2062 | // if not a bytes object, look if a qstr with this data already exists |
2063 | if (type == &mp_type_str) { |
2064 | qstr q = qstr_find_strn(vstr->buf, vstr->len); |
2065 | if (q != MP_QSTRnull) { |
2066 | vstr_clear(vstr); |
2067 | vstr->alloc = 0; |
2068 | return MP_OBJ_NEW_QSTR(q); |
2069 | } |
2070 | } |
2071 | |
2072 | // make a new str/bytes object |
2073 | mp_obj_str_t *o = m_new_obj(mp_obj_str_t); |
2074 | o->base.type = type; |
2075 | o->len = vstr->len; |
2076 | o->hash = qstr_compute_hash((byte *)vstr->buf, vstr->len); |
2077 | if (vstr->len + 1 == vstr->alloc) { |
2078 | o->data = (byte *)vstr->buf; |
2079 | } else { |
2080 | o->data = (byte *)m_renew(char, vstr->buf, vstr->alloc, vstr->len + 1); |
2081 | } |
2082 | ((byte *)o->data)[o->len] = '\0'; // add null byte |
2083 | vstr->buf = NULL; |
2084 | vstr->alloc = 0; |
2085 | return MP_OBJ_FROM_PTR(o); |
2086 | } |
2087 | |
2088 | mp_obj_t mp_obj_new_str(const char *data, size_t len) { |
2089 | qstr q = qstr_find_strn(data, len); |
2090 | if (q != MP_QSTRnull) { |
2091 | // qstr with this data already exists |
2092 | return MP_OBJ_NEW_QSTR(q); |
2093 | } else { |
2094 | // no existing qstr, don't make one |
2095 | return mp_obj_new_str_copy(&mp_type_str, (const byte *)data, len); |
2096 | } |
2097 | } |
2098 | |
// Return a qstr-backed str object with the same contents as `str`.
// The data and length are fetched with GET_STR_DATA_LEN, which performs no
// type checking visible here; see mp_obj_str_intern_checked for the variant
// that goes through mp_obj_str_get_data.
mp_obj_t mp_obj_str_intern(mp_obj_t str) {
    GET_STR_DATA_LEN(str, data, len);
    return mp_obj_new_str_via_qstr((const char *)data, len);
}
2103 | |
2104 | mp_obj_t mp_obj_str_intern_checked(mp_obj_t obj) { |
2105 | size_t len; |
2106 | const char *data = mp_obj_str_get_data(obj, &len); |
2107 | return mp_obj_new_str_via_qstr((const char *)data, len); |
2108 | } |
2109 | |
// Create a bytes object: delegates to mp_obj_new_str_copy with mp_type_bytes,
// which stores a null-terminated copy of the len bytes at data.
mp_obj_t mp_obj_new_bytes(const byte *data, size_t len) {
    return mp_obj_new_str_copy(&mp_type_bytes, data, len);
}
2113 | |
2114 | bool mp_obj_str_equal(mp_obj_t s1, mp_obj_t s2) { |
2115 | if (mp_obj_is_qstr(s1) && mp_obj_is_qstr(s2)) { |
2116 | return s1 == s2; |
2117 | } else { |
2118 | GET_STR_HASH(s1, h1); |
2119 | GET_STR_HASH(s2, h2); |
2120 | // If any of hashes is 0, it means it's not valid |
2121 | if (h1 != 0 && h2 != 0 && h1 != h2) { |
2122 | return false; |
2123 | } |
2124 | GET_STR_DATA_LEN(s1, d1, l1); |
2125 | GET_STR_DATA_LEN(s2, d2, l2); |
2126 | if (l1 != l2) { |
2127 | return false; |
2128 | } |
2129 | return memcmp(d1, d2, l1) == 0; |
2130 | } |
2131 | } |
2132 | |
2133 | STATIC NORETURN void bad_implicit_conversion(mp_obj_t self_in) { |
2134 | #if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE |
2135 | mp_raise_TypeError(MP_ERROR_TEXT("can't convert to str implicitly" )); |
2136 | #else |
2137 | const qstr src_name = mp_obj_get_type(self_in)->name; |
2138 | mp_raise_msg_varg(&mp_type_TypeError, |
2139 | MP_ERROR_TEXT("can't convert '%q' object to %q implicitly" ), |
2140 | src_name, src_name == MP_QSTR_str ? MP_QSTR_bytes : MP_QSTR_str); |
2141 | #endif |
2142 | } |
2143 | |
2144 | // use this if you will anyway convert the string to a qstr |
2145 | // will be more efficient for the case where it's already a qstr |
2146 | qstr mp_obj_str_get_qstr(mp_obj_t self_in) { |
2147 | if (mp_obj_is_qstr(self_in)) { |
2148 | return MP_OBJ_QSTR_VALUE(self_in); |
2149 | } else if (mp_obj_is_type(self_in, &mp_type_str)) { |
2150 | mp_obj_str_t *self = MP_OBJ_TO_PTR(self_in); |
2151 | return qstr_from_strn((char *)self->data, self->len); |
2152 | } else { |
2153 | bad_implicit_conversion(self_in); |
2154 | } |
2155 | } |
2156 | |
2157 | // only use this function if you need the str data to be zero terminated |
2158 | // at the moment all strings are zero terminated to help with C ASCIIZ compatibility |
2159 | const char *mp_obj_str_get_str(mp_obj_t self_in) { |
2160 | if (mp_obj_is_str_or_bytes(self_in)) { |
2161 | GET_STR_DATA_LEN(self_in, s, l); |
2162 | (void)l; // len unused |
2163 | return (const char *)s; |
2164 | } else { |
2165 | bad_implicit_conversion(self_in); |
2166 | } |
2167 | } |
2168 | |
2169 | const char *mp_obj_str_get_data(mp_obj_t self_in, size_t *len) { |
2170 | if (mp_obj_is_str_or_bytes(self_in)) { |
2171 | GET_STR_DATA_LEN(self_in, s, l); |
2172 | *len = l; |
2173 | return (const char *)s; |
2174 | } else { |
2175 | bad_implicit_conversion(self_in); |
2176 | } |
2177 | } |
2178 | |
2179 | #if MICROPY_OBJ_REPR == MICROPY_OBJ_REPR_C || MICROPY_OBJ_REPR == MICROPY_OBJ_REPR_D |
2180 | const byte *mp_obj_str_get_data_no_check(mp_obj_t self_in, size_t *len) { |
2181 | if (mp_obj_is_qstr(self_in)) { |
2182 | return qstr_data(MP_OBJ_QSTR_VALUE(self_in), len); |
2183 | } else { |
2184 | *len = ((mp_obj_str_t *)MP_OBJ_TO_PTR(self_in))->len; |
2185 | return ((mp_obj_str_t *)MP_OBJ_TO_PTR(self_in))->data; |
2186 | } |
2187 | } |
2188 | #endif |
2189 | |
/******************************************************************************/
/* str iterator                                                               */

// Iterator state shared by the 8-bit str iterator and the bytes iterator.
// Instances are placed in the caller-provided mp_obj_iter_buf_t by
// mp_obj_new_str_iterator / mp_obj_new_bytes_iterator.
typedef struct _mp_obj_str8_it_t {
    mp_obj_base_t base;  // type is set to mp_type_polymorph_iter
    mp_fun_1_t iternext; // str_it_iternext or bytes_it_iternext
    mp_obj_t str;        // the object being iterated over
    size_t cur;          // byte index of the next element to yield
} mp_obj_str8_it_t;
2199 | |
2200 | #if !MICROPY_PY_BUILTINS_STR_UNICODE |
2201 | STATIC mp_obj_t str_it_iternext(mp_obj_t self_in) { |
2202 | mp_obj_str8_it_t *self = MP_OBJ_TO_PTR(self_in); |
2203 | GET_STR_DATA_LEN(self->str, str, len); |
2204 | if (self->cur < len) { |
2205 | mp_obj_t o_out = mp_obj_new_str_via_qstr((const char *)str + self->cur, 1); |
2206 | self->cur += 1; |
2207 | return o_out; |
2208 | } else { |
2209 | return MP_OBJ_STOP_ITERATION; |
2210 | } |
2211 | } |
2212 | |
2213 | STATIC mp_obj_t mp_obj_new_str_iterator(mp_obj_t str, mp_obj_iter_buf_t *iter_buf) { |
2214 | assert(sizeof(mp_obj_str8_it_t) <= sizeof(mp_obj_iter_buf_t)); |
2215 | mp_obj_str8_it_t *o = (mp_obj_str8_it_t *)iter_buf; |
2216 | o->base.type = &mp_type_polymorph_iter; |
2217 | o->iternext = str_it_iternext; |
2218 | o->str = str; |
2219 | o->cur = 0; |
2220 | return MP_OBJ_FROM_PTR(o); |
2221 | } |
2222 | #endif |
2223 | |
2224 | STATIC mp_obj_t bytes_it_iternext(mp_obj_t self_in) { |
2225 | mp_obj_str8_it_t *self = MP_OBJ_TO_PTR(self_in); |
2226 | GET_STR_DATA_LEN(self->str, str, len); |
2227 | if (self->cur < len) { |
2228 | mp_obj_t o_out = MP_OBJ_NEW_SMALL_INT(str[self->cur]); |
2229 | self->cur += 1; |
2230 | return o_out; |
2231 | } else { |
2232 | return MP_OBJ_STOP_ITERATION; |
2233 | } |
2234 | } |
2235 | |
2236 | mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str, mp_obj_iter_buf_t *iter_buf) { |
2237 | assert(sizeof(mp_obj_str8_it_t) <= sizeof(mp_obj_iter_buf_t)); |
2238 | mp_obj_str8_it_t *o = (mp_obj_str8_it_t *)iter_buf; |
2239 | o->base.type = &mp_type_polymorph_iter; |
2240 | o->iternext = bytes_it_iternext; |
2241 | o->str = str; |
2242 | o->cur = 0; |
2243 | return MP_OBJ_FROM_PTR(o); |
2244 | } |
2245 | |