1/*
2 * This file is part of the MicroPython project, http://micropython.org/
3 *
4 * The MIT License (MIT)
5 *
6 * Copyright (c) 2013, 2014 Damien P. George
7 * Copyright (c) 2014-2018 Paul Sokolovsky
8 *
9 * Permission is hereby granted, free of charge, to any person obtaining a copy
10 * of this software and associated documentation files (the "Software"), to deal
11 * in the Software without restriction, including without limitation the rights
12 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 * copies of the Software, and to permit persons to whom the Software is
14 * furnished to do so, subject to the following conditions:
15 *
16 * The above copyright notice and this permission notice shall be included in
17 * all copies or substantial portions of the Software.
18 *
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25 * THE SOFTWARE.
26 */
27
#include <string.h>
#include <assert.h>
#include <limits.h>
30
31#include "py/unicode.h"
32#include "py/objstr.h"
33#include "py/objlist.h"
34#include "py/runtime.h"
35#include "py/stackctrl.h"
36
#if MICROPY_PY_BUILTINS_STR_OP_MODULO
// Forward declaration of the '%' formatting helper; it is called from
// mp_obj_str_binary_op() below and is defined elsewhere.
STATIC mp_obj_t str_modulo_format(mp_obj_t pattern, size_t n_args, const mp_obj_t *args, mp_obj_t dict);
#endif

// Forward declarations of helpers that are referenced before their definitions.
STATIC mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str, mp_obj_iter_buf_t *iter_buf);
// Declared NORETURN: the functions below call it to reject an argument whose
// type does not match the receiver.
STATIC NORETURN void bad_implicit_conversion(mp_obj_t self_in);
43
44/******************************************************************************/
45/* str */
46
47void mp_str_print_quoted(const mp_print_t *print, const byte *str_data, size_t str_len, bool is_bytes) {
48 // this escapes characters, but it will be very slow to print (calling print many times)
49 bool has_single_quote = false;
50 bool has_double_quote = false;
51 for (const byte *s = str_data, *top = str_data + str_len; !has_double_quote && s < top; s++) {
52 if (*s == '\'') {
53 has_single_quote = true;
54 } else if (*s == '"') {
55 has_double_quote = true;
56 }
57 }
58 int quote_char = '\'';
59 if (has_single_quote && !has_double_quote) {
60 quote_char = '"';
61 }
62 mp_printf(print, "%c", quote_char);
63 for (const byte *s = str_data, *top = str_data + str_len; s < top; s++) {
64 if (*s == quote_char) {
65 mp_printf(print, "\\%c", quote_char);
66 } else if (*s == '\\') {
67 mp_print_str(print, "\\\\");
68 } else if (*s >= 0x20 && *s != 0x7f && (!is_bytes || *s < 0x80)) {
69 // In strings, anything which is not ascii control character
70 // is printed as is, this includes characters in range 0x80-0xff
71 // (which can be non-Latin letters, etc.)
72 mp_printf(print, "%c", *s);
73 } else if (*s == '\n') {
74 mp_print_str(print, "\\n");
75 } else if (*s == '\r') {
76 mp_print_str(print, "\\r");
77 } else if (*s == '\t') {
78 mp_print_str(print, "\\t");
79 } else {
80 mp_printf(print, "\\x%02x", *s);
81 }
82 }
83 mp_printf(print, "%c", quote_char);
84}
85
#if MICROPY_PY_UJSON
// Print str_data[0..str_len) as a double-quoted JSON string.
// For the JSON spec, see http://www.ietf.org/rfc/rfc4627.txt
// Given a valid utf8-encoded string, the output conforms to JSON.
void mp_str_print_json(const mp_print_t *print, const byte *str_data, size_t str_len) {
    const byte *const end = str_data + str_len;

    mp_print_str(print, "\"");
    for (const byte *p = str_data; p < end; p++) {
        const byte ch = *p;
        switch (ch) {
            case '"':
            case '\\':
                // these two must be backslash-escaped
                mp_printf(print, "\\%c", ch);
                break;
            case '\n':
                mp_print_str(print, "\\n");
                break;
            case '\r':
                mp_print_str(print, "\\r");
                break;
            case '\t':
                mp_print_str(print, "\\t");
                break;
            default:
                if (ch >= 32) {
                    // normal characters and the bytes of utf-8 sequences pass through
                    mp_printf(print, "%c", ch);
                } else {
                    // remaining control characters become \u00NN
                    mp_printf(print, "\\u%04x", ch);
                }
                break;
        }
    }
    mp_print_str(print, "\"");
}
#endif
111
112STATIC void str_print(const mp_print_t *print, mp_obj_t self_in, mp_print_kind_t kind) {
113 GET_STR_DATA_LEN(self_in, str_data, str_len);
114 #if MICROPY_PY_UJSON
115 if (kind == PRINT_JSON) {
116 mp_str_print_json(print, str_data, str_len);
117 return;
118 }
119 #endif
120 #if !MICROPY_PY_BUILTINS_STR_UNICODE
121 bool is_bytes = mp_obj_is_type(self_in, &mp_type_bytes);
122 #else
123 bool is_bytes = true;
124 #endif
125 if (kind == PRINT_RAW || (!MICROPY_PY_BUILTINS_STR_UNICODE && kind == PRINT_STR && !is_bytes)) {
126 print->print_strn(print->data, (const char *)str_data, str_len);
127 } else {
128 if (is_bytes) {
129 print->print_strn(print->data, "b", 1);
130 }
131 mp_str_print_quoted(print, str_data, str_len, is_bytes);
132 }
133}
134
135mp_obj_t mp_obj_str_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_kw, const mp_obj_t *args) {
136 #if MICROPY_CPYTHON_COMPAT
137 if (n_kw != 0) {
138 mp_arg_error_unimpl_kw();
139 }
140 #endif
141
142 mp_arg_check_num(n_args, n_kw, 0, 3, false);
143
144 switch (n_args) {
145 case 0:
146 return MP_OBJ_NEW_QSTR(MP_QSTR_);
147
148 case 1: {
149 vstr_t vstr;
150 mp_print_t print;
151 vstr_init_print(&vstr, 16, &print);
152 mp_obj_print_helper(&print, args[0], PRINT_STR);
153 return mp_obj_new_str_from_vstr(type, &vstr);
154 }
155
156 default: // 2 or 3 args
157 // TODO: validate 2nd/3rd args
158 if (mp_obj_is_type(args[0], &mp_type_bytes)) {
159 GET_STR_DATA_LEN(args[0], str_data, str_len);
160 GET_STR_HASH(args[0], str_hash);
161 if (str_hash == 0) {
162 str_hash = qstr_compute_hash(str_data, str_len);
163 }
164 #if MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
165 if (!utf8_check(str_data, str_len)) {
166 mp_raise_msg(&mp_type_UnicodeError, NULL);
167 }
168 #endif
169
170 // Check if a qstr with this data already exists
171 qstr q = qstr_find_strn((const char *)str_data, str_len);
172 if (q != MP_QSTRnull) {
173 return MP_OBJ_NEW_QSTR(q);
174 }
175
176 mp_obj_str_t *o = MP_OBJ_TO_PTR(mp_obj_new_str_copy(type, NULL, str_len));
177 o->data = str_data;
178 o->hash = str_hash;
179 return MP_OBJ_FROM_PTR(o);
180 } else {
181 mp_buffer_info_t bufinfo;
182 mp_get_buffer_raise(args[0], &bufinfo, MP_BUFFER_READ);
183 #if MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
184 if (!utf8_check(bufinfo.buf, bufinfo.len)) {
185 mp_raise_msg(&mp_type_UnicodeError, NULL);
186 }
187 #endif
188 return mp_obj_new_str(bufinfo.buf, bufinfo.len);
189 }
190 }
191}
192
193STATIC mp_obj_t bytes_make_new(const mp_obj_type_t *type_in, size_t n_args, size_t n_kw, const mp_obj_t *args) {
194 (void)type_in;
195
196 #if MICROPY_CPYTHON_COMPAT
197 if (n_kw != 0) {
198 mp_arg_error_unimpl_kw();
199 }
200 #else
201 (void)n_kw;
202 #endif
203
204 if (n_args == 0) {
205 return mp_const_empty_bytes;
206 }
207
208 if (mp_obj_is_type(args[0], &mp_type_bytes)) {
209 return args[0];
210 }
211
212 if (mp_obj_is_str(args[0])) {
213 if (n_args < 2 || n_args > 3) {
214 goto wrong_args;
215 }
216 GET_STR_DATA_LEN(args[0], str_data, str_len);
217 GET_STR_HASH(args[0], str_hash);
218 if (str_hash == 0) {
219 str_hash = qstr_compute_hash(str_data, str_len);
220 }
221 mp_obj_str_t *o = MP_OBJ_TO_PTR(mp_obj_new_str_copy(&mp_type_bytes, NULL, str_len));
222 o->data = str_data;
223 o->hash = str_hash;
224 return MP_OBJ_FROM_PTR(o);
225 }
226
227 if (n_args > 1) {
228 goto wrong_args;
229 }
230
231 if (mp_obj_is_small_int(args[0])) {
232 mp_int_t len = MP_OBJ_SMALL_INT_VALUE(args[0]);
233 if (len < 0) {
234 mp_raise_ValueError(NULL);
235 }
236 vstr_t vstr;
237 vstr_init_len(&vstr, len);
238 memset(vstr.buf, 0, len);
239 return mp_obj_new_str_from_vstr(&mp_type_bytes, &vstr);
240 }
241
242 // check if argument has the buffer protocol
243 mp_buffer_info_t bufinfo;
244 if (mp_get_buffer(args[0], &bufinfo, MP_BUFFER_READ)) {
245 return mp_obj_new_bytes(bufinfo.buf, bufinfo.len);
246 }
247
248 vstr_t vstr;
249 // Try to create array of exact len if initializer len is known
250 mp_obj_t len_in = mp_obj_len_maybe(args[0]);
251 if (len_in == MP_OBJ_NULL) {
252 vstr_init(&vstr, 16);
253 } else {
254 mp_int_t len = MP_OBJ_SMALL_INT_VALUE(len_in);
255 vstr_init(&vstr, len);
256 }
257
258 mp_obj_iter_buf_t iter_buf;
259 mp_obj_t iterable = mp_getiter(args[0], &iter_buf);
260 mp_obj_t item;
261 while ((item = mp_iternext(iterable)) != MP_OBJ_STOP_ITERATION) {
262 mp_int_t val = mp_obj_get_int(item);
263 #if MICROPY_FULL_CHECKS
264 if (val < 0 || val > 255) {
265 mp_raise_ValueError(MP_ERROR_TEXT("bytes value out of range"));
266 }
267 #endif
268 vstr_add_byte(&vstr, val);
269 }
270
271 return mp_obj_new_str_from_vstr(&mp_type_bytes, &vstr);
272
273wrong_args:
274 mp_raise_TypeError(MP_ERROR_TEXT("wrong number of arguments"));
275}
276
// like strstr but with specified length and allows \0 bytes
// Searches haystack[0..hlen) for needle[0..nlen). With direction > 0 the scan
// starts at the first candidate position and moves forward; otherwise it
// starts at the last candidate position and moves backward. Returns a pointer
// to the first match encountered, or NULL if there is none.
// TODO replace with something more efficient/standard
const byte *find_subbytes(const byte *haystack, size_t hlen, const byte *needle, size_t nlen, int direction) {
    if (nlen > hlen) {
        // needle cannot fit
        return NULL;
    }

    const size_t last_pos = hlen - nlen;
    const bool forward = direction > 0;
    size_t pos = forward ? 0 : last_pos;
    const size_t stop = forward ? last_pos : 0;

    while (memcmp(haystack + pos, needle, nlen) != 0) {
        if (pos == stop) {
            // every candidate position has been tried
            return NULL;
        }
        pos += direction;
    }
    return haystack + pos;
}
303
// Note: this function is used to check if an object is a str or bytes, which
// works because both those types use it as their binary_op method. Revisit
// mp_obj_is_str_or_bytes if this fact changes.
//
// Binary operator handler for a str/bytes left-hand side. Handles '%'
// (when enabled), '*', '+', 'in' and the ordering/equality comparisons.
// Returns MP_OBJ_NULL to signal that the operation is not supported for the
// given operands.
mp_obj_t mp_obj_str_binary_op(mp_binary_op_t op, mp_obj_t lhs_in, mp_obj_t rhs_in) {
    // check for modulo
    if (op == MP_BINARY_OP_MODULO) {
        #if MICROPY_PY_BUILTINS_STR_OP_MODULO
        // By default the rhs is treated as a single positional argument; an
        // exact tuple is unpacked, and an exact dict is passed as the mapping.
        mp_obj_t *args = &rhs_in;
        size_t n_args = 1;
        mp_obj_t dict = MP_OBJ_NULL;
        if (mp_obj_is_type(rhs_in, &mp_type_tuple)) {
            // TODO: Support tuple subclasses?
            mp_obj_tuple_get(rhs_in, &n_args, &args);
        } else if (mp_obj_is_type(rhs_in, &mp_type_dict)) {
            dict = rhs_in;
        }
        return str_modulo_format(lhs_in, n_args, args, dict);
        #else
        return MP_OBJ_NULL;
        #endif
    }

    // from now on we need lhs type and data, so extract them
    const mp_obj_type_t *lhs_type = mp_obj_get_type(lhs_in);
    GET_STR_DATA_LEN(lhs_in, lhs_data, lhs_len);

    // check for multiply
    if (op == MP_BINARY_OP_MULTIPLY) {
        mp_int_t n;
        if (!mp_obj_get_int_maybe(rhs_in, &n)) {
            return MP_OBJ_NULL; // op not supported
        }
        // A non-positive repeat count gives the empty value of the lhs type.
        if (n <= 0) {
            if (lhs_type == &mp_type_str) {
                return MP_OBJ_NEW_QSTR(MP_QSTR_); // empty str
            } else {
                return mp_const_empty_bytes;
            }
        }
        // Allocate the full result and fill it with n copies of the lhs data.
        vstr_t vstr;
        vstr_init_len(&vstr, lhs_len * n);
        mp_seq_multiply(lhs_data, sizeof(*lhs_data), lhs_len, n, vstr.buf);
        return mp_obj_new_str_from_vstr(lhs_type, &vstr);
    }

    // From now on all operations allow:
    //    - str with str
    //    - bytes with bytes
    //    - bytes with bytearray
    //    - bytes with array.array
    // To do this efficiently we use the buffer protocol to extract the raw
    // data for the rhs, but only if the lhs is a bytes object.
    //
    // NOTE: CPython does not allow comparison between bytes and array.array
    // (even if the array is of type 'b'), even though it allows addition of
    // such types.  We are not compatible with this (we do allow comparison
    // of bytes with anything that has the buffer protocol).  It would be
    // easy to "fix" this with a bit of extra logic below, but it costs code
    // size and execution time so we don't.

    const byte *rhs_data;
    size_t rhs_len;
    if (lhs_type == mp_obj_get_type(rhs_in)) {
        // Same type on both sides: read the rhs data directly.
        GET_STR_DATA_LEN(rhs_in, rhs_data_, rhs_len_);
        rhs_data = rhs_data_;
        rhs_len = rhs_len_;
    } else if (lhs_type == &mp_type_bytes) {
        // bytes on the left: accept any rhs that exposes a readable buffer.
        mp_buffer_info_t bufinfo;
        if (!mp_get_buffer(rhs_in, &bufinfo, MP_BUFFER_READ)) {
            return MP_OBJ_NULL; // op not supported
        }
        rhs_data = bufinfo.buf;
        rhs_len = bufinfo.len;
    } else {
        // LHS is str and RHS has an incompatible type
        // (except if operation is EQUAL, but that's handled by mp_obj_equal)
        // bad_implicit_conversion is declared NORETURN, so rhs_data/rhs_len
        // are always initialised when execution continues past this point.
        bad_implicit_conversion(rhs_in);
    }

    switch (op) {
        case MP_BINARY_OP_ADD:
        case MP_BINARY_OP_INPLACE_ADD: {
            // If one side is empty the other operand is returned as-is
            // (the rhs only when it has the same type as the lhs).
            if (lhs_len == 0 && mp_obj_get_type(rhs_in) == lhs_type) {
                return rhs_in;
            }
            if (rhs_len == 0) {
                return lhs_in;
            }

            // Concatenate into a freshly allocated buffer of the exact size.
            vstr_t vstr;
            vstr_init_len(&vstr, lhs_len + rhs_len);
            memcpy(vstr.buf, lhs_data, lhs_len);
            memcpy(vstr.buf + lhs_len, rhs_data, rhs_len);
            return mp_obj_new_str_from_vstr(lhs_type, &vstr);
        }

        case MP_BINARY_OP_CONTAINS:
            // Forward substring search of the rhs data inside the lhs data.
            return mp_obj_new_bool(find_subbytes(lhs_data, lhs_len, rhs_data, rhs_len, 1) != NULL);

        // case MP_BINARY_OP_NOT_EQUAL: // This is never passed here
        case MP_BINARY_OP_EQUAL: // This will be passed only for bytes, str is dealt with in mp_obj_equal()
        case MP_BINARY_OP_LESS:
        case MP_BINARY_OP_LESS_EQUAL:
        case MP_BINARY_OP_MORE:
        case MP_BINARY_OP_MORE_EQUAL:
            return mp_obj_new_bool(mp_seq_cmp_bytes(op, lhs_data, lhs_len, rhs_data, rhs_len));

        default:
            return MP_OBJ_NULL; // op not supported
    }
}
415
416#if !MICROPY_PY_BUILTINS_STR_UNICODE
417// objstrunicode defines own version
418const byte *str_index_to_ptr(const mp_obj_type_t *type, const byte *self_data, size_t self_len,
419 mp_obj_t index, bool is_slice) {
420 size_t index_val = mp_get_index(type, self_len, index, is_slice);
421 return self_data + index_val;
422}
423#endif
424
425// This is used for both bytes and 8-bit strings. This is not used for unicode strings.
426STATIC mp_obj_t bytes_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) {
427 const mp_obj_type_t *type = mp_obj_get_type(self_in);
428 GET_STR_DATA_LEN(self_in, self_data, self_len);
429 if (value == MP_OBJ_SENTINEL) {
430 // load
431 #if MICROPY_PY_BUILTINS_SLICE
432 if (mp_obj_is_type(index, &mp_type_slice)) {
433 mp_bound_slice_t slice;
434 if (!mp_seq_get_fast_slice_indexes(self_len, index, &slice)) {
435 mp_raise_NotImplementedError(MP_ERROR_TEXT("only slices with step=1 (aka None) are supported"));
436 }
437 return mp_obj_new_str_of_type(type, self_data + slice.start, slice.stop - slice.start);
438 }
439 #endif
440 size_t index_val = mp_get_index(type, self_len, index, false);
441 // If we have unicode enabled the type will always be bytes, so take the short cut.
442 if (MICROPY_PY_BUILTINS_STR_UNICODE || type == &mp_type_bytes) {
443 return MP_OBJ_NEW_SMALL_INT(self_data[index_val]);
444 } else {
445 return mp_obj_new_str_via_qstr((char *)&self_data[index_val], 1);
446 }
447 } else {
448 return MP_OBJ_NULL; // op not supported
449 }
450}
451
452STATIC mp_obj_t str_join(mp_obj_t self_in, mp_obj_t arg) {
453 mp_check_self(mp_obj_is_str_or_bytes(self_in));
454 const mp_obj_type_t *self_type = mp_obj_get_type(self_in);
455
456 // get separation string
457 GET_STR_DATA_LEN(self_in, sep_str, sep_len);
458
459 // process args
460 size_t seq_len;
461 mp_obj_t *seq_items;
462
463 if (!mp_obj_is_type(arg, &mp_type_list) && !mp_obj_is_type(arg, &mp_type_tuple)) {
464 // arg is not a list nor a tuple, try to convert it to a list
465 // TODO: Try to optimize?
466 arg = mp_type_list.make_new(&mp_type_list, 1, 0, &arg);
467 }
468 mp_obj_get_array(arg, &seq_len, &seq_items);
469
470 // count required length
471 size_t required_len = 0;
472 for (size_t i = 0; i < seq_len; i++) {
473 if (mp_obj_get_type(seq_items[i]) != self_type) {
474 mp_raise_TypeError(
475 MP_ERROR_TEXT("join expects a list of str/bytes objects consistent with self object"));
476 }
477 if (i > 0) {
478 required_len += sep_len;
479 }
480 GET_STR_LEN(seq_items[i], l);
481 required_len += l;
482 }
483
484 // make joined string
485 vstr_t vstr;
486 vstr_init_len(&vstr, required_len);
487 byte *data = (byte *)vstr.buf;
488 for (size_t i = 0; i < seq_len; i++) {
489 if (i > 0) {
490 memcpy(data, sep_str, sep_len);
491 data += sep_len;
492 }
493 GET_STR_DATA_LEN(seq_items[i], s, l);
494 memcpy(data, s, l);
495 data += l;
496 }
497
498 // return joined string
499 return mp_obj_new_str_from_vstr(self_type, &vstr);
500}
501MP_DEFINE_CONST_FUN_OBJ_2(str_join_obj, str_join);
502
503mp_obj_t mp_obj_str_split(size_t n_args, const mp_obj_t *args) {
504 const mp_obj_type_t *self_type = mp_obj_get_type(args[0]);
505 mp_int_t splits = -1;
506 mp_obj_t sep = mp_const_none;
507 if (n_args > 1) {
508 sep = args[1];
509 if (n_args > 2) {
510 splits = mp_obj_get_int(args[2]);
511 }
512 }
513
514 mp_obj_t res = mp_obj_new_list(0, NULL);
515 GET_STR_DATA_LEN(args[0], s, len);
516 const byte *top = s + len;
517
518 if (sep == mp_const_none) {
519 // sep not given, so separate on whitespace
520
521 // Initial whitespace is not counted as split, so we pre-do it
522 while (s < top && unichar_isspace(*s)) {
523 s++;
524 }
525 while (s < top && splits != 0) {
526 const byte *start = s;
527 while (s < top && !unichar_isspace(*s)) {
528 s++;
529 }
530 mp_obj_list_append(res, mp_obj_new_str_of_type(self_type, start, s - start));
531 if (s >= top) {
532 break;
533 }
534 while (s < top && unichar_isspace(*s)) {
535 s++;
536 }
537 if (splits > 0) {
538 splits--;
539 }
540 }
541
542 if (s < top) {
543 mp_obj_list_append(res, mp_obj_new_str_of_type(self_type, s, top - s));
544 }
545
546 } else {
547 // sep given
548 if (mp_obj_get_type(sep) != self_type) {
549 bad_implicit_conversion(sep);
550 }
551
552 size_t sep_len;
553 const char *sep_str = mp_obj_str_get_data(sep, &sep_len);
554
555 if (sep_len == 0) {
556 mp_raise_ValueError(MP_ERROR_TEXT("empty separator"));
557 }
558
559 for (;;) {
560 const byte *start = s;
561 for (;;) {
562 if (splits == 0 || s + sep_len > top) {
563 s = top;
564 break;
565 } else if (memcmp(s, sep_str, sep_len) == 0) {
566 break;
567 }
568 s++;
569 }
570 mp_obj_list_append(res, mp_obj_new_str_of_type(self_type, start, s - start));
571 if (s >= top) {
572 break;
573 }
574 s += sep_len;
575 if (splits > 0) {
576 splits--;
577 }
578 }
579 }
580
581 return res;
582}
583MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_split_obj, 1, 3, mp_obj_str_split);
584
#if MICROPY_PY_BUILTINS_STR_SPLITLINES
// splitlines([keepends]): split on "\n", "\r" and "\r\n".
// Returns a list of pieces of the same type as self. The line terminators are
// included in each piece only when keepends is true.
STATIC mp_obj_t str_splitlines(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
    enum { ARG_keepends };
    static const mp_arg_t allowed_args[] = {
        { MP_QSTR_keepends, MP_ARG_BOOL, {.u_bool = false} },
    };

    // parse args
    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
    mp_arg_parse_all(n_args - 1, pos_args + 1, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);

    const mp_obj_type_t *self_type = mp_obj_get_type(pos_args[0]);
    mp_obj_t res = mp_obj_new_list(0, NULL);

    GET_STR_DATA_LEN(pos_args[0], s, len);
    const byte *top = s + len;

    while (s < top) {
        const byte *start = s;
        // number of terminator bytes found at s (0 if the data ended first)
        size_t match = 0;
        while (s < top) {
            if (*s == '\n') {
                match = 1;
                break;
            } else if (*s == '\r') {
                // Only look at the following byte when it is still inside the
                // data, so a trailing '\r' never reads at or beyond top.
                if (s + 1 < top && s[1] == '\n') {
                    match = 2;
                } else {
                    match = 1;
                }
                break;
            }
            s++;
        }
        size_t sub_len = s - start;
        if (args[ARG_keepends].u_bool) {
            sub_len += match;
        }
        mp_obj_list_append(res, mp_obj_new_str_of_type(self_type, start, sub_len));
        s += match;
    }

    return res;
}
MP_DEFINE_CONST_FUN_OBJ_KW(str_splitlines_obj, 1, str_splitlines);
#endif
631
// rsplit([sep[, maxsplit]]): split from the right-hand end.
// Without a non-negative maxsplit this is the same as split(), so those cases
// are delegated. rsplit(None, n) is not implemented and raises
// NotImplementedError; an empty separator raises ValueError.
STATIC mp_obj_t str_rsplit(size_t n_args, const mp_obj_t *args) {
    if (n_args < 3) {
        // If we don't have split limit, it doesn't matter from which side
        // we split.
        return mp_obj_str_split(n_args, args);
    }
    const mp_obj_type_t *self_type = mp_obj_get_type(args[0]);
    mp_obj_t sep = args[1];
    GET_STR_DATA_LEN(args[0], s, len);

    mp_int_t splits = mp_obj_get_int(args[2]);
    if (splits < 0) {
        // Negative limit means no limit, so delegate to split().
        return mp_obj_str_split(n_args, args);
    }

    mp_int_t org_splits = splits;
    // Preallocate list to the max expected # of elements, as we
    // will fill it from the end.
    mp_obj_list_t *res = MP_OBJ_TO_PTR(mp_obj_new_list(splits + 1, NULL));
    // idx is the next slot to fill, counting down from the last slot.
    mp_int_t idx = splits;

    if (sep == mp_const_none) {
        mp_raise_NotImplementedError(MP_ERROR_TEXT("rsplit(None,n)"));
    } else {
        size_t sep_len;
        const char *sep_str = mp_obj_str_get_data(sep, &sep_len);

        if (sep_len == 0) {
            mp_raise_ValueError(MP_ERROR_TEXT("empty separator"));
        }

        // beg is the start of the data; last is the end of the part that has
        // not been split off yet.
        const byte *beg = s;
        const byte *last = s + len;
        for (;;) {
            // Scan backwards from the last position where sep could still fit.
            s = last - sep_len;
            for (;;) {
                if (splits == 0 || s < beg) {
                    break;
                } else if (memcmp(s, sep_str, sep_len) == 0) {
                    break;
                }
                s--;
            }
            if (s < beg || splits == 0) {
                // No separator found, or the split budget is used up: the
                // remaining head of the data becomes the final item.
                res->items[idx] = mp_obj_new_str_of_type(self_type, beg, last - beg);
                break;
            }
            // Store the text that follows the separator, then continue to its left.
            res->items[idx--] = mp_obj_new_str_of_type(self_type, s + sep_len, last - s - sep_len);
            last = s;
            splits--;
        }
        if (idx != 0) {
            // We split less parts than split limit, now go cleanup surplus
            // Move the filled slots to the front, clear the rest, fix the length.
            size_t used = org_splits + 1 - idx;
            memmove(res->items, &res->items[idx], used * sizeof(mp_obj_t));
            mp_seq_clear(res->items, used, res->alloc, sizeof(*res->items));
            res->len = used;
        }
    }

    return MP_OBJ_FROM_PTR(res);
}
MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_rsplit_obj, 1, 3, str_rsplit);
696
697STATIC mp_obj_t str_finder(size_t n_args, const mp_obj_t *args, int direction, bool is_index) {
698 const mp_obj_type_t *self_type = mp_obj_get_type(args[0]);
699 mp_check_self(mp_obj_is_str_or_bytes(args[0]));
700
701 // check argument type
702 if (mp_obj_get_type(args[1]) != self_type) {
703 bad_implicit_conversion(args[1]);
704 }
705
706 GET_STR_DATA_LEN(args[0], haystack, haystack_len);
707 GET_STR_DATA_LEN(args[1], needle, needle_len);
708
709 const byte *start = haystack;
710 const byte *end = haystack + haystack_len;
711 if (n_args >= 3 && args[2] != mp_const_none) {
712 start = str_index_to_ptr(self_type, haystack, haystack_len, args[2], true);
713 }
714 if (n_args >= 4 && args[3] != mp_const_none) {
715 end = str_index_to_ptr(self_type, haystack, haystack_len, args[3], true);
716 }
717
718 if (end < start) {
719 goto out_error;
720 }
721
722 const byte *p = find_subbytes(start, end - start, needle, needle_len, direction);
723 if (p == NULL) {
724 out_error:
725 // not found
726 if (is_index) {
727 mp_raise_ValueError(MP_ERROR_TEXT("substring not found"));
728 } else {
729 return MP_OBJ_NEW_SMALL_INT(-1);
730 }
731 } else {
732 // found
733 #if MICROPY_PY_BUILTINS_STR_UNICODE
734 if (self_type == &mp_type_str) {
735 return MP_OBJ_NEW_SMALL_INT(utf8_ptr_to_index(haystack, p));
736 }
737 #endif
738 return MP_OBJ_NEW_SMALL_INT(p - haystack);
739 }
740}
741
742STATIC mp_obj_t str_find(size_t n_args, const mp_obj_t *args) {
743 return str_finder(n_args, args, 1, false);
744}
745MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_find_obj, 2, 4, str_find);
746
747STATIC mp_obj_t str_rfind(size_t n_args, const mp_obj_t *args) {
748 return str_finder(n_args, args, -1, false);
749}
750MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_rfind_obj, 2, 4, str_rfind);
751
752STATIC mp_obj_t str_index(size_t n_args, const mp_obj_t *args) {
753 return str_finder(n_args, args, 1, true);
754}
755MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_index_obj, 2, 4, str_index);
756
757STATIC mp_obj_t str_rindex(size_t n_args, const mp_obj_t *args) {
758 return str_finder(n_args, args, -1, true);
759}
760MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_rindex_obj, 2, 4, str_rindex);
761
762// TODO: (Much) more variety in args
763STATIC mp_obj_t str_startswith(size_t n_args, const mp_obj_t *args) {
764 const mp_obj_type_t *self_type = mp_obj_get_type(args[0]);
765 GET_STR_DATA_LEN(args[0], str, str_len);
766 size_t prefix_len;
767 const char *prefix = mp_obj_str_get_data(args[1], &prefix_len);
768 const byte *start = str;
769 if (n_args > 2) {
770 start = str_index_to_ptr(self_type, str, str_len, args[2], true);
771 }
772 if (prefix_len + (start - str) > str_len) {
773 return mp_const_false;
774 }
775 return mp_obj_new_bool(memcmp(start, prefix, prefix_len) == 0);
776}
777MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_startswith_obj, 2, 3, str_startswith);
778
779STATIC mp_obj_t str_endswith(size_t n_args, const mp_obj_t *args) {
780 GET_STR_DATA_LEN(args[0], str, str_len);
781 size_t suffix_len;
782 const char *suffix = mp_obj_str_get_data(args[1], &suffix_len);
783 if (n_args > 2) {
784 mp_raise_NotImplementedError(MP_ERROR_TEXT("start/end indices"));
785 }
786
787 if (suffix_len > str_len) {
788 return mp_const_false;
789 }
790 return mp_obj_new_bool(memcmp(str + (str_len - suffix_len), suffix, suffix_len) == 0);
791}
792MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_endswith_obj, 2, 3, str_endswith);
793
// Selects which end(s) str_uni_strip() trims.
enum { LSTRIP, RSTRIP, STRIP };

// Shared implementation of strip/lstrip/rstrip.
//   type     one of LSTRIP, RSTRIP, STRIP
//   args[0]  the str/bytes object to trim
//   args[1]  optional set of characters to remove (same type as args[0]);
//            defaults to ASCII whitespace
// Returns args[0] itself when nothing is removed, the empty str/bytes when
// everything is removed, and a new object of the same type otherwise.
STATIC mp_obj_t str_uni_strip(int type, size_t n_args, const mp_obj_t *args) {
    mp_check_self(mp_obj_is_str_or_bytes(args[0]));
    const mp_obj_type_t *self_type = mp_obj_get_type(args[0]);

    const byte *chars_to_del;
    uint chars_to_del_len;
    static const byte whitespace[] = " \t\n\r\v\f";

    if (n_args == 1) {
        chars_to_del = whitespace;
        // -1 so the array's terminating NUL is not part of the set
        chars_to_del_len = sizeof(whitespace) - 1;
    } else {
        if (mp_obj_get_type(args[1]) != self_type) {
            bad_implicit_conversion(args[1]);
        }
        GET_STR_DATA_LEN(args[1], s, l);
        chars_to_del = s;
        chars_to_del_len = l;
    }

    GET_STR_DATA_LEN(args[0], orig_str, orig_str_len);

    size_t first_good_char_pos = 0;
    bool first_good_char_pos_set = false;
    size_t last_good_char_pos = 0;
    // i is the index being examined; RSTRIP walks from the last byte backwards,
    // the other modes walk forwards from index 0.
    size_t i = 0;
    int delta = 1;
    if (type == RSTRIP) {
        i = orig_str_len - 1;
        delta = -1;
    }
    // len only counts iterations, so every byte is visited at most once.
    for (size_t len = orig_str_len; len > 0; len--) {
        // a "good" char is one that is not in the set of chars to delete
        if (find_subbytes(chars_to_del, chars_to_del_len, &orig_str[i], 1, 1) == NULL) {
            if (!first_good_char_pos_set) {
                first_good_char_pos_set = true;
                first_good_char_pos = i;
                if (type == LSTRIP) {
                    // keep everything from here to the end
                    last_good_char_pos = orig_str_len - 1;
                    break;
                } else if (type == RSTRIP) {
                    // keep everything from the start up to here
                    first_good_char_pos = 0;
                    last_good_char_pos = i;
                    break;
                }
            }
            // STRIP: keep scanning so this ends up at the final good char
            last_good_char_pos = i;
        }
        i += delta;
    }

    if (!first_good_char_pos_set) {
        // string is all whitespace, return ''
        if (self_type == &mp_type_str) {
            return MP_OBJ_NEW_QSTR(MP_QSTR_);
        } else {
            return mp_const_empty_bytes;
        }
    }

    assert(last_good_char_pos >= first_good_char_pos);
    // +1 to accommodate the last character
    size_t stripped_len = last_good_char_pos - first_good_char_pos + 1;
    if (stripped_len == orig_str_len) {
        // If nothing was stripped, don't bother to dup original string
        // TODO: watch out for this case when we'll get to bytearray.strip()
        assert(first_good_char_pos == 0);
        return args[0];
    }
    return mp_obj_new_str_of_type(self_type, orig_str + first_good_char_pos, stripped_len);
}
866
867STATIC mp_obj_t str_strip(size_t n_args, const mp_obj_t *args) {
868 return str_uni_strip(STRIP, n_args, args);
869}
870MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_strip_obj, 1, 2, str_strip);
871
872STATIC mp_obj_t str_lstrip(size_t n_args, const mp_obj_t *args) {
873 return str_uni_strip(LSTRIP, n_args, args);
874}
875MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_lstrip_obj, 1, 2, str_lstrip);
876
877STATIC mp_obj_t str_rstrip(size_t n_args, const mp_obj_t *args) {
878 return str_uni_strip(RSTRIP, n_args, args);
879}
880MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_rstrip_obj, 1, 2, str_rstrip);
881
#if MICROPY_PY_BUILTINS_STR_CENTER
// center(width): return the data centred in a field of `width` bytes, padded
// with spaces. When the padding cannot be split evenly the extra space goes on
// the right. If width is not larger than the current length (including any
// negative width) the original object is returned unchanged.
STATIC mp_obj_t str_center(mp_obj_t str_in, mp_obj_t width_in) {
    GET_STR_DATA_LEN(str_in, str, str_len);

    // Keep the width signed until it has been range-checked: converting a
    // negative value straight to an unsigned type would turn it into a huge
    // width and lead to an enormous allocation request.
    mp_int_t width_arg = mp_obj_get_int(width_in);
    if (width_arg < 0 || str_len >= (size_t)width_arg) {
        return str_in;
    }
    size_t width = (size_t)width_arg;

    // Fill the whole field with spaces, then drop the data into the middle.
    vstr_t vstr;
    vstr_init_len(&vstr, width);
    memset(vstr.buf, ' ', width);
    size_t left = (width - str_len) / 2;
    memcpy(vstr.buf + left, str, str_len);
    return mp_obj_new_str_from_vstr(mp_obj_get_type(str_in), &vstr);
}
MP_DEFINE_CONST_FUN_OBJ_2(str_center_obj, str_center);
#endif
899
// Takes an int arg, but only parses unsigned numbers, and only changes
// *num if at least one digit was parsed.
//
// Parses the run of decimal digits at the start of [str, top) and returns a
// pointer to the first character after that run (str itself if there is no
// digit). A value too large for an int is clamped to INT_MAX rather than
// overflowing, since signed overflow is undefined behaviour.
STATIC const char *str_to_int(const char *str, const char *top, int *num) {
    if (str < top && '0' <= *str && *str <= '9') {
        int value = 0;
        do {
            int digit = *str - '0';
            if (value > (INT_MAX - digit) / 10) {
                // value * 10 + digit would not fit; saturate (and stay
                // saturated for any further digits)
                value = INT_MAX;
            } else {
                value = value * 10 + digit;
            }
            str++;
        } while (str < top && '0' <= *str && *str <= '9');
        *num = value;
    }
    return str;
}
913
// True if ch is one of the alignment characters '<', '>', '=' or '^'.
STATIC bool isalignment(char ch) {
    switch (ch) {
        case '<':
        case '>':
        case '=':
        case '^':
            return true;
        default:
            // includes '\0'
            return false;
    }
}
917
// True if ch is one of the presentation-type characters "bcdeEfFgGnosxX%".
STATIC bool istype(char ch) {
    switch (ch) {
        case 'b':
        case 'c':
        case 'd':
        case 'e':
        case 'E':
        case 'f':
        case 'F':
        case 'g':
        case 'G':
        case 'n':
        case 'o':
        case 's':
        case 'x':
        case 'X':
        case '%':
            return true;
        default:
            // includes '\0'
            return false;
    }
}
921
922STATIC bool arg_looks_integer(mp_obj_t arg) {
923 return mp_obj_is_bool(arg) || mp_obj_is_int(arg);
924}
925
926STATIC bool arg_looks_numeric(mp_obj_t arg) {
927 return arg_looks_integer(arg)
928 #if MICROPY_PY_BUILTINS_FLOAT
929 || mp_obj_is_float(arg)
930 #endif
931 ;
932}
933
#if MICROPY_PY_BUILTINS_STR_OP_MODULO
// Return arg unchanged, except that a float (when floats are enabled) is
// converted to an int object first.
STATIC mp_obj_t arg_as_int(mp_obj_t arg) {
    mp_obj_t result = arg;
    #if MICROPY_PY_BUILTINS_FLOAT
    if (mp_obj_is_float(arg)) {
        result = mp_obj_new_int_from_float(mp_obj_float_get(arg));
    }
    #endif
    return result;
}
#endif
944
#if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
// With terse error reporting, the format-string parser raises this one generic
// ValueError instead of a message specific to each kind of mistake.
STATIC NORETURN void terse_str_format_value_error(void) {
    mp_raise_ValueError(MP_ERROR_TEXT("bad format string"));
}
#else
// define to nothing to improve coverage
// (with other error-reporting levels a call to this expands to nothing)
#define terse_str_format_value_error()
#endif
953
954STATIC vstr_t mp_obj_str_format_helper(const char *str, const char *top, int *arg_i, size_t n_args, const mp_obj_t *args, mp_map_t *kwargs) {
955 vstr_t vstr;
956 mp_print_t print;
957 vstr_init_print(&vstr, 16, &print);
958
959 for (; str < top; str++) {
960 if (*str == '}') {
961 str++;
962 if (str < top && *str == '}') {
963 vstr_add_byte(&vstr, '}');
964 continue;
965 }
966 #if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
967 terse_str_format_value_error();
968 #else
969 mp_raise_ValueError(MP_ERROR_TEXT("single '}' encountered in format string"));
970 #endif
971 }
972 if (*str != '{') {
973 vstr_add_byte(&vstr, *str);
974 continue;
975 }
976
977 str++;
978 if (str < top && *str == '{') {
979 vstr_add_byte(&vstr, '{');
980 continue;
981 }
982
983 // replacement_field ::= "{" [field_name] ["!" conversion] [":" format_spec] "}"
984
985 const char *field_name = NULL;
986 const char *field_name_top = NULL;
987 char conversion = '\0';
988 const char *format_spec = NULL;
989
990 if (str < top && *str != '}' && *str != '!' && *str != ':') {
991 field_name = (const char *)str;
992 while (str < top && *str != '}' && *str != '!' && *str != ':') {
993 ++str;
994 }
995 field_name_top = (const char *)str;
996 }
997
998 // conversion ::= "r" | "s"
999
1000 if (str < top && *str == '!') {
1001 str++;
1002 if (str < top && (*str == 'r' || *str == 's')) {
1003 conversion = *str++;
1004 } else {
1005 #if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
1006 terse_str_format_value_error();
1007 #elif MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_NORMAL
1008 mp_raise_ValueError(MP_ERROR_TEXT("bad conversion specifier"));
1009 #else
1010 if (str >= top) {
1011 mp_raise_ValueError(
1012 MP_ERROR_TEXT("end of format while looking for conversion specifier"));
1013 } else {
1014 mp_raise_msg_varg(&mp_type_ValueError,
1015 MP_ERROR_TEXT("unknown conversion specifier %c"), *str);
1016 }
1017 #endif
1018 }
1019 }
1020
1021 if (str < top && *str == ':') {
1022 str++;
1023 // {:} is the same as {}, which is the same as {!s}
1024 // This makes a difference when passing in a True or False
1025 // '{}'.format(True) returns 'True'
1026 // '{:d}'.format(True) returns '1'
1027 // So we treat {:} as {} and this later gets treated to be {!s}
1028 if (*str != '}') {
1029 format_spec = str;
1030 for (int nest = 1; str < top;) {
1031 if (*str == '{') {
1032 ++nest;
1033 } else if (*str == '}') {
1034 if (--nest == 0) {
1035 break;
1036 }
1037 }
1038 ++str;
1039 }
1040 }
1041 }
1042 if (str >= top) {
1043 #if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
1044 terse_str_format_value_error();
1045 #else
1046 mp_raise_ValueError(MP_ERROR_TEXT("unmatched '{' in format"));
1047 #endif
1048 }
1049 if (*str != '}') {
1050 #if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
1051 terse_str_format_value_error();
1052 #else
1053 mp_raise_ValueError(MP_ERROR_TEXT("expected ':' after format specifier"));
1054 #endif
1055 }
1056
1057 mp_obj_t arg = mp_const_none;
1058
1059 if (field_name) {
1060 int index = 0;
1061 if (MP_LIKELY(unichar_isdigit(*field_name))) {
1062 if (*arg_i > 0) {
1063 #if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
1064 terse_str_format_value_error();
1065 #else
1066 mp_raise_ValueError(
1067 MP_ERROR_TEXT("can't switch from automatic field numbering to manual field specification"));
1068 #endif
1069 }
1070 field_name = str_to_int(field_name, field_name_top, &index);
1071 if ((uint)index >= n_args - 1) {
1072 mp_raise_msg(&mp_type_IndexError, MP_ERROR_TEXT("tuple index out of range"));
1073 }
1074 arg = args[index + 1];
1075 *arg_i = -1;
1076 } else {
1077 const char *lookup;
1078 for (lookup = field_name; lookup < field_name_top && *lookup != '.' && *lookup != '['; lookup++) {;
1079 }
1080 mp_obj_t field_q = mp_obj_new_str_via_qstr(field_name, lookup - field_name); // should it be via qstr?
1081 field_name = lookup;
1082 mp_map_elem_t *key_elem = mp_map_lookup(kwargs, field_q, MP_MAP_LOOKUP);
1083 if (key_elem == NULL) {
1084 nlr_raise(mp_obj_new_exception_arg1(&mp_type_KeyError, field_q));
1085 }
1086 arg = key_elem->value;
1087 }
1088 if (field_name < field_name_top) {
1089 mp_raise_NotImplementedError(MP_ERROR_TEXT("attributes not supported yet"));
1090 }
1091 } else {
1092 if (*arg_i < 0) {
1093 #if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
1094 terse_str_format_value_error();
1095 #else
1096 mp_raise_ValueError(
1097 MP_ERROR_TEXT("can't switch from manual field specification to automatic field numbering"));
1098 #endif
1099 }
1100 if ((uint)*arg_i >= n_args - 1) {
1101 mp_raise_msg(&mp_type_IndexError, MP_ERROR_TEXT("tuple index out of range"));
1102 }
1103 arg = args[(*arg_i) + 1];
1104 (*arg_i)++;
1105 }
1106 if (!format_spec && !conversion) {
1107 conversion = 's';
1108 }
1109 if (conversion) {
1110 mp_print_kind_t print_kind;
1111 if (conversion == 's') {
1112 print_kind = PRINT_STR;
1113 } else {
1114 assert(conversion == 'r');
1115 print_kind = PRINT_REPR;
1116 }
1117 vstr_t arg_vstr;
1118 mp_print_t arg_print;
1119 vstr_init_print(&arg_vstr, 16, &arg_print);
1120 mp_obj_print_helper(&arg_print, arg, print_kind);
1121 arg = mp_obj_new_str_from_vstr(&mp_type_str, &arg_vstr);
1122 }
1123
1124 char fill = '\0';
1125 char align = '\0';
1126 int width = -1;
1127 int precision = -1;
1128 char type = '\0';
1129 int flags = 0;
1130
1131 if (format_spec) {
1132 // The format specifier (from http://docs.python.org/2/library/string.html#formatspec)
1133 //
1134 // [[fill]align][sign][#][0][width][,][.precision][type]
1135 // fill ::= <any character>
1136 // align ::= "<" | ">" | "=" | "^"
1137 // sign ::= "+" | "-" | " "
1138 // width ::= integer
1139 // precision ::= integer
1140 // type ::= "b" | "c" | "d" | "e" | "E" | "f" | "F" | "g" | "G" | "n" | "o" | "s" | "x" | "X" | "%"
1141
1142 // recursively call the formatter to format any nested specifiers
1143 MP_STACK_CHECK();
1144 vstr_t format_spec_vstr = mp_obj_str_format_helper(format_spec, str, arg_i, n_args, args, kwargs);
1145 const char *s = vstr_null_terminated_str(&format_spec_vstr);
1146 const char *stop = s + format_spec_vstr.len;
1147 if (isalignment(*s)) {
1148 align = *s++;
1149 } else if (*s && isalignment(s[1])) {
1150 fill = *s++;
1151 align = *s++;
1152 }
1153 if (*s == '+' || *s == '-' || *s == ' ') {
1154 if (*s == '+') {
1155 flags |= PF_FLAG_SHOW_SIGN;
1156 } else if (*s == ' ') {
1157 flags |= PF_FLAG_SPACE_SIGN;
1158 }
1159 s++;
1160 }
1161 if (*s == '#') {
1162 flags |= PF_FLAG_SHOW_PREFIX;
1163 s++;
1164 }
1165 if (*s == '0') {
1166 if (!align) {
1167 align = '=';
1168 }
1169 if (!fill) {
1170 fill = '0';
1171 }
1172 }
1173 s = str_to_int(s, stop, &width);
1174 if (*s == ',') {
1175 flags |= PF_FLAG_SHOW_COMMA;
1176 s++;
1177 }
1178 if (*s == '.') {
1179 s++;
1180 s = str_to_int(s, stop, &precision);
1181 }
1182 if (istype(*s)) {
1183 type = *s++;
1184 }
1185 if (*s) {
1186 #if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
1187 terse_str_format_value_error();
1188 #else
1189 mp_raise_ValueError(MP_ERROR_TEXT("invalid format specifier"));
1190 #endif
1191 }
1192 vstr_clear(&format_spec_vstr);
1193 }
1194 if (!align) {
1195 if (arg_looks_numeric(arg)) {
1196 align = '>';
1197 } else {
1198 align = '<';
1199 }
1200 }
1201 if (!fill) {
1202 fill = ' ';
1203 }
1204
1205 if (flags & (PF_FLAG_SHOW_SIGN | PF_FLAG_SPACE_SIGN)) {
1206 if (type == 's') {
1207 #if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
1208 terse_str_format_value_error();
1209 #else
1210 mp_raise_ValueError(MP_ERROR_TEXT("sign not allowed in string format specifier"));
1211 #endif
1212 }
1213 if (type == 'c') {
1214 #if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
1215 terse_str_format_value_error();
1216 #else
1217 mp_raise_ValueError(
1218 MP_ERROR_TEXT("sign not allowed with integer format specifier 'c'"));
1219 #endif
1220 }
1221 }
1222
1223 switch (align) {
1224 case '<':
1225 flags |= PF_FLAG_LEFT_ADJUST;
1226 break;
1227 case '=':
1228 flags |= PF_FLAG_PAD_AFTER_SIGN;
1229 break;
1230 case '^':
1231 flags |= PF_FLAG_CENTER_ADJUST;
1232 break;
1233 }
1234
1235 if (arg_looks_integer(arg)) {
1236 switch (type) {
1237 case 'b':
1238 mp_print_mp_int(&print, arg, 2, 'a', flags, fill, width, 0);
1239 continue;
1240
1241 case 'c': {
1242 char ch = mp_obj_get_int(arg);
1243 mp_print_strn(&print, &ch, 1, flags, fill, width);
1244 continue;
1245 }
1246
1247 case '\0': // No explicit format type implies 'd'
1248 case 'n': // I don't think we support locales in uPy so use 'd'
1249 case 'd':
1250 mp_print_mp_int(&print, arg, 10, 'a', flags, fill, width, 0);
1251 continue;
1252
1253 case 'o':
1254 if (flags & PF_FLAG_SHOW_PREFIX) {
1255 flags |= PF_FLAG_SHOW_OCTAL_LETTER;
1256 }
1257
1258 mp_print_mp_int(&print, arg, 8, 'a', flags, fill, width, 0);
1259 continue;
1260
1261 case 'X':
1262 case 'x':
1263 mp_print_mp_int(&print, arg, 16, type - ('X' - 'A'), flags, fill, width, 0);
1264 continue;
1265
1266 case 'e':
1267 case 'E':
1268 case 'f':
1269 case 'F':
1270 case 'g':
1271 case 'G':
1272 case '%':
1273 // The floating point formatters all work with anything that
1274 // looks like an integer
1275 break;
1276
1277 default:
1278 #if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
1279 terse_str_format_value_error();
1280 #else
1281 mp_raise_msg_varg(&mp_type_ValueError,
1282 MP_ERROR_TEXT("unknown format code '%c' for object of type '%s'"),
1283 type, mp_obj_get_type_str(arg));
1284 #endif
1285 }
1286 }
1287
1288 // NOTE: no else here. We need the e, f, g etc formats for integer
1289 // arguments (from above if) to take this if.
1290 if (arg_looks_numeric(arg)) {
1291 if (!type) {
1292
1293 // Even though the docs say that an unspecified type is the same
1294 // as 'g', there is one subtle difference, when the exponent
1295 // is one less than the precision.
1296 //
1297 // '{:10.1}'.format(0.0) ==> '0e+00'
1298 // '{:10.1g}'.format(0.0) ==> '0'
1299 //
1300 // TODO: Figure out how to deal with this.
1301 //
1302 // A proper solution would involve adding a special flag
1303 // or something to format_float, and create a format_double
1304 // to deal with doubles. In order to fix this when using
1305 // sprintf, we'd need to use the e format and tweak the
1306 // returned result to strip trailing zeros like the g format
1307 // does.
1308 //
1309 // {:10.3} and {:10.2e} with 1.23e2 both produce 1.23e+02
1310 // but with 1.e2 you get 1e+02 and 1.00e+02
1311 //
1312 // Stripping the trailing 0's (like g) does would make the
1313 // e format give us the right format.
1314 //
1315 // CPython sources say:
1316 // Omitted type specifier. Behaves in the same way as repr(x)
1317 // and str(x) if no precision is given, else like 'g', but with
1318 // at least one digit after the decimal point. */
1319
1320 type = 'g';
1321 }
1322 if (type == 'n') {
1323 type = 'g';
1324 }
1325
1326 switch (type) {
1327 #if MICROPY_PY_BUILTINS_FLOAT
1328 case 'e':
1329 case 'E':
1330 case 'f':
1331 case 'F':
1332 case 'g':
1333 case 'G':
1334 mp_print_float(&print, mp_obj_get_float(arg), type, flags, fill, width, precision);
1335 break;
1336
1337 case '%':
1338 flags |= PF_FLAG_ADD_PERCENT;
1339 #if MICROPY_FLOAT_IMPL == MICROPY_FLOAT_IMPL_FLOAT
1340 #define F100 100.0F
1341 #else
1342 #define F100 100.0
1343 #endif
1344 mp_print_float(&print, mp_obj_get_float(arg) * F100, 'f', flags, fill, width, precision);
1345#undef F100
1346 break;
1347 #endif
1348
1349 default:
1350 #if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
1351 terse_str_format_value_error();
1352 #else
1353 mp_raise_msg_varg(&mp_type_ValueError,
1354 MP_ERROR_TEXT("unknown format code '%c' for object of type '%s'"),
1355 type, mp_obj_get_type_str(arg));
1356 #endif
1357 }
1358 } else {
1359 // arg doesn't look like a number
1360
1361 if (align == '=') {
1362 #if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
1363 terse_str_format_value_error();
1364 #else
1365 mp_raise_ValueError(
1366 MP_ERROR_TEXT("'=' alignment not allowed in string format specifier"));
1367 #endif
1368 }
1369
1370 switch (type) {
1371 case '\0': // no explicit format type implies 's'
1372 case 's': {
1373 size_t slen;
1374 const char *s = mp_obj_str_get_data(arg, &slen);
1375 if (precision < 0) {
1376 precision = slen;
1377 }
1378 if (slen > (size_t)precision) {
1379 slen = precision;
1380 }
1381 mp_print_strn(&print, s, slen, flags, fill, width);
1382 break;
1383 }
1384
1385 default:
1386 #if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
1387 terse_str_format_value_error();
1388 #else
1389 mp_raise_msg_varg(&mp_type_ValueError,
1390 MP_ERROR_TEXT("unknown format code '%c' for object of type '%s'"),
1391 type, mp_obj_get_type_str(arg));
1392 #endif
1393 }
1394 }
1395 }
1396
1397 return vstr;
1398}
1399
1400mp_obj_t mp_obj_str_format(size_t n_args, const mp_obj_t *args, mp_map_t *kwargs) {
1401 mp_check_self(mp_obj_is_str_or_bytes(args[0]));
1402
1403 GET_STR_DATA_LEN(args[0], str, len);
1404 int arg_i = 0;
1405 vstr_t vstr = mp_obj_str_format_helper((const char *)str, (const char *)str + len, &arg_i, n_args, args, kwargs);
1406 return mp_obj_new_str_from_vstr(mp_obj_get_type(args[0]), &vstr);
1407}
1408MP_DEFINE_CONST_FUN_OBJ_KW(str_format_obj, 1, mp_obj_str_format);
1409
1410#if MICROPY_PY_BUILTINS_STR_OP_MODULO
1411STATIC mp_obj_t str_modulo_format(mp_obj_t pattern, size_t n_args, const mp_obj_t *args, mp_obj_t dict) {
1412 mp_check_self(mp_obj_is_str_or_bytes(pattern));
1413
1414 GET_STR_DATA_LEN(pattern, str, len);
1415 #if MICROPY_ERROR_REPORTING != MICROPY_ERROR_REPORTING_TERSE
1416 const byte *start_str = str;
1417 #endif
1418 bool is_bytes = mp_obj_is_type(pattern, &mp_type_bytes);
1419 size_t arg_i = 0;
1420 vstr_t vstr;
1421 mp_print_t print;
1422 vstr_init_print(&vstr, 16, &print);
1423
1424 for (const byte *top = str + len; str < top; str++) {
1425 mp_obj_t arg = MP_OBJ_NULL;
1426 if (*str != '%') {
1427 vstr_add_byte(&vstr, *str);
1428 continue;
1429 }
1430 if (++str >= top) {
1431 goto incomplete_format;
1432 }
1433 if (*str == '%') {
1434 vstr_add_byte(&vstr, '%');
1435 continue;
1436 }
1437
1438 // Dictionary value lookup
1439 if (*str == '(') {
1440 if (dict == MP_OBJ_NULL) {
1441 mp_raise_TypeError(MP_ERROR_TEXT("format needs a dict"));
1442 }
1443 arg_i = 1; // we used up the single dict argument
1444 const byte *key = ++str;
1445 while (*str != ')') {
1446 if (str >= top) {
1447 #if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
1448 terse_str_format_value_error();
1449 #else
1450 mp_raise_ValueError(MP_ERROR_TEXT("incomplete format key"));
1451 #endif
1452 }
1453 ++str;
1454 }
1455 mp_obj_t k_obj = mp_obj_new_str_via_qstr((const char *)key, str - key);
1456 arg = mp_obj_dict_get(dict, k_obj);
1457 str++;
1458 }
1459
1460 int flags = 0;
1461 char fill = ' ';
1462 int alt = 0;
1463 while (str < top) {
1464 if (*str == '-') {
1465 flags |= PF_FLAG_LEFT_ADJUST;
1466 } else if (*str == '+') {
1467 flags |= PF_FLAG_SHOW_SIGN;
1468 } else if (*str == ' ') {
1469 flags |= PF_FLAG_SPACE_SIGN;
1470 } else if (*str == '#') {
1471 alt = PF_FLAG_SHOW_PREFIX;
1472 } else if (*str == '0') {
1473 flags |= PF_FLAG_PAD_AFTER_SIGN;
1474 fill = '0';
1475 } else {
1476 break;
1477 }
1478 str++;
1479 }
1480 // parse width, if it exists
1481 int width = 0;
1482 if (str < top) {
1483 if (*str == '*') {
1484 if (arg_i >= n_args) {
1485 goto not_enough_args;
1486 }
1487 width = mp_obj_get_int(args[arg_i++]);
1488 str++;
1489 } else {
1490 str = (const byte *)str_to_int((const char *)str, (const char *)top, &width);
1491 }
1492 }
1493 int prec = -1;
1494 if (str < top && *str == '.') {
1495 if (++str < top) {
1496 if (*str == '*') {
1497 if (arg_i >= n_args) {
1498 goto not_enough_args;
1499 }
1500 prec = mp_obj_get_int(args[arg_i++]);
1501 str++;
1502 } else {
1503 prec = 0;
1504 str = (const byte *)str_to_int((const char *)str, (const char *)top, &prec);
1505 }
1506 }
1507 }
1508
1509 if (str >= top) {
1510 incomplete_format:
1511 #if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
1512 terse_str_format_value_error();
1513 #else
1514 mp_raise_ValueError(MP_ERROR_TEXT("incomplete format"));
1515 #endif
1516 }
1517
1518 // Tuple value lookup
1519 if (arg == MP_OBJ_NULL) {
1520 if (arg_i >= n_args) {
1521 not_enough_args:
1522 mp_raise_TypeError(MP_ERROR_TEXT("format string needs more arguments"));
1523 }
1524 arg = args[arg_i++];
1525 }
1526 switch (*str) {
1527 case 'c':
1528 if (mp_obj_is_str(arg)) {
1529 size_t slen;
1530 const char *s = mp_obj_str_get_data(arg, &slen);
1531 if (slen != 1) {
1532 mp_raise_TypeError(MP_ERROR_TEXT("%c needs int or char"));
1533 }
1534 mp_print_strn(&print, s, 1, flags, ' ', width);
1535 } else if (arg_looks_integer(arg)) {
1536 char ch = mp_obj_get_int(arg);
1537 mp_print_strn(&print, &ch, 1, flags, ' ', width);
1538 } else {
1539 mp_raise_TypeError(MP_ERROR_TEXT("integer needed"));
1540 }
1541 break;
1542
1543 case 'd':
1544 case 'i':
1545 case 'u':
1546 mp_print_mp_int(&print, arg_as_int(arg), 10, 'a', flags, fill, width, prec);
1547 break;
1548
1549 #if MICROPY_PY_BUILTINS_FLOAT
1550 case 'e':
1551 case 'E':
1552 case 'f':
1553 case 'F':
1554 case 'g':
1555 case 'G':
1556 mp_print_float(&print, mp_obj_get_float(arg), *str, flags, fill, width, prec);
1557 break;
1558 #endif
1559
1560 case 'o':
1561 if (alt) {
1562 flags |= (PF_FLAG_SHOW_PREFIX | PF_FLAG_SHOW_OCTAL_LETTER);
1563 }
1564 mp_print_mp_int(&print, arg, 8, 'a', flags, fill, width, prec);
1565 break;
1566
1567 case 'r':
1568 case 's': {
1569 vstr_t arg_vstr;
1570 mp_print_t arg_print;
1571 vstr_init_print(&arg_vstr, 16, &arg_print);
1572 mp_print_kind_t print_kind = (*str == 'r' ? PRINT_REPR : PRINT_STR);
1573 if (print_kind == PRINT_STR && is_bytes && mp_obj_is_type(arg, &mp_type_bytes)) {
1574 // If we have something like b"%s" % b"1", bytes arg should be
1575 // printed undecorated.
1576 print_kind = PRINT_RAW;
1577 }
1578 mp_obj_print_helper(&arg_print, arg, print_kind);
1579 uint vlen = arg_vstr.len;
1580 if (prec < 0) {
1581 prec = vlen;
1582 }
1583 if (vlen > (uint)prec) {
1584 vlen = prec;
1585 }
1586 mp_print_strn(&print, arg_vstr.buf, vlen, flags, ' ', width);
1587 vstr_clear(&arg_vstr);
1588 break;
1589 }
1590
1591 case 'X':
1592 case 'x':
1593 mp_print_mp_int(&print, arg, 16, *str - ('X' - 'A'), flags | alt, fill, width, prec);
1594 break;
1595
1596 default:
1597 #if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
1598 terse_str_format_value_error();
1599 #else
1600 mp_raise_msg_varg(&mp_type_ValueError,
1601 MP_ERROR_TEXT("unsupported format character '%c' (0x%x) at index %d"),
1602 *str, *str, str - start_str);
1603 #endif
1604 }
1605 }
1606
1607 if (arg_i != n_args) {
1608 mp_raise_TypeError(MP_ERROR_TEXT("format string didn't convert all arguments"));
1609 }
1610
1611 return mp_obj_new_str_from_vstr(is_bytes ? &mp_type_bytes : &mp_type_str, &vstr);
1612}
1613#endif
1614
1615// The implementation is optimized, returning the original string if there's
1616// nothing to replace.
1617STATIC mp_obj_t str_replace(size_t n_args, const mp_obj_t *args) {
1618 mp_check_self(mp_obj_is_str_or_bytes(args[0]));
1619
1620 mp_int_t max_rep = -1;
1621 if (n_args == 4) {
1622 max_rep = mp_obj_get_int(args[3]);
1623 if (max_rep == 0) {
1624 return args[0];
1625 } else if (max_rep < 0) {
1626 max_rep = -1;
1627 }
1628 }
1629
1630 // if max_rep is still -1 by this point we will need to do all possible replacements
1631
1632 // check argument types
1633
1634 const mp_obj_type_t *self_type = mp_obj_get_type(args[0]);
1635
1636 if (mp_obj_get_type(args[1]) != self_type) {
1637 bad_implicit_conversion(args[1]);
1638 }
1639
1640 if (mp_obj_get_type(args[2]) != self_type) {
1641 bad_implicit_conversion(args[2]);
1642 }
1643
1644 // extract string data
1645
1646 GET_STR_DATA_LEN(args[0], str, str_len);
1647 GET_STR_DATA_LEN(args[1], old, old_len);
1648 GET_STR_DATA_LEN(args[2], new, new_len);
1649
1650 // old won't exist in str if it's longer, so nothing to replace
1651 if (old_len > str_len) {
1652 return args[0];
1653 }
1654
1655 // data for the replaced string
1656 byte *data = NULL;
1657 vstr_t vstr;
1658
1659 // do 2 passes over the string:
1660 // first pass computes the required length of the replaced string
1661 // second pass does the replacements
1662 for (;;) {
1663 size_t replaced_str_index = 0;
1664 size_t num_replacements_done = 0;
1665 const byte *old_occurrence;
1666 const byte *offset_ptr = str;
1667 size_t str_len_remain = str_len;
1668 if (old_len == 0) {
1669 // if old_str is empty, copy new_str to start of replaced string
1670 // copy the replacement string
1671 if (data != NULL) {
1672 memcpy(data, new, new_len);
1673 }
1674 replaced_str_index += new_len;
1675 num_replacements_done++;
1676 }
1677 while (num_replacements_done != (size_t)max_rep && str_len_remain > 0 && (old_occurrence = find_subbytes(offset_ptr, str_len_remain, old, old_len, 1)) != NULL) {
1678 if (old_len == 0) {
1679 old_occurrence += 1;
1680 }
1681 // copy from just after end of last occurrence of to-be-replaced string to right before start of next occurrence
1682 if (data != NULL) {
1683 memcpy(data + replaced_str_index, offset_ptr, old_occurrence - offset_ptr);
1684 }
1685 replaced_str_index += old_occurrence - offset_ptr;
1686 // copy the replacement string
1687 if (data != NULL) {
1688 memcpy(data + replaced_str_index, new, new_len);
1689 }
1690 replaced_str_index += new_len;
1691 offset_ptr = old_occurrence + old_len;
1692 str_len_remain = str + str_len - offset_ptr;
1693 num_replacements_done++;
1694 }
1695
1696 // copy from just after end of last occurrence of to-be-replaced string to end of old string
1697 if (data != NULL) {
1698 memcpy(data + replaced_str_index, offset_ptr, str_len_remain);
1699 }
1700 replaced_str_index += str_len_remain;
1701
1702 if (data == NULL) {
1703 // first pass
1704 if (num_replacements_done == 0) {
1705 // no substr found, return original string
1706 return args[0];
1707 } else {
1708 // substr found, allocate new string
1709 vstr_init_len(&vstr, replaced_str_index);
1710 data = (byte *)vstr.buf;
1711 assert(data != NULL);
1712 }
1713 } else {
1714 // second pass, we are done
1715 break;
1716 }
1717 }
1718
1719 return mp_obj_new_str_from_vstr(self_type, &vstr);
1720}
1721MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_replace_obj, 3, 4, str_replace);
1722
1723#if MICROPY_PY_BUILTINS_STR_COUNT
1724STATIC mp_obj_t str_count(size_t n_args, const mp_obj_t *args) {
1725 const mp_obj_type_t *self_type = mp_obj_get_type(args[0]);
1726 mp_check_self(mp_obj_is_str_or_bytes(args[0]));
1727
1728 // check argument type
1729 if (mp_obj_get_type(args[1]) != self_type) {
1730 bad_implicit_conversion(args[1]);
1731 }
1732
1733 GET_STR_DATA_LEN(args[0], haystack, haystack_len);
1734 GET_STR_DATA_LEN(args[1], needle, needle_len);
1735
1736 const byte *start = haystack;
1737 const byte *end = haystack + haystack_len;
1738 if (n_args >= 3 && args[2] != mp_const_none) {
1739 start = str_index_to_ptr(self_type, haystack, haystack_len, args[2], true);
1740 }
1741 if (n_args >= 4 && args[3] != mp_const_none) {
1742 end = str_index_to_ptr(self_type, haystack, haystack_len, args[3], true);
1743 }
1744
1745 // if needle_len is zero then we count each gap between characters as an occurrence
1746 if (needle_len == 0) {
1747 return MP_OBJ_NEW_SMALL_INT(utf8_charlen(start, end - start) + 1);
1748 }
1749
1750 // count the occurrences
1751 mp_int_t num_occurrences = 0;
1752 for (const byte *haystack_ptr = start; haystack_ptr + needle_len <= end;) {
1753 if (memcmp(haystack_ptr, needle, needle_len) == 0) {
1754 num_occurrences++;
1755 haystack_ptr += needle_len;
1756 } else {
1757 haystack_ptr = utf8_next_char(haystack_ptr);
1758 }
1759 }
1760
1761 return MP_OBJ_NEW_SMALL_INT(num_occurrences);
1762}
1763MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_count_obj, 2, 4, str_count);
1764#endif
1765
1766#if MICROPY_PY_BUILTINS_STR_PARTITION
1767STATIC mp_obj_t str_partitioner(mp_obj_t self_in, mp_obj_t arg, int direction) {
1768 mp_check_self(mp_obj_is_str_or_bytes(self_in));
1769 const mp_obj_type_t *self_type = mp_obj_get_type(self_in);
1770 if (self_type != mp_obj_get_type(arg)) {
1771 bad_implicit_conversion(arg);
1772 }
1773
1774 GET_STR_DATA_LEN(self_in, str, str_len);
1775 GET_STR_DATA_LEN(arg, sep, sep_len);
1776
1777 if (sep_len == 0) {
1778 mp_raise_ValueError(MP_ERROR_TEXT("empty separator"));
1779 }
1780
1781 mp_obj_t result[3];
1782 if (self_type == &mp_type_str) {
1783 result[0] = MP_OBJ_NEW_QSTR(MP_QSTR_);
1784 result[1] = MP_OBJ_NEW_QSTR(MP_QSTR_);
1785 result[2] = MP_OBJ_NEW_QSTR(MP_QSTR_);
1786 } else {
1787 result[0] = mp_const_empty_bytes;
1788 result[1] = mp_const_empty_bytes;
1789 result[2] = mp_const_empty_bytes;
1790 }
1791
1792 if (direction > 0) {
1793 result[0] = self_in;
1794 } else {
1795 result[2] = self_in;
1796 }
1797
1798 const byte *position_ptr = find_subbytes(str, str_len, sep, sep_len, direction);
1799 if (position_ptr != NULL) {
1800 size_t position = position_ptr - str;
1801 result[0] = mp_obj_new_str_of_type(self_type, str, position);
1802 result[1] = arg;
1803 result[2] = mp_obj_new_str_of_type(self_type, str + position + sep_len, str_len - position - sep_len);
1804 }
1805
1806 return mp_obj_new_tuple(3, result);
1807}
1808
1809STATIC mp_obj_t str_partition(mp_obj_t self_in, mp_obj_t arg) {
1810 return str_partitioner(self_in, arg, 1);
1811}
1812MP_DEFINE_CONST_FUN_OBJ_2(str_partition_obj, str_partition);
1813
1814STATIC mp_obj_t str_rpartition(mp_obj_t self_in, mp_obj_t arg) {
1815 return str_partitioner(self_in, arg, -1);
1816}
1817MP_DEFINE_CONST_FUN_OBJ_2(str_rpartition_obj, str_rpartition);
1818#endif
1819
1820// Supposedly not too critical operations, so optimize for code size
1821STATIC mp_obj_t str_caseconv(unichar (*op)(unichar), mp_obj_t self_in) {
1822 GET_STR_DATA_LEN(self_in, self_data, self_len);
1823 vstr_t vstr;
1824 vstr_init_len(&vstr, self_len);
1825 byte *data = (byte *)vstr.buf;
1826 for (size_t i = 0; i < self_len; i++) {
1827 *data++ = op(*self_data++);
1828 }
1829 return mp_obj_new_str_from_vstr(mp_obj_get_type(self_in), &vstr);
1830}
1831
1832STATIC mp_obj_t str_lower(mp_obj_t self_in) {
1833 return str_caseconv(unichar_tolower, self_in);
1834}
1835MP_DEFINE_CONST_FUN_OBJ_1(str_lower_obj, str_lower);
1836
1837STATIC mp_obj_t str_upper(mp_obj_t self_in) {
1838 return str_caseconv(unichar_toupper, self_in);
1839}
1840MP_DEFINE_CONST_FUN_OBJ_1(str_upper_obj, str_upper);
1841
1842STATIC mp_obj_t str_uni_istype(bool (*f)(unichar), mp_obj_t self_in) {
1843 GET_STR_DATA_LEN(self_in, self_data, self_len);
1844
1845 if (self_len == 0) {
1846 return mp_const_false; // default to False for empty str
1847 }
1848
1849 if (f != unichar_isupper && f != unichar_islower) {
1850 for (size_t i = 0; i < self_len; i++) {
1851 if (!f(*self_data++)) {
1852 return mp_const_false;
1853 }
1854 }
1855 } else {
1856 bool contains_alpha = false;
1857
1858 for (size_t i = 0; i < self_len; i++) { // only check alphanumeric characters
1859 if (unichar_isalpha(*self_data++)) {
1860 contains_alpha = true;
1861 if (!f(*(self_data - 1))) { // -1 because we already incremented above
1862 return mp_const_false;
1863 }
1864 }
1865 }
1866
1867 if (!contains_alpha) {
1868 return mp_const_false;
1869 }
1870 }
1871
1872 return mp_const_true;
1873}
1874
1875STATIC mp_obj_t str_isspace(mp_obj_t self_in) {
1876 return str_uni_istype(unichar_isspace, self_in);
1877}
1878MP_DEFINE_CONST_FUN_OBJ_1(str_isspace_obj, str_isspace);
1879
1880STATIC mp_obj_t str_isalpha(mp_obj_t self_in) {
1881 return str_uni_istype(unichar_isalpha, self_in);
1882}
1883MP_DEFINE_CONST_FUN_OBJ_1(str_isalpha_obj, str_isalpha);
1884
1885STATIC mp_obj_t str_isdigit(mp_obj_t self_in) {
1886 return str_uni_istype(unichar_isdigit, self_in);
1887}
1888MP_DEFINE_CONST_FUN_OBJ_1(str_isdigit_obj, str_isdigit);
1889
1890STATIC mp_obj_t str_isupper(mp_obj_t self_in) {
1891 return str_uni_istype(unichar_isupper, self_in);
1892}
1893MP_DEFINE_CONST_FUN_OBJ_1(str_isupper_obj, str_isupper);
1894
1895STATIC mp_obj_t str_islower(mp_obj_t self_in) {
1896 return str_uni_istype(unichar_islower, self_in);
1897}
1898MP_DEFINE_CONST_FUN_OBJ_1(str_islower_obj, str_islower);
1899
1900#if MICROPY_CPYTHON_COMPAT
1901// These methods are superfluous in the presence of str() and bytes()
1902// constructors.
1903// TODO: should accept kwargs too
1904STATIC mp_obj_t bytes_decode(size_t n_args, const mp_obj_t *args) {
1905 mp_obj_t new_args[2];
1906 if (n_args == 1) {
1907 new_args[0] = args[0];
1908 new_args[1] = MP_OBJ_NEW_QSTR(MP_QSTR_utf_hyphen_8);
1909 args = new_args;
1910 n_args++;
1911 }
1912 return mp_obj_str_make_new(&mp_type_str, n_args, 0, args);
1913}
1914MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(bytes_decode_obj, 1, 3, bytes_decode);
1915
1916// TODO: should accept kwargs too
1917STATIC mp_obj_t str_encode(size_t n_args, const mp_obj_t *args) {
1918 mp_obj_t new_args[2];
1919 if (n_args == 1) {
1920 new_args[0] = args[0];
1921 new_args[1] = MP_OBJ_NEW_QSTR(MP_QSTR_utf_hyphen_8);
1922 args = new_args;
1923 n_args++;
1924 }
1925 return bytes_make_new(NULL, n_args, 0, args);
1926}
1927MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_encode_obj, 1, 3, str_encode);
1928#endif
1929
1930mp_int_t mp_obj_str_get_buffer(mp_obj_t self_in, mp_buffer_info_t *bufinfo, mp_uint_t flags) {
1931 if (flags == MP_BUFFER_READ) {
1932 GET_STR_DATA_LEN(self_in, str_data, str_len);
1933 bufinfo->buf = (void *)str_data;
1934 bufinfo->len = str_len;
1935 bufinfo->typecode = 'B'; // bytes should be unsigned, so should unicode byte-access
1936 return 0;
1937 } else {
1938 // can't write to a string
1939 return 1;
1940 }
1941}
1942
// Method table mapping method names to their function objects.  It is wrapped
// into the constant dict str8_locals_dict below.  Entries guarded by #if are
// only present when the corresponding feature is enabled.
STATIC const mp_rom_map_elem_t str8_locals_dict_table[] = {
    #if MICROPY_CPYTHON_COMPAT
    { MP_ROM_QSTR(MP_QSTR_decode), MP_ROM_PTR(&bytes_decode_obj) },
    #if !MICROPY_PY_BUILTINS_STR_UNICODE
    // If we have separate unicode type, then here we have methods only
    // for bytes type, and it should not have encode() methods. Otherwise,
    // we have non-compliant-but-practical bytestring type, which shares
    // method table with bytes, so they both have encode() and decode()
    // methods (which should do type checking at runtime).
    { MP_ROM_QSTR(MP_QSTR_encode), MP_ROM_PTR(&str_encode_obj) },
    #endif
    #endif
    { MP_ROM_QSTR(MP_QSTR_find), MP_ROM_PTR(&str_find_obj) },
    { MP_ROM_QSTR(MP_QSTR_rfind), MP_ROM_PTR(&str_rfind_obj) },
    { MP_ROM_QSTR(MP_QSTR_index), MP_ROM_PTR(&str_index_obj) },
    { MP_ROM_QSTR(MP_QSTR_rindex), MP_ROM_PTR(&str_rindex_obj) },
    { MP_ROM_QSTR(MP_QSTR_join), MP_ROM_PTR(&str_join_obj) },
    { MP_ROM_QSTR(MP_QSTR_split), MP_ROM_PTR(&str_split_obj) },
    #if MICROPY_PY_BUILTINS_STR_SPLITLINES
    { MP_ROM_QSTR(MP_QSTR_splitlines), MP_ROM_PTR(&str_splitlines_obj) },
    #endif
    { MP_ROM_QSTR(MP_QSTR_rsplit), MP_ROM_PTR(&str_rsplit_obj) },
    { MP_ROM_QSTR(MP_QSTR_startswith), MP_ROM_PTR(&str_startswith_obj) },
    { MP_ROM_QSTR(MP_QSTR_endswith), MP_ROM_PTR(&str_endswith_obj) },
    { MP_ROM_QSTR(MP_QSTR_strip), MP_ROM_PTR(&str_strip_obj) },
    { MP_ROM_QSTR(MP_QSTR_lstrip), MP_ROM_PTR(&str_lstrip_obj) },
    { MP_ROM_QSTR(MP_QSTR_rstrip), MP_ROM_PTR(&str_rstrip_obj) },
    { MP_ROM_QSTR(MP_QSTR_format), MP_ROM_PTR(&str_format_obj) },
    { MP_ROM_QSTR(MP_QSTR_replace), MP_ROM_PTR(&str_replace_obj) },
    #if MICROPY_PY_BUILTINS_STR_COUNT
    { MP_ROM_QSTR(MP_QSTR_count), MP_ROM_PTR(&str_count_obj) },
    #endif
    #if MICROPY_PY_BUILTINS_STR_PARTITION
    { MP_ROM_QSTR(MP_QSTR_partition), MP_ROM_PTR(&str_partition_obj) },
    { MP_ROM_QSTR(MP_QSTR_rpartition), MP_ROM_PTR(&str_rpartition_obj) },
    #endif
    #if MICROPY_PY_BUILTINS_STR_CENTER
    { MP_ROM_QSTR(MP_QSTR_center), MP_ROM_PTR(&str_center_obj) },
    #endif
    { MP_ROM_QSTR(MP_QSTR_lower), MP_ROM_PTR(&str_lower_obj) },
    { MP_ROM_QSTR(MP_QSTR_upper), MP_ROM_PTR(&str_upper_obj) },
    { MP_ROM_QSTR(MP_QSTR_isspace), MP_ROM_PTR(&str_isspace_obj) },
    { MP_ROM_QSTR(MP_QSTR_isalpha), MP_ROM_PTR(&str_isalpha_obj) },
    { MP_ROM_QSTR(MP_QSTR_isdigit), MP_ROM_PTR(&str_isdigit_obj) },
    { MP_ROM_QSTR(MP_QSTR_isupper), MP_ROM_PTR(&str_isupper_obj) },
    { MP_ROM_QSTR(MP_QSTR_islower), MP_ROM_PTR(&str_islower_obj) },
};

STATIC MP_DEFINE_CONST_DICT(str8_locals_dict, str8_locals_dict_table);
1992
1993#if !MICROPY_PY_BUILTINS_STR_UNICODE
1994STATIC mp_obj_t mp_obj_new_str_iterator(mp_obj_t str, mp_obj_iter_buf_t *iter_buf);
1995
// Type object for str.  This definition is only compiled when
// MICROPY_PY_BUILTINS_STR_UNICODE is disabled; it shares its subscript
// handler, buffer accessor and method table with the bytes type.
const mp_obj_type_t mp_type_str = {
    { &mp_type_type },
    .name = MP_QSTR_str,
    .print = str_print,
    .make_new = mp_obj_str_make_new,
    .binary_op = mp_obj_str_binary_op,
    .subscr = bytes_subscr,
    .getiter = mp_obj_new_str_iterator,
    .buffer_p = { .get_buffer = mp_obj_str_get_buffer },
    .locals_dict = (mp_obj_dict_t *)&str8_locals_dict,
};
2007#endif
2008
// Type object for bytes.
// Reuses most of the methods from str: the print, binary_op, subscr and
// get_buffer handlers as well as the str8_locals_dict method table.  Only
// the constructor and the iterator are specific to bytes.
const mp_obj_type_t mp_type_bytes = {
    { &mp_type_type },
    .name = MP_QSTR_bytes,
    .print = str_print,
    .make_new = bytes_make_new,
    .binary_op = mp_obj_str_binary_op,
    .subscr = bytes_subscr,
    .getiter = mp_obj_new_bytes_iterator,
    .buffer_p = { .get_buffer = mp_obj_str_get_buffer },
    .locals_dict = (mp_obj_dict_t *)&str8_locals_dict,
};
2021
// The zero-length bytes object, with data that includes a null-terminating byte.
// The initialisers after the base type are all zero except the data pointer,
// which refers to the empty string literal.
const mp_obj_str_t mp_const_empty_bytes_obj = {{&mp_type_bytes}, 0, 0, (const byte *)""};
2024
2025// Create a str/bytes object using the given data. New memory is allocated and
2026// the data is copied across. This function should only be used if the type is bytes,
2027// or if the type is str and the string data is known to be not interned.
2028mp_obj_t mp_obj_new_str_copy(const mp_obj_type_t *type, const byte *data, size_t len) {
2029 mp_obj_str_t *o = m_new_obj(mp_obj_str_t);
2030 o->base.type = type;
2031 o->len = len;
2032 if (data) {
2033 o->hash = qstr_compute_hash(data, len);
2034 byte *p = m_new(byte, len + 1);
2035 o->data = p;
2036 memcpy(p, data, len * sizeof(byte));
2037 p[len] = '\0'; // for now we add null for compatibility with C ASCIIZ strings
2038 }
2039 return MP_OBJ_FROM_PTR(o);
2040}
2041
2042// Create a str/bytes object using the given data. If the type is str and the string
2043// data is already interned, then a qstr object is returned. Otherwise new memory is
2044// allocated for the object and the data is copied across.
2045mp_obj_t mp_obj_new_str_of_type(const mp_obj_type_t *type, const byte *data, size_t len) {
2046 if (type == &mp_type_str) {
2047 return mp_obj_new_str((const char *)data, len);
2048 } else {
2049 return mp_obj_new_bytes(data, len);
2050 }
2051}
2052
2053// Create a str using a qstr to store the data; may use existing or new qstr.
2054mp_obj_t mp_obj_new_str_via_qstr(const char *data, size_t len) {
2055 return MP_OBJ_NEW_QSTR(qstr_from_strn(data, len));
2056}
2057
2058// Create a str/bytes object from the given vstr. The vstr buffer is resized to
2059// the exact length required and then reused for the str/bytes object. The vstr
2060// is cleared and can safely be passed to vstr_free if it was heap allocated.
2061mp_obj_t mp_obj_new_str_from_vstr(const mp_obj_type_t *type, vstr_t *vstr) {
2062 // if not a bytes object, look if a qstr with this data already exists
2063 if (type == &mp_type_str) {
2064 qstr q = qstr_find_strn(vstr->buf, vstr->len);
2065 if (q != MP_QSTRnull) {
2066 vstr_clear(vstr);
2067 vstr->alloc = 0;
2068 return MP_OBJ_NEW_QSTR(q);
2069 }
2070 }
2071
2072 // make a new str/bytes object
2073 mp_obj_str_t *o = m_new_obj(mp_obj_str_t);
2074 o->base.type = type;
2075 o->len = vstr->len;
2076 o->hash = qstr_compute_hash((byte *)vstr->buf, vstr->len);
2077 if (vstr->len + 1 == vstr->alloc) {
2078 o->data = (byte *)vstr->buf;
2079 } else {
2080 o->data = (byte *)m_renew(char, vstr->buf, vstr->alloc, vstr->len + 1);
2081 }
2082 ((byte *)o->data)[o->len] = '\0'; // add null byte
2083 vstr->buf = NULL;
2084 vstr->alloc = 0;
2085 return MP_OBJ_FROM_PTR(o);
2086}
2087
2088mp_obj_t mp_obj_new_str(const char *data, size_t len) {
2089 qstr q = qstr_find_strn(data, len);
2090 if (q != MP_QSTRnull) {
2091 // qstr with this data already exists
2092 return MP_OBJ_NEW_QSTR(q);
2093 } else {
2094 // no existing qstr, don't make one
2095 return mp_obj_new_str_copy(&mp_type_str, (const byte *)data, len);
2096 }
2097}
2098
2099mp_obj_t mp_obj_str_intern(mp_obj_t str) {
2100 GET_STR_DATA_LEN(str, data, len);
2101 return mp_obj_new_str_via_qstr((const char *)data, len);
2102}
2103
2104mp_obj_t mp_obj_str_intern_checked(mp_obj_t obj) {
2105 size_t len;
2106 const char *data = mp_obj_str_get_data(obj, &len);
2107 return mp_obj_new_str_via_qstr((const char *)data, len);
2108}
2109
2110mp_obj_t mp_obj_new_bytes(const byte *data, size_t len) {
2111 return mp_obj_new_str_copy(&mp_type_bytes, data, len);
2112}
2113
2114bool mp_obj_str_equal(mp_obj_t s1, mp_obj_t s2) {
2115 if (mp_obj_is_qstr(s1) && mp_obj_is_qstr(s2)) {
2116 return s1 == s2;
2117 } else {
2118 GET_STR_HASH(s1, h1);
2119 GET_STR_HASH(s2, h2);
2120 // If any of hashes is 0, it means it's not valid
2121 if (h1 != 0 && h2 != 0 && h1 != h2) {
2122 return false;
2123 }
2124 GET_STR_DATA_LEN(s1, d1, l1);
2125 GET_STR_DATA_LEN(s2, d2, l2);
2126 if (l1 != l2) {
2127 return false;
2128 }
2129 return memcmp(d1, d2, l1) == 0;
2130 }
2131}
2132
2133STATIC NORETURN void bad_implicit_conversion(mp_obj_t self_in) {
2134 #if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
2135 mp_raise_TypeError(MP_ERROR_TEXT("can't convert to str implicitly"));
2136 #else
2137 const qstr src_name = mp_obj_get_type(self_in)->name;
2138 mp_raise_msg_varg(&mp_type_TypeError,
2139 MP_ERROR_TEXT("can't convert '%q' object to %q implicitly"),
2140 src_name, src_name == MP_QSTR_str ? MP_QSTR_bytes : MP_QSTR_str);
2141 #endif
2142}
2143
2144// use this if you will anyway convert the string to a qstr
2145// will be more efficient for the case where it's already a qstr
2146qstr mp_obj_str_get_qstr(mp_obj_t self_in) {
2147 if (mp_obj_is_qstr(self_in)) {
2148 return MP_OBJ_QSTR_VALUE(self_in);
2149 } else if (mp_obj_is_type(self_in, &mp_type_str)) {
2150 mp_obj_str_t *self = MP_OBJ_TO_PTR(self_in);
2151 return qstr_from_strn((char *)self->data, self->len);
2152 } else {
2153 bad_implicit_conversion(self_in);
2154 }
2155}
2156
2157// only use this function if you need the str data to be zero terminated
2158// at the moment all strings are zero terminated to help with C ASCIIZ compatibility
2159const char *mp_obj_str_get_str(mp_obj_t self_in) {
2160 if (mp_obj_is_str_or_bytes(self_in)) {
2161 GET_STR_DATA_LEN(self_in, s, l);
2162 (void)l; // len unused
2163 return (const char *)s;
2164 } else {
2165 bad_implicit_conversion(self_in);
2166 }
2167}
2168
2169const char *mp_obj_str_get_data(mp_obj_t self_in, size_t *len) {
2170 if (mp_obj_is_str_or_bytes(self_in)) {
2171 GET_STR_DATA_LEN(self_in, s, l);
2172 *len = l;
2173 return (const char *)s;
2174 } else {
2175 bad_implicit_conversion(self_in);
2176 }
2177}
2178
2179#if MICROPY_OBJ_REPR == MICROPY_OBJ_REPR_C || MICROPY_OBJ_REPR == MICROPY_OBJ_REPR_D
2180const byte *mp_obj_str_get_data_no_check(mp_obj_t self_in, size_t *len) {
2181 if (mp_obj_is_qstr(self_in)) {
2182 return qstr_data(MP_OBJ_QSTR_VALUE(self_in), len);
2183 } else {
2184 *len = ((mp_obj_str_t *)MP_OBJ_TO_PTR(self_in))->len;
2185 return ((mp_obj_str_t *)MP_OBJ_TO_PTR(self_in))->data;
2186 }
2187}
2188#endif
2189
2190/******************************************************************************/
2191/* str iterator */
2192
// Iterator state shared by the 8-bit str iterator and the bytes iterator.
// The iterator constructors in this file build it in place inside the
// caller-supplied mp_obj_iter_buf_t, and assert that it fits in that buffer.
typedef struct _mp_obj_str8_it_t {
    mp_obj_base_t base; // type is set to mp_type_polymorph_iter by the constructors
    mp_fun_1_t iternext; // function producing the next item (str or bytes flavour)
    mp_obj_t str; // the str/bytes object being iterated over
    size_t cur; // index of the next byte to yield, starting at 0
} mp_obj_str8_it_t;
2199
2200#if !MICROPY_PY_BUILTINS_STR_UNICODE
2201STATIC mp_obj_t str_it_iternext(mp_obj_t self_in) {
2202 mp_obj_str8_it_t *self = MP_OBJ_TO_PTR(self_in);
2203 GET_STR_DATA_LEN(self->str, str, len);
2204 if (self->cur < len) {
2205 mp_obj_t o_out = mp_obj_new_str_via_qstr((const char *)str + self->cur, 1);
2206 self->cur += 1;
2207 return o_out;
2208 } else {
2209 return MP_OBJ_STOP_ITERATION;
2210 }
2211}
2212
2213STATIC mp_obj_t mp_obj_new_str_iterator(mp_obj_t str, mp_obj_iter_buf_t *iter_buf) {
2214 assert(sizeof(mp_obj_str8_it_t) <= sizeof(mp_obj_iter_buf_t));
2215 mp_obj_str8_it_t *o = (mp_obj_str8_it_t *)iter_buf;
2216 o->base.type = &mp_type_polymorph_iter;
2217 o->iternext = str_it_iternext;
2218 o->str = str;
2219 o->cur = 0;
2220 return MP_OBJ_FROM_PTR(o);
2221}
2222#endif
2223
2224STATIC mp_obj_t bytes_it_iternext(mp_obj_t self_in) {
2225 mp_obj_str8_it_t *self = MP_OBJ_TO_PTR(self_in);
2226 GET_STR_DATA_LEN(self->str, str, len);
2227 if (self->cur < len) {
2228 mp_obj_t o_out = MP_OBJ_NEW_SMALL_INT(str[self->cur]);
2229 self->cur += 1;
2230 return o_out;
2231 } else {
2232 return MP_OBJ_STOP_ITERATION;
2233 }
2234}
2235
2236mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str, mp_obj_iter_buf_t *iter_buf) {
2237 assert(sizeof(mp_obj_str8_it_t) <= sizeof(mp_obj_iter_buf_t));
2238 mp_obj_str8_it_t *o = (mp_obj_str8_it_t *)iter_buf;
2239 o->base.type = &mp_type_polymorph_iter;
2240 o->iternext = bytes_it_iternext;
2241 o->str = str;
2242 o->cur = 0;
2243 return MP_OBJ_FROM_PTR(o);
2244}
2245