// objstr.c source code [MicroPython/py/objstr.c]

1	/*
2	* This file is part of the MicroPython project, http://micropython.org/
3	*
4	* The MIT License (MIT)
5	*
6	* Copyright (c) 2013, 2014 Damien P. George
7	* Copyright (c) 2014-2018 Paul Sokolovsky
8	*
9	* Permission is hereby granted, free of charge, to any person obtaining a copy
10	* of this software and associated documentation files (the "Software"), to deal
11	* in the Software without restriction, including without limitation the rights
12	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13	* copies of the Software, and to permit persons to whom the Software is
14	* furnished to do so, subject to the following conditions:
15	*
16	* The above copyright notice and this permission notice shall be included in
17	* all copies or substantial portions of the Software.
18	*
19	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25	* THE SOFTWARE.
26	*/
27
28	#include <string.h>
29	#include <assert.h>
30
31	#include "py/unicode.h"
32	#include "py/objstr.h"
33	#include "py/objlist.h"
34	#include "py/runtime.h"
35	#include "py/stackctrl.h"
36
37	#if MICROPY_PY_BUILTINS_STR_OP_MODULO
38	STATIC mp_obj_t str_modulo_format(mp_obj_t pattern, size_t n_args, const mp_obj_t *args, mp_obj_t dict);
39	#endif
40
41	STATIC mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str, mp_obj_iter_buf_t *iter_buf);
42	STATIC NORETURN void bad_implicit_conversion(mp_obj_t self_in);
43
44	/****************************************************************************/
/* str */
46
47	void mp_str_print_quoted(const mp_print_t print, const* byte *str_data, size_t str_len, bool is_bytes) {
48	// this escapes characters, but it will be very slow to print (calling print many times)
49	bool has_single_quote = false;
50	bool has_double_quote = false;
51	for (const byte s = str_data, top = str_data + str_len; !has_double_quote && s < top; s++) {
52	if (*s == `'\''`) {
53	has_single_quote = true;
54	} else if (*s == `'"'`) {
55	has_double_quote = true;
56	}
57	}
58	int quote_char = `'\''`;
59	if (has_single_quote && !has_double_quote) {
60	quote_char = `'"'`;
61	}
62	mp_printf(print, "%c", quote_char);
63	for (const byte s = str_data, top = str_data + str_len; s < top; s++) {
64	if (*s == quote_char) {
65	mp_printf(print, "\\%c", quote_char);
66	} else if (*s == `'\\'`) {
67	mp_print_str(print, "\\\\");
68	} else if (s >= `0x20` && s != `0x7f` && (!is_bytes \|\| *s < `0x80`)) {
69	// In strings, anything which is not ascii control character
70	// is printed as is, this includes characters in range 0x80-0xff
71	// (which can be non-Latin letters, etc.)
72	mp_printf(print, "%c", *s);
73	} else if (*s == `'\n'`) {
74	mp_print_str(print, "\\n");
75	} else if (*s == `'\r'`) {
76	mp_print_str(print, "\\r");
77	} else if (*s == `'\t'`) {
78	mp_print_str(print, "\\t");
79	} else {
80	mp_printf(print, "\\x%02x", *s);
81	}
82	}
83	mp_printf(print, "%c", quote_char);
84	}
85
#if MICROPY_PY_UJSON
// Print a string as a double-quoted JSON string.
//
//   print    - output sink
//   str_data - start of the data; exactly str_len bytes are read
//   str_len  - number of bytes in str_data
//
// '"' and '\\' are backslash-escaped, bytes >= 32 are written unchanged,
// \n \r \t use their short escapes and all other bytes below 32 are written
// as \u00NN.
void mp_str_print_json(const mp_print_t *print, const byte *str_data, size_t str_len) {
    // for JSON spec, see http://www.ietf.org/rfc/rfc4627.txt
    // if we are given a valid utf8-encoded string, we will print it in a JSON-conforming way
    mp_print_str(print, "\"");
    for (const byte *s = str_data, *top = str_data + str_len; s < top; s++) {
        if (*s == '"' || *s == '\\') {
            mp_printf(print, "\\%c", *s);
        } else if (*s >= 32) {
            // this will handle normal and utf-8 encoded chars
            mp_printf(print, "%c", *s);
        } else if (*s == '\n') {
            mp_print_str(print, "\\n");
        } else if (*s == '\r') {
            mp_print_str(print, "\\r");
        } else if (*s == '\t') {
            mp_print_str(print, "\\t");
        } else {
            // this will handle control chars
            mp_printf(print, "\\u%04x", *s);
        }
    }
    mp_print_str(print, "\"");
}
#endif
111
112	STATIC void str_print(const mp_print_t *print, mp_obj_t self_in, mp_print_kind_t kind) {
113	GET_STR_DATA_LEN(self_in, str_data, str_len);
114	#if MICROPY_PY_UJSON
115	if (kind == PRINT_JSON) {
116	mp_str_print_json(print, str_data, str_len);
117	return;
118	}
119	#endif
120	#if !MICROPY_PY_BUILTINS_STR_UNICODE
121	bool is_bytes = mp_obj_is_type(self_in, &mp_type_bytes);
122	#else
123	bool is_bytes = true;
124	#endif
125	if (kind == PRINT_RAW \|\| (!MICROPY_PY_BUILTINS_STR_UNICODE && kind == PRINT_STR && !is_bytes)) {
126	print->print_strn(print->data, (const char *)str_data, str_len);
127	} else {
128	if (is_bytes) {
129	print->print_strn(print->data, "b", `1`);
130	}
131	mp_str_print_quoted(print, str_data, str_len, is_bytes);
132	}
133	}
134
135	mp_obj_t mp_obj_str_make_new(const mp_obj_type_t type, size_t n_args, size_t n_kw, const* mp_obj_t *args) {
136	#if MICROPY_CPYTHON_COMPAT
137	if (n_kw != `0`) {
138	mp_arg_error_unimpl_kw();
139	}
140	#endif
141
142	mp_arg_check_num(n_args, n_kw, `0`, `3`, false);
143
144	switch (n_args) {
145	case `0`:
146	return MP_OBJ_NEW_QSTR(MP_QSTR_);
147
148	case `1`: {
149	vstr_t vstr;
150	mp_print_t print;
151	vstr_init_print(&vstr, `16`, &print);
152	mp_obj_print_helper(&print, args[`0`], PRINT_STR);
153	return mp_obj_new_str_from_vstr(type, &vstr);
154	}
155
156	default: // 2 or 3 args
157	// TODO: validate 2nd/3rd args
158	if (mp_obj_is_type(args[`0`], &mp_type_bytes)) {
159	GET_STR_DATA_LEN(args[`0`], str_data, str_len);
160	GET_STR_HASH(args[`0`], str_hash);
161	if (str_hash == `0`) {
162	str_hash = qstr_compute_hash(str_data, str_len);
163	}
164	#if MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
165	if (!utf8_check(str_data, str_len)) {
166	mp_raise_msg(&mp_type_UnicodeError, NULL);
167	}
168	#endif
169
170	// Check if a qstr with this data already exists
171	qstr q = qstr_find_strn((const char *)str_data, str_len);
172	if (q != MP_QSTRnull) {
173	return MP_OBJ_NEW_QSTR(q);
174	}
175
176	mp_obj_str_t *o = MP_OBJ_TO_PTR(mp_obj_new_str_copy(type, NULL, str_len));
177	o->data = str_data;
178	o->hash = str_hash;
179	return MP_OBJ_FROM_PTR(o);
180	} else {
181	mp_buffer_info_t bufinfo;
182	mp_get_buffer_raise(args[`0`], &bufinfo, MP_BUFFER_READ);
183	#if MICROPY_PY_BUILTINS_STR_UNICODE_CHECK
184	if (!utf8_check(bufinfo.buf, bufinfo.len)) {
185	mp_raise_msg(&mp_type_UnicodeError, NULL);
186	}
187	#endif
188	return mp_obj_new_str(bufinfo.buf, bufinfo.len);
189	}
190	}
191	}
192
193	STATIC mp_obj_t bytes_make_new(const mp_obj_type_t type_in, size_t n_args, size_t n_kw, const* mp_obj_t *args) {
194	(void)type_in;
195
196	#if MICROPY_CPYTHON_COMPAT
197	if (n_kw != `0`) {
198	mp_arg_error_unimpl_kw();
199	}
200	#else
201	(void)n_kw;
202	#endif
203
204	if (n_args == `0`) {
205	return mp_const_empty_bytes;
206	}
207
208	if (mp_obj_is_type(args[`0`], &mp_type_bytes)) {
209	return args[`0`];
210	}
211
212	if (mp_obj_is_str(args[`0`])) {
213	if (n_args < `2` \|\| n_args > `3`) {
214	goto wrong_args;
215	}
216	GET_STR_DATA_LEN(args[`0`], str_data, str_len);
217	GET_STR_HASH(args[`0`], str_hash);
218	if (str_hash == `0`) {
219	str_hash = qstr_compute_hash(str_data, str_len);
220	}
221	mp_obj_str_t *o = MP_OBJ_TO_PTR(mp_obj_new_str_copy(&mp_type_bytes, NULL, str_len));
222	o->data = str_data;
223	o->hash = str_hash;
224	return MP_OBJ_FROM_PTR(o);
225	}
226
227	if (n_args > `1`) {
228	goto wrong_args;
229	}
230
231	if (mp_obj_is_small_int(args[`0`])) {
232	mp_int_t len = MP_OBJ_SMALL_INT_VALUE(args[`0`]);
233	if (len < `0`) {
234	mp_raise_ValueError(NULL);
235	}
236	vstr_t vstr;
237	vstr_init_len(&vstr, len);
238	memset(vstr.buf, `0`, len);
239	return mp_obj_new_str_from_vstr(&mp_type_bytes, &vstr);
240	}
241
242	// check if argument has the buffer protocol
243	mp_buffer_info_t bufinfo;
244	if (mp_get_buffer(args[`0`], &bufinfo, MP_BUFFER_READ)) {
245	return mp_obj_new_bytes(bufinfo.buf, bufinfo.len);
246	}
247
248	vstr_t vstr;
249	// Try to create array of exact len if initializer len is known
250	mp_obj_t len_in = mp_obj_len_maybe(args[`0`]);
251	if (len_in == MP_OBJ_NULL) {
252	vstr_init(&vstr, `16`);
253	} else {
254	mp_int_t len = MP_OBJ_SMALL_INT_VALUE(len_in);
255	vstr_init(&vstr, len);
256	}
257
258	mp_obj_iter_buf_t iter_buf;
259	mp_obj_t iterable = mp_getiter(args[`0`], &iter_buf);
260	mp_obj_t item;
261	while ((item = mp_iternext(iterable)) != MP_OBJ_STOP_ITERATION) {
262	mp_int_t val = mp_obj_get_int(item);
263	#if MICROPY_FULL_CHECKS
264	if (val < `0` \|\| val > `255`) {
265	mp_raise_ValueError(MP_ERROR_TEXT("bytes value out of range"));
266	}
267	#endif
268	vstr_add_byte(&vstr, val);
269	}
270
271	return mp_obj_new_str_from_vstr(&mp_type_bytes, &vstr);
272
273	wrong_args:
274	mp_raise_TypeError(MP_ERROR_TEXT("wrong number of arguments"));
275	}
276
277	// like strstr but with specified length and allows \0 bytes
278	// TODO replace with something more efficient/standard
279	const byte find_subbytes(const* byte haystack, size_t hlen, const* byte needle, size_t nlen, int* direction) {
280	if (hlen >= nlen) {
281	size_t str_index, str_index_end;
282	if (direction > `0`) {
283	str_index = `0`;
284	str_index_end = hlen - nlen;
285	} else {
286	str_index = hlen - nlen;
287	str_index_end = `0`;
288	}
289	for (;;) {
290	if (memcmp(&haystack[str_index], needle, nlen) == `0`) {
291	// found
292	return haystack + str_index;
293	}
294	if (str_index == str_index_end) {
295	// not found
296	break;
297	}
298	str_index += direction;
299	}
300	}
301	return NULL;
302	}
303
304	// Note: this function is used to check if an object is a str or bytes, which
305	// works because both those types use it as their binary_op method. Revisit
306	// mp_obj_is_str_or_bytes if this fact changes.
307	mp_obj_t mp_obj_str_binary_op(mp_binary_op_t op, mp_obj_t lhs_in, mp_obj_t rhs_in) {
308	// check for modulo
309	if (op == MP_BINARY_OP_MODULO) {
310	#if MICROPY_PY_BUILTINS_STR_OP_MODULO
311	mp_obj_t *args = &rhs_in;
312	size_t n_args = `1`;
313	mp_obj_t dict = MP_OBJ_NULL;
314	if (mp_obj_is_type(rhs_in, &mp_type_tuple)) {
315	// TODO: Support tuple subclasses?
316	mp_obj_tuple_get(rhs_in, &n_args, &args);
317	} else if (mp_obj_is_type(rhs_in, &mp_type_dict)) {
318	dict = rhs_in;
319	}
320	return str_modulo_format(lhs_in, n_args, args, dict);
321	#else
322	return MP_OBJ_NULL;
323	#endif
324	}
325
326	// from now on we need lhs type and data, so extract them
327	const mp_obj_type_t *lhs_type = mp_obj_get_type(lhs_in);
328	GET_STR_DATA_LEN(lhs_in, lhs_data, lhs_len);
329
330	// check for multiply
331	if (op == MP_BINARY_OP_MULTIPLY) {
332	mp_int_t n;
333	if (!mp_obj_get_int_maybe(rhs_in, &n)) {
334	return MP_OBJ_NULL; // op not supported
335	}
336	if (n <= `0`) {
337	if (lhs_type == &mp_type_str) {
338	return MP_OBJ_NEW_QSTR(MP_QSTR_); // empty str
339	} else {
340	return mp_const_empty_bytes;
341	}
342	}
343	vstr_t vstr;
344	vstr_init_len(&vstr, lhs_len * n);
345	mp_seq_multiply(lhs_data, sizeof(*lhs_data), lhs_len, n, vstr.buf);
346	return mp_obj_new_str_from_vstr(lhs_type, &vstr);
347	}
348
349	// From now on all operations allow:
350	// - str with str
351	// - bytes with bytes
352	// - bytes with bytearray
353	// - bytes with array.array
354	// To do this efficiently we use the buffer protocol to extract the raw
355	// data for the rhs, but only if the lhs is a bytes object.
356	//
357	// NOTE: CPython does not allow comparison between bytes ard array.array
358	// (even if the array is of type 'b'), even though it allows addition of
359	// such types. We are not compatible with this (we do allow comparison
360	// of bytes with anything that has the buffer protocol). It would be
361	// easy to "fix" this with a bit of extra logic below, but it costs code
362	// size and execution time so we don't.
363
364	const byte *rhs_data;
365	size_t rhs_len;
366	if (lhs_type == mp_obj_get_type(rhs_in)) {
367	GET_STR_DATA_LEN(rhs_in, rhs_data_, rhs_len_);
368	rhs_data = rhs_data_;
369	rhs_len = rhs_len_;
370	} else if (lhs_type == &mp_type_bytes) {
371	mp_buffer_info_t bufinfo;
372	if (!mp_get_buffer(rhs_in, &bufinfo, MP_BUFFER_READ)) {
373	return MP_OBJ_NULL; // op not supported
374	}
375	rhs_data = bufinfo.buf;
376	rhs_len = bufinfo.len;
377	} else {
378	// LHS is str and RHS has an incompatible type
379	// (except if operation is EQUAL, but that's handled by mp_obj_equal)
380	bad_implicit_conversion(rhs_in);
381	}
382
383	switch (op) {
384	case MP_BINARY_OP_ADD:
385	case MP_BINARY_OP_INPLACE_ADD: {
386	if (lhs_len == `0` && mp_obj_get_type(rhs_in) == lhs_type) {
387	return rhs_in;
388	}
389	if (rhs_len == `0`) {
390	return lhs_in;
391	}
392
393	vstr_t vstr;
394	vstr_init_len(&vstr, lhs_len + rhs_len);
395	memcpy(vstr.buf, lhs_data, lhs_len);
396	memcpy(vstr.buf + lhs_len, rhs_data, rhs_len);
397	return mp_obj_new_str_from_vstr(lhs_type, &vstr);
398	}
399
400	case MP_BINARY_OP_CONTAINS:
401	return mp_obj_new_bool(find_subbytes(lhs_data, lhs_len, rhs_data, rhs_len, `1`) != NULL);
402
403	// case MP_BINARY_OP_NOT_EQUAL: // This is never passed here
404	case MP_BINARY_OP_EQUAL: // This will be passed only for bytes, str is dealt with in mp_obj_equal()
405	case MP_BINARY_OP_LESS:
406	case MP_BINARY_OP_LESS_EQUAL:
407	case MP_BINARY_OP_MORE:
408	case MP_BINARY_OP_MORE_EQUAL:
409	return mp_obj_new_bool(mp_seq_cmp_bytes(op, lhs_data, lhs_len, rhs_data, rhs_len));
410
411	default:
412	return MP_OBJ_NULL; // op not supported
413	}
414	}
415
416	#if !MICROPY_PY_BUILTINS_STR_UNICODE
417	// objstrunicode defines own version
418	const byte str_index_to_ptr(const* mp_obj_type_t type, const* byte *self_data, size_t self_len,
419	mp_obj_t index, bool is_slice) {
420	size_t index_val = mp_get_index(type, self_len, index, is_slice);
421	return self_data + index_val;
422	}
423	#endif
424
425	// This is used for both bytes and 8-bit strings. This is not used for unicode strings.
426	STATIC mp_obj_t bytes_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) {
427	const mp_obj_type_t *type = mp_obj_get_type(self_in);
428	GET_STR_DATA_LEN(self_in, self_data, self_len);
429	if (value == MP_OBJ_SENTINEL) {
430	// load
431	#if MICROPY_PY_BUILTINS_SLICE
432	if (mp_obj_is_type(index, &mp_type_slice)) {
433	mp_bound_slice_t slice;
434	if (!mp_seq_get_fast_slice_indexes(self_len, index, &slice)) {
435	mp_raise_NotImplementedError(MP_ERROR_TEXT("only slices with step=1 (aka None) are supported"));
436	}
437	return mp_obj_new_str_of_type(type, self_data + slice.start, slice.stop - slice.start);
438	}
439	#endif
440	size_t index_val = mp_get_index(type, self_len, index, false);
441	// If we have unicode enabled the type will always be bytes, so take the short cut.
442	if (MICROPY_PY_BUILTINS_STR_UNICODE \|\| type == &mp_type_bytes) {
443	return MP_OBJ_NEW_SMALL_INT(self_data[index_val]);
444	} else {
445	return mp_obj_new_str_via_qstr((char *)&self_data[index_val], `1`);
446	}
447	} else {
448	return MP_OBJ_NULL; // op not supported
449	}
450	}
451
452	STATIC mp_obj_t str_join(mp_obj_t self_in, mp_obj_t arg) {
453	mp_check_self(mp_obj_is_str_or_bytes(self_in));
454	const mp_obj_type_t *self_type = mp_obj_get_type(self_in);
455
456	// get separation string
457	GET_STR_DATA_LEN(self_in, sep_str, sep_len);
458
459	// process args
460	size_t seq_len;
461	mp_obj_t *seq_items;
462
463	if (!mp_obj_is_type(arg, &mp_type_list) && !mp_obj_is_type(arg, &mp_type_tuple)) {
464	// arg is not a list nor a tuple, try to convert it to a list
465	// TODO: Try to optimize?
466	arg = mp_type_list.make_new(&mp_type_list, `1`, `0`, &arg);
467	}
468	mp_obj_get_array(arg, &seq_len, &seq_items);
469
470	// count required length
471	size_t required_len = `0`;
472	for (size_t i = `0`; i < seq_len; i++) {
473	if (mp_obj_get_type(seq_items[i]) != self_type) {
474	mp_raise_TypeError(
475	MP_ERROR_TEXT("join expects a list of str/bytes objects consistent with self object"));
476	}
477	if (i > `0`) {
478	required_len += sep_len;
479	}
480	GET_STR_LEN(seq_items[i], l);
481	required_len += l;
482	}
483
484	// make joined string
485	vstr_t vstr;
486	vstr_init_len(&vstr, required_len);
487	byte data = (byte )vstr.buf;
488	for (size_t i = `0`; i < seq_len; i++) {
489	if (i > `0`) {
490	memcpy(data, sep_str, sep_len);
491	data += sep_len;
492	}
493	GET_STR_DATA_LEN(seq_items[i], s, l);
494	memcpy(data, s, l);
495	data += l;
496	}
497
498	// return joined string
499	return mp_obj_new_str_from_vstr(self_type, &vstr);
500	}
501	MP_DEFINE_CONST_FUN_OBJ_2(str_join_obj, str_join);
502
503	mp_obj_t mp_obj_str_split(size_t n_args, const mp_obj_t *args) {
504	const mp_obj_type_t *self_type = mp_obj_get_type(args[`0`]);
505	mp_int_t splits = -`1`;
506	mp_obj_t sep = mp_const_none;
507	if (n_args > `1`) {
508	sep = args[`1`];
509	if (n_args > `2`) {
510	splits = mp_obj_get_int(args[`2`]);
511	}
512	}
513
514	mp_obj_t res = mp_obj_new_list(`0`, NULL);
515	GET_STR_DATA_LEN(args[`0`], s, len);
516	const byte *top = s + len;
517
518	if (sep == mp_const_none) {
519	// sep not given, so separate on whitespace
520
521	// Initial whitespace is not counted as split, so we pre-do it
522	while (s < top && unichar_isspace(*s)) {
523	s++;
524	}
525	while (s < top && splits != `0`) {
526	const byte *start = s;
527	while (s < top && !unichar_isspace(*s)) {
528	s++;
529	}
530	mp_obj_list_append(res, mp_obj_new_str_of_type(self_type, start, s - start));
531	if (s >= top) {
532	break;
533	}
534	while (s < top && unichar_isspace(*s)) {
535	s++;
536	}
537	if (splits > `0`) {
538	splits--;
539	}
540	}
541
542	if (s < top) {
543	mp_obj_list_append(res, mp_obj_new_str_of_type(self_type, s, top - s));
544	}
545
546	} else {
547	// sep given
548	if (mp_obj_get_type(sep) != self_type) {
549	bad_implicit_conversion(sep);
550	}
551
552	size_t sep_len;
553	const char *sep_str = mp_obj_str_get_data(sep, &sep_len);
554
555	if (sep_len == `0`) {
556	mp_raise_ValueError(MP_ERROR_TEXT("empty separator"));
557	}
558
559	for (;;) {
560	const byte *start = s;
561	for (;;) {
562	if (splits == `0` \|\| s + sep_len > top) {
563	s = top;
564	break;
565	} else if (memcmp(s, sep_str, sep_len) == `0`) {
566	break;
567	}
568	s++;
569	}
570	mp_obj_list_append(res, mp_obj_new_str_of_type(self_type, start, s - start));
571	if (s >= top) {
572	break;
573	}
574	s += sep_len;
575	if (splits > `0`) {
576	splits--;
577	}
578	}
579	}
580
581	return res;
582	}
583	MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_split_obj, `1`, `3`, mp_obj_str_split);
584
#if MICROPY_PY_BUILTINS_STR_SPLITLINES
// str.splitlines / bytes.splitlines.
//
//   pos_args[0] - the string to split
//   keepends    - optional bool (default false); when true the line
//                 terminator is kept at the end of each returned line
//
// Lines are terminated by "\n", "\r" or "\r\n". Returns a list of pieces of
// the same type as pos_args[0].
STATIC mp_obj_t str_splitlines(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
    enum { ARG_keepends };
    static const mp_arg_t allowed_args[] = {
        { MP_QSTR_keepends, MP_ARG_BOOL, {.u_bool = false} },
    };

    // parse args
    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
    mp_arg_parse_all(n_args - 1, pos_args + 1, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);

    const mp_obj_type_t *self_type = mp_obj_get_type(pos_args[0]);
    mp_obj_t res = mp_obj_new_list(0, NULL);

    GET_STR_DATA_LEN(pos_args[0], s, len);
    const byte *top = s + len;

    while (s < top) {
        const byte *start = s;
        // length in bytes of the terminator found (0 if the data ended first)
        size_t match = 0;
        while (s < top) {
            if (*s == '\n') {
                match = 1;
                break;
            } else if (*s == '\r') {
                // Only look at the following byte if it is still inside the
                // string, so the scan never reads past `top`.
                if (s + 1 < top && s[1] == '\n') {
                    match = 2;
                } else {
                    match = 1;
                }
                break;
            }
            s++;
        }
        size_t sub_len = s - start;
        if (args[ARG_keepends].u_bool) {
            sub_len += match;
        }
        mp_obj_list_append(res, mp_obj_new_str_of_type(self_type, start, sub_len));
        s += match;
    }

    return res;
}
MP_DEFINE_CONST_FUN_OBJ_KW(str_splitlines_obj, 1, str_splitlines);
#endif
631
632	STATIC mp_obj_t str_rsplit(size_t n_args, const mp_obj_t *args) {
633	if (n_args < `3`) {
634	// If we don't have split limit, it doesn't matter from which side
635	// we split.
636	return mp_obj_str_split(n_args, args);
637	}
638	const mp_obj_type_t *self_type = mp_obj_get_type(args[`0`]);
639	mp_obj_t sep = args[`1`];
640	GET_STR_DATA_LEN(args[`0`], s, len);
641
642	mp_int_t splits = mp_obj_get_int(args[`2`]);
643	if (splits < `0`) {
644	// Negative limit means no limit, so delegate to split().
645	return mp_obj_str_split(n_args, args);
646	}
647
648	mp_int_t org_splits = splits;
649	// Preallocate list to the max expected # of elements, as we
650	// will fill it from the end.
651	mp_obj_list_t *res = MP_OBJ_TO_PTR(mp_obj_new_list(splits + `1`, NULL));
652	mp_int_t idx = splits;
653
654	if (sep == mp_const_none) {
655	mp_raise_NotImplementedError(MP_ERROR_TEXT("rsplit(None,n)"));
656	} else {
657	size_t sep_len;
658	const char *sep_str = mp_obj_str_get_data(sep, &sep_len);
659
660	if (sep_len == `0`) {
661	mp_raise_ValueError(MP_ERROR_TEXT("empty separator"));
662	}
663
664	const byte *beg = s;
665	const byte *last = s + len;
666	for (;;) {
667	s = last - sep_len;
668	for (;;) {
669	if (splits == `0` \|\| s < beg) {
670	break;
671	} else if (memcmp(s, sep_str, sep_len) == `0`) {
672	break;
673	}
674	s--;
675	}
676	if (s < beg \|\| splits == `0`) {
677	res->items[idx] = mp_obj_new_str_of_type(self_type, beg, last - beg);
678	break;
679	}
680	res->items[idx--] = mp_obj_new_str_of_type(self_type, s + sep_len, last - s - sep_len);
681	last = s;
682	splits--;
683	}
684	if (idx != `0`) {
685	// We split less parts than split limit, now go cleanup surplus
686	size_t used = org_splits + `1` - idx;
687	memmove(res->items, &res->items[idx], used * sizeof(mp_obj_t));
688	mp_seq_clear(res->items, used, res->alloc, sizeof(*res->items));
689	res->len = used;
690	}
691	}
692
693	return MP_OBJ_FROM_PTR(res);
694	}
695	MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_rsplit_obj, `1`, `3`, str_rsplit);
696
697	STATIC mp_obj_t str_finder(size_t n_args, const mp_obj_t args, int* direction, bool is_index) {
698	const mp_obj_type_t *self_type = mp_obj_get_type(args[`0`]);
699	mp_check_self(mp_obj_is_str_or_bytes(args[`0`]));
700
701	// check argument type
702	if (mp_obj_get_type(args[`1`]) != self_type) {
703	bad_implicit_conversion(args[`1`]);
704	}
705
706	GET_STR_DATA_LEN(args[`0`], haystack, haystack_len);
707	GET_STR_DATA_LEN(args[`1`], needle, needle_len);
708
709	const byte *start = haystack;
710	const byte *end = haystack + haystack_len;
711	if (n_args >= `3` && args[`2`] != mp_const_none) {
712	start = str_index_to_ptr(self_type, haystack, haystack_len, args[`2`], true);
713	}
714	if (n_args >= `4` && args[`3`] != mp_const_none) {
715	end = str_index_to_ptr(self_type, haystack, haystack_len, args[`3`], true);
716	}
717
718	if (end < start) {
719	goto out_error;
720	}
721
722	const byte *p = find_subbytes(start, end - start, needle, needle_len, direction);
723	if (p == NULL) {
724	out_error:
725	// not found
726	if (is_index) {
727	mp_raise_ValueError(MP_ERROR_TEXT("substring not found"));
728	} else {
729	return MP_OBJ_NEW_SMALL_INT(-`1`);
730	}
731	} else {
732	// found
733	#if MICROPY_PY_BUILTINS_STR_UNICODE
734	if (self_type == &mp_type_str) {
735	return MP_OBJ_NEW_SMALL_INT(utf8_ptr_to_index(haystack, p));
736	}
737	#endif
738	return MP_OBJ_NEW_SMALL_INT(p - haystack);
739	}
740	}
741
742	STATIC mp_obj_t str_find(size_t n_args, const mp_obj_t *args) {
743	return str_finder(n_args, args, `1`, false);
744	}
745	MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_find_obj, `2`, `4`, str_find);
746
747	STATIC mp_obj_t str_rfind(size_t n_args, const mp_obj_t *args) {
748	return str_finder(n_args, args, -`1`, false);
749	}
750	MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_rfind_obj, `2`, `4`, str_rfind);
751
752	STATIC mp_obj_t str_index(size_t n_args, const mp_obj_t *args) {
753	return str_finder(n_args, args, `1`, true);
754	}
755	MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_index_obj, `2`, `4`, str_index);
756
757	STATIC mp_obj_t str_rindex(size_t n_args, const mp_obj_t *args) {
758	return str_finder(n_args, args, -`1`, true);
759	}
760	MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_rindex_obj, `2`, `4`, str_rindex);
761
762	// TODO: (Much) more variety in args
763	STATIC mp_obj_t str_startswith(size_t n_args, const mp_obj_t *args) {
764	const mp_obj_type_t *self_type = mp_obj_get_type(args[`0`]);
765	GET_STR_DATA_LEN(args[`0`], str, str_len);
766	size_t prefix_len;
767	const char *prefix = mp_obj_str_get_data(args[`1`], &prefix_len);
768	const byte *start = str;
769	if (n_args > `2`) {
770	start = str_index_to_ptr(self_type, str, str_len, args[`2`], true);
771	}
772	if (prefix_len + (start - str) > str_len) {
773	return mp_const_false;
774	}
775	return mp_obj_new_bool(memcmp(start, prefix, prefix_len) == `0`);
776	}
777	MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_startswith_obj, `2`, `3`, str_startswith);
778
779	STATIC mp_obj_t str_endswith(size_t n_args, const mp_obj_t *args) {
780	GET_STR_DATA_LEN(args[`0`], str, str_len);
781	size_t suffix_len;
782	const char *suffix = mp_obj_str_get_data(args[`1`], &suffix_len);
783	if (n_args > `2`) {
784	mp_raise_NotImplementedError(MP_ERROR_TEXT("start/end indices"));
785	}
786
787	if (suffix_len > str_len) {
788	return mp_const_false;
789	}
790	return mp_obj_new_bool(memcmp(str + (str_len - suffix_len), suffix, suffix_len) == `0`);
791	}
792	MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_endswith_obj, `2`, `3`, str_endswith);
793
794	enum { LSTRIP, RSTRIP, STRIP };
795
796	STATIC mp_obj_t str_uni_strip(int type, size_t n_args, const mp_obj_t *args) {
797	mp_check_self(mp_obj_is_str_or_bytes(args[`0`]));
798	const mp_obj_type_t *self_type = mp_obj_get_type(args[`0`]);
799
800	const byte *chars_to_del;
801	uint chars_to_del_len;
802	static const byte whitespace[] = " \t\n\r\v\f";
803
804	if (n_args == `1`) {
805	chars_to_del = whitespace;
806	chars_to_del_len = sizeof(whitespace) - `1`;
807	} else {
808	if (mp_obj_get_type(args[`1`]) != self_type) {
809	bad_implicit_conversion(args[`1`]);
810	}
811	GET_STR_DATA_LEN(args[`1`], s, l);
812	chars_to_del = s;
813	chars_to_del_len = l;
814	}
815
816	GET_STR_DATA_LEN(args[`0`], orig_str, orig_str_len);
817
818	size_t first_good_char_pos = `0`;
819	bool first_good_char_pos_set = false;
820	size_t last_good_char_pos = `0`;
821	size_t i = `0`;
822	int delta = `1`;
823	if (type == RSTRIP) {
824	i = orig_str_len - `1`;
825	delta = -`1`;
826	}
827	for (size_t len = orig_str_len; len > `0`; len--) {
828	if (find_subbytes(chars_to_del, chars_to_del_len, &orig_str[i], `1`, `1`) == NULL) {
829	if (!first_good_char_pos_set) {
830	first_good_char_pos_set = true;
831	first_good_char_pos = i;
832	if (type == LSTRIP) {
833	last_good_char_pos = orig_str_len - `1`;
834	break;
835	} else if (type == RSTRIP) {
836	first_good_char_pos = `0`;
837	last_good_char_pos = i;
838	break;
839	}
840	}
841	last_good_char_pos = i;
842	}
843	i += delta;
844	}
845
846	if (!first_good_char_pos_set) {
847	// string is all whitespace, return ''
848	if (self_type == &mp_type_str) {
849	return MP_OBJ_NEW_QSTR(MP_QSTR_);
850	} else {
851	return mp_const_empty_bytes;
852	}
853	}
854
855	assert(last_good_char_pos >= first_good_char_pos);
856	// +1 to accommodate the last character
857	size_t stripped_len = last_good_char_pos - first_good_char_pos + `1`;
858	if (stripped_len == orig_str_len) {
859	// If nothing was stripped, don't bother to dup original string
860	// TODO: watch out for this case when we'll get to bytearray.strip()
861	assert(first_good_char_pos == `0`);
862	return args[`0`];
863	}
864	return mp_obj_new_str_of_type(self_type, orig_str + first_good_char_pos, stripped_len);
865	}
866
867	STATIC mp_obj_t str_strip(size_t n_args, const mp_obj_t *args) {
868	return str_uni_strip(STRIP, n_args, args);
869	}
870	MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_strip_obj, `1`, `2`, str_strip);
871
872	STATIC mp_obj_t str_lstrip(size_t n_args, const mp_obj_t *args) {
873	return str_uni_strip(LSTRIP, n_args, args);
874	}
875	MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_lstrip_obj, `1`, `2`, str_lstrip);
876
877	STATIC mp_obj_t str_rstrip(size_t n_args, const mp_obj_t *args) {
878	return str_uni_strip(RSTRIP, n_args, args);
879	}
880	MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_rstrip_obj, `1`, `2`, str_rstrip);
881
#if MICROPY_PY_BUILTINS_STR_CENTER
// center(width): pad with spaces on both sides to a total of `width` bytes.
//
//   str_in   - the string to centre
//   width_in - requested total width
//
// Returns str_in itself when it is already at least `width` bytes long (this
// includes zero and negative widths); otherwise a new object of the same
// type. When the padding is odd, the extra space goes on the right.
STATIC mp_obj_t str_center(mp_obj_t str_in, mp_obj_t width_in) {
    GET_STR_DATA_LEN(str_in, str, str_len);
    // Keep the width signed until it has been range-checked: converting a
    // negative value straight to an unsigned type would turn it into a huge
    // allocation request instead of a no-op.
    mp_int_t width_arg = mp_obj_get_int(width_in);
    if (width_arg <= 0 || str_len >= (size_t)width_arg) {
        return str_in;
    }
    size_t width = (size_t)width_arg;

    vstr_t vstr;
    vstr_init_len(&vstr, width);
    memset(vstr.buf, ' ', width);
    size_t left = (width - str_len) / 2;
    memcpy(vstr.buf + left, str, str_len);
    return mp_obj_new_str_from_vstr(mp_obj_get_type(str_in), &vstr);
}
MP_DEFINE_CONST_FUN_OBJ_2(str_center_obj, str_center);
#endif
899
900	// Takes an int arg, but only parses unsigned numbers, and only changes
901	// num if at least one digit was parsed.*
902	STATIC const char str_to_int(const* char str, const* char top, int* *num) {
903	if (str < top && `'0'` <= str && str <= `'9'`) {
904	*num = `0`;
905	do {
906	num = num * `10` + (*str - `'0'`);
907	str++;
908	}
909	while (str < top && `'0'` <= str && str <= `'9'`);
910	}
911	return str;
912	}
913
914	STATIC bool isalignment(char ch) {
915	return ch && strchr("<>=^", ch) != NULL;
916	}
917
918	STATIC bool istype(char ch) {
919	return ch && strchr("bcdeEfFgGnosxX%", ch) != NULL;
920	}
921
922	STATIC bool arg_looks_integer(mp_obj_t arg) {
923	return mp_obj_is_bool(arg) \|\| mp_obj_is_int(arg);
924	}
925
926	STATIC bool arg_looks_numeric(mp_obj_t arg) {
927	return arg_looks_integer(arg)
928	#if MICROPY_PY_BUILTINS_FLOAT
929	\|\| mp_obj_is_float(arg)
930	#endif
931	;
932	}
933
#if MICROPY_PY_BUILTINS_STR_OP_MODULO
// Convert a float argument to an int object; any other object is returned
// unchanged. Without float support the argument is always returned as-is.
STATIC mp_obj_t arg_as_int(mp_obj_t arg) {
    mp_obj_t result = arg;
    #if MICROPY_PY_BUILTINS_FLOAT
    if (mp_obj_is_float(arg)) {
        result = mp_obj_new_int_from_float(mp_obj_float_get(arg));
    }
    #endif
    return result;
}
#endif
944
945	#if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
946	STATIC NORETURN void terse_str_format_value_error(void) {
947	mp_raise_ValueError(MP_ERROR_TEXT("bad format string"));
948	}
949	#else
950	// define to nothing to improve coverage
951	#define terse_str_format_value_error()
952	#endif
953
954	STATIC vstr_t mp_obj_str_format_helper(const char str, const* char top, int* arg_i, size_t n_args, const* mp_obj_t args, mp_map_t kwargs) {
955	vstr_t vstr;
956	mp_print_t print;
957	vstr_init_print(&vstr, `16`, &print);
958
959	for (; str < top; str++) {
960	if (*str == `'}'`) {
961	str++;
962	if (str < top && *str == `'}'`) {
963	vstr_add_byte(&vstr, `'}'`);
964	continue;
965	}
966	#if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
967	terse_str_format_value_error();
968	#else
969	mp_raise_ValueError(MP_ERROR_TEXT("single '}' encountered in format string"));
970	#endif
971	}
972	if (*str != `'{'`) {
973	vstr_add_byte(&vstr, *str);
974	continue;
975	}
976
977	str++;
978	if (str < top && *str == `'{'`) {
979	vstr_add_byte(&vstr, `'{'`);
980	continue;
981	}
982
983	// replacement_field ::= "{" [field_name] ["!" conversion] [":" format_spec] "}"
984
985	const char *field_name = NULL;
986	const char *field_name_top = NULL;
987	char conversion = `'\0'`;
988	const char *format_spec = NULL;
989
990	if (str < top && str != `'}'` && str != `'!'` && *str != `':'`) {
991	field_name = (const char *)str;
992	while (str < top && str != `'}'` && str != `'!'` && *str != `':'`) {
993	++str;
994	}
995	field_name_top = (const char *)str;
996	}
997
998	// conversion ::= "r" \| "s"
999
1000	if (str < top && *str == `'!'`) {
1001	str++;
1002	if (str < top && (str == `'r'` \|\| str == `'s'`)) {
1003	conversion = *str++;
1004	} else {
1005	#if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
1006	terse_str_format_value_error();
1007	#elif MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_NORMAL
1008	mp_raise_ValueError(MP_ERROR_TEXT("bad conversion specifier"));
1009	#else
1010	if (str >= top) {
1011	mp_raise_ValueError(
1012	MP_ERROR_TEXT("end of format while looking for conversion specifier"));
1013	} else {
1014	mp_raise_msg_varg(&mp_type_ValueError,
1015	MP_ERROR_TEXT("unknown conversion specifier %c"), *str);
1016	}
1017	#endif
1018	}
1019	}
1020
1021	if (str < top && *str == `':'`) {
1022	str++;
1023	// {:} is the same as {}, which is the same as {!s}
1024	// This makes a difference when passing in a True or False
1025	// '{}'.format(True) returns 'True'
1026	// '{:d}'.format(True) returns '1'
1027	// So we treat {:} as {} and this later gets treated to be {!s}
1028	if (*str != `'}'`) {
1029	format_spec = str;
1030	for (int nest = `1`; str < top;) {
1031	if (*str == `'{'`) {
1032	++nest;
1033	} else if (*str == `'}'`) {
1034	if (--nest == `0`) {
1035	break;
1036	}
1037	}
1038	++str;
1039	}
1040	}
1041	}
1042	if (str >= top) {
1043	#if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
1044	terse_str_format_value_error();
1045	#else
1046	mp_raise_ValueError(MP_ERROR_TEXT("unmatched '{' in format"));
1047	#endif
1048	}
1049	if (*str != `'}'`) {
1050	#if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
1051	terse_str_format_value_error();
1052	#else
1053	mp_raise_ValueError(MP_ERROR_TEXT("expected ':' after format specifier"));
1054	#endif
1055	}
1056
1057	mp_obj_t arg = mp_const_none;
1058
1059	if (field_name) {
1060	int index = `0`;
1061	if (MP_LIKELY(unichar_isdigit(*field_name))) {
1062	if (*arg_i > `0`) {
1063	#if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
1064	terse_str_format_value_error();
1065	#else
1066	mp_raise_ValueError(
1067	MP_ERROR_TEXT("can't switch from automatic field numbering to manual field specification"));
1068	#endif
1069	}
1070	field_name = str_to_int(field_name, field_name_top, &index);
1071	if ((uint)index >= n_args - `1`) {
1072	mp_raise_msg(&mp_type_IndexError, MP_ERROR_TEXT("tuple index out of range"));
1073	}
1074	arg = args[index + `1`];
1075	*arg_i = -`1`;
1076	} else {
1077	const char *lookup;
1078	for (lookup = field_name; lookup < field_name_top && lookup != `'.'` && lookup != `'['`; lookup++) {;
1079	}
1080	mp_obj_t field_q = mp_obj_new_str_via_qstr(field_name, lookup - field_name); // should it be via qstr?
1081	field_name = lookup;
1082	mp_map_elem_t *key_elem = mp_map_lookup(kwargs, field_q, MP_MAP_LOOKUP);
1083	if (key_elem == NULL) {
1084	nlr_raise(mp_obj_new_exception_arg1(&mp_type_KeyError, field_q));
1085	}
1086	arg = key_elem->value;
1087	}
1088	if (field_name < field_name_top) {
1089	mp_raise_NotImplementedError(MP_ERROR_TEXT("attributes not supported yet"));
1090	}
1091	} else {
1092	if (*arg_i < `0`) {
1093	#if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
1094	terse_str_format_value_error();
1095	#else
1096	mp_raise_ValueError(
1097	MP_ERROR_TEXT("can't switch from manual field specification to automatic field numbering"));
1098	#endif
1099	}
1100	if ((uint)*arg_i >= n_args - `1`) {
1101	mp_raise_msg(&mp_type_IndexError, MP_ERROR_TEXT("tuple index out of range"));
1102	}
1103	arg = args[(*arg_i) + `1`];
1104	(*arg_i)++;
1105	}
1106	if (!format_spec && !conversion) {
1107	conversion = `'s'`;
1108	}
1109	if (conversion) {
1110	mp_print_kind_t print_kind;
1111	if (conversion == `'s'`) {
1112	print_kind = PRINT_STR;
1113	} else {
1114	assert(conversion == `'r'`);
1115	print_kind = PRINT_REPR;
1116	}
1117	vstr_t arg_vstr;
1118	mp_print_t arg_print;
1119	vstr_init_print(&arg_vstr, `16`, &arg_print);
1120	mp_obj_print_helper(&arg_print, arg, print_kind);
1121	arg = mp_obj_new_str_from_vstr(&mp_type_str, &arg_vstr);
1122	}
1123
1124	char fill = `'\0'`;
1125	char align = `'\0'`;
1126	int width = -`1`;
1127	int precision = -`1`;
1128	char type = `'\0'`;
1129	int flags = `0`;
1130
1131	if (format_spec) {
1132	// The format specifier (from http://docs.python.org/2/library/string.html#formatspec)
1133	//
1134	// [[fill]align][sign][#][0][width][,][.precision][type]
1135	// fill ::= <any character>
1136	// align ::= "<" \| ">" \| "=" \| "^"
1137	// sign ::= "+" \| "-" \| " "
1138	// width ::= integer
1139	// precision ::= integer
1140	// type ::= "b" \| "c" \| "d" \| "e" \| "E" \| "f" \| "F" \| "g" \| "G" \| "n" \| "o" \| "s" \| "x" \| "X" \| "%"
1141
1142	// recursively call the formatter to format any nested specifiers
1143	MP_STACK_CHECK();
1144	vstr_t format_spec_vstr = mp_obj_str_format_helper(format_spec, str, arg_i, n_args, args, kwargs);
1145	const char *s = vstr_null_terminated_str(&format_spec_vstr);
1146	const char *stop = s + format_spec_vstr.len;
1147	if (isalignment(*s)) {
1148	align = *s++;
1149	} else if (*s && isalignment(s[`1`])) {
1150	fill = *s++;
1151	align = *s++;
1152	}
1153	if (s == `'+'` \|\| s == `'-'` \|\| *s == `' '`) {
1154	if (*s == `'+'`) {
1155	flags \|= PF_FLAG_SHOW_SIGN;
1156	} else if (*s == `' '`) {
1157	flags \|= PF_FLAG_SPACE_SIGN;
1158	}
1159	s++;
1160	}
1161	if (*s == `'#'`) {
1162	flags \|= PF_FLAG_SHOW_PREFIX;
1163	s++;
1164	}
1165	if (*s == `'0'`) {
1166	if (!align) {
1167	align = `'='`;
1168	}
1169	if (!fill) {
1170	fill = `'0'`;
1171	}
1172	}
1173	s = str_to_int(s, stop, &width);
1174	if (*s == `','`) {
1175	flags \|= PF_FLAG_SHOW_COMMA;
1176	s++;
1177	}
1178	if (*s == `'.'`) {
1179	s++;
1180	s = str_to_int(s, stop, &precision);
1181	}
1182	if (istype(*s)) {
1183	type = *s++;
1184	}
1185	if (*s) {
1186	#if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
1187	terse_str_format_value_error();
1188	#else
1189	mp_raise_ValueError(MP_ERROR_TEXT("invalid format specifier"));
1190	#endif
1191	}
1192	vstr_clear(&format_spec_vstr);
1193	}
1194	if (!align) {
1195	if (arg_looks_numeric(arg)) {
1196	align = `'>'`;
1197	} else {
1198	align = `'<'`;
1199	}
1200	}
1201	if (!fill) {
1202	fill = `' '`;
1203	}
1204
1205	if (flags & (PF_FLAG_SHOW_SIGN \| PF_FLAG_SPACE_SIGN)) {
1206	if (type == `'s'`) {
1207	#if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
1208	terse_str_format_value_error();
1209	#else
1210	mp_raise_ValueError(MP_ERROR_TEXT("sign not allowed in string format specifier"));
1211	#endif
1212	}
1213	if (type == `'c'`) {
1214	#if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
1215	terse_str_format_value_error();
1216	#else
1217	mp_raise_ValueError(
1218	MP_ERROR_TEXT("sign not allowed with integer format specifier 'c'"));
1219	#endif
1220	}
1221	}
1222
1223	switch (align) {
1224	case `'<'`:
1225	flags \|= PF_FLAG_LEFT_ADJUST;
1226	break;
1227	case `'='`:
1228	flags \|= PF_FLAG_PAD_AFTER_SIGN;
1229	break;
1230	case `'^'`:
1231	flags \|= PF_FLAG_CENTER_ADJUST;
1232	break;
1233	}
1234
1235	if (arg_looks_integer(arg)) {
1236	switch (type) {
1237	case `'b'`:
1238	mp_print_mp_int(&print, arg, `2`, `'a'`, flags, fill, width, `0`);
1239	continue;
1240
1241	case `'c'`: {
1242	char ch = mp_obj_get_int(arg);
1243	mp_print_strn(&print, &ch, `1`, flags, fill, width);
1244	continue;
1245	}
1246
1247	case `'\0'`: // No explicit format type implies 'd'
1248	case `'n'`: // I don't think we support locales in uPy so use 'd'
1249	case `'d'`:
1250	mp_print_mp_int(&print, arg, `10`, `'a'`, flags, fill, width, `0`);
1251	continue;
1252
1253	case `'o'`:
1254	if (flags & PF_FLAG_SHOW_PREFIX) {
1255	flags \|= PF_FLAG_SHOW_OCTAL_LETTER;
1256	}
1257
1258	mp_print_mp_int(&print, arg, `8`, `'a'`, flags, fill, width, `0`);
1259	continue;
1260
1261	case `'X'`:
1262	case `'x'`:
1263	mp_print_mp_int(&print, arg, `16`, type - (`'X'` - `'A'`), flags, fill, width, `0`);
1264	continue;
1265
1266	case `'e'`:
1267	case `'E'`:
1268	case `'f'`:
1269	case `'F'`:
1270	case `'g'`:
1271	case `'G'`:
1272	case `'%'`:
1273	// The floating point formatters all work with anything that
1274	// looks like an integer
1275	break;
1276
1277	default:
1278	#if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
1279	terse_str_format_value_error();
1280	#else
1281	mp_raise_msg_varg(&mp_type_ValueError,
1282	MP_ERROR_TEXT("unknown format code '%c' for object of type '%s'"),
1283	type, mp_obj_get_type_str(arg));
1284	#endif
1285	}
1286	}
1287
1288	// NOTE: no else here. We need the e, f, g etc formats for integer
1289	// arguments (from above if) to take this if.
1290	if (arg_looks_numeric(arg)) {
1291	if (!type) {
1292
1293	// Even though the docs say that an unspecified type is the same
1294	// as 'g', there is one subtle difference, when the exponent
1295	// is one less than the precision.
1296	//
1297	// '{:10.1}'.format(0.0) ==> '0e+00'
1298	// '{:10.1g}'.format(0.0) ==> '0'
1299	//
1300	// TODO: Figure out how to deal with this.
1301	//
1302	// A proper solution would involve adding a special flag
1303	// or something to format_float, and create a format_double
1304	// to deal with doubles. In order to fix this when using
1305	// sprintf, we'd need to use the e format and tweak the
1306	// returned result to strip trailing zeros like the g format
1307	// does.
1308	//
1309	// {:10.3} and {:10.2e} with 1.23e2 both produce 1.23e+02
1310	// but with 1.e2 you get 1e+02 and 1.00e+02
1311	//
1312	// Stripping the trailing 0's (like g) does would make the
1313	// e format give us the right format.
1314	//
1315	// CPython sources say:
1316	// Omitted type specifier. Behaves in the same way as repr(x)
1317	// and str(x) if no precision is given, else like 'g', but with
1318	// at least one digit after the decimal point. /*
1319
1320	type = `'g'`;
1321	}
1322	if (type == `'n'`) {
1323	type = `'g'`;
1324	}
1325
1326	switch (type) {
1327	#if MICROPY_PY_BUILTINS_FLOAT
1328	case `'e'`:
1329	case `'E'`:
1330	case `'f'`:
1331	case `'F'`:
1332	case `'g'`:
1333	case `'G'`:
1334	mp_print_float(&print, mp_obj_get_float(arg), type, flags, fill, width, precision);
1335	break;
1336
1337	case `'%'`:
1338	flags \|= PF_FLAG_ADD_PERCENT;
1339	#if MICROPY_FLOAT_IMPL == MICROPY_FLOAT_IMPL_FLOAT
1340	#define F100 100.0F
1341	#else
1342	#define F100 100.0
1343	#endif
1344	mp_print_float(&print, mp_obj_get_float(arg) * F100, `'f'`, flags, fill, width, precision);
1345	#undef F100
1346	break;
1347	#endif
1348
1349	default:
1350	#if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
1351	terse_str_format_value_error();
1352	#else
1353	mp_raise_msg_varg(&mp_type_ValueError,
1354	MP_ERROR_TEXT("unknown format code '%c' for object of type '%s'"),
1355	type, mp_obj_get_type_str(arg));
1356	#endif
1357	}
1358	} else {
1359	// arg doesn't look like a number
1360
1361	if (align == `'='`) {
1362	#if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
1363	terse_str_format_value_error();
1364	#else
1365	mp_raise_ValueError(
1366	MP_ERROR_TEXT("'=' alignment not allowed in string format specifier"));
1367	#endif
1368	}
1369
1370	switch (type) {
1371	case `'\0'`: // no explicit format type implies 's'
1372	case `'s'`: {
1373	size_t slen;
1374	const char *s = mp_obj_str_get_data(arg, &slen);
1375	if (precision < `0`) {
1376	precision = slen;
1377	}
1378	if (slen > (size_t)precision) {
1379	slen = precision;
1380	}
1381	mp_print_strn(&print, s, slen, flags, fill, width);
1382	break;
1383	}
1384
1385	default:
1386	#if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
1387	terse_str_format_value_error();
1388	#else
1389	mp_raise_msg_varg(&mp_type_ValueError,
1390	MP_ERROR_TEXT("unknown format code '%c' for object of type '%s'"),
1391	type, mp_obj_get_type_str(arg));
1392	#endif
1393	}
1394	}
1395	}
1396
1397	return vstr;
1398	}
1399
1400	mp_obj_t mp_obj_str_format(size_t n_args, const mp_obj_t args, mp_map_t kwargs) {
1401	mp_check_self(mp_obj_is_str_or_bytes(args[`0`]));
1402
1403	GET_STR_DATA_LEN(args[`0`], str, len);
1404	int arg_i = `0`;
1405	vstr_t vstr = mp_obj_str_format_helper((const char )str, (const* char *)str + len, &arg_i, n_args, args, kwargs);
1406	return mp_obj_new_str_from_vstr(mp_obj_get_type(args[`0`]), &vstr);
1407	}
1408	MP_DEFINE_CONST_FUN_OBJ_KW(str_format_obj, `1`, mp_obj_str_format);
1409
1410	#if MICROPY_PY_BUILTINS_STR_OP_MODULO
1411	STATIC mp_obj_t str_modulo_format(mp_obj_t pattern, size_t n_args, const mp_obj_t *args, mp_obj_t dict) {
1412	mp_check_self(mp_obj_is_str_or_bytes(pattern));
1413
1414	GET_STR_DATA_LEN(pattern, str, len);
1415	#if MICROPY_ERROR_REPORTING != MICROPY_ERROR_REPORTING_TERSE
1416	const byte *start_str = str;
1417	#endif
1418	bool is_bytes = mp_obj_is_type(pattern, &mp_type_bytes);
1419	size_t arg_i = `0`;
1420	vstr_t vstr;
1421	mp_print_t print;
1422	vstr_init_print(&vstr, `16`, &print);
1423
1424	for (const byte *top = str + len; str < top; str++) {
1425	mp_obj_t arg = MP_OBJ_NULL;
1426	if (*str != `'%'`) {
1427	vstr_add_byte(&vstr, *str);
1428	continue;
1429	}
1430	if (++str >= top) {
1431	goto incomplete_format;
1432	}
1433	if (*str == `'%'`) {
1434	vstr_add_byte(&vstr, `'%'`);
1435	continue;
1436	}
1437
1438	// Dictionary value lookup
1439	if (*str == `'('`) {
1440	if (dict == MP_OBJ_NULL) {
1441	mp_raise_TypeError(MP_ERROR_TEXT("format needs a dict"));
1442	}
1443	arg_i = `1`; // we used up the single dict argument
1444	const byte *key = ++str;
1445	while (*str != `')'`) {
1446	if (str >= top) {
1447	#if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
1448	terse_str_format_value_error();
1449	#else
1450	mp_raise_ValueError(MP_ERROR_TEXT("incomplete format key"));
1451	#endif
1452	}
1453	++str;
1454	}
1455	mp_obj_t k_obj = mp_obj_new_str_via_qstr((const char *)key, str - key);
1456	arg = mp_obj_dict_get(dict, k_obj);
1457	str++;
1458	}
1459
1460	int flags = `0`;
1461	char fill = `' '`;
1462	int alt = `0`;
1463	while (str < top) {
1464	if (*str == `'-'`) {
1465	flags \|= PF_FLAG_LEFT_ADJUST;
1466	} else if (*str == `'+'`) {
1467	flags \|= PF_FLAG_SHOW_SIGN;
1468	} else if (*str == `' '`) {
1469	flags \|= PF_FLAG_SPACE_SIGN;
1470	} else if (*str == `'#'`) {
1471	alt = PF_FLAG_SHOW_PREFIX;
1472	} else if (*str == `'0'`) {
1473	flags \|= PF_FLAG_PAD_AFTER_SIGN;
1474	fill = `'0'`;
1475	} else {
1476	break;
1477	}
1478	str++;
1479	}
1480	// parse width, if it exists
1481	int width = `0`;
1482	if (str < top) {
1483	if (str == `''`) {
1484	if (arg_i >= n_args) {
1485	goto not_enough_args;
1486	}
1487	width = mp_obj_get_int(args[arg_i++]);
1488	str++;
1489	} else {
1490	str = (const byte )str_to_int((const* char )str, (const* char *)top, &width);
1491	}
1492	}
1493	int prec = -`1`;
1494	if (str < top && *str == `'.'`) {
1495	if (++str < top) {
1496	if (str == `''`) {
1497	if (arg_i >= n_args) {
1498	goto not_enough_args;
1499	}
1500	prec = mp_obj_get_int(args[arg_i++]);
1501	str++;
1502	} else {
1503	prec = `0`;
1504	str = (const byte )str_to_int((const* char )str, (const* char *)top, &prec);
1505	}
1506	}
1507	}
1508
1509	if (str >= top) {
1510	incomplete_format:
1511	#if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
1512	terse_str_format_value_error();
1513	#else
1514	mp_raise_ValueError(MP_ERROR_TEXT("incomplete format"));
1515	#endif
1516	}
1517
1518	// Tuple value lookup
1519	if (arg == MP_OBJ_NULL) {
1520	if (arg_i >= n_args) {
1521	not_enough_args:
1522	mp_raise_TypeError(MP_ERROR_TEXT("format string needs more arguments"));
1523	}
1524	arg = args[arg_i++];
1525	}
1526	switch (*str) {
1527	case `'c'`:
1528	if (mp_obj_is_str(arg)) {
1529	size_t slen;
1530	const char *s = mp_obj_str_get_data(arg, &slen);
1531	if (slen != `1`) {
1532	mp_raise_TypeError(MP_ERROR_TEXT("%c needs int or char"));
1533	}
1534	mp_print_strn(&print, s, `1`, flags, `' '`, width);
1535	} else if (arg_looks_integer(arg)) {
1536	char ch = mp_obj_get_int(arg);
1537	mp_print_strn(&print, &ch, `1`, flags, `' '`, width);
1538	} else {
1539	mp_raise_TypeError(MP_ERROR_TEXT("integer needed"));
1540	}
1541	break;
1542
1543	case `'d'`:
1544	case `'i'`:
1545	case `'u'`:
1546	mp_print_mp_int(&print, arg_as_int(arg), `10`, `'a'`, flags, fill, width, prec);
1547	break;
1548
1549	#if MICROPY_PY_BUILTINS_FLOAT
1550	case `'e'`:
1551	case `'E'`:
1552	case `'f'`:
1553	case `'F'`:
1554	case `'g'`:
1555	case `'G'`:
1556	mp_print_float(&print, mp_obj_get_float(arg), *str, flags, fill, width, prec);
1557	break;
1558	#endif
1559
1560	case `'o'`:
1561	if (alt) {
1562	flags \|= (PF_FLAG_SHOW_PREFIX \| PF_FLAG_SHOW_OCTAL_LETTER);
1563	}
1564	mp_print_mp_int(&print, arg, `8`, `'a'`, flags, fill, width, prec);
1565	break;
1566
1567	case `'r'`:
1568	case `'s'`: {
1569	vstr_t arg_vstr;
1570	mp_print_t arg_print;
1571	vstr_init_print(&arg_vstr, `16`, &arg_print);
1572	mp_print_kind_t print_kind = (*str == `'r'` ? PRINT_REPR : PRINT_STR);
1573	if (print_kind == PRINT_STR && is_bytes && mp_obj_is_type(arg, &mp_type_bytes)) {
1574	// If we have something like b"%s" % b"1", bytes arg should be
1575	// printed undecorated.
1576	print_kind = PRINT_RAW;
1577	}
1578	mp_obj_print_helper(&arg_print, arg, print_kind);
1579	uint vlen = arg_vstr.len;
1580	if (prec < `0`) {
1581	prec = vlen;
1582	}
1583	if (vlen > (uint)prec) {
1584	vlen = prec;
1585	}
1586	mp_print_strn(&print, arg_vstr.buf, vlen, flags, `' '`, width);
1587	vstr_clear(&arg_vstr);
1588	break;
1589	}
1590
1591	case `'X'`:
1592	case `'x'`:
1593	mp_print_mp_int(&print, arg, `16`, *str - (`'X'` - `'A'`), flags \| alt, fill, width, prec);
1594	break;
1595
1596	default:
1597	#if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
1598	terse_str_format_value_error();
1599	#else
1600	mp_raise_msg_varg(&mp_type_ValueError,
1601	MP_ERROR_TEXT("unsupported format character '%c' (0x%x) at index %d"),
1602	str, str, str - start_str);
1603	#endif
1604	}
1605	}
1606
1607	if (arg_i != n_args) {
1608	mp_raise_TypeError(MP_ERROR_TEXT("format string didn't convert all arguments"));
1609	}
1610
1611	return mp_obj_new_str_from_vstr(is_bytes ? &mp_type_bytes : &mp_type_str, &vstr);
1612	}
1613	#endif
1614
1615	// The implementation is optimized, returning the original string if there's
1616	// nothing to replace.
1617	STATIC mp_obj_t str_replace(size_t n_args, const mp_obj_t *args) {
1618	mp_check_self(mp_obj_is_str_or_bytes(args[`0`]));
1619
1620	mp_int_t max_rep = -`1`;
1621	if (n_args == `4`) {
1622	max_rep = mp_obj_get_int(args[`3`]);
1623	if (max_rep == `0`) {
1624	return args[`0`];
1625	} else if (max_rep < `0`) {
1626	max_rep = -`1`;
1627	}
1628	}
1629
1630	// if max_rep is still -1 by this point we will need to do all possible replacements
1631
1632	// check argument types
1633
1634	const mp_obj_type_t *self_type = mp_obj_get_type(args[`0`]);
1635
1636	if (mp_obj_get_type(args[`1`]) != self_type) {
1637	bad_implicit_conversion(args[`1`]);
1638	}
1639
1640	if (mp_obj_get_type(args[`2`]) != self_type) {
1641	bad_implicit_conversion(args[`2`]);
1642	}
1643
1644	// extract string data
1645
1646	GET_STR_DATA_LEN(args[`0`], str, str_len);
1647	GET_STR_DATA_LEN(args[`1`], old, old_len);
1648	GET_STR_DATA_LEN(args[`2`], new, new_len);
1649
1650	// old won't exist in str if it's longer, so nothing to replace
1651	if (old_len > str_len) {
1652	return args[`0`];
1653	}
1654
1655	// data for the replaced string
1656	byte *data = NULL;
1657	vstr_t vstr;
1658
1659	// do 2 passes over the string:
1660	// first pass computes the required length of the replaced string
1661	// second pass does the replacements
1662	for (;;) {
1663	size_t replaced_str_index = `0`;
1664	size_t num_replacements_done = `0`;
1665	const byte *old_occurrence;
1666	const byte *offset_ptr = str;
1667	size_t str_len_remain = str_len;
1668	if (old_len == `0`) {
1669	// if old_str is empty, copy new_str to start of replaced string
1670	// copy the replacement string
1671	if (data != NULL) {
1672	memcpy(data, new, new_len);
1673	}
1674	replaced_str_index += new_len;
1675	num_replacements_done++;
1676	}
1677	while (num_replacements_done != (size_t)max_rep && str_len_remain > `0` && (old_occurrence = find_subbytes(offset_ptr, str_len_remain, old, old_len, `1`)) != NULL) {
1678	if (old_len == `0`) {
1679	old_occurrence += `1`;
1680	}
1681	// copy from just after end of last occurrence of to-be-replaced string to right before start of next occurrence
1682	if (data != NULL) {
1683	memcpy(data + replaced_str_index, offset_ptr, old_occurrence - offset_ptr);
1684	}
1685	replaced_str_index += old_occurrence - offset_ptr;
1686	// copy the replacement string
1687	if (data != NULL) {
1688	memcpy(data + replaced_str_index, new, new_len);
1689	}
1690	replaced_str_index += new_len;
1691	offset_ptr = old_occurrence + old_len;
1692	str_len_remain = str + str_len - offset_ptr;
1693	num_replacements_done++;
1694	}
1695
1696	// copy from just after end of last occurrence of to-be-replaced string to end of old string
1697	if (data != NULL) {
1698	memcpy(data + replaced_str_index, offset_ptr, str_len_remain);
1699	}
1700	replaced_str_index += str_len_remain;
1701
1702	if (data == NULL) {
1703	// first pass
1704	if (num_replacements_done == `0`) {
1705	// no substr found, return original string
1706	return args[`0`];
1707	} else {
1708	// substr found, allocate new string
1709	vstr_init_len(&vstr, replaced_str_index);
1710	data = (byte *)vstr.buf;
1711	assert(data != NULL);
1712	}
1713	} else {
1714	// second pass, we are done
1715	break;
1716	}
1717	}
1718
1719	return mp_obj_new_str_from_vstr(self_type, &vstr);
1720	}
1721	MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_replace_obj, `3`, `4`, str_replace);
1722
1723	#if MICROPY_PY_BUILTINS_STR_COUNT
1724	STATIC mp_obj_t str_count(size_t n_args, const mp_obj_t *args) {
1725	const mp_obj_type_t *self_type = mp_obj_get_type(args[`0`]);
1726	mp_check_self(mp_obj_is_str_or_bytes(args[`0`]));
1727
1728	// check argument type
1729	if (mp_obj_get_type(args[`1`]) != self_type) {
1730	bad_implicit_conversion(args[`1`]);
1731	}
1732
1733	GET_STR_DATA_LEN(args[`0`], haystack, haystack_len);
1734	GET_STR_DATA_LEN(args[`1`], needle, needle_len);
1735
1736	const byte *start = haystack;
1737	const byte *end = haystack + haystack_len;
1738	if (n_args >= `3` && args[`2`] != mp_const_none) {
1739	start = str_index_to_ptr(self_type, haystack, haystack_len, args[`2`], true);
1740	}
1741	if (n_args >= `4` && args[`3`] != mp_const_none) {
1742	end = str_index_to_ptr(self_type, haystack, haystack_len, args[`3`], true);
1743	}
1744
1745	// if needle_len is zero then we count each gap between characters as an occurrence
1746	if (needle_len == `0`) {
1747	return MP_OBJ_NEW_SMALL_INT(utf8_charlen(start, end - start) + `1`);
1748	}
1749
1750	// count the occurrences
1751	mp_int_t num_occurrences = `0`;
1752	for (const byte *haystack_ptr = start; haystack_ptr + needle_len <= end;) {
1753	if (memcmp(haystack_ptr, needle, needle_len) == `0`) {
1754	num_occurrences++;
1755	haystack_ptr += needle_len;
1756	} else {
1757	haystack_ptr = utf8_next_char(haystack_ptr);
1758	}
1759	}
1760
1761	return MP_OBJ_NEW_SMALL_INT(num_occurrences);
1762	}
1763	MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_count_obj, `2`, `4`, str_count);
1764	#endif
1765
1766	#if MICROPY_PY_BUILTINS_STR_PARTITION
1767	STATIC mp_obj_t str_partitioner(mp_obj_t self_in, mp_obj_t arg, int direction) {
1768	mp_check_self(mp_obj_is_str_or_bytes(self_in));
1769	const mp_obj_type_t *self_type = mp_obj_get_type(self_in);
1770	if (self_type != mp_obj_get_type(arg)) {
1771	bad_implicit_conversion(arg);
1772	}
1773
1774	GET_STR_DATA_LEN(self_in, str, str_len);
1775	GET_STR_DATA_LEN(arg, sep, sep_len);
1776
1777	if (sep_len == `0`) {
1778	mp_raise_ValueError(MP_ERROR_TEXT("empty separator"));
1779	}
1780
1781	mp_obj_t result[`3`];
1782	if (self_type == &mp_type_str) {
1783	result[`0`] = MP_OBJ_NEW_QSTR(MP_QSTR_);
1784	result[`1`] = MP_OBJ_NEW_QSTR(MP_QSTR_);
1785	result[`2`] = MP_OBJ_NEW_QSTR(MP_QSTR_);
1786	} else {
1787	result[`0`] = mp_const_empty_bytes;
1788	result[`1`] = mp_const_empty_bytes;
1789	result[`2`] = mp_const_empty_bytes;
1790	}
1791
1792	if (direction > `0`) {
1793	result[`0`] = self_in;
1794	} else {
1795	result[`2`] = self_in;
1796	}
1797
1798	const byte *position_ptr = find_subbytes(str, str_len, sep, sep_len, direction);
1799	if (position_ptr != NULL) {
1800	size_t position = position_ptr - str;
1801	result[`0`] = mp_obj_new_str_of_type(self_type, str, position);
1802	result[`1`] = arg;
1803	result[`2`] = mp_obj_new_str_of_type(self_type, str + position + sep_len, str_len - position - sep_len);
1804	}
1805
1806	return mp_obj_new_tuple(`3`, result);
1807	}
1808
1809	STATIC mp_obj_t str_partition(mp_obj_t self_in, mp_obj_t arg) {
1810	return str_partitioner(self_in, arg, `1`);
1811	}
1812	MP_DEFINE_CONST_FUN_OBJ_2(str_partition_obj, str_partition);
1813
1814	STATIC mp_obj_t str_rpartition(mp_obj_t self_in, mp_obj_t arg) {
1815	return str_partitioner(self_in, arg, -`1`);
1816	}
1817	MP_DEFINE_CONST_FUN_OBJ_2(str_rpartition_obj, str_rpartition);
1818	#endif
1819
1820	// Supposedly not too critical operations, so optimize for code size
1821	STATIC mp_obj_t str_caseconv(unichar (*op)(unichar), mp_obj_t self_in) {
1822	GET_STR_DATA_LEN(self_in, self_data, self_len);
1823	vstr_t vstr;
1824	vstr_init_len(&vstr, self_len);
1825	byte data = (byte )vstr.buf;
1826	for (size_t i = `0`; i < self_len; i++) {
1827	data++ = op(self_data++);
1828	}
1829	return mp_obj_new_str_from_vstr(mp_obj_get_type(self_in), &vstr);
1830	}
1831
// lower(): delegates to str_caseconv with unichar_tolower.
1832	STATIC mp_obj_t str_lower(mp_obj_t self_in) {
1833	return str_caseconv(unichar_tolower, self_in);
1834	}
1835	MP_DEFINE_CONST_FUN_OBJ_1(str_lower_obj, str_lower);
1836
// upper(): delegates to str_caseconv with unichar_toupper.
1837	STATIC mp_obj_t str_upper(mp_obj_t self_in) {
1838	return str_caseconv(unichar_toupper, self_in);
1839	}
1840	MP_DEFINE_CONST_FUN_OBJ_1(str_upper_obj, str_upper);
1841
1842	STATIC mp_obj_t str_uni_istype(bool (*f)(unichar), mp_obj_t self_in) {
1843	GET_STR_DATA_LEN(self_in, self_data, self_len);
1844
1845	if (self_len == `0`) {
1846	return mp_const_false; // default to False for empty str
1847	}
1848
1849	if (f != unichar_isupper && f != unichar_islower) {
1850	for (size_t i = `0`; i < self_len; i++) {
1851	if (!f(*self_data++)) {
1852	return mp_const_false;
1853	}
1854	}
1855	} else {
1856	bool contains_alpha = false;
1857
1858	for (size_t i = `0`; i < self_len; i++) { // only check alphanumeric characters
1859	if (unichar_isalpha(*self_data++)) {
1860	contains_alpha = true;
1861	if (!f((self_data - `1`))) { // -1 because we already incremented above*
1862	return mp_const_false;
1863	}
1864	}
1865	}
1866
1867	if (!contains_alpha) {
1868	return mp_const_false;
1869	}
1870	}
1871
1872	return mp_const_true;
1873	}
1874
// isspace(): delegates to str_uni_istype with unichar_isspace.
1875	STATIC mp_obj_t str_isspace(mp_obj_t self_in) {
1876	return str_uni_istype(unichar_isspace, self_in);
1877	}
1878	MP_DEFINE_CONST_FUN_OBJ_1(str_isspace_obj, str_isspace);
1879
// isalpha(): delegates to str_uni_istype with unichar_isalpha.
1880	STATIC mp_obj_t str_isalpha(mp_obj_t self_in) {
1881	return str_uni_istype(unichar_isalpha, self_in);
1882	}
1883	MP_DEFINE_CONST_FUN_OBJ_1(str_isalpha_obj, str_isalpha);
1884
// isdigit(): delegates to str_uni_istype with unichar_isdigit.
1885	STATIC mp_obj_t str_isdigit(mp_obj_t self_in) {
1886	return str_uni_istype(unichar_isdigit, self_in);
1887	}
1888	MP_DEFINE_CONST_FUN_OBJ_1(str_isdigit_obj, str_isdigit);
1889
// isupper(): delegates to str_uni_istype with unichar_isupper.
1890	STATIC mp_obj_t str_isupper(mp_obj_t self_in) {
1891	return str_uni_istype(unichar_isupper, self_in);
1892	}
1893	MP_DEFINE_CONST_FUN_OBJ_1(str_isupper_obj, str_isupper);
1894
// islower(): delegates to str_uni_istype with unichar_islower.
1895	STATIC mp_obj_t str_islower(mp_obj_t self_in) {
1896	return str_uni_istype(unichar_islower, self_in);
1897	}
1898	MP_DEFINE_CONST_FUN_OBJ_1(str_islower_obj, str_islower);
1899
1900	#if MICROPY_CPYTHON_COMPAT
1901	// These methods are superfluous in the presence of str() and bytes()
1902	// constructors.
1903	// TODO: should accept kwargs too
1904	STATIC mp_obj_t bytes_decode(size_t n_args, const mp_obj_t *args) {
1905	mp_obj_t new_args[`2`];
1906	if (n_args == `1`) {
1907	new_args[`0`] = args[`0`];
1908	new_args[`1`] = MP_OBJ_NEW_QSTR(MP_QSTR_utf_hyphen_8);
1909	args = new_args;
1910	n_args++;
1911	}
1912	return mp_obj_str_make_new(&mp_type_str, n_args, `0`, args);
1913	}
1914	MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(bytes_decode_obj, `1`, `3`, bytes_decode);
1915
1916	// TODO: should accept kwargs too
1917	STATIC mp_obj_t str_encode(size_t n_args, const mp_obj_t *args) {
1918	mp_obj_t new_args[`2`];
1919	if (n_args == `1`) {
1920	new_args[`0`] = args[`0`];
1921	new_args[`1`] = MP_OBJ_NEW_QSTR(MP_QSTR_utf_hyphen_8);
1922	args = new_args;
1923	n_args++;
1924	}
1925	return bytes_make_new(NULL, n_args, `0`, args);
1926	}
1927	MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(str_encode_obj, `1`, `3`, str_encode);
1928	#endif
1929
1930	mp_int_t mp_obj_str_get_buffer(mp_obj_t self_in, mp_buffer_info_t *bufinfo, mp_uint_t flags) {
1931	if (flags == MP_BUFFER_READ) {
1932	GET_STR_DATA_LEN(self_in, str_data, str_len);
1933	bufinfo->buf = (void *)str_data;
1934	bufinfo->len = str_len;
1935	bufinfo->typecode = `'B'`; // bytes should be unsigned, so should unicode byte-access
1936	return `0`;
1937	} else {
1938	// can't write to a string
1939	return `1`;
1940	}
1941	}
1942
// Method table (name -> function object) used as the locals dict of the
// types defined below in this file.  Several entries are only present when
// the corresponding MICROPY_* configuration option is enabled.
1943	STATIC const mp_rom_map_elem_t str8_locals_dict_table[] = {
1944	#if MICROPY_CPYTHON_COMPAT
1945	{ MP_ROM_QSTR(MP_QSTR_decode), MP_ROM_PTR(&bytes_decode_obj) },
1946	#if !MICROPY_PY_BUILTINS_STR_UNICODE
1947	// If we have separate unicode type, then here we have methods only
1948	// for bytes type, and it should not have encode() methods. Otherwise,
1949	// we have non-compliant-but-practical bytestring type, which shares
1950	// method table with bytes, so they both have encode() and decode()
1951	// methods (which should do type checking at runtime).
1952	{ MP_ROM_QSTR(MP_QSTR_encode), MP_ROM_PTR(&str_encode_obj) },
1953	#endif
1954	#endif
1955	{ MP_ROM_QSTR(MP_QSTR_find), MP_ROM_PTR(&str_find_obj) },
1956	{ MP_ROM_QSTR(MP_QSTR_rfind), MP_ROM_PTR(&str_rfind_obj) },
1957	{ MP_ROM_QSTR(MP_QSTR_index), MP_ROM_PTR(&str_index_obj) },
1958	{ MP_ROM_QSTR(MP_QSTR_rindex), MP_ROM_PTR(&str_rindex_obj) },
1959	{ MP_ROM_QSTR(MP_QSTR_join), MP_ROM_PTR(&str_join_obj) },
1960	{ MP_ROM_QSTR(MP_QSTR_split), MP_ROM_PTR(&str_split_obj) },
1961	#if MICROPY_PY_BUILTINS_STR_SPLITLINES
1962	{ MP_ROM_QSTR(MP_QSTR_splitlines), MP_ROM_PTR(&str_splitlines_obj) },
1963	#endif
1964	{ MP_ROM_QSTR(MP_QSTR_rsplit), MP_ROM_PTR(&str_rsplit_obj) },
1965	{ MP_ROM_QSTR(MP_QSTR_startswith), MP_ROM_PTR(&str_startswith_obj) },
1966	{ MP_ROM_QSTR(MP_QSTR_endswith), MP_ROM_PTR(&str_endswith_obj) },
1967	{ MP_ROM_QSTR(MP_QSTR_strip), MP_ROM_PTR(&str_strip_obj) },
1968	{ MP_ROM_QSTR(MP_QSTR_lstrip), MP_ROM_PTR(&str_lstrip_obj) },
1969	{ MP_ROM_QSTR(MP_QSTR_rstrip), MP_ROM_PTR(&str_rstrip_obj) },
1970	{ MP_ROM_QSTR(MP_QSTR_format), MP_ROM_PTR(&str_format_obj) },
1971	{ MP_ROM_QSTR(MP_QSTR_replace), MP_ROM_PTR(&str_replace_obj) },
1972	#if MICROPY_PY_BUILTINS_STR_COUNT
1973	{ MP_ROM_QSTR(MP_QSTR_count), MP_ROM_PTR(&str_count_obj) },
1974	#endif
1975	#if MICROPY_PY_BUILTINS_STR_PARTITION
1976	{ MP_ROM_QSTR(MP_QSTR_partition), MP_ROM_PTR(&str_partition_obj) },
1977	{ MP_ROM_QSTR(MP_QSTR_rpartition), MP_ROM_PTR(&str_rpartition_obj) },
1978	#endif
1979	#if MICROPY_PY_BUILTINS_STR_CENTER
1980	{ MP_ROM_QSTR(MP_QSTR_center), MP_ROM_PTR(&str_center_obj) },
1981	#endif
1982	{ MP_ROM_QSTR(MP_QSTR_lower), MP_ROM_PTR(&str_lower_obj) },
1983	{ MP_ROM_QSTR(MP_QSTR_upper), MP_ROM_PTR(&str_upper_obj) },
1984	{ MP_ROM_QSTR(MP_QSTR_isspace), MP_ROM_PTR(&str_isspace_obj) },
1985	{ MP_ROM_QSTR(MP_QSTR_isalpha), MP_ROM_PTR(&str_isalpha_obj) },
1986	{ MP_ROM_QSTR(MP_QSTR_isdigit), MP_ROM_PTR(&str_isdigit_obj) },
1987	{ MP_ROM_QSTR(MP_QSTR_isupper), MP_ROM_PTR(&str_isupper_obj) },
1988	{ MP_ROM_QSTR(MP_QSTR_islower), MP_ROM_PTR(&str_islower_obj) },
1989	};
1990
// Wrap the table above in a constant dict object.
1991	STATIC MP_DEFINE_CONST_DICT(str8_locals_dict, str8_locals_dict_table);
1992
1993	#if !MICROPY_PY_BUILTINS_STR_UNICODE
// Forward declaration: the iterator constructor is referenced by the type below.
1994	STATIC mp_obj_t mp_obj_new_str_iterator(mp_obj_t str, mp_obj_iter_buf_t *iter_buf);
1995
// Type object for str.  This definition is only compiled when
// MICROPY_PY_BUILTINS_STR_UNICODE is disabled (see the enclosing #if).
1996	const mp_obj_type_t mp_type_str = {
1997	{ &mp_type_type },
1998	.name = MP_QSTR_str,
1999	.print = str_print,
2000	.make_new = mp_obj_str_make_new,
2001	.binary_op = mp_obj_str_binary_op,
2002	.subscr = bytes_subscr,
2003	.getiter = mp_obj_new_str_iterator,
2004	.buffer_p = { .get_buffer = mp_obj_str_get_buffer },
2005	.locals_dict = (mp_obj_dict_t *)&str8_locals_dict,
2006	};
2007	#endif
2008
// Type object for bytes.
2009	// Reuses most of methods from str
2010	const mp_obj_type_t mp_type_bytes = {
2011	{ &mp_type_type },
2012	.name = MP_QSTR_bytes,
2013	.print = str_print,
2014	.make_new = bytes_make_new,
2015	.binary_op = mp_obj_str_binary_op,
2016	.subscr = bytes_subscr,
2017	.getiter = mp_obj_new_bytes_iterator,
2018	.buffer_p = { .get_buffer = mp_obj_str_get_buffer },
2019	.locals_dict = (mp_obj_dict_t *)&str8_locals_dict,
2020	};
2021
2022	// The zero-length bytes object, with data that includes a null-terminating byte
2023	const mp_obj_str_t mp_const_empty_bytes_obj = {{&mp_type_bytes}, `0`, `0`, (const byte *)""};
2024
2025	// Create a str/bytes object using the given data. New memory is allocated and
2026	// the data is copied across. This function should only be used if the type is bytes,
2027	// or if the type is str and the string data is known to be not interned.
2028	mp_obj_t mp_obj_new_str_copy(const mp_obj_type_t type, const* byte *data, size_t len) {
2029	mp_obj_str_t *o = m_new_obj(mp_obj_str_t);
2030	o->base.type = type;
2031	o->len = len;
2032	if (data) {
2033	o->hash = qstr_compute_hash(data, len);
2034	byte *p = m_new(byte, len + `1`);
2035	o->data = p;
2036	memcpy(p, data, len * sizeof(byte));
2037	p[len] = `'\0'`; // for now we add null for compatibility with C ASCIIZ strings
2038	}
2039	return MP_OBJ_FROM_PTR(o);
2040	}
2041
2042	// Create a str/bytes object using the given data. If the type is str and the string
2043	// data is already interned, then a qstr object is returned. Otherwise new memory is
2044	// allocated for the object and the data is copied across.
2045	mp_obj_t mp_obj_new_str_of_type(const mp_obj_type_t type, const* byte *data, size_t len) {
2046	if (type == &mp_type_str) {
2047	return mp_obj_new_str((const char *)data, len);
2048	} else {
2049	return mp_obj_new_bytes(data, len);
2050	}
2051	}
2052
2053	// Create a str using a qstr to store the data; may use existing or new qstr.
2054	mp_obj_t mp_obj_new_str_via_qstr(const char *data, size_t len) {
2055	return MP_OBJ_NEW_QSTR(qstr_from_strn(data, len));
2056	}
2057
2058	// Create a str/bytes object from the given vstr. The vstr buffer is resized to
2059	// the exact length required and then reused for the str/bytes object. The vstr
2060	// is cleared and can safely be passed to vstr_free if it was heap allocated.
2061	mp_obj_t mp_obj_new_str_from_vstr(const mp_obj_type_t type, vstr_t vstr) {
2062	// if not a bytes object, look if a qstr with this data already exists
2063	if (type == &mp_type_str) {
2064	qstr q = qstr_find_strn(vstr->buf, vstr->len);
2065	if (q != MP_QSTRnull) {
2066	vstr_clear(vstr);
2067	vstr->alloc = `0`;
2068	return MP_OBJ_NEW_QSTR(q);
2069	}
2070	}
2071
2072	// make a new str/bytes object
2073	mp_obj_str_t *o = m_new_obj(mp_obj_str_t);
2074	o->base.type = type;
2075	o->len = vstr->len;
2076	o->hash = qstr_compute_hash((byte *)vstr->buf, vstr->len);
2077	if (vstr->len + `1` == vstr->alloc) {
2078	o->data = (byte *)vstr->buf;
2079	} else {
2080	o->data = (byte )m_renew(char*, vstr->buf, vstr->alloc, vstr->len + `1`);
2081	}
2082	((byte )o->data)[o->len] = `'\0'`; // add null byte*
2083	vstr->buf = NULL;
2084	vstr->alloc = `0`;
2085	return MP_OBJ_FROM_PTR(o);
2086	}
2087
2088	mp_obj_t mp_obj_new_str(const char *data, size_t len) {
2089	qstr q = qstr_find_strn(data, len);
2090	if (q != MP_QSTRnull) {
2091	// qstr with this data already exists
2092	return MP_OBJ_NEW_QSTR(q);
2093	} else {
2094	// no existing qstr, don't make one
2095	return mp_obj_new_str_copy(&mp_type_str, (const byte *)data, len);
2096	}
2097	}
2098
2099	mp_obj_t mp_obj_str_intern(mp_obj_t str) {
2100	GET_STR_DATA_LEN(str, data, len);
2101	return mp_obj_new_str_via_qstr((const char *)data, len);
2102	}
2103
2104	mp_obj_t mp_obj_str_intern_checked(mp_obj_t obj) {
2105	size_t len;
2106	const char *data = mp_obj_str_get_data(obj, &len);
2107	return mp_obj_new_str_via_qstr((const char *)data, len);
2108	}
2109
2110	mp_obj_t mp_obj_new_bytes(const byte *data, size_t len) {
2111	return mp_obj_new_str_copy(&mp_type_bytes, data, len);
2112	}
2113
2114	bool mp_obj_str_equal(mp_obj_t s1, mp_obj_t s2) {
2115	if (mp_obj_is_qstr(s1) && mp_obj_is_qstr(s2)) {
2116	return s1 == s2;
2117	} else {
2118	GET_STR_HASH(s1, h1);
2119	GET_STR_HASH(s2, h2);
2120	// If any of hashes is 0, it means it's not valid
2121	if (h1 != `0` && h2 != `0` && h1 != h2) {
2122	return false;
2123	}
2124	GET_STR_DATA_LEN(s1, d1, l1);
2125	GET_STR_DATA_LEN(s2, d2, l2);
2126	if (l1 != l2) {
2127	return false;
2128	}
2129	return memcmp(d1, d2, l1) == `0`;
2130	}
2131	}
2132
2133	STATIC NORETURN void bad_implicit_conversion(mp_obj_t self_in) {
2134	#if MICROPY_ERROR_REPORTING == MICROPY_ERROR_REPORTING_TERSE
2135	mp_raise_TypeError(MP_ERROR_TEXT("can't convert to str implicitly"));
2136	#else
2137	const qstr src_name = mp_obj_get_type(self_in)->name;
2138	mp_raise_msg_varg(&mp_type_TypeError,
2139	MP_ERROR_TEXT("can't convert '%q' object to %q implicitly"),
2140	src_name, src_name == MP_QSTR_str ? MP_QSTR_bytes : MP_QSTR_str);
2141	#endif
2142	}
2143
2144	// use this if you will anyway convert the string to a qstr
2145	// will be more efficient for the case where it's already a qstr
2146	qstr mp_obj_str_get_qstr(mp_obj_t self_in) {
2147	if (mp_obj_is_qstr(self_in)) {
2148	return MP_OBJ_QSTR_VALUE(self_in);
2149	} else if (mp_obj_is_type(self_in, &mp_type_str)) {
2150	mp_obj_str_t *self = MP_OBJ_TO_PTR(self_in);
2151	return qstr_from_strn((char *)self->data, self->len);
2152	} else {
2153	bad_implicit_conversion(self_in);
2154	}
2155	}
2156
2157	// only use this function if you need the str data to be zero terminated
2158	// at the moment all strings are zero terminated to help with C ASCIIZ compatibility
2159	const char *mp_obj_str_get_str(mp_obj_t self_in) {
2160	if (mp_obj_is_str_or_bytes(self_in)) {
2161	GET_STR_DATA_LEN(self_in, s, l);
2162	(void)l; // len unused
2163	return (const char *)s;
2164	} else {
2165	bad_implicit_conversion(self_in);
2166	}
2167	}
2168
2169	const char mp_obj_str_get_data(mp_obj_t self_in, size_t len) {
2170	if (mp_obj_is_str_or_bytes(self_in)) {
2171	GET_STR_DATA_LEN(self_in, s, l);
2172	*len = l;
2173	return (const char *)s;
2174	} else {
2175	bad_implicit_conversion(self_in);
2176	}
2177	}
2178
2179	#if MICROPY_OBJ_REPR == MICROPY_OBJ_REPR_C \|\| MICROPY_OBJ_REPR == MICROPY_OBJ_REPR_D
2180	const byte mp_obj_str_get_data_no_check(mp_obj_t self_in, size_t len) {
2181	if (mp_obj_is_qstr(self_in)) {
2182	return qstr_data(MP_OBJ_QSTR_VALUE(self_in), len);
2183	} else {
2184	len = ((mp_obj_str_t )MP_OBJ_TO_PTR(self_in))->len;
2185	return ((mp_obj_str_t *)MP_OBJ_TO_PTR(self_in))->data;
2186	}
2187	}
2188	#endif
2189
2190	/****************************************************************************/
/* str iterator */
2192
// Iterator state used by both the 8-bit str iterator and the bytes iterator.
// The constructors place it inside the caller-supplied mp_obj_iter_buf_t and
// assert that it fits there.
typedef struct _mp_obj_str8_it_t {
    mp_obj_base_t base;   // type is set to mp_type_polymorph_iter by the constructors
    mp_fun_1_t iternext;  // "next" function: str_it_iternext or bytes_it_iternext
    mp_obj_t str;         // the str/bytes object being iterated
    size_t cur;           // byte index of the next item; starts at 0
} mp_obj_str8_it_t;
2199
2200	#if !MICROPY_PY_BUILTINS_STR_UNICODE
2201	STATIC mp_obj_t str_it_iternext(mp_obj_t self_in) {
2202	mp_obj_str8_it_t *self = MP_OBJ_TO_PTR(self_in);
2203	GET_STR_DATA_LEN(self->str, str, len);
2204	if (self->cur < len) {
2205	mp_obj_t o_out = mp_obj_new_str_via_qstr((const char *)str + self->cur, `1`);
2206	self->cur += `1`;
2207	return o_out;
2208	} else {
2209	return MP_OBJ_STOP_ITERATION;
2210	}
2211	}
2212
2213	STATIC mp_obj_t mp_obj_new_str_iterator(mp_obj_t str, mp_obj_iter_buf_t *iter_buf) {
2214	assert(sizeof(mp_obj_str8_it_t) <= sizeof(mp_obj_iter_buf_t));
2215	mp_obj_str8_it_t o = (mp_obj_str8_it_t )iter_buf;
2216	o->base.type = &mp_type_polymorph_iter;
2217	o->iternext = str_it_iternext;
2218	o->str = str;
2219	o->cur = `0`;
2220	return MP_OBJ_FROM_PTR(o);
2221	}
2222	#endif
2223
2224	STATIC mp_obj_t bytes_it_iternext(mp_obj_t self_in) {
2225	mp_obj_str8_it_t *self = MP_OBJ_TO_PTR(self_in);
2226	GET_STR_DATA_LEN(self->str, str, len);
2227	if (self->cur < len) {
2228	mp_obj_t o_out = MP_OBJ_NEW_SMALL_INT(str[self->cur]);
2229	self->cur += `1`;
2230	return o_out;
2231	} else {
2232	return MP_OBJ_STOP_ITERATION;
2233	}
2234	}
2235
2236	mp_obj_t mp_obj_new_bytes_iterator(mp_obj_t str, mp_obj_iter_buf_t *iter_buf) {
2237	assert(sizeof(mp_obj_str8_it_t) <= sizeof(mp_obj_iter_buf_t));
2238	mp_obj_str8_it_t o = (mp_obj_str8_it_t )iter_buf;
2239	o->base.type = &mp_type_polymorph_iter;
2240	o->iternext = bytes_it_iternext;
2241	o->str = str;
2242	o->cur = `0`;
2243	return MP_OBJ_FROM_PTR(o);
2244	}
2245

Browse the source code of MicroPython/py/objstr.c