// Convert an index into a pointer to its lead byte. Out of bounds indexing will raise IndexError or // be capped to the first/last character of the string, depending on is_slice. const byte *str_index_to_ptr(const mp_obj_type_t *type, const byte *self_data, size_t self_len, mp_obj_t index, bool is_slice) { (void)type; mp_int_t i; // Copied from mp_get_index; I don't want bounds checking, just give me // the integer as-is. (I can't bounds-check without scanning the whole // string; an out-of-bounds index will be caught in the loops below.) if (MP_OBJ_IS_SMALL_INT(index)) { i = MP_OBJ_SMALL_INT_VALUE(index); } else if (!mp_obj_get_int_maybe(index, &i)) { nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_TypeError, "string indices must be integers, not %s", mp_obj_get_type_str(index))); } const byte *s, *top = self_data + self_len; if (i < 0) { // Negative indexing is performed by counting from the end of the string. for (s = top - 1; i; --s) { if (s < self_data) { if (is_slice) { return self_data; } nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_IndexError, "string index out of range")); } if (!UTF8_IS_CONT(*s)) { ++i; } } ++s; } else if (!i) { return self_data; // Shortcut - str[0] is its base pointer } else { // Positive indexing, correspondingly, counts from the start of the string. // It's assumed that negative indexing will generally be used with small // absolute values (eg str[-1], not str[-1000000]), which means it'll be // more efficient this way. for (s = self_data; true; ++s) { if (s >= top) { if (is_slice) { return top; } nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_IndexError, "string index out of range")); } while (UTF8_IS_CONT(*s)) { ++s; } if (!i--) { return s; } } } return s; }
machine_uint_t utf8_ptr_to_index(const char *s, const char *ptr) { machine_uint_t i = 0; while (ptr > s) { if (!UTF8_IS_CONT(*--ptr)) { i++; } } return i; }
mp_uint_t utf8_ptr_to_index(const byte *s, const byte *ptr) { mp_uint_t i = 0; while (ptr > s) { if (!UTF8_IS_CONT(*--ptr)) { i++; } } return i; }
// TODO: Rename to str_next_char const byte *utf8_next_char(const byte *s) { #if MICROPY_PY_BUILTINS_STR_UNICODE ++s; while (UTF8_IS_CONT(*s)) { ++s; } return s; #else return s + 1; #endif }
// TODO: Rename to str_charlen machine_uint_t unichar_charlen(const char *str, machine_uint_t len) { #if MICROPY_PY_BUILTINS_STR_UNICODE machine_uint_t charlen = 0; for (const char *top = str + len; str < top; ++str) { if (!UTF8_IS_CONT(*str)) { ++charlen; } } return charlen; #else return len; #endif }
// TODO: Rename to str_get_char unichar utf8_get_char(const byte *s) { #if MICROPY_PY_BUILTINS_STR_UNICODE unichar ord = *s++; if (!UTF8_IS_NONASCII(ord)) return ord; ord &= 0x7F; for (unichar mask = 0x40; ord & mask; mask >>= 1) { ord &= ~mask; } while (UTF8_IS_CONT(*s)) { ord = (ord << 6) | (*s++ & 0x3F); } return ord; #else return *s; #endif }
// Convert an index into a pointer to its lead byte. Out of bounds indexing will raise IndexError or // be capped to the first/last character of the string, depending on is_slice. const byte *str_index_to_ptr(const mp_obj_type_t *type, const byte *self_data, size_t self_len, mp_obj_t index, bool is_slice) { // All str functions also handle bytes objects, and they call str_index_to_ptr(), // so it must handle bytes. if (type == &mp_type_bytes) { // Taken from objstr.c:str_index_to_ptr() size_t index_val = mp_get_index(type, self_len, index, is_slice); return self_data + index_val; } mp_int_t i; // Copied from mp_get_index; I don't want bounds checking, just give me // the integer as-is. (I can't bounds-check without scanning the whole // string; an out-of-bounds index will be caught in the loops below.) if (MP_OBJ_IS_SMALL_INT(index)) { i = MP_OBJ_SMALL_INT_VALUE(index); } else if (!mp_obj_get_int_maybe(index, &i)) { nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_TypeError, "string indices must be integers, not %s", mp_obj_get_type_str(index))); } const byte *s, *top = self_data + self_len; if (i < 0) { // Negative indexing is performed by counting from the end of the string. for (s = top - 1; i; --s) { if (s < self_data) { if (is_slice) { return self_data; } mp_raise_msg(&mp_type_IndexError, "string index out of range"); } if (!UTF8_IS_CONT(*s)) { ++i; } } ++s; } else { // Positive indexing, correspondingly, counts from the start of the string. // It's assumed that negative indexing will generally be used with small // absolute values (eg str[-1], not str[-1000000]), which means it'll be // more efficient this way. s = self_data; while (1) { // First check out-of-bounds if (s >= top) { if (is_slice) { return top; } mp_raise_msg(&mp_type_IndexError, "string index out of range"); } // Then check completion if (i-- == 0) { break; } // Then skip UTF-8 char ++s; while (UTF8_IS_CONT(*s)) { ++s; } } } return s; }