Beispiel #1
0
// Convert an index into a pointer to its lead byte. Out of bounds indexing will raise IndexError or
// be capped to the first/last character of the string, depending on is_slice.
const byte *str_index_to_ptr(const mp_obj_type_t *type, const byte *self_data, size_t self_len,
                             mp_obj_t index, bool is_slice) {
    (void)type;
    mp_int_t i;
    // Copied from mp_get_index; I don't want bounds checking, just give me
    // the integer as-is. (I can't bounds-check without scanning the whole
    // string; an out-of-bounds index will be caught in the loops below.)
    if (MP_OBJ_IS_SMALL_INT(index)) {
        i = MP_OBJ_SMALL_INT_VALUE(index);
    } else if (!mp_obj_get_int_maybe(index, &i)) {
        nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_TypeError, "string indices must be integers, not %s", mp_obj_get_type_str(index)));
    }
    const byte *s, *top = self_data + self_len;
    if (i < 0)
    {
        // Negative indexing is performed by counting from the end of the string.
        for (s = top - 1; i; --s) {
            if (s < self_data) {
                if (is_slice) {
                    return self_data;
                }
                nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_IndexError, "string index out of range"));
            }
            if (!UTF8_IS_CONT(*s)) {
                ++i;
            }
        }
        ++s;
    } else if (!i) {
        return self_data; // Shortcut - str[0] is its base pointer
    } else {
        // Positive indexing, correspondingly, counts from the start of the string.
        // It's assumed that negative indexing will generally be used with small
        // absolute values (eg str[-1], not str[-1000000]), which means it'll be
        // more efficient this way.
        for (s = self_data; true; ++s) {
            if (s >= top) {
                if (is_slice) {
                    return top;
                }
                nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_IndexError, "string index out of range"));
            }
            while (UTF8_IS_CONT(*s)) {
                ++s;
            }
            if (!i--) {
                return s;
            }
        }
    }
    return s;
}
Beispiel #2
0
machine_uint_t utf8_ptr_to_index(const char *s, const char *ptr) {
    machine_uint_t i = 0;
    while (ptr > s) {
        if (!UTF8_IS_CONT(*--ptr)) {
            i++;
        }
    }

    return i;
}
Beispiel #3
0
mp_uint_t utf8_ptr_to_index(const byte *s, const byte *ptr) {
    mp_uint_t i = 0;
    while (ptr > s) {
        if (!UTF8_IS_CONT(*--ptr)) {
            i++;
        }
    }

    return i;
}
Beispiel #4
0
// TODO: Rename to str_next_char
const byte *utf8_next_char(const byte *s) {
#if MICROPY_PY_BUILTINS_STR_UNICODE
    ++s;
    while (UTF8_IS_CONT(*s)) {
        ++s;
    }
    return s;
#else
    return s + 1;
#endif
}
Beispiel #5
0
// TODO: Rename to str_charlen
machine_uint_t unichar_charlen(const char *str, machine_uint_t len)
{
#if MICROPY_PY_BUILTINS_STR_UNICODE
    machine_uint_t charlen = 0;
    for (const char *top = str + len; str < top; ++str) {
        if (!UTF8_IS_CONT(*str)) {
            ++charlen;
        }
    }
    return charlen;
#else
    return len;
#endif
}
Beispiel #6
0
// TODO: Rename to str_get_char
unichar utf8_get_char(const byte *s) {
#if MICROPY_PY_BUILTINS_STR_UNICODE
    unichar ord = *s++;
    if (!UTF8_IS_NONASCII(ord)) return ord;
    ord &= 0x7F;
    for (unichar mask = 0x40; ord & mask; mask >>= 1) {
        ord &= ~mask;
    }
    while (UTF8_IS_CONT(*s)) {
        ord = (ord << 6) | (*s++ & 0x3F);
    }
    return ord;
#else
    return *s;
#endif
}
// Convert an index into a pointer to its lead byte. Out of bounds indexing will raise IndexError or
// be capped to the first/last character of the string, depending on is_slice.
const byte *str_index_to_ptr(const mp_obj_type_t *type, const byte *self_data, size_t self_len,
                             mp_obj_t index, bool is_slice) {
    // All str functions also handle bytes objects, and they call str_index_to_ptr(),
    // so it must handle bytes.
    if (type == &mp_type_bytes) {
        // Taken from objstr.c:str_index_to_ptr()
        size_t index_val = mp_get_index(type, self_len, index, is_slice);
        return self_data + index_val;
    }

    mp_int_t i;
    // Copied from mp_get_index; I don't want bounds checking, just give me
    // the integer as-is. (I can't bounds-check without scanning the whole
    // string; an out-of-bounds index will be caught in the loops below.)
    if (MP_OBJ_IS_SMALL_INT(index)) {
        i = MP_OBJ_SMALL_INT_VALUE(index);
    } else if (!mp_obj_get_int_maybe(index, &i)) {
        nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_TypeError, "string indices must be integers, not %s", mp_obj_get_type_str(index)));
    }
    const byte *s, *top = self_data + self_len;
    if (i < 0)
    {
        // Negative indexing is performed by counting from the end of the string.
        for (s = top - 1; i; --s) {
            if (s < self_data) {
                if (is_slice) {
                    return self_data;
                }
                mp_raise_msg(&mp_type_IndexError, "string index out of range");
            }
            if (!UTF8_IS_CONT(*s)) {
                ++i;
            }
        }
        ++s;
    } else {
        // Positive indexing, correspondingly, counts from the start of the string.
        // It's assumed that negative indexing will generally be used with small
        // absolute values (eg str[-1], not str[-1000000]), which means it'll be
        // more efficient this way.
        s = self_data;
        while (1) {
            // First check out-of-bounds
            if (s >= top) {
                if (is_slice) {
                    return top;
                }
                mp_raise_msg(&mp_type_IndexError, "string index out of range");
            }
            // Then check completion
            if (i-- == 0) {
                break;
            }
            // Then skip UTF-8 char
            ++s;
            while (UTF8_IS_CONT(*s)) {
                ++s;
            }
        }
    }
    return s;
}