Ejemplo n.º 1
0
// TODO: Rename to str_get_char
unichar utf8_get_char(const byte *s) {
#if MICROPY_PY_BUILTINS_STR_UNICODE
    unichar ord = *s++;
    if (!UTF8_IS_NONASCII(ord)) return ord;
    ord &= 0x7F;
    for (unichar mask = 0x40; ord & mask; mask >>= 1) {
        ord &= ~mask;
    }
    while (UTF8_IS_CONT(*s)) {
        ord = (ord << 6) | (*s++ & 0x3F);
    }
    return ord;
#else
    return *s;
#endif
}
Ejemplo n.º 2
0
STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) {
    mp_obj_type_t *type = mp_obj_get_type(self_in);
    assert(type == &mp_type_str);
    GET_STR_DATA_LEN(self_in, self_data, self_len);
    if (value == MP_OBJ_SENTINEL) {
        // load
#if MICROPY_PY_BUILTINS_SLICE
        if (MP_OBJ_IS_TYPE(index, &mp_type_slice)) {
            mp_obj_t ostart, ostop, ostep;
            mp_obj_slice_get(index, &ostart, &ostop, &ostep);
            if (ostep != mp_const_none && ostep != MP_OBJ_NEW_SMALL_INT(1)) {
                nlr_raise(mp_obj_new_exception_msg(&mp_type_NotImplementedError,
                    "only slices with step=1 (aka None) are supported"));
            }

            const byte *pstart, *pstop;
            if (ostart != mp_const_none) {
                pstart = str_index_to_ptr(type, self_data, self_len, ostart, true);
            } else {
                pstart = self_data;
            }
            if (ostop != mp_const_none) {
                // pstop will point just after the stop character. This depends on
                // the \0 at the end of the string.
                pstop = str_index_to_ptr(type, self_data, self_len, ostop, true);
            } else {
                pstop = self_data + self_len;
            }
            if (pstop < pstart) {
                return MP_OBJ_NEW_QSTR(MP_QSTR_);
            }
            return mp_obj_new_str_of_type(type, (const byte *)pstart, pstop - pstart);
        }
#endif
        const byte *s = str_index_to_ptr(type, self_data, self_len, index, false);
        int len = 1;
        if (UTF8_IS_NONASCII(*s)) {
            // Count the number of 1 bits (after the first)
            for (char mask = 0x40; *s & mask; mask >>= 1) {
                ++len;
            }
        }
        return mp_obj_new_str((const char*)s, len, true); // This will create a one-character string
    } else {
        return MP_OBJ_NULL; // op not supported
Ejemplo n.º 3
0
STATIC mp_obj_t stream_read_generic(size_t n_args, const mp_obj_t *args, byte flags) {
    // What to do if sz < -1?  Python docs don't specify this case.
    // CPython does a readall, but here we silently let negatives through,
    // and they will cause a MemoryError.
    mp_int_t sz;
    if (n_args == 1 || ((sz = mp_obj_get_int(args[1])) == -1)) {
        return stream_readall(args[0]);
    }

    const mp_stream_p_t *stream_p = mp_get_stream(args[0]);

    #if MICROPY_PY_BUILTINS_STR_UNICODE
    if (stream_p->is_text) {
        // We need to read sz number of unicode characters.  Because we don't have any
        // buffering, and because the stream API can only read bytes, we must read here
        // in units of bytes and must never over read.  If we want sz chars, then reading
        // sz bytes will never over-read, so we follow this approach, in a loop to keep
        // reading until we have exactly enough chars.  This will be 1 read for text
        // with ASCII-only chars, and about 2 reads for text with a couple of non-ASCII
        // chars.  For text with lots of non-ASCII chars, it'll be pretty inefficient
        // in time and memory.

        vstr_t vstr;
        vstr_init(&vstr, sz);
        mp_uint_t more_bytes = sz;
        mp_uint_t last_buf_offset = 0;
        while (more_bytes > 0) {
            char *p = vstr_add_len(&vstr, more_bytes);
            int error;
            mp_uint_t out_sz = mp_stream_read_exactly(args[0], p, more_bytes, &error);
            if (error != 0) {
                vstr_cut_tail_bytes(&vstr, more_bytes);
                if (mp_is_nonblocking_error(error)) {
                    // With non-blocking streams, we read as much as we can.
                    // If we read nothing, return None, just like read().
                    // Otherwise, return data read so far.
                    // TODO what if we have read only half a non-ASCII char?
                    if (vstr.len == 0) {
                        vstr_clear(&vstr);
                        return mp_const_none;
                    }
                    break;
                }
                mp_raise_OSError(error);
            }

            if (out_sz < more_bytes) {
                // Finish reading.
                // TODO what if we have read only half a non-ASCII char?
                vstr_cut_tail_bytes(&vstr, more_bytes - out_sz);
                if (out_sz == 0) {
                    break;
                }
            }

            // count chars from bytes just read
            for (mp_uint_t off = last_buf_offset;;) {
                byte b = vstr.buf[off];
                int n;
                if (!UTF8_IS_NONASCII(b)) {
                    // 1-byte ASCII char
                    n = 1;
                } else if ((b & 0xe0) == 0xc0) {
                    // 2-byte char
                    n = 2;
                } else if ((b & 0xf0) == 0xe0) {
                    // 3-byte char
                    n = 3;
                } else if ((b & 0xf8) == 0xf0) {
                    // 4-byte char
                    n = 4;
                } else {
                    // TODO
                    n = 5;
                }
                if (off + n <= vstr.len) {
                    // got a whole char in n bytes
                    off += n;
                    sz -= 1;
                    last_buf_offset = off;
                    if (off >= vstr.len) {
                        more_bytes = sz;
                        break;
                    }
                } else {
                    // didn't get a whole char, so work out how many extra bytes are needed for
                    // this partial char, plus bytes for additional chars that we want
                    more_bytes = (off + n - vstr.len) + (sz - 1);
                    break;
                }
            }
        }

        return mp_obj_new_str_from_vstr(&mp_type_str, &vstr);
    }
    #endif

    vstr_t vstr;
    vstr_init_len(&vstr, sz);
    int error;
    mp_uint_t out_sz = mp_stream_rw(args[0], vstr.buf, sz, &error, flags);
    if (error != 0) {
        vstr_clear(&vstr);
        if (mp_is_nonblocking_error(error)) {
            // https://docs.python.org/3.4/library/io.html#io.RawIOBase.read
            // "If the object is in non-blocking mode and no bytes are available,
            // None is returned."
            // This is actually very weird, as naive truth check will treat
            // this as EOF.
            return mp_const_none;
        }
        mp_raise_OSError(error);
    } else {
        vstr.len = out_sz;
        return mp_obj_new_str_from_vstr(STREAM_CONTENT_TYPE(stream_p), &vstr);
    }
}