/*
 * encoding_equal
 *
 * Tests whether two strings hold the same sequence of codepoints.
 *
 * Returns 1 when lhs and rhs are equal, 0 otherwise.  Two empty strings
 * compare equal.
 *
 * Cheap rejections/acceptances run first: differing character lengths,
 * both empty, same STRING header, and differing cached hash values (a
 * hashval of 0 is treated as "not cached" and is ignored).  Strings that
 * share an encoding are then compared bytewise; strings in different
 * encodings are walked with iterators and compared codepoint by codepoint.
 */
PARROT_WARN_UNUSED_RESULT
INTVAL
encoding_equal(PARROT_INTERP, ARGIN(const STRING *lhs), ARGIN(const STRING *rhs))
{
    ASSERT_ARGS(encoding_equal)
    String_iter   l_iter, r_iter;
    const UINTVAL len = STRING_length(lhs);

    if (len != STRING_length(rhs))
        return 0;
    if (len == 0)
        return 1;
    if (lhs == rhs)
        return 1;
    if (lhs->hashval && rhs->hashval && lhs->hashval != rhs->hashval)
        return 0;

    if (lhs->encoding == rhs->encoding) {
        const UINTVAL l_bytes = STRING_byte_length(lhs);

        /* Equal character counts do not imply equal byte counts in a
         * variable-width encoding.  Byte-identical strings must have the
         * same byte length, so reject a mismatch here instead of letting
         * memcmp read l_bytes bytes from a possibly shorter rhs buffer. */
        if (l_bytes != STRING_byte_length(rhs))
            return 0;

        return memcmp(lhs->strstart, rhs->strstart, l_bytes) == 0;
    }

    /* Different encodings: decode both sides in lock step. */
    STRING_ITER_INIT(interp, &l_iter);
    STRING_ITER_INIT(interp, &r_iter);

    while (l_iter.charpos < len) {
        const UINTVAL cl = STRING_iter_get_and_advance(interp, lhs, &l_iter);
        const UINTVAL cr = STRING_iter_get_and_advance(interp, rhs, &r_iter);

        if (cl != cr)
            return 0;
    }

    return 1;
}
/*
 * encoding_rindex
 *
 * Searches src backwards for search, considering only match positions at
 * or before the character index offset.
 *
 * Returns the character index in src where the match starts, or -1 when
 * offset is negative, search is empty, search is longer than src, or no
 * match is found.
 */
PARROT_WARN_UNUSED_RESULT
INTVAL
encoding_rindex(PARROT_INTERP, ARGIN(const STRING *src),
        ARGIN(const STRING *search), INTVAL offset)
{
    ASSERT_ARGS(encoding_rindex)
    const UINTVAL needle_len = search->strlen;
    String_iter   candidate;    /* current candidate start position in src */
    String_iter   needle_rest;  /* search iterator positioned after its 1st char */
    UINTVAL       first_char;
    INTVAL        last_start;

    if (offset < 0)
        return -1;
    if (needle_len == 0)
        return -1;
    if (src->strlen < needle_len)
        return -1;

    /* The latest position at which a full match still fits, capped by offset. */
    last_start = src->strlen - needle_len;
    if (offset < last_start)
        last_start = offset;

    STRING_ITER_INIT(interp, &candidate);
    STRING_iter_skip(interp, src, &candidate, last_start);

    STRING_ITER_INIT(interp, &needle_rest);
    first_char = STRING_iter_get_and_advance(interp, search, &needle_rest);

    for (;;) {
        const UINTVAL here = STRING_iter_get(interp, src, &candidate, 0);

        if (here == first_char) {
            /* First character matches: compare the remainder using copies
             * of the iterators so the candidate position is preserved. */
            String_iter needle_pos = needle_rest;
            String_iter src_pos    = candidate;

            STRING_iter_skip(interp, src, &src_pos, 1);

            for (;;) {
                UINTVAL from_src, from_needle;

                if (needle_pos.charpos >= needle_len)
                    return candidate.charpos;

                from_src    = STRING_iter_get_and_advance(interp, src, &src_pos);
                from_needle = STRING_iter_get_and_advance(interp, search, &needle_pos);

                if (from_src != from_needle)
                    break;
            }
        }

        if (candidate.charpos == 0)
            break;

        /* Step one character towards the start of src. */
        STRING_iter_skip(interp, src, &candidate, -1);
    }

    return -1;
}
/*
 * encoding_to_encoding
 *
 * Produces a new string holding the codepoints of src expressed in the
 * given target encoding.
 *
 * avg_bytes is the caller's estimate of bytes per codepoint in the target
 * encoding; it sizes the initial buffer and each later growth step.
 *
 * When src is already in the target encoding the result is
 * Parrot_str_clone(src).  An empty src yields a fresh header with the
 * target encoding and strlen 0, and no storage is allocated for it.
 */
PARROT_CANNOT_RETURN_NULL
PARROT_WARN_UNUSED_RESULT
STRING *
encoding_to_encoding(PARROT_INTERP, ARGIN(const STRING *src),
        ARGIN(const STR_VTABLE *encoding), double avg_bytes)
{
    ASSERT_ARGS(encoding_to_encoding)
    const UINTVAL widest = encoding->max_bytes_per_codepoint;
    STRING       *out;
    String_iter   reader, writer;
    UINTVAL       n_chars, capacity;

    if (src->encoding == encoding)
        return Parrot_str_clone(interp, src);

    n_chars       = src->strlen;
    out           = Parrot_gc_new_string_header(interp, 0);
    out->encoding = encoding;
    out->strlen   = n_chars;

    if (n_chars == 0)
        return out;

    /* Initial estimate, but never less than room for one widest codepoint. */
    capacity = (UINTVAL)(n_chars * avg_bytes);
    if (capacity < widest)
        capacity = widest;

    Parrot_gc_allocate_string_storage(interp, out, capacity);
    /* bufused tracks the current capacity while writing; it is set to the
     * real byte count once the loop finishes. */
    out->bufused = capacity;

    STRING_ITER_INIT(interp, &reader);
    STRING_ITER_INIT(interp, &writer);

    while (reader.charpos < n_chars) {
        const UINTVAL c      = STRING_iter_get_and_advance(interp, src, &reader);
        const UINTVAL needed = writer.bytepos + widest;

        if (needed > out->bufused) {
            /* Grow to: what this codepoint may need, plus the estimate
             * for the characters still unread. */
            const UINTVAL remaining = src_len_minus(n_chars, reader.charpos);

            capacity = (UINTVAL)(remaining * avg_bytes) + needed;
            Parrot_gc_reallocate_string_storage(interp, out, capacity);
            out->bufused = capacity;
        }

        STRING_iter_set_and_advance(interp, out, &writer, c);
    }

    out->bufused = writer.bytepos;

    return out;
}
/*
 * encoding_substr
 *
 * Returns the substring of src that starts at character index offset and
 * spans at most length characters.
 *
 * A negative offset counts from the end of the string.  A non-positive
 * length, or an offset exactly at the end of the string, yields the
 * constant empty string.  Any other offset outside the string throws
 * EXCEPTION_SUBSTR_OUT_OF_STRING.
 *
 * The result is obtained from Parrot_str_copy(src) with strstart, bufused
 * and strlen adjusted to the requested window; its cached hashval is reset.
 */
PARROT_CANNOT_RETURN_NULL
STRING *
encoding_substr(PARROT_INTERP, ARGIN(const STRING *src), INTVAL offset, INTVAL length)
{
    ASSERT_ARGS(encoding_substr)
    const UINTVAL src_len = STRING_length(src);
    STRING       *piece;
    String_iter   cursor;
    UINTVAL       first_byte;

    if (offset < 0)
        offset += src_len;

    /* Allow regexes to return $' easily for "aaa" =~ /aaa/ */
    if (length <= 0 || (UINTVAL)offset == src_len)
        return Parrot_str_new_constant(interp, "");

    if ((UINTVAL)offset > src_len)
        Parrot_ex_throw_from_c_noargs(interp, EXCEPTION_SUBSTR_OUT_OF_STRING,
            "Cannot take substr outside string");

    piece = Parrot_str_copy(interp, src);

    /* Whole string requested: the copy is already the answer. */
    if (offset == 0 && (UINTVAL)length >= src_len)
        return piece;

    STRING_ITER_INIT(interp, &cursor);

    if (offset != 0)
        STRING_iter_skip(interp, src, &cursor, offset);

    first_byte       = cursor.bytepos;
    piece->strstart += first_byte;

    if ((UINTVAL)length < src_len - (UINTVAL)offset) {
        /* Window ends inside the string: locate its last byte. */
        STRING_iter_skip(interp, src, &cursor, length);
        piece->bufused = cursor.bytepos - first_byte;
        piece->strlen  = length;
    }
    else {
        /* Window runs to the end of the string: just drop the prefix. */
        piece->bufused -= first_byte;
        piece->strlen  -= offset;
    }

    piece->hashval = 0;

    return piece;
}
/*
 * encoding_find_cclass
 *
 * Scans src from character index offset, for at most count characters,
 * looking for the first codepoint that belongs to a character class
 * selected by flags.
 *
 * Codepoints below 256 are tested against Parrot_iso_8859_1_typetable;
 * all others are passed to u_iscclass.
 *
 * Returns the character index of the first match, or the end of the
 * scanned window (offset + count, clamped to the string length) when
 * nothing matches or the window is empty.
 *
 * The last match position beyond character 128 is remembered in
 * function-level statics so that a later call on the same STRING header,
 * at or after that position, can resume from the cached iterator instead
 * of skipping from the start of the string.
 *
 * NOTE(review): the cache is keyed on the STRING pointer alone and has no
 * locking; it is not thread-safe, and nothing here invalidates it if a
 * different string later occupies the same address -- confirm that callers
 * and the GC make this safe.
 */
PARROT_WARN_UNUSED_RESULT
INTVAL
encoding_find_cclass(PARROT_INTERP, INTVAL flags, ARGIN(const STRING *src),
        UINTVAL offset, UINTVAL count)
{
    ASSERT_ARGS(encoding_find_cclass)
    String_iter iter;
    UINTVAL     codepoint;
    UINTVAL     end = offset + count;

    static UINTVAL      last_char_offset;
    static String_iter  cached_iter;
    static STRING      *last_string = 0;

    end = src->strlen < end ? src->strlen : end;

    /* Empty window (count == 0, or offset at/after the end of the string):
     * the scan loop below would not run and `end` would be returned anyway.
     * Returning here avoids skipping the iterator beyond the end of the
     * string when offset > src->strlen, a case encoding_find_not_cclass
     * also rejects before positioning its iterator. */
    if (offset >= end)
        return end;

    if (last_string == src && offset > last_char_offset) {
        /* Resume from the cached position and skip only the difference. */
        iter = cached_iter;
        STRING_iter_skip(interp, src, &iter, offset - last_char_offset);
    }
    else if (last_string == src && offset == last_char_offset) {
        iter = cached_iter;
    }
    else {
        STRING_ITER_INIT(interp, &iter);
        STRING_iter_skip(interp, src, &iter, offset);
    }

    while (iter.charpos < end) {
        codepoint = STRING_iter_get_and_advance(interp, src, &iter);

        if (codepoint >= 256) {
            if (u_iscclass(interp, codepoint, flags))
                goto return_and_cache;
        }
        else {
            if (Parrot_iso_8859_1_typetable[codepoint] & flags)
                goto return_and_cache;
        }
    }

    return end;

  return_and_cache:
    /* Only positions past character 128 are cached. */
    if (iter.charpos > 128) {
        last_char_offset = iter.charpos;
        cached_iter      = iter;
        last_string      = (STRING*)PTR2INTVAL(src);
    }

    /* The iterator has already advanced past the matching codepoint. */
    return iter.charpos - 1;
}
/*
 * encoding_compare
 *
 * Orders two strings by comparing their codepoints pairwise.
 *
 * Returns -1 if lhs sorts before rhs, 1 if it sorts after, and 0 if both
 * are equal.  When one string is a prefix of the other, the shorter one
 * sorts first.
 */
PARROT_WARN_UNUSED_RESULT
INTVAL
encoding_compare(PARROT_INTERP, ARGIN(const STRING *lhs), ARGIN(const STRING *rhs))
{
    ASSERT_ARGS(encoding_compare)
    const UINTVAL l_len = STRING_length(lhs);
    const UINTVAL r_len = STRING_length(rhs);
    String_iter   l_pos, r_pos;
    UINTVAL       common;

    /* Empty operands need no iteration. */
    if (r_len == 0)
        return l_len != 0;
    if (l_len == 0)
        return -1;

    common = l_len < r_len ? l_len : r_len;

    STRING_ITER_INIT(interp, &l_pos);
    STRING_ITER_INIT(interp, &r_pos);

    while (l_pos.charpos < common) {
        const UINTVAL a = STRING_iter_get_and_advance(interp, lhs, &l_pos);
        const UINTVAL b = STRING_iter_get_and_advance(interp, rhs, &r_pos);

        if (a < b)
            return -1;
        if (a > b)
            return 1;
    }

    /* Common prefix is identical: the lengths decide. */
    return (l_len > r_len) - (l_len < r_len);
}
/*
 * encoding_index
 *
 * Searches src for search, starting at character index offset.
 *
 * Returns -1 when search is empty or offset (taken as an unsigned value,
 * so negative offsets are rejected too) is not inside src.  Otherwise the
 * result is whatever Parrot_str_iter_index reports for a search that
 * begins at offset.
 */
PARROT_WARN_UNUSED_RESULT
INTVAL
encoding_index(PARROT_INTERP, ARGIN(const STRING *src), ARGIN(const STRING *search),
        INTVAL offset)
{
    ASSERT_ARGS(encoding_index)
    String_iter from;       /* where the search begins in src */
    String_iter match_end;  /* filled in by Parrot_str_iter_index; unused here */

    if (!STRING_length(search))
        return -1;

    if ((UINTVAL)offset >= STRING_length(src))
        return -1;

    STRING_ITER_INIT(interp, &from);
    STRING_iter_skip(interp, src, &from, offset);

    return Parrot_str_iter_index(interp, src, &from, &match_end, search);
}
/*
 * ascii_to_encoding
 *
 * Converts src to the ASCII encoding and returns the new string.
 *
 * Throws EXCEPTION_LOSSY_CONVERSION if src contains any codepoint at or
 * above 0x80.
 *
 * Sources whose encoding uses one byte per codepoint are validated in
 * place and then cloned with the encoding pointer replaced; all other
 * sources are decoded with an iterator into a freshly created ASCII string.
 */
PARROT_CANNOT_RETURN_NULL
static STRING *
ascii_to_encoding(PARROT_INTERP, ARGIN(const STRING *src))
{
    ASSERT_ARGS(ascii_to_encoding)
    STRING *dest;

    if (STRING_max_bytes_per_codepoint(src) != 1) {
        const UINTVAL  n_chars = src->strlen;
        String_iter    reader;
        unsigned char *out;
        UINTVAL        written = 0;

        dest = Parrot_str_new_init(interp, NULL, n_chars,
                Parrot_ascii_encoding_ptr, 0);
        out  = (unsigned char *)dest->strstart;

        for (STRING_ITER_INIT(interp, &reader); reader.charpos < n_chars;) {
            const UINTVAL c = STRING_iter_get_and_advance(interp, src, &reader);

            if (c >= 0x80)
                Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LOSSY_CONVERSION,
                    "can't convert unicode string to ascii");

            out[written++] = c;
        }

        dest->bufused = n_chars;
        dest->strlen  = n_chars;
    }
    else {
        /* One byte per codepoint: every byte is a codepoint, so checking
         * the raw bytes is enough. */
        const unsigned char *bytes = (const unsigned char *)src->strstart;
        UINTVAL              idx   = 0;

        while (idx < src->strlen) {
            const UINTVAL c = bytes[idx];

            if (c >= 0x80)
                Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_LOSSY_CONVERSION,
                    "lossy conversion to ascii");

            ++idx;
        }

        dest           = Parrot_str_clone(interp, src);
        dest->encoding = Parrot_ascii_encoding_ptr;
    }

    return dest;
}
/*
 * ucs4_to_encoding
 *
 * Converts src to the UCS-4 encoding, storing one utf32_t per codepoint.
 *
 * A source that is already UCS-4 is returned via Parrot_str_copy.
 * Otherwise a new string of src->strlen * 4 bytes is created and filled:
 * directly from the raw bytes when the source encoding uses one byte per
 * codepoint, or through a string iterator for any other encoding.
 */
PARROT_WARN_UNUSED_RESULT
PARROT_CANNOT_RETURN_NULL
static STRING *
ucs4_to_encoding(PARROT_INTERP, ARGIN(const STRING *src))
{
    ASSERT_ARGS(ucs4_to_encoding)
    const UINTVAL n_chars = src->strlen;
    const UINTVAL n_bytes = n_chars * 4;
    STRING       *out;
    utf32_t      *cells;

    if (src->encoding == Parrot_ucs4_encoding_ptr)
        return Parrot_str_copy(interp, src);

    out   = Parrot_str_new_init(interp, NULL, n_bytes, Parrot_ucs4_encoding_ptr, 0);
    cells = (utf32_t *)out->strstart;

    if (STRING_max_bytes_per_codepoint(src) != 1) {
        String_iter reader;

        STRING_ITER_INIT(interp, &reader);

        while (reader.charpos < n_chars) {
            /* Capture the index before the iterator advances. */
            const UINTVAL at = reader.charpos;

            cells[at] = STRING_iter_get_and_advance(interp, src, &reader);
        }
    }
    else {
        /* One byte per codepoint: widen each byte into its own cell. */
        const unsigned char *in = (const unsigned char *)src->strstart;
        UINTVAL              k  = 0;

        while (k < n_chars) {
            cells[k] = in[k];
            ++k;
        }
    }

    out->strlen  = n_chars;
    out->bufused = n_bytes;

    return out;
}
/*
 * encoding_hash
 *
 * Computes a hash over the codepoints of src, starting from the seed
 * hashval.  Each codepoint c updates the running value to
 * hashval * 33 + c (written as a shift and adds).
 *
 * The result is stored in src->hashval (constness is cast away for that
 * purpose) and also returned.
 */
PARROT_WARN_UNUSED_RESULT
size_t
encoding_hash(PARROT_INTERP, ARGIN(const STRING *src), size_t hashval)
{
    ASSERT_ARGS(encoding_hash)
    DECL_CONST_CAST;
    STRING * const writable = PARROT_const_cast(STRING *, src);
    String_iter    pos;

    for (STRING_ITER_INIT(interp, &pos); pos.charpos < writable->strlen;) {
        const UINTVAL c = STRING_iter_get_and_advance(interp, writable, &pos);

        hashval = (hashval << 5) + hashval + c;
    }

    /* Cache the value on the string header. */
    writable->hashval = hashval;

    return hashval;
}
/*
 * encoding_find_not_cclass
 *
 * Scans src from character index offset, for at most count characters,
 * looking for the first codepoint that is NOT in the character class
 * selected by flags.
 *
 * Codepoints below 256 stop the scan when their entry in
 * Parrot_iso_8859_1_typetable shares no bit with flags.  Codepoints from
 * 256 up stop the scan when u_iscclass rejects them for any single class
 * bit (enum_cclass_uppercase through enum_cclass_word) that is set in flags.
 *
 * Returns the character index of the first such codepoint, or the end of
 * the scanned window (offset + count, clamped to the string length) when
 * there is none.  If offset lies beyond the end of the string, the
 * unclamped offset + count is returned.  With flags == enum_cclass_any the
 * clamped end is returned without scanning.
 *
 * The last hit position beyond character 128 is remembered in
 * function-level statics so that a later call on the same STRING header,
 * at or after that position, can resume from the cached iterator.
 *
 * NOTE(review): the cache is keyed on the STRING pointer alone and has no
 * locking; it is not thread-safe, and nothing here invalidates it if a
 * different string later occupies the same address -- confirm that callers
 * and the GC make this safe.
 */
PARROT_WARN_UNUSED_RESULT
INTVAL
encoding_find_not_cclass(PARROT_INTERP, INTVAL flags, ARGIN(const STRING *src),
        UINTVAL offset, UINTVAL count)
{
    ASSERT_ARGS(encoding_find_not_cclass)
    String_iter iter;
    UINTVAL     codepoint;
    UINTVAL     end = offset + count;
    int         bit;

    static UINTVAL      last_char_offset;
    static String_iter  cached_iter;
    static STRING      *last_string = 0;

    if (offset > src->strlen) {
        /* XXX: Throw in this case? */
        return offset + count;
    }

    end = src->strlen < end ? src->strlen : end;

    /* Decide the no-scan cases before positioning the iterator: neither
     * "every codepoint is in the class" nor an empty window uses it, and
     * positioning costs a walk over up to `offset` characters. */
    if (flags == enum_cclass_any || offset >= end)
        return end;

    if (last_string == src && offset > last_char_offset) {
        /* Resume from the cached position and skip only the difference. */
        iter = cached_iter;
        STRING_iter_skip(interp, src, &iter, offset - last_char_offset);
    }
    else if (last_string == src && offset == last_char_offset) {
        iter = cached_iter;
    }
    else {
        STRING_ITER_INIT(interp, &iter);

        if (offset)
            STRING_iter_skip(interp, src, &iter, offset);
    }

    while (iter.charpos < end) {
        codepoint = STRING_iter_get_and_advance(interp, src, &iter);

        if (codepoint >= 256) {
            /* Test each requested class bit on its own. */
            for (bit = enum_cclass_uppercase;
                    bit <= enum_cclass_word; bit <<= 1) {
                if ((bit & flags) && !u_iscclass(interp, codepoint, bit))
                    goto return_and_cache;
            }
        }
        else {
            if (!(Parrot_iso_8859_1_typetable[codepoint] & flags))
                goto return_and_cache;
        }
    }

    return end;

  return_and_cache:
    /* Only positions past character 128 are cached. */
    if (iter.charpos > 128) {
        last_char_offset = iter.charpos;
        cached_iter      = iter;
        last_string      = (STRING*)PTR2INTVAL(src);
    }

    /* The iterator has already advanced past the codepoint that was found. */
    return iter.charpos - 1;
}