/*
 * Scans a buffer of UCS-4 code units, validating each one, and reports how
 * much of the buffer was consumed.
 *
 * On entry:
 *   bounds->bytes  number of bytes available in buf
 *   bounds->chars  maximum number of characters to scan; a negative value
 *                  imposes no limit beyond the byte count
 *   bounds->delim  code point at which scanning stops (the matching code
 *                  point is included in the scanned range)
 *
 * On return:
 *   bounds->bytes  number of bytes scanned (always a multiple of 4)
 *   bounds->chars  number of characters scanned
 *   bounds->delim  last code point read, or -1 if nothing was read
 *
 * Throws EXCEPTION_INVALID_CHARACTER if UNICODE_IS_INVALID flags a code
 * point. Always returns 0.
 */
static INTVAL
ucs4_partial_scan(PARROT_INTERP, ARGIN(const char *buf),
        ARGMOD(Parrot_String_Bounds *bounds))
{
    ASSERT_ARGS(ucs4_partial_scan)
    const utf32_t * const ptr   = (const utf32_t *)buf;
    /* Each UCS-4 code unit is four bytes wide, so the number of complete
     * units in the buffer is bytes / 4. This must mirror the "len << 2"
     * written back below; a smaller shift would let the loop read past
     * the end of the buffer. */
    UINTVAL               len   = bounds->bytes >> 2;
    const INTVAL          chars = bounds->chars;
    const INTVAL          delim = bounds->delim;
    INTVAL                c     = -1;
    UINTVAL               i;

    /* Honour the caller's character limit when it is tighter than the
     * number of units actually available. */
    if (chars >= 0 && (UINTVAL)chars < len)
        len = chars;

    for (i = 0; i < len; ++i) {
        c = ptr[i];

        if (UNICODE_IS_INVALID(c))
            Parrot_ex_throw_from_c_noargs(interp, EXCEPTION_INVALID_CHARACTER,
                "Invalid character in UCS-4 string");

        if (c == delim) {
            /* Include the delimiter itself in the scanned range. */
            len = i + 1;
            break;
        }
    }

    bounds->bytes = len << 2;
    bounds->chars = len;
    bounds->delim = c;

    return 0;
}
/*
 * Throws EXCEPTION_UNEXPECTED_NULL with the message
 * "Invalid operation on null string".
 *
 * NOTE(review): presumably installed as the handler for operations that are
 * attempted on a null STRING; the callers are not visible here -- confirm.
 */
static void
null_error(PARROT_INTERP)
{
    ASSERT_ARGS(null_error)

    Parrot_ex_throw_from_c_noargs(interp, EXCEPTION_UNEXPECTED_NULL,
        "Invalid operation on null string");
}
/*
 * Embedding-API entry point that obtains a PIR compiler PMC and stores it
 * in *compiler.
 *
 * interp_pmc   interpreter PMC handed in by the API caller
 * add_compreg  forwarded unchanged to get_compreg_pmc()
 * compiler     out-parameter receiving the compiler PMC
 *
 * Throws EXCEPTION_UNEXPECTED_NULL if get_compreg_pmc() yields a null PMC.
 *
 * NOTE(review): this function declares no local "interp" and has no explicit
 * return statement, so IMCC_API_CALLIN / IMCC_API_CALLOUT must introduce the
 * interpreter variable and produce the Parrot_Int result -- confirm against
 * the macro definitions before restructuring this body.
 */
PARROT_EXPORT
PARROT_WARN_UNUSED_RESULT
Parrot_Int
imcc_get_pir_compreg_api(Parrot_PMC interp_pmc, int add_compreg,
        ARGOUT(Parrot_PMC *compiler))
{
    ASSERT_ARGS(imcc_get_pir_compreg_api)
    IMCC_API_CALLIN(interp_pmc, interp)
    /* NOTE(review): the literal 0 presumably selects the PIR (as opposed to
     * PASM) compiler -- verify against get_compreg_pmc(). */
    *compiler = get_compreg_pmc(interp, 0, add_compreg);
    if (PMC_IS_NULL(*compiler))
        Parrot_ex_throw_from_c_noargs(interp, EXCEPTION_UNEXPECTED_NULL,
            "Could not create PIR compiler PMC");
    IMCC_API_CALLOUT(interp_pmc, interp)
}
/*
 * Validates the whole buffer of a UCS-4 string and records its length in
 * characters.
 *
 * Throws EXCEPTION_INVALID_CHARACTER when the used byte count is not a
 * multiple of four, or when UNICODE_IS_INVALID flags any code unit.
 * On success src->strlen is set to the number of four-byte units.
 */
static void
ucs4_scan(PARROT_INTERP, ARGMOD(STRING *src))
{
    ASSERT_ARGS(ucs4_scan)
    const utf32_t * const units = (utf32_t *)src->strstart;
    const UINTVAL         count = src->bufused >> 2;
    UINTVAL               idx   = 0;

    /* A trailing partial unit means the buffer cannot be well-formed. */
    if ((src->bufused & 3) != 0) {
        Parrot_ex_throw_from_c_noargs(interp, EXCEPTION_INVALID_CHARACTER,
            "Unaligned end in UCS-4 string");
    }

    while (idx < count) {
        const UINTVAL unit = units[idx];

        if (UNICODE_IS_INVALID(unit)) {
            Parrot_ex_throw_from_c_noargs(interp, EXCEPTION_INVALID_CHARACTER,
                "Invalid character in UCS-4 string");
        }

        ++idx;
    }

    src->strlen = count;
}
/*
 * Returns the substring of src that starts at character index offset and
 * spans at most length characters.
 *
 * A negative offset is taken relative to the end of the string. A
 * non-positive length, or an offset equal to the string length, yields the
 * constant empty string. Any other offset outside the string throws
 * EXCEPTION_SUBSTR_OUT_OF_STRING.
 *
 * The result is produced by Parrot_str_copy() and then narrowed by
 * adjusting strstart, bufused and strlen; its cached hashval is reset to 0.
 */
PARROT_CANNOT_RETURN_NULL
STRING *
encoding_substr(PARROT_INTERP, ARGIN(const STRING *src), INTVAL offset, INTVAL length)
{
    ASSERT_ARGS(encoding_substr)

    const UINTVAL  src_len = STRING_length(src);
    STRING        *result;
    String_iter    pos;
    UINTVAL        first_byte;

    /* Negative offsets count back from the end of the string. */
    if (offset < 0)
        offset += src_len;

    /* Allow regexes to return $' easily for "aaa" =~ /aaa/ */
    if (length <= 0 || (UINTVAL)offset == src_len)
        return Parrot_str_new_constant(interp, "");

    /* Also catches an offset that is still negative after adjustment,
     * because the unsigned cast turns it into a huge value. */
    if ((UINTVAL)offset > src_len) {
        Parrot_ex_throw_from_c_noargs(interp, EXCEPTION_SUBSTR_OUT_OF_STRING,
            "Cannot take substr outside string");
    }

    result = Parrot_str_copy(interp, src);

    /* The request covers the entire string: the copy is already correct. */
    if (offset == 0 && (UINTVAL)length >= src_len)
        return result;

    STRING_ITER_INIT(interp, &pos);

    if (offset != 0)
        STRING_iter_skip(interp, src, &pos, offset);

    first_byte        = pos.bytepos;
    result->strstart += first_byte;

    if ((UINTVAL)length < src_len - (UINTVAL)offset) {
        /* The substring ends before the end of src: locate its end byte. */
        STRING_iter_skip(interp, src, &pos, length);
        result->bufused = pos.bytepos - first_byte;
        result->strlen  = length;
    }
    else {
        /* The substring runs to the end of src: just drop the prefix. */
        result->bufused -= first_byte;
        result->strlen  -= offset;
    }

    result->hashval = 0;

    return result;
}
/*
 * Throws EXCEPTION_ORD_OUT_OF_STRING with a message chosen from the string
 * length and the sign of offset:
 *   - empty string
 *   - non-negative offset (reported as past the end)
 *   - negative offset (reported as before the beginning)
 */
PARROT_DOES_NOT_RETURN
void
encoding_ord_error(PARROT_INTERP, ARGIN(const STRING *s), INTVAL offset)
{
    ASSERT_ARGS(encoding_ord_error)
    const UINTVAL  len     = STRING_length(s);
    /* Defensive default; the two offset tests below are exhaustive, so this
     * message is never actually selected. */
    const char    *err_msg = "Unknown encoding_ord_error";

    if (len == 0) {
        err_msg = "Cannot get character of empty string";
    }
    else if (offset >= 0) {
        err_msg = "Cannot get character past end of string";
    }
    else if (offset < 0) {
        err_msg = "Cannot get character before beginning of string";
    }

    Parrot_ex_throw_from_c_noargs(interp, EXCEPTION_ORD_OUT_OF_STRING, err_msg);
}
/*
 * Reports whether codepoint belongs to any of the character classes whose
 * bits are set in flags. Returns 1 on a match and 0 otherwise.
 *
 * With ICU available the ICU character-property functions decide. Without
 * ICU, code points below 256 are looked up in Parrot_iso_8859_1_typetable
 * and only a few classes are approximated for higher code points; asking
 * for any other class there throws EXCEPTION_LIBRARY_ERROR.
 */
static int
u_iscclass(PARROT_INTERP, UINTVAL codepoint, INTVAL flags)
{
    ASSERT_ARGS(u_iscclass)
#if PARROT_HAS_ICU
    UNUSED(interp);

    /* XXX open question carried over from the original author: which one --
     * return u_charDigitValue(codepoint)? */

    /* Short-circuit evaluation keeps the same test order as a chain of
     * early returns; the result of || is already 0 or 1. */
    return ((flags & enum_cclass_uppercase)    && u_isupper(codepoint))
        || ((flags & enum_cclass_lowercase)    && u_islower(codepoint))
        || ((flags & enum_cclass_alphabetic)   && u_isalpha(codepoint))
        || ((flags & enum_cclass_numeric)      && u_isdigit(codepoint))
        || ((flags & enum_cclass_hexadecimal)  && u_isxdigit(codepoint))
        || ((flags & enum_cclass_whitespace)   && u_isspace(codepoint))
        || ((flags & enum_cclass_printing)     && u_isprint(codepoint))
        || ((flags & enum_cclass_graphical)    && u_isgraph(codepoint))
        || ((flags & enum_cclass_blank)        && u_isblank(codepoint))
        || ((flags & enum_cclass_control)      && u_iscntrl(codepoint))
        || ((flags & enum_cclass_alphanumeric) && u_isalnum(codepoint))
        || ((flags & enum_cclass_word)
                && (u_isalnum(codepoint) || codepoint == '_'))
        || ((flags & enum_cclass_newline)
                && (codepoint == 0x2028 || codepoint == 0x2029
                    || u_hasBinaryProperty(codepoint, UCHAR_LINE_BREAK)));
#else
    /* Latin-1 range: the static type table answers directly. */
    if (codepoint < 256)
        return (Parrot_iso_8859_1_typetable[codepoint] & flags) != 0;

    if (flags == enum_cclass_any)
        return 1;

    /* All codepoints from u+0100 to u+02af are alphabetic, so we
     * cheat on the WORD and ALPHABETIC properties to include these
     * (and incorrectly exclude all others). This is a stopgap until
     * ICU is everywhere, or we have better non-ICU unicode support. */
    if (flags == enum_cclass_word || flags == enum_cclass_alphabetic)
        return codepoint < 0x2b0 ? 1 : 0;

    if (flags & enum_cclass_whitespace) {
        /* from http://www.unicode.org/Public/UNIDATA/PropList.txt */
        if (codepoint == 0x1680
         || codepoint == 0x180e
         || (codepoint >= 0x2000 && codepoint <= 0x200a)
         || codepoint == 0x2028
         || codepoint == 0x2029
         || codepoint == 0x202f
         || codepoint == 0x205f
         || codepoint == 0x3000)
            return 1;
    }

    if (flags & enum_cclass_numeric) {
        /* from http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
         * Each entry is the first code point of a run of ten consecutive
         * code points that are accepted as numeric. */
        static const UINTVAL digit_runs[] = {
            0x0660, 0x06f0, 0x07c0, 0x0966, 0x09e6, 0x0a66,
            0x0ae6, 0x0b66, 0x0be6, 0x0c66, 0x0ce6, 0x0d66,
            0x0e50, 0x0ed0, 0x0f20, 0x1040, 0x17e0, 0x1810,
            0x1946, 0x19d0, 0x1b50, 0xff10
        };
        UINTVAL k;

        for (k = 0; k < sizeof (digit_runs) / sizeof (digit_runs[0]); ++k) {
            if (codepoint >= digit_runs[k] && codepoint <= digit_runs[k] + 9)
                return 1;
        }
    }

    if (flags & enum_cclass_newline) {
        /* from http://www.unicode.org/Public/UNIDATA/extracted/DerivedLineBreak.txt
         * Line_Break=Mandatory_Break */
        if (codepoint == 0x2028 || codepoint == 0x2029)
            return 1;
    }

    /* Any class other than the three approximated above cannot be answered
     * without ICU for code points >= 256. */
    if (flags & ~(enum_cclass_whitespace | enum_cclass_numeric | enum_cclass_newline)) {
        Parrot_ex_throw_from_c_noargs(interp, EXCEPTION_LIBRARY_ERROR,
            "no ICU lib loaded");
    }

    return 0;
#endif
}