Beispiel #1
0
/*
 * ucs4_partial_scan
 *
 * Scans a raw buffer of UCS-4 code units and reports, through `bounds`,
 * how far the scan got.
 *
 * buf    - buffer read as an array of utf32_t code units.
 * bounds - in/out description of the scan:
 *            bytes: in  - number of bytes available in buf;
 *                   out - number of bytes covered by the scan.
 *            chars: in  - maximum number of characters to scan; a negative
 *                         value means "no limit";
 *                   out - number of characters covered by the scan.
 *            delim: in  - code point that ends the scan (it is included
 *                         in the reported counts);
 *                   out - last code point read, or -1 if none was read.
 *
 * Throws EXCEPTION_INVALID_CHARACTER when UNICODE_IS_INVALID() rejects a
 * code point.  Always returns 0.
 */
static INTVAL
ucs4_partial_scan(PARROT_INTERP, ARGIN(const char *buf),
        ARGMOD(Parrot_String_Bounds *bounds))
{
    ASSERT_ARGS(ucs4_partial_scan)
    const utf32_t * const ptr = (const utf32_t *)buf;
    /* A UCS-4 code unit is four bytes wide, so only bytes / 4 whole units
     * are available.  This must mirror the "len << 2" conversion below;
     * dividing by two would let the loop index past the end of buf. */
    UINTVAL               len   = bounds->bytes >> 2;
    const INTVAL          chars = bounds->chars;
    const INTVAL          delim = bounds->delim;
    INTVAL                c     = -1;
    UINTVAL               i;

    /* Honour the caller's character limit when it is the tighter bound. */
    if (chars >= 0 && (UINTVAL)chars < len)
        len = chars;

    for (i = 0; i < len; ++i) {
        c = ptr[i];

        if (UNICODE_IS_INVALID(c))
            Parrot_ex_throw_from_c_noargs(interp, EXCEPTION_INVALID_CHARACTER,
                    "Invalid character in UCS-4 string");

        if (c == delim) {
            /* The delimiter itself counts as scanned. */
            len = i + 1;
            break;
        }
    }

    bounds->bytes = len << 2;
    bounds->chars = len;
    bounds->delim = c;

    return 0;
}
Beispiel #2
0
static void
null_error(PARROT_INTERP)
{
    ASSERT_ARGS(null_error)

    Parrot_ex_throw_from_c_noargs(interp, EXCEPTION_UNEXPECTED_NULL,
        "Invalid operation on null string");
}
Beispiel #3
0
/*
 * imcc_get_pir_compreg_api
 *
 * Fetches a compiler PMC via get_compreg_pmc() and stores it in *compiler.
 *
 * interp_pmc  - interpreter PMC handed to the IMCC_API_CALLIN / CALLOUT
 *               macros.
 * add_compreg - forwarded unchanged as the third argument of
 *               get_compreg_pmc().
 * compiler    - out parameter receiving the PMC returned by
 *               get_compreg_pmc().
 *
 * Throws EXCEPTION_UNEXPECTED_NULL if the returned PMC is null.
 *
 * NOTE(review): `interp` is not declared in this function and there is no
 * explicit return statement; both presumably come from the expansion of
 * IMCC_API_CALLIN / IMCC_API_CALLOUT - confirm against their definitions.
 */
PARROT_EXPORT
PARROT_WARN_UNUSED_RESULT
Parrot_Int
imcc_get_pir_compreg_api(Parrot_PMC interp_pmc, int add_compreg, ARGOUT(Parrot_PMC *compiler))
{
    ASSERT_ARGS(imcc_get_pir_compreg_api)
    IMCC_API_CALLIN(interp_pmc, interp)
    /* NOTE(review): the literal 0 presumably selects the PIR (as opposed to
     * PASM) compiler - verify against get_compreg_pmc(). */
    *compiler = get_compreg_pmc(interp, 0, add_compreg);
    if (PMC_IS_NULL(*compiler))
        Parrot_ex_throw_from_c_noargs(interp, EXCEPTION_UNEXPECTED_NULL,
            "Could not create PIR compiler PMC");
    IMCC_API_CALLOUT(interp_pmc, interp)
}
Beispiel #4
0
/*
 * ucs4_scan
 *
 * Validates the contents of a UCS-4 string buffer and records its length
 * in characters.
 *
 * src - string whose buffer (strstart, bufused bytes) is checked; on
 *       success src->strlen is set to bufused / 4.
 *
 * Throws EXCEPTION_INVALID_CHARACTER when bufused is not a multiple of
 * four or when UNICODE_IS_INVALID() rejects a code point.
 */
static void
ucs4_scan(PARROT_INTERP, ARGMOD(STRING *src))
{
    ASSERT_ARGS(ucs4_scan)
    const UINTVAL   unit_count = src->bufused >> 2;
    const utf32_t  *cursor     = (utf32_t *)src->strstart;
    UINTVAL         remaining;

    /* A trailing partial code unit means the buffer is malformed. */
    if ((src->bufused & 3) != 0)
        Parrot_ex_throw_from_c_noargs(interp, EXCEPTION_INVALID_CHARACTER,
            "Unaligned end in UCS-4 string");

    for (remaining = unit_count; remaining > 0; --remaining, ++cursor) {
        const UINTVAL codepoint = *cursor;

        if (UNICODE_IS_INVALID(codepoint))
            Parrot_ex_throw_from_c_noargs(interp, EXCEPTION_INVALID_CHARACTER,
                    "Invalid character in UCS-4 string");
    }

    src->strlen = unit_count;
}
Beispiel #5
0
/*
 * encoding_substr
 *
 * Returns the substring of `src` that starts at character `offset` and
 * spans at most `length` characters.
 *
 * offset - character index; a negative value has the string length added
 *          to it before use.
 * length - requested number of characters; it is clamped to what remains
 *          after `offset`.
 *
 * An empty constant string is returned when `length` is not positive or
 * when `offset` equals the string length.  An offset beyond the end of the
 * string throws EXCEPTION_SUBSTR_OUT_OF_STRING.
 *
 * The result is produced from Parrot_str_copy(src) by adjusting strstart,
 * bufused and strlen on the copy, and resetting its cached hashval.
 */
PARROT_CANNOT_RETURN_NULL
STRING *
encoding_substr(PARROT_INTERP, ARGIN(const STRING *src), INTVAL offset, INTVAL length)
{
    ASSERT_ARGS(encoding_substr)
    const UINTVAL  total_chars = STRING_length(src);
    String_iter    it;
    STRING        *result;
    UINTVAL        first_byte;

    if (offset < 0)
        offset += total_chars;

    /* Allow regexes to return $' easily for "aaa" =~ /aaa/ */
    if (length <= 0 || (UINTVAL)offset == total_chars)
        return Parrot_str_new_constant(interp, "");

    if ((UINTVAL)offset > total_chars)
        Parrot_ex_throw_from_c_noargs(interp,
            EXCEPTION_SUBSTR_OUT_OF_STRING,
            "Cannot take substr outside string");

    result = Parrot_str_copy(interp, src);

    /* The request covers the whole string: the plain copy is the answer. */
    if (offset == 0 && (UINTVAL)length >= total_chars)
        return result;

    STRING_ITER_INIT(interp, &it);

    if (offset != 0)
        STRING_iter_skip(interp, src, &it, offset);

    first_byte        = it.bytepos;
    result->strstart += first_byte;

    if ((UINTVAL)length < total_chars - (UINTVAL)offset) {
        /* The substring ends inside the string: locate its end byte. */
        STRING_iter_skip(interp, src, &it, length);
        result->bufused = it.bytepos - first_byte;
        result->strlen  = length;
    }
    else {
        /* The substring runs to the end: just drop the leading part. */
        result->bufused -= first_byte;
        result->strlen  -= offset;
    }

    result->hashval = 0;

    return result;
}
Beispiel #6
0
/*
 * encoding_ord_error
 *
 * Throws EXCEPTION_ORD_OUT_OF_STRING with a message chosen from the
 * string length and the sign of the offending offset:
 *
 *   - empty string        -> "Cannot get character of empty string"
 *   - offset >= 0         -> "Cannot get character past end of string"
 *   - offset <  0         -> "Cannot get character before beginning of string"
 *
 * s      - string that was being indexed.
 * offset - the out-of-range character offset.
 */
PARROT_DOES_NOT_RETURN
void
encoding_ord_error(PARROT_INTERP, ARGIN(const STRING *s), INTVAL offset)
{
    ASSERT_ARGS(encoding_ord_error)
    const UINTVAL len = STRING_length(s);
    const char   *err_msg;

    /* The three cases are exhaustive: an offset is either non-negative or
     * negative, so no "unknown" fallback is needed. */
    if (!len)
        err_msg = "Cannot get character of empty string";
    else if (offset >= 0)
        err_msg = "Cannot get character past end of string";
    else
        err_msg = "Cannot get character before beginning of string";

    Parrot_ex_throw_from_c_noargs(interp, EXCEPTION_ORD_OUT_OF_STRING,
        err_msg);
}
Beispiel #7
0
/*
 * u_iscclass
 *
 * Tests whether `codepoint` belongs to at least one of the character
 * classes selected by the enum_cclass_* bits in `flags`.
 *
 * Returns 1 on a match and 0 otherwise.
 *
 * With ICU (PARROT_HAS_ICU) every class is answered by the corresponding
 * ICU predicate.
 *
 * Without ICU:
 *   - code points below 256 are looked up in Parrot_iso_8859_1_typetable;
 *   - enum_cclass_any always matches;
 *   - WORD / ALPHABETIC (when they are the only flag) match U+0100..U+02AF;
 *   - WHITESPACE, NUMERIC and NEWLINE use small hard-coded tables;
 *   - any other requested class that was not matched above throws
 *     EXCEPTION_LIBRARY_ERROR ("no ICU lib loaded").
 */
static int
u_iscclass(PARROT_INTERP, UINTVAL codepoint, INTVAL flags)
{
    ASSERT_ARGS(u_iscclass)
#if PARROT_HAS_ICU
    UNUSED(interp);
    /* XXX unresolved: should the numeric class use u_isdigit() or
     * u_charDigitValue()? */
    if ((flags & enum_cclass_uppercase)    && u_isupper(codepoint))  return 1;
    if ((flags & enum_cclass_lowercase)    && u_islower(codepoint))  return 1;
    if ((flags & enum_cclass_alphabetic)   && u_isalpha(codepoint))  return 1;
    if ((flags & enum_cclass_numeric)      && u_isdigit(codepoint))  return 1;
    if ((flags & enum_cclass_hexadecimal)  && u_isxdigit(codepoint)) return 1;
    if ((flags & enum_cclass_whitespace)   && u_isspace(codepoint))  return 1;
    if ((flags & enum_cclass_printing)     && u_isprint(codepoint))  return 1;
    if ((flags & enum_cclass_graphical)    && u_isgraph(codepoint))  return 1;
    if ((flags & enum_cclass_blank)        && u_isblank(codepoint))  return 1;
    if ((flags & enum_cclass_control)      && u_iscntrl(codepoint))  return 1;
    if ((flags & enum_cclass_alphanumeric) && u_isalnum(codepoint))  return 1;
    if ((flags & enum_cclass_word)         &&
        (u_isalnum(codepoint) || codepoint == '_'))                  return 1;
    /* UCHAR_LINE_BREAK is an enumerated property, not a binary one, so
     * u_hasBinaryProperty() always reports false for it.  Query the
     * property value instead and match Line_Break=Mandatory_Break, the
     * same definition the non-ICU branch below uses. */
    if ((flags & enum_cclass_newline)      &&
        (codepoint == 0x2028 || codepoint == 0x2029 ||
         u_getIntPropertyValue(codepoint, UCHAR_LINE_BREAK)
             == U_LB_MANDATORY_BREAK))                               return 1;

    return 0;
#else
    if (codepoint < 256)
        return (Parrot_iso_8859_1_typetable[codepoint] & flags) ? 1 : 0;

    if (flags == enum_cclass_any)
        return 1;

    /* All codepoints from u+0100 to u+02af are alphabetic, so we
     * cheat on the WORD and ALPHABETIC properties to include these
     * (and incorrectly exclude all others).  This is a stopgap until
     * ICU is everywhere, or we have better non-ICU unicode support. */
    if (flags == enum_cclass_word || flags == enum_cclass_alphabetic)
        return (codepoint < 0x2b0);

    if (flags & enum_cclass_whitespace) {
        /* from http://www.unicode.org/Public/UNIDATA/PropList.txt */
        switch (codepoint) {
          case 0x1680: case 0x180e: case 0x2000: case 0x2001:
          case 0x2002: case 0x2003: case 0x2004: case 0x2005:
          case 0x2006: case 0x2007: case 0x2008: case 0x2009:
          case 0x200a: case 0x2028: case 0x2029: case 0x202f:
          case 0x205f: case 0x3000:
            return 1;
          default:
            break;
        }
    }

    if (flags & enum_cclass_numeric) {
        /* from http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
         * Each entry is the first code point of a run of ten consecutive
         * decimal digits. */
        static const UINTVAL digit_run_start[] = {
            0x0660, 0x06f0, 0x07c0, 0x0966, 0x09e6, 0x0a66,
            0x0ae6, 0x0b66, 0x0be6, 0x0c66, 0x0ce6, 0x0d66,
            0x0e50, 0x0ed0, 0x0f20, 0x1040, 0x17e0, 0x1810,
            0x1946, 0x19d0, 0x1b50, 0xff10
        };
        const UINTVAL run_count =
            sizeof digit_run_start / sizeof digit_run_start[0];
        UINTVAL       k;

        for (k = 0; k < run_count; ++k) {
            if (codepoint >= digit_run_start[k]
            &&  codepoint <= digit_run_start[k] + 9)
                return 1;
        }
    }

    if (flags & enum_cclass_newline) {
        /* from http://www.unicode.org/Public/UNIDATA/extracted/DerivedLineBreak.txt
         * Line_Break=Mandatory_Break*/
        if (codepoint == 0x2028 || codepoint == 0x2029) return 1;
    }

    /* A class other than the three handled above was requested and nothing
     * matched: it cannot be answered without ICU. */
    if (flags & ~(enum_cclass_whitespace | enum_cclass_numeric | enum_cclass_newline))
        Parrot_ex_throw_from_c_noargs(interp, EXCEPTION_LIBRARY_ERROR,
            "no ICU lib loaded");

    return 0;
#endif
}