C++ (Cpp) UTF8SKIPの例

コード例 #1

0

ファイルを表示

ファイル: stolen_chunk_of_toke.c プロジェクト: p5sagit/Devel-Declare

STATIC char *
S_scan_word(pTHX_ register char *s, char *dest, STRLEN destlen, int allow_package, STRLEN *slp)
{
    register char *d = dest;
    register char * const e = d + destlen - 3;  /* two-character token, ending NUL */
    for (;;) {
	if (d >= e)
	    Perl_croak(aTHX_ ident_too_long);
	if (isALNUM(*s))	/* UTF handled below */
	    *d++ = *s++;
	else if (*s == '\'' && allow_package && isIDFIRST_lazy_if(s+1,UTF)) {
	    *d++ = ':';
	    *d++ = ':';
	    s++;
	}
	else if (*s == ':' && s[1] == ':' && allow_package && s[2] != '$') {
	    *d++ = *s++;
	    *d++ = *s++;
	}
	else if (UTF && UTF8_IS_START(*s) && isALNUM_utf8((U8*)s)) {
	    char *t = s + UTF8SKIP(s);
	    while (UTF8_IS_CONTINUED(*t) && _is_utf8_mark((U8*)t))
		t += UTF8SKIP(t);
	    if (d + (t - s) > e)
		Perl_croak(aTHX_ ident_too_long);
	    Copy(s, d, t - s, char);
	    d += t - s;
	    s = t;
	}
	else {

コード例 #2

0

ファイルを表示

ファイル: mathoms.c プロジェクト: evalEmpire/piledriver

STRLEN
Perl_is_utf8_char(const U8 *s)
{
    PERL_ARGS_ASSERT_IS_UTF8_CHAR;

    /* Assumes we have enough space, which is why this is deprecated */
    return isUTF8_CHAR(s, s + UTF8SKIP(s));
}

コード例 #3

0

ファイルを表示

ファイル: utf8.c プロジェクト: Cristofor/parrot

static UINTVAL
utf8_scan(PARROT_INTERP, ARGIN(const STRING *src))
{
    ASSERT_ARGS(utf8_scan)
    const utf8_t *u8ptr = (const utf8_t *)src->strstart;
    const utf8_t *u8end = (const utf8_t *)(src->strstart + src->bufused);
    UINTVAL characters = 0;

    while (u8ptr < u8end) {
        UINTVAL c = *u8ptr;

        if (UTF8_IS_START(c)) {
            size_t len = UTF8SKIP(u8ptr);
            size_t count;

            if (u8ptr + len > u8end)
                Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
                    "Unaligned end in UTF-8 string\n");

            /* Check for overlong forms */
            if (UTF8_IS_OVERLONG(c, u8ptr[1]))
                Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
                    "Overlong form in UTF-8 string\n");

            c &= UTF8_START_MASK(len);

            for (count = 1; count < len; ++count) {
                ++u8ptr;

                if (!UTF8_IS_CONTINUATION(*u8ptr))
                    Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
                        "Malformed UTF-8 string\n");

                c = UTF8_ACCUMULATE(c, *u8ptr);
            }

            if (UNICODE_IS_INVALID(c))
                Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_CHARACTER,
                    "Invalid character in UTF-8 string\n");
        }
        else if (!UNICODE_IS_INVARIANT(c)) {
            Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8,
                "Malformed UTF-8 string\n");
        }

        ++u8ptr;
        ++characters;
    }

    return characters;
}

コード例 #4

0

ファイルを表示

ファイル: utf8.c プロジェクト: Cristofor/parrot

static UINTVAL
utf8_decode(PARROT_INTERP, ARGIN(const utf8_t *ptr))
{
    ASSERT_ARGS(utf8_decode)
    const utf8_t *u8ptr = ptr;
    UINTVAL c = *u8ptr;

    if (UTF8_IS_START(c)) {
        UINTVAL len = UTF8SKIP(u8ptr);
        UINTVAL count;

        c &= UTF8_START_MASK(len);
        for (count = 1; count < len; ++count) {
            ++u8ptr;

            c = UTF8_ACCUMULATE(c, *u8ptr);
        }
    }

    return c;
}

コード例 #5

0

ファイルを表示

ファイル: util.c プロジェクト: PATRIC3/patric3_website

EXTERN bool
probably_utf8_chunk(pTHX_ char *s, STRLEN len)
{
    char *e = s + len;
    STRLEN clen;

    /* ignore partial utf8 char at end of buffer */
    while (s < e && UTF8_IS_CONTINUATION((U8)*(e - 1)))
	e--;
    if (s < e && UTF8_IS_START((U8)*(e - 1)))
	e--;
    clen = len - (e - s);
    if (clen && UTF8SKIP(e) == clen) {
	/* all promised continuation bytes are present */
	e = s + len;
    }

    if (!has_hibit(s, e))
	return 0;

    return is_utf8_string((U8*)s, e - s);
}

コード例 #6

0

ファイルを表示

ファイル: dquote_static.c プロジェクト: evalEmpire/piledriver

STATIC bool
S_grok_bslash_o(pTHX_ char **s, UV *uv, const char** error_msg,
                      const bool output_warning, const bool strict,
                      const bool silence_non_portable,
                      const bool UTF)
{

/*  Documentation to be supplied when interface nailed down finally
 *  This returns FALSE if there is an error which the caller need not recover
 *  from; otherwise TRUE.  In either case the caller should look at *len [???].
 *  It guarantees that the returned codepoint, *uv, when expressed as
 *  utf8 bytes, would fit within the skipped "\o{...}" bytes.
 *  On input:
 *	s   is the address of a pointer to a NULL terminated string that begins
 *	    with 'o', and the previous character was a backslash.  At exit, *s
 *	    will be advanced to the byte just after those absorbed by this
 *	    function.  Hence the caller can continue parsing from there.  In
 *	    the case of an error, this routine has generally positioned *s to
 *	    point just to the right of the first bad spot, so that a message
 *	    that has a "<--" to mark the spot will be correctly positioned.
 *	uv  points to a UV that will hold the output value, valid only if the
 *	    return from the function is TRUE
 *      error_msg is a pointer that will be set to an internal buffer giving an
 *	    error message upon failure (the return is FALSE).  Untouched if
 *	    function succeeds
 *	output_warning says whether to output any warning messages, or suppress
 *	    them
 *	strict is true if this should fail instead of warn if there are
 *	    non-octal digits within the braces
 *      silence_non_portable is true if to suppress warnings about the code
 *          point returned being too large to fit on all platforms.
 *	UTF is true iff the string *s is encoded in UTF-8.
 */
    char* e;
    STRLEN numbers_len;
    I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
                | PERL_SCAN_DISALLOW_PREFIX
                /* XXX Until the message is improved in grok_oct, handle errors
                 * ourselves */
                | PERL_SCAN_SILENT_ILLDIGIT;

#ifdef DEBUGGING
    char *start = *s - 1;
    assert(*start == '\\');
#endif

    PERL_ARGS_ASSERT_GROK_BSLASH_O;


    assert(**s == 'o');
    (*s)++;

    if (**s != '{') {
        *error_msg = "Missing braces on \\o{}";
        return FALSE;
    }

    e = strchr(*s, '}');
    if (!e) {
        (*s)++;  /* Move past the '{' */
        while (isOCTAL(**s)) { /* Position beyond the legal digits */
            (*s)++;
        }
        *error_msg = "Missing right brace on \\o{";
        return FALSE;
    }

    (*s)++;    /* Point to expected first digit (could be first byte of utf8
                  sequence if not a digit) */
    numbers_len = e - *s;
    if (numbers_len == 0) {
        (*s)++;    /* Move past the } */
        *error_msg = "Number with no digits";
        return FALSE;
    }

    if (silence_non_portable) {
        flags |= PERL_SCAN_SILENT_NON_PORTABLE;
    }

    *uv = grok_oct(*s, &numbers_len, &flags, NULL);
    /* Note that if has non-octal, will ignore everything starting with that up
     * to the '}' */

    if (numbers_len != (STRLEN) (e - *s)) {
        if (strict) {
            *s += numbers_len;
            *s += (UTF) ? UTF8SKIP(*s) : (STRLEN) 1;
            *error_msg = "Non-octal character";
            return FALSE;
        }
        else if (output_warning) {
            Perl_ck_warner(aTHX_ packWARN(WARN_DIGIT),
            /* diag_listed_as: Non-octal character '%c'.  Resolved as "%s" */
                        "Non-octal character '%c'.  Resolved as \"\\o{%.*s}\"",
                        *(*s + numbers_len),
                        (int) numbers_len,
                        *s);
        }
    }

    /* Return past the '}' */
    *s = e + 1;

    /* guarantee replacing "\o{...}" with utf8 bytes fits within
     * existing space */
    assert(OFFUNISKIP(*uv) < *s - start);

    return TRUE;
}

コード例 #7

0

ファイルを表示

ファイル: dquote_static.c プロジェクト: evalEmpire/piledriver

PERL_STATIC_INLINE bool
S_grok_bslash_x(pTHX_ char **s, UV *uv, const char** error_msg,
                      const bool output_warning, const bool strict,
                      const bool silence_non_portable,
                      const bool UTF)
{

/*  Documentation to be supplied when interface nailed down finally
 *  This returns FALSE if there is an error which the caller need not recover
 *  from; otherwise TRUE.
 *  It guarantees that the returned codepoint, *uv, when expressed as
 *  utf8 bytes, would fit within the skipped "\x{...}" bytes.
 *
 *  On input:
 *	s   is the address of a pointer to a NULL terminated string that begins
 *	    with 'x', and the previous character was a backslash.  At exit, *s
 *	    will be advanced to the byte just after those absorbed by this
 *	    function.  Hence the caller can continue parsing from there.  In
 *	    the case of an error, this routine has generally positioned *s to
 *	    point just to the right of the first bad spot, so that a message
 *	    that has a "<--" to mark the spot will be correctly positioned.
 *	uv  points to a UV that will hold the output value, valid only if the
 *	    return from the function is TRUE
 *      error_msg is a pointer that will be set to an internal buffer giving an
 *	    error message upon failure (the return is FALSE).  Untouched if
 *	    function succeeds
 *	output_warning says whether to output any warning messages, or suppress
 *	    them
 *	strict is true if anything out of the ordinary should cause this to
 *	    fail instead of warn or be silent.  For example, it requires
 *	    exactly 2 digits following the \x (when there are no braces).
 *	    3 digits could be a mistake, so is forbidden in this mode.
 *      silence_non_portable is true if to suppress warnings about the code
 *          point returned being too large to fit on all platforms.
 *	UTF is true iff the string *s is encoded in UTF-8.
 */
    char* e;
    STRLEN numbers_len;
    I32 flags = PERL_SCAN_DISALLOW_PREFIX;
#ifdef DEBUGGING
    char *start = *s - 1;
    assert(*start == '\\');
#endif

    PERL_ARGS_ASSERT_GROK_BSLASH_X;

    assert(**s == 'x');
    (*s)++;

    if (strict || ! output_warning) {
        flags |= PERL_SCAN_SILENT_ILLDIGIT;
    }

    if (**s != '{') {
        STRLEN len = (strict) ? 3 : 2;

        *uv = grok_hex(*s, &len, &flags, NULL);
        *s += len;
        if (strict && len != 2) {
            if (len < 2) {
                *s += (UTF) ? UTF8SKIP(*s) : 1;
                *error_msg = "Non-hex character";
            }
            else {
                *error_msg = "Use \\x{...} for more than two hex characters";
            }
            return FALSE;
        }
        goto ok;
    }

    e = strchr(*s, '}');
    if (!e) {
        (*s)++;  /* Move past the '{' */
        while (isXDIGIT(**s)) { /* Position beyond the legal digits */
            (*s)++;
        }
        /* XXX The corresponding message above for \o is just '\\o{'; other
         * messages for other constructs include the '}', so are inconsistent.
         */
        *error_msg = "Missing right brace on \\x{}";
        return FALSE;
    }

    (*s)++;    /* Point to expected first digit (could be first byte of utf8
                  sequence if not a digit) */
    numbers_len = e - *s;
    if (numbers_len == 0) {
        if (strict) {
            (*s)++;    /* Move past the } */
            *error_msg = "Number with no digits";
            return FALSE;
        }
        *s = e + 1;
        *uv = 0;
        goto ok;
    }

    flags |= PERL_SCAN_ALLOW_UNDERSCORES;
    if (silence_non_portable) {
        flags |= PERL_SCAN_SILENT_NON_PORTABLE;
    }

    *uv = grok_hex(*s, &numbers_len, &flags, NULL);
    /* Note that if has non-hex, will ignore everything starting with that up
     * to the '}' */

    if (strict && numbers_len != (STRLEN) (e - *s)) {
        *s += numbers_len;
        *s += (UTF) ? UTF8SKIP(*s) : 1;
        *error_msg = "Non-hex character";
        return FALSE;
    }

    /* Return past the '}' */
    *s = e + 1;

  ok:
    /* guarantee replacing "\x{...}" with utf8 bytes fits within
     * existing space */
    assert(OFFUNISKIP(*uv) < *s - start);
    return TRUE;
}

コード例 #8

0

ファイルを表示

ファイル: doop.c プロジェクト: ggoossen/perlbenchapps

STATIC I32
S_do_trans_simple_utf8(pTHX_ SV * const sv)
{
    dVAR;
    U8 *s;
    U8 *send;
    U8 *d;
    U8 *start;
    U8 *dstart, *dend;
    I32 matches = 0;
    const I32 grows = PL_op->op_private & OPpTRANS_GROWS;
    STRLEN len;

    SV* const  rv =
#ifdef USE_ITHREADS
		    PAD_SVl(cPADOP->op_padix);
#else
		    (SV*)cSVOP->op_sv;
#endif
    HV* const  hv = (HV*)SvRV(rv);
    SV* const * svp = hv_fetchs(hv, "NONE", FALSE);
    const UV none = svp ? SvUV(*svp) : 0x7fffffff;
    const UV extra = none + 1;
    UV final = 0;
    U8 hibit = 0;

    s = (U8*)SvPV(sv, len);
    if (!SvUTF8(sv)) {
	const U8 *t = s;
	const U8 * const e = s + len;
	while (t < e) {
	    const U8 ch = *t++;
	    hibit = !NATIVE_IS_INVARIANT(ch);
	    if (hibit) {
		s = bytes_to_utf8(s, &len);
		break;
	    }
	}
    }
    send = s + len;
    start = s;

    svp = hv_fetchs(hv, "FINAL", FALSE);
    if (svp)
	final = SvUV(*svp);

    if (grows) {
	/* d needs to be bigger than s, in case e.g. upgrading is required */
	Newx(d, len * 3 + UTF8_MAXBYTES, U8);
	dend = d + len * 3;
	dstart = d;
    }
    else {
	dstart = d = s;
	dend = d + len;
    }

    while (s < send) {
	const UV uv = swash_fetch(rv, s, TRUE);
	if (uv < none) {
	    s += UTF8SKIP(s);
	    matches++;
	    d = uvuni_to_utf8(d, uv);
	}
	else if (uv == none) {
	    const int i = UTF8SKIP(s);
	    Move(s, d, i, U8);
	    d += i;
	    s += i;
	}
	else if (uv == extra) {
	    s += UTF8SKIP(s);
	    matches++;
	    d = uvuni_to_utf8(d, final);
	}
	else

コード例 #9

0

ファイルを表示

ファイル: doop.c プロジェクト: coder-chenzhi/daikon

STATIC I32
S_do_trans_simple_utf8(pTHX_ SV *sv)
{
    U8 *s;
    U8 *send;
    U8 *d;
    U8 *start;
    U8 *dstart, *dend;
    I32 matches = 0;
    I32 grows = PL_op->op_private & OPpTRANS_GROWS;
    STRLEN len;

    SV* rv = (SV*)cSVOP->op_sv;
    HV* hv = (HV*)SvRV(rv);
    SV** svp = hv_fetch(hv, "NONE", 4, FALSE);
    UV none = svp ? SvUV(*svp) : 0x7fffffff;
    UV extra = none + 1;
    UV final = 0;
    UV uv;
    I32 isutf8;
    U8 hibit = 0;

    s = (U8*)SvPV(sv, len);
    isutf8 = SvUTF8(sv);
    if (!isutf8) {
	U8 *t = s, *e = s + len;
	while (t < e) {
	    U8 ch = *t++;
	    if ((hibit = !NATIVE_IS_INVARIANT(ch)))
		break;
	}
	if (hibit)
	    s = bytes_to_utf8(s, &len);
    }
    send = s + len;
    start = s;

    svp = hv_fetch(hv, "FINAL", 5, FALSE);
    if (svp)
	final = SvUV(*svp);

    if (grows) {
	/* d needs to be bigger than s, in case e.g. upgrading is required */
	New(0, d, len*3+UTF8_MAXLEN, U8);
	dend = d + len * 3;
	dstart = d;
    }
    else {
	dstart = d = s;
	dend = d + len;
    }

    while (s < send) {
	if ((uv = swash_fetch(rv, s, TRUE)) < none) {
	    s += UTF8SKIP(s);
	    matches++;
	    d = uvuni_to_utf8(d, uv);
	}
	else if (uv == none) {
	    int i = UTF8SKIP(s);
	    Move(s, d, i, U8);
	    d += i;
	    s += i;
	}
	else if (uv == extra) {
	    int i = UTF8SKIP(s);
	    s += i;
	    matches++;
	    d = uvuni_to_utf8(d, final);
	}
	else