STATIC char * S_scan_word(pTHX_ register char *s, char *dest, STRLEN destlen, int allow_package, STRLEN *slp) { register char *d = dest; register char * const e = d + destlen - 3; /* two-character token, ending NUL */ for (;;) { if (d >= e) Perl_croak(aTHX_ ident_too_long); if (isALNUM(*s)) /* UTF handled below */ *d++ = *s++; else if (*s == '\'' && allow_package && isIDFIRST_lazy_if(s+1,UTF)) { *d++ = ':'; *d++ = ':'; s++; } else if (*s == ':' && s[1] == ':' && allow_package && s[2] != '$') { *d++ = *s++; *d++ = *s++; } else if (UTF && UTF8_IS_START(*s) && isALNUM_utf8((U8*)s)) { char *t = s + UTF8SKIP(s); while (UTF8_IS_CONTINUED(*t) && _is_utf8_mark((U8*)t)) t += UTF8SKIP(t); if (d + (t - s) > e) Perl_croak(aTHX_ ident_too_long); Copy(s, d, t - s, char); d += t - s; s = t; } else {
STRLEN Perl_is_utf8_char(const U8 *s) { PERL_ARGS_ASSERT_IS_UTF8_CHAR; /* Assumes we have enough space, which is why this is deprecated */ return isUTF8_CHAR(s, s + UTF8SKIP(s)); }
static UINTVAL utf8_scan(PARROT_INTERP, ARGIN(const STRING *src)) { ASSERT_ARGS(utf8_scan) const utf8_t *u8ptr = (const utf8_t *)src->strstart; const utf8_t *u8end = (const utf8_t *)(src->strstart + src->bufused); UINTVAL characters = 0; while (u8ptr < u8end) { UINTVAL c = *u8ptr; if (UTF8_IS_START(c)) { size_t len = UTF8SKIP(u8ptr); size_t count; if (u8ptr + len > u8end) Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8, "Unaligned end in UTF-8 string\n"); /* Check for overlong forms */ if (UTF8_IS_OVERLONG(c, u8ptr[1])) Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8, "Overlong form in UTF-8 string\n"); c &= UTF8_START_MASK(len); for (count = 1; count < len; ++count) { ++u8ptr; if (!UTF8_IS_CONTINUATION(*u8ptr)) Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8, "Malformed UTF-8 string\n"); c = UTF8_ACCUMULATE(c, *u8ptr); } if (UNICODE_IS_INVALID(c)) Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_CHARACTER, "Invalid character in UTF-8 string\n"); } else if (!UNICODE_IS_INVARIANT(c)) { Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_MALFORMED_UTF8, "Malformed UTF-8 string\n"); } ++u8ptr; ++characters; } return characters; }
static UINTVAL utf8_decode(PARROT_INTERP, ARGIN(const utf8_t *ptr)) { ASSERT_ARGS(utf8_decode) const utf8_t *u8ptr = ptr; UINTVAL c = *u8ptr; if (UTF8_IS_START(c)) { UINTVAL len = UTF8SKIP(u8ptr); UINTVAL count; c &= UTF8_START_MASK(len); for (count = 1; count < len; ++count) { ++u8ptr; c = UTF8_ACCUMULATE(c, *u8ptr); } } return c; }
EXTERN bool probably_utf8_chunk(pTHX_ char *s, STRLEN len) { char *e = s + len; STRLEN clen; /* ignore partial utf8 char at end of buffer */ while (s < e && UTF8_IS_CONTINUATION((U8)*(e - 1))) e--; if (s < e && UTF8_IS_START((U8)*(e - 1))) e--; clen = len - (e - s); if (clen && UTF8SKIP(e) == clen) { /* all promised continuation bytes are present */ e = s + len; } if (!has_hibit(s, e)) return 0; return is_utf8_string((U8*)s, e - s); }
STATIC bool S_grok_bslash_o(pTHX_ char **s, UV *uv, const char** error_msg, const bool output_warning, const bool strict, const bool silence_non_portable, const bool UTF) { /* Documentation to be supplied when interface nailed down finally * This returns FALSE if there is an error which the caller need not recover * from; otherwise TRUE. In either case the caller should look at *len [???]. * It guarantees that the returned codepoint, *uv, when expressed as * utf8 bytes, would fit within the skipped "\o{...}" bytes. * On input: * s is the address of a pointer to a NULL terminated string that begins * with 'o', and the previous character was a backslash. At exit, *s * will be advanced to the byte just after those absorbed by this * function. Hence the caller can continue parsing from there. In * the case of an error, this routine has generally positioned *s to * point just to the right of the first bad spot, so that a message * that has a "<--" to mark the spot will be correctly positioned. * uv points to a UV that will hold the output value, valid only if the * return from the function is TRUE * error_msg is a pointer that will be set to an internal buffer giving an * error message upon failure (the return is FALSE). Untouched if * function succeeds * output_warning says whether to output any warning messages, or suppress * them * strict is true if this should fail instead of warn if there are * non-octal digits within the braces * silence_non_portable is true if to suppress warnings about the code * point returned being too large to fit on all platforms. * UTF is true iff the string *s is encoded in UTF-8. */ char* e; STRLEN numbers_len; I32 flags = PERL_SCAN_ALLOW_UNDERSCORES | PERL_SCAN_DISALLOW_PREFIX /* XXX Until the message is improved in grok_oct, handle errors * ourselves */ | PERL_SCAN_SILENT_ILLDIGIT; #ifdef DEBUGGING char *start = *s - 1; assert(*start == '\\'); #endif PERL_ARGS_ASSERT_GROK_BSLASH_O; assert(**s == 'o'); (*s)++; if (**s != '{') { *error_msg = "Missing braces on \\o{}"; return FALSE; } e = strchr(*s, '}'); if (!e) { (*s)++; /* Move past the '{' */ while (isOCTAL(**s)) { /* Position beyond the legal digits */ (*s)++; } *error_msg = "Missing right brace on \\o{"; return FALSE; } (*s)++; /* Point to expected first digit (could be first byte of utf8 sequence if not a digit) */ numbers_len = e - *s; if (numbers_len == 0) { (*s)++; /* Move past the } */ *error_msg = "Number with no digits"; return FALSE; } if (silence_non_portable) { flags |= PERL_SCAN_SILENT_NON_PORTABLE; } *uv = grok_oct(*s, &numbers_len, &flags, NULL); /* Note that if has non-octal, will ignore everything starting with that up * to the '}' */ if (numbers_len != (STRLEN) (e - *s)) { if (strict) { *s += numbers_len; *s += (UTF) ? UTF8SKIP(*s) : (STRLEN) 1; *error_msg = "Non-octal character"; return FALSE; } else if (output_warning) { Perl_ck_warner(aTHX_ packWARN(WARN_DIGIT), /* diag_listed_as: Non-octal character '%c'. Resolved as "%s" */ "Non-octal character '%c'. Resolved as \"\\o{%.*s}\"", *(*s + numbers_len), (int) numbers_len, *s); } } /* Return past the '}' */ *s = e + 1; /* guarantee replacing "\o{...}" with utf8 bytes fits within * existing space */ assert(OFFUNISKIP(*uv) < *s - start); return TRUE; }
PERL_STATIC_INLINE bool S_grok_bslash_x(pTHX_ char **s, UV *uv, const char** error_msg, const bool output_warning, const bool strict, const bool silence_non_portable, const bool UTF) { /* Documentation to be supplied when interface nailed down finally * This returns FALSE if there is an error which the caller need not recover * from; otherwise TRUE. * It guarantees that the returned codepoint, *uv, when expressed as * utf8 bytes, would fit within the skipped "\x{...}" bytes. * * On input: * s is the address of a pointer to a NULL terminated string that begins * with 'x', and the previous character was a backslash. At exit, *s * will be advanced to the byte just after those absorbed by this * function. Hence the caller can continue parsing from there. In * the case of an error, this routine has generally positioned *s to * point just to the right of the first bad spot, so that a message * that has a "<--" to mark the spot will be correctly positioned. * uv points to a UV that will hold the output value, valid only if the * return from the function is TRUE * error_msg is a pointer that will be set to an internal buffer giving an * error message upon failure (the return is FALSE). Untouched if * function succeeds * output_warning says whether to output any warning messages, or suppress * them * strict is true if anything out of the ordinary should cause this to * fail instead of warn or be silent. For example, it requires * exactly 2 digits following the \x (when there are no braces). * 3 digits could be a mistake, so is forbidden in this mode. * silence_non_portable is true if to suppress warnings about the code * point returned being too large to fit on all platforms. * UTF is true iff the string *s is encoded in UTF-8. */ char* e; STRLEN numbers_len; I32 flags = PERL_SCAN_DISALLOW_PREFIX; #ifdef DEBUGGING char *start = *s - 1; assert(*start == '\\'); #endif PERL_ARGS_ASSERT_GROK_BSLASH_X; assert(**s == 'x'); (*s)++; if (strict || ! output_warning) { flags |= PERL_SCAN_SILENT_ILLDIGIT; } if (**s != '{') { STRLEN len = (strict) ? 3 : 2; *uv = grok_hex(*s, &len, &flags, NULL); *s += len; if (strict && len != 2) { if (len < 2) { *s += (UTF) ? UTF8SKIP(*s) : 1; *error_msg = "Non-hex character"; } else { *error_msg = "Use \\x{...} for more than two hex characters"; } return FALSE; } goto ok; } e = strchr(*s, '}'); if (!e) { (*s)++; /* Move past the '{' */ while (isXDIGIT(**s)) { /* Position beyond the legal digits */ (*s)++; } /* XXX The corresponding message above for \o is just '\\o{'; other * messages for other constructs include the '}', so are inconsistent. */ *error_msg = "Missing right brace on \\x{}"; return FALSE; } (*s)++; /* Point to expected first digit (could be first byte of utf8 sequence if not a digit) */ numbers_len = e - *s; if (numbers_len == 0) { if (strict) { (*s)++; /* Move past the } */ *error_msg = "Number with no digits"; return FALSE; } *s = e + 1; *uv = 0; goto ok; } flags |= PERL_SCAN_ALLOW_UNDERSCORES; if (silence_non_portable) { flags |= PERL_SCAN_SILENT_NON_PORTABLE; } *uv = grok_hex(*s, &numbers_len, &flags, NULL); /* Note that if has non-hex, will ignore everything starting with that up * to the '}' */ if (strict && numbers_len != (STRLEN) (e - *s)) { *s += numbers_len; *s += (UTF) ? UTF8SKIP(*s) : 1; *error_msg = "Non-hex character"; return FALSE; } /* Return past the '}' */ *s = e + 1; ok: /* guarantee replacing "\x{...}" with utf8 bytes fits within * existing space */ assert(OFFUNISKIP(*uv) < *s - start); return TRUE; }
STATIC I32 S_do_trans_simple_utf8(pTHX_ SV * const sv) { dVAR; U8 *s; U8 *send; U8 *d; U8 *start; U8 *dstart, *dend; I32 matches = 0; const I32 grows = PL_op->op_private & OPpTRANS_GROWS; STRLEN len; SV* const rv = #ifdef USE_ITHREADS PAD_SVl(cPADOP->op_padix); #else (SV*)cSVOP->op_sv; #endif HV* const hv = (HV*)SvRV(rv); SV* const * svp = hv_fetchs(hv, "NONE", FALSE); const UV none = svp ? SvUV(*svp) : 0x7fffffff; const UV extra = none + 1; UV final = 0; U8 hibit = 0; s = (U8*)SvPV(sv, len); if (!SvUTF8(sv)) { const U8 *t = s; const U8 * const e = s + len; while (t < e) { const U8 ch = *t++; hibit = !NATIVE_IS_INVARIANT(ch); if (hibit) { s = bytes_to_utf8(s, &len); break; } } } send = s + len; start = s; svp = hv_fetchs(hv, "FINAL", FALSE); if (svp) final = SvUV(*svp); if (grows) { /* d needs to be bigger than s, in case e.g. upgrading is required */ Newx(d, len * 3 + UTF8_MAXBYTES, U8); dend = d + len * 3; dstart = d; } else { dstart = d = s; dend = d + len; } while (s < send) { const UV uv = swash_fetch(rv, s, TRUE); if (uv < none) { s += UTF8SKIP(s); matches++; d = uvuni_to_utf8(d, uv); } else if (uv == none) { const int i = UTF8SKIP(s); Move(s, d, i, U8); d += i; s += i; } else if (uv == extra) { s += UTF8SKIP(s); matches++; d = uvuni_to_utf8(d, final); } else
STATIC I32 S_do_trans_simple_utf8(pTHX_ SV *sv) { U8 *s; U8 *send; U8 *d; U8 *start; U8 *dstart, *dend; I32 matches = 0; I32 grows = PL_op->op_private & OPpTRANS_GROWS; STRLEN len; SV* rv = (SV*)cSVOP->op_sv; HV* hv = (HV*)SvRV(rv); SV** svp = hv_fetch(hv, "NONE", 4, FALSE); UV none = svp ? SvUV(*svp) : 0x7fffffff; UV extra = none + 1; UV final = 0; UV uv; I32 isutf8; U8 hibit = 0; s = (U8*)SvPV(sv, len); isutf8 = SvUTF8(sv); if (!isutf8) { U8 *t = s, *e = s + len; while (t < e) { U8 ch = *t++; if ((hibit = !NATIVE_IS_INVARIANT(ch))) break; } if (hibit) s = bytes_to_utf8(s, &len); } send = s + len; start = s; svp = hv_fetch(hv, "FINAL", 5, FALSE); if (svp) final = SvUV(*svp); if (grows) { /* d needs to be bigger than s, in case e.g. upgrading is required */ New(0, d, len*3+UTF8_MAXLEN, U8); dend = d + len * 3; dstart = d; } else { dstart = d = s; dend = d + len; } while (s < send) { if ((uv = swash_fetch(rv, s, TRUE)) < none) { s += UTF8SKIP(s); matches++; d = uvuni_to_utf8(d, uv); } else if (uv == none) { int i = UTF8SKIP(s); Move(s, d, i, U8); d += i; s += i; } else if (uv == extra) { int i = UTF8SKIP(s); s += i; matches++; d = uvuni_to_utf8(d, final); } else