EXTERN SV* decode_entities(pTHX_ SV* sv, HV* entity2char, bool expand_prefix) { STRLEN len; char *s = SvPV_force(sv, len); char *t = s; char *end = s + len; char *ent_start; char *repl; STRLEN repl_len; #ifdef UNICODE_HTML_PARSER char buf[UTF8_MAXLEN]; int repl_utf8; int high_surrogate = 0; #else char buf[1]; #endif #if defined(__GNUC__) && defined(UNICODE_HTML_PARSER) /* gcc -Wall reports this variable as possibly used uninitialized */ repl_utf8 = 0; #endif while (s < end) { assert(t <= s); if ((*t++ = *s++) != '&') continue; ent_start = s; repl = 0; if (s < end && *s == '#') { UV num = 0; int ok = 0; s++; if (s < end && (*s == 'x' || *s == 'X')) { s++; while (s < end) { char *tmp = strchr(PL_hexdigit, *s); if (!tmp) break; num = num << 4 | ((tmp - PL_hexdigit) & 15); if (num > 0x10FFFF) { /* overflow */ ok = 0; break; } s++; ok = 1; } } else { while (s < end && isDIGIT(*s)) { num = num * 10 + (*s - '0'); if (num > 0x10FFFF) { /* overflow */ ok = 0; break; } s++; ok = 1; } } if (num && ok) { #ifdef UNICODE_HTML_PARSER if (!SvUTF8(sv) && num <= 255) { buf[0] = (char) num; repl = buf; repl_len = 1; repl_utf8 = 0; } else if (num == 0xFFFE || num == 0xFFFF) { /* illegal */ } else { char *tmp; if ((num & 0xFFFFFC00) == 0xDC00) { /* low-surrogate */ if (high_surrogate != 0) { t -= 3; /* Back up past 0xFFFD */ num = ((high_surrogate - 0xD800) << 10) + (num - 0xDC00) + 0x10000; high_surrogate = 0; } else { num = 0xFFFD; } } else if ((num & 0xFFFFFC00) == 0xD800) { /* high-surrogate */ high_surrogate = num; num = 0xFFFD; } else { high_surrogate = 0; /* otherwise invalid? */ if ((num >= 0xFDD0 && num <= 0xFDEF) || ((num & 0xFFFE) == 0xFFFE) || num > 0x10FFFF) { num = 0xFFFD; } } tmp = (char*)uvuni_to_utf8((U8*)buf, num); repl = buf; repl_len = tmp - buf; repl_utf8 = 1; } #else if (num <= 255) { buf[0] = (char) num & 0xFF; repl = buf; repl_len = 1; } #endif } } else { char *ent_name = s; while (s < end && isALNUM(*s)) s++; if (ent_name != s && entity2char) { SV** svp; if ( (svp = hv_fetch(entity2char, ent_name, s - ent_name, 0)) || (*s == ';' && (svp = hv_fetch(entity2char, ent_name, s - ent_name + 1, 0))) ) { repl = SvPV(*svp, repl_len); #ifdef UNICODE_HTML_PARSER repl_utf8 = SvUTF8(*svp); #endif } else if (expand_prefix) { char *ss = s - 1; while (ss > ent_name) { svp = hv_fetch(entity2char, ent_name, ss - ent_name, 0); if (svp) { repl = SvPV(*svp, repl_len); #ifdef UNICODE_HTML_PARSER repl_utf8 = SvUTF8(*svp); #endif s = ss; break; } ss--; } } } #ifdef UNICODE_HTML_PARSER high_surrogate = 0; #endif } if (repl) { char *repl_allocated = 0; if (s < end && *s == ';') s++; t--; /* '&' already copied, undo it */ #ifdef UNICODE_HTML_PARSER if (*s != '&') { high_surrogate = 0; } if (!SvUTF8(sv) && repl_utf8) { /* need to upgrade sv before we continue */ STRLEN before_gap_len = t - SvPVX(sv); char *before_gap = (char*)bytes_to_utf8((U8*)SvPVX(sv), &before_gap_len); STRLEN after_gap_len = end - s; char *after_gap = (char*)bytes_to_utf8((U8*)s, &after_gap_len); sv_setpvn(sv, before_gap, before_gap_len); sv_catpvn(sv, after_gap, after_gap_len); SvUTF8_on(sv); Safefree(before_gap); Safefree(after_gap); s = t = SvPVX(sv) + before_gap_len; end = SvPVX(sv) + before_gap_len + after_gap_len; } else if (SvUTF8(sv) && !repl_utf8) { repl = (char*)bytes_to_utf8((U8*)repl, &repl_len); repl_allocated = repl; } #endif if (t + repl_len > s) { /* need to grow the string */ grow_gap(aTHX_ sv, repl_len - (s - t), &t, &s, &end); } /* copy replacement string into string */ while (repl_len--) *t++ = *repl++; if (repl_allocated) Safefree(repl_allocated); } else { while (ent_start < s) *t++ = *ent_start++; } } *t = '\0'; SvCUR_set(sv, t - SvPVX(sv)); return sv; }
STATIC I32 S_do_trans_simple_utf8(pTHX_ SV * const sv) { dVAR; U8 *s; U8 *send; U8 *d; U8 *start; U8 *dstart, *dend; I32 matches = 0; const I32 grows = PL_op->op_private & OPpTRANS_GROWS; STRLEN len; SV* const rv = #ifdef USE_ITHREADS PAD_SVl(cPADOP->op_padix); #else (SV*)cSVOP->op_sv; #endif HV* const hv = (HV*)SvRV(rv); SV* const * svp = hv_fetchs(hv, "NONE", FALSE); const UV none = svp ? SvUV(*svp) : 0x7fffffff; const UV extra = none + 1; UV final = 0; U8 hibit = 0; s = (U8*)SvPV(sv, len); if (!SvUTF8(sv)) { const U8 *t = s; const U8 * const e = s + len; while (t < e) { const U8 ch = *t++; hibit = !NATIVE_IS_INVARIANT(ch); if (hibit) { s = bytes_to_utf8(s, &len); break; } } } send = s + len; start = s; svp = hv_fetchs(hv, "FINAL", FALSE); if (svp) final = SvUV(*svp); if (grows) { /* d needs to be bigger than s, in case e.g. upgrading is required */ Newx(d, len * 3 + UTF8_MAXBYTES, U8); dend = d + len * 3; dstart = d; } else { dstart = d = s; dend = d + len; } while (s < send) { const UV uv = swash_fetch(rv, s, TRUE); if (uv < none) { s += UTF8SKIP(s); matches++; d = uvuni_to_utf8(d, uv); } else if (uv == none) { const int i = UTF8SKIP(s); Move(s, d, i, U8); d += i; s += i; } else if (uv == extra) { s += UTF8SKIP(s); matches++; d = uvuni_to_utf8(d, final); } else
STATIC I32 S_do_trans_simple_utf8(pTHX_ SV *sv) { U8 *s; U8 *send; U8 *d; U8 *start; U8 *dstart, *dend; I32 matches = 0; I32 grows = PL_op->op_private & OPpTRANS_GROWS; STRLEN len; SV* rv = (SV*)cSVOP->op_sv; HV* hv = (HV*)SvRV(rv); SV** svp = hv_fetch(hv, "NONE", 4, FALSE); UV none = svp ? SvUV(*svp) : 0x7fffffff; UV extra = none + 1; UV final = 0; UV uv; I32 isutf8; U8 hibit = 0; s = (U8*)SvPV(sv, len); isutf8 = SvUTF8(sv); if (!isutf8) { U8 *t = s, *e = s + len; while (t < e) { U8 ch = *t++; if ((hibit = !NATIVE_IS_INVARIANT(ch))) break; } if (hibit) s = bytes_to_utf8(s, &len); } send = s + len; start = s; svp = hv_fetch(hv, "FINAL", 5, FALSE); if (svp) final = SvUV(*svp); if (grows) { /* d needs to be bigger than s, in case e.g. upgrading is required */ New(0, d, len*3+UTF8_MAXLEN, U8); dend = d + len * 3; dstart = d; } else { dstart = d = s; dend = d + len; } while (s < send) { if ((uv = swash_fetch(rv, s, TRUE)) < none) { s += UTF8SKIP(s); matches++; d = uvuni_to_utf8(d, uv); } else if (uv == none) { int i = UTF8SKIP(s); Move(s, d, i, U8); d += i; s += i; } else if (uv == extra) { int i = UTF8SKIP(s); s += i; matches++; d = uvuni_to_utf8(d, final); } else