static SV *sv_soundex_utf8 (SV* source) { U8 *source_p; U8 *source_end; { STRLEN source_len; source_p = (U8 *) SvPV(source, source_len); source_end = &source_p[source_len]; } while (source_p < source_end) { STRLEN offset; UV c = utf8n_to_uvchr(source_p, source_end-source_p, &offset, 0); char codepart_last = (c <= 0xFF) ? sv_soundex_table[c] : '\0'; source_p = (offset >= 1) ? &source_p[offset] : source_end; if (codepart_last != '\0') { SV *code = newSV(SOUNDEX_ACCURACY); char *code_p = SvPVX(code); char *code_end = &code_p[SOUNDEX_ACCURACY]; SvCUR_set(code, SOUNDEX_ACCURACY); SvPOK_only(code); *code_p++ = toupper(c); while (source_p != source_end && code_p != code_end) { char codepart; c = utf8n_to_uvchr(source_p, source_end-source_p, &offset, 0); codepart = (c <= 0xFF) ? sv_soundex_table[c] : '\0'; source_p = (offset >= 1) ? &source_p[offset] : source_end; if (codepart != '\0') if (codepart != codepart_last && (codepart_last = codepart) != '0') *code_p++ = codepart; } while (code_p != code_end) *code_p++ = '0'; *code_end = '\0'; return code; } source_p++; } return SvREFCNT_inc(perl_get_sv("Text::Soundex::nocode", FALSE)); }
STATIC I32 S_do_trans_count(pTHX_ SV * const sv) { dVAR; STRLEN len; const U8 *s = (const U8*)SvPV_const(sv, len); const U8 * const send = s + len; I32 matches = 0; const short * const tbl = (short*)cPVOP->op_pv; if (!tbl) Perl_croak(aTHX_ "panic: do_trans_count line %d",__LINE__); if (!SvUTF8(sv)) { while (s < send) { if (tbl[*s++] >= 0) matches++; } } else { const I32 complement = PL_op->op_private & OPpTRANS_COMPLEMENT; while (s < send) { STRLEN ulen; const UV c = utf8n_to_uvchr(s, send - s, &ulen, UTF8_ALLOW_DEFAULT); if (c < 0x100) { if (tbl[c] >= 0) matches++; } else if (complement) matches++; s += ulen; } } return matches; }
STATIC I32 S_do_trans_count(pTHX_ SV *sv) { U8 *s; U8 *send; I32 matches = 0; STRLEN len; short *tbl; I32 complement = PL_op->op_private & OPpTRANS_COMPLEMENT; tbl = (short*)cPVOP->op_pv; if (!tbl) Perl_croak(aTHX_ "panic: do_trans_count line %d",__LINE__); s = (U8*)SvPV(sv, len); send = s + len; if (!SvUTF8(sv)) while (s < send) { if (tbl[*s++] >= 0) matches++; } else while (s < send) { UV c; STRLEN ulen; c = utf8n_to_uvchr(s, send - s, &ulen, 0); if (c < 0x100) { if (tbl[c] >= 0) matches++; } else if (complement) matches++; s += ulen; } return matches; }
STATIC I32 S_do_trans_simple(pTHX_ SV * const sv) { dVAR; I32 matches = 0; STRLEN len; U8 *s = (U8*)SvPV(sv,len); U8 * const send = s+len; const short * const tbl = (short*)cPVOP->op_pv; if (!tbl) Perl_croak(aTHX_ "panic: do_trans_simple line %d",__LINE__); /* First, take care of non-UTF-8 input strings, because they're easy */ if (!SvUTF8(sv)) { while (s < send) { const I32 ch = tbl[*s]; if (ch >= 0) { matches++; *s = (U8)ch; } s++; } SvSETMAGIC(sv); } else { const I32 grows = PL_op->op_private & OPpTRANS_GROWS; U8 *d; U8 *dstart; /* Allow for expansion: $_="a".chr(400); tr/a/\xFE/, FE needs encoding */ if (grows) Newx(d, len*2+1, U8); else d = s; dstart = d; while (s < send) { STRLEN ulen; I32 ch; /* Need to check this, otherwise 128..255 won't match */ const UV c = utf8n_to_uvchr(s, send - s, &ulen, UTF8_ALLOW_DEFAULT); if (c < 0x100 && (ch = tbl[c]) >= 0) { matches++; d = uvchr_to_utf8(d, ch); s += ulen; } else { /* No match -> copy */ Move(s, d, ulen, U8); d += ulen; s += ulen; } } if (grows) { sv_setpvn(sv, (char*)dstart, d - dstart); Safefree(dstart); } else { *d = '\0'; SvCUR_set(sv, d - dstart); } SvUTF8_on(sv); SvSETMAGIC(sv); } return matches; }
STATIC I32 S_do_trans_complex(pTHX_ SV * const sv) { dVAR; STRLEN len; U8 *s = (U8*)SvPV(sv, len); U8 * const send = s+len; I32 matches = 0; const short * const tbl = (short*)cPVOP->op_pv; if (!tbl) Perl_croak(aTHX_ "panic: do_trans_complex line %d",__LINE__); if (!SvUTF8(sv)) { U8 *d = s; U8 * const dstart = d; if (PL_op->op_private & OPpTRANS_SQUASH) { const U8* p = send; while (s < send) { const I32 ch = tbl[*s]; if (ch >= 0) { *d = (U8)ch; matches++; if (p != d - 1 || *p != *d) p = d++; } else if (ch == -1) /* -1 is unmapped character */ *d++ = *s; else if (ch == -2) /* -2 is delete character */ matches++; s++; } } else { while (s < send) { const I32 ch = tbl[*s]; if (ch >= 0) { matches++; *d++ = (U8)ch; } else if (ch == -1) /* -1 is unmapped character */ *d++ = *s; else if (ch == -2) /* -2 is delete character */ matches++; s++; } } *d = '\0'; SvCUR_set(sv, d - dstart); } else { /* is utf8 */ const I32 complement = PL_op->op_private & OPpTRANS_COMPLEMENT; const I32 grows = PL_op->op_private & OPpTRANS_GROWS; const I32 del = PL_op->op_private & OPpTRANS_DELETE; U8 *d; U8 *dstart; STRLEN rlen = 0; if (grows) Newx(d, len*2+1, U8); else d = s; dstart = d; if (complement && !del) rlen = tbl[0x100]; #ifdef MACOS_TRADITIONAL #define comp CoMP /* "comp" is a keyword in some compilers ... */ #endif if (PL_op->op_private & OPpTRANS_SQUASH) { UV pch = 0xfeedface; while (s < send) { STRLEN len; const UV comp = utf8n_to_uvchr(s, send - s, &len, UTF8_ALLOW_DEFAULT); I32 ch; if (comp > 0xff) { if (!complement) { Move(s, d, len, U8); d += len; } else { matches++; if (!del) { ch = (rlen == 0) ? (I32)comp : (comp - 0x100 < rlen) ? tbl[comp+1] : tbl[0x100+rlen]; if ((UV)ch != pch) { d = uvchr_to_utf8(d, ch); pch = (UV)ch; } s += len; continue; } } } else if ((ch = tbl[comp]) >= 0) { matches++; if ((UV)ch != pch) { d = uvchr_to_utf8(d, ch); pch = (UV)ch; } s += len; continue; } else if (ch == -1) { /* -1 is unmapped character */ Move(s, d, len, U8); d += len; } else if (ch == -2) /* -2 is delete character */ matches++; s += len; pch = 0xfeedface; } } else { while (s < send) { STRLEN len; const UV comp = utf8n_to_uvchr(s, send - s, &len, UTF8_ALLOW_DEFAULT); I32 ch; if (comp > 0xff) { if (!complement) { Move(s, d, len, U8); d += len; } else { matches++; if (!del) { if (comp - 0x100 < rlen) d = uvchr_to_utf8(d, tbl[comp+1]); else d = uvchr_to_utf8(d, tbl[0x100+rlen]); } } } else if ((ch = tbl[comp]) >= 0) { d = uvchr_to_utf8(d, ch); matches++; } else if (ch == -1) { /* -1 is unmapped character */ Move(s, d, len, U8); d += len; } else if (ch == -2) /* -2 is delete character */ matches++; s += len; } } if (grows) { sv_setpvn(sv, (char*)dstart, d - dstart); Safefree(dstart); } else { *d = '\0'; SvCUR_set(sv, d - dstart); } SvUTF8_on(sv); } SvSETMAGIC(sv); return matches; }