예제 #1
0
static SV *sv_soundex_utf8 (SV* source)
{
  U8 *source_p;
  U8 *source_end;

  {
    STRLEN source_len;
    source_p = (U8 *) SvPV(source, source_len);
    source_end = &source_p[source_len];
  }

  while (source_p < source_end)
    {
      STRLEN offset;
      UV c = utf8n_to_uvchr(source_p, source_end-source_p, &offset, 0);
      char codepart_last = (c <= 0xFF) ? sv_soundex_table[c] : '\0';
      source_p = (offset >= 1) ? &source_p[offset] : source_end;

      if (codepart_last != '\0')
        {
          SV   *code     = newSV(SOUNDEX_ACCURACY);
          char *code_p   = SvPVX(code);
          char *code_end = &code_p[SOUNDEX_ACCURACY];

          SvCUR_set(code, SOUNDEX_ACCURACY);
          SvPOK_only(code);

          *code_p++ = toupper(c);

          while (source_p != source_end && code_p != code_end)
            {
              char codepart;
              c = utf8n_to_uvchr(source_p, source_end-source_p, &offset, 0);
              codepart = (c <= 0xFF) ? sv_soundex_table[c] : '\0';
              source_p = (offset >= 1) ? &source_p[offset] : source_end;

              if (codepart != '\0')
                if (codepart != codepart_last && (codepart_last = codepart) != '0')
                  *code_p++ = codepart;
            }

          while (code_p != code_end)
            *code_p++ = '0';

          *code_end = '\0';

          return code;
        }

      source_p++;
    }

  return SvREFCNT_inc(perl_get_sv("Text::Soundex::nocode", FALSE));
}
예제 #2
0
STATIC I32
S_do_trans_count(pTHX_ SV * const sv)
{
    dVAR;
    STRLEN len;
    const U8 *s = (const U8*)SvPV_const(sv, len);
    const U8 * const send = s + len;
    I32 matches = 0;

    const short * const tbl = (short*)cPVOP->op_pv;
    if (!tbl)
	Perl_croak(aTHX_ "panic: do_trans_count line %d",__LINE__);

    if (!SvUTF8(sv)) {
	while (s < send) {
            if (tbl[*s++] >= 0)
                matches++;
	}
    }
    else {
	const I32 complement = PL_op->op_private & OPpTRANS_COMPLEMENT;
	while (s < send) {
	    STRLEN ulen;
	    const UV c = utf8n_to_uvchr(s, send - s, &ulen, UTF8_ALLOW_DEFAULT);
	    if (c < 0x100) {
		if (tbl[c] >= 0)
		    matches++;
	    } else if (complement)
		matches++;
	    s += ulen;
	}
    }

    return matches;
}
예제 #3
0
파일: doop.c 프로젝트: coder-chenzhi/daikon
STATIC I32
S_do_trans_count(pTHX_ SV *sv)
{
    U8 *s;
    U8 *send;
    I32 matches = 0;
    STRLEN len;
    short *tbl;
    I32 complement = PL_op->op_private & OPpTRANS_COMPLEMENT;

    tbl = (short*)cPVOP->op_pv;
    if (!tbl)
	Perl_croak(aTHX_ "panic: do_trans_count line %d",__LINE__);

    s = (U8*)SvPV(sv, len);
    send = s + len;

    if (!SvUTF8(sv))
	while (s < send) {
            if (tbl[*s++] >= 0)
                matches++;
	}
    else
	while (s < send) {
	    UV c;
	    STRLEN ulen;
	    c = utf8n_to_uvchr(s, send - s, &ulen, 0);
	    if (c < 0x100) {
		if (tbl[c] >= 0)
		    matches++;
	    } else if (complement)
		matches++;
	    s += ulen;
	}

    return matches;
}
예제 #4
0
STATIC I32
S_do_trans_simple(pTHX_ SV * const sv)
{
    dVAR;
    I32 matches = 0;
    STRLEN len;
    U8 *s = (U8*)SvPV(sv,len);
    U8 * const send = s+len;

    const short * const tbl = (short*)cPVOP->op_pv;
    if (!tbl)
	Perl_croak(aTHX_ "panic: do_trans_simple line %d",__LINE__);

    /* First, take care of non-UTF-8 input strings, because they're easy */
    if (!SvUTF8(sv)) {
	while (s < send) {
	    const I32 ch = tbl[*s];
	    if (ch >= 0) {
		matches++;
		*s = (U8)ch;
	    }
	    s++;
	}
	SvSETMAGIC(sv);
    }
    else {
	const I32 grows = PL_op->op_private & OPpTRANS_GROWS;
	U8 *d;
	U8 *dstart;

	/* Allow for expansion: $_="a".chr(400); tr/a/\xFE/, FE needs encoding */
	if (grows)
	    Newx(d, len*2+1, U8);
	else
	    d = s;
	dstart = d;
	while (s < send) {
	    STRLEN ulen;
	    I32 ch;

	    /* Need to check this, otherwise 128..255 won't match */
	    const UV c = utf8n_to_uvchr(s, send - s, &ulen, UTF8_ALLOW_DEFAULT);
	    if (c < 0x100 && (ch = tbl[c]) >= 0) {
		matches++;
		d = uvchr_to_utf8(d, ch);
		s += ulen;
	    }
	    else { /* No match -> copy */
		Move(s, d, ulen, U8);
		d += ulen;
		s += ulen;
	    }
	}
	if (grows) {
	    sv_setpvn(sv, (char*)dstart, d - dstart);
	    Safefree(dstart);
	}
	else {
	    *d = '\0';
	    SvCUR_set(sv, d - dstart);
	}
	SvUTF8_on(sv);
	SvSETMAGIC(sv);
    }
    return matches;
}
예제 #5
0
STATIC I32
S_do_trans_complex(pTHX_ SV * const sv)
{
    dVAR;
    STRLEN len;
    U8 *s = (U8*)SvPV(sv, len);
    U8 * const send = s+len;
    I32 matches = 0;

    const short * const tbl = (short*)cPVOP->op_pv;
    if (!tbl)
	Perl_croak(aTHX_ "panic: do_trans_complex line %d",__LINE__);

    if (!SvUTF8(sv)) {
	U8 *d = s;
	U8 * const dstart = d;

	if (PL_op->op_private & OPpTRANS_SQUASH) {
	    const U8* p = send;
	    while (s < send) {
		const I32 ch = tbl[*s];
		if (ch >= 0) {
		    *d = (U8)ch;
		    matches++;
		    if (p != d - 1 || *p != *d)
			p = d++;
		}
		else if (ch == -1)	/* -1 is unmapped character */
		    *d++ = *s;	
		else if (ch == -2)	/* -2 is delete character */
		    matches++;
		s++;
	    }
	}
	else {
	    while (s < send) {
		const I32 ch = tbl[*s];
		if (ch >= 0) {
		    matches++;
		    *d++ = (U8)ch;
		}
		else if (ch == -1)	/* -1 is unmapped character */
		    *d++ = *s;
		else if (ch == -2)      /* -2 is delete character */
		    matches++;
		s++;
	    }
	}
	*d = '\0';
	SvCUR_set(sv, d - dstart);
    }
    else { /* is utf8 */
	const I32 complement = PL_op->op_private & OPpTRANS_COMPLEMENT;
	const I32 grows = PL_op->op_private & OPpTRANS_GROWS;
	const I32 del = PL_op->op_private & OPpTRANS_DELETE;
	U8 *d;
	U8 *dstart;
	STRLEN rlen = 0;

	if (grows)
	    Newx(d, len*2+1, U8);
	else
	    d = s;
	dstart = d;
	if (complement && !del)
	    rlen = tbl[0x100];

#ifdef MACOS_TRADITIONAL
#define comp CoMP   /* "comp" is a keyword in some compilers ... */
#endif

	if (PL_op->op_private & OPpTRANS_SQUASH) {
	    UV pch = 0xfeedface;
	    while (s < send) {
		STRLEN len;
		const UV comp = utf8n_to_uvchr(s, send - s, &len,
					       UTF8_ALLOW_DEFAULT);
		I32 ch;

		if (comp > 0xff) {
		    if (!complement) {
			Move(s, d, len, U8);
			d += len;
		    }
		    else {
			matches++;
			if (!del) {
			    ch = (rlen == 0) ? (I32)comp :
				(comp - 0x100 < rlen) ?
				tbl[comp+1] : tbl[0x100+rlen];
			    if ((UV)ch != pch) {
				d = uvchr_to_utf8(d, ch);
				pch = (UV)ch;
			    }
			    s += len;
			    continue;
			}
		    }
		}
		else if ((ch = tbl[comp]) >= 0) {
		    matches++;
		    if ((UV)ch != pch) {
		        d = uvchr_to_utf8(d, ch);
		        pch = (UV)ch;
		    }
		    s += len;
		    continue;
		}
		else if (ch == -1) {	/* -1 is unmapped character */
		    Move(s, d, len, U8);
		    d += len;
		}
		else if (ch == -2)      /* -2 is delete character */
		    matches++;
		s += len;
		pch = 0xfeedface;
	    }
	}
	else {
	    while (s < send) {
		STRLEN len;
		const UV comp = utf8n_to_uvchr(s, send - s, &len,
					       UTF8_ALLOW_DEFAULT);
		I32 ch;
		if (comp > 0xff) {
		    if (!complement) {
			Move(s, d, len, U8);
			d += len;
		    }
		    else {
			matches++;
			if (!del) {
			    if (comp - 0x100 < rlen)
				d = uvchr_to_utf8(d, tbl[comp+1]);
			    else
				d = uvchr_to_utf8(d, tbl[0x100+rlen]);
			}
		    }
		}
		else if ((ch = tbl[comp]) >= 0) {
		    d = uvchr_to_utf8(d, ch);
		    matches++;
		}
		else if (ch == -1) {	/* -1 is unmapped character */
		    Move(s, d, len, U8);
		    d += len;
		}
		else if (ch == -2)      /* -2 is delete character */
		    matches++;
		s += len;
	    }
	}
	if (grows) {
	    sv_setpvn(sv, (char*)dstart, d - dstart);
	    Safefree(dstart);
	}
	else {
	    *d = '\0';
	    SvCUR_set(sv, d - dstart);
	}
	SvUTF8_on(sv);
    }
    SvSETMAGIC(sv);
    return matches;
}