double stringdist(Stringdist *S, unsigned int *str_a, int len_a, unsigned int *str_b, int len_b){ double d = -1.0; switch(S->distance){ case osa : return osa_dist(str_a, len_a, str_b, len_b, S->weight, S->work); case lv : return lv_dist( str_a, len_a, str_b, len_b, S->weight, S->work); case dl : return dl_dist(str_a, len_a, str_b, len_b, S->weight, S->dict, S->work); case hamming : return hamming_dist(str_a, len_a, str_b, len_b); case lcs : return lcs_dist(str_a, len_a, str_b, len_b, S->work); case qgram : return qgram_dist(str_a, len_a, str_b, len_b, S->q, S->tree, 0L); case cosine : return qgram_dist(str_a, len_a, str_b, len_b, S->q, S->tree, 1L); case jaccard : d = qgram_dist(str_a, len_a, str_b, len_b, S->q, S->tree, 2L); break; case jw : return jaro_winkler_dist(str_a, len_a, str_b, len_b, S->p, S->weight, S->work); case soundex : return soundex_dist(str_a, len_a, str_b, len_b, &(S->ifail)); default : break; // set errno, return -1 } return d; }
SEXP R_match_soundex(SEXP x, SEXP table, SEXP nomatch, SEXP matchNA) { int nx = length(x); int ntable = length(table); int no_match = INTEGER(nomatch)[0]; int match_na = INTEGER(matchNA)[0]; int bytes = IS_CHARACTER(x); // when a and b are character vectors; create unsigned int vectors in which // the elements of and b will be copied unsigned int *s = NULL, *t = NULL; if (bytes) { int ml_x = max_length(x); int ml_t = max_length(table); s = (unsigned int *) malloc((ml_x + ml_t) * sizeof(unsigned int)); t = s + ml_x; if (s == NULL) { free(s); error("Unable to allocate enough memory"); } } // output vector SEXP yy = allocVector(INTSXP, nx); PROTECT(yy); int* y = INTEGER(yy); int index, isna_s, isna_t, len_s, len_t; unsigned int nfail = 0; double d; for (int i=0; i<nx; ++i) { index = no_match; s = get_elem(x, i, bytes, &len_s, &isna_s, s); for (int j=0; j<ntable; ++j) { t = get_elem(table, j, bytes, &len_t, &isna_t, t); if (!isna_s && !isna_t) { // both are char (usual case) d = soundex_dist(s, t, len_s, len_t, &nfail); if (d < 0.5) { // exact match as d can only take on values 0 and 1 index = j + 1; break; } } else if (isna_s && isna_t) { // both are NA index = match_na ? j + 1 : no_match; break; } } y[i] = index; } // cleanup and return check_fail(nfail); if (bytes) free(s); UNPROTECT(1); return(yy); }
SEXP R_soundex_dist(SEXP a, SEXP b) { int na = length(a); int nb = length(b); int nt = MAX(na,nb); int bytes = IS_CHARACTER(a); // when a and b are character vectors; create unsigned int vectors in which // the elements of and b will be copied unsigned int *s = NULL, *t = NULL; if (bytes) { int ml_a = max_length(a); int ml_b = max_length(b); s = (unsigned int *) malloc((ml_a + ml_b) * sizeof(unsigned int)); t = s + ml_a; if (s == NULL) { free(s); error("Unable to allocate enough memory"); } } // create output variable SEXP yy = allocVector(REALSXP, nt); PROTECT(yy); double *y = REAL(yy); // compute distances, skipping NA's unsigned int nfail = 0; int len_s, len_t, isna_s, isna_t; for (int k=0; k < nt; ++k, ++y) { s = get_elem(a, k % na, bytes, &len_s, &isna_s, s); t = get_elem(b, k % nb, bytes, &len_t, &isna_t, t); if (isna_s || isna_t) { (*y) = NA_REAL; } else { (*y) = soundex_dist(s, t, len_s, len_t, &nfail); } } // cleanup and return check_fail(nfail); if (bytes) free(s); UNPROTECT(1); return yy; }