Esempio n. 1
0
double stringdist(Stringdist *S, unsigned int *str_a, int len_a, unsigned int *str_b, int len_b){
  double d = -1.0;

  switch(S->distance){
    case osa :
     return osa_dist(str_a, len_a, str_b, len_b, S->weight, S->work);
    case lv :
      return lv_dist( str_a, len_a, str_b, len_b, S->weight, S->work);
    case dl :
      return dl_dist(str_a, len_a, str_b, len_b, S->weight, S->dict, S->work);
    case hamming :
      return hamming_dist(str_a, len_a, str_b, len_b);
    case lcs :
      return lcs_dist(str_a, len_a, str_b, len_b, S->work); 
    case qgram :
      return qgram_dist(str_a, len_a, str_b, len_b, S->q, S->tree, 0L);
    case cosine :
      return qgram_dist(str_a, len_a, str_b, len_b, S->q, S->tree, 1L);
    case jaccard :
      d = qgram_dist(str_a, len_a, str_b, len_b, S->q, S->tree, 2L);
      break;
    case jw :
      return jaro_winkler_dist(str_a, len_a, str_b, len_b, S->p, S->weight, S->work);
    case soundex :
      return soundex_dist(str_a, len_a, str_b, len_b, &(S->ifail));
    default :
      break;
      // set errno, return -1
  }
    return d;
}
Esempio n. 2
0
SEXP R_match_soundex(SEXP x, SEXP table, SEXP nomatch, SEXP matchNA) {

  int nx = length(x);
  int ntable = length(table);
  int no_match = INTEGER(nomatch)[0];
  int match_na = INTEGER(matchNA)[0];
  int bytes = IS_CHARACTER(x);

  // when a and b are character vectors; create unsigned int vectors in which
  // the elements of and b will be copied
  unsigned int *s = NULL, *t = NULL;
  if (bytes) {
    int ml_x = max_length(x);
    int ml_t = max_length(table);
    s = (unsigned int *) malloc((ml_x + ml_t) * sizeof(unsigned int));
    t = s + ml_x;
    if (s == NULL) {
       free(s);
       error("Unable to allocate enough memory");
    }
  }

  // output vector
  SEXP yy = allocVector(INTSXP, nx);
  PROTECT(yy);
  int* y = INTEGER(yy);

  int index, isna_s, isna_t, len_s, len_t;
  unsigned int nfail = 0;
  double d;
  for (int i=0; i<nx; ++i) {
    index = no_match;
    s = get_elem(x, i, bytes, &len_s, &isna_s, s);

    for (int j=0; j<ntable; ++j) {
      t = get_elem(table, j, bytes, &len_t, &isna_t, t);

      if (!isna_s && !isna_t) {        // both are char (usual case)
        d = soundex_dist(s, t, len_s, len_t, &nfail);
        if (d < 0.5) { // exact match as d can only take on values 0 and 1
          index = j + 1;
          break;
        } 
      } else if (isna_s && isna_t) {  // both are NA
        index = match_na ? j + 1 : no_match;
        break;
      }
    }
    y[i] = index;
  }   
  // cleanup and return
  check_fail(nfail);
  if (bytes) free(s); 
  UNPROTECT(1);
  return(yy);
}
Esempio n. 3
0
SEXP R_soundex_dist(SEXP a, SEXP b) {
  int na = length(a);
  int nb = length(b);
  int nt = MAX(na,nb);
  int bytes = IS_CHARACTER(a);

  // when a and b are character vectors; create unsigned int vectors in which
  // the elements of and b will be copied
  unsigned int *s = NULL, *t = NULL;
  if (bytes) {
    int ml_a = max_length(a);
    int ml_b = max_length(b);
    s = (unsigned int *) malloc((ml_a + ml_b) * sizeof(unsigned int));
    t = s + ml_a;
    if (s == NULL) {
       free(s);
       error("Unable to allocate enough memory");
    }
  }

  // create output variable
  SEXP yy = allocVector(REALSXP, nt);
  PROTECT(yy);
  double *y = REAL(yy);

  // compute distances, skipping NA's
  unsigned int nfail = 0;
  int len_s, len_t, isna_s, isna_t;
  for (int k=0; k < nt; ++k, ++y) {
    s = get_elem(a, k % na, bytes, &len_s, &isna_s, s);
    t = get_elem(b, k % nb, bytes, &len_t, &isna_t, t);
    if (isna_s || isna_t) {
      (*y) = NA_REAL;
    } else { 
      (*y) = soundex_dist(s, t, len_s, len_t, &nfail);
    } 
  }
  // cleanup and return
  check_fail(nfail);
  if (bytes) free(s);
  UNPROTECT(1);
  return yy;
}