Exemple #1
0
/* 
 *
 *
 * TODO check for memory allocation failure
 */
Stringdist *open_stringdist(Distance d, int str_len_a, int str_len_b, ...){
  va_list args;
  va_start(args, str_len_b);

  Stringdist *S = (Stringdist *) malloc(sizeof(Stringdist)); 
  (*S) = (Stringdist) {d, NULL, NULL, NULL, NULL, 0L, 0.0, 0L};
  switch (d){
    case osa :
      S->work = (double *) malloc( (str_len_a + 1) * (str_len_b + 1) * sizeof(double)); 
      S->weight = (double *) malloc(4*sizeof(double));
      memcpy(S->weight, va_arg(args, double *), 4*sizeof(double));
      break;
    case lv :
      S->work = (double *) malloc( (str_len_a + 1) * (str_len_b + 1) *sizeof(double));
      S->weight = (double *) malloc(3 * sizeof(double));
      memcpy(S->weight, va_arg(args, double *), 3*sizeof(double));
      break;
    case dl :
      S->dict = new_dictionary( str_len_a + str_len_b + 1);
      S->work = (double *) malloc( (str_len_a + 3) * (str_len_b + 3) * sizeof(double)); 
      S->weight = (double *) malloc(4*sizeof(double));
      memcpy(S->weight, va_arg(args, double *), 4*sizeof(double));
      break;
    case hamming :
      break;
    case lcs :
      S->work = (double *) malloc( (str_len_a + 1) * (str_len_b + 1) *sizeof(double));
      break;
    case qgram :
      S->q = va_arg(args, unsigned int);
      S->tree = new_qtree(S->q, 2L); 
      break;
    case cosine :
      S->q = va_arg(args, unsigned int);
      S->tree = new_qtree(S->q, 2L); 
      break;
    case jaccard :
      S->q = va_arg(args, unsigned int);
      S->tree = new_qtree(S->q, 2L); 
      break;
    case jw :
//      S->work = (double *) malloc( sizeof(double) * MAX(str_len_a,str_len_b));
      S->work = (double *) malloc( sizeof(double) * (str_len_a+str_len_b));

      S->weight = (double *) malloc(3L*sizeof(double));
      memcpy(S->weight, va_arg(args, double *), 3*sizeof(double));
      S->p = va_arg(args, double);
      break;
    case soundex :
      break;
    default :
      break;
      //TODO: set errno, return NULL
  };

  va_end(args);
  return S; 
  
}
Exemple #2
0
SEXP R_get_qgrams(SEXP a, SEXP qq){
  PROTECT(a);
  PROTECT(qq);

  int q = INTEGER(qq)[0];

  if ( q < 0 ){
    UNPROTECT(2);
    error("q must be a nonnegative integer");
  }


  SEXP strlist;
  int nstr, nchar, nLoc = length(a);
  unsigned int *str;
  
  // set up a tree; push all the qgrams.
  qtree *Q = new_qtree( q, nLoc);
  
  for ( int iLoc = 0; iLoc < nLoc; ++iLoc ){
    strlist = VECTOR_ELT(a, iLoc);
    nstr    = length(strlist);
 
    for ( int i=0; i < nstr; ++i ){
      str   = (unsigned int *) INTEGER(VECTOR_ELT(strlist,i));
      nchar = length(VECTOR_ELT(strlist,i));
      if ( str[0] == NA_INTEGER 
          || q > nchar
          || ( q == 0 && nchar > 0 )
        ){
        continue ;
      }
      Q = push_string(str, nchar, q, Q, iLoc, nLoc);
      if ( Q == NULL ){
        UNPROTECT(2);
        error("could not allocate enough memory");
      }
    }
  }
  // pick and delete the tree

  int nqgram[1] = {0};

  // helper variable for get_counts 
  int index[1] = {0};

  count_qtree(Q,nqgram);  

  SEXP qgrams, qcount;
  PROTECT(qgrams = allocVector(INTSXP, q*nqgram[0]));
  PROTECT(qcount = allocVector(REALSXP, nLoc*nqgram[0]));

  get_counts(Q, q, INTEGER(qgrams), nLoc, index, REAL(qcount));
  
  setAttrib(qcount, install("qgrams"), qgrams);
  
  free_qtree();
  UNPROTECT(4);

  return(qcount);
}
Exemple #3
0
SEXP R_match_qgram_tree(SEXP x, SEXP table, SEXP nomatch, SEXP matchNA, SEXP qq, SEXP maxDist, SEXP distance){
  PROTECT(x);
  PROTECT(table);
  PROTECT(nomatch);
  PROTECT(matchNA);
  PROTECT(qq);
  PROTECT(maxDist);
  PROTECT(distance);

  double max_dist = REAL(maxDist)[0] == 0.0 ? R_PosInf : REAL(maxDist)[0];
  
  
  int dist = INTEGER(distance)[0] // choose distance function
    , q = INTEGER(qq)[0]
    , nx = length(x)
    , ntable = length(table)
    , no_match = INTEGER(nomatch)[0]
    , match_na = INTEGER(matchNA)[0]
    , bytes = IS_CHARACTER(x)
    , ml_x = max_length(x)
    , ml_t = max_length(table);
  
  // set up a qtree;
  qtree *Q = new_qtree(q, 2);

  unsigned int *X = NULL, *T = NULL;
  if (bytes){
    X = (unsigned int *) malloc( (ml_x + ml_t) * sizeof(int));
    if ( X == NULL){
      UNPROTECT(7);
      error("Unable to allocate enough memory");
    }
    T = X + ml_x;
  }

  // output vector
  SEXP yy;
  PROTECT(yy = allocVector(INTSXP, nx));
  int *y = INTEGER(yy);


  double d = R_PosInf, d1 = R_PosInf;
  int index, isna_X, isna_T, len_X, len_T;

  for ( int i=0; i<nx; i++){
    index = no_match;
    X = get_elem(x, i, bytes, &len_X, &isna_X, X);
    d1 = R_PosInf;
    for ( int j=0; j<ntable; j++){

      T = get_elem(table, j, bytes, &len_T, &isna_T,T);

      if ( !isna_X && !isna_T ){        // both are char (usual case)
        d = qgram_tree(
          X, T, len_X, len_T, q, Q, dist
        );
        if ( d == -2.0 ){
          UNPROTECT(7);
          error("Unable to allocate enough memory for qgram storage");
        }
        if ( d > max_dist ){
          continue;
        } else if ( d > -1 && d < d1){ 
          index = j + 1;
          if ( abs(d) < 1e-14 ) break; 
          d1 = d;
        }
      } else if ( isna_X && isna_T ) {  // both are NA
        index = match_na ? j + 1 : no_match;
        break;
      }
    }
    
    y[i] = index;
  } 
  if ( bytes ) free(X);
  free_qtree();
  UNPROTECT(8);
  return(yy);
}
Exemple #4
0
/* R interface to qgram distance */
SEXP R_qgram_tree(SEXP a, SEXP b, SEXP qq, SEXP distance){
  PROTECT(a);
  PROTECT(b);
  PROTECT(qq);
  PROTECT(distance);
  // choose distance function

  int dist = INTEGER(distance)[0]
    , q = INTEGER(qq)[0]
    , na = length(a)
    , nb = length(b)
    , ml_a = max_length(a)
    , ml_b = max_length(b)
    , bytes = IS_CHARACTER(a);

  // set up a qtree; 
  qtree *Q = new_qtree(q, 2L);
  unsigned int *s = NULL, *t = NULL;
  if ( bytes ){
    s = (unsigned int *) malloc( (ml_a + ml_b) * sizeof(int) );
    if ( s == NULL ){ 
      UNPROTECT(4);
      error("Unable to allocate enough memory");
    }
    t = s + ml_a;
  }

  // output
  int nt = (na > nb) ? na : nb;
  SEXP yy; 
  PROTECT(yy = allocVector(REALSXP, nt));
  double *y = REAL(yy);

  
 
  int i=0, j=0, len_s, len_t, isna_s, isna_t;
  for ( int k=0; k < nt; ++k 
      , i = RECYCLE(i+1,na)
      , j = RECYCLE(j+1,nb) ){

    s = get_elem(a, i, bytes, &len_s, &isna_s, s);
    t = get_elem(b, j, bytes, &len_t, &isna_t, t);

    if ( isna_s || isna_t ){
      y[k] = NA_REAL;
      continue;
    }
    y[k] = qgram_tree(s, t, len_s, len_t, q, Q, dist);
    if (y[k] == -2.0){
      UNPROTECT(5);
      error("Unable to allocate enough memory");
    }
    if (y[k] == -1.0){
      y[k] = R_PosInf;
    }
  }


  free_qtree();
  if ( bytes ) free(s);
  UNPROTECT(5);
  return yy;
}