/* * * * TODO check for memory allocation failure */ Stringdist *open_stringdist(Distance d, int str_len_a, int str_len_b, ...){ va_list args; va_start(args, str_len_b); Stringdist *S = (Stringdist *) malloc(sizeof(Stringdist)); (*S) = (Stringdist) {d, NULL, NULL, NULL, NULL, 0L, 0.0, 0L}; switch (d){ case osa : S->work = (double *) malloc( (str_len_a + 1) * (str_len_b + 1) * sizeof(double)); S->weight = (double *) malloc(4*sizeof(double)); memcpy(S->weight, va_arg(args, double *), 4*sizeof(double)); break; case lv : S->work = (double *) malloc( (str_len_a + 1) * (str_len_b + 1) *sizeof(double)); S->weight = (double *) malloc(3 * sizeof(double)); memcpy(S->weight, va_arg(args, double *), 3*sizeof(double)); break; case dl : S->dict = new_dictionary( str_len_a + str_len_b + 1); S->work = (double *) malloc( (str_len_a + 3) * (str_len_b + 3) * sizeof(double)); S->weight = (double *) malloc(4*sizeof(double)); memcpy(S->weight, va_arg(args, double *), 4*sizeof(double)); break; case hamming : break; case lcs : S->work = (double *) malloc( (str_len_a + 1) * (str_len_b + 1) *sizeof(double)); break; case qgram : S->q = va_arg(args, unsigned int); S->tree = new_qtree(S->q, 2L); break; case cosine : S->q = va_arg(args, unsigned int); S->tree = new_qtree(S->q, 2L); break; case jaccard : S->q = va_arg(args, unsigned int); S->tree = new_qtree(S->q, 2L); break; case jw : // S->work = (double *) malloc( sizeof(double) * MAX(str_len_a,str_len_b)); S->work = (double *) malloc( sizeof(double) * (str_len_a+str_len_b)); S->weight = (double *) malloc(3L*sizeof(double)); memcpy(S->weight, va_arg(args, double *), 3*sizeof(double)); S->p = va_arg(args, double); break; case soundex : break; default : break; //TODO: set errno, return NULL }; va_end(args); return S; }
SEXP R_get_qgrams(SEXP a, SEXP qq){ PROTECT(a); PROTECT(qq); int q = INTEGER(qq)[0]; if ( q < 0 ){ UNPROTECT(2); error("q must be a nonnegative integer"); } SEXP strlist; int nstr, nchar, nLoc = length(a); unsigned int *str; // set up a tree; push all the qgrams. qtree *Q = new_qtree( q, nLoc); for ( int iLoc = 0; iLoc < nLoc; ++iLoc ){ strlist = VECTOR_ELT(a, iLoc); nstr = length(strlist); for ( int i=0; i < nstr; ++i ){ str = (unsigned int *) INTEGER(VECTOR_ELT(strlist,i)); nchar = length(VECTOR_ELT(strlist,i)); if ( str[0] == NA_INTEGER || q > nchar || ( q == 0 && nchar > 0 ) ){ continue ; } Q = push_string(str, nchar, q, Q, iLoc, nLoc); if ( Q == NULL ){ UNPROTECT(2); error("could not allocate enough memory"); } } } // pick and delete the tree int nqgram[1] = {0}; // helper variable for get_counts int index[1] = {0}; count_qtree(Q,nqgram); SEXP qgrams, qcount; PROTECT(qgrams = allocVector(INTSXP, q*nqgram[0])); PROTECT(qcount = allocVector(REALSXP, nLoc*nqgram[0])); get_counts(Q, q, INTEGER(qgrams), nLoc, index, REAL(qcount)); setAttrib(qcount, install("qgrams"), qgrams); free_qtree(); UNPROTECT(4); return(qcount); }
SEXP R_match_qgram_tree(SEXP x, SEXP table, SEXP nomatch, SEXP matchNA, SEXP qq, SEXP maxDist, SEXP distance){ PROTECT(x); PROTECT(table); PROTECT(nomatch); PROTECT(matchNA); PROTECT(qq); PROTECT(maxDist); PROTECT(distance); double max_dist = REAL(maxDist)[0] == 0.0 ? R_PosInf : REAL(maxDist)[0]; int dist = INTEGER(distance)[0] // choose distance function , q = INTEGER(qq)[0] , nx = length(x) , ntable = length(table) , no_match = INTEGER(nomatch)[0] , match_na = INTEGER(matchNA)[0] , bytes = IS_CHARACTER(x) , ml_x = max_length(x) , ml_t = max_length(table); // set up a qtree; qtree *Q = new_qtree(q, 2); unsigned int *X = NULL, *T = NULL; if (bytes){ X = (unsigned int *) malloc( (ml_x + ml_t) * sizeof(int)); if ( X == NULL){ UNPROTECT(7); error("Unable to allocate enough memory"); } T = X + ml_x; } // output vector SEXP yy; PROTECT(yy = allocVector(INTSXP, nx)); int *y = INTEGER(yy); double d = R_PosInf, d1 = R_PosInf; int index, isna_X, isna_T, len_X, len_T; for ( int i=0; i<nx; i++){ index = no_match; X = get_elem(x, i, bytes, &len_X, &isna_X, X); d1 = R_PosInf; for ( int j=0; j<ntable; j++){ T = get_elem(table, j, bytes, &len_T, &isna_T,T); if ( !isna_X && !isna_T ){ // both are char (usual case) d = qgram_tree( X, T, len_X, len_T, q, Q, dist ); if ( d == -2.0 ){ UNPROTECT(7); error("Unable to allocate enough memory for qgram storage"); } if ( d > max_dist ){ continue; } else if ( d > -1 && d < d1){ index = j + 1; if ( abs(d) < 1e-14 ) break; d1 = d; } } else if ( isna_X && isna_T ) { // both are NA index = match_na ? j + 1 : no_match; break; } } y[i] = index; } if ( bytes ) free(X); free_qtree(); UNPROTECT(8); return(yy); }
/* R interface to qgram distance */ SEXP R_qgram_tree(SEXP a, SEXP b, SEXP qq, SEXP distance){ PROTECT(a); PROTECT(b); PROTECT(qq); PROTECT(distance); // choose distance function int dist = INTEGER(distance)[0] , q = INTEGER(qq)[0] , na = length(a) , nb = length(b) , ml_a = max_length(a) , ml_b = max_length(b) , bytes = IS_CHARACTER(a); // set up a qtree; qtree *Q = new_qtree(q, 2L); unsigned int *s = NULL, *t = NULL; if ( bytes ){ s = (unsigned int *) malloc( (ml_a + ml_b) * sizeof(int) ); if ( s == NULL ){ UNPROTECT(4); error("Unable to allocate enough memory"); } t = s + ml_a; } // output int nt = (na > nb) ? na : nb; SEXP yy; PROTECT(yy = allocVector(REALSXP, nt)); double *y = REAL(yy); int i=0, j=0, len_s, len_t, isna_s, isna_t; for ( int k=0; k < nt; ++k , i = RECYCLE(i+1,na) , j = RECYCLE(j+1,nb) ){ s = get_elem(a, i, bytes, &len_s, &isna_s, s); t = get_elem(b, j, bytes, &len_t, &isna_t, t); if ( isna_s || isna_t ){ y[k] = NA_REAL; continue; } y[k] = qgram_tree(s, t, len_s, len_t, q, Q, dist); if (y[k] == -2.0){ UNPROTECT(5); error("Unable to allocate enough memory"); } if (y[k] == -1.0){ y[k] = R_PosInf; } } free_qtree(); if ( bytes ) free(s); UNPROTECT(5); return yy; }