/*
 * Recursively release a node and everything reachable below it.
 *
 * A NULL pointer is accepted and ignored. When the node's `leaf` flag is
 * unset, its four children are released first (NW, NE, SE, SW, in that
 * order); the child pointers of a leaf node are never followed. The node
 * itself is released last with free().
 */
void free_qtree(qNode *qtree_node)
{
    if (qtree_node == NULL) {
        return;
    }

    if (!qtree_node->leaf) {
        qNode *children[4] = {
            qtree_node->child_NW,
            qtree_node->child_NE,
            qtree_node->child_SE,
            qtree_node->child_SW
        };

        for (int k = 0; k < 4; ++k) {
            free_qtree(children[k]);
        }
    }

    free(qtree_node);
}
/* Push the q-grams of a string into the tree.
 *
 * str        : the string, as an array of unsigned ints
 * strlen     : number of elements in str
 * q          : q-gram size
 * Q          : tree to push into
 * iLoc, nLoc : forwarded unchanged to push()
 *
 * Every window str+i, for i = 0 .. strlen-q, is handed to push(); the tree
 * returned by one call is the tree passed to the next.
 *
 * Returns the tree returned by the last push(). When there is no window to
 * push (q > strlen, or strlen < 0) Q is returned untouched. When push()
 * returns NULL, free_qtree() is called and NULL is returned.
 */
static qtree *push_string(unsigned int *str, int strlen, unsigned int q, qtree *Q, int iLoc, int nLoc){
  /* `strlen - q + 1` mixes int and unsigned int and is therefore evaluated
   * in unsigned arithmetic: for q > strlen + 1 it wraps around to a huge
   * count instead of becoming non-positive, and the loop would read past
   * the end of str. Rule that case out before computing the count. */
  if ( strlen < 0 || q > (unsigned int) strlen ){
    return Q;
  }

  /* 1 <= n_windows <= strlen + 1 here, so no wrap-around is possible. */
  unsigned int n_windows = (unsigned int) strlen - q + 1u;

  for ( unsigned int i = 0; i < n_windows; ++i ){
    qtree *P = push(Q, str + i, q, iLoc, nLoc);
    if ( P == NULL ){
      free_qtree();
      return NULL;
    }
    Q = P;
  }
  return Q;
}
/*
 * Tear down a Stringdist object.
 *
 * Frees the work and weight buffers, then the distance-specific storage
 * (the dictionary when the distance is `dl`; the q-gram tree when it is
 * `qgram`, `cosine` or `jaccard`), and finally the object itself.
 *
 * S is dereferenced unconditionally, so it must not be NULL.
 */
void close_stringdist(Stringdist *S)
{
    /* Decide up front which optional members have to be released. */
    const int has_dictionary = (S->distance == dl);
    const int has_qgram_tree = (S->distance == qgram
                                || S->distance == cosine
                                || S->distance == jaccard);

    free(S->work);
    free(S->weight);

    if (has_dictionary) {
        free_dictionary(S->dict);
    }
    if (has_qgram_tree) {
        free_qtree(S->tree);
    }

    free(S);
}
/* Tabulate q-grams.
 *
 * a  : list whose elements are themselves lists of integer vectors; each
 *      integer vector is treated as one string.
 * qq : integer vector; its first element is q.
 *
 * All strings under a[[iLoc]] are pushed into one tree with location index
 * iLoc. A string is skipped when its first element is NA_INTEGER, when it is
 * shorter than q, or when q == 0 and the string is non-empty.
 *
 * Returns a REALSXP of length length(a) * n filled by get_counts(), where n
 * is the count reported by count_qtree(). It carries an attribute "qgrams",
 * an INTSXP of length q * n, also filled by get_counts().
 *
 * Raises an R error when q < 0 or when push_string() returns NULL.
 */
SEXP R_get_qgrams(SEXP a, SEXP qq){
  PROTECT(a);
  PROTECT(qq);

  int q = INTEGER(qq)[0];

  if ( q < 0 ){
    UNPROTECT(2);
    error("q must be a nonnegative integer");
  }

  SEXP strlist;
  int nstr, nchar, nLoc = length(a);
  unsigned int *str;

  // set up a tree; push all the qgrams.
  qtree *Q = new_qtree( q, nLoc);

  for ( int iLoc = 0; iLoc < nLoc; ++iLoc ){
    strlist = VECTOR_ELT(a, iLoc);
    nstr    = length(strlist);

    for ( int i=0; i < nstr; ++i ){
      str   = (unsigned int *) INTEGER(VECTOR_ELT(strlist,i));
      nchar = length(VECTOR_ELT(strlist,i));
      // The NA test may only look at str[0] when the vector has a first
      // element; for an empty vector that read would be out of bounds.
      if ( ( nchar > 0 && str[0] == NA_INTEGER )
          || q > nchar
          || ( q == 0 && nchar > 0 ) ){
        continue ;
      }
      Q = push_string(str, nchar, q, Q, iLoc, nLoc);
      if ( Q == NULL ){
        UNPROTECT(2);
        error("could not allocate enough memory");
      }
    }
  }

  // pick and delete the tree
  int nqgram[1] = {0};  // receives the count from count_qtree
  int index[1]  = {0};  // helper variable for get_counts

  count_qtree(Q,nqgram);

  SEXP qgrams, qcount;
  PROTECT(qgrams = allocVector(INTSXP, q*nqgram[0]));
  PROTECT(qcount = allocVector(REALSXP, nLoc*nqgram[0]));

  get_counts(Q, q, INTEGER(qgrams), nLoc, index, REAL(qcount));

  setAttrib(qcount, install("qgrams"), qgrams);

  free_qtree();
  UNPROTECT(4);

  return(qcount);
}
/* Approximate matching of x against table using a q-gram based distance.
 *
 * x, table : the strings to look up and the strings to look them up in
 * nomatch  : integer; value stored when no table element qualifies
 * matchNA  : integer flag; whether an NA in x matches an NA in table
 * qq       : integer; q-gram size
 * maxDist  : double; largest distance that still counts as a match.
 *            A value of 0.0 is treated as "no limit" (R_PosInf).
 * distance : integer code passed through to qgram_tree()
 *
 * For every x[i] the table is scanned front to back. The result is the
 * 1-based position of the table element with the smallest distance that does
 * not exceed maxDist; on ties the earliest position wins. Scanning stops
 * early when a distance within 1e-14 of zero is found. A pair is skipped
 * when exactly one side is NA, or when qgram_tree() returns a value <= -1.
 * When both sides are NA the scan stops with position j+1 if matchNA is set
 * and nomatch otherwise.
 *
 * Returns an INTSXP of length length(x).
 *
 * Raises an R error when the scratch buffer cannot be allocated or when
 * qgram_tree() returns -2.0.
 */
SEXP R_match_qgram_tree(SEXP x, SEXP table, SEXP nomatch, SEXP matchNA, SEXP qq, SEXP maxDist, SEXP distance){
  PROTECT(x);
  PROTECT(table);
  PROTECT(nomatch);
  PROTECT(matchNA);
  PROTECT(qq);
  PROTECT(maxDist);
  PROTECT(distance);

  double max_dist = REAL(maxDist)[0] == 0.0 ? R_PosInf : REAL(maxDist)[0];

  int dist = INTEGER(distance)[0]   // choose distance function
    , q = INTEGER(qq)[0]
    , nx = length(x)
    , ntable = length(table)
    , no_match = INTEGER(nomatch)[0]
    , match_na = INTEGER(matchNA)[0]
    , bytes = IS_CHARACTER(x)
    , ml_x = max_length(x)
    , ml_t = max_length(table);

  // set up a qtree;
  qtree *Q = new_qtree(q, 2);

  // When x is a character vector, get_elem() is given one scratch buffer
  // with room for the longest element of x followed by the longest element
  // of table.
  unsigned int *X = NULL, *T = NULL;

  if (bytes){
    X = (unsigned int *) malloc( (ml_x + ml_t) * sizeof(int));
    if ( X == NULL){
      UNPROTECT(7);
      error("Unable to allocate enough memory");
    }
    T = X + ml_x;
  }

  // output vector
  SEXP yy;
  PROTECT(yy = allocVector(INTSXP, nx));
  int *y = INTEGER(yy);

  double d = R_PosInf, d1 = R_PosInf;
  int index, isna_X, isna_T, len_X, len_T;

  for ( int i=0; i<nx; i++){
    index = no_match;
    X = get_elem(x, i, bytes, &len_X, &isna_X, X);
    d1 = R_PosInf;  // smallest distance seen so far for x[i]

    for ( int j=0; j<ntable; j++){
      T = get_elem(table, j, bytes, &len_T, &isna_T, T);

      if ( !isna_X && !isna_T ){        // both are char (usual case)
        d = qgram_tree( X, T, len_X, len_T, q, Q, dist );
        if ( d == -2.0 ){
          // error() does not return, so the scratch buffer has to be
          // released here, the same way it is on the normal exit path.
          if ( bytes ) free(X);
          UNPROTECT(7);
          error("Unable to allocate enough memory for qgram storage");
        }
        if ( d > max_dist ){
          continue;
        } else if ( d > -1 && d < d1){
          index = j + 1;
          // Exact match: nothing closer can follow. The test is spelled out
          // on the double itself; abs() is the integer function and would
          // truncate every distance in (-1, 1) to 0, ending the search at
          // the first candidate instead of the best one.
          if ( d > -1e-14 && d < 1e-14 ) break;
          d1 = d;
        }
      } else if ( isna_X && isna_T ) {  // both are NA
        index = match_na ? j + 1 : no_match;
        break;
      }
    }

    y[i] = index;
  }

  if ( bytes ) free(X);
  free_qtree();
  UNPROTECT(8);
  return(yy);
}
/* R interface to qgram distance
 *
 * a, b     : the strings to compare; the shorter argument is recycled
 * qq       : integer; q-gram size
 * distance : integer code passed through to qgram_tree()
 *
 * Returns a REALSXP of length max(length(a), length(b)). Element k is
 * NA_REAL when either string of the pair is NA, R_PosInf when qgram_tree()
 * returns -1.0, and the value returned by qgram_tree() otherwise.
 *
 * Raises an R error when the scratch buffer cannot be allocated or when
 * qgram_tree() returns -2.0.
 */
SEXP R_qgram_tree(SEXP a, SEXP b, SEXP qq, SEXP distance){
  PROTECT(a);
  PROTECT(b);
  PROTECT(qq);
  PROTECT(distance);

  // choose distance function
  int dist = INTEGER(distance)[0]
    , q = INTEGER(qq)[0]
    , na = length(a)
    , nb = length(b)
    , ml_a = max_length(a)
    , ml_b = max_length(b)
    , bytes = IS_CHARACTER(a);

  // set up a qtree;
  qtree *Q = new_qtree(q, 2L);

  // When a is a character vector, get_elem() is given one scratch buffer
  // with room for the longest element of a followed by the longest element
  // of b.
  unsigned int *s = NULL, *t = NULL;
  if ( bytes ){
    s = (unsigned int *) malloc( (ml_a + ml_b) * sizeof(int) );
    if ( s == NULL ){
      UNPROTECT(4);
      error("Unable to allocate enough memory");
    }
    t = s + ml_a;
  }

  // output
  int nt = (na > nb) ? na : nb;
  SEXP yy;
  PROTECT(yy = allocVector(REALSXP, nt));
  double *y = REAL(yy);

  int i=0, j=0, len_s, len_t, isna_s, isna_t;
  for ( int k=0; k < nt; ++k
      , i = RECYCLE(i+1,na)
      , j = RECYCLE(j+1,nb) ){

    s = get_elem(a, i, bytes, &len_s, &isna_s, s);
    t = get_elem(b, j, bytes, &len_t, &isna_t, t);
    if ( isna_s || isna_t ){
      y[k] = NA_REAL;
      continue;
    }

    y[k] = qgram_tree(s, t, len_s, len_t, q, Q, dist);

    if (y[k] == -2.0){
      // error() does not return, so the scratch buffer has to be released
      // here, the same way it is on the normal exit path.
      if ( bytes ) free(s);
      UNPROTECT(5);
      error("Unable to allocate enough memory");
    }
    if (y[k] == -1.0){
      y[k] = R_PosInf;
    }
  }

  free_qtree();
  if ( bytes ) free(s);
  UNPROTECT(5);
  return yy;
}