/* Compute a q-gram based distance between two strings.
 *
 * Input
 *   s, x     : first string and its length
 *   t, y     : second string and its length
 *   q        : the 'q' in q-gram
 *   Qp       : address of the qtree pointer; it is overwritten with the
 *              tree returned by push_string after each string is pushed
 *   distance : distance function to compute:
 *                0 : q-gram distance
 *                1 : cosine distance
 *                2 : jaccard distance
 *              any other value leaves the result at its initial 0.
 *
 * Return value
 *   The requested distance. 0 is returned when q == 0, and also when the
 *   tree pointer is NULL after both strings have been pushed.
 */
double qgram_dist(
    unsigned int *s,
    int x,
    unsigned int *t,
    int y,
    unsigned int q,
    qtree **Qp,
    int distance
  ){
  double acc[3] = {0.0, 0.0, 0.0};

  // Rare edge case: q == 0. We return 0 for every input in that case.
  // (In the R journal paper Inf was used when q = 0 and |s| or |t| > 0.)
  if ( q == 0 ){
    return 0.0;
  }

  // Push both strings; the caller's pointer is refreshed after each push.
  qtree *tree = push_string(s, x, q, *Qp, 0, 2);
  *Qp = tree;
  tree = push_string(t, y, q, tree, 1, 2);
  *Qp = tree;

  if ( tree == NULL ){
    return 0;
  }

  if ( distance == 0 ){
    getdist(tree, acc);
  } else if ( distance == 1 ){
    getcosine(tree, acc);
    int all_equal = (acc[0] == acc[1]) && (acc[0] == acc[2]);
    if ( all_equal ){
      // Strings are equal: avoid machine rounding around 0.0.
      acc[0] = 0.0;
    } else {
      // Several equivalent formulations exist (some give exactly 0 for
      // equal strings), but this one has the least chance of overflow.
      // fabs() guards against a numerical -0.
      double denom = sqrt(acc[1]) * sqrt(acc[2]);
      acc[0] = fabs(1.0 - acc[0] / denom);
    }
  } else if ( distance == 2 ){
    getjaccard(tree, acc);
    acc[0] = 1.0 - acc[0] / acc[1];
  }

  return acc[0];
}
/* Get qgram distances
 *
 * Input
 *   s        : a string
 *   x        : length of s
 *   t        : a string
 *   y        : length of t
 *   q        : the 'q' in q-gram
 *   Q        : a qtree
 *   distance : distance function to compute:
 *                0 : q-gram distance
 *                1 : cosine distance
 *                2 : jaccard distance
 *              any other value leaves the result at its initial 0.
 *
 * Return values:
 *   >=0 : qgram distance (0 whenever q == 0)
 *   -1  : infinite distance (q exceeds the length of the shorter string)
 *   -2  : push_string returned NULL (reported as "not enough memory")
 */
double qgram_dist(
    unsigned int *s,
    int x,
    unsigned int *t,
    int y,
    unsigned int q,
    qtree *Q,
    int distance
  ){
  // Return -1 when q is larger than the length of the shortest string.
  // The cast makes the signed -> unsigned conversion, which the comparison
  // performs anyway, explicit.
  int shortest = (x <= y) ? x : y;
  if ( q > (unsigned int) shortest ) return -1.0;

  // Rare edge case: q == 0. Note that we return 0 for all cases where
  // q equals zero. In the R journal paper we used Inf for cases where
  // q = 0 and |s| or |t| > 0.
  if ( q == 0 ) return 0.0;

  double dist[3] = {0, 0, 0};

  Q = push_string(s, x, q, Q, 0, 2);
  if ( Q == NULL ) return -2.0;
  Q = push_string(t, y, q, Q, 1, 2);
  if ( Q == NULL ) return -2.0;

  switch ( distance ){
    case 0:
      getdist(Q, dist);
      break;
    case 1:
      getcosine(Q, dist);
      if ( dist[0] == dist[1] && dist[0] == dist[2] ){
        // Strings are equal. Prevent machine rounding about 0.0.
        dist[0] = 0.0;
      } else {
        // There are several ways to express the rhs (including ones that
        // give 0 at equal strings) but this has least chance of overflow.
        // fabs() keeps rounding from producing -0.0 or a tiny negative
        // value, so the reported distance is never negative.
        dist[0] = fabs(1.0 - dist[0] / (sqrt(dist[1]) * sqrt(dist[2])));
      }
      break;
    case 2:
      getjaccard(Q, dist);
      dist[0] = 1.0 - dist[0] / dist[1];
      break;
    default:
      break;
  }

  return dist[0];
}