/* Collect the ingredients of the cosine distance from a q-gram tree.
 *
 * Every node contributes its two counters n[0] and n[1] to d:
 *   d[0] += n[0]*n[1]   (inner product x.y)
 *   d[1] += n[0]*n[0]   (squared norm of x)
 *   d[2] += n[1]*n[1]   (squared norm of y)
 * After a node has been read, both of its counters are set to 0 so the
 * tree can be reused. d is only added to; it is never initialised here.
 */
static void getcosine(qtree *Q, double *d){
  // Pre-order walk: recurse into the left child, iterate down the right one.
  while ( Q != NULL ){
    const double a = (double) Q->n[0];
    const double b = (double) Q->n[1];

    d[0] += a * b;  // inner product
    d[1] += a * a;  // squared norm, first string
    d[2] += b * b;  // squared norm, second string

    // clean up before moving on
    Q->n[0] = 0;
    Q->n[1] = 0;

    getcosine(Q->left, d);
    Q = Q->right;
  }
}
/* Get a q-gram based distance between two strings.
 *
 * Input
 * s: a string
 * t: a string
 * x: length of s
 * y: length of t
 * q: the 'q' in q-gram
 * Q: a qtree; the q-grams of both strings are pushed into it
 * distance: distance function to compute:
 *   0 : q-gram distance
 *   1 : cosine distance
 *   2 : jaccard distance
 *   any other value: the tree is not traversed and 0.0 is returned
 *
 * Return values:
 *  >=0 : the requested distance
 *  -1  : infinite/undefined distance (q larger than the shortest string,
 *        or q == 0 while at least one string is non-empty)
 *  -2  : push_string returned NULL (not enough memory)
 */
static double qgram_tree(
    unsigned int *s,
    unsigned int *t,
    unsigned int x,
    unsigned int y,
    unsigned int q,
    qtree *Q,
    int distance
  ){
  // return -1 when q is larger than the length of the shortest string.
  if ( q > (x <= y ? x : y) ) return -1.0;

  // rare edge cases.
  if ( q == 0 ){
    if ( x + y > 0 ){ // distance undefined
      return -1.0;
    } else { // x == y == 0.
      return 0.0;
    }
  }

  double dist[3] = {0,0,0};

  // slot 0 of each node counts q-grams of s, slot 1 those of t.
  Q = push_string(s, x, q, Q, 0, 2);
  if (Q == NULL) return -2.0;
  Q = push_string(t, y, q, Q, 1, 2);
  if (Q == NULL) return -2.0;

  switch ( distance ){
    case 0:
      getdist(Q, dist);
      break;
    case 1:
      // dist = { x.y, x.x, y.y } after this call.
      getcosine(Q, dist);
      if ( dist[0] == dist[1] && dist[0] == dist[2] ){
        // strings are equal. Prevent machine rounding about 0.0
        dist[0] = 0.0;
      } else {
        // there are several ways to express the rhs (including ones that
        // give 0 at equal strings) but this has least chance of overflow.
        // Rounding in sqrt() can push the ratio a hair above 1, which would
        // give a tiny negative value; fabs keeps the result >= 0 as documented.
        dist[0] = fabs(1.0 - dist[0]/(sqrt(dist[1]) * sqrt(dist[2])));
      }
      break;
    case 2:
      // presumably dist[0]/dist[1] is |intersection|/|union| -- see getjaccard.
      getjaccard(Q, dist);
      dist[0] = 1.0 - dist[0]/dist[1];
      break;
    default:
      break;
  }
  return dist[0];
}
/* Get a q-gram based distance between two strings.
 *
 * Input
 * s: a string
 * x: length of s
 * t: a string
 * y: length of t
 * q: the 'q' in q-gram
 * Qp: address of a qtree pointer; it is overwritten with the tree returned
 *     by push_string after each of the two strings has been pushed
 * distance: distance function to compute:
 *   0 : q-gram distance
 *   1 : cosine distance
 *   2 : jaccard distance
 *   any other value: the tree is not traversed and 0.0 is returned
 *
 * Return values:
 *   0 when q == 0, and 0 when the tree is still NULL after both strings
 *   were pushed; otherwise the value computed for the requested distance.
 */
double qgram_dist(
    unsigned int *s,
    int x,
    unsigned int *t,
    int y,
    unsigned int q,
    qtree **Qp,
    int distance
  ){
  // rare edge case: q==0. Note that we return 0 for all cases where
  // q equals zero. In the R journal paper we used Inf for cases where
  // q=0 and |s| or |t| > 0
  if ( q == 0 ){
    return 0.0;
  }

  double acc[3] = {0,0,0};

  // slot 0 of each node counts q-grams of s, slot 1 those of t.
  qtree *tree = push_string(s, x, q, *Qp, 0, 2);
  *Qp = tree;
  tree = push_string(t, y, q, tree, 1, 2);
  *Qp = tree;

  if ( tree == NULL ){
    return 0;
  }

  if ( distance == 0 ){
    getdist(tree, acc);
  } else if ( distance == 1 ){
    // acc = { x.y, x.x, y.y } after this call.
    getcosine(tree, acc);
    // All three sums equal: strings are equal. Prevent machine rounding
    // about 0.0 by returning an exact zero.
    const int identical = (acc[0] == acc[1]) && (acc[0] == acc[2]);
    // Several equivalent expressions exist (some give exactly 0 at equal
    // strings) but this one has the least chance of overflow.
    // fabs is taken to avoid numerical -0.
    acc[0] = identical
      ? 0.0
      : fabs(1.0 - acc[0]/(sqrt(acc[1]) * sqrt(acc[2])));
  } else if ( distance == 2 ){
    getjaccard(tree, acc);
    acc[0] = 1.0 - acc[0]/acc[1];
  }

  return acc[0];
}
/* Get a q-gram based distance between two strings.
 *
 * Input
 * s: a string
 * x: length of s
 * t: a string
 * y: length of t
 * q: the 'q' in q-gram
 * Q: a qtree; the q-grams of both strings are pushed into it
 * distance: distance function to compute:
 *   0 : q-gram distance
 *   1 : cosine distance
 *   2 : jaccard distance
 *   any other value: the tree is not traversed and 0.0 is returned
 *
 * Return values:
 *  >=0 : the requested distance (0 is also returned whenever q == 0)
 *  -1  : infinite distance (q larger than the shortest string)
 *  -2  : push_string returned NULL (not enough memory)
 */
double qgram_dist(
    unsigned int *s,
    int x,
    unsigned int *t,
    int y,
    unsigned int q,
    qtree *Q,
    int distance
  ){
  // return -1 when q is larger than the length of the shortest string.
  if ( q > (x <= y ? x : y) ) return -1.0;

  // rare edge case: q==0. Note that we return 0 for all cases where
  // q equals zero. In the R journal paper we used Inf for cases where
  // q=0 and |s| or |t| > 0
  if ( q == 0 ) return 0.0;

  double dist[3] = {0,0,0};

  // slot 0 of each node counts q-grams of s, slot 1 those of t.
  Q = push_string(s, x, q, Q, 0, 2);
  if (Q == NULL) return -2.0;
  Q = push_string(t, y, q, Q, 1, 2);
  if (Q == NULL) return -2.0;

  switch ( distance ){
    case 0:
      getdist(Q, dist);
      break;
    case 1:
      // dist = { x.y, x.x, y.y } after this call.
      getcosine(Q, dist);
      if ( dist[0] == dist[1] && dist[0] == dist[2] ){
        // strings are equal. Prevent machine rounding about 0.0
        dist[0] = 0.0;
      } else {
        // there are several ways to express the rhs (including ones that
        // give 0 at equal strings) but this has least chance of overflow.
        // Rounding in sqrt() can push the ratio a hair above 1, which would
        // give a tiny negative value; fabs keeps the result >= 0 as documented.
        dist[0] = fabs(1.0 - dist[0]/(sqrt(dist[1]) * sqrt(dist[2])));
      }
      break;
    case 2:
      // presumably dist[0]/dist[1] is |intersection|/|union| -- see getjaccard.
      getjaccard(Q, dist);
      dist[0] = 1.0 - dist[0]/dist[1];
      break;
    default:
      break;
  }
  return dist[0];
}