Ejemplo n.º 1
0
/*Get qgram distances 
 * Input
 * s: a string
 * t: a string
 * x: length of s
 * y: length of t
 * q: the 'q' in q-gram
 * Q: a qtree
 * int: distance distance function to compute:
 *  0 : q-gram distance
 *  1 : cosine distance
 *  2 : jaccard distance
 *
 *
 * Return values:
 *  >=0 : qgram distance
 * -1   : infinite distance
 * -2   : Not enough memory
 */
double qgram_dist(
    unsigned int *s, 
    int x,
    unsigned int *t, 
    int y,
    unsigned int q, 
    qtree **Qp,
    int distance
  ){

  // rare edge case: q==0. Note that we return 0 for all cases where
  // q equals zero. In the R journal paper we used Inf for cases where 
  // q=0 and |s| or |t| > 0
  if ( q == 0 ) return 0.0;

  double dist[3] = {0,0,0};
  *Qp = push_string(s, x, q, *Qp, 0, 2);

  *Qp = push_string(t, y, q, *Qp, 1, 2);
  if (*Qp == NULL) return 0;
  

  qtree *Q = *Qp;
  switch ( distance ){
    case 0:
      getdist(Q,dist);
      break;
    case 1:
      getcosine(Q, dist);
      if (dist[0]==dist[1] && dist[0]==dist[2]){
        // strings are equal. Prevent machine rounding about 0.0
        dist[0] =  0.0;
      } else {
        // there are several ways to express the rhs (including ones that give 0L 
        // at equal strings) but this has least chance of overflow
        // fabs is taken to avoid numerical -0.
        dist[0] = fabs(1.0 - dist[0]/(sqrt(dist[1]) * sqrt(dist[2])));
      }
      break;
    case 2:
      getjaccard(*Qp,dist);
      dist[0] = 1.0 - dist[0]/dist[1];
      break;
    default:
      break;
  }

  return dist[0];
}
Ejemplo n.º 2
0
/*Get qgram distances 
 * Input
 * s: a string
 * t: a string
 * x: length of s
 * y: length of t
 * q: the 'q' in q-gram
 * Q: a qtree
 * int: distance distance function to compute:
 *  0 : q-gram distance
 *  1 : cosine distance
 *  2 : jaccard distance
 *
 *
 * Return values:
 *  >=0 : qgram distance
 * -1   : infinite distance
 * -2   : Not enough memory
 */
double qgram_dist(
    unsigned int *s, 
    int x,
    unsigned int *t, 
    int y,
    unsigned int q, 
    qtree *Q,
    int distance
  ){
  // return -1 when q is larger than the length of the shortest string.
  if ( q > (x <= y ? x : y) ) return -1.0;
  // rare edge case: q==0. Note that we return 0 for all cases where
  // q equals zero. In the R journal paper we used Inf for cases where 
  // q=0 and |s| or |t| > 0
  if ( q == 0 ) return 0.0;
  
  double dist[3] = {0,0,0};
  Q = push_string(s, x, q, Q, 0, 2);
  if (Q == NULL) return -2.0;
  Q = push_string(t, y, q, Q, 1, 2);
  if (Q == NULL) return -2.0;

  switch ( distance ){
    case 0:
      getdist(Q,dist);
      break;
    case 1:
      getcosine(Q, dist);
      if (dist[0]==dist[1] && dist[0]==dist[2]){
        // strings are equal. Prevent machine rounding about 0.0
        dist[0] =  0.0;
      } else {
        // there are several ways to express the rhs (including ones that give 0L 
        // at equal strings) but this has least chance of overflow.
        dist[0] = 1.0 - dist[0]/(sqrt(dist[1]) * sqrt(dist[2]));
      }
      break;
    case 2:
      getjaccard(Q,dist);
      dist[0] = 1.0 - dist[0]/dist[1];
      break;
    default:
      break;
  }
  return dist[0];
}