Beispiel #1
0
/*
 * lca_naive_lookup
 *
 * Find the least common ancestor of two suffix tree nodes with the naive
 * method: climb from both nodes toward the root until the two climbs
 * arrive at the same node.
 *
 * The climb is steered by the node identifiers, which are handed out in
 * depth-first order.  At every step the node holding the larger
 * identifier is replaced by its parent; as soon as both identifiers
 * agree, the node reached is the LCA.
 *
 * Parameters:  lca  -  an LCA_STRUCT structure (its type must be LCA_NAIVE)
 *              x    -  a suffix tree node
 *              y    -  another suffix tree node
 *
 * Returns:  the suffix tree node which is the LCA of x and y
 */
STREE_NODE lca_naive_lookup(LCA_STRUCT *lca, STREE_NODE x, STREE_NODE y)
{
    SUFFIX_TREE tree;
    int id_x, id_y;

    assert (lca && lca->type == LCA_NAIVE && x && y);

    tree = lca->tree;
    id_x = stree_get_ident(tree, x);
    id_y = stree_get_ident(tree, y);

    /* One upward step, and one counted comparison, per iteration. */
    while (id_x != id_y) {
        if (id_x > id_y) {
            x = stree_get_parent(tree, x);
            id_x = stree_get_ident(tree, x);
        }
        else {
            y = stree_get_parent(tree, y);
            id_y = stree_get_ident(tree, y);
        }
        IF_STATS(lca->num_compares++);
    }

    /* Account for the comparison that ended the loop. */
    IF_STATS(lca->num_compares++);
    return x;
}
Beispiel #2
0
/*
 * lca_lookup
 *
 * Perform the constant time LCA computation, finding the least
 * common ancestor of x and y.
 *
 * The lookup only reads the I, A and L tables stored in the LCA_STRUCT
 * (presumably filled in by a separate preprocessing pass -- that code is
 * not part of this function) and indexes I and A by node identifier + 1.
 *
 * Parameters:  lca  -  an LCA_STRUCT structure (its type must be LCA_LINEAR)
 *              x    -  a suffix tree node
 *              y    -  another suffix tree node
 *
 * Returns:  the suffix tree node which is the LCA of x and y
 */
STREE_NODE lca_lookup(LCA_STRUCT *lca, STREE_NODE x, STREE_NODE y)
{
    assert(lca && lca->tree && lca->type == LCA_LINEAR && x && y);

    SUFFIX_TREE tree = lca->tree;
    unsigned int *I = lca->I;
    unsigned int *A = lca->A;
    STREE_NODE *L = lca->L;

    // Shift idents so that they go from 1..num_nodes.
    unsigned int xid = (unsigned int)stree_get_ident(tree, x) + 1;
    unsigned int yid = (unsigned int)stree_get_ident(tree, y) + 1;

   /*
    * Steps 1 and 2.
    *
    * Step 1 here differs from the book in that it returns the most
    * significant bit counting from the right (and starting the count
    * with 0), and then simply OR's the k+1..32 bits of I[xid] and the
    * number 2^k.
    *
    * 2^k is formed as `1u << k' so the shift is carried out on an
    * unsigned operand; shifting a signed 1 into the top bit would be
    * undefined behaviour.
    */
    unsigned int k = MSB(I[xid] ^ I[yid]);
    unsigned int b = (I[xid] & HIGH_BITS(k+1)) | (1u << k);
    unsigned int j = h( (A[xid] & A[yid]) & HIGH_BITS(h(b)) );

    IF_STATS(lca->num_compares++);

    // Step 3: find xbar, either x itself or the parent of an L[] entry.
    unsigned int l = h(A[xid]);
    STREE_NODE xbar, ybar;
    if (l == j) {
        xbar = x;
    } else {
        k = MSB(A[xid] & ~HIGH_BITS(j));
        xbar = stree_get_parent(tree, L[(I[xid] & HIGH_BITS(k+1)) | (1u << k)]);
    }
    IF_STATS(lca->num_compares++);

    // Step 4: the same computation for y, giving ybar.
    l = h(A[yid]);
    if (l == j) {
        ybar = y;
    } else {
        k = MSB(A[yid] & ~HIGH_BITS(j));
        ybar = stree_get_parent(tree, L[(I[yid] & HIGH_BITS(k+1)) | (1u << k)]);
    }
    IF_STATS(lca->num_compares++);

    // Step 5: the answer is whichever of xbar, ybar has the smaller ident.
    IF_STATS(lca->num_compares++);

    return (stree_get_ident(tree, xbar) < stree_get_ident(tree, ybar)) ? xbar : ybar;
}
Beispiel #3
0
/*
 * int_stree_convert_leafnode
 *
 * Swap a LEAF of the suffix tree for a NODE structure.  The new node
 * takes over the leaf's edge label and identifier, and the leaf's string
 * id and position are kept in an INTLEAF stored on the new node.  The
 * new node is linked into the tree in the leaf's place and the leaf
 * structure is freed.
 *
 * A `node' that is not a leaf is handed back untouched.
 *
 * Parameters:  tree  -  a suffix tree
 *              node  -  a leaf of the tree
 *
 * Returns:  The NODE structure now standing in for the leaf (or `node'
 *           itself when it was not a leaf), or NULL if the new node or
 *           its intleaf could not be created.
 */
STREE_NODE int_stree_convert_leafnode(SUFFIX_TREE tree, STREE_NODE node)
{
  int labellen;
  char *label;
  STREE_LEAF oldleaf;
  STREE_NODE replacement;
  STREE_INTLEAF record;

  if (!int_stree_isaleaf(tree, node))
    return node;

  oldleaf = (STREE_LEAF) node;

  /* Build the replacement node over the same edge label. */
  label = stree_get_edgestr(tree, node);
  labellen = stree_get_edgelen(tree, node);
  replacement = int_stree_new_node(tree, label, labellen);
  if (replacement == NULL)
    return NULL;

  /* Carry the leaf's string id and position over as an intleaf. */
  record = int_stree_new_intleaf(tree, oldleaf->strid,
                                 int_stree_get_leafpos(tree, oldleaf));
  if (record == NULL) {
    int_stree_free_node(tree, replacement);
    return NULL;
  }

  replacement->children = (STREE_NODE) record;
  replacement->ch = 1;
  replacement->isaleaf = 0;
  replacement->id = oldleaf->id;

  /* Put the new node where the leaf was, then release the leaf. */
  int_stree_reconnect(tree, stree_get_parent(tree, node), node, replacement);
  int_stree_free_leaf(tree, oldleaf);

  return replacement;
}
Beispiel #4
0
/*
 * int_stree_edge_merge
 *
 * When a node has no "leaves" and only one child, this function will
 * remove that node and merge the edges from parent to node and node
 * to child into a single edge from parent to child.
 *
 * Nothing is done when `node' is the root, is a leaf, has intleaves,
 * has no children at all, or has more than one child.
 *
 * Parameters:  tree  -  A suffix tree
 *              node  -  The tree node to be removed
 *
 * Return:  nothing.
 */
void int_stree_edge_merge(SUFFIX_TREE tree, STREE_NODE node)
{
  int len;
  STREE_NODE parent, child;
  STREE_LEAF leaf;

  if (node == stree_get_root(tree) || int_stree_isaleaf(tree, node) ||
      int_stree_has_intleaves(tree, node))
    return;

  parent = stree_get_parent(tree, node);
  child = stree_get_children(tree, node);

  /*
   * Only a node with exactly one child can be merged away.  The NULL
   * test keeps a childless node from being dereferenced below.
   */
  if (child == NULL || stree_get_next(tree, child) != NULL)
    return;

  /*
   * Extend the child's edge label backwards by the length of node's
   * edge, so that it covers both edges.
   */
  len = stree_get_edgelen(tree, node);
  if (int_stree_isaleaf(tree, child)) {
    /* A leaf keeps a start position and a first character instead. */
    leaf = (STREE_LEAF) child;
    leaf->pos -= len;
    leaf->ch = stree_get_mapch(tree, node);
  }
  else {
    child->edgestr -= len;
    child->edgelen += len;
  }

  /* Put the child in node's place below parent and drop the node. */
  int_stree_reconnect(tree, parent, node, child);
  tree->num_nodes--;
  tree->idents_dirty = 1;

  int_stree_free_node(tree, node);
}
Beispiel #5
0
/*
 * stree_get_labellen
 *
 * Compute the length of a node's label, i.e. of the string spelled out
 * by the edges on the path from the root down to the node.
 *
 * Parameters:  tree  -  a suffix tree
 *              node  -  a tree node
 *
 * Returns:  the length of the node's label (0 for the root).
 */
int stree_get_labellen(SUFFIX_TREE tree, STREE_NODE node)
{
  int total = 0;
  STREE_NODE cur;

  /* Climb to the root, summing the edge lengths met on the way. */
  for (cur = node; cur != stree_get_root(tree);
       cur = stree_get_parent(tree, cur))
    total += stree_get_edgelen(tree, cur);

  return total;
}
Beispiel #6
0
/*
 * edge_split_shift_start
 *
 * Helper for int_stree_edge_split.  Moves the start of node's edge label
 * by `delta' characters (a negative delta moves it back again).  `first'
 * points at the character that begins the label after the move; it is
 * only read when node is a leaf.
 */
static void edge_split_shift_start(SUFFIX_TREE tree, STREE_NODE node,
                                   int delta, char *first)
{
  if (int_stree_isaleaf(tree, node)) {
    STREE_LEAF leaf = (STREE_LEAF) node;

    leaf->pos += delta;
    leaf->ch = stree_mapch(tree, *first);
  }
  else {
    node->edgestr += delta;
    node->edgelen -= delta;
  }
}

/*
 * int_stree_edge_split
 *
 * Cuts the edge leading into `node' at `len' characters from its top and
 * inserts a new node at the cut, so that the new node carries the first
 * `len' characters of the old label and `node' keeps the rest.
 *
 * The split is refused (NULL is returned and the tree is left as it
 * was) when the tree already holds MAXNUMNODES nodes, when `len' is 0,
 * when `len' is not smaller than the edge length, or when creating or
 * connecting the new node fails.
 *
 * Parameters:  tree  -  a suffix tree
 *              node  -  The tree node just below the split.
 *              len   -  How far down node's edge label the split is.
 *
 * Return:  The new node added at the split, or NULL.
 */
STREE_NODE int_stree_edge_split(SUFFIX_TREE tree, STREE_NODE node, int len)
{
  char *label;
  STREE_NODE middle, above;

  if (tree->num_nodes == MAXNUMNODES)
    return NULL;
  if (len == 0 || stree_get_edgelen(tree, node) <= len)
    return NULL;

  label = stree_get_edgestr(tree, node);

  middle = int_stree_new_node(tree, label, len);
  if (middle == NULL)
    return NULL;

  /* The new node takes node's place below the old parent ... */
  above = stree_get_parent(tree, node);
  int_stree_reconnect(tree, above, node, middle);

  /* ... and node's own label now starts `len' characters later. */
  edge_split_shift_start(tree, node, len, label + len);

  if (int_stree_connect(tree, middle, node) != NULL) {
    tree->num_nodes++;
    tree->idents_dirty = 1;
    return middle;
  }

  /* Connecting failed: restore node's label and its old place. */
  edge_split_shift_start(tree, node, -len, label);
  int_stree_reconnect(tree, above, middle, node);
  int_stree_free_node(tree, middle);
  return NULL;
}
Beispiel #7
0
/*
 * stree_traverse_subtree
 *
 * Walk the subtree below (and including) `root' without recursion,
 * following the children, next and parent links of the nodes.
 *
 * `preorder_fn' is called on a node before any of its children are
 * visited and `postorder_fn' after all of them have been.  Both are
 * invoked as fn(tree, node); either may be NULL, and their return
 * values are ignored.
 *
 * Parameters:  tree          -  a suffix tree
 *              root          -  the root of the subtree to walk
 *              preorder_fn   -  callback run on the way down (or NULL)
 *              postorder_fn  -  callback run on the way up (or NULL)
 *
 * Returns:  nothing.
 */
void stree_traverse_subtree(SUFFIX_TREE tree, STREE_NODE root,
                            int (*preorder_fn)(), int (*postorder_fn)())
{
  STREE_NODE cur, child, sibling;

  cur = root;
  for (;;) {
    /*
     * Going down: announce each node, then step to its first child
     * until a node without children is reached.
     */
    for (;;) {
      if (preorder_fn != NULL)
        preorder_fn(tree, cur);

      child = stree_get_children(tree, cur);
      if (child == NULL)
        break;
      cur = child;
    }

    /*
     * Going up: every node reached here has had all of its children
     * handled, so finish it.  Stop climbing at the first node that has
     * a next sibling (the walk resumes there), and stop altogether
     * once the subtree's root has been finished.
     */
    for (;;) {
      if (postorder_fn != NULL)
        postorder_fn(tree, cur);

      if (cur == root)
        return;

      sibling = stree_get_next(tree, cur);
      if (sibling != NULL)
        break;

      cur = stree_get_parent(tree, cur);
    }

    cur = sibling;
  }
}
Beispiel #8
0
/*
 * stree_get_label
 *
 * Copy the label of a node (the string on the path from the root to the
 * node) into a caller supplied buffer.
 *
 * When the label is shorter than `buflen' the whole label is copied and
 * a terminating '\0' is written after it.  Otherwise exactly `buflen'
 * characters are copied and the result is NOT terminated: the first
 * `buflen' characters of the label if `endflag' is zero, the last
 * `buflen' characters if it is non-zero.
 *
 * Parameters:  tree     -  a suffix tree
 *              node     -  a tree node
 *              buffer   -  the character buffer
 *              buflen   -  the buffer length
 *              endflag  -  copy from the end of the label?
 *
 * Returns:  nothing.
 */
void stree_get_label(SUFFIX_TREE tree, STREE_NODE node, char *buffer,
                     int buflen, int endflag)
{
  int remaining, skip, span;
  char *src, *dst;
  STREE_NODE cur;

  remaining = stree_get_labellen(tree, node);
  skip = 0;

  if (buflen > remaining) {
    buffer[remaining] = '\0';
  }
  else {
    /* Keeping the front of the label means dropping its tail. */
    if (!endflag && remaining > buflen)
      skip = remaining - buflen;
    remaining = buflen;
  }

  /*
   * The tree is walked from the node up to the root, so the label is
   * met back to front.  The buffer is therefore filled from its end
   * toward its start, after first passing over `skip' characters.
   */
  dst = buffer + remaining;
  for (cur = node; remaining > 0 && cur != stree_get_root(tree);
       cur = stree_get_parent(tree, cur)) {
    span = stree_get_edgelen(tree, cur);

    /* This whole edge lies in the part being skipped. */
    if (skip >= span) {
      skip -= span;
      continue;
    }

    /* Drop what is left to skip from the end of this edge. */
    span -= skip;
    skip = 0;

    src = stree_get_edgestr(tree, cur) + span;
    while (remaining > 0 && span > 0) {
      *--dst = *--src;
      span--;
      remaining--;
    }
  }
}
Beispiel #9
0
/*
 * int_stree_set_idents
 *
 * Renumber the nodes of the suffix tree, if the tree is marked as having
 * out of date identifiers (`idents_dirty').  Identifiers start at 0 for
 * the root and are handed out in depth-first order, children being taken
 * in the order of the children lists.  The dirty mark is cleared.
 *
 * Parameters:  tree  -  A suffix tree
 *
 * Return:  nothing.
 */
void int_stree_set_idents(SUFFIX_TREE tree)
{
  int next_id;
  STREE_NODE root, cur, child, sibling;

  if (!tree->idents_dirty)
    return;
  tree->idents_dirty = 0;

  root = stree_get_root(tree);
  next_id = 0;
  sibling = NULL;

  /* Non-recursive walk over the children, next and parent links. */
  cur = root;
  for (;;) {
    /* Number nodes on the way down along first children. */
    for (;;) {
      cur->id = next_id++;

      child = stree_get_children(tree, cur);
      if (child == NULL)
        break;
      cur = child;
    }

    /* Climb until a node with a next sibling, or the root, is reached. */
    while (cur != root && (sibling = stree_get_next(tree, cur)) == NULL)
      cur = stree_get_parent(tree, cur);

    if (cur == root)
      return;

    cur = sibling;
  }
}
Beispiel #10
0
/*
 * int_stree_get_suffix_link
 *
 * Find the node at the end of the suffix link leaving a node.
 *
 * The root has no suffix link (NULL is returned).  An internal node
 * simply returns its stored `suffix_link'.  For a leaf the target is
 * worked out: start from the suffix link of the leaf's parent (or from
 * the root, dropping the first character of the leaf's edge label, when
 * the parent is the root) and walk down along the leaf's edge label.
 *
 * Parameters:  tree  -  a suffix tree
 *              node  -  a tree node
 *
 * Return:  The node at the end of the suffix link.
 */
STREE_NODE int_stree_get_suffix_link(SUFFIX_TREE tree, STREE_NODE node)
{
  int remaining, step;
  char *label;
  STREE_NODE root, start;

  root = stree_get_root(tree);

  if (node == root)
    return NULL;
  if (!int_stree_isaleaf(tree, node))
    return node->suffix_link;

  label = stree_get_edgestr(tree, node);
  remaining = stree_get_edgelen(tree, node);
  start = stree_get_parent(tree, node);

  if (start == root) {
    /* Below the root the suffix is the label minus its first character. */
    label++;
    remaining--;
  }
  else {
    start = start->suffix_link;
  }

  /*
   * Skip/count down from the starting node: pick the child by the next
   * label character and jump over its whole edge each time.
   */
  node = start;
  while (remaining > 0) {
    node = stree_find_child(tree, node, *label);
    assert(node != NULL);

    step = stree_get_edgelen(tree, node);
    label += step;
    remaining -= step;
  }

  return node;
}
Beispiel #11
0
/*
 * int_stree_disconnect
 *
 * Detach a node from its parent and then tidy up the parent if it has
 * become superfluous.  A parent that is not the root and has no
 * intleaves is
 *    - itself disconnected and deleted when it has no children left, or
 *    - merged away (see int_stree_edge_merge) when exactly one child
 *      is left.
 *
 * The tree's identifiers are marked dirty.  Nothing happens when `node'
 * is the root.
 *
 * Parameters:  tree  -  a suffix tree
 *              node  -  a tree node
 *
 * Return:  nothing.
 */
void int_stree_disconnect(SUFFIX_TREE tree, STREE_NODE node)
{
  STREE_NODE parent;

  if (node == stree_get_root(tree))
    return;

  parent = stree_get_parent(tree, node);
  int_stree_disc_from_parent(tree, parent, node);

  if (!int_stree_has_intleaves(tree, parent) &&
      parent != stree_get_root(tree)) {
    switch (stree_get_num_children(tree, parent)) {
    case 0:
      /* An empty parent serves no purpose: unlink it, then delete it. */
      int_stree_disconnect(tree, parent);
      int_stree_delete_subtree(tree, parent);
      break;

    case 1:
      /* A single child can hang directly off the grandparent. */
      int_stree_edge_merge(tree, parent);
      break;

    default:
      break;
    }
  }

  tree->idents_dirty = 1;
}
Beispiel #12
0
/*
 * stree_add_string
 *
 * Implements Ukkonen's construction algorithm to add a string to the
 * suffix tree.
 *
 * This operation is an "atomic" operation.  In the far too likely case
 * that the program runs out of memory (or hits the maximum allocated
 * memory set by stree_set_max_alloc), this operation undoes any partial
 * changes it may have made to the tree, and it leaves the tree in
 * its original form.  Thus, you can just keep adding strings until the
 * function returns 0, and not have to worry about whether a call to the
 * function will trash the tree just because there's no memory left.
 *
 * NOTE:  The `strid' value given must be unique to any of the strings
 *        added to the tree, and must be a small integer greater than
 *        0.
 *
 *        The best id's to use are to number the strings from 1 to K.
 *
 * Parameters:  tree   -  a suffix tree
 *              S      -  the sequence
 *              M      -  the sequence length
 *              strid  -  the sequence identifier
 *
 * Returns:  non-zero on success, zero on error.
 */
int stree_add_string(SUFFIX_TREE tree, char *S, int M, int strid)
{
  int i, j, g, h, gprime, edgelen, id;
  char *edgestr;
  STREE_NODE node, lastnode, root, child, parent;
  STREE_LEAF leaf;

  /*
   * The value returned here (not `strid' itself) is what is passed to
   * the leaf, intleaf and clean-up calls below; -1 signals failure.
   */
  id = int_stree_insert_string(tree, S, M, strid);
  if (id == -1)
    return 0;

  /*
   * Run Ukkonen's algorithm to add the string to the suffix tree.
   *
   * `node' and `g' give the current position: g characters down the
   * edge leading into `node' (see the case analysis inside the loop),
   * with `edgestr'/`edgelen' caching that edge's label.  `lastnode' is
   * the node handled by the previous extension, kept so that its
   * suffix link can be filled in if it is still NULL.
   */
  root = stree_get_root(tree);
  node = lastnode = root;
  g = 0;
  edgelen = 0;
  edgestr = NULL;

  /*
   * i runs up to and including M: in the pass with i == M there is no
   * character S[i], and instead of creating leaves the remaining
   * suffixes are recorded as intleaves.  j is the start of the next
   * suffix to extend; it only advances when an extension completes
   * (leaving the inner loop with `break' keeps j where it is).
   */
  for (i=0,j=0; i <= M; i++)  {
    for ( ; j <= i && j < M; j++) {
      /*
       * Perform the extension from S[j..i-1] to S[j..i].  One of the
       * following two cases holds:
       *    a) g == 0, node == root and i == j.
       *         (meaning that in the previous outer loop,
       *          all of the extensions S[1..i-1], S[2..i-1], ...,
       *          S[i-1..i-1] were done.)
       *    b) g > 0, node != root and the string S[j..i-1]
       *       ends at the g'th character of node's edge.
       */
      if (g == 0 || g == edgelen) {
        /* S[j..i-1] ends exactly at `node'. */
        if (i < M) {
          /*
           * If an edge starting with S[i] already leaves node, step
           * onto it and finish with this value of i.
           */
          if ((child = stree_find_child(tree, node, S[i])) != NULL) {
            node = child;
            g = 1;
            edgestr = stree_get_edgestr(tree, node);
            edgelen = stree_get_edgelen(tree, node);
            break;
          }

          /*
           * Otherwise create a leaf for position i and connect it
           * below node; node becomes whatever int_stree_connect
           * returns.  On any failure, undo the partial insertion (see
           * the atomicity note in the header) and report the error.
           */
          if ((leaf = int_stree_new_leaf(tree, id, i)) == NULL ||
              tree->num_nodes == MAXNUMNODES ||
              (node = int_stree_connect(tree, node,
                                        (STREE_NODE) leaf)) == NULL) {
            if (leaf != NULL)
              int_stree_free_leaf(tree, leaf);
            int_stree_remove_to_position(tree, id, j);
            int_stree_delete_string(tree, id);
            return 0;
          }

          tree->num_nodes++;
        }
        else {
          /*
           * i == M: no character is left to add.  Make sure node is
           * not a leaf (converting it if necessary) and record suffix
           * j of this string on it as an intleaf.
           */
          if ((int_stree_isaleaf(tree, node) &&
               (node = int_stree_convert_leafnode(tree, node)) == NULL)) {
            int_stree_remove_to_position(tree, id, j);
            int_stree_delete_string(tree, id);
            return 0;
          }

          if (!int_stree_add_intleaf(tree, node, id, j)) {
            int_stree_remove_to_position(tree, id, j);
            int_stree_delete_string(tree, id);
            return 0;
          }
        }

        /* Fill in the pending suffix link of the previous node. */
        if (lastnode != root && lastnode->suffix_link == NULL)
          lastnode->suffix_link = node;
        lastnode = node;
      }
      else {
        /*
         * g > 0 && g < edgelen, and so S[j..i-1] ends in the middle
         * of some edge.
         *
         * If the next character in the edge label matches the next
         * input character, keep moving down that edge.  Otherwise,
         * split the edge at that point and add a new leaf for the
         * suffix.
         */
        if (i < M &&
            stree_mapch(tree, S[i]) == stree_mapch(tree, edgestr[g])) {
          g++;
          break;
        }

        /*
         * Split the edge g characters down; node becomes the node
         * created at the split.  Failure is handled as above.
         */
        if (tree->num_nodes == MAXNUMNODES ||
            (node = int_stree_edge_split(tree, node, g)) == NULL) {
          int_stree_remove_to_position(tree, id, j);
          int_stree_delete_string(tree, id);
          return 0;
        }

        edgestr = stree_get_edgestr(tree, node);
        edgelen = stree_get_edgelen(tree, node);

        /*
         * From here on this mirrors the "ends at a node" case above:
         * add a leaf when i < M, otherwise record an intleaf.
         */
        if (i < M) {
          if ((leaf = int_stree_new_leaf(tree, id, i)) == NULL ||
              tree->num_nodes == MAXNUMNODES ||
              (node = int_stree_connect(tree, node,
                                        (STREE_NODE) leaf)) == NULL) {
            if (leaf != NULL)
              int_stree_free_leaf(tree, leaf);
            int_stree_remove_to_position(tree, id, j);
            int_stree_delete_string(tree, id);
            return 0;
          }

          tree->num_nodes++;
        }
        else {
          if ((int_stree_isaleaf(tree, node) &&
               (node = int_stree_convert_leafnode(tree, node)) == NULL)) {
            int_stree_remove_to_position(tree, id, j);
            int_stree_delete_string(tree, id);
            return 0;
          }

          if (!int_stree_add_intleaf(tree, node, id, j)) {
            int_stree_remove_to_position(tree, id, j);
            int_stree_delete_string(tree, id);
            return 0;
          }
        }

        if (lastnode != root && lastnode->suffix_link == NULL)
          lastnode->suffix_link = node;
        lastnode = node;
      }

      /* 
       * Now, having extended S[j..i-1] to S[j..i] by rule 2, find where
       * S[j+1..i-1] is.
       */
      if (node == root)
        ;
      else if (g == edgelen && node->suffix_link != NULL) {
        /* At the end of node's edge with a link available: follow it. */
        node = node->suffix_link;
        edgestr = stree_get_edgestr(tree, node);
        edgelen = stree_get_edgelen(tree, node);
        g = edgelen;
      }
      else {
        /*
         * No usable link on node itself.  Go up to the parent and
         * follow the parent's suffix link; if the parent is the root,
         * restart from the root with one character fewer to match.
         */
        parent = stree_get_parent(tree, node);
        if (parent != root)
          node = parent->suffix_link;
        else {
          node = root;
          g--;
        }
        edgelen = stree_get_edgelen(tree, node);

        /*
         * Skip/count down the remaining g characters, which are
         * S[h..i-1]: choose each child by its first character and jump
         * over whole edges until the position lies within one.
         */
        h = i - g;
        while (g > 0) {
          node = stree_find_child(tree, node, S[h]);
          gprime = stree_get_edgelen(tree, node);
          if (gprime > g)
            break;

          g -= gprime;
          h += gprime;
        }

        edgelen = stree_get_edgelen(tree, node);
        edgestr = stree_get_edgestr(tree, node);

        /*
         * g == 0 means the walk stopped exactly at a node.  Use it to
         * fill in a pending suffix link (only when it is not a leaf),
         * and express the position as "at the end of node's edge"
         * unless node is the root.
         */
        if (g == 0) {
          if (lastnode != root && !int_stree_isaleaf(tree, node) &&
              lastnode->suffix_link == NULL) {
            lastnode->suffix_link = node;
            lastnode = node;
          }

          if (node != root)
            g = edgelen;
        }
      }
    }
  }

  return 1;
}