示例#1
0
/*
 * lca_prep
 *
 * Builds the tables needed by the constant time LCA algorithm.
 *
 * Allocates an LCA_STRUCT together with its I, A and L arrays, then
 * fills them in by calling compute_I_and_L followed by compute_A,
 * both starting at the root of the tree.
 *
 * Parameters:  tree  -   a suffix tree (asserted to be non-NULL)
 *
 * Returns:  A newly allocated LCA_STRUCT, or NULL if any allocation
 *           fails.  When one of the array allocations fails, the
 *           partially built structure is handed to lca_free first.
 */
LCA_STRUCT *lca_prep(SUFFIX_TREE tree)
{
    LCA_STRUCT *result;
    int table_len;

    assert(tree);

    result = (LCA_STRUCT *)my_calloc(sizeof(LCA_STRUCT), 1);
    if (result == NULL)
        return NULL;

    result->type = LCA_LINEAR;
    result->tree = tree;

    /* Every table gets one slot more than the tree's node count. */
    table_len = (int)stree_get_num_nodes(tree) + 1;

    result->I = (unsigned int *)my_calloc(table_len, sizeof(unsigned int));
    if (result->I == NULL)
        goto fail;

    result->A = (unsigned int *)my_calloc(table_len, sizeof(unsigned int));
    if (result->A == NULL)
        goto fail;

    result->L = (STREE_NODE *)my_calloc(table_len, sizeof(STREE_NODE));
    if (result->L == NULL)
        goto fail;

    /* The I and L values must exist before the A values are derived. */
    compute_I_and_L(result, tree, stree_get_root(tree));
    compute_A(result, tree, stree_get_root(tree), 0);

    return result;

fail:
    lca_free(result);
    return NULL;
}
示例#2
0
文件: stree.c 项目: cherry-wb/strmat
/*
 * int_stree_remove_to_position
 *
 * Removes the first `num_remove' suffixes of a string from the suffix
 * tree, compacting the tree wherever a node becomes unnecessary.
 *
 * The function walks down the tree along the whole string to find where
 * suffix 0 ends, removes that suffix, then follows suffix links to reach
 * the end point of each following suffix in turn.
 *
 * NOTE:  This can be used either to remove a successfully added string,
 *        or a string which was only partially added to the tree (because
 *        an error stopped the add operation).  But it should only be used
 *        to completely remove a string.
 *
 * Parameters:  tree        -  A suffix tree
 *              id          -  The internally used id of the string to remove
 *              num_remove  -  How many positions to remove (0 is a no-op)
 *
 * Return:  nothing.
 */
void int_stree_remove_to_position(SUFFIX_TREE tree, int id, int num_remove)
{
  int length, matched, endpos, idx, nkids, removed, more;
  char *str;
  STREE_NODE cur, link;
  STREE_LEAF lf;

  if (num_remove == 0)
    return;

  str = int_stree_get_string(tree, id);
  length = int_stree_get_length(tree, id);

  /* Locate the node at which the full string (suffix 0) ends. */
  matched = int_stree_walk_to_leaf(tree, stree_get_root(tree), 0, str,
                                   length, &cur, &endpos);
  assert(matched == length || int_stree_isaleaf(tree, cur));

  link = NULL;
  for (idx = 0; idx < num_remove; idx++) {
    /*
     * The suffix link has to be fetched before `cur' is touched, since
     * the removal below may free it.  It is not needed after the last
     * suffix.
     */
    more = (idx < num_remove - 1);
    if (more) {
      link = stree_get_suffix_link(tree, cur);
      assert(link != NULL);
    }

    if (!int_stree_isaleaf(tree, cur)) {
      /* The suffix ends at an internal node: drop its "intleaf" record. */
      removed = int_stree_remove_intleaf(tree, cur, id, idx);
      assert(removed != 0);

      /* A non-root node left with no intleaves and < 2 children goes away. */
      if (!int_stree_has_intleaves(tree, cur) &&
          cur != stree_get_root(tree)) {
        nkids = stree_get_num_children(tree, cur);
        if (nkids == 0) {
          int_stree_disconnect(tree, cur);
          int_stree_delete_subtree(tree, cur);
        }
        else if (nkids == 1)
          int_stree_edge_merge(tree, cur);
      }
    }
    else {
      /* The suffix ends at a real leaf: unhook it and free it. */
      lf = (STREE_LEAF) cur;
      assert(lf->strid == id &&
             int_stree_get_leafpos(tree, lf) == idx);

      int_stree_disconnect(tree, cur);
      int_stree_free_leaf(tree, lf);
    }

    if (more)
      cur = link;
  }
}
示例#3
0
文件: stree.c 项目: cherry-wb/strmat
/*
 * int_stree_edge_merge
 *
 * When a node has no "leaves" and only one child, this function will
 * remove that node and merge the edges from parent to node and node
 * to child into a single edge from parent to child.
 *
 * The call is a silent no-op when the node does not qualify: it is the
 * root, it is a leaf, it still has intleaves, it has no children at all,
 * or it has more than one child.
 *
 * On a successful merge the tree's node count is decremented, the
 * identifiers are marked dirty, and `node' is freed, so the caller must
 * not use `node' afterwards.
 *
 * Parameters:  tree  -  A suffix tree
 *              node  -  The tree node to be removed
 *
 * Return:  nothing.
 */
void int_stree_edge_merge(SUFFIX_TREE tree, STREE_NODE node)
{
  int len;
  STREE_NODE parent, child;
  STREE_LEAF leaf;

  if (node == stree_get_root(tree) || int_stree_isaleaf(tree, node) ||
      int_stree_has_intleaves(tree, node))
    return;

  parent = stree_get_parent(tree, node);
  child = stree_get_children(tree, node);

  /*
   * Only merge when there is exactly one child.  The NULL test guards
   * the childless case, where asking for the child's sibling would
   * otherwise go through a null pointer.
   */
  if (child == NULL || stree_get_next(tree, child) != NULL)
    return;

  /*
   * Extend the child's edge backwards by the length of node's edge so
   * that it covers both edges.
   */
  len = stree_get_edgelen(tree, node);
  if (int_stree_isaleaf(tree, child)) {
    /* A leaf's edge start is moved back; it takes over node's mapped char. */
    leaf = (STREE_LEAF) child;
    leaf->pos -= len;
    leaf->ch = stree_get_mapch(tree, node);
  }
  else {
    child->edgestr -= len;
    child->edgelen += len;
  }

  /* Put the child where node used to hang below the parent. */
  int_stree_reconnect(tree, parent, node, child);
  tree->num_nodes--;
  tree->idents_dirty = 1;

  int_stree_free_node(tree, node);
}
示例#4
0
文件: stree.c 项目: cherry-wb/strmat
/*
 * stree_get_labellen
 *
 * Computes the length of the path label of a node, i.e. the sum of the
 * edge lengths met while climbing from the node up to the root.
 *
 * Parameters:  tree  -  a suffix tree
 *              node  -  a tree node
 *
 * Returns:  the length of the node's label (0 for the root).
 */
int stree_get_labellen(SUFFIX_TREE tree, STREE_NODE node)
{
  int total = 0;
  STREE_NODE cur;

  for (cur = node; cur != stree_get_root(tree);
       cur = stree_get_parent(tree, cur))
    total += stree_get_edgelen(tree, cur);

  return total;
}
示例#5
0
文件: stree.c 项目: cherry-wb/strmat
/*
 * int_stree_set_idents
 *
 * Renumbers the nodes of the suffix tree, but only when the tree's
 * `idents_dirty' flag is set (the flag is cleared here).  Identifiers
 * are handed out from 0 upwards in depth-first, pre-order fashion:
 * the root first, then the nodes in the order they appear in the
 * children lists.
 *
 * The traversal is iterative; it relies only on the child, next-sibling
 * and parent links of the nodes.
 *
 * Parameters:  tree  -  A suffix tree
 *
 * Return:  nothing.
 */
void int_stree_set_idents(SUFFIX_TREE tree)
{
  int counter;
  STREE_NODE cur, succ, root;

  if (!tree->idents_dirty)
    return;

  tree->idents_dirty = 0;

  root = stree_get_root(tree);
  counter = 0;
  cur = root;
  for (;;) {
    cur->id = counter;
    counter++;

    /* Descend to the first child when there is one. */
    succ = stree_get_children(tree, cur);
    if (succ == NULL) {
      /*
       * Otherwise climb until some node on the way up has a next
       * sibling.  Arriving back at the root means everything has
       * been numbered.
       */
      while (cur != root && (succ = stree_get_next(tree, cur)) == NULL)
        cur = stree_get_parent(tree, cur);

      if (cur == root)
        return;
    }

    cur = succ;
  }
}
示例#6
0
文件: stree.c 项目: cherry-wb/strmat
/*
 * int_stree_get_suffix_link
 *
 * Returns the node at the end of the suffix link of a node.
 *
 * For an internal node this is simply the stored `suffix_link' field.
 * For a leaf, the target is found by starting from the suffix link of
 * the leaf's parent (or from the root itself when the parent is the
 * root, after dropping the first edge character) and then skipping
 * down the tree along the leaf's edge label, one whole edge at a time.
 *
 * Parameters:  tree  -  a suffix tree
 *              node  -  a tree node
 *
 * Return:  The node at the end of the suffix link, or NULL when `node'
 *          is the root.
 */
STREE_NODE int_stree_get_suffix_link(SUFFIX_TREE tree, STREE_NODE node)
{
  int step, remaining;
  char *label;
  STREE_NODE cur;

  if (node == stree_get_root(tree))
    return NULL;
  if (!int_stree_isaleaf(tree, node))
    return node->suffix_link;

  label = stree_get_edgestr(tree, node);
  remaining = stree_get_edgelen(tree, node);
  cur = stree_get_parent(tree, node);

  /*
   * Pick the starting point of the skip/count trip.
   */
  if (cur == stree_get_root(tree)) {
    label++;
    remaining--;
  }
  else
    cur = cur->suffix_link;

  /*
   * Only the first character of each edge is examined; the rest of
   * the edge is skipped by its length.
   */
  while (remaining > 0) {
    cur = stree_find_child(tree, cur, *label);
    assert(cur != NULL);

    step = stree_get_edgelen(tree, cur);
    label += step;
    remaining -= step;
  }

  return cur;
}
示例#7
0
文件: stree.c 项目: cherry-wb/strmat
/*
 * int_stree_disconnect
 *
 * Detaches a node from its parent, then compacts the tree if that
 * parent is no longer needed: a non-root parent without intleaves is
 * itself disconnected and deleted when it has no children left, or
 * merged away when exactly one child is left.
 *
 * The node itself is not freed here.  Calling this on the root does
 * nothing.  Otherwise the tree's identifiers are marked dirty.
 *
 * Parameters:  tree  -  a suffix tree
 *              node  -  a tree node
 *
 * Return:  nothing.
 */
void int_stree_disconnect(SUFFIX_TREE tree, STREE_NODE node)
{
  int remaining;
  STREE_NODE up;

  if (node == stree_get_root(tree))
    return;

  up = stree_get_parent(tree, node);
  int_stree_disc_from_parent(tree, up, node);

  if (!int_stree_has_intleaves(tree, up) && up != stree_get_root(tree)) {
    remaining = stree_get_num_children(tree, up);
    switch (remaining) {
    case 0:
      /* The parent is now empty: unhook it as well, then free it. */
      int_stree_disconnect(tree, up);
      int_stree_delete_subtree(tree, up);
      break;
    case 1:
      /* A single child left: splice the parent out of the path. */
      int_stree_edge_merge(tree, up);
      break;
    default:
      break;
    }
  }

  tree->idents_dirty = 1;
}
示例#8
0
文件: stree.c 项目: cherry-wb/strmat
/*
 * stree_get_label
 *
 * Copies the string labelling the path from the root to a tree node
 * (or a part of that string) into the given buffer.
 *
 * If the buffer is strictly larger than the label, the whole label is
 * copied and NULL-terminated.  Otherwise exactly `buflen' characters
 * are copied and the result is NOT NULL-terminated: the last `buflen'
 * characters of the label when `endflag' is non-zero, the first
 * `buflen' characters when it is zero.
 *
 * Parameters:  tree     -  a suffix tree
 *              node     -  a tree node
 *              buffer   -  the character buffer
 *              buflen   -  the buffer length
 *              endflag  -  copy from the end of the label?
 *
 * Returns:  nothing.
 */
void stree_get_label(SUFFIX_TREE tree, STREE_NODE node, char *buffer,
                     int buflen, int endflag)
{
  int remaining, drop, take;
  char *src, *dst;

  remaining = stree_get_labellen(tree, node);
  drop = 0;

  if (buflen > remaining)
    buffer[remaining] = '\0';
  else {
    /* Keeping the front of the label means dropping its tail. */
    if (!endflag && remaining > buflen)
      drop = remaining - buflen;
    remaining = buflen;
  }

  /*
   * The buffer is written back to front while climbing towards the
   * root, so the characters met first are the last ones of the label.
   * `drop' counts how many of those still have to be passed over.
   */
  dst = buffer + remaining;
  for ( ; remaining > 0 && node != stree_get_root(tree);
        node = stree_get_parent(tree, node)) {
    take = stree_get_edgelen(tree, node);

    /* This whole edge lies in the dropped tail. */
    if (drop >= take) {
      drop -= take;
      continue;
    }

    /* Shorten the edge by whatever is left to drop (possibly nothing). */
    take -= drop;
    drop = 0;

    src = stree_get_edgestr(tree, node) + take;
    while (remaining > 0 && take > 0) {
      *--dst = *--src;
      take--;
      remaining--;
    }
  }
}
示例#9
0
文件: stree.c 项目: cherry-wb/strmat
/*
 * stree_delete_tree
 *
 * Releases a SUFFIX_TREE and everything it owns: all nodes below (and
 * including) the root, the `strings' table, the `ids' and `lengths'
 * arrays, and finally the tree structure itself.  The individual
 * strings are freed only when the tree's `copyflag' is set.
 *
 * Parameters:  tree  -  a suffix tree
 *
 * Returns:  nothing.
 */
void stree_delete_tree(SUFFIX_TREE tree)
{
  int idx;

  int_stree_delete_subtree(tree, stree_get_root(tree));

  if (tree->strings != NULL) {
    if (tree->copyflag) {
      /* free() accepts NULL, so empty slots need no special casing. */
      idx = 0;
      while (idx < MAXNUMSTR) {
        free(tree->strings[idx]);
        idx++;
      }
    }
    free(tree->strings);
  }

  free(tree->ids);
  free(tree->lengths);
  free(tree);
}
示例#10
0
文件: stree.c 项目: cherry-wb/strmat
/*
 * int_stree_get_leafpos
 *
 * Return the position of the suffix that ends at a leaf.  This value
 * is not stored; it is reconstructed by starting from the leaf's `pos'
 * field and subtracting the edge length of every ancestor met while
 * walking back up the tree to the root.
 *
 * The climb goes along the sibling list until a node flagged
 * `nextisparent' is found; that node's `next' field is then followed
 * to move up one level.
 *
 * Parameters:  tree   -  A suffix tree
 *              leaf   -  A leaf of the tree
 *
 * Returns:  The starting position of the leaf's suffix.
 */
int int_stree_get_leafpos(SUFFIX_TREE tree, STREE_LEAF leaf)
{
  int result = leaf->pos;
  STREE_NODE cur = (STREE_NODE) leaf;
  STREE_NODE top = stree_get_root(tree);

  for (;;) {
    /* Move to the last entry of the current sibling list. */
    for ( ; !cur->nextisparent; cur = cur->next)
      ;

    if (cur == top)
      return result;

    /* Step up one level and take that node's edge out of the total. */
    cur = cur->next;
    result -= cur->edgelen;
  }
}
示例#11
0
文件: stree.c 项目: cherry-wb/strmat
/*
 * stree_match & stree_walk
 *
 * Traverse the path down the tree whose path label matches T, and return
 * the number of characters of T matched, together with the node and the
 * position along that node's edge where the matching of T ends.
 *
 * stree_match is stree_walk started at the root of the tree, at
 * position 0.
 *
 * Parameters:  tree      -  a suffix tree
 *              node      -  what node to start the walk down the tree
 *              pos       -  position along node's edge to start the walk
 *                              (`node' and `pos' are stree_walk only)
 *              T         -  the sequence to match
 *              N         -  the sequence length
 *              node_out  -  address of where to store the node where
 *                           the traversal ends
 *              pos_out   -  address of where to store the character position
 *                           along the ending node's edge of the endpoint of
 *                           the traversal
 *
 * Returns:  The number of characters of T matched.
 */
int stree_match(SUFFIX_TREE tree, char *T, int N,
                STREE_NODE *node_out, int *pos_out)
{
  STREE_NODE start;
  int matched;

  start = stree_get_root(tree);
  matched = stree_walk(tree, start, 0, T, N, node_out, pos_out);

  return matched;
}
示例#12
0
文件: stree.c 项目: cherry-wb/strmat
/*
 * stree_traverse & stree_traverse_subtree
 *
 * Use a non-recursive traversal of the tree (or a subtree), calling the
 * two function parameters before and after recursing at each node, resp.
 * When memory is at a premium, this traversal may be useful.
 *
 * Either of the function parameters can be NULL, if only pre-order or
 * only post-order processing is needed.
 *
 * The traversal uses the `ch' field of the tree nodes to hold its
 * state information.  After the traversal is finished with a node, it
 * will restore that ch value.
 *
 * stree_traverse is stree_traverse_subtree applied to the root of the
 * tree.
 *
 * Parameters:  tree          -  a suffix tree
 *              node          -  root node of the traversal
 *                                 (stree_traverse_subtree only)
 *              preorder_fn   -  function to call before visiting the children
 *              postorder_fn  -  function to call after visiting all children
 *
 * Returns:  nothing.
 */
void stree_traverse(SUFFIX_TREE tree, int (*preorder_fn)(),
                    int (*postorder_fn)())
{
  STREE_NODE top;

  top = stree_get_root(tree);
  stree_traverse_subtree(tree, top, preorder_fn, postorder_fn);
}
示例#13
0
文件: stree.c 项目: cherry-wb/strmat
/*
 * stree_add_string
 *
 * Implements Ukkonen's construction algorithm to add a string to the
 * suffix tree.
 *
 * This operation is an "atomic" operation.  In the far too likely case
 * that the program runs out of memory (or hits the maximum allocated
 * memory set by stree_set_max_alloc), this operation undoes any partial
 * changes it may have made to the tree, and it leaves the tree in
 * its original form.  Thus, you can just keep adding strings until the
 * function returns 0, and not have to worry about whether a call to the
 * function will trash the tree just because there's no memory left.
 *
 * The same undo path is taken when the node count has reached
 * MAXNUMNODES.
 *
 * NOTE:  The `strid' value given must be unique to any of the strings
 *        added to the tree, and must be a small integer greater than
 *        0.
 *
 *        The best id's to use are to number the strings from 1 to K.
 *
 * Parameters:  tree   -  a suffix tree
 *              S      -  the sequence
 *              M      -  the sequence length
 *              strid  -  the sequence identifier
 *
 * Returns:  non-zero on success, zero on error.
 */
int stree_add_string(SUFFIX_TREE tree, char *S, int M, int strid)
{
  int i, j, g, h, gprime, edgelen, id;
  char *edgestr;
  STREE_NODE node, lastnode, root, child, parent;
  STREE_LEAF leaf;

  /*
   * Register the string with the tree.  `id' is the value returned by
   * int_stree_insert_string and is what the leaves and intleaves below
   * are tagged with; -1 means the string could not be registered.
   */
  id = int_stree_insert_string(tree, S, M, strid);
  if (id == -1)
    return 0;

  /*
   * Run Ukkonen's algorithm to add the string to the suffix tree.
   *
   * State carried between extensions:
   *    node      -  the node on whose edge the current position lies
   *    g         -  how many characters of that edge have been matched
   *    edgestr,
   *    edgelen   -  label and length of node's edge
   *    lastnode  -  the node most recently created/visited by an
   *                 extension, which may still need its suffix link set
   */
  root = stree_get_root(tree);
  node = lastnode = root;
  g = 0;
  edgelen = 0;
  edgestr = NULL;

  /*
   * i is the index of the next character to add (i == M is a final
   * pass that marks the suffixes ending inside the tree); j is the
   * start of the suffix being extended.  j is never reset, so each
   * suffix is handled by the inner loop exactly once.
   */
  for (i=0,j=0; i <= M; i++)  {
    for ( ; j <= i && j < M; j++) {
      /*
       * Perform the extension from S[j..i-1] to S[j..i].  One of the
       * following two cases holds:
       *    a) g == 0, node == root and i == j.
       *         (meaning that in the previous outer loop,
       *          all of the extensions S[1..i-1], S[2..i-1], ...,
       *          S[i-1..i-1] were done.)
       *    b) g > 0, node != root and the string S[j..i-1]
       *       ends at the g'th character of node's edge.
       */
      if (g == 0 || g == edgelen) {
        /*
         * S[j..i-1] ends exactly at `node'.
         */
        if (i < M) {
          /*
           * If a child edge already starts with S[i], the suffix is
           * implicitly present: step onto that edge and end this
           * phase (the break leaves j unchanged).
           */
          if ((child = stree_find_child(tree, node, S[i])) != NULL) {
            node = child;
            g = 1;
            edgestr = stree_get_edgestr(tree, node);
            edgelen = stree_get_edgelen(tree, node);
            break;
          }

          /*
           * Otherwise hang a new leaf below node.  On any failure,
           * free the leaf if it was allocated, remove the j suffixes
           * added so far, unregister the string and report the error.
           */
          if ((leaf = int_stree_new_leaf(tree, id, i)) == NULL ||
              tree->num_nodes == MAXNUMNODES ||
              (node = int_stree_connect(tree, node,
                                        (STREE_NODE) leaf)) == NULL) {
            if (leaf != NULL)
              int_stree_free_leaf(tree, leaf);
            int_stree_remove_to_position(tree, id, j);
            int_stree_delete_string(tree, id);
            return 0;
          }

          tree->num_nodes++;
        }
        else {
          /*
           * End of the string: the suffix S[j..M-1] ends at `node'.
           * A leaf is first converted by int_stree_convert_leafnode,
           * then the suffix is recorded on the node as an intleaf.
           */
          if ((int_stree_isaleaf(tree, node) &&
               (node = int_stree_convert_leafnode(tree, node)) == NULL)) {
            int_stree_remove_to_position(tree, id, j);
            int_stree_delete_string(tree, id);
            return 0;
          }

          if (!int_stree_add_intleaf(tree, node, id, j)) {
            int_stree_remove_to_position(tree, id, j);
            int_stree_delete_string(tree, id);
            return 0;
          }
        }

        /* Fill in the pending suffix link of the previous node, if any. */
        if (lastnode != root && lastnode->suffix_link == NULL)
          lastnode->suffix_link = node;
        lastnode = node;
      }
      else {
        /*
         * g > 0 && g < edgelen, and so S[j..i-1] ends in the middle
         * of some edge.
         *
         * If the next character in the edge label matches the next
         * input character, keep moving down that edge.  Otherwise,
         * split the edge at that point and add a new leaf for the
         * suffix.
         */
        if (i < M &&
            stree_mapch(tree, S[i]) == stree_mapch(tree, edgestr[g])) {
          g++;
          break;
        }

        /*
         * Split the edge after its first g characters; `node' becomes
         * the node returned by int_stree_edge_split.
         */
        if (tree->num_nodes == MAXNUMNODES ||
            (node = int_stree_edge_split(tree, node, g)) == NULL) {
          int_stree_remove_to_position(tree, id, j);
          int_stree_delete_string(tree, id);
          return 0;
        }

        edgestr = stree_get_edgestr(tree, node);
        edgelen = stree_get_edgelen(tree, node);

        /*
         * From here on the handling mirrors the "ends at a node" case
         * above: new leaf when characters remain, intleaf otherwise.
         */
        if (i < M) {
          if ((leaf = int_stree_new_leaf(tree, id, i)) == NULL ||
              tree->num_nodes == MAXNUMNODES ||
              (node = int_stree_connect(tree, node,
                                        (STREE_NODE) leaf)) == NULL) {
            if (leaf != NULL)
              int_stree_free_leaf(tree, leaf);
            int_stree_remove_to_position(tree, id, j);
            int_stree_delete_string(tree, id);
            return 0;
          }

          tree->num_nodes++;
        }
        else {
          if ((int_stree_isaleaf(tree, node) &&
               (node = int_stree_convert_leafnode(tree, node)) == NULL)) {
            int_stree_remove_to_position(tree, id, j);
            int_stree_delete_string(tree, id);
            return 0;
          }

          if (!int_stree_add_intleaf(tree, node, id, j)) {
            int_stree_remove_to_position(tree, id, j);
            int_stree_delete_string(tree, id);
            return 0;
          }
        }

        if (lastnode != root && lastnode->suffix_link == NULL)
          lastnode->suffix_link = node;
        lastnode = node;
      }

      /*
       * Now, having extended S[j..i-1] to S[j..i] by rule 2, find where
       * S[j+1..i-1] is.
       */
      if (node == root)
        ;
      else if (g == edgelen && node->suffix_link != NULL) {
        /* The node has a suffix link: follow it directly. */
        node = node->suffix_link;
        edgestr = stree_get_edgestr(tree, node);
        edgelen = stree_get_edgelen(tree, node);
        g = edgelen;
      }
      else {
        /*
         * No usable suffix link on node: go through the parent's
         * suffix link (or restart at the root with one character
         * less to match) and skip/count down g characters.
         */
        parent = stree_get_parent(tree, node);
        if (parent != root)
          node = parent->suffix_link;
        else {
          node = root;
          g--;
        }
        /*
         * NOTE(review): this value is reassigned after the loop below
         * before it is read again.
         */
        edgelen = stree_get_edgelen(tree, node);

        /*
         * h is the index in S of the first character still to be
         * matched.  Whole edges are skipped as long as they are no
         * longer than the g characters that remain.
         *
         * NOTE(review): the result of stree_find_child is used without
         * a NULL check here.
         */
        h = i - g;
        while (g > 0) {
          node = stree_find_child(tree, node, S[h]);
          gprime = stree_get_edgelen(tree, node);
          if (gprime > g)
            break;

          g -= gprime;
          h += gprime;
        }

        edgelen = stree_get_edgelen(tree, node);
        edgestr = stree_get_edgestr(tree, node);

        /*
         * g == 0 means the walk ended exactly at `node'.  That node is
         * then the suffix link target for a still-unlinked lastnode,
         * and g is set to the full edge length to express "at the end
         * of node's edge" (the root keeps g == 0).
         */
        if (g == 0) {
          if (lastnode != root && !int_stree_isaleaf(tree, node) &&
              lastnode->suffix_link == NULL) {
            lastnode->suffix_link = node;
            lastnode = node;
          }

          if (node != root)
            g = edgelen;
        }
      }
    }
  }

  return 1;
}