Esempio n. 1
0
/*
 * stree_remove_string
 *
 * Looks up the string registered under `strid' and removes it from the
 * suffix tree.
 *
 * The tree's string table is scanned for the first occupied slot (a
 * non-NULL entry in tree->strings) whose entry in tree->ids equals
 * `strid'.  That slot is then handed to int_stree_remove_to_position,
 * together with the stored length of the string, and finally to
 * int_stree_delete_string.  The pruning and compaction of the tree is
 * presumably done inside those two helpers; their results are not
 * checked here.
 *
 * Parameters:  tree   -  A suffix tree
 *              strid  -  The identifier of the sequence to be removed.
 *
 * Returns:  1 once both helpers have been called, 0 if no occupied slot
 *           carries the identifier `strid'.
 */
int stree_remove_string(SUFFIX_TREE tree, int strid)
{
  int slot = 0;

  /* Advance past every slot that is empty or belongs to another id. */
  while (slot < MAXNUMSTR &&
         (tree->strings[slot] == NULL || tree->ids[slot] != strid)) {
    slot++;
  }

  /* Ran off the end of the table: the identifier is unknown. */
  if (slot >= MAXNUMSTR) {
    return 0;
  }

  int_stree_remove_to_position(tree, slot, tree->lengths[slot]);

  /*
   * NOTE(review): an earlier debug note at this point reported that this
   * path was causing segfaults.  Two suspicions were recorded: the stored
   * strings are not guaranteed to be null terminated, and
   * int_stree_remove_to_position is otherwise only reached from the
   * failure paths of stree_add_string, so it may never have been
   * properly tested.  Treat this function with care until confirmed.
   */
  int_stree_delete_string(tree, slot);

  return 1;
}
Esempio n. 2
0
/*
 * stree_remove_string
 *
 * Removes the string identified by `strid' from the suffix tree.
 *
 * The string table is searched for the first slot that is in use
 * (tree->strings entry is non-NULL) and whose tree->ids entry matches
 * `strid'.  The slot index and the stored string length are passed to
 * int_stree_remove_to_position, and the slot is then passed to
 * int_stree_delete_string.  The actual pruning of branches and suffix
 * links is presumably performed by those helpers; their results are not
 * inspected here.
 *
 * Parameters:  tree   -  A suffix tree
 *              strid  -  The identifier of the sequence to be removed.
 *
 * Returns:  1 after the helpers have run, 0 if no in-use slot has the
 *           identifier `strid'.
 */
int stree_remove_string(SUFFIX_TREE tree, int strid)
{
  int idx;
  int found = -1;

  for (idx = 0; idx < MAXNUMSTR; idx++) {
    /* Unused slots never match, whatever their id entry holds. */
    if (tree->strings[idx] == NULL)
      continue;

    if (tree->ids[idx] == strid) {
      found = idx;
      break;
    }
  }

  /* No slot matched: nothing to remove. */
  if (found < 0)
    return 0;

  int_stree_remove_to_position(tree, found, tree->lengths[found]);
  int_stree_delete_string(tree, found);

  return 1;
}
Esempio n. 3
0
/*
 * stree_add_string
 *
 * Implements Ukkonen's construction algorithm to add a string to the
 * suffix tree.
 *
 * This operation is meant to be an "atomic" operation.  In the far too
 * likely case that the program runs out of memory (or hits the maximum
 * allocated memory set by stree_set_max_alloc), this operation undoes any
 * partial changes it may have made to the tree, and it leaves the tree in
 * its original form.  Thus, you can just keep adding strings until the
 * function returns 0, and not have to worry about whether a call to the
 * function will trash the tree just because there's no memory left.
 *
 * Concretely, every failure path after the string has been registered
 * calls int_stree_remove_to_position(tree, id, j) followed by
 * int_stree_delete_string(tree, id) and then returns 0, where `id' is the
 * value returned by int_stree_insert_string and `j' is the index of the
 * suffix being processed when the failure occurred.
 *
 * NOTE:  The `strid' value given must be unique to any of the strings
 *        added to the tree, and must be a small integer greater than
 *        0.
 *
 *        The best id's to use are to number the strings from 1 to K.
 *
 * Parameters:  tree   -  a suffix tree
 *              S      -  the sequence
 *              M      -  the sequence length
 *              strid  -  the sequence identifier
 *
 * (This signature takes no separate raw-sequence argument.)
 *
 * Returns:  1 on success, 0 on error (including when
 *           int_stree_insert_string returns -1).
 */
int stree_add_string(SUFFIX_TREE tree, char *S, int M, int strid)
{
  int i, j, g, h, gprime, edgelen, id;
  char *edgestr;
  STREE_NODE node, lastnode, root, child, parent;
  STREE_LEAF leaf;

  /*
   * Register the string with the tree.  The returned `id' (not `strid')
   * is what gets passed to every helper below; presumably it is the
   * string's slot in the tree's string table -- TODO confirm.
   */
  id = int_stree_insert_string(tree, S, M, strid);
  if (id == -1)
    return 0;

  /*
   * Run Ukkonen's algorithm to add the string to the suffix tree.
   *
   * State carried between extensions:
   *    node     -  the node whose edge the current position lies on
   *    g        -  how many characters of node's edge have been matched
   *    edgestr  -  cached edge label of node
   *    edgelen  -  cached edge length of node
   *    lastnode -  node from the previous extension, used to fill in
   *                missing suffix links
   */
  root = stree_get_root(tree);
  node = lastnode = root;
  g = 0;
  edgelen = 0;
  edgestr = NULL;

  /*
   * i is the phase (index of the next character to add) and runs up to
   * and including M; j is the start of the suffix being extended.  j is
   * only advanced by the inner loop's increment, so leaving the inner
   * loop with `break' keeps the same j for the next phase.
   */
  for (i=0,j=0; i <= M; i++)  {
    for ( ; j <= i && j < M; j++) {
      /*
       * Perform the extension from S[j..i-1] to S[j..i].  One of the
       * following two cases holds:
       *    a) g == 0, node == root and i == j.
       *         (meaning that in the previous outer loop,
       *          all of the extensions S[1..i-1], S[2..i-1], ...,
       *          S[i-1..i-1] were done.)
       *    b) g > 0, node != root and the string S[j..i-1]
       *       ends at the g'th character of node's edge.
       */
      if (g == 0 || g == edgelen) {
        /* The current position is exactly at a node. */
        if (i < M) {
          /*
           * If a child edge starting with S[i] already exists, step one
           * character onto it and end this phase (j is not advanced).
           */
          if ((child = stree_find_child(tree, node, S[i])) != NULL) {
            node = child;
            g = 1;
            edgestr = stree_get_edgestr(tree, node);
            edgelen = stree_get_edgelen(tree, node);
            break;
          }

          /*
           * Otherwise hang a new leaf for position i off node.  The
           * checks short-circuit in order: leaf allocation, node-count
           * limit, then connection.  On any failure the leaf (if it was
           * allocated) is freed and the partial insertion is rolled back.
           */
          if ((leaf = int_stree_new_leaf(tree, id, i)) == NULL ||
              tree->num_nodes == MAXNUMNODES ||
              (node = int_stree_connect(tree, node,
                                        (STREE_NODE) leaf)) == NULL) {
            if (leaf != NULL)
              int_stree_free_leaf(tree, leaf);
            int_stree_remove_to_position(tree, id, j);
            int_stree_delete_string(tree, id);
            return 0;
          }

          /* Account for the newly connected leaf. */
          tree->num_nodes++;
        }
        else {
          /*
           * i == M: the input is exhausted, so no new leaf is created.
           * If node is a leaf it is first converted with
           * int_stree_convert_leafnode, then suffix j of string id is
           * recorded on it with int_stree_add_intleaf.  Either step
           * failing rolls back and returns 0.
           */
          if ((int_stree_isaleaf(tree, node) &&
               (node = int_stree_convert_leafnode(tree, node)) == NULL)) {
            int_stree_remove_to_position(tree, id, j);
            int_stree_delete_string(tree, id);
            return 0;
          }

          if (!int_stree_add_intleaf(tree, node, id, j)) {
            int_stree_remove_to_position(tree, id, j);
            int_stree_delete_string(tree, id);
            return 0;
          }
        }

        /*
         * If the node from the previous extension is not the root and
         * still lacks a suffix link, point it at the current node.
         */
        if (lastnode != root && lastnode->suffix_link == NULL)
          lastnode->suffix_link = node;
        lastnode = node;
      }
      else {
        /*
         * g > 0 && g < edgelen, and so S[j..i-1] ends in the middle
         * of some edge.
         *
         * If the next character in the edge label matches the next
         * input character, keep moving down that edge.  Otherwise,
         * split the edge at that point and add a new leaf for the
         * suffix.
         */
        if (i < M &&
            stree_mapch(tree, S[i]) == stree_mapch(tree, edgestr[g])) {
          g++;
          break;
        }

        /*
         * Mismatch (or i == M): split node's edge after g characters.
         * node is replaced by the result of int_stree_edge_split.
         * NOTE(review): num_nodes is checked against MAXNUMNODES but is
         * not incremented in this function after a successful split;
         * presumably int_stree_edge_split does that -- TODO confirm.
         */
        if (tree->num_nodes == MAXNUMNODES ||
            (node = int_stree_edge_split(tree, node, g)) == NULL) {
          int_stree_remove_to_position(tree, id, j);
          int_stree_delete_string(tree, id);
          return 0;
        }

        /* Refresh the cached edge data for the node returned by the split. */
        edgestr = stree_get_edgestr(tree, node);
        edgelen = stree_get_edgelen(tree, node);

        if (i < M) {
          /*
           * Attach a new leaf for position i below the split point; same
           * failure handling as in the at-a-node case above.
           */
          if ((leaf = int_stree_new_leaf(tree, id, i)) == NULL ||
              tree->num_nodes == MAXNUMNODES ||
              (node = int_stree_connect(tree, node,
                                        (STREE_NODE) leaf)) == NULL) {
            if (leaf != NULL)
              int_stree_free_leaf(tree, leaf);
            int_stree_remove_to_position(tree, id, j);
            int_stree_delete_string(tree, id);
            return 0;
          }

          /* Account for the newly connected leaf. */
          tree->num_nodes++;
        }
        else {
          /*
           * i == M: record suffix j of string id on the node, converting
           * it first if it is a leaf.  Failures roll back and return 0.
           */
          if ((int_stree_isaleaf(tree, node) &&
               (node = int_stree_convert_leafnode(tree, node)) == NULL)) {
            int_stree_remove_to_position(tree, id, j);
            int_stree_delete_string(tree, id);
            return 0;
          }

          if (!int_stree_add_intleaf(tree, node, id, j)) {
            int_stree_remove_to_position(tree, id, j);
            int_stree_delete_string(tree, id);
            return 0;
          }
        }

        /*
         * If the node from the previous extension is not the root and
         * still lacks a suffix link, point it at the current node.
         */
        if (lastnode != root && lastnode->suffix_link == NULL)
          lastnode->suffix_link = node;
        lastnode = node;
      }

      /*
       * Now, having extended S[j..i-1] to S[j..i] by rule 2, find where
       * S[j+1..i-1] is.
       *
       * Three cases:
       *    - node is the root: nothing to do.
       *    - the position is at the end of node's edge and node has a
       *      suffix link: follow it and stay at the end of the target's
       *      edge.
       *    - otherwise: go up to the parent, follow the parent's suffix
       *      link (or stay at the root, dropping one character from g),
       *      and walk back down g characters of S.
       */
      if (node == root)
        ;
      else if (g == edgelen && node->suffix_link != NULL) {
        node = node->suffix_link;
        edgestr = stree_get_edgestr(tree, node);
        edgelen = stree_get_edgelen(tree, node);
        g = edgelen;
      }
      else {
        /*
         * NOTE(review): parent->suffix_link is used without a NULL
         * check; this assumes every non-root parent reached here already
         * has its suffix link set -- TODO confirm.
         */
        parent = stree_get_parent(tree, node);
        if (parent != root)
          node = parent->suffix_link;
        else {
          node = root;
          g--;
        }
        /* This value is overwritten after the walk below before it is read. */
        edgelen = stree_get_edgelen(tree, node);

        /*
         * Walk down from node along S[h..], consuming whole edges by
         * their length only, until fewer than one full edge of the g
         * remaining characters is left.  h is the index in S where the
         * g characters to be re-walked begin.
         * NOTE(review): the result of stree_find_child is not checked
         * for NULL here; this assumes the path is already in the tree.
         */
        h = i - g;
        while (g > 0) {
          node = stree_find_child(tree, node, S[h]);
          gprime = stree_get_edgelen(tree, node);
          if (gprime > g)
            break;

          g -= gprime;
          h += gprime;
        }

        edgelen = stree_get_edgelen(tree, node);
        edgestr = stree_get_edgestr(tree, node);

        /*
         * g == 0 means the walk ended exactly at node.  In that case
         * node can serve as the suffix-link target for lastnode (unless
         * node is a leaf), and, if node is not the root, g is set to
         * edgelen so the position reads as "at the end of node's edge".
         */
        if (g == 0) {
          if (lastnode != root && !int_stree_isaleaf(tree, node) &&
              lastnode->suffix_link == NULL) {
            lastnode->suffix_link = node;
            lastnode = node;
          }

          if (node != root)
            g = edgelen;
        }
      }
    }
  }

  return 1;
}