/*
 * stree_remove_string
 *
 * Takes a previously added string back out of the suffix tree.  The
 * string is located by its public identifier, its suffixes are pruned
 * from the tree (along with any branches and suffix links that become
 * unnecessary), and finally its slot in the tree's string table is
 * released.
 *
 * Parameters:  tree   -  A suffix tree
 *              strid  -  The identifier of the sequence to be removed.
 *
 * Returns:  non-zero on success, zero on error (no stored string has
 *           the identifier `strid').
 */
int stree_remove_string(SUFFIX_TREE tree, int strid)
{
  int slot = 0;

  /* Scan the string table for an occupied slot carrying `strid'. */
  while (slot < MAXNUMSTR &&
         (tree->strings[slot] == NULL || tree->ids[slot] != strid))
    slot++;

  if (slot >= MAXNUMSTR)
    return 0;

  /*
   * NOTE (debug): the removal below has been observed to cause segfaults.
   * None of the stored strings are guaranteed to be null terminated, which
   * may be part of the problem.  It is also unclear whether
   * int_stree_remove_to_position does the right thing: apart from this
   * function it is only used when stree_add_string fails, so it has
   * probably never been tested properly.
   */
  int_stree_remove_to_position(tree, slot, tree->lengths[slot]);
  int_stree_delete_string(tree, slot);

  return 1;
}
/*
 * stree_remove_string
 *
 * Removes the string registered under `strid' from the suffix tree:
 * all of its suffixes are pruned (compacting the tree and dropping
 * branches and suffix links as needed) and its entry in the tree's
 * string table is deleted.
 *
 * Parameters:  tree   -  A suffix tree
 *              strid  -  The identifier of the sequence to be removed.
 *
 * Returns:  non-zero on success, zero on error (identifier not found).
 */
int stree_remove_string(SUFFIX_TREE tree, int strid)
{
  int idx;

  for (idx = 0; idx < MAXNUMSTR; idx++) {
    /* Skip empty slots and slots that belong to other strings. */
    if (tree->strings[idx] == NULL)
      continue;
    if (tree->ids[idx] != strid)
      continue;

    /* Found it: prune every suffix, then release the table entry. */
    int_stree_remove_to_position(tree, idx, tree->lengths[idx]);
    int_stree_delete_string(tree, idx);
    return 1;
  }

  return 0;
}
/*
 * stree_add_string
 *
 * Implements Ukkonen's construction algorithm to add a string to the
 * suffix tree.
 *
 * This operation is an "atomic" operation.  In the far too likely case
 * that the program runs out of memory (or hits the maximum allocated
 * memory set by stree_set_max_alloc), this operation undoes any partial
 * changes it may have made to the tree, and it leaves the tree in
 * its original form.  Thus, you can just keep adding strings until the
 * function returns 0, and not have to worry about whether a call to the
 * function will trash the tree just because there's no memory left.
 *
 * The same rollback is performed when the node count reaches
 * MAXNUMNODES before a new node can be created.
 *
 * NOTE:  The `strid' value given must be unique to any of the strings
 *        added to the tree, and must be a small integer greater than
 *        0.
 *
 *        The best id's to use are to number the strings from 1 to K.
 *
 * Parameters:  tree   -  a suffix tree
 *              S      -  the sequence
 *              M      -  the sequence length
 *              strid  -  the sequence identifier
 *
 * Returns:  non-zero on success, zero on error.
 */
int stree_add_string(SUFFIX_TREE tree, char *S, int M, int strid)
{
  int i, j, g, h, gprime, edgelen, id;
  char *edgestr;
  STREE_NODE node, lastnode, root, child, parent;
  STREE_LEAF leaf;

  /*
   * Register the string with the tree.  The value returned is the
   * tree-internal handle used for every later call in this function
   * (presumably the slot index in the tree's string table -- confirm
   * against int_stree_insert_string); -1 signals failure.
   */
  id = int_stree_insert_string(tree, S, M, strid);
  if (id == -1)
    return 0;

  /*
   * Run Ukkonen's algorithm to add the string to the suffix tree.
   *
   * State carried between extensions:
   *    node     - node whose incoming edge holds the current position
   *    g        - number of characters matched along that edge
   *    edgestr  - label of node's incoming edge
   *    edgelen  - length of that label
   *    lastnode - node reached by the previous extension, still
   *               waiting for its suffix link when that link is NULL
   */
  root = stree_get_root(tree);
  node = lastnode = root;
  g = 0;
  edgelen = 0;
  edgestr = NULL;

  /*
   * i is the phase (index of the character being appended; i == M is the
   * final pass that records where each remaining suffix ends), and j is
   * the first suffix that still needs an explicit extension.  j is only
   * advanced by the inner loop, so a `break' below leaves j unchanged for
   * the next phase.
   */
  for (i=0,j=0; i <= M; i++) {
    for ( ; j <= i && j < M; j++) {
      /*
       * Perform the extension from S[j..i-1] to S[j..i].  One of the
       * following two cases holds:
       *    a) g == 0, node == root and i == j.
       *         (meaning that in the previous outer loop,
       *          all of the extensions S[1..i-1], S[2..i-1], ...,
       *          S[i-1..i-1] were done.)
       *    b) g > 0, node != root and the string S[j..i-1]
       *         ends at the g'th character of node's edge.
       */
      if (g == 0 || g == edgelen) {
        /* The current position is exactly at `node'. */
        if (i < M) {
          if ((child = stree_find_child(tree, node, S[i])) != NULL) {
            /*
             * An edge starting with S[i] already exists: step onto its
             * first character and end this phase.
             */
            node = child;
            g = 1;
            edgestr = stree_get_edgestr(tree, node);
            edgelen = stree_get_edgelen(tree, node);
            break;
          }

          /*
           * No such edge: hang a new leaf for suffix position i below
           * `node'.  On any failure, free the unattached leaf (if one was
           * allocated), undo the partial insertion and report the error.
           */
          if ((leaf = int_stree_new_leaf(tree, id, i)) == NULL ||
              tree->num_nodes == MAXNUMNODES ||
              (node = int_stree_connect(tree, node,
                                        (STREE_NODE) leaf)) == NULL) {
            if (leaf != NULL)
              int_stree_free_leaf(tree, leaf);
            int_stree_remove_to_position(tree, id, j);
            int_stree_delete_string(tree, id);
            return 0;
          }

          tree->num_nodes++;
        }
        else {
          /*
           * End of the string (i == M): suffix j ends at `node' itself.
           * If `node' is a leaf it is first converted so that it can
           * carry the extra suffix record, then the record for suffix j
           * is attached.  Either step failing triggers the rollback.
           */
          if ((int_stree_isaleaf(tree, node) &&
               (node = int_stree_convert_leafnode(tree, node)) == NULL)) {
            int_stree_remove_to_position(tree, id, j);
            int_stree_delete_string(tree, id);
            return 0;
          }

          if (!int_stree_add_intleaf(tree, node, id, j)) {
            int_stree_remove_to_position(tree, id, j);
            int_stree_delete_string(tree, id);
            return 0;
          }
        }

        /*
         * The node reached by the previous extension gets its pending
         * suffix link pointed at the node reached by this one.
         */
        if (lastnode != root && lastnode->suffix_link == NULL)
          lastnode->suffix_link = node;

        lastnode = node;
      }
      else {
        /*
         * g > 0 && g < edgelen, and so S[j..i-1] ends in the middle
         * of some edge.
         *
         * If the next character in the edge label matches the next
         * input character, keep moving down that edge.  Otherwise,
         * split the edge at that point and add a new leaf for the
         * suffix.
         */
        if (i < M &&
            stree_mapch(tree, S[i]) == stree_mapch(tree, edgestr[g])) {
          g++;
          break;
        }

        /*
         * Split the edge after g characters; `node' becomes the node
         * returned by the split.  Failure (or a full node table) rolls
         * the insertion back.
         */
        if (tree->num_nodes == MAXNUMNODES ||
            (node = int_stree_edge_split(tree, node, g)) == NULL) {
          int_stree_remove_to_position(tree, id, j);
          int_stree_delete_string(tree, id);
          return 0;
        }

        edgestr = stree_get_edgestr(tree, node);
        edgelen = stree_get_edgelen(tree, node);

        if (i < M) {
          /* Attach the new leaf for suffix position i below the split. */
          if ((leaf = int_stree_new_leaf(tree, id, i)) == NULL ||
              tree->num_nodes == MAXNUMNODES ||
              (node = int_stree_connect(tree, node,
                                        (STREE_NODE) leaf)) == NULL) {
            if (leaf != NULL)
              int_stree_free_leaf(tree, leaf);
            int_stree_remove_to_position(tree, id, j);
            int_stree_delete_string(tree, id);
            return 0;
          }

          tree->num_nodes++;
        }
        else {
          /*
           * End of the string: record suffix j at the node produced by
           * the split (same handling as in the branch above).
           */
          if ((int_stree_isaleaf(tree, node) &&
               (node = int_stree_convert_leafnode(tree, node)) == NULL)) {
            int_stree_remove_to_position(tree, id, j);
            int_stree_delete_string(tree, id);
            return 0;
          }

          if (!int_stree_add_intleaf(tree, node, id, j)) {
            int_stree_remove_to_position(tree, id, j);
            int_stree_delete_string(tree, id);
            return 0;
          }
        }

        if (lastnode != root && lastnode->suffix_link == NULL)
          lastnode->suffix_link = node;

        lastnode = node;
      }

      /*
       * Now, having extended S[j..i-1] to S[j..i] by rule 2, find where
       * S[j+1..i-1] is.
       */
      if (node == root)
        ;    /* Already at the root: nothing to follow. */
      else if (g == edgelen && node->suffix_link != NULL) {
        /* Positioned at a node that has a suffix link: follow it. */
        node = node->suffix_link;
        edgestr = stree_get_edgestr(tree, node);
        edgelen = stree_get_edgelen(tree, node);
        g = edgelen;
      }
      else {
        /*
         * No usable suffix link here.  Go up to the parent and follow its
         * suffix link; when the parent is the root, restart from the root
         * with one character fewer to match.
         */
        parent = stree_get_parent(tree, node);
        if (parent != root)
          node = parent->suffix_link;
        else {
          node = root;
          g--;
        }
        edgelen = stree_get_edgelen(tree, node);

        /*
         * Walk back down over the g characters S[h..i-1], hopping a whole
         * edge at a time; stop as soon as the remaining count ends inside
         * the current edge.
         */
        h = i - g;
        while (g > 0) {
          node = stree_find_child(tree, node, S[h]);
          gprime = stree_get_edgelen(tree, node);
          if (gprime > g)
            break;

          g -= gprime;
          h += gprime;
        }

        edgelen = stree_get_edgelen(tree, node);
        edgestr = stree_get_edgestr(tree, node);

        if (g == 0) {
          /*
           * The walk ended exactly at `node'.  If it is not a leaf, it is
           * the target of lastnode's pending suffix link.
           */
          if (lastnode != root && !int_stree_isaleaf(tree, node) &&
              lastnode->suffix_link == NULL) {
            lastnode->suffix_link = node;
            lastnode = node;
          }

          /* Express "at node" as "at the end of node's edge". */
          if (node != root)
            g = edgelen;
        }
      }
    }
  }

  return 1;
}