/*
 * lca_prep
 *
 * Preprocessing for the constant time LCA algorithm.  Allocates an
 * LCA_STRUCT plus its I, A and L arrays (each sized to the tree's node
 * count plus one), then fills them in with compute_I_and_L followed by
 * compute_A, both started at the root of the tree.
 *
 * Parameters:  tree  -  a suffix tree (asserted to be non-NULL)
 *
 * Returns:  An LCA_STRUCT structure, or NULL if an allocation failed.
 *           When one of the array allocations fails, the partially built
 *           structure is handed to lca_free before returning.
 */
LCA_STRUCT *lca_prep(SUFFIX_TREE tree)
{
  assert(tree);

  LCA_STRUCT *lca = (LCA_STRUCT *)my_calloc(sizeof(LCA_STRUCT), 1);
  if (lca == NULL)
    return NULL;

  lca->type = LCA_LINEAR;
  lca->tree = tree;

  int num_nodes = (int)stree_get_num_nodes(tree) + 1;

  lca->I = (unsigned int *)my_calloc(num_nodes, sizeof(unsigned int));
  if (lca->I == NULL)
    goto fail;

  lca->A = (unsigned int *)my_calloc(num_nodes, sizeof(unsigned int));
  if (lca->A == NULL)
    goto fail;

  lca->L = (STREE_NODE *)my_calloc(num_nodes, sizeof(STREE_NODE));
  if (lca->L == NULL)
    goto fail;

  /* The I and L values must exist before the A values are computed. */
  compute_I_and_L(lca, tree, stree_get_root(tree));
  compute_A(lca, tree, stree_get_root(tree), 0);

  return lca;

fail:
  /* Single cleanup path shared by every failed array allocation. */
  lca_free(lca);
  return NULL;
}
/*
 * int_stree_remove_to_position
 *
 * Remove the suffixes of a string from the suffix tree, and compact the
 * tree as necessary.
 *
 * The routine first walks the whole string down from the root, which
 * locates the end point of suffix 0.  It then removes the suffixes
 * starting at positions 0, 1, ..., num_remove - 1 one after another,
 * hopping from each suffix to the next through the suffix link.  The
 * suffix link is fetched BEFORE the current node is removed.
 *
 * NOTE:  This can be used either to remove a successfully added string,
 *        or a string which was only partially added to the tree (because
 *        an error stopped the add operation).  But it should only be used
 *        to completely remove a string.
 *
 * Parameters:  tree        -  A suffix tree
 *              id          -  The internally used id of the string to remove
 *              num_remove  -  How many positions to remove.
 *
 * Return:  nothing.
 */
void int_stree_remove_to_position(SUFFIX_TREE tree, int id, int num_remove)
{
  int M, walklen, rempos, endpos, nchildren, status;
  char *str;
  STREE_NODE node, next;
  STREE_LEAF leaf;

  if (num_remove == 0)
    return;

  str = int_stree_get_string(tree, id);
  M = int_stree_get_length(tree, id);
  walklen = int_stree_walk_to_leaf(tree, stree_get_root(tree), 0, str, M,
                                   &node, &endpos);
  assert(walklen == M || int_stree_isaleaf(tree, node));

  next = NULL;
  for (rempos = 0; rempos < num_remove; rempos++) {
    /* Grab the successor first; the removal below may free `node'. */
    if (rempos < num_remove - 1) {
      next = stree_get_suffix_link(tree, node);
      assert(next != NULL);
    }

    if (!int_stree_isaleaf(tree, node)) {
      /* The suffix ends at an internal node: drop its "intleaf" record. */
      status = int_stree_remove_intleaf(tree, node, id, rempos);
      assert(status != 0);

      /* Compact the tree if the node has become superfluous. */
      if (!int_stree_has_intleaves(tree, node) &&
          node != stree_get_root(tree)) {
        nchildren = stree_get_num_children(tree, node);
        if (nchildren == 0) {
          int_stree_disconnect(tree, node);
          int_stree_delete_subtree(tree, node);
        }
        else if (nchildren == 1) {
          int_stree_edge_merge(tree, node);
        }
      }
    }
    else {
      /* The suffix ends at a real leaf: unhook it and free it. */
      leaf = (STREE_LEAF) node;
      assert(leaf->strid == id && int_stree_get_leafpos(tree, leaf) == rempos);

      int_stree_disconnect(tree, node);
      int_stree_free_leaf(tree, leaf);
    }

    if (rempos < num_remove - 1)
      node = next;
  }
}
/*
 * int_stree_edge_merge
 *
 * When a node has no "leaves" and only one child, this function will
 * remove that node and merge the edges from parent to node and node
 * to child into a single edge from parent to child.
 *
 * The call is a no-op when the node is the root, is a leaf, still has
 * intleaves, or does not have exactly one child (zero children included).
 *
 * On a successful merge the child is reconnected under the parent in
 * place of `node', the tree's node count is decremented, the node
 * identifiers are marked dirty, and `node' is freed.
 *
 * Parameters:  tree  -  A suffix tree
 *              node  -  The tree node to be removed
 *
 * Return:  nothing.
 */
void int_stree_edge_merge(SUFFIX_TREE tree, STREE_NODE node)
{
  int len;
  STREE_NODE parent, child;
  STREE_LEAF leaf;

  if (node == stree_get_root(tree) || int_stree_isaleaf(tree, node) ||
      int_stree_has_intleaves(tree, node))
    return;

  parent = stree_get_parent(tree, node);
  child = stree_get_children(tree, node);

  /*
   * Exactly one child is required.  A childless node yields a NULL child
   * list, which must not be passed on to stree_get_next.
   */
  if (child == NULL || stree_get_next(tree, child) != NULL)
    return;

  /*
   * Extend the child's edge backwards by the length of node's edge, so
   * that the child's edge covers both of the old edges.
   */
  len = stree_get_edgelen(tree, node);
  if (int_stree_isaleaf(tree, child)) {
    leaf = (STREE_LEAF) child;
    leaf->pos -= len;
    leaf->ch = stree_get_mapch(tree, node);
  }
  else {
    child->edgestr -= len;
    child->edgelen += len;
  }

  int_stree_reconnect(tree, parent, node, child);
  tree->num_nodes--;
  tree->idents_dirty = 1;

  int_stree_free_node(tree, node);
}
/*
 * stree_get_labellen
 *
 * Get the length of the string labelling the path from the root to
 * a tree node.  The length is obtained by summing the edge lengths of
 * the node and of each of its ancestors, up to but excluding the root.
 *
 * Parameters:  tree  -  a suffix tree
 *              node  -  a tree node
 *
 * Returns:  the length of the node's label (0 for the root).
 */
int stree_get_labellen(SUFFIX_TREE tree, STREE_NODE node)
{
  int total = 0;
  STREE_NODE cur;

  for (cur = node; cur != stree_get_root(tree);
       cur = stree_get_parent(tree, cur))
    total += stree_get_edgelen(tree, cur);

  return total;
}
/*
 * int_stree_set_idents
 *
 * Uses the non-recursive traversal to set the identifiers for the current
 * nodes of the suffix tree.  The nodes are numbered in a depth-first
 * manner, beginning from the root (which gets 0) and taking the nodes in
 * the order they appear in the children lists.
 *
 * Nothing is done unless the tree's `idents_dirty' flag is set; the flag
 * is cleared before the renumbering starts.
 *
 * Parameters:  tree  -  A suffix tree
 *
 * Return:  nothing.
 */
void int_stree_set_idents(SUFFIX_TREE tree)
{
  int next_id;
  STREE_NODE cur, succ;

  if (!tree->idents_dirty)
    return;
  tree->idents_dirty = 0;

  /*
   * Use a non-recursive traversal.  See stree_traverse_subtree for
   * details.
   */
  next_id = 0;
  cur = stree_get_root(tree);
  for (;;) {
    cur->id = next_id++;

    /* Descend to the first child whenever there is one. */
    succ = stree_get_children(tree, cur);
    if (succ == NULL) {
      /*
       * Otherwise climb until some node on the way up has a next
       * sibling.  Reaching the root means every node has been numbered.
       */
      for (;;) {
        if (cur == stree_get_root(tree))
          return;

        succ = stree_get_next(tree, cur);
        if (succ != NULL)
          break;

        cur = stree_get_parent(tree, cur);
      }
    }

    cur = succ;
  }
}
/*
 * int_stree_get_suffix_link
 *
 * Traverses the suffix link from a node, and returns the node at the
 * end of the suffix link.
 *
 * Internal nodes simply return their stored `suffix_link' field.  For a
 * leaf, the target is located by moving to the parent's suffix link (or
 * staying at the root and dropping the first character of the leaf's
 * edge when the parent is the root), and then walking down edge by edge
 * until the whole of the remaining edge label has been consumed.
 *
 * Parameters:  tree  -  a suffix tree
 *              node  -  a tree node
 *
 * Return:  The node at the end of the suffix link (NULL for the root).
 */
STREE_NODE int_stree_get_suffix_link(SUFFIX_TREE tree, STREE_NODE node)
{
  int step, remaining;
  char *label;
  STREE_NODE start;

  if (node == stree_get_root(tree))
    return NULL;

  if (!int_stree_isaleaf(tree, node))
    return node->suffix_link;

  label = stree_get_edgestr(tree, node);
  remaining = stree_get_edgelen(tree, node);
  start = stree_get_parent(tree, node);

  /*
   * Do the skip/count trip to skip down to the proper node.
   */
  if (start == stree_get_root(tree)) {
    label++;
    remaining--;
  }
  else {
    start = start->suffix_link;
  }

  node = start;
  while (remaining > 0) {
    /* Only the first character of each edge is needed to pick the child. */
    node = stree_find_child(tree, node, *label);
    assert(node != NULL);

    step = stree_get_edgelen(tree, node);
    label += step;
    remaining -= step;
  }

  return node;
}
/*
 * int_stree_disconnect
 *
 * Disconnects a node from its parent, and compacts the tree if that
 * parent is no longer needed.
 *
 * After the node is unhooked, a non-root parent without intleaves is
 * examined:  with no children left it is itself disconnected (recursively)
 * and deleted, and with a single child left it is merged away with
 * int_stree_edge_merge.  The tree's `idents_dirty' flag is set in every
 * case except when `node' is the root, where the call does nothing.
 *
 * Parameters:  tree  -  a suffix tree
 *              node  -  a tree node
 *
 * Return:  nothing.
 */
void int_stree_disconnect(SUFFIX_TREE tree, STREE_NODE node)
{
  int remaining;
  STREE_NODE parent;

  if (node == stree_get_root(tree))
    return;

  parent = stree_get_parent(tree, node);
  int_stree_disc_from_parent(tree, parent, node);

  if (!int_stree_has_intleaves(tree, parent) &&
      parent != stree_get_root(tree)) {
    remaining = stree_get_num_children(tree, parent);
    switch (remaining) {
    case 0:
      /* The parent is now empty:  remove it from the tree as well. */
      int_stree_disconnect(tree, parent);
      int_stree_delete_subtree(tree, parent);
      break;

    case 1:
      /* The parent is now a pass-through node:  merge its two edges. */
      int_stree_edge_merge(tree, parent);
      break;

    default:
      break;
    }
  }

  tree->idents_dirty = 1;
}
/*
 * stree_get_label
 *
 * Get the string labelling the path from the root to a tree node and
 * store that string (or a part of the string) in the given buffer.
 *
 * If the buffer is strictly longer than the node's label, the whole label
 * is copied and NULL-terminated.  Otherwise exactly `buflen' characters
 * are copied and the string is NOT NULL-terminated:  they are taken from
 * the end of the label when `endflag' is non-zero, and from the beginning
 * of the label when it is zero.
 *
 * Parameters:  tree     -  a suffix tree
 *              node     -  a tree node
 *              buffer   -  the character buffer
 *              buflen   -  the buffer length
 *              endflag  -  copy from the end of the label?
 *
 * Returns:  nothing.
 */
void stree_get_label(SUFFIX_TREE tree, STREE_NODE node, char *buffer,
                     int buflen, int endflag)
{
  int remaining, skip, edgelen;
  char *src, *dst;

  remaining = stree_get_labellen(tree, node);
  skip = 0;

  if (buflen > remaining) {
    buffer[remaining] = '\0';
  }
  else {
    if (!endflag && remaining > buflen)
      skip = remaining - buflen;
    remaining = buflen;
  }

  /*
   * Fill in the buffer from the end to the beginning, as we move up
   * the tree.  If `endflag' is false and the buffer is smaller than
   * the label, then first skip past the `skip' characters nearest the
   * node (the last characters of the label, but the first ones met on
   * the way up to the root).
   */
  dst = buffer + remaining;
  while (remaining > 0 && node != stree_get_root(tree)) {
    edgelen = stree_get_edgelen(tree, node);

    if (skip >= edgelen) {
      /* This whole edge lies inside the part being skipped. */
      skip -= edgelen;
    }
    else {
      /* Drop whatever is left to skip from the tail of this edge. */
      if (skip > 0) {
        edgelen -= skip;
        skip = 0;
      }

      src = stree_get_edgestr(tree, node) + edgelen;
      while (remaining > 0 && edgelen > 0) {
        *--dst = *--src;
        edgelen--;
        remaining--;
      }
    }

    node = stree_get_parent(tree, node);
  }
}
/*
 * stree_delete_tree
 *
 * Frees the SUFFIX_TREE data structure and all of its allocated space:
 * the node subtree under the root, the `strings' table (together with
 * each stored string when the tree's `copyflag' is set), the `ids' and
 * `lengths' tables, and finally the tree structure itself.
 *
 * Parameters:  tree  -  a suffix tree
 *
 * Returns:  nothing.
 */
void stree_delete_tree(SUFFIX_TREE tree)
{
  int slot;

  int_stree_delete_subtree(tree, stree_get_root(tree));

  if (tree->strings != NULL) {
    /* The individual strings are only freed when copyflag is set. */
    if (tree->copyflag) {
      for (slot = 0; slot < MAXNUMSTR; slot++) {
        if (tree->strings[slot] == NULL)
          continue;
        free(tree->strings[slot]);
      }
    }
    free(tree->strings);
  }

  if (tree->ids != NULL) {
    free(tree->ids);
  }

  if (tree->lengths != NULL) {
    free(tree->lengths);
  }

  free(tree);
}
/*
 * int_stree_get_leafpos
 *
 * Return the position of the suffix that ends at that leaf.  This value
 * must be reconstructed by walking back up the tree to the root:  the
 * result is the leaf's `pos' field minus the edge length of every
 * ancestor of the leaf.
 *
 * The walk moves along the `next' pointers until it reaches a node whose
 * `nextisparent' flag is set, and then follows that node's `next'
 * pointer to step up one level.
 *
 * Parameters:  tree  -  A suffix tree
 *              leaf  -  A leaf of the tree
 *
 * Returns:  The starting position of the leaf's suffix.
 */
int int_stree_get_leafpos(SUFFIX_TREE tree, STREE_LEAF leaf)
{
  int suffix_pos = leaf->pos;
  STREE_NODE cur = (STREE_NODE) leaf;
  STREE_NODE root = stree_get_root(tree);

  for (;;) {
    /* Move to the node in this list whose `next' leads to the parent. */
    while (!cur->nextisparent)
      cur = cur->next;

    if (cur == root)
      break;

    /* Step up to the parent and discount its edge. */
    cur = cur->next;
    suffix_pos -= cur->edgelen;
  }

  return suffix_pos;
}
/*
 * stree_match & stree_walk
 *
 * Traverse the path down the tree whose path label matches T, and return
 * the number of characters of T matched, and the node and position along
 * the node's edge where the matching to T ends.
 *
 * stree_match is stree_walk started at the root of the tree, at
 * position 0.
 *
 * Parameters:  tree      -  a suffix tree
 *              node      -  what node to start the walk down the tree
 *              pos       -  position along node's edge to start the walk
 *                             (`node' and `pos' are stree_walk only)
 *              T         -  the sequence to match
 *              N         -  the sequence length
 *              node_out  -  address of where to store the node where
 *                           the traversal ends
 *              pos_out   -  address of where to store the character position
 *                           along the ending node's edge of the endpoint of
 *                           the traversal
 *
 * Returns:  The number of characters of T matched.
 */
int stree_match(SUFFIX_TREE tree, char *T, int N,
                STREE_NODE *node_out, int *pos_out)
{
  STREE_NODE start = stree_get_root(tree);
  int matched;

  matched = stree_walk(tree, start, 0, T, N, node_out, pos_out);
  return matched;
}
/*
 * stree_traverse & stree_traverse_subtree
 *
 * Use a non-recursive traversal of the tree (or a subtree), calling the
 * two function parameters before and after recursing at each node, resp.
 * When memory is at a premium, this traversal may be useful.
 *
 * Note that either of the function parameters can be NULL, if you just
 * need to do pre-order or post-order processing.
 *
 * The traversal uses the `ch' field of the tree nodes to hold its
 * state information.  After the traversal is finished with a node, it
 * will restore that ch value.
 *
 * stree_traverse is stree_traverse_subtree applied to the root of the
 * tree.
 *
 * Parameters:  tree          -  a suffix tree
 *              node          -  root node of the traversal
 *                                 (stree_traverse_subtree only)
 *              preorder_fn   -  function to call before visiting the children
 *              postorder_fn  -  function to call after visiting all children
 *
 * Returns:  nothing.
 */
void stree_traverse(SUFFIX_TREE tree, int (*preorder_fn)(),
                    int (*postorder_fn)())
{
  STREE_NODE start = stree_get_root(tree);

  stree_traverse_subtree(tree, start, preorder_fn, postorder_fn);
}
/*
 * stree_add_string
 *
 * Implements Ukkonen's construction algorithm to add a string to the
 * suffix tree.
 *
 * This operation is an "atomic" operation.  In the far too likely case
 * that the program runs out of memory (or hits the maximum allocated
 * memory set by stree_set_max_alloc), this operation undoes any partial
 * changes it may have made to the tree, and it leaves the tree in
 * its original form.  Thus, you can just keep adding strings until the
 * function returns 0, and not have to worry about whether a call to the
 * function will trash the tree just because there's no memory left.
 *
 * Every failure path below performs the same rollback:  the suffixes
 * added so far (positions 0..j-1) are removed with
 * int_stree_remove_to_position, and the string itself is dropped again
 * with int_stree_delete_string.
 *
 * NOTE:  The `strid' value given must be unique to any of the strings
 *        added to the tree, and must be a small integer greater than
 *        0.
 *
 *        The best id's to use are to number the strings from 1 to K.
 *
 * Parameters:  tree   -  a suffix tree
 *              S      -  the sequence
 *              M      -  the sequence length
 *              strid  -  the sequence identifier
 *
 * Returns:  non-zero on success, zero on error.
 */
int stree_add_string(SUFFIX_TREE tree, char *S, int M, int strid)
{
  int i, j, g, h, gprime, edgelen, id;
  char *edgestr;
  STREE_NODE node, lastnode, root, child, parent;
  STREE_LEAF leaf;

  /*
   * Register the string with the tree.  The value returned is the id used
   * for the string in all of the internal calls below; -1 signals failure.
   */
  id = int_stree_insert_string(tree, S, M, strid);
  if (id == -1)
    return 0;

  /*
   * Run Ukkonen's algorithm to add the string to the suffix tree.
   *
   * State carried between extensions:
   *    node, g           -  the current point in the tree:  g characters
   *                         down the edge leading into `node'
   *    edgestr, edgelen  -  label and length of that edge
   *    lastnode          -  the node handled by the previous extension,
   *                         whose suffix link may still have to be set
   */
  root = stree_get_root(tree);
  node = lastnode = root;
  g = 0;
  edgelen = 0;
  edgestr = NULL;

  /*
   * Note that i runs up to and including M:  the final pass (i == M) has
   * no next character, and instead records the suffixes that end inside
   * the tree as "intleaves" of internal nodes.
   */
  for (i=0,j=0; i <= M; i++) {
    for ( ; j <= i && j < M; j++) {
      /*
       * Perform the extension from S[j..i-1] to S[j..i].  One of the
       * following two cases holds:
       *    a) g == 0, node == root and i == j.
       *         (meaning that in the previous outer loop,
       *          all of the extensions S[1..i-1], S[2..i-1], ...,
       *          S[i-1..i-1] were done.)
       *    b) g > 0, node != root and the string S[j..i-1]
       *         ends at the g'th character of node's edge.
       */
      if (g == 0 || g == edgelen) {
        /*
         * S[j..i-1] ends exactly at `node'.
         */
        if (i < M) {
          /*
           * If a child edge already starts with S[i], S[j..i] is already
           * in the tree:  step one character onto that edge and end this
           * phase (the break leaves j unchanged).
           */
          if ((child = stree_find_child(tree, node, S[i])) != NULL) {
            node = child;
            g = 1;
            edgestr = stree_get_edgestr(tree, node);
            edgelen = stree_get_edgelen(tree, node);
            break;
          }

          /*
           * Otherwise hang a new leaf for suffix j below `node'.  The new
           * leaf is freed here if it was allocated but could not be
           * connected (or the node limit was reached).
           */
          if ((leaf = int_stree_new_leaf(tree, id, i)) == NULL ||
              tree->num_nodes == MAXNUMNODES ||
              (node = int_stree_connect(tree, node,
                                        (STREE_NODE) leaf)) == NULL) {
            if (leaf != NULL)
              int_stree_free_leaf(tree, leaf);
            int_stree_remove_to_position(tree, id, j);
            int_stree_delete_string(tree, id);
            return 0;
          }

          tree->num_nodes++;
        }
        else {
          /*
           * End of the string:  suffix j ends at `node' itself.  A leaf
           * is first converted by int_stree_convert_leafnode, and the
           * suffix is then recorded as an intleaf of the node.
           */
          if ((int_stree_isaleaf(tree, node) &&
              (node = int_stree_convert_leafnode(tree, node)) == NULL)) {
            int_stree_remove_to_position(tree, id, j);
            int_stree_delete_string(tree, id);
            return 0;
          }

          if (!int_stree_add_intleaf(tree, node, id, j)) {
            int_stree_remove_to_position(tree, id, j);
            int_stree_delete_string(tree, id);
            return 0;
          }
        }

        /*
         * Fill in the suffix link left pending by the previous extension.
         */
        if (lastnode != root && lastnode->suffix_link == NULL)
          lastnode->suffix_link = node;
        lastnode = node;
      }
      else {
        /*
         * g > 0 && g < edgelen, and so S[j..i-1] ends in the middle
         * of some edge.
         *
         * If the next character in the edge label matches the next
         * input character, keep moving down that edge.  Otherwise,
         * split the edge at that point and add a new leaf for the
         * suffix.
         */
        if (i < M && stree_mapch(tree, S[i]) == stree_mapch(tree, edgestr[g])) {
          g++;
          break;
        }

        if (tree->num_nodes == MAXNUMNODES ||
            (node = int_stree_edge_split(tree, node, g)) == NULL) {
          int_stree_remove_to_position(tree, id, j);
          int_stree_delete_string(tree, id);
          return 0;
        }

        /*
         * `node' is now whatever int_stree_edge_split returned; refresh
         * the cached edge information to match it.
         */
        edgestr = stree_get_edgestr(tree, node);
        edgelen = stree_get_edgelen(tree, node);

        /*
         * From here on the handling is the same as in the branch above:
         * a new leaf when there is a next character, an intleaf at the
         * end of the string.
         */
        if (i < M) {
          if ((leaf = int_stree_new_leaf(tree, id, i)) == NULL ||
              tree->num_nodes == MAXNUMNODES ||
              (node = int_stree_connect(tree, node,
                                        (STREE_NODE) leaf)) == NULL) {
            if (leaf != NULL)
              int_stree_free_leaf(tree, leaf);
            int_stree_remove_to_position(tree, id, j);
            int_stree_delete_string(tree, id);
            return 0;
          }

          tree->num_nodes++;
        }
        else {
          if ((int_stree_isaleaf(tree, node) &&
              (node = int_stree_convert_leafnode(tree, node)) == NULL)) {
            int_stree_remove_to_position(tree, id, j);
            int_stree_delete_string(tree, id);
            return 0;
          }

          if (!int_stree_add_intleaf(tree, node, id, j)) {
            int_stree_remove_to_position(tree, id, j);
            int_stree_delete_string(tree, id);
            return 0;
          }
        }

        if (lastnode != root && lastnode->suffix_link == NULL)
          lastnode->suffix_link = node;
        lastnode = node;
      }

      /*
       * Now, having extended S[j..i-1] to S[j..i] by rule 2, find where
       * S[j+1..i-1] is.
       */
      if (node == root)
        ;
      else if (g == edgelen && node->suffix_link != NULL) {
        /*
         * The node already has a suffix link:  follow it, and position
         * the current point at the end of the edge into the target node.
         */
        node = node->suffix_link;
        edgestr = stree_get_edgestr(tree, node);
        edgelen = stree_get_edgelen(tree, node);
        g = edgelen;
      }
      else {
        /*
         * No usable suffix link:  go up to the parent, cross the parent's
         * suffix link (or stay at the root and drop one character), then
         * skip/count back down over the remaining g characters.
         */
        parent = stree_get_parent(tree, node);
        if (parent != root)
          node = parent->suffix_link;
        else {
          node = root;
          g--;
        }
        edgelen = stree_get_edgelen(tree, node);

        /*
         * h indexes the first of the g characters still to be walked.
         * Each step picks the child by a single character, and stops
         * early when the walk would end part way down that child's edge.
         * NOTE(review): the result of stree_find_child is used without a
         * NULL check here -- presumably the path is guaranteed to exist
         * at this point; confirm.
         */
        h = i - g;
        while (g > 0) {
          node = stree_find_child(tree, node, S[h]);
          gprime = stree_get_edgelen(tree, node);
          if (gprime > g)
            break;

          g -= gprime;
          h += gprime;
        }

        edgelen = stree_get_edgelen(tree, node);
        edgestr = stree_get_edgestr(tree, node);

        /*
         * g == 0 means the walk ended exactly at `node'.  If that is an
         * internal node, it is the target of lastnode's pending suffix
         * link.  The current point is then expressed as "at the end of
         * node's edge" (g == edgelen), except at the root.
         */
        if (g == 0) {
          if (lastnode != root && !int_stree_isaleaf(tree, node) &&
              lastnode->suffix_link == NULL) {
            lastnode->suffix_link = node;
            lastnode = node;
          }

          if (node != root)
            g = edgelen;
        }
      }
    }
  }

  return 1;
}