/*
 * lca_naive_lookup
 *
 * Find the least common ancestor of two suffix tree nodes the slow way:
 * climb from both nodes toward the root until the two climbs meet.
 *
 * The climb relies on the suffix tree identifiers having been assigned
 * in depth-first order.  At every step, whichever of the two current
 * nodes carries the larger identifier is replaced by its parent; once
 * the two identifiers agree, that node is the least common ancestor.
 *
 * Parameters:  lca  -  an LCA_STRUCT structure (must be of type LCA_NAIVE)
 *              x    -  a suffix tree node
 *              y    -  another suffix tree node
 *
 * Returns:  the suffix tree node which is the LCA of x and y
 */
STREE_NODE lca_naive_lookup(LCA_STRUCT *lca, STREE_NODE x, STREE_NODE y)
{
  assert (lca && lca->type == LCA_NAIVE && x && y);

  SUFFIX_TREE tree = lca->tree;
  int id_x = stree_get_ident(tree, x);
  int id_y = stree_get_ident(tree, y);

  while (id_x != id_y) {
    if (id_x > id_y) {
      /* x has the larger identifier: lift x one level. */
      x = stree_get_parent(tree, x);
      id_x = stree_get_ident(tree, x);
    }
    else {
      /* y has the larger identifier: lift y one level. */
      y = stree_get_parent(tree, y);
      id_y = stree_get_ident(tree, y);
    }
    IF_STATS(lca->num_compares++);
  }

  /* One further comparison is counted once the identifiers agree. */
  IF_STATS(lca->num_compares++);

  return x;
}
/*
 * lca_lookup
 *
 * Perform the constant time LCA computation, finding the least
 * common ancestor of x and y.
 *
 * The lookup reads the preprocessed tables stored in the LCA_STRUCT
 * (I, A and L), indexed by the node identifier shifted up by one so
 * that identifiers run from 1..num_nodes.
 *
 * Parameters:  lca  -  an LCA_STRUCT structure (must be of type LCA_LINEAR
 *                      and have its tree set)
 *              x    -  a suffix tree node
 *              y    -  another suffix tree node
 *
 * Returns:  the suffix tree node which is the LCA of x and y
 */
STREE_NODE lca_lookup(LCA_STRUCT *lca, STREE_NODE x, STREE_NODE y)
{
  assert(lca && lca->tree && lca->type == LCA_LINEAR && x && y);

  SUFFIX_TREE tree = lca->tree;
  unsigned int *I = lca->I;
  unsigned int *A = lca->A;
  STREE_NODE *L = lca->L;

  /* Shift idents so that they go from 1..num_nodes. */
  unsigned int xid = (unsigned int)stree_get_ident(tree, x) + 1;
  unsigned int yid = (unsigned int)stree_get_ident(tree, y) + 1;

  /*
   * Steps 1 and 2.
   *
   * Step 1 here differs from the book in that it returns the most
   * significant bit counting from the right (and starting the count
   * with 0), and then simply OR's the k+1..32 bits of I[xid] and the
   * number 2^k.
   *
   * The single bit is built with an unsigned constant (1U << k): k is
   * an unsigned bit index, and shifting a signed 1 into the sign bit
   * would be undefined behavior.
   */
  unsigned int k = MSB(I[xid] ^ I[yid]);
  unsigned int b = (I[xid] & HIGH_BITS(k+1)) | (1U << k);
  unsigned int j = h( (A[xid] & A[yid]) & HIGH_BITS(h(b)) );
  IF_STATS(lca->num_compares++);

  /*
   * Step 3.  Find xbar: x itself when h(A[xid]) equals j, otherwise
   * the parent of the node that L holds for the number built from
   * I[xid] and the bit position k.
   */
  unsigned int l = h(A[xid]);
  STREE_NODE xbar, ybar;
  if (l == j) {
    xbar = x;
  }
  else {
    k = MSB(A[xid] & ~HIGH_BITS(j));
    xbar = stree_get_parent(tree, L[(I[xid] & HIGH_BITS(k+1)) | (1U << k)]);
  }
  IF_STATS(lca->num_compares++);

  /* Step 4.  The same computation for y, giving ybar. */
  l = h(A[yid]);
  if (l == j) {
    ybar = y;
  }
  else {
    k = MSB(A[yid] & ~HIGH_BITS(j));
    ybar = stree_get_parent(tree, L[(I[yid] & HIGH_BITS(k+1)) | (1U << k)]);
  }
  IF_STATS(lca->num_compares++);

  /* Step 5.  The answer is whichever of xbar, ybar has the smaller ident. */
  IF_STATS(lca->num_compares++);
  return (stree_get_ident(tree, xbar) < stree_get_ident(tree, ybar)) ? xbar
                                                                     : ybar;
}
/*
 * int_stree_convert_leafnode
 *
 * Replace a LEAF structure in the suffix tree with a freshly allocated
 * NODE structure covering the same edge.  The leaf's string id and
 * position are carried over into a new intleaf attached to that node,
 * the new node takes the leaf's place under the leaf's parent, and the
 * LEAF structure is freed.
 *
 * If `node' is not a leaf it is returned unchanged.
 *
 * Parameters:  tree  -  a suffix tree
 *              node  -  a leaf of the tree
 *
 * Returns:  The NODE structure corresponding to the leaf, or NULL if
 *           either allocation fails (the tree is left untouched then).
 */
STREE_NODE int_stree_convert_leafnode(SUFFIX_TREE tree, STREE_NODE node)
{
  STREE_LEAF old_leaf;
  STREE_NODE internal;
  STREE_INTLEAF ileaf;

  /* Already an internal node: nothing to convert. */
  if (!int_stree_isaleaf(tree, node))
    return node;

  old_leaf = (STREE_LEAF) node;

  /* Allocate the replacement node with the leaf's edge label. */
  internal = int_stree_new_node(tree, stree_get_edgestr(tree, node),
                                stree_get_edgelen(tree, node));
  if (internal == NULL)
    return NULL;

  /* Allocate the intleaf that records the leaf's (string, position). */
  ileaf = int_stree_new_intleaf(tree, old_leaf->strid,
                                int_stree_get_leafpos(tree, old_leaf));
  if (ileaf == NULL) {
    int_stree_free_node(tree, internal);
    return NULL;
  }

  /*
   * NOTE(review): the intleaf is stored through the `children' field
   * with `ch' set to 1 -- presumably this node layout's way of marking
   * that the node carries intleaves; confirm against the struct
   * definition.
   */
  internal->id = old_leaf->id;
  internal->isaleaf = 0;
  internal->ch = 1;
  internal->children = (STREE_NODE) ileaf;

  /* Swap the new node in under the old leaf's parent, then drop the leaf. */
  int_stree_reconnect(tree, stree_get_parent(tree, node), node, internal);
  int_stree_free_leaf(tree, old_leaf);

  return internal;
}
/*
 * int_stree_edge_merge
 *
 * When a node has no "leaves" and only one child, this function will
 * remove that node and merge the edges from parent to node and node
 * to child into a single edge from parent to child.
 *
 * The call does nothing when the node is the root, is a leaf, has
 * intleaves, has no children at all, or has more than one child.
 *
 * On a successful merge the tree's node count is decremented, the
 * identifiers are marked dirty and the removed node is freed.
 *
 * Parameters:  tree  -  A suffix tree
 *              node  -  The tree node to be removed
 *
 * Return:  nothing.
 */
void int_stree_edge_merge(SUFFIX_TREE tree, STREE_NODE node)
{
  int len;
  STREE_NODE parent, child;
  STREE_LEAF leaf;

  if (node == stree_get_root(tree) || int_stree_isaleaf(tree, node) ||
      int_stree_has_intleaves(tree, node))
    return;

  parent = stree_get_parent(tree, node);
  child = stree_get_children(tree, node);

  /*
   * Exactly one child is required.  The NULL test keeps a childless
   * node from being handed to the accessors below.
   */
  if (child == NULL || stree_get_next(tree, child) != NULL)
    return;

  len = stree_get_edgelen(tree, node);

  /*
   * Stretch the child's edge backwards by the length of node's edge.
   * NOTE(review): this assumes the two edge labels are adjacent in the
   * same underlying string -- confirm against how edges are stored.
   */
  if (int_stree_isaleaf(tree, child)) {
    leaf = (STREE_LEAF) child;
    leaf->pos -= len;
    leaf->ch = stree_get_mapch(tree, node);
  }
  else {
    child->edgestr -= len;
    child->edgelen += len;
  }

  /* Put the child where node used to hang, then discard node. */
  int_stree_reconnect(tree, parent, node, child);
  tree->num_nodes--;
  tree->idents_dirty = 1;

  int_stree_free_node(tree, node);
}
/*
 * stree_get_labellen
 *
 * Compute the length of a node's label, i.e. of the string spelled out
 * by the path from the root down to that node.  The length is obtained
 * by walking from the node up to the root and summing the edge lengths
 * met along the way.
 *
 * Parameters:  tree  -  a suffix tree
 *              node  -  a tree node
 *
 * Returns:  the length of the node's label (0 for the root).
 */
int stree_get_labellen(SUFFIX_TREE tree, STREE_NODE node)
{
  int total = 0;
  STREE_NODE cur;

  for (cur = node; cur != stree_get_root(tree);
       cur = stree_get_parent(tree, cur))
    total += stree_get_edgelen(tree, cur);

  return total;
}
/*
 * int_stree_edge_split
 *
 * Break the edge leading into `node' at offset `len' and insert a new
 * internal node at the break.  Afterwards the new node hangs below
 * node's former parent with the first `len' characters of the edge,
 * and `node' hangs below the new node with the remainder.
 *
 * Nothing is done (and NULL is returned) when the tree already holds
 * MAXNUMNODES nodes, when `len' is 0, or when `len' is not strictly
 * smaller than node's edge length.  If connecting `node' below the new
 * node fails, the changes made so far are reverted before returning.
 *
 * Parameters:  tree  -  a suffix tree
 *              node  -  The tree node just below the split.
 *              len   -  How far down node's edge label the split is.
 *
 * Return:  The new node added at the split, or NULL on failure.
 */
STREE_NODE int_stree_edge_split(SUFFIX_TREE tree, STREE_NODE node, int len)
{
  char *label;
  STREE_NODE above, middle;
  STREE_LEAF lf;

  if (tree->num_nodes == MAXNUMNODES)
    return NULL;
  if (len == 0 || stree_get_edgelen(tree, node) <= len)
    return NULL;

  /* Remember the start of the edge label before node's edge is trimmed. */
  label = stree_get_edgestr(tree, node);

  middle = int_stree_new_node(tree, label, len);
  if (middle == NULL)
    return NULL;

  /* The new node takes node's slot under the parent. */
  above = stree_get_parent(tree, node);
  int_stree_reconnect(tree, above, node, middle);

  /* Trim the first `len' characters off node's own edge. */
  if (int_stree_isaleaf(tree, node)) {
    lf = (STREE_LEAF) node;
    lf->pos += len;
    lf->ch = stree_mapch(tree, label[len]);
  }
  else {
    node->edgestr += len;
    node->edgelen -= len;
  }

  if (int_stree_connect(tree, middle, node) != NULL) {
    tree->num_nodes++;
    tree->idents_dirty = 1;
    return middle;
  }

  /*
   * Connecting failed: restore node's original edge, put node back
   * under its parent and release the new node.
   */
  if (int_stree_isaleaf(tree, node)) {
    lf = (STREE_LEAF) node;
    lf->pos -= len;
    lf->ch = stree_mapch(tree, label[0]);
  }
  else {
    node->edgestr -= len;
    node->edgelen += len;
  }
  int_stree_reconnect(tree, above, middle, node);
  int_stree_free_node(tree, middle);

  return NULL;
}
/*
 * stree_traverse_subtree
 *
 * Walk the subtree rooted at `root' without recursion, calling the
 * given callbacks on every node of that subtree.
 *
 * `preorder_fn' is called on a node before any of its children are
 * visited and `postorder_fn' after all of them have been visited.
 * Either callback may be NULL, in which case it is skipped.  Each is
 * invoked as fn(tree, node) and its return value is ignored.
 *
 * Parameters:  tree          -  a suffix tree
 *              root          -  the root node of the subtree to walk
 *              preorder_fn   -  callback run when a node is first reached
 *              postorder_fn  -  callback run when a node is finished
 *
 * Returns:  nothing.
 */
void stree_traverse_subtree(SUFFIX_TREE tree, STREE_NODE root,
                            int (*preorder_fn)(), int (*postorder_fn)())
{
  STREE_NODE cur, child, sibling;

  cur = root;
  for (;;) {
    /*
     * Entering a node: run the preorder callback, then descend to the
     * first child if there is one.
     */
    if (preorder_fn != NULL)
      preorder_fn(tree, cur);

    child = stree_get_children(tree, cur);
    if (child != NULL) {
      cur = child;
      continue;
    }

    /*
     * No (more) children below `cur'.  Finish it, then step sideways
     * to its next sibling if it has one; otherwise climb to the parent
     * and finish that too.  The walk ends once the subtree root itself
     * has been finished.
     */
    for (;;) {
      if (postorder_fn != NULL)
        postorder_fn(tree, cur);

      if (cur == root)
        return;

      sibling = stree_get_next(tree, cur);
      if (sibling != NULL)
        break;

      cur = stree_get_parent(tree, cur);
    }

    cur = sibling;
  }
}
/*
 * stree_get_label
 *
 * Copy the label of a tree node (the string spelled by the path from
 * the root to the node), or a part of it, into a caller-supplied buffer.
 *
 * When `buflen' is strictly greater than the label length the whole
 * label is copied and a terminating '\0' is written after it.
 * Otherwise exactly `buflen' characters are copied and NO terminator
 * is written: the last `buflen' characters of the label when `endflag'
 * is non-zero, the first `buflen' characters when it is zero.
 *
 * Parameters:  tree     -  a suffix tree
 *              node     -  a tree node
 *              buffer   -  the character buffer
 *              buflen   -  the buffer length
 *              endflag  -  copy from the end of the label?
 *
 * Returns:  nothing.
 */
void stree_get_label(SUFFIX_TREE tree, STREE_NODE node, char *buffer,
                     int buflen, int endflag)
{
  int remaining, to_skip, take;
  char *src, *dst;

  remaining = stree_get_labellen(tree, node);
  to_skip = 0;

  if (buflen > remaining)
    buffer[remaining] = '\0';
  else {
    /*
     * The label does not fit with a terminator.  To keep the beginning
     * of the label, the trailing characters (the ones met first when
     * climbing from the node) have to be passed over.
     */
    if (remaining > buflen && !endflag)
      to_skip = remaining - buflen;
    remaining = buflen;
  }

  /*
   * The buffer is filled back to front while climbing toward the root,
   * copying each edge label from its last character to its first.
   */
  dst = buffer + remaining;
  while (remaining > 0 && node != stree_get_root(tree)) {
    take = stree_get_edgelen(tree, node);

    if (to_skip >= take) {
      /* This whole edge lies in the part being passed over. */
      to_skip -= take;
    }
    else {
      /* Drop the skipped tail of this edge, if any. */
      if (to_skip > 0) {
        take -= to_skip;
        to_skip = 0;
      }

      src = stree_get_edgestr(tree, node) + take;
      while (remaining > 0 && take > 0) {
        *--dst = *--src;
        take--;
        remaining--;
      }
    }

    node = stree_get_parent(tree, node);
  }
}
/*
 * int_stree_set_idents
 *
 * Renumber the nodes of the suffix tree.  Identifiers are handed out
 * starting from 0 at the root, in depth-first order, visiting children
 * in the order in which they appear in the children lists.
 *
 * The work is only done when the tree's `idents_dirty' flag is set;
 * the flag is cleared before the renumbering starts.
 *
 * Parameters:  tree  -  A suffix tree
 *
 * Return:  nothing.
 */
void int_stree_set_idents(SUFFIX_TREE tree)
{
  int next_id;
  STREE_NODE cur, child, sibling;

  if (!tree->idents_dirty)
    return;
  tree->idents_dirty = 0;

  /*
   * Same non-recursive walk as stree_traverse_subtree: number a node
   * when it is first reached, descend to its first child, and when
   * there is none move on to the next sibling of the nearest node on
   * the way up that has one.
   */
  next_id = 0;
  cur = stree_get_root(tree);
  for (;;) {
    cur->id = next_id;
    next_id++;

    child = stree_get_children(tree, cur);
    if (child != NULL) {
      cur = child;
      continue;
    }

    for (;;) {
      /* Climbed all the way back to the root: every node is numbered. */
      if (cur == stree_get_root(tree))
        return;

      sibling = stree_get_next(tree, cur);
      if (sibling != NULL)
        break;

      cur = stree_get_parent(tree, cur);
    }

    cur = sibling;
  }
}
/*
 * int_stree_get_suffix_link
 *
 * Follow the suffix link out of a node and return the node it leads to.
 *
 * Internal nodes store their suffix link, which is returned directly.
 * For a leaf the target is found by a skip/count walk: start from the
 * suffix link of the leaf's parent (or from the root, dropping the
 * first character of the edge, when the parent is the root) and walk
 * down along the leaf's edge label one whole edge at a time.
 *
 * Parameters:  tree  -  a suffix tree
 *              node  -  a tree node
 *
 * Return:  The node at the end of the suffix link, or NULL when `node'
 *          is the root.
 */
STREE_NODE int_stree_get_suffix_link(SUFFIX_TREE tree, STREE_NODE node)
{
  int remaining, step;
  char *cursor;
  STREE_NODE start;

  if (node == stree_get_root(tree))
    return NULL;
  if (!int_stree_isaleaf(tree, node))
    return node->suffix_link;

  cursor = stree_get_edgestr(tree, node);
  remaining = stree_get_edgelen(tree, node);
  start = stree_get_parent(tree, node);

  /*
   * Pick the starting point of the skip/count walk.
   */
  if (start == stree_get_root(tree)) {
    cursor++;
    remaining--;
  }
  else
    start = start->suffix_link;

  /*
   * Walk down, choosing each child by the next character still to be
   * matched and skipping that child's entire edge.
   */
  node = start;
  while (remaining > 0) {
    node = stree_find_child(tree, node, *cursor);
    assert(node != NULL);

    step = stree_get_edgelen(tree, node);
    cursor += step;
    remaining -= step;
  }

  return node;
}
/*
 * int_stree_disconnect
 *
 * Detach a node from its parent and then tidy up the parent if it has
 * become unnecessary.  A parent that is not the root and carries no
 * intleaves is
 *   - itself disconnected (recursively) and deleted when it is left
 *     with no children, or
 *   - merged away with int_stree_edge_merge when it is left with
 *     exactly one child.
 *
 * The tree's identifiers are marked dirty afterwards.  Calling this on
 * the root does nothing.
 *
 * Parameters:  tree  -  a suffix tree
 *              node  -  a tree node
 *
 * Return:  nothing.
 */
void int_stree_disconnect(SUFFIX_TREE tree, STREE_NODE node)
{
  int remaining;
  STREE_NODE parent;

  if (node == stree_get_root(tree))
    return;

  parent = stree_get_parent(tree, node);
  int_stree_disc_from_parent(tree, parent, node);

  /* Only a non-root parent without intleaves can be compacted. */
  if (!int_stree_has_intleaves(tree, parent) &&
      parent != stree_get_root(tree)) {
    remaining = stree_get_num_children(tree, parent);

    if (remaining == 0) {
      int_stree_disconnect(tree, parent);
      int_stree_delete_subtree(tree, parent);
    }
    else if (remaining == 1) {
      int_stree_edge_merge(tree, parent);
    }
  }

  tree->idents_dirty = 1;
}
/*
 * stree_add_string
 *
 * Implements Ukkonen's construction algorithm to add a string to the
 * suffix tree.
 *
 * This operation is an "atomic" operation.  In the far too likely case
 * that the program runs out of memory (or hits the maximum allocated
 * memory set by stree_set_max_alloc), this operation undoes any partial
 * changes it may have made to the tree, and it leaves the tree in
 * its original form.  Thus, you can just keep adding strings until the
 * function returns 0, and not have to worry about whether a call to the
 * function will trash the tree just because there's no memory left.
 *
 * Every failure path below has the same shape: it calls
 * int_stree_remove_to_position(tree, id, j) followed by
 * int_stree_delete_string(tree, id) and returns 0, where `id' is the
 * value returned by int_stree_insert_string for this string and `j' is
 * the index of the suffix being extended when the failure occurred.
 *
 * NOTE:  The `strid' value given must be unique to any of the strings
 *        added to the tree, and must be a small integer greater than
 *        0.
 *
 *        The best id's to use are to number the strings from 1 to K.
 *
 * Parameters:  tree   -  a suffix tree
 *              S      -  the sequence
 *              M      -  the sequence length
 *              strid  -  the sequence identifier
 *
 * Returns:  non-zero on success, zero on error.
 */
int stree_add_string(SUFFIX_TREE tree, char *S, int M, int strid)
{
  int i, j, g, h, gprime, edgelen, id;
  char *edgestr;
  STREE_NODE node, lastnode, root, child, parent;
  STREE_LEAF leaf;

  /*
   * Register the string with the tree.  The returned `id' (not `strid')
   * is what the leaves and intleaves created below are tagged with.
   */
  id = int_stree_insert_string(tree, S, M, strid);
  if (id == -1)
    return 0;

  /*
   * Run Ukkonen's algorithm to add the string to the suffix tree.
   */
  root = stree_get_root(tree);
  node = lastnode = root;
  g = 0;
  edgelen = 0;
  edgestr = NULL;

  /*
   * i runs one past the end of the string (i == M); in that final phase
   * no character is read and the remaining suffixes are recorded as
   * intleaves rather than as new leaves.
   */
  for (i=0,j=0; i <= M; i++) {
    for ( ; j <= i && j < M; j++) {
      /*
       * Perform the extension from S[j..i-1] to S[j..i].  One of the
       * following two cases holds:
       *    a) g == 0, node == root and i == j.
       *         (meaning that in the previous outer loop,
       *          all of the extensions S[1..i-1], S[2..i-1], ...,
       *          S[i-1..i-1] were done.)
       *    b) g > 0, node != root and the string S[j..i-1]
       *         ends at the g'th character of node's edge.
       */
      if (g == 0 || g == edgelen) {
        /*
         * S[j..i-1] ends exactly at `node'.
         */
        if (i < M) {
          /*
           * If a child already starts with S[i], the extension is
           * implicit: step onto that child's edge and end this phase.
           */
          if ((child = stree_find_child(tree, node, S[i])) != NULL) {
            node = child;
            g = 1;
            edgestr = stree_get_edgestr(tree, node);
            edgelen = stree_get_edgelen(tree, node);
            break;
          }

          /*
           * Otherwise hang a new leaf for suffix j below `node'.  The
           * leaf is freed here if it was allocated but could not be
           * connected.
           */
          if ((leaf = int_stree_new_leaf(tree, id, i)) == NULL ||
              tree->num_nodes == MAXNUMNODES ||
              (node = int_stree_connect(tree, node,
                                        (STREE_NODE) leaf)) == NULL) {
            if (leaf != NULL)
              int_stree_free_leaf(tree, leaf);
            int_stree_remove_to_position(tree, id, j);
            int_stree_delete_string(tree, id);
            return 0;
          }

          tree->num_nodes++;
        }
        else {
          /*
           * End of the string: suffix j ends at `node'.  A leaf is
           * first converted into an internal node, then the suffix is
           * recorded on the node as an intleaf.
           */
          if ((int_stree_isaleaf(tree, node) &&
               (node = int_stree_convert_leafnode(tree, node)) == NULL)) {
            int_stree_remove_to_position(tree, id, j);
            int_stree_delete_string(tree, id);
            return 0;
          }

          if (!int_stree_add_intleaf(tree, node, id, j)) {
            int_stree_remove_to_position(tree, id, j);
            int_stree_delete_string(tree, id);
            return 0;
          }
        }

        /*
         * Fill in the still-missing suffix link of the node handled in
         * the previous extension.
         */
        if (lastnode != root && lastnode->suffix_link == NULL)
          lastnode->suffix_link = node;
        lastnode = node;
      }
      else {
        /*
         * g > 0 && g < edgelen, and so S[j..i-1] ends in the middle
         * of some edge.
         *
         * If the next character in the edge label matches the next
         * input character, keep moving down that edge.  Otherwise,
         * split the edge at that point and add a new leaf for the
         * suffix.
         */
        if (i < M && stree_mapch(tree, S[i]) == stree_mapch(tree, edgestr[g])) {
          g++;
          break;
        }

        if (tree->num_nodes == MAXNUMNODES ||
            (node = int_stree_edge_split(tree, node, g)) == NULL) {
          int_stree_remove_to_position(tree, id, j);
          int_stree_delete_string(tree, id);
          return 0;
        }

        /* `node' is now the node created by the split. */
        edgestr = stree_get_edgestr(tree, node);
        edgelen = stree_get_edgelen(tree, node);

        if (i < M) {
          /* Hang the new leaf for suffix j below the split node. */
          if ((leaf = int_stree_new_leaf(tree, id, i)) == NULL ||
              tree->num_nodes == MAXNUMNODES ||
              (node = int_stree_connect(tree, node,
                                        (STREE_NODE) leaf)) == NULL) {
            if (leaf != NULL)
              int_stree_free_leaf(tree, leaf);
            int_stree_remove_to_position(tree, id, j);
            int_stree_delete_string(tree, id);
            return 0;
          }

          tree->num_nodes++;
        }
        else {
          /* End of the string: record suffix j as an intleaf. */
          if ((int_stree_isaleaf(tree, node) &&
               (node = int_stree_convert_leafnode(tree, node)) == NULL)) {
            int_stree_remove_to_position(tree, id, j);
            int_stree_delete_string(tree, id);
            return 0;
          }

          if (!int_stree_add_intleaf(tree, node, id, j)) {
            int_stree_remove_to_position(tree, id, j);
            int_stree_delete_string(tree, id);
            return 0;
          }
        }

        if (lastnode != root && lastnode->suffix_link == NULL)
          lastnode->suffix_link = node;
        lastnode = node;
      }

      /*
       * Now, having extended S[j..i-1] to S[j..i] by rule 2, find where
       * S[j+1..i-1] is.
       */
      if (node == root)
        ;
      else if (g == edgelen && node->suffix_link != NULL) {
        /* The node already has a suffix link: just follow it. */
        node = node->suffix_link;
        edgestr = stree_get_edgestr(tree, node);
        edgelen = stree_get_edgelen(tree, node);
        g = edgelen;
      }
      else {
        /*
         * No usable suffix link on `node': go through the parent's
         * suffix link (or restart at the root with one character
         * fewer), then skip/count down g characters of S starting at
         * S[h].
         */
        parent = stree_get_parent(tree, node);
        if (parent != root)
          node = parent->suffix_link;
        else {
          node = root;
          g--;
        }
        edgelen = stree_get_edgelen(tree, node);

        h = i - g;
        while (g > 0) {
          node = stree_find_child(tree, node, S[h]);
          gprime = stree_get_edgelen(tree, node);
          /* The target lies inside this edge: stop with g as the offset. */
          if (gprime > g)
            break;

          g -= gprime;
          h += gprime;
        }

        edgelen = stree_get_edgelen(tree, node);
        edgestr = stree_get_edgestr(tree, node);

        if (g == 0) {
          /*
           * The walk ended exactly at a node, so that node is the
           * target of lastnode's suffix link (if still unset and the
           * node is internal).
           */
          if (lastnode != root && !int_stree_isaleaf(tree, node) &&
              lastnode->suffix_link == NULL) {
            lastnode->suffix_link = node;
            lastnode = node;
          }

          /* Express "at the node" as "at the end of the node's edge". */
          if (node != root)
            g = edgelen;
        }
      }
    }
  }

  return 1;
}