/*
 * Compares two block hashes keyed by string.
 *
 * The hashes are considered equal when every key of observedHash is present
 * in expectedHash and vice versa, and rowsAreEqual() accepts the pair of
 * values stored under each key. On the first mismatch both hashes are printed
 * (observed first, then expected) and false is returned.
 *
 * Returns true when the hashes match, false otherwise.
 */
static bool hashesAreEqual(stHash *observedHash, stHash *expectedHash) {
    bool equal = true;
    // Pass 0 walks the observed keys, pass 1 walks the expected keys, so a key
    // missing from either side is detected.
    for (int pass = 0; pass < 2 && equal; pass++) {
        stHash *iterated = (pass == 0) ? observedHash : expectedHash;
        stHash *other = (pass == 0) ? expectedHash : observedHash;
        stHashIterator *hit = stHash_getIterator(iterated);
        char *key;
        while (equal && (key = stHash_getNext(hit)) != NULL) {
            if (stHash_search(other, key) == NULL
                || !rowsAreEqual(stHash_search(observedHash, key),
                                 stHash_search(expectedHash, key))) {
                equal = false;
            }
        }
        // Always release the iterator, including on a mismatch; the early
        // returns this replaces used to leak it.
        stHash_destructIterator(hit);
    }
    if (!equal) {
        printBlockHash(observedHash, "observed");
        printBlockHash(expectedHash, "expected");
    }
    return equal;
}
/*
 * Builds an adjacency list from the given pairs, then runs a DFS from the
 * first position of every sequence in an attempt to produce a valid
 * topological sort. Returns non-zero if the graph contains a cycle.
 */
static int64_t containsACycle(stList *pairs, int64_t sequenceNumber) {
    // Adjacency structure: position tuple -> column (sorted set of positions).
    stHash *adjacencyList = buildAdjacencyList(pairs, sequenceNumber);

    // Bookkeeping sets for the depth-first traversal.
    int (*tupleCmp)(const void *, const void *) =
        (int (*)(const void *, const void *))stIntTuple_cmpFn;
    stSortedSet *started = stSortedSet_construct3(tupleCmp, NULL);
    stSortedSet *done = stSortedSet_construct3(tupleCmp, NULL);

    int64_t cyclic = 0;
    int64_t seq = 0;
    while (seq < sequenceNumber) {
        // The probe tuple is only a lookup key. The tuple instance stored in
        // the column is what gets handed to dfs, so the probe can be destroyed
        // immediately afterwards.
        stIntTuple *probe = stIntTuple_construct2(seq, 0);
        stSortedSet *column = stHash_search(adjacencyList, probe);
        assert(column != NULL);
        stIntTuple *stored = stSortedSet_search(column, probe);
        assert(stored != NULL);
        // Once a cycle has been found, no further DFS is started.
        if (!cyclic && dfs(adjacencyList, stored, started, done)) {
            cyclic = 1;
        }
        stIntTuple_destruct(probe);
        seq++;
    }

    // Cleanup: gather the columns into a set that owns them (presumably so a
    // column referenced by several keys is destructed only once), then tear
    // everything down.
    stSortedSet *columns = stSortedSet_construct2((void (*)(void *))stSortedSet_destruct);
    stHashIterator *it = stHash_getIterator(adjacencyList);
    for (stIntTuple *key = stHash_getNext(it); key != NULL; key = stHash_getNext(it)) {
        stSortedSet *column = stHash_search(adjacencyList, key);
        assert(column != NULL);
        stSortedSet_insert(columns, column);
    }
    stHash_destructIterator(it);
    stHash_destruct(adjacencyList);
    stSortedSet_destruct(columns);
    stSortedSet_destruct(started);
    stSortedSet_destruct(done);
    return cyclic;
}
/*
 * Debug helper: prints "<title>:" on its own line, then one formatted line
 * per entry of hash showing the row's name, start, length, strand,
 * sourceLength and sequence. Entries appear in the hash iterator's order.
 */
static void printBlockHash(stHash *hash, const char *title) {
    stHashIterator *entries = stHash_getIterator(hash);
    printf("%s:\n", title);
    for (char *name = stHash_getNext(entries); name != NULL; name = stHash_getNext(entries)) {
        row_t *row = stHash_search(hash, name);
        printf("%20s %6"PRIu64" %6"PRIu64" %c %9"PRIu64" %s\n",
               row->name, row->start, row->length, row->strand,
               row->sourceLength, row->sequence);
    }
    stHash_destructIterator(entries);
}
// Compute the connected components, if they haven't been computed // already since the last modification. static void computeConnectedComponents(stNaiveConnectivity *connectivity) { if (connectivity->connectedComponentCache != NULL) { // Already computed the connected components. return; } stHashIterator *nodeIt = stHash_getIterator(connectivity->nodesToAdjList); void *node; stNaiveConnectedComponent *componentsHead = NULL; while ((node = stHash_getNext(nodeIt)) != NULL) { stSet *myNodeSet = stSet_construct(); stSet_insert(myNodeSet, node); struct adjacency *adjList = stHash_search(connectivity->nodesToAdjList, node); if (adjList != NULL) { while (adjList != NULL) { stSet_insert(myNodeSet, adjList->toNode); adjList = adjList->next; } } // Now go through the existing connected components and see if // this overlaps any of them. If it's not a full overlap, then // this set becomes the union, and we continue looking for // additional overlaps, then this becomes a new connected // component. If we find that this is a subset of an existing // component, we can quit early, since we can't possibly add // to it or any others. stNaiveConnectedComponent *curComponent = componentsHead; while (curComponent != NULL) { stNaiveConnectedComponent *next = curComponent->next; // Find out whether our node set is a subset of this // connected component, or if it shares any overlap. bool isSubset = true; bool overlap = false; stSetIterator *myNodeIt = stSet_getIterator(myNodeSet); void *node; while ((node = stSet_getNext(myNodeIt)) != NULL) { if (stSet_search(curComponent->nodes, node)) { overlap = true; } else { isSubset = false; } } stSet_destructIterator(myNodeIt); if (isSubset) { assert(overlap == true); // Quit early. 
stSet_destruct(myNodeSet); myNodeSet = NULL; break; } else if (overlap) { stSet *newNodeSet = stSet_getUnion(myNodeSet, curComponent->nodes); stSet_destruct(myNodeSet); removeComponent(&componentsHead, curComponent); myNodeSet = newNodeSet; } curComponent = next; } if (myNodeSet != NULL) { // We have a new (or possibly merged) connected component to // add to the list. stNaiveConnectedComponent *newComponent = malloc(sizeof(stNaiveConnectedComponent)); newComponent->nodes = myNodeSet; newComponent->next = componentsHead; componentsHead = newComponent; } } stHash_destructIterator(nodeIt); connectivity->connectedComponentCache = componentsHead; }
/*
 * Allocates and returns a new iterator over set. The iterator wraps an
 * iterator over the set's underlying hash.
 */
stSetIterator *stSet_getIterator(stSet *set) {
    stSetIterator *iterator = st_malloc(sizeof(stSetIterator));
    iterator->hashIterator = stHash_getIterator(set->hash);
    return iterator;
}
int main(int argc, char *argv[]) { // Parse arguments if (argc != 3) { usage(argv); return 1; } // You would load a custom HMM here if you wanted using // hmm_getStateMachine (see the realign code) StateMachine *stateMachine = stateMachine5_construct(fiveState); PairwiseAlignmentParameters *parameters = pairwiseAlignmentBandingParameters_construct(); stHash *targetSequences = readFastaFile(argv[1]); stHash *querySequences = readFastaFile(argv[2]); // For each query sequence, align it against all target sequences. stHashIterator *queryIt = stHash_getIterator(querySequences); char *queryHeader; while ((queryHeader = stHash_getNext(queryIt)) != NULL) { char *querySeq = stHash_search(querySequences, queryHeader); stHashIterator *targetIt = stHash_getIterator(targetSequences); char *targetHeader; while ((targetHeader = stHash_getNext(targetIt)) != NULL) { char *targetSeq = stHash_search(targetSequences, targetHeader); // Here we should try both the target sequence and its // reverse-complemented version // Aligns the sequences. // If you have alignment constraints (anchors) you should // replace this with getAlignedPairsUsingAnchors. stList *alignedPairs = getAlignedPairs(stateMachine, targetSeq, querySeq, parameters, true, true); // Takes into account the probability of aligning to a // gap, by transforming the posterior probability into the // AMAP objective function (see Schwartz & Pachter, 2007). alignedPairs = reweightAlignedPairs2(alignedPairs, strlen(targetSeq), strlen(querySeq), parameters->gapGamma); // I think this calculates the optimal ordered set of // alignments from the unordered set of aligned pairs, not // completely sure. alignedPairs = filterPairwiseAlignmentToMakePairsOrdered(alignedPairs, targetSeq, querySeq, // This parameter says that the minimum posterior probability we will accept has to be at least 0.9. 0.9); // After this the "aligned pairs" data structure changes, // which is a little sketchy. 
It's just so that the // alignment can be printed properly. stList_mapReplace(alignedPairs, convertToAnchorPair, NULL); stList_sort(alignedPairs, (int (*)(const void *, const void *)) stIntTuple_cmpFn); struct PairwiseAlignment *alignment = convertAlignedPairsToPairwiseAlignment(targetHeader, queryHeader, 0, strlen(targetSeq), strlen(querySeq), alignedPairs); // Output the cigar string cigarWrite(stdout, alignment, 0); stList_destruct(alignedPairs); destructPairwiseAlignment(alignment); } stHash_destructIterator(targetIt); } stHash_destructIterator(queryIt); // Clean up stHash_destruct(targetSequences); stHash_destruct(querySequences); pairwiseAlignmentBandingParameters_destruct(parameters); stateMachine_destruct(stateMachine); }