/*
 * Applies penalize() with a penalty of 5 to the "name.chr1" row of a
 * three-row block and compares the result with a hand-built expectation in
 * which that row gains five 'N' characters and the other rows gain five gaps.
 */
static void test_penalize_0(CuTest *testCase) {
    /* Block that penalize() will modify in place. */
    stList *observedList = stList_construct3(0, free);
    stHash *observedHash = createBlockHashFromString(
            "a score=0\n"
            "s reference.chr0 0 13 + 158545518 gcagctgaaaaca\n"
            "s name.chr1 0 10 + 100 ATGT---ATGCCG\n"
            "s name2.chr1 0 10 + 100 ATGT---ATGCCG\n",
            observedList);

    /* Block describing the expected outcome. */
    stList *expectedList = stList_construct3(0, free);
    stHash *expectedHash = createBlockHashFromString(
            "a score=0\n"
            "s reference.chr0 0 13 + 158545518 gcagctgaaaaca-----\n"
            "s name.chr1 0 15 + 100 ATGT---ATGCCGNNNNN\n"
            "s name2.chr1 0 10 + 100 ATGT---ATGCCG-----\n",
            expectedList);

    penalize(observedHash, "name.chr1", 5);

    CuAssertTrue(testCase, hashesAreEqual(observedHash, expectedHash));
    CuAssertTrue(testCase, listsAreEqual(observedList, expectedList));

    /* Release everything in the same order as before: hashes, then lists. */
    stHash_destruct(observedHash);
    stHash_destruct(expectedHash);
    stList_destruct(observedList);
    stList_destruct(expectedList);
}
/* write a malnSet to a MAF file */ void malnSet_writeMaf(struct malnSet *malnSet, char *mafFileName) { malnSet_assert(malnSet); stList *sorted = buildRootSorted(malnSet); FILE *mafFh = mustOpen(mafFileName, "w"); mafWriteStart(mafFh, NULL); stListIterator *iter = stList_getIterator(sorted); struct malnBlk *blk; while ((blk = stList_getNext(iter)) != NULL) { writeBlkToMaf(blk, mafFh); } stList_destructIterator(iter); mafWriteEnd(mafFh); carefulClose(&mafFh); stList_destruct(sorted); }
/*
 * Clone the root of a tree.
 *
 * subrangeCloneNode() returns NULL when the source root itself is not cloned
 * and collects the subtrees left behind in a pending list.  In that case the
 * single pending subtree (if any) becomes the new root; more than one pending
 * subtree is a fatal error.  Returns NULL when nothing at all was cloned.
 */
static stTree *subrangeCloneRoot(stTree *srcRoot, struct malnCompCompMap *srcDestCompMap) {
    stList *orphanSubtrees = stList_construct();
    stTree *clonedRoot = subrangeCloneNode(srcRoot, srcDestCompMap, orphanSubtrees);

    if (clonedRoot == NULL) {
        switch (stList_length(orphanSubtrees)) {
        case 0:
            /* nothing survived; the result stays NULL */
            break;
        case 1:
            /* the only surviving subtree is promoted to root */
            clonedRoot = stList_pop(orphanSubtrees);
            break;
        default: {
            struct mafTreeNodeCompLink *rootLink = getNodeCompLink(srcRoot);
            errAbort("deleted tree root %s (component: %s:%d-%d/%c)) has more that one child",
                     stTree_getLabel(srcRoot), rootLink->comp->seq->orgSeqName,
                     rootLink->comp->start, rootLink->comp->end, rootLink->comp->strand);
            break;
        }
        }
    }

    stList_destruct(orphanSubtrees);
    return clonedRoot;
}
/*
 * Fills a set (constructed with free as its key destructor) with 1000
 * heap-allocated keys, removes each one through stSet_removeAndFreeKey() and
 * checks both the returned pointer and that the set ends up empty.
 */
static void test_stSet_removeAndFreeKey(CuTest* testCase) {
    const int64_t keyNumber = 1000;
    stSet *set2 = stSet_construct2(free);
    stList *keys = stList_construct();

    /* Insert phase: remember every key so it can be looked up by index later. */
    int64_t inserted = 0;
    while (inserted < keyNumber) {
        int64_t *newKey = st_malloc(sizeof(*newKey));
        stList_append(keys, newKey);
        stSet_insert(set2, newKey);
        inserted++;
    }

    /* Removal phase: the call is expected to hand back the pointer it was given. */
    int64_t removed = 0;
    while (removed < keyNumber) {
        int64_t *storedKey = stList_get(keys, removed);
        CuAssertPtrEquals(testCase, storedKey, stSet_removeAndFreeKey(set2, storedKey));
        removed++;
    }

    CuAssertIntEquals(testCase, 0, stSet_size(set2));
    stSet_destruct(set2);
    stList_destruct(keys);
}
/*
 * Decides whether a thread is trivial and strips the per-segment flags.
 *
 * The thread string is split into segments; the last character of each
 * segment is a '0'/'1' flag.  A single '1' flag makes the whole thread
 * non-trivial.  (Per the original notes, trivial segments come from blocks
 * holding only a reference segment, i.e. scaffold gaps.)
 *
 * Side effect: *threadString is freed and replaced by a newly allocated
 * string made of the segments, flags removed, joined with no separator.
 *
 * Returns true when no segment carried a '1' flag.
 */
static bool isTrivialString(char **threadString) {
    stList *segments = stString_split(*threadString); //One list entry per segment.
    const int64_t segmentCount = stList_length(segments);
    bool sawNonTrivialSegment = false;

    int64_t idx = 0;
    while (idx < segmentCount) {
        char *segment = stList_get(segments, idx);
        int64_t j = strlen(segment)-1; //Index of the flag character.
        assert(j > 0);
        char *flag = segment + j;
        assert(*flag == '0' || *flag == '1');
        if (*flag == '1') {
            sawNonTrivialSegment = true;
        }
        *flag = '\0'; //Chop the flag off the segment.
        idx++;
    }

    //Drop the old string before building the new one so two copies never coexist.
    free(*threadString);
    *threadString = stString_join2("", segments);
    stList_destruct(segments);
    return !sawNonTrivialSegment;
}
/*
 * Destroys a CactusDisk and everything it owns.
 *
 * Order of teardown: loaded flowers, the flower / deletion-name sets, loaded
 * meta sequences and their set, the key-value database, the optional
 * sequences file (names and handles), both caches, the pending update
 * request list and finally the struct itself.
 *
 * The pointer must not be used after this call.
 */
void cactusDisk_destruct(CactusDisk *cactusDisk) {
    Flower *flower;
    MetaSequence *metaSequence;

    // The loop relies on flower_destruct() taking the flower out of
    // cactusDisk->flowers; otherwise getFirst would keep returning it.
    while ((flower = stSortedSet_getFirst(cactusDisk->flowers)) != NULL) {
        flower_destruct(flower, FALSE);
    }
    stSortedSet_destruct(cactusDisk->flowers);

    stSortedSet_destruct(cactusDisk->flowerNamesMarkedForDeletion);

    // Same pattern as for the flowers above.
    while ((metaSequence = stSortedSet_getFirst(cactusDisk->metaSequences)) != NULL) {
        metaSequence_destruct(metaSequence);
    }
    stSortedSet_destruct(cactusDisk->metaSequences);

    //close DB
    stKVDatabase_destruct(cactusDisk->database);

    //Close the sequences files.
    if (cactusDisk->storeSequencesInAFile) {
        free(cactusDisk->sequencesFileName);
        free(cactusDisk->absSequencesFileName);
        if (cactusDisk->sequencesReadFileHandle != NULL) {
            fclose(cactusDisk->sequencesReadFileHandle);
        }
        if (cactusDisk->sequencesWriteFileHandle != NULL) {
            // fsync() only covers data already handed to the kernel, so the
            // stdio buffer has to be flushed first for the sync to include
            // everything written through this stream.
            fflush(cactusDisk->sequencesWriteFileHandle);
            fsync(fileno(cactusDisk->sequencesWriteFileHandle));
            fclose(cactusDisk->sequencesWriteFileHandle);
        }
    } else {
        // Without a sequences file none of the related fields may be set.
        assert(cactusDisk->sequencesFileName == NULL);
        assert(cactusDisk->sequencesReadFileHandle == NULL);
        assert(cactusDisk->absSequencesFileName == NULL);
    }

    stCache_destruct(cactusDisk->cache); //Get rid of the cache
    stCache_destruct(cactusDisk->stringCache);

    stList_destruct(cactusDisk->updateRequests);

    free(cactusDisk);
}
stList *getComponents2(stList *adjacencyEdges, stList *stubEdges, stList *chainEdges) {
    /*
     * Returns the connected components (as computed by getComponents) of the
     * union of the adjacency, stub and chain edges.  Any of the three edge
     * lists may be NULL, in which case it contributes nothing.
     */
    stList *edgeSets[3];
    edgeSets[0] = adjacencyEdges;
    edgeSets[1] = stubEdges;
    edgeSets[2] = chainEdges;

    //Concatenate the non-NULL inputs, in the order adjacency, stub, chain.
    stList *combinedEdges = stList_construct();
    for (int setIndex = 0; setIndex < 3; setIndex++) {
        if (edgeSets[setIndex] == NULL) {
            continue;
        }
        stList_appendAll(combinedEdges, edgeSets[setIndex]);
    }

    stList *components = getComponents(combinedEdges);
    stList_destruct(combinedEdges); //Only the temporary concatenation is released.
    return components;
}
static void testBulkRemoveRecords(CuTest *testCase) {
    /*
     * Tests bulk removal of records: inserts records 1..5 (the fifth with a
     * zero-length value), issues an empty removal request, then removes all
     * five in one request and checks the database is empty.
     */
    setup();

    int64_t values[] = { 100, 110, 120, 130 };
    const int64_t valueCount = 4;
    const int64_t recordCount = 5;

    for (int64_t n = 0; n < valueCount; n++) {
        stKVDatabase_insertRecord(database, n + 1, &values[n], sizeof(int64_t));
    }
    stKVDatabase_insertRecord(database, recordCount, &values[0], 0); //Test null record addition

    for (int64_t key = 1; key <= recordCount; key++) {
        CuAssertTrue(testCase, stKVDatabase_containsRecord(database, key));
    }
    CuAssertTrue(testCase, stKVDatabase_getNumberOfRecords(database) == recordCount);

    stList *removals = stList_construct3(0, (void(*)(void *)) stInt64Tuple_destruct);

    // test empty request list
    stKVDatabase_bulkRemoveRecords(database, removals);

    for (int64_t key = 1; key <= recordCount; key++) {
        stList_append(removals, stInt64Tuple_construct(1, key));
    }
    stKVDatabase_bulkRemoveRecords(database, removals);

    for (int64_t key = 1; key <= recordCount; key++) {
        CuAssertTrue(testCase, !stKVDatabase_containsRecord(database, key));
    }
    CuAssertTrue(testCase, stKVDatabase_getNumberOfRecords(database) == 0);

    stList_destruct(removals);
    teardown();
}
/*
 * Runs 100 rounds of: generate random pairwise alignments, write them as
 * cigars to a temporary file in the working directory, build a pinch
 * iterator from that file and hand it to testIterator() together with the
 * original alignments.  The temporary file is removed after every round.
 */
static void testPinchIteratorFromFile(CuTest *testCase) {
    for (int64_t test = 0; test < 100; test++) {
        stList *pairwiseAlignments = getRandomPairwiseAlignments();
        st_logInfo("Doing a random pinch iterator from file test %" PRIi64 " with %" PRIi64 " alignments\n", test,
                stList_length(pairwiseAlignments));

        //Put alignments in a file
        char *tempFile = "tempFileForPinchIteratorTest.cig";
        FILE *fileHandle = fopen(tempFile, "w");
        if (fileHandle == NULL) {
            //Release the alignments first: the failing assertion below ends the test.
            stList_destruct(pairwiseAlignments);
        }
        //Fail the test cleanly instead of passing a NULL stream to cigarWrite/fclose.
        CuAssertTrue(testCase, fileHandle != NULL);
        for (int64_t i = 0; i < stList_length(pairwiseAlignments); i++) {
            cigarWrite(fileHandle, stList_get(pairwiseAlignments, i), 0);
        }
        fclose(fileHandle);

        //Get an iterator
        stPinchIterator *pinchIterator = stPinchIterator_constructFromFile(tempFile);

        //Now test it
        testIterator(testCase, pinchIterator, pairwiseAlignments);

        //Cleanup
        stPinchIterator_destruct(pinchIterator);
        stFile_rmrf(tempFile);
        stList_destruct(pairwiseAlignments);
    }
}
void checkInputs(stSortedSet *nodes, stList *adjacencyEdges, stList *stubEdges, stList *chainEdges) {
    /*
     * Sanity-checks the algorithm's inputs (assertions only):
     *  - the number of nodes is even;
     *  - a non-empty node set comes with at least one stub edge;
     *  - stub edges plus chain edges number exactly half the nodes;
     *  - each edge list passes checkEdges() with the flags used below.
     */
    const int64_t nodeNumber = stSortedSet_size(nodes);
    assert(nodeNumber % 2 == 0);
    assert(nodeNumber <= 0 || stList_length(stubEdges) > 0);
    assert(stList_length(stubEdges) + stList_length(chainEdges) == (nodeNumber / 2));

    //Stub and chain edges are first checked separately ...
    checkEdges(stubEdges, nodes, 0, 0);
    checkEdges(chainEdges, nodes, 0, 0);

    //... then as one combined list, with the third argument switched on.
    stList *combined = stList_copy(stubEdges, NULL);
    stList_appendAll(combined, chainEdges);
    checkEdges(combined, nodes, 1, 0);
    stList_destruct(combined);

    //Adjacency edges are checked with both flags switched on.
    checkEdges(adjacencyEdges, nodes, 1, 1);
}
static void test_st_randomChoice(CuTest *testCase) {
    /*
     * Exercises st_randomChoice: any exception raised for an empty list must
     * carry RANDOM_EXCEPTION_ID, and every choice from a ten-element list
     * must be a member of that list.
     */
    stList *choices = stList_construct3(0, (void (*)(void *))stIntTuple_destruct);

    //Choosing from the empty list.
    stTry {
        st_randomChoice(choices);
    } stCatch(except) {
        CuAssertTrue(testCase, stExcept_getId(except) == RANDOM_EXCEPTION_ID);
    } stTryEnd

    //Populate with the tuples (0) .. (9).
    int32_t value = 0;
    while (value < 10) {
        stList_append(choices, stIntTuple_construct(1, value));
        value++;
    }

    //Draw 100 times; each draw has to come from the list.
    int32_t draw = 0;
    while (draw < 100) {
        CuAssertTrue(testCase, stList_contains(choices, st_randomChoice(choices)));
        draw++;
    }

    stList_destruct(choices);
}
/*
 * Post-processes a matching in two steps:
 *   1. mergeSimpleCycles2() merges in the stub-free components; its result
 *      replaces the caller's chosenEdges pointer locally (the caller's list
 *      itself is not destructed here).
 *   2. When makeStubCyclesDisjoint is set, splitMultipleStubCycles() builds a
 *      further list and the list from step 1 is destructed.
 *
 * With an empty node set a new empty list is returned straight away.
 * Returns the resulting list of chosen edges.
 */
stList *makeMatchingObeyCyclicConstraints(stSortedSet *nodes, stList *chosenEdges, stSortedSet *allAdjacencyEdges,
        stList *nonZeroWeightAdjacencyEdges, stList *stubEdges, stList *chainEdges, bool makeStubCyclesDisjoint) {
    //Some of the following functions assume there are at least 2 nodes.
    if (stSortedSet_size(nodes) == 0) {
        return stList_construct();
    }

    //Step 1: merge in the stub free components.
    chosenEdges = mergeSimpleCycles2(chosenEdges, nonZeroWeightAdjacencyEdges, allAdjacencyEdges, stubEdges,
            chainEdges);
    st_logDebug(
            "After merging in chain only cycles the matching has %" PRIi64 " edges, %" PRIi64 " cardinality and %" PRIi64 " weight\n",
            stList_length(chosenEdges), matchingCardinality(chosenEdges), matchingWeight(chosenEdges));

    //Step 2 is optional.
    if (!makeStubCyclesDisjoint) {
        st_logDebug("Not making stub cycles disjoint\n");
        return chosenEdges;
    }

    //Step 2: split stub components.
    stList *disjointEdges = splitMultipleStubCycles(chosenEdges, nonZeroWeightAdjacencyEdges, allAdjacencyEdges,
            stubEdges, chainEdges);
    stList_destruct(chosenEdges);
    st_logDebug(
            "After making stub cycles disjoint the matching has %" PRIi64 " edges, %" PRIi64 " cardinality and %" PRIi64 " weight\n",
            stList_length(disjointEdges), matchingCardinality(disjointEdges), matchingWeight(disjointEdges));
    return disjointEdges;
}
char *cactusDisk_getString(CactusDisk *cactusDisk, Name name, int64_t start, int64_t length, int64_t strand,
        int64_t totalSequenceLength) {
    /*
     * Returns the requested substring of the named sequence.
     *
     * A zero length yields a copy of the empty string.  Otherwise the string
     * cache is consulted; on a miss the substring is loaded into the cache
     * from the database and the cache is consulted again.
     * totalSequenceLength is not used by this function.
     */
    assert(length >= 0);
    if (length == 0) {
        return stString_copy("");
    }

    //Fast path: already cached.
    char *cached = cactusDisk_getStringFromCache(cactusDisk, name, start, length, strand);
    if (cached != NULL) {
        return cached;
    }

    //Slow path: cache the one substring we need, then read it back from the cache.
    stList *wanted = stList_construct3(0, (void (*)(void *)) substring_destruct);
    stList_append(wanted, substring_construct(name, start, length));
    cacheSubstringsFromDB(cactusDisk, wanted);
    stList_destruct(wanted);

    cached = cactusDisk_getStringFromCache(cactusDisk, name, start, length, strand);
    assert(cached != NULL);
    return cached;
}
/*
 * Recursive function which fills a given list with the connected nodes
 * within a module and records their lifted edges in the same pass.
 *
 * cap               cap to visit; it is normalised to its positive orientation.
 * list              accumulates every visited cap; also serves as the
 *                   "already visited" set that terminates the recursion.
 * liftedEdgesTable  receives cap -> lifted-edge-list entries for caps that
 *                   have at least one lifted edge.  Lists stored here are not
 *                   freed by this function.
 */
static void buildFaces_fillTopNodeList2(Cap * cap, stList *list, stHash *liftedEdgesTable) {
    stList *liftedEdges;
    int64_t index;

    // Orientation check
    cap = cap_getPositiveOrientation(cap);

    // Limit of recursion
    if (stList_contains(list, cap)) {
        return;
    }

    // Actual filling
    st_logInfo("Adding cap %p to face\n", cap);
    stList_append(list, cap);

    // The edge list is created only once the cap is known to be new, so the
    // early return above no longer leaks one list per revisited cap.
    // NOTE(review): the emptiness test below assumes a freshly constructed
    // list starts with length 0; confirm the meaning of the first argument
    // of stList_construct3.
    liftedEdges = stList_construct3(2, buildFaces_stList_destructElem);

    // Compute lifted edges
    for (index = 0; index < cap_getChildNumber(cap); index++) {
        buildFaces_computeLiftedEdgesAtTopNode(cap_getChild(cap, index), liftedEdges);
    }

    if (stList_length(liftedEdges) == 0) {
        // If empty list...
        stList_destruct(liftedEdges);
    } else {
        // Recursion through lifted edges
        stHash_insert(liftedEdgesTable, cap, liftedEdges);
        for (index = 0; index < stList_length(liftedEdges); index++) {
            buildFaces_fillTopNodeList2(
                    ((LiftedEdge *) stList_get(liftedEdges, index))->destination, list, liftedEdgesTable);
        }
    }

    // Recursion through adjacency
    if (cap_getAdjacency(cap)) {
        buildFaces_fillTopNodeList2(cap_getAdjacency(cap), list, liftedEdgesTable);
    }
}
/*
 * Builds a perfect matching over `nodes`.
 *
 * The adjacency edges are validated with checkEdges() first.  A sparse
 * matching is then computed by matchingAlgorithm over the edges returned by
 * getEdgesWithGreaterThanZeroWeight(), and makeMatchingPerfect() completes it
 * using the full adjacency edge list.
 *
 * Returns the list of chosen edges; for an empty node set this is a new
 * empty list.
 */
stList *getPerfectMatching(stSortedSet *nodes, stList *adjacencyEdges,
        stList *(*matchingAlgorithm)(stList *edges, int64_t nodeNumber)) {
    checkEdges(adjacencyEdges, nodes, 1, 1); //Checks edges are clique

    //Some of the following functions assume there are at least 2 nodes.
    if (stSortedSet_size(nodes) == 0) {
        return stList_construct();
    }

    //Sparse matching over the positive-weight edges only; that edge list is temporary.
    stList *positiveWeightEdges = getEdgesWithGreaterThanZeroWeight(adjacencyEdges);
    stList *matching = getSparseMatching(nodes, positiveWeightEdges, matchingAlgorithm);
    stList_destruct(positiveWeightEdges);

    //Extend the sparse matching into a perfect one.
    makeMatchingPerfect(matching, adjacencyEdges, nodes);

    st_logDebug(
            "Chosen a perfect matching with %" PRIi64 " edges, %" PRIi64 " cardinality and %" PRIi64 " weight\n",
            stList_length(matching), matchingCardinality(matching), matchingWeight(matching));
    return matching;
}
/*
 * Depth-first helper for component discovery.
 *
 * Looks up the edge list stored for `node` in nodesToEdges.  If there is one,
 * the entry is removed from the hash (which also marks the node as visited),
 * every edge is added to `component` unless already present, and the search
 * recurses into both endpoints of each edge.  The edge list itself is
 * destructed before returning.
 */
static void getComponentsP(stHash *nodesToEdges, int64_t node, stSortedSet *component) {
    stIntTuple *nodeKey = stIntTuple_construct1(node);
    stList *incidentEdges = stHash_search(nodesToEdges, nodeKey);

    if (incidentEdges == NULL) {
        //Unknown or already visited node.
        stIntTuple_destruct(nodeKey);
        return;
    }

    stHash_remove(nodesToEdges, nodeKey);

    int64_t edgeIndex = 0;
    while (edgeIndex < stList_length(incidentEdges)) {
        stIntTuple *edge = stList_get(incidentEdges, edgeIndex);
        if (stSortedSet_search(component, edge) == NULL) {
            stSortedSet_insert(component, edge);
        }
        /*
         * Recursion on stack could equal the total number of nodes.
         */
        getComponentsP(nodesToEdges, stIntTuple_get(edge, 0), component);
        getComponentsP(nodesToEdges, stIntTuple_get(edge, 1), component);
        edgeIndex++;
    }

    stList_destruct(incidentEdges);
    stIntTuple_destruct(nodeKey);
}
static void test_stSortedSet_searchLessThan(CuTest* testCase) { sonLibSortedSetTestSetup(); for(int32_t i=0; i<size; i++) { stSortedSet_insert(sortedSet, stIntTuple_construct(1, input[i])); } //static int32_t sortedInput[] = { -10, -1, 1, 3, 5, 10, 12 }; CuAssertTrue(testCase, stSortedSet_searchLessThan(sortedSet, stIntTuple_construct(1, -11)) == NULL); CuAssertTrue(testCase, stSortedSet_searchLessThan(sortedSet, stIntTuple_construct(1, -10)) == NULL); CuAssertTrue(testCase, stSortedSet_searchLessThan(sortedSet, stIntTuple_construct(1, -5)) == stSortedSet_search(sortedSet, stIntTuple_construct(1, -10))); CuAssertTrue(testCase, stSortedSet_searchLessThan(sortedSet, stIntTuple_construct(1, 13)) == stSortedSet_search(sortedSet, stIntTuple_construct(1, 12))); CuAssertTrue(testCase, stSortedSet_searchLessThan(sortedSet, stIntTuple_construct(1, 8)) == stSortedSet_search(sortedSet, stIntTuple_construct(1, 5))); CuAssertTrue(testCase, stSortedSet_searchLessThan(sortedSet, stIntTuple_construct(1, 1)) == stSortedSet_search(sortedSet, stIntTuple_construct(1, -1))); CuAssertTrue(testCase, stSortedSet_searchLessThan(sortedSet, stIntTuple_construct(1, 10)) == stSortedSet_search(sortedSet, stIntTuple_construct(1, 5))); for(int32_t i=0; i<100; i++) { stSortedSet_insert(sortedSet, stIntTuple_construct(1, st_randomInt(-1000, 1000))); } stList *list = stSortedSet_getList(sortedSet); for(int32_t i=1; i<stList_length(list); i++) { stIntTuple *p = stList_get(list, i-1); stIntTuple *j = stList_get(list, i); stIntTuple *k = stIntTuple_construct(1, st_randomInt(stIntTuple_getPosition(p, 0)+1, stIntTuple_getPosition(j, 0)+1)); CuAssertTrue(testCase, stSortedSet_searchLessThan(sortedSet, k) == p); stIntTuple_destruct(k); } stList_destruct(list); sonLibSortedSetTestTeardown(); }
/*
 * Builds a reweighted alignment of the adjacency sequences attached to the
 * caps of an end.
 *
 * Steps, as implemented below:
 *   1. For every cap of `end` (flipped with cap_getReverse when cap_getSide
 *      is set) an AdjacencySequence limited by maxSequenceLength is created,
 *      together with a SeqFrag tagged with the name of the end on the far
 *      side of the cap's adjacency.  The number of sequences per far end is
 *      counted in endInstanceNumbers.
 *   2. makeAlignment() aligns the SeqFrags.
 *   3. For each sequence the chosen pairwise alignments are counted,
 *      separately for partners sharing the same far end ("common ends") and
 *      for the rest ("non common ends").
 *   4. Per sequence and category a score adjustment is derived as
 *      (number of possible partners) / (number of chosen pairwise alignments);
 *      INT64_MIN marks categories without any chosen alignment.
 *   5. Every aligned pair is converted into an AlignedPair whose two scores
 *      are the raw score multiplied by the adjustment of the respective
 *      sequence; the pair and its reverse are inserted into the result set.
 *
 * Returns a sorted set ordered by alignedPair_cmpFn that destructs its
 * elements with alignedPair_destruct.  All intermediate structures are freed
 * before returning.
 */
stSortedSet *makeEndAlignment(StateMachine *sM, End *end, int64_t spanningTrees, int64_t maxSequenceLength,
        bool useProgressiveMerging, float gapGamma,
        PairwiseAlignmentParameters *pairwiseAlignmentBandingParameters) {
    //Make an alignment of the sequences in the ends

    //Get the adjacency sequences to be aligned.
    Cap *cap;
    End_InstanceIterator *it = end_getInstanceIterator(end);
    stList *sequences = stList_construct3(0, (void (*)(void *))adjacencySequence_destruct);
    stList *seqFrags = stList_construct3(0, (void (*)(void *))seqFrag_destruct);
    //Maps a far end to a heap-allocated counter; the hash frees the counters, not the keys.
    stHash *endInstanceNumbers = stHash_construct2(NULL, free);
    while((cap = end_getNext(it)) != NULL) {
        if(cap_getSide(cap)) {
            cap = cap_getReverse(cap);
        }
        AdjacencySequence *adjacencySequence = adjacencySequence_construct(cap, maxSequenceLength);
        stList_append(sequences, adjacencySequence);

        assert(cap_getAdjacency(cap) != NULL);
        End *otherEnd = end_getPositiveOrientation(cap_getEnd(cap_getAdjacency(cap)));
        //sequences and seqFrags are appended in lock step, so index n refers to the same cap in both.
        stList_append(seqFrags, seqFrag_construct(adjacencySequence->string, 0, end_getName(otherEnd)));

        //Increase count of seqfrags with a given end.
        int64_t *c = stHash_search(endInstanceNumbers, otherEnd);
        if(c == NULL) {
            c = st_calloc(1, sizeof(int64_t));
            assert(*c == 0);
            stHash_insert(endInstanceNumbers, otherEnd, c);
        }
        (*c)++;
    }
    end_destructInstanceIterator(it);

    //Get the alignment.
    //NOTE(review): the literal 100000000 is passed where one might expect a configurable limit; its meaning is defined by makeAlignment - confirm.
    MultipleAlignment *mA = makeAlignment(sM, seqFrags, spanningTrees, 100000000, useProgressiveMerging, gapGamma, pairwiseAlignmentBandingParameters);

    //Build an array of weights to reweight pairs in the alignment.
    int64_t *pairwiseAlignmentsPerSequenceNonCommonEnds = st_calloc(stList_length(seqFrags), sizeof(int64_t));
    int64_t *pairwiseAlignmentsPerSequenceCommonEnds = st_calloc(stList_length(seqFrags), sizeof(int64_t));
    //First build array on number of pairwise alignments to each sequence, distinguishing alignments between sequences sharing
    //common ends.
    for(int64_t i=0; i<stList_length(mA->chosenPairwiseAlignments); i++) {
        stIntTuple *pairwiseAlignment = stList_get(mA->chosenPairwiseAlignments, i);
        //Positions 1 and 2 of the tuple are used as indices into seqFrags.
        int64_t seq1 = stIntTuple_get(pairwiseAlignment, 1);
        int64_t seq2 = stIntTuple_get(pairwiseAlignment, 2);
        assert(seq1 != seq2);
        SeqFrag *seqFrag1 = stList_get(seqFrags, seq1);
        SeqFrag *seqFrag2 = stList_get(seqFrags, seq2);
        //Pick the counter array matching the category of this pair.
        int64_t *pairwiseAlignmentsPerSequence = seqFrag1->rightEndId == seqFrag2->rightEndId
                ? pairwiseAlignmentsPerSequenceCommonEnds : pairwiseAlignmentsPerSequenceNonCommonEnds;
        pairwiseAlignmentsPerSequence[seq1]++;
        pairwiseAlignmentsPerSequence[seq2]++;
    }
    //Now calculate score adjustments.
    double *scoreAdjustmentsNonCommonEnds = st_malloc(stList_length(seqFrags) * sizeof(double));
    double *scoreAdjustmentsCommonEnds = st_malloc(stList_length(seqFrags) * sizeof(double));
    for(int64_t i=0; i<stList_length(seqFrags); i++) {
        SeqFrag *seqFrag = stList_get(seqFrags, i);
        End *otherEnd = flower_getEnd(end_getFlower(end), seqFrag->rightEndId);
        assert(otherEnd != NULL);
        assert(stHash_search(endInstanceNumbers, otherEnd) != NULL);
        //Sequences sharing this sequence's far end (including itself) versus all the others.
        int64_t commonInstanceNumber = *(int64_t *)stHash_search(endInstanceNumbers, otherEnd);
        int64_t nonCommonInstanceNumber = stList_length(seqFrags) - commonInstanceNumber;

        assert(commonInstanceNumber > 0 && nonCommonInstanceNumber >= 0);
        assert(pairwiseAlignmentsPerSequenceNonCommonEnds[i] <= nonCommonInstanceNumber);
        assert(pairwiseAlignmentsPerSequenceNonCommonEnds[i] >= 0);
        assert(pairwiseAlignmentsPerSequenceCommonEnds[i] < commonInstanceNumber);
        assert(pairwiseAlignmentsPerSequenceCommonEnds[i] >= 0);

        //Earlier, category-agnostic formula kept for reference:
        //scoreAdjustmentsNonCommonEnds[i] = ((double)nonCommonInstanceNumber + commonInstanceNumber - 1)/(pairwiseAlignmentsPerSequenceNonCommonEnds[i] + pairwiseAlignmentsPerSequenceCommonEnds[i]);
        //scoreAdjustmentsCommonEnds[i] = scoreAdjustmentsNonCommonEnds[i];
        if(pairwiseAlignmentsPerSequenceNonCommonEnds[i] > 0) {
            scoreAdjustmentsNonCommonEnds[i] = ((double)nonCommonInstanceNumber)/pairwiseAlignmentsPerSequenceNonCommonEnds[i];
            assert(scoreAdjustmentsNonCommonEnds[i] >= 1.0);
            assert(scoreAdjustmentsNonCommonEnds[i] <= nonCommonInstanceNumber);
        }
        else {
            //Sentinel: no chosen alignment in this category, so the value must never be used.
            scoreAdjustmentsNonCommonEnds[i] = INT64_MIN;
        }
        if(pairwiseAlignmentsPerSequenceCommonEnds[i] > 0) {
            //The sequence itself is excluded from its own partners, hence the -1.
            scoreAdjustmentsCommonEnds[i] = ((double)commonInstanceNumber-1)/pairwiseAlignmentsPerSequenceCommonEnds[i];
            assert(scoreAdjustmentsCommonEnds[i] >= 1.0);
            assert(scoreAdjustmentsCommonEnds[i] <= commonInstanceNumber-1);
        }
        else {
            //Sentinel, as above.
            scoreAdjustmentsCommonEnds[i] = INT64_MIN;
        }
    }

    //Convert the alignment pairs to an alignment of the caps..
    stSortedSet *sortedAlignment = stSortedSet_construct3((int (*)(const void *, const void *))alignedPair_cmpFn, (void (*)(void *))alignedPair_destruct);
    //The aligned-pair list is consumed from the back; each tuple is freed once converted.
    while(stList_length(mA->alignedPairs) > 0) {
        stIntTuple *alignedPair = stList_pop(mA->alignedPairs);
        //Tuple layout as read below: 0 = score, 1 = first sequence index, 2 = first offset, 3 = second sequence index, 4 = second offset.
        assert(stIntTuple_length(alignedPair) == 5);
        int64_t seqIndex1 = stIntTuple_get(alignedPair, 1);
        int64_t seqIndex2 = stIntTuple_get(alignedPair, 3);
        AdjacencySequence *i = stList_get(sequences, seqIndex1);
        AdjacencySequence *j = stList_get(sequences, seqIndex2);
        assert(i != j);
        int64_t offset1 = stIntTuple_get(alignedPair, 2);
        int64_t offset2 = stIntTuple_get(alignedPair, 4);
        int64_t score = stIntTuple_get(alignedPair, 0);
        if(score <= 0) { //Happens when indel probs are included
            score = 1; //This is the minimum
        }
        assert(score > 0 && score <= PAIR_ALIGNMENT_PROB_1);
        SeqFrag *seqFrag1 = stList_get(seqFrags, seqIndex1);
        SeqFrag *seqFrag2 = stList_get(seqFrags, seqIndex2);
        assert(seqFrag1 != seqFrag2);
        double *scoreAdjustments = seqFrag1->rightEndId == seqFrag2->rightEndId ? scoreAdjustmentsCommonEnds : scoreAdjustmentsNonCommonEnds;
        //Neither sequence may carry the "no alignments in this category" sentinel.
        assert(scoreAdjustments[seqIndex1] != INT64_MIN);
        assert(scoreAdjustments[seqIndex2] != INT64_MIN);
        //Offsets are added to the start when the strand flag is set and subtracted otherwise.
        AlignedPair *alignedPair2 = alignedPair_construct(
                i->subsequenceIdentifier, i->start + (i->strand ? offset1 : -offset1), i->strand,
                j->subsequenceIdentifier, j->start + (j->strand ? offset2 : -offset2), j->strand,
                score*scoreAdjustments[seqIndex1], score*scoreAdjustments[seqIndex2]); //Do the reweighting here.
        assert(stSortedSet_search(sortedAlignment, alignedPair2) == NULL);
        assert(stSortedSet_search(sortedAlignment, alignedPair2->reverse) == NULL);
        stSortedSet_insert(sortedAlignment, alignedPair2);
        stSortedSet_insert(sortedAlignment, alignedPair2->reverse);
        stIntTuple_destruct(alignedPair);
    }

    //Cleanup
    stList_destruct(seqFrags);
    stList_destruct(sequences);
    free(pairwiseAlignmentsPerSequenceNonCommonEnds);
    free(pairwiseAlignmentsPerSequenceCommonEnds);
    free(scoreAdjustmentsNonCommonEnds);
    free(scoreAdjustmentsCommonEnds);
    multipleAlignment_destruct(mA);
    stHash_destruct(endInstanceNumbers);
    return sortedAlignment;
}
/*
 * Concatenation with a sequence breakpoint caused by *strand* alone.
 * name3 stays well within the interstitial boundary; the two blocks look
 * like >>>>>>>>>>>>> <<<<< (strand difference only).
 */
static void test_addBlockToHash_4(CuTest *testCase) {
    options_t *options = options_construct();
    options->breakpointPenalty = 10;
    options->interstitialSequence = 5;

    stList *observedList = stList_construct3(0, free);
    stList *expectedList = stList_construct3(0, free);

    /* Rows already accumulated before the new block arrives. */
    stHash *observedHash = createBlockHashFromString(
            "a score=0\n"
            "s reference.chr0 0 13 + 158545518 gcagctgaaaaca\n"
            "s name.chr1 0 10 + 100 ATGT---ATGCCG\n"
            "s name2.chr1 0 10 + 100 ATGT---ATGCCG\n"
            "s name3.chr1 0 13 + 100 GCAGCTGAAAACA\n",
            observedList);

    /* The block to add; name3 is on the '-' strand here. */
    mafBlock_t *mb = maf_newMafBlockListFromString(
            "a score=0 test\n"
            "s reference.chr0 13 5 + 158545518 ACGTA\n"
            "s name.chr1 12 5 + 100 gtcGG\n"
            "s name2.chr1 10 5 + 100 ATGTg\n"
            "s name3.chr1 82 5 - 100 GGGGG\n",
            3);

    /* Expected rows after the addition, plus the bookkeeping fields of name3. */
    stHash *expectedHash = createBlockHashFromString(
            "a score=0\n"
            "s reference.chr0 0 18 + 158545518 gcagctgaaaaca------------ACGTA\n"
            "s name.chr1 0 17 + 100 ATGT---ATGCCGac----------gtcGG\n"
            "s name2.chr1 0 15 + 100 ATGT---ATGCCG------------ATGTg\n"
            "s name3.chr1 0 28 + 100 GCAGCTGAAAACA--NNNNNNNNNNGGGGG\n",
            expectedList);
    row_t *name3Row = stHash_search(expectedHash, "name3");
    name3Row->prevRightPos = 86;
    name3Row->strand = '*';
    name3Row->prevStrand = '-';

    /* Underlying sequences, keyed by sequence name. */
    stHash *seqHash = createSeqHashFromString("name.chr1",
            "ATGTATGCCGacgtc"
            "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG"
            "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG");
    mtfseq_t *referenceSeq = newMtfseqFromString(
            "gcagctgaaaacaACGTA"
            "tttttttttttttttttttttttttttttttt"
            "tttttttttttttttttttttttttttttttttttttttttttttttttt");
    stHash_insert(seqHash, stString_copy("reference.chr0"), referenceSeq);
    mtfseq_t *name2Seq = newMtfseqFromString(
            "ATGTATGCCGATGTg"
            "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"
            "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC");
    stHash_insert(seqHash, stString_copy("name2.chr1"), name2Seq);
    mtfseq_t *name3Seq = newMtfseqFromString(
            "GCAGCTGAAAACACCCCCgggggggggggggggggggggggggggggggg"
            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa");
    stHash_insert(seqHash, stString_copy("name3.chr1"), name3Seq);

    addMafBlockToRowHash(observedHash, seqHash, observedList, mb, options);

    CuAssertTrue(testCase, hashesAreEqual(observedHash, expectedHash));
    CuAssertTrue(testCase, listsAreEqual(observedList, expectedList));

    /* clean up */
    stHash_destruct(observedHash);
    stHash_destruct(expectedHash);
    stHash_destruct(seqHash);
    stList_destruct(observedList);
    stList_destruct(expectedList);
    maf_destroyMafBlockList(mb);
    destroyOptions(options);
}
/*
 * Concatenation with a sequence breakpoint caused by a change of sequence
 * name: the accumulated name3 row comes from name3.chr1 while the new block
 * carries name3.chr2.
 */
static void test_addBlockToHash_6(CuTest *testCase) {
    options_t *options = options_construct();
    options->breakpointPenalty = 10;
    options->interstitialSequence = 5;

    stList *observedList = stList_construct3(0, free);
    stList *expectedList = stList_construct3(0, free);

    /* Rows already accumulated before the new block arrives. */
    stHash *observedHash = createBlockHashFromString(
            "a score=0\n"
            "s reference.chr0 0 13 + 158545518 gcagctgaaaaca\n"
            "s name.chr1 0 10 + 100 ATGT---ATGCCG\n"
            "s name2.chr1 0 10 + 100 ATGT---ATGCCG\n"
            "s name3.chr1 0 13 + 100 GCAGCTGAAAACA\n",
            observedList);

    /* The block to add; its name3 row lives on a different sequence (chr2). */
    mafBlock_t *mb = maf_newMafBlockListFromString(
            "a score=0 test\n"
            "s reference.chr0 13 5 + 158545518 ACGTA\n"
            "s name.chr1 12 5 + 100 gtcGG\n"
            "s name2.chr1 10 5 + 100 ATGTg\n"
            "s name3.chr2 13 5 + 100 aaaaa\n",
            3);

    /* Expected rows after the addition, plus the bookkeeping fields of name3. */
    stHash *expectedHash = createBlockHashFromString(
            "a score=0\n"
            "s reference.chr0 0 18 + 158545518 gcagctgaaaaca------------ACGTA\n"
            "s name.chr1 0 17 + 100 ATGT---ATGCCGac----------gtcGG\n"
            "s name2.chr1 0 15 + 100 ATGT---ATGCCG------------ATGTg\n"
            "s name3 0 28 + 28 GCAGCTGAAAACA--NNNNNNNNNNaaaaa\n",
            expectedList);
    row_t *name3Row = stHash_search(expectedHash, "name3");
    name3Row->prevRightPos = 17;
    name3Row->strand = '*';
    name3Row->prevStrand = '+';
    name3Row->multipleNames = true;
    free(name3Row->prevName);
    name3Row->prevName = stString_copy("name3.chr2");

    /* Underlying sequences, keyed by sequence name. */
    stHash *seqHash = createSeqHashFromString("name.chr1",
            "ATGTATGCCGacgtc"
            "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG"
            "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG");
    mtfseq_t *referenceSeq = newMtfseqFromString(
            "gcagctgaaaacaACGTA"
            "tttttttttttttttttttttttttttttttt"
            "tttttttttttttttttttttttttttttttttttttttttttttttttt");
    stHash_insert(seqHash, stString_copy("reference.chr0"), referenceSeq);
    mtfseq_t *name2Seq = newMtfseqFromString(
            "ATGTATGCCGATGTg"
            "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"
            "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC");
    stHash_insert(seqHash, stString_copy("name2.chr1"), name2Seq);
    mtfseq_t *name3Chr1Seq = newMtfseqFromString(
            "GCAGCTGAAAACAggggggggggggggggggggggggggggggggggggg"
            "CCCCCaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa");
    stHash_insert(seqHash, stString_copy("name3.chr1"), name3Chr1Seq);
    mtfseq_t *name3Chr2Seq = newMtfseqFromString(
            "TTTTTTTTTTTTTaaaaaGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG"
            "CCCCCAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA");
    stHash_insert(seqHash, stString_copy("name3.chr2"), name3Chr2Seq);

    addMafBlockToRowHash(observedHash, seqHash, observedList, mb, options);

    CuAssertTrue(testCase, hashesAreEqual(observedHash, expectedHash));
    CuAssertTrue(testCase, listsAreEqual(observedList, expectedList));

    /* clean up */
    stHash_destruct(observedHash);
    stHash_destruct(expectedHash);
    stHash_destruct(seqHash);
    stList_destruct(observedList);
    stList_destruct(expectedList);
    maf_destroyMafBlockList(mb);
    destroyOptions(options);
}
Name cactusDisk_addString(CactusDisk *cactusDisk, const char *string) {
    /*
     * Adds a string to the database and returns the Name under which it was stored.
     *
     * Two storage modes, selected by cactusDisk->storeSequencesInAFile:
     *  - file mode: the string is appended to the sequences file, preceded by a
     *    '>' character. The returned Name is the file offset reported by ftell
     *    before the write, plus one (i.e. the offset just past the '>').
     *  - database mode: the string is cut into pieces of
     *    CACTUS_DISK_SEQUENCE_CHUNK_SIZE characters which are bulk-inserted under
     *    consecutive keys starting at the returned Name.
     *
     * NOTE(review): in file mode a failed fopen is only caught by assert, which
     * is compiled out under NDEBUG.
     */
    if (cactusDisk->storeSequencesInAFile) {
        if (cactusDisk->sequencesWriteFileHandle == NULL) {
            //We do not allow the read file handle to be open at the same time.
            if (cactusDisk->sequencesReadFileHandle != NULL) {
                fclose(cactusDisk->sequencesReadFileHandle);
                cactusDisk->sequencesReadFileHandle = NULL;
            }
            //Lazily open the sequences file for appending.
            cactusDisk->sequencesWriteFileHandle = fopen(cactusDisk->absSequencesFileName, "a");
            assert(cactusDisk->sequencesWriteFileHandle != NULL);
        } else {
            //The read file handle should not be open at the same time.
            assert(cactusDisk->sequencesReadFileHandle == NULL);
        }
        //The +1 skips over the '>' that is written in front of the sequence below.
        Name name = ftell(cactusDisk->sequencesWriteFileHandle) + 1;

        //Extra temporary cheesy code to avoid potential overflow in fprintf
        int64_t chunkSize = 1000000000; //1 gig approx chunks, to avoid a possible overflow issue with fprintf
        int64_t length = strlen(string);
        if (length > chunkSize) {
            //Long string: write the marker once, then the body in pieces of at most chunkSize characters.
            fprintf(cactusDisk->sequencesWriteFileHandle, ">");
            for (int64_t i = 0; i < length;) {
                //j is the size of this piece: a full chunk, or whatever remains at the end.
                int64_t j = i + chunkSize <= length ? chunkSize : length - i;
                //NUL-terminated temporary copy of the piece, so it can be printed with %s.
                char *string2 = memcpy(st_malloc(sizeof(char) * (j + 1)), string + i, sizeof(char) * j);
                string2[j] = '\0';
                int64_t k = fprintf(cactusDisk->sequencesWriteFileHandle, "%s", string2);
                (void) k; //Silences the unused-variable warning when asserts are compiled out.
                assert(k == j);
                free(string2);
                i += j;
            }
        } else {
            //Short string: marker and body in a single call.
            int64_t k = fprintf(cactusDisk->sequencesWriteFileHandle, ">%s", string);
            (void) k; //Silences the unused-variable warning when asserts are compiled out.
            assert(k == length + 1);
        }

#ifndef NDEBUG
        //Debug builds only: close the write handle, reopen the file for reading and
        //verify that the string just written reads back unchanged. This leaves the
        //write handle NULL and the read handle open.
        // Extra fsync may not be necessary.
        fsync(fileno(cactusDisk->sequencesWriteFileHandle));
        fclose(cactusDisk->sequencesWriteFileHandle);
        cactusDisk->sequencesWriteFileHandle = NULL;
        cactusDisk->sequencesReadFileHandle = fopen(cactusDisk->absSequencesFileName, "r");
        char *string2 = getStringFromDisk(cactusDisk->sequencesReadFileHandle, name, 0, length);
        for (int64_t i = 0; i < length; i++) {
            assert(string[i] == string2[i]);
        }
        free(string2);
#endif
        return name;
    } else {
        int64_t stringSize = strlen(string);
        //Number of chunks, and therefore of consecutive IDs, needed for this string.
        int64_t intervalSize = ceil((double) stringSize / CACTUS_DISK_SEQUENCE_CHUNK_SIZE);
        Name name = cactusDisk_getUniqueIDInterval(cactusDisk, intervalSize);
        stList *insertRequests = stList_construct3(0, (void (*)(void *)) stKVDatabaseBulkRequest_destruct);
        for (int64_t i = 0; i * CACTUS_DISK_SEQUENCE_CHUNK_SIZE < stringSize; i++) {
            //j is the length of chunk i: a full chunk, or the remainder for the last one.
            int64_t j = (i + 1) * CACTUS_DISK_SEQUENCE_CHUNK_SIZE < stringSize ? CACTUS_DISK_SEQUENCE_CHUNK_SIZE : stringSize - i * CACTUS_DISK_SEQUENCE_CHUNK_SIZE;
            char *subString = stString_getSubString(string, i * CACTUS_DISK_SEQUENCE_CHUNK_SIZE, j);
            //Chunk i is stored under key name + i with a record size of j + 1
            //(presumably to include the terminating NUL - confirm against stString_getSubString).
            stList_append(insertRequests, stKVDatabaseBulkRequest_constructInsertRequest(name + i, subString, j + 1));
            //The substring is freed right after the request is built, so the request is assumed to keep its own copy - TODO confirm.
            free(subString);
        }
        //Any exception from the bulk write is rethrown as a key-value database exception.
        stTry {
                stKVDatabase_bulkSetRecords(cactusDisk->database, insertRequests);
            }
            stCatch(except) {
                stThrowNewCause(except, ST_KV_DATABASE_EXCEPTION_ID, "An unknown database error occurred when we tried to add a string to the cactus disk");
            }stTryEnd
        ;
        stList_destruct(insertRequests);
        return name;
    }
}
int main(int argc, char *argv[]) { /* * Script for adding a reference genome to a flower. */ /* * Arguments/options */ char * logLevelString = NULL; char * cactusDiskDatabaseString = NULL; char * secondaryDatabaseString = NULL; char *referenceEventString = (char *) cactusMisc_getDefaultReferenceEventHeader(); bool bottomUpPhase = 0; /////////////////////////////////////////////////////////////////////////// // (0) Parse the inputs handed by genomeCactus.py / setup stuff. /////////////////////////////////////////////////////////////////////////// while (1) { static struct option long_options[] = { { "logLevel", required_argument, 0, 'a' }, { "cactusDisk", required_argument, 0, 'b' }, { "secondaryDisk", required_argument, 0, 'd' }, { "referenceEventString", required_argument, 0, 'g' }, { "help", no_argument, 0, 'h' }, { "bottomUpPhase", no_argument, 0, 'j' }, { 0, 0, 0, 0 } }; int option_index = 0; int key = getopt_long(argc, argv, "a:b:c:d:e:g:hi:j", long_options, &option_index); if (key == -1) { break; } switch (key) { case 'a': logLevelString = stString_copy(optarg); break; case 'b': cactusDiskDatabaseString = stString_copy(optarg); break; case 'd': secondaryDatabaseString = stString_copy(optarg); break; case 'g': referenceEventString = stString_copy(optarg); break; case 'h': usage(); return 0; case 'j': bottomUpPhase = 1; break; default: usage(); return 1; } } /////////////////////////////////////////////////////////////////////////// // (0) Check the inputs. 
/////////////////////////////////////////////////////////////////////////// assert(cactusDiskDatabaseString != NULL); ////////////////////////////////////////////// //Set up logging ////////////////////////////////////////////// st_setLogLevelFromString(logLevelString); ////////////////////////////////////////////// //Load the database ////////////////////////////////////////////// st_logInfo("referenceEventString = %s\n", referenceEventString); st_logInfo("bottomUpPhase = %i\n", bottomUpPhase); stKVDatabaseConf *kvDatabaseConf = stKVDatabaseConf_constructFromString(cactusDiskDatabaseString); CactusDisk *cactusDisk = cactusDisk_construct(kvDatabaseConf, false, true); stKVDatabaseConf_destruct(kvDatabaseConf); st_logInfo("Set up the flower disk\n"); stKVDatabase *sequenceDatabase = NULL; if (secondaryDatabaseString != NULL) { kvDatabaseConf = stKVDatabaseConf_constructFromString(secondaryDatabaseString); sequenceDatabase = stKVDatabase_construct(kvDatabaseConf, 0); stKVDatabaseConf_destruct(kvDatabaseConf); } FlowerStream *flowerStream = flowerWriter_getFlowerStream(cactusDisk, stdin); Flower *flower; while ((flower = flowerStream_getNext(flowerStream)) != NULL) { st_logDebug("Processing flower %" PRIi64 "\n", flower_getName(flower)); /////////////////////////////////////////////////////////////////////////// // Get the appropriate event names /////////////////////////////////////////////////////////////////////////// st_logInfo("%s\n", eventTree_makeNewickString(flower_getEventTree(flower))); Event *referenceEvent = eventTree_getEventByHeader(flower_getEventTree(flower), referenceEventString); if (referenceEvent == NULL) { st_errAbort("Reference event %s not found in tree. 
Check your " "--referenceEventString option", referenceEventString); } Name referenceEventName = event_getName(referenceEvent); /////////////////////////////////////////////////////////////////////////// // Now do bottom up or top down, depending /////////////////////////////////////////////////////////////////////////// stList *flowers = stList_construct(); stList_append(flowers, flower); preCacheNestedFlowers(cactusDisk, flowers); if (bottomUpPhase) { assert(sequenceDatabase != NULL); cactusDisk_preCacheSegmentStrings(cactusDisk, flowers); bottomUp(flowers, sequenceDatabase, referenceEventName, !flower_hasParentGroup(flower), generateJukesCantorMatrix); // Unload the nested flowers to save memory. They haven't // been changed, so we don't write them to the cactus // disk. Flower_GroupIterator *groupIt = flower_getGroupIterator(flower); Group *group; while ((group = flower_getNextGroup(groupIt)) != NULL) { if (!group_isLeaf(group)) { flower_unload(group_getNestedFlower(group)); } } flower_destructGroupIterator(groupIt); assert(!flower_isParentLoaded(flower)); // Write this flower to disk. cactusDisk_addUpdateRequest(cactusDisk, flower); } else { topDown(flower, referenceEventName); // We've changed the nested flowers, but not this // flower. We write the nested flowers to disk, then // unload them to save memory. This flower will be // unloaded by the flower-stream code. Flower_GroupIterator *groupIt = flower_getGroupIterator(flower); Group *group; while ((group = flower_getNextGroup(groupIt)) != NULL) { if (!group_isLeaf(group)) { cactusDisk_addUpdateRequest(cactusDisk, group_getNestedFlower(group)); flower_unload(group_getNestedFlower(group)); } } flower_destructGroupIterator(groupIt); } stList_destruct(flowers); } /////////////////////////////////////////////////////////////////////////// // Write the flower(s) back to disk. 
/////////////////////////////////////////////////////////////////////////// cactusDisk_write(cactusDisk); st_logInfo("Updated the flower on disk\n"); /////////////////////////////////////////////////////////////////////////// //Clean up. /////////////////////////////////////////////////////////////////////////// if (sequenceDatabase != NULL) { stKVDatabase_destruct(sequenceDatabase); } cactusDisk_destruct(cactusDisk); return 0; //Exit without clean up is quicker, enable cleanup when doing memory leak detection. free(cactusDiskDatabaseString); free(referenceEventString); free(logLevelString); st_logInfo("Cleaned stuff up and am finished\n"); return 0; }
/*
 * Caches the given set of substrings in the cactusDisk string cache.
 *
 * When cactusDisk->storeSequencesInAFile is set, each substring is read from
 * the sequences file and cached under its own (name, start, length).
 * Otherwise the chunk records covering each substring are fetched from the
 * key-value database with a single bulk request, and the joined chunks are
 * cached starting at the chunk-aligned offset at or before substring->start.
 */
static void cacheSubstringsFromDB(CactusDisk *cactusDisk, stList *substrings) {
    if (cactusDisk->storeSequencesInAFile) {
        if (cactusDisk->sequencesReadFileHandle != NULL) {
            // Already reading: no write handle may be open at the same time.
            assert(cactusDisk->sequencesWriteFileHandle == NULL);
        } else {
            // Flush and close any open write handle before opening for reading.
            if (cactusDisk->sequencesWriteFileHandle != NULL) {
                fsync(fileno(cactusDisk->sequencesWriteFileHandle));
                fclose(cactusDisk->sequencesWriteFileHandle);
                cactusDisk->sequencesWriteFileHandle = NULL;
            }
            cactusDisk->sequencesReadFileHandle = fopen(cactusDisk->absSequencesFileName, "r");
            assert(cactusDisk->sequencesReadFileHandle != NULL);
        }

        for (int64_t idx = 0; idx < stList_length(substrings); idx++) {
            Substring *wanted = stList_get(substrings, idx);
            char *fromDisk = getStringFromDisk(cactusDisk->sequencesReadFileHandle, wanted->name, wanted->start,
                                               wanted->length);
            stCache_setRecord(cactusDisk->stringCache, wanted->name, wanted->start, wanted->length, fromDisk);
#ifndef NDEBUG
            // Debug builds read the record back and compare it byte for byte.
            int64_t bytesRead;
            char *fromCache = stCache_getRecord(cactusDisk->stringCache, wanted->name, wanted->start,
                                                wanted->length, &bytesRead);
            assert(bytesRead == wanted->length);
            for (int64_t pos = 0; pos < wanted->length; pos++) {
                assert(fromCache[pos] == fromDisk[pos]);
            }
            free(fromCache);
#endif
            free(fromDisk);
        }
        return;
    }

    // Key-value database path: build one bulk request holding the key of
    // every chunk touched by any of the substrings.
    stList *chunkKeys = stList_construct3(0, free);
    for (int64_t idx = 0; idx < stList_length(substrings); idx++) {
        Substring *wanted = stList_get(substrings, idx);
        int64_t firstChunk = wanted->start / CACTUS_DISK_SEQUENCE_CHUNK_SIZE;
        int64_t lastChunk = (wanted->length + wanted->start - 1) / CACTUS_DISK_SEQUENCE_CHUNK_SIZE;
        int64_t chunkCount = lastChunk - firstChunk + 1;
        Name firstKey = wanted->name + firstChunk;
        for (int64_t offset = 0; offset < chunkCount; offset++) {
            int64_t *key = st_malloc(sizeof(int64_t));
            *key = firstKey + offset;
            stList_append(chunkKeys, key);
        }
    }
    if (stList_length(chunkKeys) == 0) {
        stList_destruct(chunkKeys);
        return;
    }

    stList *records = NULL;
    stTry {
        records = stKVDatabase_bulkGetRecords(cactusDisk->database, chunkKeys);
    } stCatch(except) {
        stThrowNewCause(except, ST_KV_DATABASE_EXCEPTION_ID,
                        "An unknown database error occurred when getting a sequence string");
    } stTryEnd;
    assert(records != NULL);
    assert(stList_length(records) == stList_length(chunkKeys));
    stList_destruct(chunkKeys);

    // Walk the results in request order, consuming as many records per
    // substring as keys were requested for it above.
    stListIterator *recordsIt = stList_getIterator(records);
    for (int64_t idx = 0; idx < stList_length(substrings); idx++) {
        Substring *wanted = stList_get(substrings, idx);
        int64_t chunkCount = (wanted->length + wanted->start - 1) / CACTUS_DISK_SEQUENCE_CHUNK_SIZE
                - wanted->start / CACTUS_DISK_SEQUENCE_CHUNK_SIZE + 1;
        stList *pieces = stList_construct();
        for (int64_t remaining = chunkCount; remaining > 0; remaining--) {
            int64_t recordSize;
            stKVDatabaseBulkResult *result = stList_getNext(recordsIt);
            assert(result != NULL);
            char *piece = stKVDatabaseBulkResult_getRecord(result, &recordSize);
            assert(piece != NULL);
            assert(strlen(piece) == recordSize - 1);
            stList_append(pieces, piece);
            assert(recordSize <= CACTUS_DISK_SEQUENCE_CHUNK_SIZE + 1);
        }
        assert(stList_length(pieces) > 0);

        char *joined = stString_join2("", pieces);
        stCache_setRecord(cactusDisk->stringCache, wanted->name,
                          (wanted->start / CACTUS_DISK_SEQUENCE_CHUNK_SIZE) * CACTUS_DISK_SEQUENCE_CHUNK_SIZE,
                          strlen(joined), joined);
        free(joined);
        stList_destruct(pieces);
    }
    assert(stList_getNext(recordsIt) == NULL);
    stList_destructIterator(recordsIt);
    stList_destruct(records);
}
/*
 * Test fixture teardown: destroys the file-level list, if one exists, and
 * resets the pointer to NULL so that a repeated call does nothing.
 */
static void teardown() {
    if (list == NULL) {
        return;
    }
    stList_destruct(list);
    list = NULL;
}
int main(int argc, char *argv[]) { /* * Script for adding alignments to cactus tree. */ int64_t startTime; stKVDatabaseConf *kvDatabaseConf; CactusDisk *cactusDisk; int key, k; bool (*filterFn)(stPinchSegment *, stPinchSegment *) = NULL; stSet *outgroupThreads = NULL; /* * Arguments/options */ char * logLevelString = NULL; char * alignmentsFile = NULL; char * constraintsFile = NULL; char * cactusDiskDatabaseString = NULL; char * lastzArguments = ""; int64_t minimumSequenceLengthForBlast = 1; //Parameters for annealing/melting rounds int64_t *annealingRounds = NULL; int64_t annealingRoundsLength = 0; int64_t *meltingRounds = NULL; int64_t meltingRoundsLength = 0; //Parameters for melting float maximumAdjacencyComponentSizeRatio = 10; int64_t blockTrim = 0; int64_t alignmentTrimLength = 0; int64_t *alignmentTrims = NULL; int64_t chainLengthForBigFlower = 1000000; int64_t longChain = 2; int64_t minLengthForChromosome = 1000000; float proportionOfUnalignedBasesForNewChromosome = 0.8; bool breakChainsAtReverseTandems = 1; int64_t maximumMedianSequenceLengthBetweenLinkedEnds = INT64_MAX; bool realign = 0; char *realignArguments = ""; bool removeRecoverableChains = false; bool (*recoverableChainsFilter)(stCactusEdgeEnd *, Flower *) = NULL; int64_t maxRecoverableChainsIterations = 1; int64_t maxRecoverableChainLength = INT64_MAX; //Parameters for removing ancient homologies bool doPhylogeny = false; int64_t phylogenyNumTrees = 1; enum stCaf_RootingMethod phylogenyRootingMethod = BEST_RECON; enum stCaf_ScoringMethod phylogenyScoringMethod = COMBINED_LIKELIHOOD; double breakpointScalingFactor = 1.0; bool phylogenySkipSingleCopyBlocks = 0; int64_t phylogenyMaxBaseDistance = 1000; int64_t phylogenyMaxBlockDistance = 100; bool phylogenyKeepSingleDegreeBlocks = 0; stList *phylogenyTreeBuildingMethods = stList_construct(); enum stCaf_TreeBuildingMethod defaultMethod = GUIDED_NEIGHBOR_JOINING; stList_append(phylogenyTreeBuildingMethods, &defaultMethod); double 
phylogenyCostPerDupPerBase = 0.2; double phylogenyCostPerLossPerBase = 0.2; const char *debugFileName = NULL; const char *referenceEventHeader = NULL; double phylogenyDoSplitsWithSupportHigherThanThisAllAtOnce = 1.0; int64_t numTreeBuildingThreads = 2; int64_t minimumBlockDegreeToCheckSupport = 10; double minimumBlockHomologySupport = 0.7; double nucleotideScalingFactor = 1.0; HomologyUnitType phylogenyHomologyUnitType = BLOCK; enum stCaf_DistanceCorrectionMethod phylogenyDistanceCorrectionMethod = JUKES_CANTOR; bool sortAlignments = false; /////////////////////////////////////////////////////////////////////////// // (0) Parse the inputs handed by genomeCactus.py / setup stuff. /////////////////////////////////////////////////////////////////////////// while (1) { static struct option long_options[] = { { "logLevel", required_argument, 0, 'a' }, { "alignments", required_argument, 0, 'b' }, { "cactusDisk", required_argument, 0, 'c' }, { "lastzArguments", required_argument, 0, 'd' }, { "help", no_argument, 0, 'h' }, { "annealingRounds", required_argument, 0, 'i' }, { "trim", required_argument, 0, 'k' }, { "trimChange", required_argument, 0, 'l', }, { "minimumTreeCoverage", required_argument, 0, 'm' }, { "blockTrim", required_argument, 0, 'n' }, { "deannealingRounds", required_argument, 0, 'o' }, { "minimumDegree", required_argument, 0, 'p' }, { "minimumIngroupDegree", required_argument, 0, 'q' }, { "minimumOutgroupDegree", required_argument, 0, 'r' }, { "alignmentFilter", required_argument, 0, 't' }, { "minimumSequenceLengthForBlast", required_argument, 0, 'v' }, { "maxAdjacencyComponentSizeRatio", required_argument, 0, 'w' }, { "constraints", required_argument, 0, 'x' }, { "minLengthForChromosome", required_argument, 0, 'y' }, { "proportionOfUnalignedBasesForNewChromosome", required_argument, 0, 'z' }, { "maximumMedianSequenceLengthBetweenLinkedEnds", required_argument, 0, 'A' }, { "realign", no_argument, 0, 'B' }, { "realignArguments", required_argument, 0, 'C' }, 
{ "phylogenyNumTrees", required_argument, 0, 'D' }, { "phylogenyRootingMethod", required_argument, 0, 'E' }, { "phylogenyScoringMethod", required_argument, 0, 'F' }, { "phylogenyBreakpointScalingFactor", required_argument, 0, 'G' }, { "phylogenySkipSingleCopyBlocks", no_argument, 0, 'H' }, { "phylogenyMaxBaseDistance", required_argument, 0, 'I' }, { "phylogenyMaxBlockDistance", required_argument, 0, 'J' }, { "phylogenyDebugFile", required_argument, 0, 'K' }, { "phylogenyKeepSingleDegreeBlocks", no_argument, 0, 'L' }, { "phylogenyTreeBuildingMethod", required_argument, 0, 'M' }, { "phylogenyCostPerDupPerBase", required_argument, 0, 'N' }, { "phylogenyCostPerLossPerBase", required_argument, 0, 'O' }, { "referenceEventHeader", required_argument, 0, 'P' }, { "phylogenyDoSplitsWithSupportHigherThanThisAllAtOnce", required_argument, 0, 'Q' }, { "numTreeBuildingThreads", required_argument, 0, 'R' }, { "phylogeny", no_argument, 0, 'S' }, { "minimumBlockHomologySupport", required_argument, 0, 'T' }, { "phylogenyNucleotideScalingFactor", required_argument, 0, 'U' }, { "minimumBlockDegreeToCheckSupport", required_argument, 0, 'V' }, { "removeRecoverableChains", required_argument, 0, 'W' }, { "minimumNumberOfSpecies", required_argument, 0, 'X' }, { "phylogenyHomologyUnitType", required_argument, 0, 'Y' }, { "phylogenyDistanceCorrectionMethod", required_argument, 0, 'Z' }, { "maxRecoverableChainsIterations", required_argument, 0, '1' }, { "maxRecoverableChainLength", required_argument, 0, '2' }, { 0, 0, 0, 0 } }; int option_index = 0; key = getopt_long(argc, argv, "a:b:c:hi:k:m:n:o:p:q:r:stv:w:x:y:z:A:BC:D:E:", long_options, &option_index); if (key == -1) { break; } switch (key) { case 'a': logLevelString = stString_copy(optarg); st_setLogLevelFromString(logLevelString); break; case 'b': alignmentsFile = stString_copy(optarg); break; case 'c': cactusDiskDatabaseString = stString_copy(optarg); break; case 'd': lastzArguments = stString_copy(optarg); break; case 'h': usage(); 
return 0; case 'i': annealingRounds = getInts(optarg, &annealingRoundsLength); break; case 'o': meltingRounds = getInts(optarg, &meltingRoundsLength); break; case 'k': alignmentTrims = getInts(optarg, &alignmentTrimLength); break; case 'm': k = sscanf(optarg, "%f", &minimumTreeCoverage); assert(k == 1); break; case 'n': k = sscanf(optarg, "%" PRIi64 "", &blockTrim); assert(k == 1); break; case 'p': k = sscanf(optarg, "%" PRIi64 "", &minimumDegree); assert(k == 1); break; case 'q': k = sscanf(optarg, "%" PRIi64 "", &minimumIngroupDegree); assert(k == 1); break; case 'r': k = sscanf(optarg, "%" PRIi64 "", &minimumOutgroupDegree); assert(k == 1); break; case 't': if (strcmp(optarg, "singleCopyOutgroup") == 0) { sortAlignments = true; filterFn = stCaf_filterByOutgroup; } else if (strcmp(optarg, "relaxedSingleCopyOutgroup") == 0) { sortAlignments = true; filterFn = stCaf_relaxedFilterByOutgroup; } else if (strcmp(optarg, "singleCopy") == 0) { sortAlignments = true; filterFn = stCaf_filterByRepeatSpecies; } else if (strcmp(optarg, "relaxedSingleCopy") == 0) { sortAlignments = true; filterFn = stCaf_relaxedFilterByRepeatSpecies; } else if (strcmp(optarg, "singleCopyChr") == 0) { sortAlignments = true; filterFn = stCaf_singleCopyChr; } else if (strcmp(optarg, "singleCopyIngroup") == 0) { sortAlignments = true; filterFn = stCaf_singleCopyIngroup; } else if (strcmp(optarg, "relaxedSingleCopyIngroup") == 0) { sortAlignments = true; filterFn = stCaf_relaxedSingleCopyIngroup; } else if (strcmp(optarg, "none") == 0) { sortAlignments = false; filterFn = NULL; } else { st_errAbort("Could not recognize alignmentFilter option %s", optarg); } break; case 'v': k = sscanf(optarg, "%" PRIi64 "", &minimumSequenceLengthForBlast); assert(k == 1); break; case 'w': k = sscanf(optarg, "%f", &maximumAdjacencyComponentSizeRatio); assert(k == 1); break; case 'x': constraintsFile = stString_copy(optarg); break; case 'y': k = sscanf(optarg, "%" PRIi64 "", &minLengthForChromosome); assert(k == 1); 
break; case 'z': k = sscanf(optarg, "%f", &proportionOfUnalignedBasesForNewChromosome); assert(k == 1); break; case 'A': k = sscanf(optarg, "%" PRIi64 "", &maximumMedianSequenceLengthBetweenLinkedEnds); assert(k == 1); break; case 'B': realign = 1; break; case 'C': realignArguments = stString_copy(optarg); break; case 'D': k = sscanf(optarg, "%" PRIi64, &phylogenyNumTrees); assert(k == 1); break; case 'E': if (!strcmp(optarg, "outgroupBranch")) { phylogenyRootingMethod = OUTGROUP_BRANCH; } else if (!strcmp(optarg, "longestBranch")) { phylogenyRootingMethod = LONGEST_BRANCH; } else if (!strcmp(optarg, "bestRecon")) { phylogenyRootingMethod = BEST_RECON; } else { st_errAbort("Invalid tree rooting method: %s", optarg); } break; case 'F': if (!strcmp(optarg, "reconCost")) { phylogenyScoringMethod = RECON_COST; } else if (!strcmp(optarg, "nucLikelihood")) { phylogenyScoringMethod = NUCLEOTIDE_LIKELIHOOD; } else if (!strcmp(optarg, "reconLikelihood")) { phylogenyScoringMethod = RECON_LIKELIHOOD; } else if (!strcmp(optarg, "combinedLikelihood")) { phylogenyScoringMethod = COMBINED_LIKELIHOOD; } else { st_errAbort("Invalid tree scoring method: %s", optarg); } break; case 'G': k = sscanf(optarg, "%lf", &breakpointScalingFactor); assert(k == 1); break; case 'H': phylogenySkipSingleCopyBlocks = true; break; case 'I': k = sscanf(optarg, "%" PRIi64, &phylogenyMaxBaseDistance); assert(k == 1); break; case 'J': k = sscanf(optarg, "%" PRIi64, &phylogenyMaxBlockDistance); assert(k == 1); break; case 'K': debugFileName = stString_copy(optarg); break; case 'L': phylogenyKeepSingleDegreeBlocks = true; break; case 'M': // clear the default setting of the list stList_destruct(phylogenyTreeBuildingMethods); phylogenyTreeBuildingMethods = stList_construct(); stList *methodStrings = stString_splitByString(optarg, ","); for (int64_t i = 0; i < stList_length(methodStrings); i++) { char *methodString = stList_get(methodStrings, i); enum stCaf_TreeBuildingMethod *method = st_malloc(sizeof(enum 
stCaf_TreeBuildingMethod)); if (strcmp(methodString, "neighborJoining") == 0) { *method = NEIGHBOR_JOINING; } else if (strcmp(methodString, "guidedNeighborJoining") == 0) { *method = GUIDED_NEIGHBOR_JOINING; } else if (strcmp(methodString, "splitDecomposition") == 0) { *method = SPLIT_DECOMPOSITION; } else if (strcmp(methodString, "strictSplitDecomposition") == 0) { *method = STRICT_SPLIT_DECOMPOSITION; } else if (strcmp(methodString, "removeBadChains") == 0) { *method = REMOVE_BAD_CHAINS; } else { st_errAbort("Unknown tree building method: %s", methodString); } stList_append(phylogenyTreeBuildingMethods, method); } stList_destruct(methodStrings); break; case 'N': k = sscanf(optarg, "%lf", &phylogenyCostPerDupPerBase); assert(k == 1); break; case 'O': k = sscanf(optarg, "%lf", &phylogenyCostPerLossPerBase); assert(k == 1); break; case 'P': referenceEventHeader = stString_copy(optarg); break; case 'Q': k = sscanf(optarg, "%lf", &phylogenyDoSplitsWithSupportHigherThanThisAllAtOnce); assert(k == 1); break; case 'R': k = sscanf(optarg, "%" PRIi64, &numTreeBuildingThreads); assert(k == 1); break; case 'S': doPhylogeny = true; break; case 'T': k = sscanf(optarg, "%lf", &minimumBlockHomologySupport); assert(k == 1); assert(minimumBlockHomologySupport <= 1.0); assert(minimumBlockHomologySupport >= 0.0); break; case 'U': k = sscanf(optarg, "%lf", &nucleotideScalingFactor); assert(k == 1); break; case 'V': k = sscanf(optarg, "%" PRIi64, &minimumBlockDegreeToCheckSupport); assert(k == 1); break; case 'W': if (strcmp(optarg, "1") == 0) { removeRecoverableChains = true; recoverableChainsFilter = NULL; } else if (strcmp(optarg, "unequalNumberOfIngroupCopies") == 0) { removeRecoverableChains = true; recoverableChainsFilter = stCaf_chainHasUnequalNumberOfIngroupCopies; } else if (strcmp(optarg, "unequalNumberOfIngroupCopiesOrNoOutgroup") == 0) { removeRecoverableChains = true; recoverableChainsFilter = stCaf_chainHasUnequalNumberOfIngroupCopiesOrNoOutgroup; } else if 
(strcmp(optarg, "0") == 0) { removeRecoverableChains = false; } else { st_errAbort("Could not parse removeRecoverableChains argument"); } break; case 'X': k = sscanf(optarg, "%" PRIi64, &minimumNumberOfSpecies); if (k != 1) { st_errAbort("Error parsing the minimumNumberOfSpecies argument"); } break; case 'Y': if (strcmp(optarg, "chain") == 0) { phylogenyHomologyUnitType = CHAIN; } else if (strcmp(optarg, "block") == 0) { phylogenyHomologyUnitType = BLOCK; } else { st_errAbort("Could not parse the phylogenyHomologyUnitType argument"); } break; case 'Z': if (strcmp(optarg, "jukesCantor") == 0) { phylogenyDistanceCorrectionMethod = JUKES_CANTOR; } else if (strcmp(optarg, "none") == 0 ) { phylogenyDistanceCorrectionMethod = NONE; } else { st_errAbort("Could not parse the phylogenyDistanceCorrectionMethod argument"); } break; case '1': k = sscanf(optarg, "%" PRIi64, &maxRecoverableChainsIterations); if (k != 1) { st_errAbort("Error parsing the maxRecoverableChainsIterations argument"); } break; case '2': k = sscanf(optarg, "%" PRIi64, &maxRecoverableChainLength); if (k != 1) { st_errAbort("Error parsing the maxRecoverableChainLength argument"); } break; default: usage(); return 1; } } /////////////////////////////////////////////////////////////////////////// // (0) Check the inputs. 
/////////////////////////////////////////////////////////////////////////// assert(cactusDiskDatabaseString != NULL); assert(minimumTreeCoverage >= 0.0); assert(minimumTreeCoverage <= 1.0); assert(blockTrim >= 0); assert(annealingRoundsLength >= 0); for (int64_t i = 0; i < annealingRoundsLength; i++) { assert(annealingRounds[i] >= 0); } assert(meltingRoundsLength >= 0); for (int64_t i = 1; i < meltingRoundsLength; i++) { assert(meltingRounds[i - 1] < meltingRounds[i]); assert(meltingRounds[i - 1] >= 1); } assert(alignmentTrimLength >= 0); for (int64_t i = 0; i < alignmentTrimLength; i++) { assert(alignmentTrims[i] >= 0); } assert(minimumOutgroupDegree >= 0); assert(minimumIngroupDegree >= 0); ////////////////////////////////////////////// //Set up logging ////////////////////////////////////////////// st_setLogLevelFromString(logLevelString); ////////////////////////////////////////////// //Log (some of) the inputs ////////////////////////////////////////////// st_logInfo("Flower disk name : %s\n", cactusDiskDatabaseString); ////////////////////////////////////////////// //Load the database ////////////////////////////////////////////// kvDatabaseConf = stKVDatabaseConf_constructFromString(cactusDiskDatabaseString); cactusDisk = cactusDisk_construct(kvDatabaseConf, 0); st_logInfo("Set up the flower disk\n"); /////////////////////////////////////////////////////////////////////////// // Sort the constraints /////////////////////////////////////////////////////////////////////////// stPinchIterator *pinchIteratorForConstraints = NULL; if (constraintsFile != NULL) { pinchIteratorForConstraints = stPinchIterator_constructFromFile(constraintsFile); st_logInfo("Created an iterator for the alignment constaints from file: %s\n", constraintsFile); } /////////////////////////////////////////////////////////////////////////// // Do the alignment /////////////////////////////////////////////////////////////////////////// startTime = time(NULL); stList *flowers = 
flowerWriter_parseFlowersFromStdin(cactusDisk); if (alignmentsFile == NULL) { cactusDisk_preCacheStrings(cactusDisk, flowers); } char *tempFile1 = NULL; for (int64_t i = 0; i < stList_length(flowers); i++) { flower = stList_get(flowers, i); if (!flower_builtBlocks(flower)) { // Do nothing if the flower already has defined blocks st_logDebug("Processing flower: %lli\n", flower_getName(flower)); stCaf_setFlowerForAlignmentFiltering(flower); //Set up the graph and add the initial alignments stPinchThreadSet *threadSet = stCaf_setup(flower); //Build the set of outgroup threads outgroupThreads = stCaf_getOutgroupThreads(flower, threadSet); //Setup the alignments stPinchIterator *pinchIterator; stList *alignmentsList = NULL; if (alignmentsFile != NULL) { assert(i == 0); assert(stList_length(flowers) == 1); if (sortAlignments) { tempFile1 = getTempFile(); stCaf_sortCigarsFileByScoreInDescendingOrder(alignmentsFile, tempFile1); pinchIterator = stPinchIterator_constructFromFile(tempFile1); } else { pinchIterator = stPinchIterator_constructFromFile(alignmentsFile); } } else { if (tempFile1 == NULL) { tempFile1 = getTempFile(); } alignmentsList = stCaf_selfAlignFlower(flower, minimumSequenceLengthForBlast, lastzArguments, realign, realignArguments, tempFile1); if (sortAlignments) { stCaf_sortCigarsByScoreInDescendingOrder(alignmentsList); } st_logDebug("Ran lastz and have %" PRIi64 " alignments\n", stList_length(alignmentsList)); pinchIterator = stPinchIterator_constructFromList(alignmentsList); } for (int64_t annealingRound = 0; annealingRound < annealingRoundsLength; annealingRound++) { int64_t minimumChainLength = annealingRounds[annealingRound]; int64_t alignmentTrim = annealingRound < alignmentTrimLength ? 
alignmentTrims[annealingRound] : 0; st_logDebug("Starting annealing round with a minimum chain length of %" PRIi64 " and an alignment trim of %" PRIi64 "\n", minimumChainLength, alignmentTrim); stPinchIterator_setTrim(pinchIterator, alignmentTrim); //Add back in the constraints if (pinchIteratorForConstraints != NULL) { stCaf_anneal(threadSet, pinchIteratorForConstraints, filterFn); } //Do the annealing if (annealingRound == 0) { stCaf_anneal(threadSet, pinchIterator, filterFn); } else { stCaf_annealBetweenAdjacencyComponents(threadSet, pinchIterator, filterFn); } // Dump the block degree and length distribution to a file if (debugFileName != NULL) { dumpBlockInfo(threadSet, stString_print("%s-blockStats-preMelting", debugFileName)); } printf("Sequence graph statistics after annealing:\n"); printThreadSetStatistics(threadSet, flower, stdout); // Check for poorly-supported blocks--those that have // been transitively aligned together but with very // few homologies supporting the transitive // alignment. These "megablocks" can snarl up the // graph so that a lot of extra gets thrown away in // the first melting step. 
stPinchThreadSetBlockIt blockIt = stPinchThreadSet_getBlockIt(threadSet); stPinchBlock *block; while ((block = stPinchThreadSetBlockIt_getNext(&blockIt)) != NULL) { if (stPinchBlock_getDegree(block) > minimumBlockDegreeToCheckSupport) { uint64_t supportingHomologies = stPinchBlock_getNumSupportingHomologies(block); uint64_t possibleSupportingHomologies = numPossibleSupportingHomologies(block, flower); double support = ((double) supportingHomologies) / possibleSupportingHomologies; if (support < minimumBlockHomologySupport) { fprintf(stdout, "Destroyed a megablock with degree %" PRIi64 " and %" PRIi64 " supporting homologies out of a maximum " "of %" PRIi64 " (%lf%%).\n", stPinchBlock_getDegree(block), supportingHomologies, possibleSupportingHomologies, support); stPinchBlock_destruct(block); } } } //Do the melting rounds for (int64_t meltingRound = 0; meltingRound < meltingRoundsLength; meltingRound++) { int64_t minimumChainLengthForMeltingRound = meltingRounds[meltingRound]; st_logDebug("Starting melting round with a minimum chain length of %" PRIi64 " \n", minimumChainLengthForMeltingRound); if (minimumChainLengthForMeltingRound >= minimumChainLength) { break; } stCaf_melt(flower, threadSet, NULL, 0, minimumChainLengthForMeltingRound, 0, INT64_MAX); } st_logDebug("Last melting round of cycle with a minimum chain length of %" PRIi64 " \n", minimumChainLength); stCaf_melt(flower, threadSet, NULL, 0, minimumChainLength, breakChainsAtReverseTandems, maximumMedianSequenceLengthBetweenLinkedEnds); //This does the filtering of blocks that do not have the required species/tree-coverage/degree. 
stCaf_melt(flower, threadSet, blockFilterFn, blockTrim, 0, 0, INT64_MAX); } if (removeRecoverableChains) { stCaf_meltRecoverableChains(flower, threadSet, breakChainsAtReverseTandems, maximumMedianSequenceLengthBetweenLinkedEnds, recoverableChainsFilter, maxRecoverableChainsIterations, maxRecoverableChainLength); } if (debugFileName != NULL) { dumpBlockInfo(threadSet, stString_print("%s-blockStats-postMelting", debugFileName)); } printf("Sequence graph statistics after melting:\n"); printThreadSetStatistics(threadSet, flower, stdout); // Build a tree for each block, then use each tree to // partition the homologies between the ingroups sequences // into those that occur before the speciation with the // outgroup and those which occur late. if (stSet_size(outgroupThreads) > 0 && doPhylogeny) { st_logDebug("Starting to build trees and partition ingroup homologies\n"); stHash *threadStrings = stCaf_getThreadStrings(flower, threadSet); st_logDebug("Got sets of thread strings and set of threads that are outgroups\n"); stCaf_PhylogenyParameters params; params.distanceCorrectionMethod = phylogenyDistanceCorrectionMethod; params.treeBuildingMethods = phylogenyTreeBuildingMethods; params.rootingMethod = phylogenyRootingMethod; params.scoringMethod = phylogenyScoringMethod; params.breakpointScalingFactor = breakpointScalingFactor; params.nucleotideScalingFactor = nucleotideScalingFactor; params.skipSingleCopyBlocks = phylogenySkipSingleCopyBlocks; params.keepSingleDegreeBlocks = phylogenyKeepSingleDegreeBlocks; params.costPerDupPerBase = phylogenyCostPerDupPerBase; params.costPerLossPerBase = phylogenyCostPerLossPerBase; params.maxBaseDistance = phylogenyMaxBaseDistance; params.maxBlockDistance = phylogenyMaxBlockDistance; params.numTrees = phylogenyNumTrees; params.ignoreUnalignedBases = 1; params.onlyIncludeCompleteFeatureBlocks = 0; params.doSplitsWithSupportHigherThanThisAllAtOnce = phylogenyDoSplitsWithSupportHigherThanThisAllAtOnce; params.numTreeBuildingThreads = 
numTreeBuildingThreads; assert(params.numTreeBuildingThreads >= 1); stCaf_buildTreesToRemoveAncientHomologies( threadSet, phylogenyHomologyUnitType, threadStrings, outgroupThreads, flower, ¶ms, debugFileName == NULL ? NULL : stString_print("%s-phylogeny", debugFileName), referenceEventHeader); stHash_destruct(threadStrings); st_logDebug("Finished building trees\n"); if (removeRecoverableChains) { // We melt recoverable chains after splitting, as // well as before, to alleviate coverage loss // caused by bad splits. stCaf_meltRecoverableChains(flower, threadSet, breakChainsAtReverseTandems, maximumMedianSequenceLengthBetweenLinkedEnds, recoverableChainsFilter, maxRecoverableChainsIterations, maxRecoverableChainLength); } // Enforce the block constraints on minimum degree, // etc. after splitting. stCaf_melt(flower, threadSet, blockFilterFn, 0, 0, 0, INT64_MAX); } //Sort out case when we allow blocks of degree 1 if (minimumDegree < 2) { st_logDebug("Creating degree 1 blocks\n"); stCaf_makeDegreeOneBlocks(threadSet); stCaf_melt(flower, threadSet, blockFilterFn, blockTrim, 0, 0, INT64_MAX); } else if (maximumAdjacencyComponentSizeRatio < INT64_MAX) { //Deal with giant components st_logDebug("Breaking up components greedily\n"); stCaf_breakupComponentsGreedily(threadSet, maximumAdjacencyComponentSizeRatio); } //Finish up stCaf_finish(flower, threadSet, chainLengthForBigFlower, longChain, minLengthForChromosome, proportionOfUnalignedBasesForNewChromosome); //Flower is then destroyed at this point. 
st_logInfo("Ran the cactus core script\n"); //Cleanup stPinchThreadSet_destruct(threadSet); stPinchIterator_destruct(pinchIterator); stSet_destruct(outgroupThreads); if (alignmentsList != NULL) { stList_destruct(alignmentsList); } st_logInfo("Cleaned up from main loop\n"); } else { st_logInfo("We've already built blocks / alignments for this flower\n"); } } stList_destruct(flowers); if (tempFile1 != NULL) { st_system("rm %s", tempFile1); } if (constraintsFile != NULL) { stPinchIterator_destruct(pinchIteratorForConstraints); } /////////////////////////////////////////////////////////////////////////// // Write the flower to disk. /////////////////////////////////////////////////////////////////////////// st_logDebug("Writing the flowers to disk\n"); cactusDisk_write(cactusDisk); st_logInfo("Updated the flower on disk and %" PRIi64 " seconds have elapsed\n", time(NULL) - startTime); /////////////////////////////////////////////////////////////////////////// // Clean up. /////////////////////////////////////////////////////////////////////////// cactusDisk_destruct(cactusDisk); }
int main(int argc, char *argv[]) { // Parse arguments if (argc != 3) { usage(argv); return 1; } // You would load a custom HMM here if you wanted using // hmm_getStateMachine (see the realign code) StateMachine *stateMachine = stateMachine5_construct(fiveState); PairwiseAlignmentParameters *parameters = pairwiseAlignmentBandingParameters_construct(); stHash *targetSequences = readFastaFile(argv[1]); stHash *querySequences = readFastaFile(argv[2]); // For each query sequence, align it against all target sequences. stHashIterator *queryIt = stHash_getIterator(querySequences); char *queryHeader; while ((queryHeader = stHash_getNext(queryIt)) != NULL) { char *querySeq = stHash_search(querySequences, queryHeader); stHashIterator *targetIt = stHash_getIterator(targetSequences); char *targetHeader; while ((targetHeader = stHash_getNext(targetIt)) != NULL) { char *targetSeq = stHash_search(targetSequences, targetHeader); // Here we should try both the target sequence and its // reverse-complemented version // Aligns the sequences. // If you have alignment constraints (anchors) you should // replace this with getAlignedPairsUsingAnchors. stList *alignedPairs = getAlignedPairs(stateMachine, targetSeq, querySeq, parameters, true, true); // Takes into account the probability of aligning to a // gap, by transforming the posterior probability into the // AMAP objective function (see Schwartz & Pachter, 2007). alignedPairs = reweightAlignedPairs2(alignedPairs, strlen(targetSeq), strlen(querySeq), parameters->gapGamma); // I think this calculates the optimal ordered set of // alignments from the unordered set of aligned pairs, not // completely sure. alignedPairs = filterPairwiseAlignmentToMakePairsOrdered(alignedPairs, targetSeq, querySeq, // This parameter says that the minimum posterior probability we will accept has to be at least 0.9. 0.9); // After this the "aligned pairs" data structure changes, // which is a little sketchy. 
It's just so that the // alignment can be printed properly. stList_mapReplace(alignedPairs, convertToAnchorPair, NULL); stList_sort(alignedPairs, (int (*)(const void *, const void *)) stIntTuple_cmpFn); struct PairwiseAlignment *alignment = convertAlignedPairsToPairwiseAlignment(targetHeader, queryHeader, 0, strlen(targetSeq), strlen(querySeq), alignedPairs); // Output the cigar string cigarWrite(stdout, alignment, 0); stList_destruct(alignedPairs); destructPairwiseAlignment(alignment); } stHash_destructIterator(targetIt); } stHash_destructIterator(queryIt); // Clean up stHash_destruct(targetSequences); stHash_destruct(querySequences); pairwiseAlignmentBandingParameters_destruct(parameters); stateMachine_destruct(stateMachine); }
/*
 * Tests stKVDatabase_getPartialRecord.
 *
 * Inserts 300 records of random content whose sizes follow a cubic
 * distribution, then repeatedly (until a random draw falls to 0.001 or
 * below) picks a record, reads a random sub-range of it and compares it
 * with the locally held copy. Each iteration also checks that out-of-range
 * requests and requests for a missing key, if they raise, raise
 * ST_KV_DATABASE_EXCEPTION_ID.
 */
static void partialRecordRetrieval(CuTest *testCase) {
    setup();

    //Make some number of large records
    stList *records = stList_construct3(0, free);
    stList *recordSizes = stList_construct3(0, (void(*)(void *)) stIntTuple_destruct);
    for (int32_t i = 0; i < 300; i++) {
        int32_t size = st_randomInt(0, 80);
        size = size * size * size; //Use cubic size distribution
        char *randomRecord = st_malloc(size * sizeof(char));
        for (int32_t j = 0; j < size; j++) {
            randomRecord[j] = (char) st_randomInt(0, 100);
        }
        stList_append(records, randomRecord);
        stList_append(recordSizes, stIntTuple_construct(1, size));
        CuAssertTrue(testCase, !stKVDatabase_containsRecord(database, i));
        stKVDatabase_insertRecord(database, i, randomRecord, size * sizeof(char));
        CuAssertTrue(testCase, stKVDatabase_containsRecord(database, i));
    }

    while (st_random() > 0.001) {
        int32_t recordKey = st_randomInt(0, stList_length(records));
        CuAssertTrue(testCase, stKVDatabase_containsRecord(database, recordKey));

        char *record = stList_get(records, recordKey);
        int32_t size = stIntTuple_getPosition(stList_get(recordSizes, recordKey), 0);

        //Pick a random sub-range [start, start + partialSize) of the record
        int32_t start = size > 0 ? st_randomInt(0, size) : 0;
        int32_t partialSize = size - start > 0 ? st_randomInt(start, size) - start : 0;
        assert(start >= 0);
        assert(partialSize >= 0);
        assert(partialSize + start <= size);

        char *partialRecord = stKVDatabase_getPartialRecord(database, recordKey, start * sizeof(char),
                partialSize * sizeof(char), size * sizeof(char));

        //Check they are equivalent..
        //NOTE(review): a mismatch is only reported via st_uglyf; the
        //CuAssertTrue on equality was disabled in the original code.
        //Confirm whether it can be restored.
        for (int32_t i = 0; i < partialSize; i++) {
            if (record[start + i] != partialRecord[i]) {
                st_uglyf("There was a difference %i %i for record %i %i\n", record[start + i], partialRecord[i], i,
                        partialSize);
            }
        }

        //The returned buffer belongs to the caller; without this free every
        //iteration of the loop leaked one partial record.
        free(partialRecord);

        //Check we can not get out of bounds.. (start less than zero)
        stTry {
            stKVDatabase_getPartialRecord(database, recordKey, -1, 1, size * sizeof(char));
        } stCatch(except) {
            CuAssertTrue(testCase, stExcept_getId(except) == ST_KV_DATABASE_EXCEPTION_ID);
        } stTryEnd;

        //Check we can not get out of bounds.. (start at the end of the record)
        stTry {
            stKVDatabase_getPartialRecord(database, recordKey, size, 1, size * sizeof(char));
        } stCatch(except) {
            CuAssertTrue(testCase, stExcept_getId(except) == ST_KV_DATABASE_EXCEPTION_ID);
        } stTryEnd;

        //Check we can not get out of bounds.. (total size is greater than record length)
        stTry {
            stKVDatabase_getPartialRecord(database, recordKey, 0, size + 1, size * sizeof(char));
        } stCatch(except) {
            CuAssertTrue(testCase, stExcept_getId(except) == ST_KV_DATABASE_EXCEPTION_ID);
        } stTryEnd;

        //Check we can not get non existent record
        stTry {
            stKVDatabase_getPartialRecord(database, 1000000, 0, size, size * sizeof(char));
        } stCatch(except) {
            CuAssertTrue(testCase, stExcept_getId(except) == ST_KV_DATABASE_EXCEPTION_ID);
        } stTryEnd;
    }

    stList_destruct(records);
    stList_destruct(recordSizes);

    teardown();
}
/*
 * Tests the bulk get functions (stKVDatabase_bulkGetRecords and
 * stKVDatabase_bulkGetRecordsRange).
 *
 * Four small int64_t records are written with a bulk set (two inserts, one
 * set, one update of the pre-inserted key 1) and one large record is written
 * with stKVDatabase_setRecord. The records are then read back first by an
 * explicit key list and then by the key range starting at 1 of length 6;
 * key 6 was never written and must come back as NULL.
 */
static void testBulkGetRecords(CuTest* testCase) {
    setup();

    int64_t i = 100, j = 110, k = 120, l = 130;
    int64_t bigRecSize = 184500800;
    //Only the size of the big record is checked below, never its contents.
    int64_t* m = (int64_t*)st_malloc(bigRecSize);
    int64_t ki = 4, kj = 5, kk = 3, kl = 1, km = 2;

    //Key 1 must already exist for the update request in the bulk set.
    stKVDatabase_insertRecord(database, 1, &i, sizeof(int64_t));

    stList *requests = stList_construct3(0, (void(*)(void *)) stKVDatabaseBulkRequest_destruct);
    stList_append(requests, stKVDatabaseBulkRequest_constructInsertRequest(ki, &i, sizeof(int64_t)));
    stList_append(requests, stKVDatabaseBulkRequest_constructInsertRequest(kj, &j, sizeof(int64_t)));
    stList_append(requests, stKVDatabaseBulkRequest_constructSetRequest(kk, &k, sizeof(int64_t)));
    stList_append(requests, stKVDatabaseBulkRequest_constructUpdateRequest(kl, &l, sizeof(int64_t)));
    stKVDatabase_bulkSetRecords(database, requests);
    stList_destruct(requests);

    stKVDatabase_setRecord(database, km, m, bigRecSize);

    //The key list holds pointers to the local key variables; it has no
    //element destructor, so destructing it only releases the list itself.
    stList* keys = stList_construct2(5);
    stList_set(keys, 0, &ki);
    stList_set(keys, 1, &kj);
    stList_set(keys, 2, &kk);
    stList_set(keys, 3, &kl);
    stList_set(keys, 4, &km);

    //Results are expected in the order of the requested keys.
    stList* results = stKVDatabase_bulkGetRecords(database, keys);
    CuAssertTrue(testCase, stList_length(results) == 5);

    void* record;
    int64_t size;

    stKVDatabaseBulkResult* res0 = (stKVDatabaseBulkResult*)stList_get(results, 0);
    record = stKVDatabaseBulkResult_getRecord(res0, &size);
    CuAssertTrue(testCase, record != NULL);
    CuAssertTrue(testCase, *(int64_t*)record == i && size == sizeof(int64_t));

    stKVDatabaseBulkResult* res1 = (stKVDatabaseBulkResult*)stList_get(results, 1);
    record = stKVDatabaseBulkResult_getRecord(res1, &size);
    CuAssertTrue(testCase, record != NULL);
    CuAssertTrue(testCase, *(int64_t*)record == j && size == sizeof(int64_t));

    stKVDatabaseBulkResult* res2 = (stKVDatabaseBulkResult*)stList_get(results, 2);
    record = stKVDatabaseBulkResult_getRecord(res2, &size);
    CuAssertTrue(testCase, record != NULL);
    CuAssertTrue(testCase, *(int64_t*)record == k && size == sizeof(int64_t));

    stKVDatabaseBulkResult* res3 = (stKVDatabaseBulkResult*)stList_get(results, 3);
    record = stKVDatabaseBulkResult_getRecord(res3, &size);
    CuAssertTrue(testCase, record != NULL);
    CuAssertTrue(testCase, *(int64_t*)record == l && size == sizeof(int64_t));

    stKVDatabaseBulkResult* res4 = (stKVDatabaseBulkResult*)stList_get(results, 4);
    record = stKVDatabaseBulkResult_getRecord(res4, &size);
    CuAssertTrue(testCase, record != NULL);
    CuAssertTrue(testCase, size == bigRecSize);

    stList_destruct(results);

    //Range query: keys 1..6, i.e. l, the big record, k, i, j, and a miss.
    results = stKVDatabase_bulkGetRecordsRange(database, 1, 6);
    CuAssertTrue(testCase, stList_length(results) == 6);

    res0 = (stKVDatabaseBulkResult*)stList_get(results, 0);
    record = stKVDatabaseBulkResult_getRecord(res0, &size);
    CuAssertTrue(testCase, record != NULL);
    CuAssertTrue(testCase, *(int64_t*)record == l && size == sizeof(int64_t));

    res1 = (stKVDatabaseBulkResult*)stList_get(results, 1);
    record = stKVDatabaseBulkResult_getRecord(res1, &size);
    CuAssertTrue(testCase, record != NULL);
    CuAssertTrue(testCase, size == bigRecSize);

    res2 = (stKVDatabaseBulkResult*)stList_get(results, 2);
    record = stKVDatabaseBulkResult_getRecord(res2, &size);
    CuAssertTrue(testCase, record != NULL);
    CuAssertTrue(testCase, *(int64_t*)record == k && size == sizeof(int64_t));

    res3 = (stKVDatabaseBulkResult*)stList_get(results, 3);
    record = stKVDatabaseBulkResult_getRecord(res3, &size);
    CuAssertTrue(testCase, record != NULL);
    CuAssertTrue(testCase, *(int64_t*)record == i && size == sizeof(int64_t));

    res4 = (stKVDatabaseBulkResult*)stList_get(results, 4);
    record = stKVDatabaseBulkResult_getRecord(res4, &size);
    CuAssertTrue(testCase, record != NULL);
    CuAssertTrue(testCase, *(int64_t*)record == j && size == sizeof(int64_t));

    //Key 6 was never written.
    stKVDatabaseBulkResult* res5 = (stKVDatabaseBulkResult*)stList_get(results, 5);
    record = stKVDatabaseBulkResult_getRecord(res5, &size);
    CuAssertTrue(testCase, record == NULL);

    stList_destruct(results);
    //The key list was previously never released.
    stList_destruct(keys);
    free(m);

    teardown();
}
int main(int argc, char *argv[]) { /* * Open the database. * Construct a flower. * Construct an event tree representing the species tree. * For each sequence contruct two ends each containing an cap. * Make a file for the sequence. * Link the two caps. * Finish! */ int64_t key, j; Group *group; Flower_EndIterator *endIterator; End *end; bool makeEventHeadersAlphaNumeric = 0; /* * Arguments/options */ char * logLevelString = NULL; char * speciesTree = NULL; char * outgroupEvents = NULL; /////////////////////////////////////////////////////////////////////////// // (0) Parse the inputs handed by genomeCactus.py / setup stuff. /////////////////////////////////////////////////////////////////////////// while (1) { static struct option long_options[] = { { "logLevel", required_argument, 0, 'a' }, { "cactusDisk", required_argument, 0, 'b' }, { "speciesTree", required_argument, 0, 'f' }, { "outgroupEvents", required_argument, 0, 'g' }, { "help", no_argument, 0, 'h' }, { "makeEventHeadersAlphaNumeric", no_argument, 0, 'i' }, { 0, 0, 0, 0 } }; int option_index = 0; key = getopt_long(argc, argv, "a:b:f:hg:i", long_options, &option_index); if (key == -1) { break; } switch (key) { case 'a': logLevelString = optarg; break; case 'b': cactusDiskDatabaseString = optarg; break; case 'f': speciesTree = optarg; break; case 'g': outgroupEvents = optarg; break; case 'h': usage(); return 0; case 'i': makeEventHeadersAlphaNumeric = 1; break; default: usage(); return 1; } } /////////////////////////////////////////////////////////////////////////// // (0) Check the inputs. 
/////////////////////////////////////////////////////////////////////////// //assert(logLevelString == NULL || strcmp(logLevelString, "CRITICAL") == 0 || strcmp(logLevelString, "INFO") == 0 || strcmp(logLevelString, "DEBUG") == 0); assert(cactusDiskDatabaseString != NULL); assert(speciesTree != NULL); ////////////////////////////////////////////// //Set up logging ////////////////////////////////////////////// st_setLogLevelFromString(logLevelString); ////////////////////////////////////////////// //Log (some of) the inputs ////////////////////////////////////////////// st_logInfo("Flower disk name : %s\n", cactusDiskDatabaseString); for (j = optind; j < argc; j++) { st_logInfo("Sequence file/directory %s\n", argv[j]); } ////////////////////////////////////////////// //Load the database ////////////////////////////////////////////// stKVDatabaseConf *kvDatabaseConf = kvDatabaseConf = stKVDatabaseConf_constructFromString(cactusDiskDatabaseString); if (stKVDatabaseConf_getType(kvDatabaseConf) == stKVDatabaseTypeTokyoCabinet || stKVDatabaseConf_getType(kvDatabaseConf) == stKVDatabaseTypeKyotoTycoon) { assert(stKVDatabaseConf_getDir(kvDatabaseConf) != NULL); cactusDisk = cactusDisk_construct2(kvDatabaseConf, "cactusSequences"); } else { cactusDisk = cactusDisk_construct(kvDatabaseConf, 1); } st_logInfo("Set up the flower disk\n"); ////////////////////////////////////////////// //Construct the flower ////////////////////////////////////////////// if (cactusDisk_getFlower(cactusDisk, 0) != NULL) { cactusDisk_destruct(cactusDisk); st_logInfo("The first flower already exists\n"); return 0; } flower = flower_construct2(0, cactusDisk); assert(flower_getName(flower) == 0); st_logInfo("Constructed the flower\n"); ////////////////////////////////////////////// //Construct the event tree ////////////////////////////////////////////// st_logInfo("Going to build the event tree with newick string: %s\n", speciesTree); stTree *tree = stTree_parseNewickString(speciesTree); 
st_logInfo("Parsed the tree\n"); if (makeEventHeadersAlphaNumeric) { makeEventHeadersAlphaNumericFn(tree); } stTree_setBranchLength(tree, INT64_MAX); checkBranchLengthsAreDefined(tree); eventTree = eventTree_construct2(flower); //creates the event tree and the root even totalEventNumber = 1; st_logInfo("Constructed the basic event tree\n"); // Construct a set of outgroup names so that ancestral outgroups // get recognized. stSet *outgroupNameSet = stSet_construct3(stHash_stringKey, stHash_stringEqualKey, free); if(outgroupEvents != NULL) { stList *outgroupNames = stString_split(outgroupEvents); for(int64_t i = 0; i < stList_length(outgroupNames); i++) { char *outgroupName = stList_get(outgroupNames, i); stSet_insert(outgroupNameSet, stString_copy(outgroupName)); } stList_destruct(outgroupNames); } //now traverse the tree j = optind; assignEventsAndSequences(eventTree_getRootEvent(eventTree), tree, outgroupNameSet, argv, &j); char *eventTreeString = eventTree_makeNewickString(eventTree); st_logInfo( "Constructed the initial flower with %" PRIi64 " sequences and %" PRIi64 " events with string: %s\n", totalSequenceNumber, totalEventNumber, eventTreeString); assert(event_getSubTreeBranchLength(eventTree_getRootEvent(eventTree)) >= 0.0); free(eventTreeString); //assert(0); ////////////////////////////////////////////// //Label any outgroup events. ////////////////////////////////////////////// if (outgroupEvents != NULL) { stList *outgroupEventsList = stString_split(outgroupEvents); for (int64_t i = 0; i < stList_length(outgroupEventsList); i++) { char *outgroupEvent = makeEventHeadersAlphaNumeric ? 
makeAlphaNumeric(stList_get(outgroupEventsList, i)) : stString_copy(stList_get(outgroupEventsList, i)); Event *event = eventTree_getEventByHeader(eventTree, outgroupEvent); if (event == NULL) { st_errAbort("Got an outgroup string that does not match an event, outgroup string %s", outgroupEvent); } assert(!event_isOutgroup(event)); event_setOutgroupStatus(event, 1); assert(event_isOutgroup(event)); free(outgroupEvent); } stList_destruct(outgroupEventsList); } ////////////////////////////////////////////// //Construct the terminal group. ////////////////////////////////////////////// if (flower_getEndNumber(flower) > 0) { group = group_construct2(flower); endIterator = flower_getEndIterator(flower); while ((end = flower_getNextEnd(endIterator)) != NULL) { end_setGroup(end, group); } flower_destructEndIterator(endIterator); assert(group_isLeaf(group)); // Create a one link chain if there is only one pair of attached ends.. group_constructChainForLink(group); assert(!flower_builtBlocks(flower)); } else { flower_setBuiltBlocks(flower, 1); } /////////////////////////////////////////////////////////////////////////// // Write the flower to disk. /////////////////////////////////////////////////////////////////////////// //flower_check(flower); cactusDisk_write(cactusDisk); st_logInfo("Updated the flower on disk\n"); /////////////////////////////////////////////////////////////////////////// // Cleanup. /////////////////////////////////////////////////////////////////////////// cactusDisk_destruct(cactusDisk); return 0; //Exit without clean up is quicker, enable cleanup when doing memory leak detection. stSet_destruct(outgroupNameSet); stTree_destruct(tree); stKVDatabaseConf_destruct(kvDatabaseConf); return 0; }
/*
 * Takes a simple cycle containing k stub edges and splits it into k cycles,
 * each containing 1 stub edge.
 *
 * The cycle is first broken into its stub-free paths. If there is only one
 * such path the cycle is returned as is (as a copy); otherwise the best two
 * paths are merged, the result is divided into two components, and each
 * component is split recursively.
 *
 * Returns a newly constructed list of lists, built with stList_destruct as
 * its element destructor. None of the argument lists/sets is destructed
 * here.
 */
static stList *splitMultipleStubCycle(stList *cycle, stList *nonZeroWeightAdjacencyEdges,
        stSortedSet *allAdjacencyEdges, stList *stubEdges, stList *chainEdges) {
    /*
     * Get sub-components containing only adjacency and chain edges.
     */
    stSortedSet *stubAndChainEdgesSet = getSetOfMergedLists(stubEdges, chainEdges);
    //Drop the stub and chain edges, leaving just the adjacency edges of the cycle
    stList *adjacencyOnlyEdges = stList_filterToExclude(cycle, stubAndChainEdgesSet);
    //Recombine with the chain edges (but no stub edges) to get the stub free paths
    stList *stubFreePaths = getComponents2(adjacencyOnlyEdges, NULL, chainEdges);
    stList_destruct(adjacencyOnlyEdges);
    assert(stList_length(stubFreePaths) >= 1);

    stList *splitCycles = stList_construct3(0, (void(*)(void *)) stList_destruct); //The list to return.

    /*
     * Base case: nothing to split, hand back a copy of the cycle.
     */
    if (stList_length(stubFreePaths) <= 1) {
        stList_append(splitCycles, stList_copy(cycle, NULL));
        stSortedSet_destruct(stubAndChainEdgesSet);
        stList_destruct(stubFreePaths);
        return splitCycles;
    }

    /*
     * Build the list of adjacency edges acceptable in the merge
     */
    stSortedSet *oddNodes = getOddNodes(cycle);
    stList *oddToEvenNonZeroWeightAdjacencyEdges = getOddToEvenAdjacencyEdges(oddNodes,
            nonZeroWeightAdjacencyEdges);
    stSortedSet *oddToEvenAllAdjacencyEdges = getOddToEvenAdjacencyEdges2(oddNodes, allAdjacencyEdges);

    /*
     * Merge together the best two components.
     */
    stList *pathsToMerge = filterListsToExclude(stubFreePaths, stubAndChainEdgesSet);
    doBestMergeOfTwoSimpleCycles(pathsToMerge, oddToEvenNonZeroWeightAdjacencyEdges,
            oddToEvenAllAdjacencyEdges); //This is inplace.
    stList *mergedEdges = stList_join(pathsToMerge);
    stList_destruct(pathsToMerge);
    stList *subCycles = getComponents2(mergedEdges, stubEdges, chainEdges);
    assert(stList_length(subCycles) == 2);
    stList_destruct(mergedEdges);

    /*
     * Cleanup of the merge helpers
     */
    stSortedSet_destruct(oddNodes);
    stList_destruct(oddToEvenNonZeroWeightAdjacencyEdges);
    stSortedSet_destruct(oddToEvenAllAdjacencyEdges);

    /*
     * Recurse on each of the resulting sub-cycles.
     */
    for (int64_t subCycleIndex = 0; subCycleIndex < stList_length(subCycles); subCycleIndex++) {
        stList *subCycle = stList_get(subCycles, subCycleIndex);

        //Split into adjacency edges, stub edges and chain edges.
        stList *subAdjacencyEdges;
        stList *subStubEdges;
        stList *subChainEdges;
        splitIntoAdjacenciesStubsAndChains(subCycle, nonZeroWeightAdjacencyEdges, stubEdges, chainEdges,
                &subAdjacencyEdges, &subStubEdges, &subChainEdges);

        stList *subSplitCycles = splitMultipleStubCycle(subCycle, subAdjacencyEdges, allAdjacencyEdges,
                subStubEdges, subChainEdges);
        stList_appendAll(splitCycles, subSplitCycles);

        //The inner lists now belong to splitCycles, so only the container of
        //the recursive result is released here.
        stList_setDestructor(subSplitCycles, NULL);
        stList_destruct(subSplitCycles);
        stList_destruct(subAdjacencyEdges);
        stList_destruct(subStubEdges);
        stList_destruct(subChainEdges);
    }
    stList_destruct(subCycles);

    stSortedSet_destruct(stubAndChainEdgesSet);
    stList_destruct(stubFreePaths);

    return splitCycles;
}