/*
 * Writes a "geneMap" element to fileHandle for the given flower.
 *
 * For every gene in the linked list starting at 'gene' (followed through
 * gene->next) and for every cap returned by flower_getThreadStarts() for
 * 'species', a <gene> element with its first <exon> opening tag is printed
 * and mapGene() is called to write the mapping for that cap.
 *
 * flower:     flower whose thread starts are traversed.
 * fileHandle: open output stream that receives the XML.
 * gene:       head of the gene list; may be NULL, in which case only the
 *             enclosing geneMap tags are written.
 * species:    name passed to flower_getThreadStarts() and echoed as the
 *             "target" attribute.
 */
void mapGenes(Flower *flower, FILE *fileHandle, struct bed *gene, char *species){
    //cactusMisc_nameToString hands back a newly allocated string, so it has
    //to be released once it has been logged.
    char *flowerNameString = cactusMisc_nameToString(flower_getName(flower));
    st_logInfo("Flower %s\n", flowerNameString);
    free(flowerNameString);

    printOpeningTag("geneMap", fileHandle);
    fprintf(fileHandle, "\n");
    int level = 0;//Flower level
    while(gene != NULL){
        //Get the start of the target sequence:
        st_logInfo("Gene %s:\n", gene->name);
        //NOTE(review): ownership of the returned list is not visible from
        //here; if flower_getThreadStarts allocates it, it should be released
        //after the loop below - confirm against its definition.
        struct List *capList = flower_getThreadStarts(flower, species);
        for(int i=0; i < capList->length; i++){
            Cap *startCap = capList->list[i];
            char *capNameString = cactusMisc_nameToString(cap_getName(startCap));
            st_logInfo("Cap %d, %s\n", i, capNameString);
            free(capNameString);
            //Traverse cactus and get regions that overlap with exons of the gene, report the involved chains relations
            fprintf(fileHandle, "\t<gene name=\"%s\" target=\"%s\" start=\"%" PRIi64 "\" end=\"%" PRIi64 "\" exonCount=\"%" PRIi64 "\" strand=\"%c\">\n",
                    gene->name, species, gene->chromStart, gene->chromEnd, gene->blockCount, gene->strand[0]);
            //The first exon starts at the gene start and spans the first block size.
            fprintf(fileHandle, "\t\t<exon id=\"0\" start=\"%" PRIi64 "\" end=\"%" PRIi64 "\">\n",
                    gene->chromStart, gene->chromStart + gene->blockSizes->list[0]);
            mapGene(startCap, level, 0, gene, fileHandle);
            fprintf(fileHandle, "\t</gene>\n");
        }
        gene = gene->next;
    }
    printClosingTag("geneMap", fileHandle);
    return;
}
/*
 * Returns the group in the parent flower that has the same name as this
 * flower, or NULL when the flower's parentFlowerName is NULL_NAME.
 * The parent flower is fetched through the flower's cactus disk and is
 * asserted to exist.
 */
Group *flower_getParentGroup(Flower *flower) {
    if (flower->parentFlowerName != NULL_NAME) {
        Flower *parentFlower = cactusDisk_getFlower(flower_getCactusDisk(flower),
                flower->parentFlowerName);
        assert(parentFlower != NULL);
        return flower_getGroup(parentFlower, flower_getName(flower));
    }
    /* No parent flower name recorded, so there is no parent group. */
    return NULL;
}
/*
 * Serialises the flower through writeFn.
 *
 * Output order: the CODE_FLOWER element type, the flower name, the three
 * "built" flags (blocks, trees, faces), the parent flower name, the event
 * tree (only if one is present), then every sequence, end, block, group and
 * chain, and finally a closing CODE_FLOWER element type.
 *
 * flower:  the flower to serialise.
 * writeFn: callback that receives each chunk of the binary representation.
 */
void flower_writeBinaryRepresentation(Flower *flower, void(*writeFn)(const void * ptr, size_t size, size_t count)) {
    /* Header fields. */
    binaryRepresentation_writeElementType(CODE_FLOWER, writeFn);
    binaryRepresentation_writeName(flower_getName(flower), writeFn);
    binaryRepresentation_writeBool(flower_builtBlocks(flower), writeFn);
    binaryRepresentation_writeBool(flower_builtTrees(flower), writeFn);
    binaryRepresentation_writeBool(flower_builtFaces(flower), writeFn);
    binaryRepresentation_writeName(flower->parentFlowerName, writeFn);

    /* The event tree is optional. */
    if (flower_getEventTree(flower) != NULL) {
        eventTree_writeBinaryRepresentation(flower_getEventTree(flower), writeFn);
    }

    /* Sequences. */
    Flower_SequenceIterator *sequenceIt = flower_getSequenceIterator(flower);
    for (Sequence *sequence = flower_getNextSequence(sequenceIt); sequence != NULL;
            sequence = flower_getNextSequence(sequenceIt)) {
        sequence_writeBinaryRepresentation(sequence, writeFn);
    }
    flower_destructSequenceIterator(sequenceIt);

    /* Ends. */
    Flower_EndIterator *endIt = flower_getEndIterator(flower);
    for (End *end = flower_getNextEnd(endIt); end != NULL; end = flower_getNextEnd(endIt)) {
        end_writeBinaryRepresentation(end, writeFn);
    }
    flower_destructEndIterator(endIt);

    /* Blocks. */
    Flower_BlockIterator *blockIt = flower_getBlockIterator(flower);
    for (Block *block = flower_getNextBlock(blockIt); block != NULL; block = flower_getNextBlock(blockIt)) {
        block_writeBinaryRepresentation(block, writeFn);
    }
    flower_destructBlockIterator(blockIt);

    /* Groups. */
    Flower_GroupIterator *groupIt = flower_getGroupIterator(flower);
    for (Group *group = flower_getNextGroup(groupIt); group != NULL; group = flower_getNextGroup(groupIt)) {
        group_writeBinaryRepresentation(group, writeFn);
    }
    flower_destructGroupIterator(groupIt);

    /* Chains. */
    Flower_ChainIterator *chainIt = flower_getChainIterator(flower);
    for (Chain *chain = flower_getNextChain(chainIt); chain != NULL; chain = flower_getNextChain(chainIt)) {
        chain_writeBinaryRepresentation(chain, writeFn);
    }
    flower_destructChainIterator(chainIt);

    /* Closing marker, so a reader does not misinterpret what follows. */
    binaryRepresentation_writeElementType(CODE_FLOWER, writeFn);
}
/*
 * Queues a database request that stores the current state of the flower.
 *
 * The flower is serialised and compressed. If the disk has no record under
 * the flower's name an insert request is queued; otherwise an update request
 * is queued, but only when the new record differs from the cached one.
 * The local buffers are freed before returning.
 */
void cactusDisk_addUpdateRequest(CactusDisk *cactusDisk, Flower *flower) {
    int64_t newSize;
    void *newRecord = binaryRepresentation_makeBinaryRepresentation(flower,
            (void (*)(void *, void (*)(const void * ptr, size_t size, size_t count))) flower_writeBinaryRepresentation,
            &newSize);
    /* Compression */
    newRecord = compress(newRecord, &newSize);

    if (!containsRecord(cactusDisk, flower_getName(flower))) {
        /* Nothing stored under this name yet. */
        stList_append(cactusDisk->updateRequests,
                stKVDatabaseBulkRequest_constructInsertRequest(flower_getName(flower), newRecord, newSize));
    } else {
        int64_t cachedSize;
        void *cachedRecord = stCache_getRecord(cactusDisk->cache, flower_getName(flower), 0, INT64_MAX, &cachedSize);
        /* Only rewrite if the record actually changed. */
        if (!stCache_recordsIdentical(newRecord, newSize, cachedRecord, cachedSize)) {
            stList_append(cactusDisk->updateRequests,
                    stKVDatabaseBulkRequest_constructUpdateRequest(flower_getName(flower), newRecord, newSize));
        }
        free(cachedRecord);
    }
    free(newRecord);
}
/*
 * Adds a "triangle" node for the given flower to the graph being written to
 * fileHandle, scaled by the flower's total base length relative to
 * totalProblemSize, and links it to its parent node when one is given.
 *
 * parentNodeName:   name of the parent node, or NULL for no parent edge.
 * parentEdgeColour: colour used for the edge from the parent.
 */
void makeCactusTree_terminalNode(Flower *flower, FILE *fileHandle, const char *parentNodeName, const char *parentEdgeColour) {
    char *nodeName = cactusMisc_nameToString(flower_getName(flower));

    /* The node size is the fraction of the whole problem this flower covers. */
    double scalingFactor = flower_getTotalBaseLength(flower) / totalProblemSize;
    assert(scalingFactor <= 1.001);
    assert(scalingFactor >= -0.001);

    addNodeToGraph(nodeName, fileHandle, scalingFactor, "triangle", nodeName);

    /* Write in the parent edge. */
    if (parentNodeName != NULL) {
        graphViz_addEdgeToGraph(parentNodeName, nodeName, fileHandle, "", parentEdgeColour, 10, 1, "forward");
    }

    free(nodeName);
}
/*
 * Checks that group_makeNestedFlower turns a leaf group into a non-leaf
 * group, and that the nested flower it creates is named after the group,
 * points back to the group as its parent, and holds a copy of the group's end.
 */
void testGroup_makeNonLeaf(CuTest *testCase) {
    cactusGroupTestSetup();

    /* group2 starts out as a leaf; give it an end and then nest a flower in it. */
    CuAssertTrue(testCase, group_isLeaf(group2));
    end_setGroup(end4, group2);
    group_makeNestedFlower(group2);
    CuAssertTrue(testCase, !group_isLeaf(group2));

    Flower *childFlower = group_getNestedFlower(group2);
    CuAssertTrue(testCase, childFlower != NULL);

    /* These flags are checked on the fixture's flower. */
    CuAssertTrue(testCase, !flower_builtBlocks(flower));
    CuAssertTrue(testCase, !flower_builtTrees(flower));
    CuAssertTrue(testCase, !flower_builtFaces(flower));

    /* Naming and parent linkage of the nested flower. */
    CuAssertTrue(testCase, flower_getName(childFlower) == group_getName(group2));
    CuAssertTrue(testCase, flower_getParentGroup(childFlower) == group2);

    /* The single end of the group must have been copied into the nested flower. */
    CuAssertTrue(testCase, flower_getEndNumber(childFlower) == 1);
    End *copiedEnd = flower_getFirstEnd(childFlower);
    CuAssertTrue(testCase, end_getName(end4) == end_getName(copiedEnd));
    CuAssertTrue(testCase, end_getGroup(copiedEnd) != NULL);

    /* The nested flower has exactly one group and is terminal. */
    CuAssertTrue(testCase, flower_getGroupNumber(childFlower) == 1);
    CuAssertTrue(testCase, flower_isTerminal(childFlower));

    cactusGroupTestTeardown();
}
/*
 * Checks that cactusDisk_getFlower returns the in-memory flower for a given
 * name, and that flowers can still be retrieved by name after the disk has
 * been written, destructed and constructed again.
 */
void testCactusDisk_getFlower(CuTest* testCase) {
    cactusDiskTestSetup();

    Flower *first = flower_construct(cactusDisk);
    Flower *second = flower_construct(cactusDisk);
    CuAssertTrue(testCase, cactusDisk_getFlower(cactusDisk, flower_getName(first)) == first);
    CuAssertTrue(testCase, cactusDisk_getFlower(cactusDisk, flower_getName(second)) == second);

    /* Remember the names, then close the disk and reload it to see if we get the same result. */
    Name firstName = flower_getName(first);
    Name secondName = flower_getName(second);
    cactusDisk_write(cactusDisk);
    cactusDisk_destruct(cactusDisk);
    cactusDisk = cactusDisk_construct(conf, 0);

    first = cactusDisk_getFlower(cactusDisk, firstName);
    second = cactusDisk_getFlower(cactusDisk, secondName);
    CuAssertTrue(testCase, first != NULL);
    CuAssertTrue(testCase, second != NULL);
    CuAssertTrue(testCase, flower_getName(first) == firstName);
    CuAssertTrue(testCase, flower_getName(second) == secondName);

    cactusDiskTestTeardown();
}
int main(int argc, char *argv[]) { /* * Script for adding a reference genome to a flower. */ /* * Arguments/options */ char * logLevelString = NULL; char * cactusDiskDatabaseString = NULL; char * secondaryDatabaseString = NULL; char *referenceEventString = (char *) cactusMisc_getDefaultReferenceEventHeader(); bool bottomUpPhase = 0; /////////////////////////////////////////////////////////////////////////// // (0) Parse the inputs handed by genomeCactus.py / setup stuff. /////////////////////////////////////////////////////////////////////////// while (1) { static struct option long_options[] = { { "logLevel", required_argument, 0, 'a' }, { "cactusDisk", required_argument, 0, 'b' }, { "secondaryDisk", required_argument, 0, 'd' }, { "referenceEventString", required_argument, 0, 'g' }, { "help", no_argument, 0, 'h' }, { "bottomUpPhase", no_argument, 0, 'j' }, { 0, 0, 0, 0 } }; int option_index = 0; int key = getopt_long(argc, argv, "a:b:c:d:e:g:hi:j", long_options, &option_index); if (key == -1) { break; } switch (key) { case 'a': logLevelString = stString_copy(optarg); break; case 'b': cactusDiskDatabaseString = stString_copy(optarg); break; case 'd': secondaryDatabaseString = stString_copy(optarg); break; case 'g': referenceEventString = stString_copy(optarg); break; case 'h': usage(); return 0; case 'j': bottomUpPhase = 1; break; default: usage(); return 1; } } /////////////////////////////////////////////////////////////////////////// // (0) Check the inputs. 
/////////////////////////////////////////////////////////////////////////// assert(cactusDiskDatabaseString != NULL); ////////////////////////////////////////////// //Set up logging ////////////////////////////////////////////// st_setLogLevelFromString(logLevelString); ////////////////////////////////////////////// //Load the database ////////////////////////////////////////////// st_logInfo("referenceEventString = %s\n", referenceEventString); st_logInfo("bottomUpPhase = %i\n", bottomUpPhase); stKVDatabaseConf *kvDatabaseConf = stKVDatabaseConf_constructFromString(cactusDiskDatabaseString); CactusDisk *cactusDisk = cactusDisk_construct(kvDatabaseConf, false, true); stKVDatabaseConf_destruct(kvDatabaseConf); st_logInfo("Set up the flower disk\n"); stKVDatabase *sequenceDatabase = NULL; if (secondaryDatabaseString != NULL) { kvDatabaseConf = stKVDatabaseConf_constructFromString(secondaryDatabaseString); sequenceDatabase = stKVDatabase_construct(kvDatabaseConf, 0); stKVDatabaseConf_destruct(kvDatabaseConf); } FlowerStream *flowerStream = flowerWriter_getFlowerStream(cactusDisk, stdin); Flower *flower; while ((flower = flowerStream_getNext(flowerStream)) != NULL) { st_logDebug("Processing flower %" PRIi64 "\n", flower_getName(flower)); /////////////////////////////////////////////////////////////////////////// // Get the appropriate event names /////////////////////////////////////////////////////////////////////////// st_logInfo("%s\n", eventTree_makeNewickString(flower_getEventTree(flower))); Event *referenceEvent = eventTree_getEventByHeader(flower_getEventTree(flower), referenceEventString); if (referenceEvent == NULL) { st_errAbort("Reference event %s not found in tree. 
Check your " "--referenceEventString option", referenceEventString); } Name referenceEventName = event_getName(referenceEvent); /////////////////////////////////////////////////////////////////////////// // Now do bottom up or top down, depending /////////////////////////////////////////////////////////////////////////// stList *flowers = stList_construct(); stList_append(flowers, flower); preCacheNestedFlowers(cactusDisk, flowers); if (bottomUpPhase) { assert(sequenceDatabase != NULL); cactusDisk_preCacheSegmentStrings(cactusDisk, flowers); bottomUp(flowers, sequenceDatabase, referenceEventName, !flower_hasParentGroup(flower), generateJukesCantorMatrix); // Unload the nested flowers to save memory. They haven't // been changed, so we don't write them to the cactus // disk. Flower_GroupIterator *groupIt = flower_getGroupIterator(flower); Group *group; while ((group = flower_getNextGroup(groupIt)) != NULL) { if (!group_isLeaf(group)) { flower_unload(group_getNestedFlower(group)); } } flower_destructGroupIterator(groupIt); assert(!flower_isParentLoaded(flower)); // Write this flower to disk. cactusDisk_addUpdateRequest(cactusDisk, flower); } else { topDown(flower, referenceEventName); // We've changed the nested flowers, but not this // flower. We write the nested flowers to disk, then // unload them to save memory. This flower will be // unloaded by the flower-stream code. Flower_GroupIterator *groupIt = flower_getGroupIterator(flower); Group *group; while ((group = flower_getNextGroup(groupIt)) != NULL) { if (!group_isLeaf(group)) { cactusDisk_addUpdateRequest(cactusDisk, group_getNestedFlower(group)); flower_unload(group_getNestedFlower(group)); } } flower_destructGroupIterator(groupIt); } stList_destruct(flowers); } /////////////////////////////////////////////////////////////////////////// // Write the flower(s) back to disk. 
/////////////////////////////////////////////////////////////////////////// cactusDisk_write(cactusDisk); st_logInfo("Updated the flower on disk\n"); /////////////////////////////////////////////////////////////////////////// //Clean up. /////////////////////////////////////////////////////////////////////////// if (sequenceDatabase != NULL) { stKVDatabase_destruct(sequenceDatabase); } cactusDisk_destruct(cactusDisk); return 0; //Exit without clean up is quicker, enable cleanup when doing memory leak detection. free(cactusDiskDatabaseString); free(referenceEventString); free(logLevelString); st_logInfo("Cleaned stuff up and am finished\n"); return 0; }
int main(int argc, char *argv[]) { st_setLogLevelFromString(argv[1]); st_logDebug("Set up logging\n"); stKVDatabaseConf *kvDatabaseConf = stKVDatabaseConf_constructFromString(argv[2]); CactusDisk *cactusDisk = cactusDisk_construct(kvDatabaseConf, 0); stKVDatabaseConf_destruct(kvDatabaseConf); st_logDebug("Set up the flower disk\n"); Name flowerName = cactusMisc_stringToName(argv[3]); Flower *flower = cactusDisk_getFlower(cactusDisk, flowerName); int64_t totalBases = flower_getTotalBaseLength(flower); int64_t totalEnds = flower_getEndNumber(flower); int64_t totalFreeEnds = flower_getFreeStubEndNumber(flower); int64_t totalAttachedEnds = flower_getAttachedStubEndNumber(flower); int64_t totalCaps = flower_getCapNumber(flower); int64_t totalBlocks = flower_getBlockNumber(flower); int64_t totalGroups = flower_getGroupNumber(flower); int64_t totalChains = flower_getChainNumber(flower); int64_t totalLinkGroups = 0; int64_t maxEndDegree = 0; int64_t maxAdjacencyLength = 0; int64_t totalEdges = 0; Flower_EndIterator *endIt = flower_getEndIterator(flower); End *end; while((end = flower_getNextEnd(endIt)) != NULL) { assert(end_getOrientation(end)); if(end_getInstanceNumber(end) > maxEndDegree) { maxEndDegree = end_getInstanceNumber(end); } stSortedSet *ends = stSortedSet_construct(); End_InstanceIterator *capIt = end_getInstanceIterator(end); Cap *cap; while((cap = end_getNext(capIt)) != NULL) { if(cap_getSequence(cap) != NULL) { Cap *adjacentCap = cap_getAdjacency(cap); assert(adjacentCap != NULL); End *adjacentEnd = end_getPositiveOrientation(cap_getEnd(adjacentCap)); stSortedSet_insert(ends, adjacentEnd); int64_t adjacencyLength = cap_getCoordinate(cap) - cap_getCoordinate(adjacentCap); if(adjacencyLength < 0) { adjacencyLength *= -1; } assert(adjacencyLength >= 1); if(adjacencyLength >= maxAdjacencyLength) { maxAdjacencyLength = adjacencyLength; } } } end_destructInstanceIterator(capIt); totalEdges += stSortedSet_size(ends); if(stSortedSet_search(ends, end) != NULL) { 
//This ensures we count self edges twice, so that the division works. totalEdges += 1; } stSortedSet_destruct(ends); } assert(totalEdges % 2 == 0); flower_destructEndIterator(endIt); Flower_GroupIterator *groupIt = flower_getGroupIterator(flower); Group *group; while((group = flower_getNextGroup(groupIt)) != NULL) { if(group_getLink(group) != NULL) { totalLinkGroups++; } } flower_destructGroupIterator(groupIt); printf("flower name: %" PRIi64 " total bases: %" PRIi64 " total-ends: %" PRIi64 " total-caps: %" PRIi64 " max-end-degree: %" PRIi64 " max-adjacency-length: %" PRIi64 " total-blocks: %" PRIi64 " total-groups: %" PRIi64 " total-edges: %" PRIi64 " total-free-ends: %" PRIi64 " total-attached-ends: %" PRIi64 " total-chains: %" PRIi64 " total-link groups: %" PRIi64 "\n", flower_getName(flower), totalBases, totalEnds, totalCaps, maxEndDegree, maxAdjacencyLength, totalBlocks, totalGroups, totalEdges/2, totalFreeEnds, totalAttachedEnds, totalChains, totalLinkGroups); return 0; }
/*
 * Checks that the fixture's group and its nested flower share the same name.
 */
void testGroup_getNestedFlowerName(CuTest* testCase) {
    cactusGroupTestSetup();

    /* A nested flower is named after the group that contains it. */
    CuAssertTrue(testCase,
            group_getName(group) == flower_getName(nestedFlower));

    cactusGroupTestTeardown();
}
/*
 * Records the flower that owns 'group' as the parent of 'flower' by storing
 * that flower's name in flower->parentFlowerName.
 *
 * An existing parent name may be overwritten: parents can change when parent
 * flowers are merged, so no check is made that the field was unset.
 */
void flower_setParentGroup(Flower *flower, Group *group) {
    Flower *parentFlower = group_getFlower(group);
    flower->parentFlowerName = flower_getName(parentFlower);
}
/*
 * Checks that the fixture's flower has a non-null name and that looking that
 * name up on the cactus disk returns the same flower object.
 */
void testFlower_getName(CuTest* testCase) {
    cactusFlowerTestSetup();

    /* The name must be set... */
    CuAssertTrue(testCase, flower_getName(flower) != NULL_NAME);

    /* ...and must resolve back to this very flower. */
    CuAssertTrue(testCase,
            cactusDisk_getFlower(cactusDisk, flower_getName(flower)) == flower);

    cactusFlowerTestTeardown();
}
/*
 * Simple test that flower_removeIfRedundant removes redundant flowers.
 *
 * Builds a hierarchy root -> flower2 -> flower3 -> flower4, where flower2
 * also has a second, leaf group. It then checks that the root and the leaf
 * flower are not removed, and that removing flower2 and then flower3
 * re-attaches their children and ends to the root.
 */
void testFlower_removeIfRedundant(CuTest *testCase) {
    cactusFlowerTestSetup();
    endsSetup();

    /* A redundant flower hanging directly off the root; both ends go in its group. */
    Flower *flower2 = flower_construct(cactusDisk);
    Group *rootGroup = group_construct(flower, flower2);
    end_setGroup(end, rootGroup);
    end_setGroup(end2, rootGroup);

    /* Below flower2: one nested flower... */
    Flower *flower3 = flower_construct(cactusDisk);
    group_construct(flower2, flower3);

    /* ...and one extra group without a nested flower. */
    Group *leafGroup = group_construct2(flower2);

    /* One more flower at the bottom of the hierarchy. */
    Flower *flower4 = flower_construct(cactusDisk);
    group_construct(flower3, flower4);

    /* Copy the ends down into the nested flowers. */
    end_copyConstruct(end, flower2);
    end_copyConstruct(end2, flower2);
    end_copyConstruct(end, flower3);
    end_setGroup(flower_getEnd(flower2, end_getName(end2)), leafGroup);
    end_copyConstruct(end, flower4);

    /* Write everything to disk. */
    cactusDisk_write(cactusDisk);

    /* The bottom flower must not be removed... */
    CuAssertTrue(testCase, !flower_removeIfRedundant(flower4));
    /* ...and neither must the root. */
    CuAssertTrue(testCase, !flower_removeIfRedundant(flower));

    /* --- Removal of flower2 --- */
    /* State before. */
    CuAssertTrue(testCase, flower_getGroupNumber(flower) == 1);
    CuAssertTrue(testCase, group_getFlower(flower_getParentGroup(flower2)) == flower);
    CuAssertTrue(testCase, flower_removeIfRedundant(flower2));

    /* State after: flower/group connections. */
    CuAssertTrue(testCase, flower_getGroupNumber(flower) == 2);
    CuAssertTrue(testCase, !flower_isLeaf(flower));
    CuAssertTrue(testCase, group_getFlower(flower_getParentGroup(flower3)) == flower);
    leafGroup = end_getGroup(end2);
    CuAssertTrue(testCase, group_getFlower(leafGroup) == flower);
    CuAssertTrue(testCase, group_isLeaf(leafGroup));
    CuAssertTrue(testCase, flower_getGroup(flower, flower_getName(flower3)) == flower_getParentGroup(flower3));

    /* State after: ends. */
    CuAssertTrue(testCase, flower_getEndNumber(flower) == 2);
    CuAssertTrue(testCase, flower_getEndNumber(flower3) == 1);
    CuAssertTrue(testCase, group_getEndNumber(leafGroup) == 1);
    CuAssertTrue(testCase, end_getGroup(end) == flower_getParentGroup(flower3));
    CuAssertTrue(testCase, end_getGroup(end2) == leafGroup);
    CuAssertTrue(testCase, flower_getEnd(flower3, end_getName(end)) != NULL);

    /* The child of flower3 must be untouched. */
    CuAssertTrue(testCase, group_getFlower(flower_getParentGroup(flower4)) == flower3);

    /* --- Removal of flower3 --- */
    CuAssertTrue(testCase, !flower_removeIfRedundant(flower));
    CuAssertTrue(testCase, !flower_removeIfRedundant(flower4));
    CuAssertTrue(testCase, flower_removeIfRedundant(flower3));

    /* Groups again. */
    CuAssertTrue(testCase, flower_getGroupNumber(flower) == 2);
    CuAssertTrue(testCase, !flower_isLeaf(flower));
    CuAssertTrue(testCase, group_getFlower(flower_getParentGroup(flower4)) == flower);
    CuAssertTrue(testCase, group_getFlower(leafGroup) == flower);
    CuAssertTrue(testCase, flower_getGroup(flower, flower_getName(flower4)) == flower_getParentGroup(flower4));

    /* Ends again. */
    CuAssertTrue(testCase, flower_getEndNumber(flower) == 2);
    CuAssertTrue(testCase, flower_getEndNumber(flower4) == 1);
    CuAssertTrue(testCase, group_getEndNumber(leafGroup) == 1);
    CuAssertTrue(testCase, end_getGroup(end) == flower_getParentGroup(flower4));
    CuAssertTrue(testCase, end_getGroup(end2) == leafGroup);
    CuAssertTrue(testCase, flower_getEnd(flower4, end_getName(end)) != NULL);

    cactusFlowerTestTeardown();
}
/*
 * Writes the graph nodes and edges for a flower and, recursively, for the
 * flowers nested in its tangle groups.
 *
 * A terminal flower is delegated to makeCactusTree_terminalNode. Otherwise an
 * "ellipse" node is written for the flower, its chains are written, and, if
 * the flower has at least one tangle group, a "diamond" node is added below
 * the flower with every tangle group's nested flower hanging off it.
 *
 * parentNodeName:   name of the parent node, or NULL for no parent edge.
 * parentEdgeColour: colour used for the edge from the parent.
 */
void makeCactusTree_flower(Flower *flower, FILE *fileHandle, const char *parentNodeName, const char *parentEdgeColour) {
    if (flower_isTerminal(flower)) {
        makeCactusTree_terminalNode(flower, fileHandle, parentNodeName, parentEdgeColour);
        return;
    }

    /* The flower's own node. */
    char *nodeName = cactusMisc_nameToString(flower_getName(flower));
    const char *edgeColour = graphViz_getColour();
    addNodeToGraph(nodeName, fileHandle, flower_getTotalBaseLength(flower) / totalProblemSize, "ellipse", nodeName);

    /* Edge from the parent, if there is one. */
    if (parentNodeName != NULL) {
        graphViz_addEdgeToGraph(parentNodeName, nodeName, fileHandle, "", parentEdgeColour, 10, 1, "forward");
    }

    /* The chains. */
    Flower_ChainIterator *chainIt = flower_getChainIterator(flower);
    for (Chain *chain = flower_getNextChain(chainIt); chain != NULL; chain = flower_getNextChain(chainIt)) {
        makeCactusTree_chain(chain, fileHandle, nodeName, edgeColour);
    }
    flower_destructChainIterator(chainIt);

    /* Name of the diamond node: the flower's name prefixed with 'z' (+1 for the prefix, +1 for the terminator). */
    char *diamondName = st_malloc(sizeof(char) * (strlen(nodeName) + 2));
    sprintf(diamondName, "z%s", nodeName);
    const char *diamondEdgeColour = graphViz_getColour();

    /* First pass over the groups: total size and number of the tangle groups. */
    Flower_GroupIterator *groupIt = flower_getGroupIterator(flower);
    Group *group;
    double tangleBaseLength = 0.0;
    int64_t tangleCount = 0;
    while ((group = flower_getNextGroup(groupIt)) != NULL) {
        assert(!group_isLeaf(group));
        if (group_isTangle(group)) {
            tangleBaseLength += group_getTotalBaseLength(group);
            tangleCount++;
        }
    }
    flower_destructGroupIterator(groupIt);

    if (tangleCount != 0) {
        /* The diamond node organises all the tangle groups of this flower. */
        addNodeToGraph(diamondName, fileHandle, tangleBaseLength / totalProblemSize, "diamond", "");
        graphViz_addEdgeToGraph(nodeName, diamondName, fileHandle, "", edgeColour, 10, 1, "forward");

        /* Second pass: recurse into the nested flower of every tangle group. */
        groupIt = flower_getGroupIterator(flower);
        while ((group = flower_getNextGroup(groupIt)) != NULL) {
            if (group_isTangle(group)) {
                assert(!group_isLeaf(group));
                makeCactusTree_flower(group_getNestedFlower(group), fileHandle, diamondName, diamondEdgeColour);
            }
        }
        flower_destructGroupIterator(groupIt);
    } else {
        /* A non-terminal flower without tangle groups is expected to have no parent group. */
        assert(flower_getParentGroup(flower) == 0);
    }

    free(nodeName);
    free(diamondName);
}
static int cactusDisk_constructFlowersP(const void *o1, const void *o2) { return cactusMisc_nameCompare(flower_getName((Flower *) o1), flower_getName((Flower *) o2)); }
int main(int argc, char *argv[]) { /* * Script for adding alignments to cactus tree. */ int64_t startTime; stKVDatabaseConf *kvDatabaseConf; CactusDisk *cactusDisk; int key, k; bool (*filterFn)(stPinchSegment *, stPinchSegment *) = NULL; stSet *outgroupThreads = NULL; /* * Arguments/options */ char * logLevelString = NULL; char * alignmentsFile = NULL; char * constraintsFile = NULL; char * cactusDiskDatabaseString = NULL; char * lastzArguments = ""; int64_t minimumSequenceLengthForBlast = 1; //Parameters for annealing/melting rounds int64_t *annealingRounds = NULL; int64_t annealingRoundsLength = 0; int64_t *meltingRounds = NULL; int64_t meltingRoundsLength = 0; //Parameters for melting float maximumAdjacencyComponentSizeRatio = 10; int64_t blockTrim = 0; int64_t alignmentTrimLength = 0; int64_t *alignmentTrims = NULL; int64_t chainLengthForBigFlower = 1000000; int64_t longChain = 2; int64_t minLengthForChromosome = 1000000; float proportionOfUnalignedBasesForNewChromosome = 0.8; bool breakChainsAtReverseTandems = 1; int64_t maximumMedianSequenceLengthBetweenLinkedEnds = INT64_MAX; bool realign = 0; char *realignArguments = ""; bool removeRecoverableChains = false; bool (*recoverableChainsFilter)(stCactusEdgeEnd *, Flower *) = NULL; int64_t maxRecoverableChainsIterations = 1; int64_t maxRecoverableChainLength = INT64_MAX; //Parameters for removing ancient homologies bool doPhylogeny = false; int64_t phylogenyNumTrees = 1; enum stCaf_RootingMethod phylogenyRootingMethod = BEST_RECON; enum stCaf_ScoringMethod phylogenyScoringMethod = COMBINED_LIKELIHOOD; double breakpointScalingFactor = 1.0; bool phylogenySkipSingleCopyBlocks = 0; int64_t phylogenyMaxBaseDistance = 1000; int64_t phylogenyMaxBlockDistance = 100; bool phylogenyKeepSingleDegreeBlocks = 0; stList *phylogenyTreeBuildingMethods = stList_construct(); enum stCaf_TreeBuildingMethod defaultMethod = GUIDED_NEIGHBOR_JOINING; stList_append(phylogenyTreeBuildingMethods, &defaultMethod); double 
phylogenyCostPerDupPerBase = 0.2; double phylogenyCostPerLossPerBase = 0.2; const char *debugFileName = NULL; const char *referenceEventHeader = NULL; double phylogenyDoSplitsWithSupportHigherThanThisAllAtOnce = 1.0; int64_t numTreeBuildingThreads = 2; int64_t minimumBlockDegreeToCheckSupport = 10; double minimumBlockHomologySupport = 0.7; double nucleotideScalingFactor = 1.0; HomologyUnitType phylogenyHomologyUnitType = BLOCK; enum stCaf_DistanceCorrectionMethod phylogenyDistanceCorrectionMethod = JUKES_CANTOR; bool sortAlignments = false; /////////////////////////////////////////////////////////////////////////// // (0) Parse the inputs handed by genomeCactus.py / setup stuff. /////////////////////////////////////////////////////////////////////////// while (1) { static struct option long_options[] = { { "logLevel", required_argument, 0, 'a' }, { "alignments", required_argument, 0, 'b' }, { "cactusDisk", required_argument, 0, 'c' }, { "lastzArguments", required_argument, 0, 'd' }, { "help", no_argument, 0, 'h' }, { "annealingRounds", required_argument, 0, 'i' }, { "trim", required_argument, 0, 'k' }, { "trimChange", required_argument, 0, 'l', }, { "minimumTreeCoverage", required_argument, 0, 'm' }, { "blockTrim", required_argument, 0, 'n' }, { "deannealingRounds", required_argument, 0, 'o' }, { "minimumDegree", required_argument, 0, 'p' }, { "minimumIngroupDegree", required_argument, 0, 'q' }, { "minimumOutgroupDegree", required_argument, 0, 'r' }, { "alignmentFilter", required_argument, 0, 't' }, { "minimumSequenceLengthForBlast", required_argument, 0, 'v' }, { "maxAdjacencyComponentSizeRatio", required_argument, 0, 'w' }, { "constraints", required_argument, 0, 'x' }, { "minLengthForChromosome", required_argument, 0, 'y' }, { "proportionOfUnalignedBasesForNewChromosome", required_argument, 0, 'z' }, { "maximumMedianSequenceLengthBetweenLinkedEnds", required_argument, 0, 'A' }, { "realign", no_argument, 0, 'B' }, { "realignArguments", required_argument, 0, 'C' }, 
{ "phylogenyNumTrees", required_argument, 0, 'D' }, { "phylogenyRootingMethod", required_argument, 0, 'E' }, { "phylogenyScoringMethod", required_argument, 0, 'F' }, { "phylogenyBreakpointScalingFactor", required_argument, 0, 'G' }, { "phylogenySkipSingleCopyBlocks", no_argument, 0, 'H' }, { "phylogenyMaxBaseDistance", required_argument, 0, 'I' }, { "phylogenyMaxBlockDistance", required_argument, 0, 'J' }, { "phylogenyDebugFile", required_argument, 0, 'K' }, { "phylogenyKeepSingleDegreeBlocks", no_argument, 0, 'L' }, { "phylogenyTreeBuildingMethod", required_argument, 0, 'M' }, { "phylogenyCostPerDupPerBase", required_argument, 0, 'N' }, { "phylogenyCostPerLossPerBase", required_argument, 0, 'O' }, { "referenceEventHeader", required_argument, 0, 'P' }, { "phylogenyDoSplitsWithSupportHigherThanThisAllAtOnce", required_argument, 0, 'Q' }, { "numTreeBuildingThreads", required_argument, 0, 'R' }, { "phylogeny", no_argument, 0, 'S' }, { "minimumBlockHomologySupport", required_argument, 0, 'T' }, { "phylogenyNucleotideScalingFactor", required_argument, 0, 'U' }, { "minimumBlockDegreeToCheckSupport", required_argument, 0, 'V' }, { "removeRecoverableChains", required_argument, 0, 'W' }, { "minimumNumberOfSpecies", required_argument, 0, 'X' }, { "phylogenyHomologyUnitType", required_argument, 0, 'Y' }, { "phylogenyDistanceCorrectionMethod", required_argument, 0, 'Z' }, { "maxRecoverableChainsIterations", required_argument, 0, '1' }, { "maxRecoverableChainLength", required_argument, 0, '2' }, { 0, 0, 0, 0 } }; int option_index = 0; key = getopt_long(argc, argv, "a:b:c:hi:k:m:n:o:p:q:r:stv:w:x:y:z:A:BC:D:E:", long_options, &option_index); if (key == -1) { break; } switch (key) { case 'a': logLevelString = stString_copy(optarg); st_setLogLevelFromString(logLevelString); break; case 'b': alignmentsFile = stString_copy(optarg); break; case 'c': cactusDiskDatabaseString = stString_copy(optarg); break; case 'd': lastzArguments = stString_copy(optarg); break; case 'h': usage(); 
return 0; case 'i': annealingRounds = getInts(optarg, &annealingRoundsLength); break; case 'o': meltingRounds = getInts(optarg, &meltingRoundsLength); break; case 'k': alignmentTrims = getInts(optarg, &alignmentTrimLength); break; case 'm': k = sscanf(optarg, "%f", &minimumTreeCoverage); assert(k == 1); break; case 'n': k = sscanf(optarg, "%" PRIi64 "", &blockTrim); assert(k == 1); break; case 'p': k = sscanf(optarg, "%" PRIi64 "", &minimumDegree); assert(k == 1); break; case 'q': k = sscanf(optarg, "%" PRIi64 "", &minimumIngroupDegree); assert(k == 1); break; case 'r': k = sscanf(optarg, "%" PRIi64 "", &minimumOutgroupDegree); assert(k == 1); break; case 't': if (strcmp(optarg, "singleCopyOutgroup") == 0) { sortAlignments = true; filterFn = stCaf_filterByOutgroup; } else if (strcmp(optarg, "relaxedSingleCopyOutgroup") == 0) { sortAlignments = true; filterFn = stCaf_relaxedFilterByOutgroup; } else if (strcmp(optarg, "singleCopy") == 0) { sortAlignments = true; filterFn = stCaf_filterByRepeatSpecies; } else if (strcmp(optarg, "relaxedSingleCopy") == 0) { sortAlignments = true; filterFn = stCaf_relaxedFilterByRepeatSpecies; } else if (strcmp(optarg, "singleCopyChr") == 0) { sortAlignments = true; filterFn = stCaf_singleCopyChr; } else if (strcmp(optarg, "singleCopyIngroup") == 0) { sortAlignments = true; filterFn = stCaf_singleCopyIngroup; } else if (strcmp(optarg, "relaxedSingleCopyIngroup") == 0) { sortAlignments = true; filterFn = stCaf_relaxedSingleCopyIngroup; } else if (strcmp(optarg, "none") == 0) { sortAlignments = false; filterFn = NULL; } else { st_errAbort("Could not recognize alignmentFilter option %s", optarg); } break; case 'v': k = sscanf(optarg, "%" PRIi64 "", &minimumSequenceLengthForBlast); assert(k == 1); break; case 'w': k = sscanf(optarg, "%f", &maximumAdjacencyComponentSizeRatio); assert(k == 1); break; case 'x': constraintsFile = stString_copy(optarg); break; case 'y': k = sscanf(optarg, "%" PRIi64 "", &minLengthForChromosome); assert(k == 1); 
break; case 'z': k = sscanf(optarg, "%f", &proportionOfUnalignedBasesForNewChromosome); assert(k == 1); break; case 'A': k = sscanf(optarg, "%" PRIi64 "", &maximumMedianSequenceLengthBetweenLinkedEnds); assert(k == 1); break; case 'B': realign = 1; break; case 'C': realignArguments = stString_copy(optarg); break; case 'D': k = sscanf(optarg, "%" PRIi64, &phylogenyNumTrees); assert(k == 1); break; case 'E': if (!strcmp(optarg, "outgroupBranch")) { phylogenyRootingMethod = OUTGROUP_BRANCH; } else if (!strcmp(optarg, "longestBranch")) { phylogenyRootingMethod = LONGEST_BRANCH; } else if (!strcmp(optarg, "bestRecon")) { phylogenyRootingMethod = BEST_RECON; } else { st_errAbort("Invalid tree rooting method: %s", optarg); } break; case 'F': if (!strcmp(optarg, "reconCost")) { phylogenyScoringMethod = RECON_COST; } else if (!strcmp(optarg, "nucLikelihood")) { phylogenyScoringMethod = NUCLEOTIDE_LIKELIHOOD; } else if (!strcmp(optarg, "reconLikelihood")) { phylogenyScoringMethod = RECON_LIKELIHOOD; } else if (!strcmp(optarg, "combinedLikelihood")) { phylogenyScoringMethod = COMBINED_LIKELIHOOD; } else { st_errAbort("Invalid tree scoring method: %s", optarg); } break; case 'G': k = sscanf(optarg, "%lf", &breakpointScalingFactor); assert(k == 1); break; case 'H': phylogenySkipSingleCopyBlocks = true; break; case 'I': k = sscanf(optarg, "%" PRIi64, &phylogenyMaxBaseDistance); assert(k == 1); break; case 'J': k = sscanf(optarg, "%" PRIi64, &phylogenyMaxBlockDistance); assert(k == 1); break; case 'K': debugFileName = stString_copy(optarg); break; case 'L': phylogenyKeepSingleDegreeBlocks = true; break; case 'M': // clear the default setting of the list stList_destruct(phylogenyTreeBuildingMethods); phylogenyTreeBuildingMethods = stList_construct(); stList *methodStrings = stString_splitByString(optarg, ","); for (int64_t i = 0; i < stList_length(methodStrings); i++) { char *methodString = stList_get(methodStrings, i); enum stCaf_TreeBuildingMethod *method = st_malloc(sizeof(enum 
stCaf_TreeBuildingMethod)); if (strcmp(methodString, "neighborJoining") == 0) { *method = NEIGHBOR_JOINING; } else if (strcmp(methodString, "guidedNeighborJoining") == 0) { *method = GUIDED_NEIGHBOR_JOINING; } else if (strcmp(methodString, "splitDecomposition") == 0) { *method = SPLIT_DECOMPOSITION; } else if (strcmp(methodString, "strictSplitDecomposition") == 0) { *method = STRICT_SPLIT_DECOMPOSITION; } else if (strcmp(methodString, "removeBadChains") == 0) { *method = REMOVE_BAD_CHAINS; } else { st_errAbort("Unknown tree building method: %s", methodString); } stList_append(phylogenyTreeBuildingMethods, method); } stList_destruct(methodStrings); break; case 'N': k = sscanf(optarg, "%lf", &phylogenyCostPerDupPerBase); assert(k == 1); break; case 'O': k = sscanf(optarg, "%lf", &phylogenyCostPerLossPerBase); assert(k == 1); break; case 'P': referenceEventHeader = stString_copy(optarg); break; case 'Q': k = sscanf(optarg, "%lf", &phylogenyDoSplitsWithSupportHigherThanThisAllAtOnce); assert(k == 1); break; case 'R': k = sscanf(optarg, "%" PRIi64, &numTreeBuildingThreads); assert(k == 1); break; case 'S': doPhylogeny = true; break; case 'T': k = sscanf(optarg, "%lf", &minimumBlockHomologySupport); assert(k == 1); assert(minimumBlockHomologySupport <= 1.0); assert(minimumBlockHomologySupport >= 0.0); break; case 'U': k = sscanf(optarg, "%lf", &nucleotideScalingFactor); assert(k == 1); break; case 'V': k = sscanf(optarg, "%" PRIi64, &minimumBlockDegreeToCheckSupport); assert(k == 1); break; case 'W': if (strcmp(optarg, "1") == 0) { removeRecoverableChains = true; recoverableChainsFilter = NULL; } else if (strcmp(optarg, "unequalNumberOfIngroupCopies") == 0) { removeRecoverableChains = true; recoverableChainsFilter = stCaf_chainHasUnequalNumberOfIngroupCopies; } else if (strcmp(optarg, "unequalNumberOfIngroupCopiesOrNoOutgroup") == 0) { removeRecoverableChains = true; recoverableChainsFilter = stCaf_chainHasUnequalNumberOfIngroupCopiesOrNoOutgroup; } else if 
(strcmp(optarg, "0") == 0) { removeRecoverableChains = false; } else { st_errAbort("Could not parse removeRecoverableChains argument"); } break; case 'X': k = sscanf(optarg, "%" PRIi64, &minimumNumberOfSpecies); if (k != 1) { st_errAbort("Error parsing the minimumNumberOfSpecies argument"); } break; case 'Y': if (strcmp(optarg, "chain") == 0) { phylogenyHomologyUnitType = CHAIN; } else if (strcmp(optarg, "block") == 0) { phylogenyHomologyUnitType = BLOCK; } else { st_errAbort("Could not parse the phylogenyHomologyUnitType argument"); } break; case 'Z': if (strcmp(optarg, "jukesCantor") == 0) { phylogenyDistanceCorrectionMethod = JUKES_CANTOR; } else if (strcmp(optarg, "none") == 0 ) { phylogenyDistanceCorrectionMethod = NONE; } else { st_errAbort("Could not parse the phylogenyDistanceCorrectionMethod argument"); } break; case '1': k = sscanf(optarg, "%" PRIi64, &maxRecoverableChainsIterations); if (k != 1) { st_errAbort("Error parsing the maxRecoverableChainsIterations argument"); } break; case '2': k = sscanf(optarg, "%" PRIi64, &maxRecoverableChainLength); if (k != 1) { st_errAbort("Error parsing the maxRecoverableChainLength argument"); } break; default: usage(); return 1; } } /////////////////////////////////////////////////////////////////////////// // (0) Check the inputs. 
/////////////////////////////////////////////////////////////////////////// assert(cactusDiskDatabaseString != NULL); assert(minimumTreeCoverage >= 0.0); assert(minimumTreeCoverage <= 1.0); assert(blockTrim >= 0); assert(annealingRoundsLength >= 0); for (int64_t i = 0; i < annealingRoundsLength; i++) { assert(annealingRounds[i] >= 0); } assert(meltingRoundsLength >= 0); for (int64_t i = 1; i < meltingRoundsLength; i++) { assert(meltingRounds[i - 1] < meltingRounds[i]); assert(meltingRounds[i - 1] >= 1); } assert(alignmentTrimLength >= 0); for (int64_t i = 0; i < alignmentTrimLength; i++) { assert(alignmentTrims[i] >= 0); } assert(minimumOutgroupDegree >= 0); assert(minimumIngroupDegree >= 0); ////////////////////////////////////////////// //Set up logging ////////////////////////////////////////////// st_setLogLevelFromString(logLevelString); ////////////////////////////////////////////// //Log (some of) the inputs ////////////////////////////////////////////// st_logInfo("Flower disk name : %s\n", cactusDiskDatabaseString); ////////////////////////////////////////////// //Load the database ////////////////////////////////////////////// kvDatabaseConf = stKVDatabaseConf_constructFromString(cactusDiskDatabaseString); cactusDisk = cactusDisk_construct(kvDatabaseConf, 0); st_logInfo("Set up the flower disk\n"); /////////////////////////////////////////////////////////////////////////// // Sort the constraints /////////////////////////////////////////////////////////////////////////// stPinchIterator *pinchIteratorForConstraints = NULL; if (constraintsFile != NULL) { pinchIteratorForConstraints = stPinchIterator_constructFromFile(constraintsFile); st_logInfo("Created an iterator for the alignment constaints from file: %s\n", constraintsFile); } /////////////////////////////////////////////////////////////////////////// // Do the alignment /////////////////////////////////////////////////////////////////////////// startTime = time(NULL); stList *flowers = 
flowerWriter_parseFlowersFromStdin(cactusDisk); if (alignmentsFile == NULL) { cactusDisk_preCacheStrings(cactusDisk, flowers); } char *tempFile1 = NULL; for (int64_t i = 0; i < stList_length(flowers); i++) { flower = stList_get(flowers, i); if (!flower_builtBlocks(flower)) { // Do nothing if the flower already has defined blocks st_logDebug("Processing flower: %lli\n", flower_getName(flower)); stCaf_setFlowerForAlignmentFiltering(flower); //Set up the graph and add the initial alignments stPinchThreadSet *threadSet = stCaf_setup(flower); //Build the set of outgroup threads outgroupThreads = stCaf_getOutgroupThreads(flower, threadSet); //Setup the alignments stPinchIterator *pinchIterator; stList *alignmentsList = NULL; if (alignmentsFile != NULL) { assert(i == 0); assert(stList_length(flowers) == 1); if (sortAlignments) { tempFile1 = getTempFile(); stCaf_sortCigarsFileByScoreInDescendingOrder(alignmentsFile, tempFile1); pinchIterator = stPinchIterator_constructFromFile(tempFile1); } else { pinchIterator = stPinchIterator_constructFromFile(alignmentsFile); } } else { if (tempFile1 == NULL) { tempFile1 = getTempFile(); } alignmentsList = stCaf_selfAlignFlower(flower, minimumSequenceLengthForBlast, lastzArguments, realign, realignArguments, tempFile1); if (sortAlignments) { stCaf_sortCigarsByScoreInDescendingOrder(alignmentsList); } st_logDebug("Ran lastz and have %" PRIi64 " alignments\n", stList_length(alignmentsList)); pinchIterator = stPinchIterator_constructFromList(alignmentsList); } for (int64_t annealingRound = 0; annealingRound < annealingRoundsLength; annealingRound++) { int64_t minimumChainLength = annealingRounds[annealingRound]; int64_t alignmentTrim = annealingRound < alignmentTrimLength ? 
alignmentTrims[annealingRound] : 0; st_logDebug("Starting annealing round with a minimum chain length of %" PRIi64 " and an alignment trim of %" PRIi64 "\n", minimumChainLength, alignmentTrim); stPinchIterator_setTrim(pinchIterator, alignmentTrim); //Add back in the constraints if (pinchIteratorForConstraints != NULL) { stCaf_anneal(threadSet, pinchIteratorForConstraints, filterFn); } //Do the annealing if (annealingRound == 0) { stCaf_anneal(threadSet, pinchIterator, filterFn); } else { stCaf_annealBetweenAdjacencyComponents(threadSet, pinchIterator, filterFn); } // Dump the block degree and length distribution to a file if (debugFileName != NULL) { dumpBlockInfo(threadSet, stString_print("%s-blockStats-preMelting", debugFileName)); } printf("Sequence graph statistics after annealing:\n"); printThreadSetStatistics(threadSet, flower, stdout); // Check for poorly-supported blocks--those that have // been transitively aligned together but with very // few homologies supporting the transitive // alignment. These "megablocks" can snarl up the // graph so that a lot of extra gets thrown away in // the first melting step. 
stPinchThreadSetBlockIt blockIt = stPinchThreadSet_getBlockIt(threadSet); stPinchBlock *block; while ((block = stPinchThreadSetBlockIt_getNext(&blockIt)) != NULL) { if (stPinchBlock_getDegree(block) > minimumBlockDegreeToCheckSupport) { uint64_t supportingHomologies = stPinchBlock_getNumSupportingHomologies(block); uint64_t possibleSupportingHomologies = numPossibleSupportingHomologies(block, flower); double support = ((double) supportingHomologies) / possibleSupportingHomologies; if (support < minimumBlockHomologySupport) { fprintf(stdout, "Destroyed a megablock with degree %" PRIi64 " and %" PRIi64 " supporting homologies out of a maximum " "of %" PRIi64 " (%lf%%).\n", stPinchBlock_getDegree(block), supportingHomologies, possibleSupportingHomologies, support); stPinchBlock_destruct(block); } } } //Do the melting rounds for (int64_t meltingRound = 0; meltingRound < meltingRoundsLength; meltingRound++) { int64_t minimumChainLengthForMeltingRound = meltingRounds[meltingRound]; st_logDebug("Starting melting round with a minimum chain length of %" PRIi64 " \n", minimumChainLengthForMeltingRound); if (minimumChainLengthForMeltingRound >= minimumChainLength) { break; } stCaf_melt(flower, threadSet, NULL, 0, minimumChainLengthForMeltingRound, 0, INT64_MAX); } st_logDebug("Last melting round of cycle with a minimum chain length of %" PRIi64 " \n", minimumChainLength); stCaf_melt(flower, threadSet, NULL, 0, minimumChainLength, breakChainsAtReverseTandems, maximumMedianSequenceLengthBetweenLinkedEnds); //This does the filtering of blocks that do not have the required species/tree-coverage/degree. 
stCaf_melt(flower, threadSet, blockFilterFn, blockTrim, 0, 0, INT64_MAX); } if (removeRecoverableChains) { stCaf_meltRecoverableChains(flower, threadSet, breakChainsAtReverseTandems, maximumMedianSequenceLengthBetweenLinkedEnds, recoverableChainsFilter, maxRecoverableChainsIterations, maxRecoverableChainLength); } if (debugFileName != NULL) { dumpBlockInfo(threadSet, stString_print("%s-blockStats-postMelting", debugFileName)); } printf("Sequence graph statistics after melting:\n"); printThreadSetStatistics(threadSet, flower, stdout); // Build a tree for each block, then use each tree to // partition the homologies between the ingroups sequences // into those that occur before the speciation with the // outgroup and those which occur late. if (stSet_size(outgroupThreads) > 0 && doPhylogeny) { st_logDebug("Starting to build trees and partition ingroup homologies\n"); stHash *threadStrings = stCaf_getThreadStrings(flower, threadSet); st_logDebug("Got sets of thread strings and set of threads that are outgroups\n"); stCaf_PhylogenyParameters params; params.distanceCorrectionMethod = phylogenyDistanceCorrectionMethod; params.treeBuildingMethods = phylogenyTreeBuildingMethods; params.rootingMethod = phylogenyRootingMethod; params.scoringMethod = phylogenyScoringMethod; params.breakpointScalingFactor = breakpointScalingFactor; params.nucleotideScalingFactor = nucleotideScalingFactor; params.skipSingleCopyBlocks = phylogenySkipSingleCopyBlocks; params.keepSingleDegreeBlocks = phylogenyKeepSingleDegreeBlocks; params.costPerDupPerBase = phylogenyCostPerDupPerBase; params.costPerLossPerBase = phylogenyCostPerLossPerBase; params.maxBaseDistance = phylogenyMaxBaseDistance; params.maxBlockDistance = phylogenyMaxBlockDistance; params.numTrees = phylogenyNumTrees; params.ignoreUnalignedBases = 1; params.onlyIncludeCompleteFeatureBlocks = 0; params.doSplitsWithSupportHigherThanThisAllAtOnce = phylogenyDoSplitsWithSupportHigherThanThisAllAtOnce; params.numTreeBuildingThreads = 
numTreeBuildingThreads; assert(params.numTreeBuildingThreads >= 1); stCaf_buildTreesToRemoveAncientHomologies( threadSet, phylogenyHomologyUnitType, threadStrings, outgroupThreads, flower, ¶ms, debugFileName == NULL ? NULL : stString_print("%s-phylogeny", debugFileName), referenceEventHeader); stHash_destruct(threadStrings); st_logDebug("Finished building trees\n"); if (removeRecoverableChains) { // We melt recoverable chains after splitting, as // well as before, to alleviate coverage loss // caused by bad splits. stCaf_meltRecoverableChains(flower, threadSet, breakChainsAtReverseTandems, maximumMedianSequenceLengthBetweenLinkedEnds, recoverableChainsFilter, maxRecoverableChainsIterations, maxRecoverableChainLength); } // Enforce the block constraints on minimum degree, // etc. after splitting. stCaf_melt(flower, threadSet, blockFilterFn, 0, 0, 0, INT64_MAX); } //Sort out case when we allow blocks of degree 1 if (minimumDegree < 2) { st_logDebug("Creating degree 1 blocks\n"); stCaf_makeDegreeOneBlocks(threadSet); stCaf_melt(flower, threadSet, blockFilterFn, blockTrim, 0, 0, INT64_MAX); } else if (maximumAdjacencyComponentSizeRatio < INT64_MAX) { //Deal with giant components st_logDebug("Breaking up components greedily\n"); stCaf_breakupComponentsGreedily(threadSet, maximumAdjacencyComponentSizeRatio); } //Finish up stCaf_finish(flower, threadSet, chainLengthForBigFlower, longChain, minLengthForChromosome, proportionOfUnalignedBasesForNewChromosome); //Flower is then destroyed at this point. 
st_logInfo("Ran the cactus core script\n"); //Cleanup stPinchThreadSet_destruct(threadSet); stPinchIterator_destruct(pinchIterator); stSet_destruct(outgroupThreads); if (alignmentsList != NULL) { stList_destruct(alignmentsList); } st_logInfo("Cleaned up from main loop\n"); } else { st_logInfo("We've already built blocks / alignments for this flower\n"); } } stList_destruct(flowers); if (tempFile1 != NULL) { st_system("rm %s", tempFile1); } if (constraintsFile != NULL) { stPinchIterator_destruct(pinchIteratorForConstraints); } /////////////////////////////////////////////////////////////////////////// // Write the flower to disk. /////////////////////////////////////////////////////////////////////////// st_logDebug("Writing the flowers to disk\n"); cactusDisk_write(cactusDisk); st_logInfo("Updated the flower on disk and %" PRIi64 " seconds have elapsed\n", time(NULL) - startTime); /////////////////////////////////////////////////////////////////////////// // Clean up. /////////////////////////////////////////////////////////////////////////// cactusDisk_destruct(cactusDisk); }
int main(int argc, char *argv[]) { /* * Open the database. * Construct a flower. * Construct an event tree representing the species tree. * For each sequence contruct two ends each containing an cap. * Make a file for the sequence. * Link the two caps. * Finish! */ int64_t key, j; Group *group; Flower_EndIterator *endIterator; End *end; bool makeEventHeadersAlphaNumeric = 0; /* * Arguments/options */ char * logLevelString = NULL; char * speciesTree = NULL; char * outgroupEvents = NULL; /////////////////////////////////////////////////////////////////////////// // (0) Parse the inputs handed by genomeCactus.py / setup stuff. /////////////////////////////////////////////////////////////////////////// while (1) { static struct option long_options[] = { { "logLevel", required_argument, 0, 'a' }, { "cactusDisk", required_argument, 0, 'b' }, { "speciesTree", required_argument, 0, 'f' }, { "outgroupEvents", required_argument, 0, 'g' }, { "help", no_argument, 0, 'h' }, { "makeEventHeadersAlphaNumeric", no_argument, 0, 'i' }, { 0, 0, 0, 0 } }; int option_index = 0; key = getopt_long(argc, argv, "a:b:f:hg:i", long_options, &option_index); if (key == -1) { break; } switch (key) { case 'a': logLevelString = optarg; break; case 'b': cactusDiskDatabaseString = optarg; break; case 'f': speciesTree = optarg; break; case 'g': outgroupEvents = optarg; break; case 'h': usage(); return 0; case 'i': makeEventHeadersAlphaNumeric = 1; break; default: usage(); return 1; } } /////////////////////////////////////////////////////////////////////////// // (0) Check the inputs. 
/////////////////////////////////////////////////////////////////////////// //assert(logLevelString == NULL || strcmp(logLevelString, "CRITICAL") == 0 || strcmp(logLevelString, "INFO") == 0 || strcmp(logLevelString, "DEBUG") == 0); assert(cactusDiskDatabaseString != NULL); assert(speciesTree != NULL); ////////////////////////////////////////////// //Set up logging ////////////////////////////////////////////// st_setLogLevelFromString(logLevelString); ////////////////////////////////////////////// //Log (some of) the inputs ////////////////////////////////////////////// st_logInfo("Flower disk name : %s\n", cactusDiskDatabaseString); for (j = optind; j < argc; j++) { st_logInfo("Sequence file/directory %s\n", argv[j]); } ////////////////////////////////////////////// //Load the database ////////////////////////////////////////////// stKVDatabaseConf *kvDatabaseConf = kvDatabaseConf = stKVDatabaseConf_constructFromString(cactusDiskDatabaseString); if (stKVDatabaseConf_getType(kvDatabaseConf) == stKVDatabaseTypeTokyoCabinet || stKVDatabaseConf_getType(kvDatabaseConf) == stKVDatabaseTypeKyotoTycoon) { assert(stKVDatabaseConf_getDir(kvDatabaseConf) != NULL); cactusDisk = cactusDisk_construct2(kvDatabaseConf, "cactusSequences"); } else { cactusDisk = cactusDisk_construct(kvDatabaseConf, 1); } st_logInfo("Set up the flower disk\n"); ////////////////////////////////////////////// //Construct the flower ////////////////////////////////////////////// if (cactusDisk_getFlower(cactusDisk, 0) != NULL) { cactusDisk_destruct(cactusDisk); st_logInfo("The first flower already exists\n"); return 0; } flower = flower_construct2(0, cactusDisk); assert(flower_getName(flower) == 0); st_logInfo("Constructed the flower\n"); ////////////////////////////////////////////// //Construct the event tree ////////////////////////////////////////////// st_logInfo("Going to build the event tree with newick string: %s\n", speciesTree); stTree *tree = stTree_parseNewickString(speciesTree); 
st_logInfo("Parsed the tree\n"); if (makeEventHeadersAlphaNumeric) { makeEventHeadersAlphaNumericFn(tree); } stTree_setBranchLength(tree, INT64_MAX); checkBranchLengthsAreDefined(tree); eventTree = eventTree_construct2(flower); //creates the event tree and the root even totalEventNumber = 1; st_logInfo("Constructed the basic event tree\n"); // Construct a set of outgroup names so that ancestral outgroups // get recognized. stSet *outgroupNameSet = stSet_construct3(stHash_stringKey, stHash_stringEqualKey, free); if(outgroupEvents != NULL) { stList *outgroupNames = stString_split(outgroupEvents); for(int64_t i = 0; i < stList_length(outgroupNames); i++) { char *outgroupName = stList_get(outgroupNames, i); stSet_insert(outgroupNameSet, stString_copy(outgroupName)); } stList_destruct(outgroupNames); } //now traverse the tree j = optind; assignEventsAndSequences(eventTree_getRootEvent(eventTree), tree, outgroupNameSet, argv, &j); char *eventTreeString = eventTree_makeNewickString(eventTree); st_logInfo( "Constructed the initial flower with %" PRIi64 " sequences and %" PRIi64 " events with string: %s\n", totalSequenceNumber, totalEventNumber, eventTreeString); assert(event_getSubTreeBranchLength(eventTree_getRootEvent(eventTree)) >= 0.0); free(eventTreeString); //assert(0); ////////////////////////////////////////////// //Label any outgroup events. ////////////////////////////////////////////// if (outgroupEvents != NULL) { stList *outgroupEventsList = stString_split(outgroupEvents); for (int64_t i = 0; i < stList_length(outgroupEventsList); i++) { char *outgroupEvent = makeEventHeadersAlphaNumeric ? 
makeAlphaNumeric(stList_get(outgroupEventsList, i)) : stString_copy(stList_get(outgroupEventsList, i)); Event *event = eventTree_getEventByHeader(eventTree, outgroupEvent); if (event == NULL) { st_errAbort("Got an outgroup string that does not match an event, outgroup string %s", outgroupEvent); } assert(!event_isOutgroup(event)); event_setOutgroupStatus(event, 1); assert(event_isOutgroup(event)); free(outgroupEvent); } stList_destruct(outgroupEventsList); } ////////////////////////////////////////////// //Construct the terminal group. ////////////////////////////////////////////// if (flower_getEndNumber(flower) > 0) { group = group_construct2(flower); endIterator = flower_getEndIterator(flower); while ((end = flower_getNextEnd(endIterator)) != NULL) { end_setGroup(end, group); } flower_destructEndIterator(endIterator); assert(group_isLeaf(group)); // Create a one link chain if there is only one pair of attached ends.. group_constructChainForLink(group); assert(!flower_builtBlocks(flower)); } else { flower_setBuiltBlocks(flower, 1); } /////////////////////////////////////////////////////////////////////////// // Write the flower to disk. /////////////////////////////////////////////////////////////////////////// //flower_check(flower); cactusDisk_write(cactusDisk); st_logInfo("Updated the flower on disk\n"); /////////////////////////////////////////////////////////////////////////// // Cleanup. /////////////////////////////////////////////////////////////////////////// cactusDisk_destruct(cactusDisk); return 0; //Exit without clean up is quicker, enable cleanup when doing memory leak detection. stSet_destruct(outgroupNameSet); stTree_destruct(tree); stKVDatabaseConf_destruct(kvDatabaseConf); return 0; }