static void splitIntoAdjacenciesStubsAndChains(stList *subCycle, stList *adjacencyEdges, stList *stubEdges, stList *chainEdges, stList **subAdjacencyEdges, stList **subStubEdges, stList **subChainEdges) { /* * Splits run into cycles and chains.. */ *subStubEdges = stList_construct(); *subChainEdges = stList_construct(); for (int64_t j = 0; j < stList_length(subCycle); j++) { stIntTuple *edge = stList_get(subCycle, j); if (stList_contains(stubEdges, edge)) { stList_append(*subStubEdges, edge); } else if (stList_contains(chainEdges, edge)) { stList_append(*subChainEdges, edge); } } *subAdjacencyEdges = stList_construct(); stSortedSet *nodes = getNodeSetOfEdges(subCycle); for (int64_t j = 0; j < stList_length(adjacencyEdges); j++) { stIntTuple *edge = stList_get(adjacencyEdges, j); if (nodeInSet(nodes, stIntTuple_get(edge, 0)) && nodeInSet( nodes, stIntTuple_get(edge, 1))) { stList_append(*subAdjacencyEdges, edge); } } stSortedSet_destruct(nodes); }
static stList *mergeSimpleCycles2(stList *chosenEdges, stList *nonZeroWeightAdjacencyEdges, stSortedSet *allAdjacencyEdges, stList *stubEdges, stList *chainEdges) { /* * Returns a new set of chosen edges, modified by adjacency switches such that every simple cycle * contains at least one stub edge. */ /* * Calculate components. */ stList *components = getComponents2(chosenEdges, stubEdges, chainEdges); /* * Divide the components by the presence of one or more stub edges. */ stSortedSet *stubEdgesSet = stList_getSortedSet(stubEdges, (int(*)(const void *, const void *)) stIntTuple_cmpFn); stList *stubContainingComponents = stList_construct(); stList *stubFreeComponents = stList_construct(); for (int64_t i = 0; i < stList_length(components); i++) { stList *component = stList_get(components, i); stList_append( intersectionSize(stubEdgesSet, component) > 0 ? stubContainingComponents : stubFreeComponents, component); } assert(stList_length(stubContainingComponents) > 0); stSortedSet_destruct(stubEdgesSet); /* * Merge the stub containing components into one 'global' component */ stList *globalComponent = stList_join(stubContainingComponents); stList_destruct(stubContainingComponents); /* * Remove the stub/chain edges from the components. */ stList_append(stubFreeComponents, globalComponent); stList *adjacencyOnlyComponents = getStubAndChainEdgeFreeComponents( stubFreeComponents, stubEdges, chainEdges); stList_destruct(stubFreeComponents); stList_destruct(globalComponent); stList_destruct(components); //We only clean this up now, as this frees the components it contains. /* * Merge stub free components into the others. */ stList *updatedChosenEdges = mergeSimpleCycles(adjacencyOnlyComponents, nonZeroWeightAdjacencyEdges, allAdjacencyEdges); stList_destruct(adjacencyOnlyComponents); return updatedChosenEdges; }
stList *getMatchingWithCyclicConstraints(stSortedSet *nodes, stList *adjacencyEdges, stList *stubEdges, stList *chainEdges, bool makeStubCyclesDisjoint, stList *(*matchingAlgorithm)(stList *edges, int64_t nodeNumber)) { /* * Check the inputs. */ checkInputs(nodes, adjacencyEdges, stubEdges, chainEdges); st_logDebug("Checked the inputs\n"); if (stSortedSet_size(nodes) == 0) { //Some of the following functions assume there are at least 2 nodes. return stList_construct(); } stList *chosenEdges = getPerfectMatching(nodes, adjacencyEdges, matchingAlgorithm); stSortedSet *allAdjacencyEdges = stList_getSortedSet(adjacencyEdges, (int(*)(const void *, const void *)) stIntTuple_cmpFn); stList *nonZeroWeightAdjacencyEdges = getEdgesWithGreaterThanZeroWeight( adjacencyEdges); stList *updatedChosenEdges = makeMatchingObeyCyclicConstraints(nodes, chosenEdges, allAdjacencyEdges, nonZeroWeightAdjacencyEdges, stubEdges, chainEdges, makeStubCyclesDisjoint); stList_destruct(chosenEdges); chosenEdges = updatedChosenEdges; stList_destruct(nonZeroWeightAdjacencyEdges); stSortedSet_destruct(allAdjacencyEdges); return chosenEdges; }
stList *chooseMatching_greedy(stList *edges, int64_t nodeNumber) { /* * Greedily picks the edge from the list such that each node has at most one edge. */ //First clone the list.. edges = stList_copy(edges, NULL); stSortedSet *seen = getEmptyNodeOrEdgeSetWithCleanup(); stList *matching = stList_construct(); //Sort the adjacency pairs.. stList_sort(edges, chooseMatching_greedyP); double strength = INT64_MAX; while (stList_length(edges) > 0) { stIntTuple *edge = stList_pop(edges); double d = stIntTuple_get(edge, 2); assert(d <= strength); strength = d; if(!nodeInSet(seen, stIntTuple_get(edge, 0)) && !nodeInSet(seen, stIntTuple_get(edge, 1))) { addNodeToSet(seen, stIntTuple_get(edge, 0)); addNodeToSet(seen, stIntTuple_get(edge, 1)); stList_append(matching,edge); } } assert(stList_length(edges) == 0); stList_destruct(edges); stSortedSet_destruct(seen); return matching; }
static stList *readMatching(FILE *fileHandle, stList *originalEdges) { /* * Reads the matching created by Blossum. */ stHash *originalEdgesHash = putEdgesInHash(originalEdges); char *line = stFile_getLineFromFile(fileHandle); assert(line != NULL); int64_t nodeNumber, edgeNumber; int64_t i = sscanf(line, "%" PRIi64 " %" PRIi64 "\n", &nodeNumber, &edgeNumber); assert(i == 2); free(line); stList *chosenEdges = stList_construct(); for(int64_t j=0; j<edgeNumber; j++) { line = stFile_getLineFromFile(fileHandle); int64_t node1, node2; i = sscanf(line, "%" PRIi64 " %" PRIi64 "", &node1, &node2); assert(i == 2); free(line); assert(node1 >= 0); assert(node1 < nodeNumber); assert(node2 >= 0); assert(node2 < nodeNumber); stIntTuple *edge = constructEdge(node1, node2); stIntTuple *originalEdge = stHash_search(originalEdgesHash, edge); if(originalEdge != NULL) { stList_append(chosenEdges, originalEdge); } stIntTuple_destruct(edge); } stHash_destruct(originalEdgesHash); return chosenEdges; }
void stCaf_addAdjacencies(Flower *flower) { //Build a list of caps. stList *list = stList_construct(); Flower_EndIterator *endIterator = flower_getEndIterator(flower); End *end; while ((end = flower_getNextEnd(endIterator)) != NULL) { End_InstanceIterator *instanceIterator = end_getInstanceIterator(end); Cap *cap; while ((cap = end_getNext(instanceIterator)) != NULL) { if (!cap_getStrand(cap)) { cap = cap_getReverse(cap); } stList_append(list, cap); } end_destructInstanceIterator(instanceIterator); } flower_destructEndIterator(endIterator); assert(stList_length(list) % 2 == 0); //Sort the list of caps. stList_sort(list, (int(*)(const void *, const void *)) addAdjacenciesPP); //Now make the adjacencies. for (int64_t i = 1; i < stList_length(list); i += 2) { Cap *cap = stList_get(list, i - 1); Cap *cap2 = stList_get(list, i); cap_makeAdjacent(cap, cap2); } //Clean up. stList_destruct(list); }
static void setup() { teardown(); list = stList_construct(); int64_t i; for(i=0; i<stringNumber; i++) { stList_append(list, strings[i]); } }
stList *stSet_getKeys(stSet *set) { stList *list = stList_construct(); stSetIterator *iterator = stSet_getIterator(set); void *item; while ((item = stSet_getNext(iterator)) != NULL) { stList_append(list, item); } stSet_destructIterator(iterator); return list; }
static AdjacencySwitch *getBest2EdgeAdjacencySwitch(stList *components, stSortedSet *allAdjacencyEdges) { /* * Look for the two lowest value adjacency edges in all current edges that are in a separate component and returns them as an adjacency switch * with now new adjacency edges. */ /* * Get lowest scoring edge for each component. */ stList *lowestScoringEdgeFromEachComponent = stList_construct(); for (int64_t i = 0; i < stList_length(components); i++) { stList_append(lowestScoringEdgeFromEachComponent, getLowestScoringEdge(stList_get(components, i))); } /* * Get two lowest scoring edges. */ stList_sort(lowestScoringEdgeFromEachComponent, getBest2EdgeAdjacencySwitchP); stIntTuple *lowestScoreEdge1 = stList_get( lowestScoringEdgeFromEachComponent, 0); stIntTuple *lowestScoreEdge2 = stList_get( lowestScoringEdgeFromEachComponent, 1); assert(lowestScoreEdge1 != lowestScoreEdge2); stList_destruct(lowestScoringEdgeFromEachComponent); //Cleanup stIntTuple *newEdge1 = getWeightedEdgeFromSet( stIntTuple_get(lowestScoreEdge1, 0), stIntTuple_get(lowestScoreEdge2, 0), allAdjacencyEdges); stIntTuple *newEdge2 = getWeightedEdgeFromSet( stIntTuple_get(lowestScoreEdge1, 1), stIntTuple_get(lowestScoreEdge2, 1), allAdjacencyEdges); if (newEdge1 == NULL) { assert(newEdge2 == NULL); newEdge1 = getWeightedEdgeFromSet( stIntTuple_get(lowestScoreEdge1, 0), stIntTuple_get(lowestScoreEdge2, 1), allAdjacencyEdges); newEdge2 = getWeightedEdgeFromSet( stIntTuple_get(lowestScoreEdge1, 1), stIntTuple_get(lowestScoreEdge2, 0), allAdjacencyEdges); } assert(newEdge1 != NULL); assert(newEdge2 != NULL); return adjacencySwitch_construct( lowestScoreEdge1, lowestScoreEdge2, newEdge1, newEdge2, stIntTuple_get(lowestScoreEdge1, 2) + stIntTuple_get(lowestScoreEdge2, 2)); }
/* build a list of blocks, sorted by the root components */ static stList *buildRootSorted(struct malnSet *malnSet) { stList *sorted = stList_construct(); struct malnBlkSetIterator *iter = malnBlkSet_getIterator(malnSet->blks); struct malnBlk *blk; while ((blk = malnBlkSetIterator_getNext(iter)) != NULL) { stList_append(sorted, blk); } malnBlkSetIterator_destruct(iter); stList_sort(sorted, blkCmpRootComp); return sorted; }
static stList *chooseAdjacencyPairing_externalProgram(stList *edges, int64_t nodeNumber, const char *programName) { /* * We create temp files to hold stuff. */ if(nodeNumber <= 1) { assert(stList_length(edges) == 0); return stList_construct(); } char *tempInputFile = getTempFile(), *tempOutputFile = getTempFile(); /* * We write the graph to a temp file. */ FILE *fileHandle = fopen(tempInputFile, "w"); if(strcmp(programName, "blossom5") == 0) { //Must be all connected as //generates perfect matchings. writeCliqueGraph(fileHandle, edges, nodeNumber, 1); } else { writeGraph(fileHandle, edges, nodeNumber); } fclose(fileHandle); /* * We run the external program. */ char *command = stString_print("%s -e %s -w %s >& /dev/null", programName, tempInputFile, tempOutputFile); int64_t i = st_system(command); if(i != 0) { st_errAbort("Something went wrong with the command: %s", command); //For some reason this causes a seg fault //stThrowNew(MATCHING_EXCEPTION, "Something went wrong with the command: %s", command); } free(command); /* * We get back the matching. */ fileHandle = fopen(tempOutputFile, "r"); stList *matching = readMatching(fileHandle, edges); fclose(fileHandle); st_logDebug("The adjacency matching for %" PRIi64 " nodes with %" PRIi64 " initial edges contains %" PRIi64 " edges\n", nodeNumber, stList_length(edges), stList_length(matching)); /* * Get rid of the temp files.. */ st_system("rm -rf %s %s", tempInputFile, tempOutputFile); free(tempInputFile); free(tempOutputFile); return matching; }
int main(int argc, char *argv[]) { ////////////////////////////////////////////// //Parse the inputs ////////////////////////////////////////////// parseBasicArguments(argc, argv, "linkageStats"); /////////////////////////////////////////////////////////////////////////// // Get the intervals /////////////////////////////////////////////////////////////////////////// stList *haplotypeEventStrings = getEventStrings( treatHaplotype1AsContamination ? NULL : hap1EventString, treatHaplotype2AsContamination ? NULL : hap2EventString); stList *assemblyEventStringInList = stList_construct(); stList_append(assemblyEventStringInList, assemblyEventString); stList *intervals = stList_construct3(0, (void (*)(void *))sequenceInterval_destruct); for(int64_t i=0; i<stList_length(haplotypeEventStrings); i++) { const char *hapEventString = stList_get(haplotypeEventStrings, i); st_logInfo("Getting contig paths for haplotype: %s", hapEventString); stList *contigPaths = getContigPaths(flower, hapEventString, assemblyEventStringInList); stList *hapIntervals = getSplitContigPathIntervals(flower, contigPaths, hapEventString, assemblyEventStringInList); stList_destruct(contigPaths); st_logInfo("Getting contig paths\n"); stList_appendAll(intervals, hapIntervals); stList_setDestructor(hapIntervals, NULL); stList_destruct(hapIntervals); } st_logDebug("Got a total of %" PRIi64 " intervals\n", stList_length(intervals)); /////////////////////////////////////////////////////////////////////////// // Write it out. /////////////////////////////////////////////////////////////////////////// FILE *fileHandle = fopen(outputFile, "w"); for (int64_t i = 0; i < stList_length(intervals); i++) { SequenceInterval *sequenceInterval = stList_get(intervals, i); st_logDebug("We have a path interval %s %" PRIi64 " %" PRIi64 "\n", sequenceInterval->sequenceName, sequenceInterval->start, sequenceInterval->end); fprintf(fileHandle, "%s %" PRIi64 " %" PRIi64 "\n", sequenceInterval->sequenceName, sequenceInterval->start, sequenceInterval->end); } st_logInfo("Finished writing out the stats.\n"); fclose(fileHandle); return 0; }
static stList *getOddToEvenAdjacencyEdges(stSortedSet *oddNodes, stList *adjacencyEdges) { /* * Gets edges that include one node in the set of oddNodes, but not both. */ stList *oddToEvenAdjacencyEdges = stList_construct(); for (int64_t i = 0; i < stList_length(adjacencyEdges); i++) { stIntTuple *edge = stList_get(adjacencyEdges, i); if (nodeInSet(oddNodes, stIntTuple_get(edge, 0)) ^ nodeInSet( oddNodes, stIntTuple_get(edge, 1))) { stList_append(oddToEvenAdjacencyEdges, edge); } } return oddToEvenAdjacencyEdges; }
/* clone the root. */ static stTree *subrangeCloneRoot(stTree *srcRoot, struct malnCompCompMap *srcDestCompMap) { // clone root, if deleted, these must only be one child (due to the way // the trees are constructed). stList *pendingSubtrees = stList_construct(); stTree *destRoot = subrangeCloneNode(srcRoot, srcDestCompMap, pendingSubtrees); if (destRoot == NULL) { if (stList_length(pendingSubtrees) > 1) { struct mafTreeNodeCompLink *srcNcLink = getNodeCompLink(srcRoot); errAbort("deleted tree root %s (component: %s:%d-%d/%c)) has more that one child", stTree_getLabel(srcRoot), srcNcLink->comp->seq->orgSeqName, srcNcLink->comp->start, srcNcLink->comp->end, srcNcLink->comp->strand); } else if (stList_length(pendingSubtrees) == 1) { destRoot = stList_pop(pendingSubtrees); } } stList_destruct(pendingSubtrees); return destRoot; }
static void test_stSet_removeAndFreeKey(CuTest* testCase) { stSet *set2 = stSet_construct2(free); stList *keys = stList_construct(); int64_t keyNumber = 1000; for (int64_t i = 0; i < keyNumber; i++) { int64_t *key = st_malloc(sizeof(*key)); stList_append(keys, key); stSet_insert(set2, key); } for (int64_t i = 0; i < keyNumber; i++) { int64_t *key = stList_get(keys, i); CuAssertPtrEquals(testCase, key, stSet_removeAndFreeKey(set2, key)); } CuAssertIntEquals(testCase, 0, stSet_size(set2)); stSet_destruct(set2); stList_destruct(keys); }
static stList *getEdgesThatBridgeComponents(stList *components, stHash *nodesToNonZeroWeightedAdjacencyEdges) { /* * Get set of adjacency edges that bridge between (have a node in two) components. */ stList *bridgingAdjacencyEdges = stList_construct(); for (int64_t i = 0; i < stList_length(components); i++) { stSortedSet *componentNodes = getNodeSetOfEdges( stList_get(components, i)); stSortedSetIterator *it = stSortedSet_getIterator(componentNodes); stIntTuple *node; while ((node = stSortedSet_getNext(it)) != NULL) { stList *edges = stHash_search(nodesToNonZeroWeightedAdjacencyEdges, node); if (edges != NULL) { for (int64_t j = 0; j < stList_length(edges); j++) { stIntTuple *edge = stList_get(edges, j); stIntTuple *node1 = stIntTuple_construct1( stIntTuple_get(edge, 0)); stIntTuple *node2 = stIntTuple_construct1( stIntTuple_get(edge, 1)); assert( stSortedSet_search(componentNodes, node1) != NULL || stSortedSet_search(componentNodes, node2) != NULL); if (stSortedSet_search(componentNodes, node1) == NULL || stSortedSet_search(componentNodes, node2) == NULL) { stList_append(bridgingAdjacencyEdges, edge); } stIntTuple_destruct(node1); stIntTuple_destruct(node2); } } } stSortedSet_destructIterator(it); stSortedSet_destruct(componentNodes); } return bridgingAdjacencyEdges; }
stList *getComponents2(stList *adjacencyEdges, stList *stubEdges, stList *chainEdges) { /* * Gets a list of connected components for a set of adjacency, stub and chain edges. * If adjacencyEdges, stubEdges or chainEdges are NULL then they are ignored. */ stList *allEdges = stList_construct(); //Build a concatenated list of all the chain, stub and adjacency edges. if (adjacencyEdges != NULL) { stList_appendAll(allEdges, adjacencyEdges); } if (stubEdges != NULL) { stList_appendAll(allEdges, stubEdges); } if (chainEdges != NULL) { stList_appendAll(allEdges, chainEdges); } stList *components = getComponents(allEdges); //Gets the graph components. stList_destruct(allEdges); //Cleanup the all edges. return components; }
static void getMaximalHaplotypePathsP(Flower *flower, stList *maximalHaplotypePaths, stSortedSet *segmentSet, const char *eventString, stList *eventStrings) { /* * Iterate through the segments in this flower. */ Flower_SegmentIterator *segmentIt = flower_getSegmentIterator(flower); Segment *segment; while ((segment = flower_getNextSegment(segmentIt)) != NULL) { if (stSortedSet_search(segmentSet, segment) == NULL && stSortedSet_search(segmentSet, segment_getReverse(segment)) == NULL) { //Check we haven't yet seen this segment if (strcmp(event_getHeader(segment_getEvent(segment)), eventString) == 0) { //Check if the segment is in the assembly if (hasCapInEvents(cap_getEnd(segment_get5Cap(segment)), eventStrings)) { //Is a block in a haplotype segment assert(hasCapInEvents(cap_getEnd(segment_get3Cap(segment)), eventStrings)); //isHaplotypeEnd(cap_getEnd(segment_get3Cap(segment)))); stList *maximalHaplotypePath = stList_construct(); stList_append(maximalHaplotypePaths, maximalHaplotypePath); getMaximalHaplotypePathsP2(segment, maximalHaplotypePath, segmentSet, eventStrings); } else { assert(!hasCapInEvents(cap_getEnd(segment_get3Cap(segment)), eventStrings));//assert(!isHaplotypeEnd(cap_getEnd(segment_get3Cap(segment)))); } } } } flower_destructSegmentIterator(segmentIt); /* * Now recurse on the contained flowers. */ Flower_GroupIterator *groupIt = flower_getGroupIterator(flower); Group *group; while ((group = flower_getNextGroup(groupIt)) != NULL) { if (group_getNestedFlower(group) != NULL) { getMaximalHaplotypePathsP(group_getNestedFlower(group), maximalHaplotypePaths, segmentSet, eventString, eventStrings); } } flower_destructGroupIterator(groupIt); }
stList *makeMatchingObeyCyclicConstraints(stSortedSet *nodes, stList *chosenEdges, stSortedSet *allAdjacencyEdges, stList *nonZeroWeightAdjacencyEdges, stList *stubEdges, stList *chainEdges, bool makeStubCyclesDisjoint) { if (stSortedSet_size(nodes) == 0) { //Some of the following functions assume there are at least 2 nodes. return stList_construct(); } /* * Merge in the stub free components. */ chosenEdges = mergeSimpleCycles2(chosenEdges, nonZeroWeightAdjacencyEdges, allAdjacencyEdges, stubEdges, chainEdges); st_logDebug( "After merging in chain only cycles the matching has %" PRIi64 " edges, %" PRIi64 " cardinality and %" PRIi64 " weight\n", stList_length(chosenEdges), matchingCardinality(chosenEdges), matchingWeight(chosenEdges)); /* * Split stub components. */ if (makeStubCyclesDisjoint) { stList *updatedChosenEdges = splitMultipleStubCycles(chosenEdges, nonZeroWeightAdjacencyEdges, allAdjacencyEdges, stubEdges, chainEdges); stList_destruct(chosenEdges); chosenEdges = updatedChosenEdges; st_logDebug( "After making stub cycles disjoint the matching has %" PRIi64 " edges, %" PRIi64 " cardinality and %" PRIi64 " weight\n", stList_length(chosenEdges), matchingCardinality(chosenEdges), matchingWeight(chosenEdges)); } else { st_logDebug("Not making stub cycles disjoint\n"); } return chosenEdges; }
static stList *getCaps(stList *flowers, Name referenceEventName) { stList *caps = stList_construct(); for (int64_t i = 0; i < stList_length(flowers); i++) { Flower *flower = stList_get(flowers, i); //Get list of caps Flower_EndIterator *endIt = flower_getEndIterator(flower); End *end; while ((end = flower_getNextEnd(endIt)) != NULL) { if (end_isStubEnd(end)) { Cap *cap = getCapForReferenceEvent(end, referenceEventName); //The cap in the reference if(cap != NULL) { cap = cap_getStrand(cap) ? cap : cap_getReverse(cap); if (!cap_getSide(cap)) { stList_append(caps, cap); } } } } flower_destructEndIterator(endIt); } return caps; }
stList *getPerfectMatching(stSortedSet *nodes, stList *adjacencyEdges, stList *(*matchingAlgorithm)(stList *edges, int64_t nodeNumber)) { checkEdges(adjacencyEdges, nodes, 1, 1); //Checks edges are clique if (stSortedSet_size(nodes) == 0) { //Some of the following functions assume there are at least 2 nodes. return stList_construct(); } stList *nonZeroWeightAdjacencyEdges = getEdgesWithGreaterThanZeroWeight( adjacencyEdges); stList *chosenEdges = getSparseMatching(nodes, nonZeroWeightAdjacencyEdges, matchingAlgorithm); stList_destruct(nonZeroWeightAdjacencyEdges); makeMatchingPerfect(chosenEdges, adjacencyEdges, nodes); st_logDebug( "Chosen a perfect matching with %" PRIi64 " edges, %" PRIi64 " cardinality and %" PRIi64 " weight\n", stList_length(chosenEdges), matchingCardinality(chosenEdges), matchingWeight(chosenEdges)); return chosenEdges; }
/* Get a list of components that overlap the specified guide range and are in * blocks not flagged as dying and matches treeLoc filters. Return NULL if no * overlaps. List is sorted by ascending width, which helps the merge * efficiency. */ stList *malnSet_getOverlappingComps(struct malnSet *malnSet, struct Seq *seq, int chromStart, int chromEnd, unsigned treeLocFilter) { if (malnSet->compRangeMap == NULL) { buildRangeTree(malnSet); } stList *overComps = NULL; for (struct range *rng = genomeRangeTreeAllOverlapping(malnSet->compRangeMap, seq->orgSeqName, chromStart, chromEnd); rng != NULL; rng = rng->next) { for (struct slRef *compRef = rng->val; compRef != NULL; compRef = compRef->next) { struct malnComp *comp = compRef->val; if (keepOverlap(comp, seq, chromStart, chromEnd, treeLocFilter)) { if (overComps == NULL) { overComps = stList_construct(); } stList_append(overComps, comp); } } } // sort so tests are reproducible if (overComps != NULL) { stList_sort(overComps, sortCompListCmpFn); } return overComps; }
int main(int argc, char *argv[]) { /* * Script for adding a reference genome to a flower. */ /* * Arguments/options */ char * logLevelString = NULL; char * cactusDiskDatabaseString = NULL; char * secondaryDatabaseString = NULL; char *referenceEventString = (char *) cactusMisc_getDefaultReferenceEventHeader(); bool bottomUpPhase = 0; /////////////////////////////////////////////////////////////////////////// // (0) Parse the inputs handed by genomeCactus.py / setup stuff. /////////////////////////////////////////////////////////////////////////// while (1) { static struct option long_options[] = { { "logLevel", required_argument, 0, 'a' }, { "cactusDisk", required_argument, 0, 'b' }, { "secondaryDisk", required_argument, 0, 'd' }, { "referenceEventString", required_argument, 0, 'g' }, { "help", no_argument, 0, 'h' }, { "bottomUpPhase", no_argument, 0, 'j' }, { 0, 0, 0, 0 } }; int option_index = 0; int key = getopt_long(argc, argv, "a:b:c:d:e:g:hi:j", long_options, &option_index); if (key == -1) { break; } switch (key) { case 'a': logLevelString = stString_copy(optarg); break; case 'b': cactusDiskDatabaseString = stString_copy(optarg); break; case 'd': secondaryDatabaseString = stString_copy(optarg); break; case 'g': referenceEventString = stString_copy(optarg); break; case 'h': usage(); return 0; case 'j': bottomUpPhase = 1; break; default: usage(); return 1; } } /////////////////////////////////////////////////////////////////////////// // (0) Check the inputs. /////////////////////////////////////////////////////////////////////////// assert(cactusDiskDatabaseString != NULL); ////////////////////////////////////////////// //Set up logging ////////////////////////////////////////////// st_setLogLevelFromString(logLevelString); ////////////////////////////////////////////// //Load the database ////////////////////////////////////////////// st_logInfo("referenceEventString = %s\n", referenceEventString); st_logInfo("bottomUpPhase = %i\n", bottomUpPhase); stKVDatabaseConf *kvDatabaseConf = stKVDatabaseConf_constructFromString(cactusDiskDatabaseString); CactusDisk *cactusDisk = cactusDisk_construct(kvDatabaseConf, false, true); stKVDatabaseConf_destruct(kvDatabaseConf); st_logInfo("Set up the flower disk\n"); stKVDatabase *sequenceDatabase = NULL; if (secondaryDatabaseString != NULL) { kvDatabaseConf = stKVDatabaseConf_constructFromString(secondaryDatabaseString); sequenceDatabase = stKVDatabase_construct(kvDatabaseConf, 0); stKVDatabaseConf_destruct(kvDatabaseConf); } FlowerStream *flowerStream = flowerWriter_getFlowerStream(cactusDisk, stdin); Flower *flower; while ((flower = flowerStream_getNext(flowerStream)) != NULL) { st_logDebug("Processing flower %" PRIi64 "\n", flower_getName(flower)); /////////////////////////////////////////////////////////////////////////// // Get the appropriate event names /////////////////////////////////////////////////////////////////////////// st_logInfo("%s\n", eventTree_makeNewickString(flower_getEventTree(flower))); Event *referenceEvent = eventTree_getEventByHeader(flower_getEventTree(flower), referenceEventString); if (referenceEvent == NULL) { st_errAbort("Reference event %s not found in tree. Check your " "--referenceEventString option", referenceEventString); } Name referenceEventName = event_getName(referenceEvent); /////////////////////////////////////////////////////////////////////////// // Now do bottom up or top down, depending /////////////////////////////////////////////////////////////////////////// stList *flowers = stList_construct(); stList_append(flowers, flower); preCacheNestedFlowers(cactusDisk, flowers); if (bottomUpPhase) { assert(sequenceDatabase != NULL); cactusDisk_preCacheSegmentStrings(cactusDisk, flowers); bottomUp(flowers, sequenceDatabase, referenceEventName, !flower_hasParentGroup(flower), generateJukesCantorMatrix); // Unload the nested flowers to save memory. They haven't // been changed, so we don't write them to the cactus // disk. Flower_GroupIterator *groupIt = flower_getGroupIterator(flower); Group *group; while ((group = flower_getNextGroup(groupIt)) != NULL) { if (!group_isLeaf(group)) { flower_unload(group_getNestedFlower(group)); } } flower_destructGroupIterator(groupIt); assert(!flower_isParentLoaded(flower)); // Write this flower to disk. cactusDisk_addUpdateRequest(cactusDisk, flower); } else { topDown(flower, referenceEventName); // We've changed the nested flowers, but not this // flower. We write the nested flowers to disk, then // unload them to save memory. This flower will be // unloaded by the flower-stream code. Flower_GroupIterator *groupIt = flower_getGroupIterator(flower); Group *group; while ((group = flower_getNextGroup(groupIt)) != NULL) { if (!group_isLeaf(group)) { cactusDisk_addUpdateRequest(cactusDisk, group_getNestedFlower(group)); flower_unload(group_getNestedFlower(group)); } } flower_destructGroupIterator(groupIt); } stList_destruct(flowers); } /////////////////////////////////////////////////////////////////////////// // Write the flower(s) back to disk. /////////////////////////////////////////////////////////////////////////// cactusDisk_write(cactusDisk); st_logInfo("Updated the flower on disk\n"); /////////////////////////////////////////////////////////////////////////// //Clean up. /////////////////////////////////////////////////////////////////////////// if (sequenceDatabase != NULL) { stKVDatabase_destruct(sequenceDatabase); } cactusDisk_destruct(cactusDisk); return 0; //Exit without clean up is quicker, enable cleanup when doing memory leak detection. free(cactusDiskDatabaseString); free(referenceEventString); free(logLevelString); st_logInfo("Cleaned stuff up and am finished\n"); return 0; }
static void cacheSubstringsFromDB(CactusDisk *cactusDisk, stList *substrings) { /* * Caches the given set of substrings in the cactusDisk cache. */ if (cactusDisk->storeSequencesInAFile) { if (cactusDisk->sequencesReadFileHandle == NULL) { if(cactusDisk->sequencesWriteFileHandle != NULL) { fsync(fileno(cactusDisk->sequencesWriteFileHandle)); fclose(cactusDisk->sequencesWriteFileHandle); cactusDisk->sequencesWriteFileHandle = NULL; } cactusDisk->sequencesReadFileHandle = fopen(cactusDisk->absSequencesFileName, "r"); assert(cactusDisk->sequencesReadFileHandle != NULL); } else { assert(cactusDisk->sequencesWriteFileHandle == NULL); } for (int64_t i = 0; i < stList_length(substrings); i++) { Substring *substring = stList_get(substrings, i); char *string = getStringFromDisk(cactusDisk->sequencesReadFileHandle, substring->name, substring->start, substring->length); stCache_setRecord(cactusDisk->stringCache, substring->name, substring->start, substring->length, string); #ifndef NDEBUG int64_t bytesRead; char *string2 = stCache_getRecord(cactusDisk->stringCache, substring->name, substring->start, substring->length, &bytesRead); assert(bytesRead == substring->length); for (int64_t j = 0; j < substring->length; j++) { assert(string2[j] == string[j]); } free(string2); #endif free(string); } } else { stList *getRequests = stList_construct3(0, free); for (int64_t i = 0; i < stList_length(substrings); i++) { Substring *substring = stList_get(substrings, i); int64_t intervalSize = (substring->length + substring->start - 1) / CACTUS_DISK_SEQUENCE_CHUNK_SIZE - substring->start / CACTUS_DISK_SEQUENCE_CHUNK_SIZE + 1; Name shiftedName = substring->name + substring->start / CACTUS_DISK_SEQUENCE_CHUNK_SIZE; for (int64_t j = 0; j < intervalSize; j++) { int64_t *k = st_malloc(sizeof(int64_t)); k[0] = shiftedName + j; stList_append(getRequests, k); } } if (stList_length(getRequests) == 0) { stList_destruct(getRequests); return; } stList *records = NULL; stTry { records = stKVDatabase_bulkGetRecords(cactusDisk->database, getRequests); } stCatch(except) { stThrowNewCause(except, ST_KV_DATABASE_EXCEPTION_ID, "An unknown database error occurred when getting a sequence string"); }stTryEnd ; assert(records != NULL); assert(stList_length(records) == stList_length(getRequests)); stList_destruct(getRequests); stListIterator *recordsIt = stList_getIterator(records); for (int64_t i = 0; i < stList_length(substrings); i++) { Substring *substring = stList_get(substrings, i); int64_t intervalSize = (substring->length + substring->start - 1) / CACTUS_DISK_SEQUENCE_CHUNK_SIZE - substring->start / CACTUS_DISK_SEQUENCE_CHUNK_SIZE + 1; stList *strings = stList_construct(); while (intervalSize-- > 0) { int64_t recordSize; stKVDatabaseBulkResult *result = stList_getNext(recordsIt); assert(result != NULL); char *string = stKVDatabaseBulkResult_getRecord(result, &recordSize); assert(string != NULL); assert(strlen(string) == recordSize - 1); stList_append(strings, string); assert(recordSize <= CACTUS_DISK_SEQUENCE_CHUNK_SIZE + 1); } assert(stList_length(strings) > 0); char *joinedString = stString_join2("", strings); stCache_setRecord(cactusDisk->stringCache, substring->name, (substring->start / CACTUS_DISK_SEQUENCE_CHUNK_SIZE) * CACTUS_DISK_SEQUENCE_CHUNK_SIZE, strlen(joinedString), joinedString); free(joinedString); stList_destruct(strings); } assert(stList_getNext(recordsIt) == NULL); stList_destructIterator(recordsIt); stList_destruct(records); } }
int main(int argc, char *argv[]) { /* * Script for adding alignments to cactus tree. */ int64_t startTime; stKVDatabaseConf *kvDatabaseConf; CactusDisk *cactusDisk; int key, k; bool (*filterFn)(stPinchSegment *, stPinchSegment *) = NULL; stSet *outgroupThreads = NULL; /* * Arguments/options */ char * logLevelString = NULL; char * alignmentsFile = NULL; char * constraintsFile = NULL; char * cactusDiskDatabaseString = NULL; char * lastzArguments = ""; int64_t minimumSequenceLengthForBlast = 1; //Parameters for annealing/melting rounds int64_t *annealingRounds = NULL; int64_t annealingRoundsLength = 0; int64_t *meltingRounds = NULL; int64_t meltingRoundsLength = 0; //Parameters for melting float maximumAdjacencyComponentSizeRatio = 10; int64_t blockTrim = 0; int64_t alignmentTrimLength = 0; int64_t *alignmentTrims = NULL; int64_t chainLengthForBigFlower = 1000000; int64_t longChain = 2; int64_t minLengthForChromosome = 1000000; float proportionOfUnalignedBasesForNewChromosome = 0.8; bool breakChainsAtReverseTandems = 1; int64_t maximumMedianSequenceLengthBetweenLinkedEnds = INT64_MAX; bool realign = 0; char *realignArguments = ""; bool removeRecoverableChains = false; bool (*recoverableChainsFilter)(stCactusEdgeEnd *, Flower *) = NULL; int64_t maxRecoverableChainsIterations = 1; int64_t maxRecoverableChainLength = INT64_MAX; //Parameters for removing ancient homologies bool doPhylogeny = false; int64_t phylogenyNumTrees = 1; enum stCaf_RootingMethod phylogenyRootingMethod = BEST_RECON; enum stCaf_ScoringMethod phylogenyScoringMethod = COMBINED_LIKELIHOOD; double breakpointScalingFactor = 1.0; bool phylogenySkipSingleCopyBlocks = 0; int64_t phylogenyMaxBaseDistance = 1000; int64_t phylogenyMaxBlockDistance = 100; bool phylogenyKeepSingleDegreeBlocks = 0; stList *phylogenyTreeBuildingMethods = stList_construct(); enum stCaf_TreeBuildingMethod defaultMethod = GUIDED_NEIGHBOR_JOINING; stList_append(phylogenyTreeBuildingMethods, &defaultMethod); double phylogenyCostPerDupPerBase = 0.2; double phylogenyCostPerLossPerBase = 0.2; const char *debugFileName = NULL; const char *referenceEventHeader = NULL; double phylogenyDoSplitsWithSupportHigherThanThisAllAtOnce = 1.0; int64_t numTreeBuildingThreads = 2; int64_t minimumBlockDegreeToCheckSupport = 10; double minimumBlockHomologySupport = 0.7; double nucleotideScalingFactor = 1.0; HomologyUnitType phylogenyHomologyUnitType = BLOCK; enum stCaf_DistanceCorrectionMethod phylogenyDistanceCorrectionMethod = JUKES_CANTOR; bool sortAlignments = false; /////////////////////////////////////////////////////////////////////////// // (0) Parse the inputs handed by genomeCactus.py / setup stuff. /////////////////////////////////////////////////////////////////////////// while (1) { static struct option long_options[] = { { "logLevel", required_argument, 0, 'a' }, { "alignments", required_argument, 0, 'b' }, { "cactusDisk", required_argument, 0, 'c' }, { "lastzArguments", required_argument, 0, 'd' }, { "help", no_argument, 0, 'h' }, { "annealingRounds", required_argument, 0, 'i' }, { "trim", required_argument, 0, 'k' }, { "trimChange", required_argument, 0, 'l', }, { "minimumTreeCoverage", required_argument, 0, 'm' }, { "blockTrim", required_argument, 0, 'n' }, { "deannealingRounds", required_argument, 0, 'o' }, { "minimumDegree", required_argument, 0, 'p' }, { "minimumIngroupDegree", required_argument, 0, 'q' }, { "minimumOutgroupDegree", required_argument, 0, 'r' }, { "alignmentFilter", required_argument, 0, 't' }, { "minimumSequenceLengthForBlast", required_argument, 0, 'v' }, { "maxAdjacencyComponentSizeRatio", required_argument, 0, 'w' }, { "constraints", required_argument, 0, 'x' }, { "minLengthForChromosome", required_argument, 0, 'y' }, { "proportionOfUnalignedBasesForNewChromosome", required_argument, 0, 'z' }, { "maximumMedianSequenceLengthBetweenLinkedEnds", required_argument, 0, 'A' }, { "realign", no_argument, 0, 'B' }, { "realignArguments", required_argument, 0, 'C' }, { "phylogenyNumTrees", required_argument, 0, 'D' }, { "phylogenyRootingMethod", required_argument, 0, 'E' }, { "phylogenyScoringMethod", required_argument, 0, 'F' }, { "phylogenyBreakpointScalingFactor", required_argument, 0, 'G' }, { "phylogenySkipSingleCopyBlocks", no_argument, 0, 'H' }, { "phylogenyMaxBaseDistance", required_argument, 0, 'I' }, { "phylogenyMaxBlockDistance", required_argument, 0, 'J' }, { "phylogenyDebugFile", required_argument, 0, 'K' }, { "phylogenyKeepSingleDegreeBlocks", no_argument, 0, 'L' }, { "phylogenyTreeBuildingMethod", required_argument, 0, 'M' }, { "phylogenyCostPerDupPerBase", required_argument, 0, 'N' }, { "phylogenyCostPerLossPerBase", required_argument, 0, 'O' }, { "referenceEventHeader", required_argument, 0, 'P' }, { "phylogenyDoSplitsWithSupportHigherThanThisAllAtOnce", required_argument, 0, 'Q' }, { "numTreeBuildingThreads", required_argument, 0, 'R' }, { "phylogeny", no_argument, 0, 'S' }, { "minimumBlockHomologySupport", required_argument, 0, 'T' }, { "phylogenyNucleotideScalingFactor", required_argument, 0, 'U' }, { "minimumBlockDegreeToCheckSupport", required_argument, 0, 'V' }, { "removeRecoverableChains", required_argument, 0, 'W' }, { "minimumNumberOfSpecies", required_argument, 0, 'X' }, { "phylogenyHomologyUnitType", required_argument, 0, 'Y' }, { "phylogenyDistanceCorrectionMethod", required_argument, 0, 'Z' }, { "maxRecoverableChainsIterations", required_argument, 0, '1' }, { "maxRecoverableChainLength", required_argument, 0, '2' }, { 0, 0, 0, 0 } }; int option_index = 0; key = getopt_long(argc, argv, "a:b:c:hi:k:m:n:o:p:q:r:stv:w:x:y:z:A:BC:D:E:", long_options, &option_index); if (key == -1) { break; } switch (key) { case 'a': logLevelString = stString_copy(optarg); st_setLogLevelFromString(logLevelString); break; case 'b': alignmentsFile = stString_copy(optarg); break; case 'c': cactusDiskDatabaseString = stString_copy(optarg); break; case 'd': lastzArguments = stString_copy(optarg); break; case 'h': usage(); return 0; case 'i': annealingRounds = getInts(optarg, &annealingRoundsLength); break; case 'o': meltingRounds = getInts(optarg, &meltingRoundsLength); break; case 'k': alignmentTrims = getInts(optarg, &alignmentTrimLength); break; case 'm': k = sscanf(optarg, "%f", &minimumTreeCoverage); assert(k == 1); break; case 'n': k = sscanf(optarg, "%" PRIi64 "", &blockTrim); assert(k == 1); break; case 'p': k = sscanf(optarg, "%" PRIi64 "", &minimumDegree); assert(k == 1); break; case 'q': k = sscanf(optarg, "%" PRIi64 "", &minimumIngroupDegree); assert(k == 1); break; case 'r': k = sscanf(optarg, "%" PRIi64 "", &minimumOutgroupDegree); assert(k == 1); break; case 't': if (strcmp(optarg, "singleCopyOutgroup") == 0) { sortAlignments = true; filterFn = stCaf_filterByOutgroup; } else if (strcmp(optarg, "relaxedSingleCopyOutgroup") == 0) { sortAlignments = true; filterFn = stCaf_relaxedFilterByOutgroup; } else if (strcmp(optarg, "singleCopy") == 0) { sortAlignments = true; filterFn = stCaf_filterByRepeatSpecies; } else if (strcmp(optarg, "relaxedSingleCopy") == 0) { sortAlignments = true; filterFn = stCaf_relaxedFilterByRepeatSpecies; } else if (strcmp(optarg, "singleCopyChr") == 0) { sortAlignments = true; filterFn = stCaf_singleCopyChr; } else if (strcmp(optarg, "singleCopyIngroup") == 0) { sortAlignments = true; filterFn = stCaf_singleCopyIngroup; } else if (strcmp(optarg, "relaxedSingleCopyIngroup") == 0) { sortAlignments = true; filterFn = stCaf_relaxedSingleCopyIngroup; } else if (strcmp(optarg, "none") == 0) { sortAlignments = false; filterFn = NULL; } else { st_errAbort("Could not recognize alignmentFilter option %s", optarg); } break; case 'v': k = sscanf(optarg, "%" PRIi64 "", &minimumSequenceLengthForBlast); assert(k == 1); break; case 'w': k = sscanf(optarg, "%f", &maximumAdjacencyComponentSizeRatio); assert(k == 1); break; case 'x': constraintsFile = stString_copy(optarg); break; case 'y': k = sscanf(optarg, "%" PRIi64 "", &minLengthForChromosome); assert(k == 1); break; case 'z': k = sscanf(optarg, "%f", &proportionOfUnalignedBasesForNewChromosome); assert(k == 1); break; case 'A': k = sscanf(optarg, "%" PRIi64 "", &maximumMedianSequenceLengthBetweenLinkedEnds); assert(k == 1); break; case 'B': realign = 1; break; case 'C': realignArguments = stString_copy(optarg); break; case 'D': k = sscanf(optarg, "%" PRIi64, &phylogenyNumTrees); assert(k == 1); break; case 'E': if (!strcmp(optarg, "outgroupBranch")) { phylogenyRootingMethod = OUTGROUP_BRANCH; } else if (!strcmp(optarg, "longestBranch")) { phylogenyRootingMethod = LONGEST_BRANCH; } else if (!strcmp(optarg, "bestRecon")) { phylogenyRootingMethod = BEST_RECON; } else { st_errAbort("Invalid tree rooting method: %s", optarg); } break; case 'F': if (!strcmp(optarg, "reconCost")) { phylogenyScoringMethod = RECON_COST; } else if (!strcmp(optarg, "nucLikelihood")) { phylogenyScoringMethod = NUCLEOTIDE_LIKELIHOOD; } else if (!strcmp(optarg, "reconLikelihood")) { phylogenyScoringMethod = RECON_LIKELIHOOD; } else if (!strcmp(optarg, "combinedLikelihood")) { phylogenyScoringMethod = COMBINED_LIKELIHOOD; } else { st_errAbort("Invalid tree scoring method: %s", optarg); } break; case 'G': k = sscanf(optarg, "%lf", &breakpointScalingFactor); assert(k == 1); break; case 'H': phylogenySkipSingleCopyBlocks = true; break; case 'I': k = sscanf(optarg, "%" PRIi64, &phylogenyMaxBaseDistance); assert(k == 1); break; case 'J': k = sscanf(optarg, "%" PRIi64, &phylogenyMaxBlockDistance); assert(k == 1); break; case 'K': debugFileName = stString_copy(optarg); break; case 'L': phylogenyKeepSingleDegreeBlocks = true; break; case 'M': // clear the default setting of the list stList_destruct(phylogenyTreeBuildingMethods); phylogenyTreeBuildingMethods = stList_construct(); stList *methodStrings = stString_splitByString(optarg, ","); for (int64_t i = 0; i < stList_length(methodStrings); i++) { char *methodString = stList_get(methodStrings, i); enum stCaf_TreeBuildingMethod *method = st_malloc(sizeof(enum stCaf_TreeBuildingMethod)); if (strcmp(methodString, "neighborJoining") == 0) { *method = NEIGHBOR_JOINING; } else if (strcmp(methodString, "guidedNeighborJoining") == 0) { *method = GUIDED_NEIGHBOR_JOINING; } else if (strcmp(methodString, "splitDecomposition") == 0) { *method = SPLIT_DECOMPOSITION; } else if (strcmp(methodString, "strictSplitDecomposition") == 0) { *method = STRICT_SPLIT_DECOMPOSITION; } else if (strcmp(methodString, "removeBadChains") == 0) { *method = REMOVE_BAD_CHAINS; } else { st_errAbort("Unknown tree building method: %s", methodString); } stList_append(phylogenyTreeBuildingMethods, method); } stList_destruct(methodStrings); break; case 'N': k = sscanf(optarg, "%lf", &phylogenyCostPerDupPerBase); assert(k == 1); break; case 'O': k = sscanf(optarg, "%lf", &phylogenyCostPerLossPerBase); assert(k == 1); break; case 'P': referenceEventHeader = stString_copy(optarg); break; case 'Q': k = sscanf(optarg, "%lf", &phylogenyDoSplitsWithSupportHigherThanThisAllAtOnce); assert(k == 1); break; case 'R': k = sscanf(optarg, "%" PRIi64, &numTreeBuildingThreads); assert(k == 1); break; case 'S': doPhylogeny = true; break; case 'T': k = sscanf(optarg, "%lf", &minimumBlockHomologySupport); assert(k == 1); assert(minimumBlockHomologySupport <= 1.0); assert(minimumBlockHomologySupport >= 0.0); break; case 'U': k = sscanf(optarg, "%lf", &nucleotideScalingFactor); assert(k == 1); break; case 'V': k = sscanf(optarg, "%" PRIi64, &minimumBlockDegreeToCheckSupport); assert(k == 1); break; case 'W': if (strcmp(optarg, "1") == 0) { removeRecoverableChains = true; recoverableChainsFilter = NULL; } else if (strcmp(optarg, "unequalNumberOfIngroupCopies") == 0) { removeRecoverableChains = true; recoverableChainsFilter = stCaf_chainHasUnequalNumberOfIngroupCopies; } else if (strcmp(optarg, "unequalNumberOfIngroupCopiesOrNoOutgroup") == 0) { removeRecoverableChains = true; recoverableChainsFilter = stCaf_chainHasUnequalNumberOfIngroupCopiesOrNoOutgroup; } else if (strcmp(optarg, "0") == 0) { removeRecoverableChains = false; } else { st_errAbort("Could not parse removeRecoverableChains argument"); } break; case 'X': k = sscanf(optarg, "%" PRIi64, &minimumNumberOfSpecies); if (k != 1) { st_errAbort("Error parsing the minimumNumberOfSpecies argument"); } break; case 'Y': if (strcmp(optarg, "chain") == 0) { phylogenyHomologyUnitType = CHAIN; } else if (strcmp(optarg, "block") == 0) { phylogenyHomologyUnitType = BLOCK; } else { st_errAbort("Could not parse the phylogenyHomologyUnitType argument"); } break; case 'Z': if (strcmp(optarg, "jukesCantor") == 0) { phylogenyDistanceCorrectionMethod = JUKES_CANTOR; } else if (strcmp(optarg, "none") == 0 ) { phylogenyDistanceCorrectionMethod = NONE; } else { st_errAbort("Could not parse the phylogenyDistanceCorrectionMethod argument"); } break; case '1': k = sscanf(optarg, "%" PRIi64, &maxRecoverableChainsIterations); if (k != 1) { st_errAbort("Error parsing the maxRecoverableChainsIterations argument"); } break; case '2': k = sscanf(optarg, "%" PRIi64, &maxRecoverableChainLength); if (k != 1) { st_errAbort("Error parsing the maxRecoverableChainLength argument"); } break; default: usage(); return 1; } } /////////////////////////////////////////////////////////////////////////// // (0) Check the inputs. /////////////////////////////////////////////////////////////////////////// assert(cactusDiskDatabaseString != NULL); assert(minimumTreeCoverage >= 0.0); assert(minimumTreeCoverage <= 1.0); assert(blockTrim >= 0); assert(annealingRoundsLength >= 0); for (int64_t i = 0; i < annealingRoundsLength; i++) { assert(annealingRounds[i] >= 0); } assert(meltingRoundsLength >= 0); for (int64_t i = 1; i < meltingRoundsLength; i++) { assert(meltingRounds[i - 1] < meltingRounds[i]); assert(meltingRounds[i - 1] >= 1); } assert(alignmentTrimLength >= 0); for (int64_t i = 0; i < alignmentTrimLength; i++) { assert(alignmentTrims[i] >= 0); } assert(minimumOutgroupDegree >= 0); assert(minimumIngroupDegree >= 0); ////////////////////////////////////////////// //Set up logging ////////////////////////////////////////////// st_setLogLevelFromString(logLevelString); ////////////////////////////////////////////// //Log (some of) the inputs ////////////////////////////////////////////// st_logInfo("Flower disk name : %s\n", cactusDiskDatabaseString); ////////////////////////////////////////////// //Load the database ////////////////////////////////////////////// kvDatabaseConf = stKVDatabaseConf_constructFromString(cactusDiskDatabaseString); cactusDisk = cactusDisk_construct(kvDatabaseConf, 0); st_logInfo("Set up the flower disk\n"); /////////////////////////////////////////////////////////////////////////// // Sort the constraints /////////////////////////////////////////////////////////////////////////// stPinchIterator *pinchIteratorForConstraints = NULL; if (constraintsFile != NULL) { pinchIteratorForConstraints = stPinchIterator_constructFromFile(constraintsFile); st_logInfo("Created an iterator for the alignment constaints from file: %s\n", constraintsFile); } /////////////////////////////////////////////////////////////////////////// // Do the alignment /////////////////////////////////////////////////////////////////////////// startTime = time(NULL); stList *flowers = flowerWriter_parseFlowersFromStdin(cactusDisk); if (alignmentsFile == NULL) { cactusDisk_preCacheStrings(cactusDisk, flowers); } char *tempFile1 = NULL; for (int64_t i = 0; i < stList_length(flowers); i++) { flower = stList_get(flowers, i); if (!flower_builtBlocks(flower)) { // Do nothing if the flower already has defined blocks st_logDebug("Processing flower: %lli\n", flower_getName(flower)); stCaf_setFlowerForAlignmentFiltering(flower); //Set up the graph and add the initial alignments stPinchThreadSet *threadSet = stCaf_setup(flower); //Build the set of outgroup threads outgroupThreads = stCaf_getOutgroupThreads(flower, threadSet); //Setup the alignments stPinchIterator *pinchIterator; stList *alignmentsList = NULL; if (alignmentsFile != NULL) { assert(i == 0); assert(stList_length(flowers) == 1); if (sortAlignments) { tempFile1 = getTempFile(); stCaf_sortCigarsFileByScoreInDescendingOrder(alignmentsFile, tempFile1); pinchIterator = stPinchIterator_constructFromFile(tempFile1); } else { pinchIterator = stPinchIterator_constructFromFile(alignmentsFile); } } else { if (tempFile1 == NULL) { tempFile1 = getTempFile(); } alignmentsList = stCaf_selfAlignFlower(flower, minimumSequenceLengthForBlast, lastzArguments, realign, realignArguments, tempFile1); if (sortAlignments) { stCaf_sortCigarsByScoreInDescendingOrder(alignmentsList); } st_logDebug("Ran lastz and have %" PRIi64 " alignments\n", stList_length(alignmentsList)); pinchIterator = stPinchIterator_constructFromList(alignmentsList); } for (int64_t annealingRound = 0; annealingRound < annealingRoundsLength; annealingRound++) { int64_t minimumChainLength = annealingRounds[annealingRound]; int64_t alignmentTrim = annealingRound < alignmentTrimLength ? alignmentTrims[annealingRound] : 0; st_logDebug("Starting annealing round with a minimum chain length of %" PRIi64 " and an alignment trim of %" PRIi64 "\n", minimumChainLength, alignmentTrim); stPinchIterator_setTrim(pinchIterator, alignmentTrim); //Add back in the constraints if (pinchIteratorForConstraints != NULL) { stCaf_anneal(threadSet, pinchIteratorForConstraints, filterFn); } //Do the annealing if (annealingRound == 0) { stCaf_anneal(threadSet, pinchIterator, filterFn); } else { stCaf_annealBetweenAdjacencyComponents(threadSet, pinchIterator, filterFn); } // Dump the block degree and length distribution to a file if (debugFileName != NULL) { dumpBlockInfo(threadSet, stString_print("%s-blockStats-preMelting", debugFileName)); } printf("Sequence graph statistics after annealing:\n"); printThreadSetStatistics(threadSet, flower, stdout); // Check for poorly-supported blocks--those that have // been transitively aligned together but with very // few homologies supporting the transitive // alignment. These "megablocks" can snarl up the // graph so that a lot of extra gets thrown away in // the first melting step. stPinchThreadSetBlockIt blockIt = stPinchThreadSet_getBlockIt(threadSet); stPinchBlock *block; while ((block = stPinchThreadSetBlockIt_getNext(&blockIt)) != NULL) { if (stPinchBlock_getDegree(block) > minimumBlockDegreeToCheckSupport) { uint64_t supportingHomologies = stPinchBlock_getNumSupportingHomologies(block); uint64_t possibleSupportingHomologies = numPossibleSupportingHomologies(block, flower); double support = ((double) supportingHomologies) / possibleSupportingHomologies; if (support < minimumBlockHomologySupport) { fprintf(stdout, "Destroyed a megablock with degree %" PRIi64 " and %" PRIi64 " supporting homologies out of a maximum " "of %" PRIi64 " (%lf%%).\n", stPinchBlock_getDegree(block), supportingHomologies, possibleSupportingHomologies, support); stPinchBlock_destruct(block); } } } //Do the melting rounds for (int64_t meltingRound = 0; meltingRound < meltingRoundsLength; meltingRound++) { int64_t minimumChainLengthForMeltingRound = meltingRounds[meltingRound]; st_logDebug("Starting melting round with a minimum chain length of %" PRIi64 " \n", minimumChainLengthForMeltingRound); if (minimumChainLengthForMeltingRound >= minimumChainLength) { break; } stCaf_melt(flower, threadSet, NULL, 0, minimumChainLengthForMeltingRound, 0, INT64_MAX); } st_logDebug("Last melting round of cycle with a minimum chain length of %" PRIi64 " \n", minimumChainLength); stCaf_melt(flower, threadSet, NULL, 0, minimumChainLength, breakChainsAtReverseTandems, maximumMedianSequenceLengthBetweenLinkedEnds); //This does the filtering of blocks that do not have the required species/tree-coverage/degree. stCaf_melt(flower, threadSet, blockFilterFn, blockTrim, 0, 0, INT64_MAX); } if (removeRecoverableChains) { stCaf_meltRecoverableChains(flower, threadSet, breakChainsAtReverseTandems, maximumMedianSequenceLengthBetweenLinkedEnds, recoverableChainsFilter, maxRecoverableChainsIterations, maxRecoverableChainLength); } if (debugFileName != NULL) { dumpBlockInfo(threadSet, stString_print("%s-blockStats-postMelting", debugFileName)); } printf("Sequence graph statistics after melting:\n"); printThreadSetStatistics(threadSet, flower, stdout); // Build a tree for each block, then use each tree to // partition the homologies between the ingroups sequences // into those that occur before the speciation with the // outgroup and those which occur late. if (stSet_size(outgroupThreads) > 0 && doPhylogeny) { st_logDebug("Starting to build trees and partition ingroup homologies\n"); stHash *threadStrings = stCaf_getThreadStrings(flower, threadSet); st_logDebug("Got sets of thread strings and set of threads that are outgroups\n"); stCaf_PhylogenyParameters params; params.distanceCorrectionMethod = phylogenyDistanceCorrectionMethod; params.treeBuildingMethods = phylogenyTreeBuildingMethods; params.rootingMethod = phylogenyRootingMethod; params.scoringMethod = phylogenyScoringMethod; params.breakpointScalingFactor = breakpointScalingFactor; params.nucleotideScalingFactor = nucleotideScalingFactor; params.skipSingleCopyBlocks = phylogenySkipSingleCopyBlocks; params.keepSingleDegreeBlocks = phylogenyKeepSingleDegreeBlocks; params.costPerDupPerBase = phylogenyCostPerDupPerBase; params.costPerLossPerBase = phylogenyCostPerLossPerBase; params.maxBaseDistance = phylogenyMaxBaseDistance; params.maxBlockDistance = phylogenyMaxBlockDistance; params.numTrees = phylogenyNumTrees; params.ignoreUnalignedBases = 1; params.onlyIncludeCompleteFeatureBlocks = 0; params.doSplitsWithSupportHigherThanThisAllAtOnce = phylogenyDoSplitsWithSupportHigherThanThisAllAtOnce; params.numTreeBuildingThreads = numTreeBuildingThreads; assert(params.numTreeBuildingThreads >= 1); stCaf_buildTreesToRemoveAncientHomologies( threadSet, phylogenyHomologyUnitType, threadStrings, outgroupThreads, flower, ¶ms, debugFileName == NULL ? NULL : stString_print("%s-phylogeny", debugFileName), referenceEventHeader); stHash_destruct(threadStrings); st_logDebug("Finished building trees\n"); if (removeRecoverableChains) { // We melt recoverable chains after splitting, as // well as before, to alleviate coverage loss // caused by bad splits. stCaf_meltRecoverableChains(flower, threadSet, breakChainsAtReverseTandems, maximumMedianSequenceLengthBetweenLinkedEnds, recoverableChainsFilter, maxRecoverableChainsIterations, maxRecoverableChainLength); } // Enforce the block constraints on minimum degree, // etc. after splitting. stCaf_melt(flower, threadSet, blockFilterFn, 0, 0, 0, INT64_MAX); } //Sort out case when we allow blocks of degree 1 if (minimumDegree < 2) { st_logDebug("Creating degree 1 blocks\n"); stCaf_makeDegreeOneBlocks(threadSet); stCaf_melt(flower, threadSet, blockFilterFn, blockTrim, 0, 0, INT64_MAX); } else if (maximumAdjacencyComponentSizeRatio < INT64_MAX) { //Deal with giant components st_logDebug("Breaking up components greedily\n"); stCaf_breakupComponentsGreedily(threadSet, maximumAdjacencyComponentSizeRatio); } //Finish up stCaf_finish(flower, threadSet, chainLengthForBigFlower, longChain, minLengthForChromosome, proportionOfUnalignedBasesForNewChromosome); //Flower is then destroyed at this point. st_logInfo("Ran the cactus core script\n"); //Cleanup stPinchThreadSet_destruct(threadSet); stPinchIterator_destruct(pinchIterator); stSet_destruct(outgroupThreads); if (alignmentsList != NULL) { stList_destruct(alignmentsList); } st_logInfo("Cleaned up from main loop\n"); } else { st_logInfo("We've already built blocks / alignments for this flower\n"); } } stList_destruct(flowers); if (tempFile1 != NULL) { st_system("rm %s", tempFile1); } if (constraintsFile != NULL) { stPinchIterator_destruct(pinchIteratorForConstraints); } /////////////////////////////////////////////////////////////////////////// // Write the flower to disk. /////////////////////////////////////////////////////////////////////////// st_logDebug("Writing the flowers to disk\n"); cactusDisk_write(cactusDisk); st_logInfo("Updated the flower on disk and %" PRIi64 " seconds have elapsed\n", time(NULL) - startTime); /////////////////////////////////////////////////////////////////////////// // Clean up. /////////////////////////////////////////////////////////////////////////// cactusDisk_destruct(cactusDisk); }