/*
 * Decides whether the adjacency at the terminal cap of `cap` is also present
 * in a cap that carries one of the events in `eventStrings`.
 *
 * Returns 0 straight away when getTerminalAdjacencyLength(cap) is positive.
 * Otherwise every cap in the terminal cap's end is inspected; the result is 1
 * as soon as one is found that
 *   - is adjacent to a cap in the same (positively oriented) end as the
 *     terminal cap's adjacency,
 *   - satisfies capHasGivenEvents(), and
 *   - itself has a terminal adjacency length of zero.
 * Returns 0 when no cap qualifies.
 */
bool trueAdjacency(Cap *cap, stList *eventStrings) {
    if (getTerminalAdjacencyLength(cap) > 0) {
        return 0;
    }

    Cap *terminalCap = getTerminalCap(cap);
    assert(terminalCap != NULL);
    Cap *partnerCap = cap_getAdjacency(terminalCap);
    assert(partnerCap != NULL);
    assert(cap_getAdjacency(partnerCap) == terminalCap);

    //The end on the far side of the adjacency we are trying to confirm.
    End *partnerEnd = end_getPositiveOrientation(cap_getEnd(partnerCap));

    bool found = 0;
    End_InstanceIterator *capIt = end_getInstanceIterator(cap_getEnd(terminalCap));
    Cap *candidate;
    while (!found && (candidate = end_getNext(capIt)) != NULL) {
        Cap *candidatePartner = cap_getAdjacency(candidate);
        assert(candidatePartner != NULL);
        if (partnerEnd != end_getPositiveOrientation(cap_getEnd(candidatePartner))) {
            continue; //Adjacent to some other end, so not the same adjacency.
        }
        assert(event_getHeader(cap_getEvent(candidate)) == event_getHeader(cap_getEvent(candidatePartner)));
        found = capHasGivenEvents(candidate, eventStrings) && getTerminalAdjacencyLength(candidate) == 0;
    }
    end_destructInstanceIterator(capIt);
    return found;
}
/*
 * Searches for adjacent cap pairs (one cap from end1, one from end2), where
 * the cap from end1 must satisfy capHasGivenEvents().
 *
 * Outputs:
 *   *returnCap1 / *returnCap2 - the adjacent pair with the smallest distance
 *       reported by capsAreAdjacent(), or NULL / NULL if there is none.
 *   *minimumDistanceBetweenHaplotypeCaps - that smallest distance, or
 *       INT64_MAX if there is none.
 *
 * Returns non-zero iff at least one adjacent pair was found.
 */
bool endsAreAdjacent2(End *end1, End *end2, Cap **returnCap1, Cap **returnCap2,
        int64_t *minimumDistanceBetweenHaplotypeCaps, stList *eventStrings) {
    *returnCap1 = NULL;
    *returnCap2 = NULL;
    *minimumDistanceBetweenHaplotypeCaps = INT64_MAX;
    bool foundAdjacentPair = 0;

    End_InstanceIterator *outerIt = end_getInstanceIterator(end1);
    Cap *outerCap;
    while ((outerCap = end_getNext(outerIt)) != NULL) {
        if (!capHasGivenEvents(outerCap, eventStrings)) {
            continue;
        }
        End_InstanceIterator *innerIt = end_getInstanceIterator(end2);
        Cap *innerCap;
        while ((innerCap = end_getNext(innerIt)) != NULL) {
            int64_t distance; //Written by capsAreAdjacent.
            if (!capsAreAdjacent(outerCap, innerCap, &distance)) {
                continue;
            }
            foundAdjacentPair = 1;
            if (distance < *minimumDistanceBetweenHaplotypeCaps) {
                //New closest pair.
                *minimumDistanceBetweenHaplotypeCaps = distance;
                *returnCap1 = outerCap;
                *returnCap2 = innerCap;
            }
        }
        end_destructInstanceIterator(innerIt);
    }
    end_destructInstanceIterator(outerIt);
    return foundAdjacentPair;
}
/*
 * Returns 1 if the end contains at least one cap whose event header differs
 * from eventString, and 0 otherwise (including when the end has no caps).
 */
bool hasCapNotInEvent(End *end, const char *eventString) {
    bool foundOtherEvent = 0;
    End_InstanceIterator *capIt = end_getInstanceIterator(end);
    Cap *cap;
    while ((cap = end_getNext(capIt)) != NULL) {
        const char *header = event_getHeader(cap_getEvent(cap));
        if (strcmp(header, eventString) != 0) {
            foundOtherEvent = 1;
            break;
        }
    }
    end_destructInstanceIterator(capIt);
    return foundOtherEvent;
}
/*
 * Returns the first cap of the end (in instance-iterator order) whose event
 * name equals referenceEventName, or NULL if the end has no such cap.
 */
Cap *getCapForReferenceEvent(End *end, Name referenceEventName) {
    Cap *match = NULL;
    End_InstanceIterator *capIt = end_getInstanceIterator(end);
    Cap *candidate;
    while ((candidate = end_getNext(capIt)) != NULL) {
        if (event_getName(cap_getEvent(candidate)) == referenceEventName) {
            match = candidate;
            break;
        }
    }
    end_destructInstanceIterator(capIt);
    return match;
}
/*
 * Creates adjacencies between the caps of a flower.
 *
 * All caps of all ends are gathered (each taken in the orientation for which
 * cap_getStrand() is true), sorted with addAdjacenciesPP, and then joined
 * pairwise in sorted order: element 0 with 1, element 2 with 3, and so on.
 * The number of caps is asserted to be even.
 */
void stCaf_addAdjacencies(Flower *flower) {
    stList *caps = stList_construct();

    //Gather every cap in the flower.
    Flower_EndIterator *endIt = flower_getEndIterator(flower);
    End *end;
    while ((end = flower_getNextEnd(endIt)) != NULL) {
        End_InstanceIterator *capIt = end_getInstanceIterator(end);
        Cap *cap;
        while ((cap = end_getNext(capIt)) != NULL) {
            stList_append(caps, cap_getStrand(cap) ? cap : cap_getReverse(cap));
        }
        end_destructInstanceIterator(capIt);
    }
    flower_destructEndIterator(endIt);
    assert(stList_length(caps) % 2 == 0);

    //Order the caps so that partners end up next to one another.
    stList_sort(caps, (int(*)(const void *, const void *)) addAdjacenciesPP);

    //Join consecutive pairs.
    for (int64_t i = 0; i + 1 < stList_length(caps); i += 2) {
        Cap *first = stList_get(caps, i);
        Cap *second = stList_get(caps, i + 1);
        cap_makeAdjacent(first, second);
    }

    stList_destruct(caps);
}
/*
 * Exercises the end instance iterator: forward traversal, copying an iterator
 * part-way through a traversal, backward traversal, and traversal of the
 * reverse-oriented end.
 */
void testEnd_instanceIterator(CuTest* testCase) {
    cactusEndTestSetup();

    enum { CAP_NUMBER = 4, COPY_POSITION = 2 };

    //Order in which the caps are expected when iterating over 'end'.
    Cap *forwardCaps[CAP_NUMBER] = { cap_getReverse(rootCap), cap_getReverse(leaf1Cap), leaf2Cap,
            cap_getReverse(leaf3Cap) };

    End_InstanceIterator *iterator = end_getInstanceIterator(end);
    CuAssertTrue(testCase, iterator != NULL);
    for (int i = 0; i < COPY_POSITION; i++) {
        CuAssertTrue(testCase, end_getNext(iterator) == forwardCaps[i]);
    }

    //Copy taken after COPY_POSITION steps; it must continue from that position.
    End_InstanceIterator *iterator2 = end_copyInstanceIterator(iterator);

    //Finish the traversal and walk all the way back, first with the original
    //iterator and then with the copy.
    End_InstanceIterator *iterators[2] = { iterator, iterator2 };
    for (int k = 0; k < 2; k++) {
        End_InstanceIterator *it = iterators[k];
        for (int i = COPY_POSITION; i < CAP_NUMBER; i++) {
            CuAssertTrue(testCase, end_getNext(it) == forwardCaps[i]);
        }
        CuAssertTrue(testCase, end_getNext(it) == NULL);
        for (int i = CAP_NUMBER - 1; i >= 0; i--) {
            CuAssertTrue(testCase, end_getPrevious(it) == forwardCaps[i]);
        }
        CuAssertTrue(testCase, end_getPrevious(it) == NULL);
    }

    end_destructInstanceIterator(iterator);
    end_destructInstanceIterator(iterator2);

    //Order expected when iterating over the reverse-oriented end.
    Cap *reverseCaps[CAP_NUMBER] = { rootCap, leaf1Cap, cap_getReverse(leaf2Cap), leaf3Cap };

    iterator = end_getInstanceIterator(end_getReverse(end));
    for (int i = 0; i < CAP_NUMBER; i++) {
        CuAssertTrue(testCase, end_getNext(iterator) == reverseCaps[i]);
    }
    CuAssertTrue(testCase, end_getNext(iterator) == NULL);
    for (int i = CAP_NUMBER - 1; i >= 0; i--) {
        CuAssertTrue(testCase, end_getPrevious(iterator) == reverseCaps[i]);
    }
    CuAssertTrue(testCase, end_getPrevious(iterator) == NULL);

    end_destructInstanceIterator(iterator);

    cactusEndTestTeardown();
}
/*
 * Reports whether two ends are connected through a sequence of one of the
 * given events.
 *
 * If both ends have the same name they are treated as the same end, and the
 * result is 1 iff any cap of end1 satisfies capHasGivenEvents().
 *
 * Otherwise the result is 1 iff there is a cap of end1 that satisfies
 * capHasGivenEvents() and a cap of end2 such that both caps' sequences share
 * the same meta-sequence.
 *
 * Every instance iterator created here is destructed on every path. The
 * same-name branch previously returned 0 without releasing its iterator.
 */
bool endsAreConnected(End *end1, End *end2, stList *eventStrings) {
    bool connected = 0;
    End_InstanceIterator *instanceIterator = end_getInstanceIterator(end1);
    Cap *cap1;

    if (end_getName(end1) == end_getName(end2)) {
        //Then the ends are the same and are part of the same chromosome by definition.
        while ((cap1 = end_getNext(instanceIterator)) != NULL) {
            if (capHasGivenEvents(cap1, eventStrings)) {
                connected = 1;
                break;
            }
        }
        end_destructInstanceIterator(instanceIterator);
        return connected;
    }

    while (!connected && (cap1 = end_getNext(instanceIterator)) != NULL) {
        if (!capHasGivenEvents(cap1, eventStrings)) {
            continue;
        }
        End_InstanceIterator *instanceIterator2 = end_getInstanceIterator(end2);
        Cap *cap2;
        while (!connected && (cap2 = end_getNext(instanceIterator2)) != NULL) {
            assert(cap_getName(cap2) != cap_getName(cap1)); //This could only happen if end1 == end2
            if (sequence_getMetaSequence(cap_getSequence(cap1))
                    == sequence_getMetaSequence(cap_getSequence(cap2))) {
                assert(strcmp(event_getHeader(cap_getEvent(cap1)), event_getHeader(cap_getEvent(cap2))) == 0);
                assert(cap_getPositiveOrientation(cap1) != cap_getPositiveOrientation(cap2));
                //They could have the same coordinate if they represent two ends of a block of length 1.
                assert(cap_getName(cap1) != cap_getName(cap2));
                connected = 1;
            }
        }
        end_destructInstanceIterator(instanceIterator2);
    }
    end_destructInstanceIterator(instanceIterator);
    return connected;
}
/*
 * Builds the set of event headers that occur on caps of the given end and
 * also appear in eventStrings.
 *
 * The set is ordered with strcmp and stores the header pointers obtained from
 * event_getHeader(), not copies. The caller receives the newly constructed
 * set.
 */
static stSortedSet *getEventStrings(End *end, stList *eventStrings) {
    stSortedSet *presentEvents = stSortedSet_construct3((int(*)(const void *, const void *)) strcmp, NULL);

    End_InstanceIterator *capIt = end_getInstanceIterator(end);
    Cap *cap;
    while ((cap = end_getNext(capIt)) != NULL) {
        const char *capHeader = event_getHeader(cap_getEvent(cap));
        int64_t i = 0;
        while (i < stList_length(eventStrings)) {
            const char *wantedHeader = stList_get(eventStrings, i);
            if (strcmp(wantedHeader, capHeader) == 0) {
                stSortedSet_insert(presentEvents, (void *) capHeader);
            }
            i++;
        }
    }
    end_destructInstanceIterator(capIt);

    return presentEvents;
}
/*
 * Sums the lengths of the sequence intervals that pass through the flower.
 *
 * For every non-block end, each cap that (once put on the strand for which
 * cap_getStrand() is true) has a false side and a sequence is used as the
 * start of an interval. From there the adjacency / segment chain is followed
 * across block ends until a cap on a non-block end is reached. The number of
 * bases strictly between the two coordinates is added to the total.
 *
 * The implementation of this function is very like that in
 * group_getTotalBaseLength, with a few differences. Consider merging them.
 */
int64_t flower_getTotalBaseLength(Flower *flower) {
    int64_t total = 0;

    Flower_EndIterator *endIt = flower_getEndIterator(flower);
    End *end;
    while ((end = flower_getNextEnd(endIt)) != NULL) {
        if (end_isBlockEnd(end)) {
            continue;
        }
        End_InstanceIterator *capIt = end_getInstanceIterator(end);
        Cap *cap;
        while ((cap = end_getNext(capIt)) != NULL) {
            if (!cap_getStrand(cap)) {
                cap = cap_getReverse(cap);
            }
            if (cap_getSide(cap) || cap_getSequence(cap) == NULL) {
                continue; //Not the start of a sequence interval.
            }

            //Follow the thread until it leaves through a non-block end.
            Cap *walker = cap_getAdjacency(cap);
            assert(walker != NULL);
            while (end_isBlockEnd(cap_getEnd(walker))) {
                Segment *segment = cap_getSegment(walker);
                assert(segment != NULL);
                assert(segment_get5Cap(segment) == walker);
                walker = cap_getAdjacency(segment_get3Cap(segment));
                assert(walker != NULL);
                assert(cap_getStrand(walker));
                assert(cap_getSide(walker));
            }
            assert(cap_getStrand(walker));
            assert(cap_getSide(walker));

            int64_t intervalLength = cap_getCoordinate(walker) - cap_getCoordinate(cap) - 1;
            assert(intervalLength >= 0);
            total += intervalLength;
        }
        end_destructInstanceIterator(capIt);
    }
    flower_destructEndIterator(endIt);

    return total;
}
/*
 * Collects one substring descriptor for each non-empty sequence interval
 * that starts at a stub-end cap in the given flowers.
 *
 * A cap contributes when it has a sequence and, once put on the strand for
 * which cap_getStrand() is true, has a false side. The interval runs between
 * that cap and its adjacent cap, exclusive of both. Each descriptor is built
 * from the meta-sequence's stringName, the interval start relative to
 * sequence_getStart(), and the interval length.
 *
 * The returned list is constructed with substring_destruct as its element
 * destructor.
 */
static stList *getSubstringsForFlowers(stList *flowers) {
    stList *substrings = stList_construct3(0, (void (*)(void *)) substring_destruct);

    for (int64_t flowerIndex = 0; flowerIndex < stList_length(flowers); flowerIndex++) {
        Flower *flower = stList_get(flowers, flowerIndex);
        Flower_EndIterator *endIt = flower_getEndIterator(flower);
        End *end;
        while ((end = flower_getNextEnd(endIt)) != NULL) {
            if (!end_isStubEnd(end)) {
                continue;
            }
            End_InstanceIterator *capIt = end_getInstanceIterator(end);
            Cap *cap;
            while ((cap = end_getNext(capIt)) != NULL) {
                Sequence *sequence = cap_getSequence(cap);
                if (sequence == NULL) {
                    continue;
                }
                if (!cap_getStrand(cap)) {
                    cap = cap_getReverse(cap);
                }
                if (cap_getSide(cap)) {
                    continue;
                }
                //We have a sequence interval of interest
                Cap *adjacentCap = cap_getAdjacency(cap);
                assert(adjacentCap != NULL);
                int64_t length = cap_getCoordinate(adjacentCap) - cap_getCoordinate(cap) - 1;
                assert(length >= 0);
                if (length > 0) {
                    stList_append(substrings,
                            substring_construct(sequence_getMetaSequence(sequence)->stringName,
                                    cap_getCoordinate(cap) + 1 - sequence_getStart(sequence), length));
                }
            }
            end_destructInstanceIterator(capIt);
        }
        flower_destructEndIterator(endIt);
    }

    return substrings;
}
/*
 * Aligns the adjacency sequences that leave the caps of an end and returns
 * the result as a sorted set of AlignedPair objects.
 *
 * Steps:
 *  1. For each cap of the end (reversed when cap_getSide() is true) build an
 *     AdjacencySequence and a SeqFrag tagged with the name of the positively
 *     oriented end at the other side of the cap's adjacency. Count how many
 *     sequences lead to each such end.
 *  2. Run makeAlignment() on the SeqFrags.
 *  3. For each sequence, count the chosen pairwise alignments it takes part
 *     in, split by whether the two sequences share the same other end.
 *  4. From those counts derive two per-sequence score multipliers:
 *       non-common: nonCommonInstanceNumber / number of non-common alignments
 *       common:     (commonInstanceNumber - 1) / number of common alignments
 *     A multiplier is set to INT64_MIN as a "not applicable" marker when the
 *     corresponding count is zero.
 *  5. Turn every aligned pair of the multiple alignment into an AlignedPair
 *     (and its reverse) with the reweighted scores, consuming
 *     mA->alignedPairs in the process.
 *
 * The returned set is built with alignedPair_cmpFn as comparator and
 * alignedPair_destruct as element destructor. All intermediate structures
 * are released before returning.
 */
stSortedSet *makeEndAlignment(StateMachine *sM, End *end, int64_t spanningTrees, int64_t maxSequenceLength,
        bool useProgressiveMerging, float gapGamma,
        PairwiseAlignmentParameters *pairwiseAlignmentBandingParameters) {
    //Step 1: gather the adjacency sequences to be aligned.
    End_InstanceIterator *capIt = end_getInstanceIterator(end);
    stList *sequences = stList_construct3(0, (void (*)(void *))adjacencySequence_destruct);
    stList *seqFrags = stList_construct3(0, (void (*)(void *))seqFrag_destruct);
    stHash *endInstanceNumbers = stHash_construct2(NULL, free);
    Cap *cap;
    while ((cap = end_getNext(capIt)) != NULL) {
        if (cap_getSide(cap)) {
            cap = cap_getReverse(cap);
        }
        AdjacencySequence *adjacencySequence = adjacencySequence_construct(cap, maxSequenceLength);
        stList_append(sequences, adjacencySequence);
        assert(cap_getAdjacency(cap) != NULL);
        End *otherEnd = end_getPositiveOrientation(cap_getEnd(cap_getAdjacency(cap)));
        stList_append(seqFrags, seqFrag_construct(adjacencySequence->string, 0, end_getName(otherEnd)));

        //Bump the number of sequences that lead to otherEnd.
        int64_t *instanceCount = stHash_search(endInstanceNumbers, otherEnd);
        if (instanceCount == NULL) {
            instanceCount = st_calloc(1, sizeof(int64_t));
            assert(*instanceCount == 0);
            stHash_insert(endInstanceNumbers, otherEnd, instanceCount);
        }
        *instanceCount += 1;
    }
    end_destructInstanceIterator(capIt);

    //Step 2: compute the multiple alignment.
    MultipleAlignment *mA = makeAlignment(sM, seqFrags, spanningTrees, 100000000, useProgressiveMerging,
            gapGamma, pairwiseAlignmentBandingParameters);

    //Step 3: per-sequence counts of chosen pairwise alignments, kept apart
    //for pairs that do / do not share the same other end.
    const int64_t sequenceNumber = stList_length(seqFrags);
    int64_t *pairwiseAlignmentsPerSequenceNonCommonEnds = st_calloc(sequenceNumber, sizeof(int64_t));
    int64_t *pairwiseAlignmentsPerSequenceCommonEnds = st_calloc(sequenceNumber, sizeof(int64_t));
    for (int64_t k = 0; k < stList_length(mA->chosenPairwiseAlignments); k++) {
        stIntTuple *pairwiseAlignment = stList_get(mA->chosenPairwiseAlignments, k);
        int64_t seq1 = stIntTuple_get(pairwiseAlignment, 1);
        int64_t seq2 = stIntTuple_get(pairwiseAlignment, 2);
        assert(seq1 != seq2);
        SeqFrag *seqFrag1 = stList_get(seqFrags, seq1);
        SeqFrag *seqFrag2 = stList_get(seqFrags, seq2);
        if (seqFrag1->rightEndId == seqFrag2->rightEndId) {
            pairwiseAlignmentsPerSequenceCommonEnds[seq1]++;
            pairwiseAlignmentsPerSequenceCommonEnds[seq2]++;
        } else {
            pairwiseAlignmentsPerSequenceNonCommonEnds[seq1]++;
            pairwiseAlignmentsPerSequenceNonCommonEnds[seq2]++;
        }
    }

    //Step 4: score multipliers.
    double *scoreAdjustmentsNonCommonEnds = st_malloc(sequenceNumber * sizeof(double));
    double *scoreAdjustmentsCommonEnds = st_malloc(sequenceNumber * sizeof(double));
    for (int64_t k = 0; k < sequenceNumber; k++) {
        SeqFrag *seqFrag = stList_get(seqFrags, k);
        End *otherEnd = flower_getEnd(end_getFlower(end), seqFrag->rightEndId);
        assert(otherEnd != NULL);
        assert(stHash_search(endInstanceNumbers, otherEnd) != NULL);
        int64_t commonInstanceNumber = *(int64_t *)stHash_search(endInstanceNumbers, otherEnd);
        int64_t nonCommonInstanceNumber = sequenceNumber - commonInstanceNumber;
        assert(commonInstanceNumber > 0 && nonCommonInstanceNumber >= 0);
        assert(pairwiseAlignmentsPerSequenceNonCommonEnds[k] <= nonCommonInstanceNumber);
        assert(pairwiseAlignmentsPerSequenceNonCommonEnds[k] >= 0);
        assert(pairwiseAlignmentsPerSequenceCommonEnds[k] < commonInstanceNumber);
        assert(pairwiseAlignmentsPerSequenceCommonEnds[k] >= 0);

        if (pairwiseAlignmentsPerSequenceNonCommonEnds[k] > 0) {
            scoreAdjustmentsNonCommonEnds[k] = ((double)nonCommonInstanceNumber)
                    / pairwiseAlignmentsPerSequenceNonCommonEnds[k];
            assert(scoreAdjustmentsNonCommonEnds[k] >= 1.0);
            assert(scoreAdjustmentsNonCommonEnds[k] <= nonCommonInstanceNumber);
        } else {
            scoreAdjustmentsNonCommonEnds[k] = INT64_MIN; //Marker: no such alignments.
        }

        if (pairwiseAlignmentsPerSequenceCommonEnds[k] > 0) {
            scoreAdjustmentsCommonEnds[k] = ((double)commonInstanceNumber-1)
                    / pairwiseAlignmentsPerSequenceCommonEnds[k];
            assert(scoreAdjustmentsCommonEnds[k] >= 1.0);
            assert(scoreAdjustmentsCommonEnds[k] <= commonInstanceNumber-1);
        } else {
            scoreAdjustmentsCommonEnds[k] = INT64_MIN; //Marker: no such alignments.
        }
    }

    //Step 5: convert the alignment pairs to an alignment of the caps.
    stSortedSet *sortedAlignment = stSortedSet_construct3(
            (int (*)(const void *, const void *))alignedPair_cmpFn, (void (*)(void *))alignedPair_destruct);
    while (stList_length(mA->alignedPairs) > 0) {
        stIntTuple *alignedPair = stList_pop(mA->alignedPairs);
        assert(stIntTuple_length(alignedPair) == 5);
        int64_t seqIndex1 = stIntTuple_get(alignedPair, 1);
        int64_t seqIndex2 = stIntTuple_get(alignedPair, 3);
        AdjacencySequence *adjSeq1 = stList_get(sequences, seqIndex1);
        AdjacencySequence *adjSeq2 = stList_get(sequences, seqIndex2);
        assert(adjSeq1 != adjSeq2);
        int64_t offset1 = stIntTuple_get(alignedPair, 2);
        int64_t offset2 = stIntTuple_get(alignedPair, 4);
        int64_t score = stIntTuple_get(alignedPair, 0);
        if (score <= 0) { //Happens when indel probs are included
            score = 1; //This is the minimum
        }
        assert(score > 0 && score <= PAIR_ALIGNMENT_PROB_1);

        SeqFrag *seqFrag1 = stList_get(seqFrags, seqIndex1);
        SeqFrag *seqFrag2 = stList_get(seqFrags, seqIndex2);
        assert(seqFrag1 != seqFrag2);
        double *scoreAdjustments;
        if (seqFrag1->rightEndId == seqFrag2->rightEndId) {
            scoreAdjustments = scoreAdjustmentsCommonEnds;
        } else {
            scoreAdjustments = scoreAdjustmentsNonCommonEnds;
        }
        assert(scoreAdjustments[seqIndex1] != INT64_MIN);
        assert(scoreAdjustments[seqIndex2] != INT64_MIN);

        //Do the reweighting here.
        AlignedPair *alignedPair2 = alignedPair_construct(
                adjSeq1->subsequenceIdentifier, adjSeq1->start + (adjSeq1->strand ? offset1 : -offset1),
                adjSeq1->strand,
                adjSeq2->subsequenceIdentifier, adjSeq2->start + (adjSeq2->strand ? offset2 : -offset2),
                adjSeq2->strand,
                score*scoreAdjustments[seqIndex1], score*scoreAdjustments[seqIndex2]);
        assert(stSortedSet_search(sortedAlignment, alignedPair2) == NULL);
        assert(stSortedSet_search(sortedAlignment, alignedPair2->reverse) == NULL);
        stSortedSet_insert(sortedAlignment, alignedPair2);
        stSortedSet_insert(sortedAlignment, alignedPair2->reverse);
        stIntTuple_destruct(alignedPair);
    }

    //Cleanup
    stList_destruct(seqFrags);
    stList_destruct(sequences);
    free(pairwiseAlignmentsPerSequenceNonCommonEnds);
    free(pairwiseAlignmentsPerSequenceCommonEnds);
    free(scoreAdjustmentsNonCommonEnds);
    free(scoreAdjustmentsCommonEnds);
    multipleAlignment_destruct(mA);
    stHash_destruct(endInstanceNumbers);

    return sortedAlignment;
}
int main(int argc, char *argv[]) { st_setLogLevelFromString(argv[1]); st_logDebug("Set up logging\n"); stKVDatabaseConf *kvDatabaseConf = stKVDatabaseConf_constructFromString(argv[2]); CactusDisk *cactusDisk = cactusDisk_construct(kvDatabaseConf, 0); stKVDatabaseConf_destruct(kvDatabaseConf); st_logDebug("Set up the flower disk\n"); Name flowerName = cactusMisc_stringToName(argv[3]); Flower *flower = cactusDisk_getFlower(cactusDisk, flowerName); int64_t totalBases = flower_getTotalBaseLength(flower); int64_t totalEnds = flower_getEndNumber(flower); int64_t totalFreeEnds = flower_getFreeStubEndNumber(flower); int64_t totalAttachedEnds = flower_getAttachedStubEndNumber(flower); int64_t totalCaps = flower_getCapNumber(flower); int64_t totalBlocks = flower_getBlockNumber(flower); int64_t totalGroups = flower_getGroupNumber(flower); int64_t totalChains = flower_getChainNumber(flower); int64_t totalLinkGroups = 0; int64_t maxEndDegree = 0; int64_t maxAdjacencyLength = 0; int64_t totalEdges = 0; Flower_EndIterator *endIt = flower_getEndIterator(flower); End *end; while((end = flower_getNextEnd(endIt)) != NULL) { assert(end_getOrientation(end)); if(end_getInstanceNumber(end) > maxEndDegree) { maxEndDegree = end_getInstanceNumber(end); } stSortedSet *ends = stSortedSet_construct(); End_InstanceIterator *capIt = end_getInstanceIterator(end); Cap *cap; while((cap = end_getNext(capIt)) != NULL) { if(cap_getSequence(cap) != NULL) { Cap *adjacentCap = cap_getAdjacency(cap); assert(adjacentCap != NULL); End *adjacentEnd = end_getPositiveOrientation(cap_getEnd(adjacentCap)); stSortedSet_insert(ends, adjacentEnd); int64_t adjacencyLength = cap_getCoordinate(cap) - cap_getCoordinate(adjacentCap); if(adjacencyLength < 0) { adjacencyLength *= -1; } assert(adjacencyLength >= 1); if(adjacencyLength >= maxAdjacencyLength) { maxAdjacencyLength = adjacencyLength; } } } end_destructInstanceIterator(capIt); totalEdges += stSortedSet_size(ends); if(stSortedSet_search(ends, end) != NULL) { 
//This ensures we count self edges twice, so that the division works. totalEdges += 1; } stSortedSet_destruct(ends); } assert(totalEdges % 2 == 0); flower_destructEndIterator(endIt); Flower_GroupIterator *groupIt = flower_getGroupIterator(flower); Group *group; while((group = flower_getNextGroup(groupIt)) != NULL) { if(group_getLink(group) != NULL) { totalLinkGroups++; } } flower_destructGroupIterator(groupIt); printf("flower name: %" PRIi64 " total bases: %" PRIi64 " total-ends: %" PRIi64 " total-caps: %" PRIi64 " max-end-degree: %" PRIi64 " max-adjacency-length: %" PRIi64 " total-blocks: %" PRIi64 " total-groups: %" PRIi64 " total-edges: %" PRIi64 " total-free-ends: %" PRIi64 " total-attached-ends: %" PRIi64 " total-chains: %" PRIi64 " total-link groups: %" PRIi64 "\n", flower_getName(flower), totalBases, totalEnds, totalCaps, maxEndDegree, maxAdjacencyLength, totalBlocks, totalGroups, totalEdges/2, totalFreeEnds, totalAttachedEnds, totalChains, totalLinkGroups); return 0; }