/*
 * Classifies the terminal adjacency of cap by comparing the event strings
 * (as returned by getEventStrings) found on the end of cap and on the end of
 * the cap adjacent to its terminal cap.
 *
 * Returns HAP_NOTHING when the intersection of the two sets is as large as
 * each of the sets, and HAP_SWITCH otherwise.
 */
static enum CapCode getHaplotypeSwitchCode(Cap *cap, stList *eventStrings) {
    Cap *partnerCap = cap_getAdjacency(getTerminalCap(cap));
    assert(partnerCap != NULL);

    stSortedSet *nearEvents = getEventStrings(cap_getEnd(cap), eventStrings);
    stSortedSet *farEvents = getEventStrings(cap_getEnd(partnerCap), eventStrings);
    assert(stSortedSet_size(nearEvents) > 0);
    assert(stSortedSet_size(farEvents) > 0);

    stSortedSet *sharedEvents = stSortedSet_getIntersection(nearEvents, farEvents);

    /* The two sets are equal exactly when the intersection matches both sizes. */
    enum CapCode result;
    if (stSortedSet_size(sharedEvents) == stSortedSet_size(nearEvents)
            && stSortedSet_size(sharedEvents) == stSortedSet_size(farEvents)) {
        result = HAP_NOTHING;
    } else {
        result = HAP_SWITCH;
    }

    stSortedSet_destruct(nearEvents);
    stSortedSet_destruct(farEvents);
    stSortedSet_destruct(sharedEvents);
    return result;
}
/*
 * Walks backwards from segment through 5' adjacencies accepted by
 * trueAdjacency until no further such segment exists, then hands the segment
 * reached to getMaximalHaplotypePathsP3, which continues the construction of
 * maximalHaplotypePath.
 */
static void getMaximalHaplotypePathsP2(Segment *segment, stList *maximalHaplotypePath, stSortedSet *segmentSet,
        stList *eventStrings) {
    Cap *fivePrimeCap = segment_get5Cap(segment);
    assert(hasCapInEvents(cap_getEnd(segment_get3Cap(segment)), eventStrings));

    /* No supported adjacency on the 5' side: this segment starts the path. */
    if (!trueAdjacency(fivePrimeCap, eventStrings)) {
        getMaximalHaplotypePathsP3(segment, maximalHaplotypePath, segmentSet, eventStrings);
        return;
    }

    Segment *upstreamSegment = getAdjacentCapsSegment(fivePrimeCap);
    assert(segment != upstreamSegment);
    assert(segment_getReverse(segment) != upstreamSegment);

    /* Supported adjacency, but nothing beyond it: start the path here too. */
    if (upstreamSegment == NULL) {
        getMaximalHaplotypePathsP3(segment, maximalHaplotypePath, segmentSet, eventStrings);
        return;
    }

    assert(stSortedSet_search(segmentSet, upstreamSegment) == NULL);
    assert(stSortedSet_search(segmentSet, segment_getReverse(upstreamSegment)) == NULL);
    assert(hasCapInEvents(cap_getEnd(segment_get3Cap(upstreamSegment)), eventStrings));
    getMaximalHaplotypePathsP2(upstreamSegment, maximalHaplotypePath, segmentSet, eventStrings);
}
/*
 * Decides whether the adjacency leaving cap is supported by a cap from one
 * of the given events.
 *
 * Returns 0 immediately if the terminal adjacency of cap has non-zero length.
 * Otherwise returns 1 if the end of the terminal cap holds an instance that
 * (a) is adjacent to the same end as the terminal cap, (b) satisfies
 * capHasGivenEvents for eventStrings and (c) has a zero-length terminal
 * adjacency; returns 0 if no such instance exists.
 */
bool trueAdjacency(Cap *cap, stList *eventStrings) {
    if (getTerminalAdjacencyLength(cap) > 0) {
        return 0;
    }

    cap = getTerminalCap(cap);
    assert(cap != NULL);
    Cap *partnerCap = cap_getAdjacency(cap);
    assert(partnerCap != NULL);
    assert(cap_getAdjacency(partnerCap) == cap);

    /* Look for an instance of this end joined to the same partner end. */
    End *partnerEnd = end_getPositiveOrientation(cap_getEnd(partnerCap));
    End_InstanceIterator *instanceIt = end_getInstanceIterator(cap_getEnd(cap));
    bool supported = 0;
    Cap *candidate;
    while ((candidate = end_getNext(instanceIt)) != NULL) {
        Cap *candidatePartner = cap_getAdjacency(candidate);
        assert(candidatePartner != NULL);
        if (partnerEnd != end_getPositiveOrientation(cap_getEnd(candidatePartner))) {
            continue;
        }
        assert(event_getHeader(cap_getEvent(candidate)) == event_getHeader(cap_getEvent(candidatePartner)));
        if (capHasGivenEvents(candidate, eventStrings) && getTerminalAdjacencyLength(candidate) == 0) {
            supported = 1;
            break;
        }
    }
    end_destructInstanceIterator(instanceIt);
    return supported;
}
static void debugScaffoldPathsP(Cap *cap, stList *haplotypePath, stHash *haplotypePathToScaffoldPathHash, stHash *haplotypeToMaximalHaplotypeLengthHash, stHash *segmentToMaximalHaplotypePathHash, stList *haplotypeEventStrings, stList *contaminationEventStrings, CapCodeParameters *capCodeParameters, bool capDir) { int64_t insertLength; int64_t deleteLength; Cap *otherCap; enum CapCode capCode = getCapCode(cap, &otherCap, haplotypeEventStrings, contaminationEventStrings, &insertLength, &deleteLength, capCodeParameters); if (capCode == SCAFFOLD_GAP || capCode == AMBIGUITY_GAP) { Segment *adjacentSegment = getAdjacentCapsSegment(cap); assert(adjacentSegment != NULL); while (!hasCapInEvents(cap_getEnd(capDir ? segment_get5Cap(adjacentSegment) : segment_get3Cap(adjacentSegment)), haplotypeEventStrings)) { adjacentSegment = getAdjacentCapsSegment(capDir ? segment_get5Cap(adjacentSegment) : segment_get3Cap(adjacentSegment)); assert(adjacentSegment != NULL); } assert(adjacentSegment != NULL); assert(hasCapInEvents(cap_getEnd(segment_get5Cap(adjacentSegment)), haplotypeEventStrings)); //isHaplotypeEnd(cap_getEnd(segment_get5Cap(adjacentSegment)))); stIntTuple *j = stHash_search(haplotypeToMaximalHaplotypeLengthHash, haplotypePath); (void)j; assert(j != NULL); stList *adjacentHaplotypePath = stHash_search(segmentToMaximalHaplotypePathHash, adjacentSegment); if (adjacentHaplotypePath == NULL) { adjacentHaplotypePath = stHash_search(segmentToMaximalHaplotypePathHash, segment_getReverse(adjacentSegment)); } assert(adjacentHaplotypePath != NULL); assert(adjacentHaplotypePath != haplotypePath); stIntTuple *k = stHash_search(haplotypeToMaximalHaplotypeLengthHash, adjacentHaplotypePath); (void)k; assert(k != NULL); assert(stIntTuple_get(j, 0) == stIntTuple_get(k, 0)); assert(stHash_search(haplotypePathToScaffoldPathHash, haplotypePath) == stHash_search(haplotypePathToScaffoldPathHash, adjacentHaplotypePath)); } }
/*
 * Measures a thread starting at a stub cap.
 *
 * Follows adjacencies and segments from cap, adding the coordinate stored on
 * each cap (used here as the adjacency length) and the length of each segment
 * crossed. Stops at the first adjacent cap that has no segment, stores that
 * cap in *terminatingCap and returns the accumulated length.
 */
static int64_t traceThreadLength(Cap *cap, Cap **terminatingCap) {
    assert(end_isStubEnd(cap_getEnd(cap)));
    int64_t total = 0;
    for (;;) {
        assert(cap_getCoordinate(cap) != INT64_MAX);
        int64_t gapLength = cap_getCoordinate(cap);
        total += gapLength;

        Cap *partner = cap_getAdjacency(cap);
        assert(partner != NULL);
        assert(gapLength == cap_getCoordinate(partner));

        if (cap_getSegment(partner) == NULL) {
            /* No block to cross: the thread ends on this stub. */
            assert(end_isStubEnd(cap_getEnd(partner)));
            *terminatingCap = partner;
            return total;
        }

        /* Cross the block and continue from its far side. */
        total += segment_getLength(cap_getSegment(partner));
        cap = cap_getOtherSegmentCap(partner);
        assert(cap != NULL);
    }
    return 1; /* Not reached; the loop only exits through the return above. */
}
/*
 * Checks that cap_getEnd returns the end matching each cap's orientation:
 * rootCap maps to the reverse of end and leaf2Cap maps to end itself, with
 * the reversed caps mapping to the opposite ends.
 */
void testCap_getEnd(CuTest* testCase) {
    cactusCapTestSetup();

    End *reversedEnd = end_getReverse(end);
    Cap *reversedRootCap = cap_getReverse(rootCap);
    Cap *reversedLeaf2Cap = cap_getReverse(leaf2Cap);

    CuAssertTrue(testCase, cap_getEnd(rootCap) == reversedEnd);
    CuAssertTrue(testCase, cap_getEnd(reversedRootCap) == end);
    CuAssertTrue(testCase, cap_getEnd(leaf2Cap) == end);
    CuAssertTrue(testCase, cap_getEnd(reversedLeaf2Cap) == end_getReverse(end));

    cactusCapTestTeardown();
}
/*
 * Checks that a cap's orientation equals the orientation of its end, both
 * for the fixture caps and for their reverses, and that leaf1Cap shares the
 * orientation of rootCap while leaf2Cap does not share that of leaf1Cap.
 */
void testCap_getOrientation(CuTest* testCase) {
    cactusCapTestSetup();

    /* Forward caps agree with their own ends. */
    End *rootEnd = cap_getEnd(rootCap);
    CuAssertTrue(testCase, cap_getOrientation(rootCap) == end_getOrientation(rootEnd));
    End *leaf1End = cap_getEnd(leaf1Cap);
    CuAssertTrue(testCase, cap_getOrientation(leaf1Cap) == end_getOrientation(leaf1End));
    End *leaf2End = cap_getEnd(leaf2Cap);
    CuAssertTrue(testCase, cap_getOrientation(leaf2Cap) == end_getOrientation(leaf2End));

    /* Reversed caps agree with the reversed ends. */
    CuAssertTrue(testCase,
            cap_getOrientation(cap_getReverse(rootCap)) == end_getOrientation(end_getReverse(cap_getEnd(rootCap))));
    CuAssertTrue(testCase,
            cap_getOrientation(cap_getReverse(leaf1Cap)) == end_getOrientation(end_getReverse(cap_getEnd(leaf1Cap))));
    CuAssertTrue(testCase,
            cap_getOrientation(cap_getReverse(leaf2Cap)) == end_getOrientation(end_getReverse(cap_getEnd(leaf2Cap))));

    /* Relative orientation of the fixture caps. */
    CuAssertTrue(testCase, cap_getOrientation(leaf1Cap) == cap_getOrientation(rootCap));
    CuAssertTrue(testCase, cap_getOrientation(leaf1Cap) != cap_getOrientation(leaf2Cap));

    cactusCapTestTeardown();
}
/*
 * Promotes reference threads that exist only in the children of flower.
 *
 * For every nested flower, looks for stub ends whose name has no end in
 * flower. For each such end, takes the reference-event cap in its positive
 * strand form; if that cap is not a "side" cap, the thread is traced with
 * traceThreadLength, both of its stub caps are copied into the parent with
 * copyCapToParent, and the copies are made adjacent with the traced length.
 * Caps appended by copyCapToParent end up in recoveredCaps.
 */
static void recoverBrokenAdjacencies(Flower *flower, stList *recoveredCaps, Name referenceEventName) {
    Flower_GroupIterator *groupIt = flower_getGroupIterator(flower);
    Group *group;
    while ((group = flower_getNextGroup(groupIt)) != NULL) {
        Flower *childFlower = group_getNestedFlower(group);
        if (childFlower == NULL) {
            continue;
        }
        Flower_EndIterator *endIt = flower_getEndIterator(childFlower);
        End *childEnd;
        while ((childEnd = flower_getNextEnd(endIt)) != NULL) {
            /* Only stub ends that the parent does not already hold are of interest. */
            if (!end_isStubEnd(childEnd) || flower_getEnd(flower, end_getName(childEnd)) != NULL) {
                continue;
            }
            Cap *threadStart = getCapForReferenceEvent(childEnd, referenceEventName);
            assert(threadStart != NULL);
            assert(!end_isAttached(childEnd));
            if (!cap_getStrand(threadStart)) {
                threadStart = cap_getReverse(threadStart);
            }
            /* Handle each thread once, from the cap for which cap_getSide is false. */
            if (cap_getSide(threadStart)) {
                continue;
            }
            Cap *threadEnd = NULL;
            int64_t threadLength = traceThreadLength(threadStart, &threadEnd);
            Cap *parentStart = copyCapToParent(threadStart, recoveredCaps);
            assert(threadEnd != NULL);
            assert(!end_isAttached(cap_getEnd(threadEnd)));
            assert(!cap_getSide(parentStart));
            Cap *parentEnd = copyCapToParent(threadEnd, recoveredCaps);
            cap_makeAdjacent(parentStart, parentEnd);
            setAdjacencyLength(parentStart, parentEnd, threadLength);
        }
        flower_destructEndIterator(endIt);
    }
    flower_destructGroupIterator(groupIt);
}
/*
 * Follows the path leaving cap until it runs out or reaches an end that has
 * a cap in the contamination or haplotype events.
 *
 * *pathLength and *nCount are incremented by the terminal adjacency length /
 * N count of every adjacency crossed and by the length / N count of every
 * segment crossed.
 *
 * Returns 1 with *pathEndCap set to the cap on the end found, or 0 with
 * *pathEndCap set to the cap adjacent to the terminal cap when no further
 * segment exists.
 */
bool getCapGetAtEndOfPath(Cap *cap, Cap **pathEndCap, int64_t *pathLength, int64_t *nCount,
        stList *haplotypeEventStrings, stList *contaminationEventStrings) {
    for (;;) {
        /* Account for the adjacency leaving the current cap. */
        *pathLength += getTerminalAdjacencyLength(cap);
        *nCount += getNumberOfNsInAdjacency(cap);

        Segment *nextSegment = getAdjacentCapsSegment(cap);
        if (nextSegment == NULL) {
            *pathEndCap = cap_getAdjacency(getTerminalCap(cap));
            assert(*pathEndCap != NULL);
            return 0;
        }

        Cap *entryCap = cap_getSide(cap) ? segment_get3Cap(nextSegment) : segment_get5Cap(nextSegment);
        assert(cap_getName(entryCap) == cap_getName(cap_getAdjacency(getTerminalCap(cap))));

        End *entryEnd = cap_getEnd(entryCap);
        if (hasCapInEvents(entryEnd, contaminationEventStrings) || hasCapInEvents(entryEnd, haplotypeEventStrings)) {
            *pathEndCap = entryCap;
            return 1;
        }

        /* Cross the segment and keep going from its other cap. */
        *pathLength += segment_getLength(nextSegment);
        *nCount += getNumberOfNsInSegment(nextSegment);
        cap = cap_getOtherSegmentCap(entryCap);
    }
}
/*
 * Debug check, applied to flower and recursively to its nested flowers:
 * every segment whose event header equals eventString and whose 5' end has a
 * cap in eventStrings must be present in segmentSet in one of its two
 * orientations.
 */
static void getMaximalHaplotypePathsCheck(Flower *flower, stSortedSet *segmentSet, const char *eventString,
        stList *eventStrings) {
    Flower_SegmentIterator *segmentIt = flower_getSegmentIterator(flower);
    Segment *candidate;
    while ((candidate = flower_getNextSegment(segmentIt)) != NULL) {
        if (strcmp(event_getHeader(segment_getEvent(candidate)), eventString) == 0
                && hasCapInEvents(cap_getEnd(segment_get5Cap(candidate)), eventStrings)) {
            assert(stSortedSet_search(segmentSet, candidate) != NULL
                    || stSortedSet_search(segmentSet, segment_getReverse(candidate)) != NULL);
        }
    }
    flower_destructSegmentIterator(segmentIt);

    /* Repeat the check in every nested flower. */
    Flower_GroupIterator *groupIt = flower_getGroupIterator(flower);
    Group *group;
    while ((group = flower_getNextGroup(groupIt)) != NULL) {
        if (group_getNestedFlower(group) == NULL) {
            continue;
        }
        getMaximalHaplotypePathsCheck(group_getNestedFlower(group), segmentSet, eventString, eventStrings);
    }
    flower_destructGroupIterator(groupIt);
}
/*
 * Descends through nested flowers to the lowest-level copy of cap.
 *
 * While the group of the cap's end has a nested flower, the cap with the same
 * name is looked up in that flower and given the orientation of the current
 * cap. Returns the first cap whose group has no nested flower (possibly cap
 * itself).
 */
Cap *getTerminalCap(Cap *cap) {
    Flower *childFlower;
    while ((childFlower = group_getNestedFlower(end_getGroup(cap_getEnd(cap)))) != NULL) {
        Cap *childCap = flower_getCap(childFlower, cap_getName(cap));
        assert(childCap != NULL);
        /* Keep the orientation of the cap we came from. */
        if (cap_getOrientation(cap)) {
            cap = childCap;
        } else {
            cap = cap_getReverse(childCap);
        }
    }
    return cap;
}
/*
 * Builds the list of maximal haplotype paths for the event named eventString.
 *
 * The paths are collected by getMaximalHaplotypePathsP; the remainder of the
 * function logs the number and length of the paths and runs debug
 * assertions on them. The returned list is constructed with stList_destruct
 * as its element destructor, so each element is itself a list of segments.
 */
stList *getContigPaths(Flower *flower, const char *eventString, stList *eventStrings) {
    stList *paths = stList_construct3(0, (void(*)(void *)) stList_destruct);
    stSortedSet *visitedSegments = stSortedSet_construct();
    getMaximalHaplotypePathsP(flower, paths, visitedSegments, eventString, eventStrings);

    /* Everything below is logging and debug checking. */
    st_logDebug("We have %" PRIi64 " maximal haplotype paths\n", stList_length(paths));
    getMaximalHaplotypePathsCheck(flower, visitedSegments, eventString, eventStrings);

    for (int64_t pathIndex = 0; pathIndex < stList_length(paths); pathIndex++) {
        stList *path = stList_get(paths, pathIndex);
        st_logDebug("We have a maximal haplotype path with length %" PRIi64 "\n", stList_length(path));
        assert(stList_length(path) > 0);

        /* The outer ends of a path must not continue through a true adjacency. */
        Segment *upstream = stList_get(path, 0);
        Segment *downstream = stList_get(path, stList_length(path) - 1);
        if (getAdjacentCapsSegment(segment_get5Cap(upstream)) != NULL) {
            assert(!trueAdjacency(segment_get5Cap(upstream), eventStrings));
        }
        if (getAdjacentCapsSegment(segment_get3Cap(downstream)) != NULL) {
            assert(!trueAdjacency(segment_get3Cap(downstream), eventStrings));
        }

        /* Consecutive segments must be joined by a true adjacency. */
        for (int64_t k = 1; k < stList_length(path); k++) {
            upstream = stList_get(path, k - 1);
            downstream = stList_get(path, k);
            assert(trueAdjacency(segment_get3Cap(upstream), eventStrings));
            assert(trueAdjacency(segment_get5Cap(downstream), eventStrings));
            assert(cap_getAdjacency(getTerminalCap(segment_get3Cap(upstream)))
                    == getTerminalCap(segment_get5Cap(downstream)));
            assert(strcmp(event_getHeader(segment_getEvent(upstream)), eventString) == 0);
            assert(strcmp(event_getHeader(segment_getEvent(downstream)), eventString) == 0);
            assert(hasCapInEvents(cap_getEnd(segment_get5Cap(upstream)), eventStrings));
            assert(hasCapInEvents(cap_getEnd(segment_get5Cap(downstream)), eventStrings));
        }
    }

    stSortedSet_destruct(visitedSegments);
    return paths;
}
/*
 * Starts a new maximal haplotype path from every eligible segment of flower
 * and then recurses into the nested flowers.
 *
 * A segment is eligible when neither it nor its reverse is in segmentSet,
 * its event header equals eventString and its 5' end has a cap in
 * eventStrings. For each such segment an empty list is appended to
 * maximalHaplotypePaths and filled by getMaximalHaplotypePathsP2.
 */
static void getMaximalHaplotypePathsP(Flower *flower, stList *maximalHaplotypePaths, stSortedSet *segmentSet,
        const char *eventString, stList *eventStrings) {
    Flower_SegmentIterator *segmentIt = flower_getSegmentIterator(flower);
    Segment *candidate;
    while ((candidate = flower_getNextSegment(segmentIt)) != NULL) {
        /* Skip segments that have already been seen in either orientation. */
        if (stSortedSet_search(segmentSet, candidate) != NULL
                || stSortedSet_search(segmentSet, segment_getReverse(candidate)) != NULL) {
            continue;
        }
        /* Skip segments that belong to another event. */
        if (strcmp(event_getHeader(segment_getEvent(candidate)), eventString) != 0) {
            continue;
        }
        if (!hasCapInEvents(cap_getEnd(segment_get5Cap(candidate)), eventStrings)) {
            /* Both ends of a segment are expected to agree. */
            assert(!hasCapInEvents(cap_getEnd(segment_get3Cap(candidate)), eventStrings));
            continue;
        }
        assert(hasCapInEvents(cap_getEnd(segment_get3Cap(candidate)), eventStrings));
        stList *newPath = stList_construct();
        stList_append(maximalHaplotypePaths, newPath);
        getMaximalHaplotypePathsP2(candidate, newPath, segmentSet, eventStrings);
    }
    flower_destructSegmentIterator(segmentIt);

    /* Now handle the flowers contained in this one. */
    Flower_GroupIterator *groupIt = flower_getGroupIterator(flower);
    Group *group;
    while ((group = flower_getNextGroup(groupIt)) != NULL) {
        if (group_getNestedFlower(group) == NULL) {
            continue;
        }
        getMaximalHaplotypePathsP(group_getNestedFlower(group), maximalHaplotypePaths, segmentSet, eventString,
                eventStrings);
    }
    flower_destructGroupIterator(groupIt);
}
/*
 * Bottom-up pass that builds the reference threads for a list of flowers.
 *
 * Steps, in order:
 *  1. Collect the reference caps (getCaps) and, for each, set adjacency
 *     lengths and recover caps/adjacencies broken in the children. The list
 *     is walked from its last element because the callee appends to it.
 *  2. Call recoverBrokenAdjacencies on every flower.
 *  3. Fill the file-level hash segmentWriteFn_flowerToPhylogeneticTreeHash
 *     with one tree per flower, rooted at the reference event.
 *  4. If isTop, build the thread strings in memory, register a meta sequence
 *     for each and set coordinates; otherwise call buildRecursiveThreads
 *     with sequenceDatabase.
 *  5. Destroy the hash and the cap list.
 */
void bottomUp(stList *flowers, stKVDatabase *sequenceDatabase, Name referenceEventName, bool isTop,
        stMatrix *(*generateSubstitutionMatrix)(double)) {
    stList *caps = getCaps(flowers, referenceEventName);
    /* Walk backwards: the callee appends recovered caps to this same list. */
    for (int64_t remaining = stList_length(caps); remaining > 0; remaining--) {
        setAdjacencyLengthsAndRecoverNewCapsAndBrokenAdjacencies(stList_get(caps, remaining - 1), caps);
    }
    for (int64_t f = 0; f < stList_length(flowers); f++) {
        recoverBrokenAdjacencies(stList_get(flowers, f), caps, referenceEventName);
    }

    /* Build the phylogenetic event trees used for base calling. */
    segmentWriteFn_flowerToPhylogeneticTreeHash = stHash_construct2(NULL,
            (void (*)(void *)) cleanupPhylogeneticTree);
    for (int64_t f = 0; f < stList_length(flowers); f++) {
        Flower *currentFlower = stList_get(flowers, f);
        Event *referenceEvent = eventTree_getEvent(flower_getEventTree(currentFlower), referenceEventName);
        assert(referenceEvent != NULL);
        stHash_insert(segmentWriteFn_flowerToPhylogeneticTreeHash, currentFlower,
                getPhylogeneticTreeRootedAtGivenEvent(referenceEvent, generateSubstitutionMatrix));
    }

    if (!isTop) {
        buildRecursiveThreads(sequenceDatabase, caps, segmentWriteFn, terminalAdjacencyWriteFn);
    } else {
        stList *threadStrings = buildRecursiveThreadsInList(sequenceDatabase, caps, segmentWriteFn,
                terminalAdjacencyWriteFn);
        assert(stList_length(threadStrings) == stList_length(caps));

        /* Counters used to name sequences: non-trivial ones count up from 0,
         * trivial ones count up from the number of threads. */
        int64_t nonTrivialSeqIndex = 0;
        int64_t trivialSeqIndex = stList_length(threadStrings);

        for (int64_t t = 0; t < stList_length(threadStrings); t++) {
            Cap *threadCap = stList_get(caps, t);
            assert(cap_getStrand(threadCap));
            assert(!cap_getSide(threadCap));
            Flower *threadFlower = end_getFlower(cap_getEnd(threadCap));
            char *threadString = stList_get(threadStrings, t);
            /* isTrivialString receives the address of the pointer and may alter the string. */
            bool trivialString = isTrivialString(&threadString);
            int64_t sequenceIndex;
            if (trivialString) {
                sequenceIndex = trivialSeqIndex++;
            } else {
                sequenceIndex = nonTrivialSeqIndex++;
            }
            MetaSequence *metaSequence = addMetaSequence(threadFlower, threadCap, sequenceIndex, threadString,
                    trivialString);
            free(threadString);
            int64_t endCoordinate = setCoordinates(threadFlower, metaSequence, threadCap,
                    metaSequence_getStart(metaSequence) - 1);
            (void) endCoordinate;
            assert(endCoordinate == metaSequence_getLength(metaSequence) + metaSequence_getStart(metaSequence));
        }
        /* The strings were freed one by one above, so drop the list destructor. */
        stList_setDestructor(threadStrings, NULL);
        stList_destruct(threadStrings);
    }

    stHash_destruct(segmentWriteFn_flowerToPhylogeneticTreeHash);
    stList_destruct(caps);
}
/*
 * Walks the thread starting at cap and records, on every pair of adjacent
 * caps, the length of the adjacency between them (via setAdjacencyLength).
 *
 * For an adjacency in a leaf group the length is 0. Otherwise the length is
 * obtained by tracing the corresponding thread in the nested flower. If that
 * trace does not end on the nested copy of the adjacent cap, the thread is
 * broken in the child: the two caps where the child threads stop are copied
 * into the parent (copyCapToParent, which may append to recoveredCaps) and
 * the single adjacency is replaced by two, one on each side of the break.
 */
static void setAdjacencyLengthsAndRecoverNewCapsAndBrokenAdjacencies(Cap *cap, stList *recoveredCaps) {
    Cap *adjacentCap;
    do {
        adjacentCap = cap_getAdjacency(cap);
        assert(adjacentCap != NULL);
        assert(cap_getCoordinate(cap) == INT64_MAX);
        assert(cap_getCoordinate(adjacentCap) == INT64_MAX);
        assert(cap_getStrand(cap) == cap_getStrand(adjacentCap));
        assert(cap_getSide(cap) != cap_getSide(adjacentCap));
        Group *group = end_getGroup(cap_getEnd(cap));
        assert(group != NULL);

        if (group_isLeaf(group)) {
            /* Terminal adjacency: the caps get an adjacency size of zero. */
            setAdjacencyLength(cap, adjacentCap, 0);
        } else {
            /* Non-terminal adjacency: measure it in the nested flower. */
            Flower *childFlower = group_getNestedFlower(group);
            Cap *childCap = flower_getCap(childFlower, cap_getName(cap));
            assert(childCap != NULL);
            Cap *childAdjacentCap = flower_getCap(childFlower, cap_getName(adjacentCap));
            assert(childAdjacentCap != NULL);

            Cap *breakerCap;
            int64_t tracedLength = traceThreadLength(childCap, &breakerCap);
            assert(cap_getOrientation(childAdjacentCap));

            if (cap_getPositiveOrientation(breakerCap) == childAdjacentCap) {
                /* The child thread runs straight through. */
                setAdjacencyLength(cap, adjacentCap, tracedLength);
            } else {
                /* The child thread is broken: lift the break into this level. */
                breakerCap = copyCapToParent(breakerCap, recoveredCaps);
                assert(cap_getSide(breakerCap));
                cap_makeAdjacent(cap, breakerCap);
                setAdjacencyLength(cap, breakerCap, tracedLength);

                /* Same again, tracing from the other side of the break. */
                tracedLength = traceThreadLength(childAdjacentCap, &breakerCap);
                assert(cap_getPositiveOrientation(breakerCap) != cap);
                breakerCap = copyCapToParent(breakerCap, recoveredCaps);
                assert(!cap_getSide(breakerCap));
                cap_makeAdjacent(breakerCap, adjacentCap);
                setAdjacencyLength(adjacentCap, breakerCap, tracedLength);
            }
        }
        /* Continue across the segment, if there is one. */
    } while ((cap = cap_getOtherSegmentCap(adjacentCap)) != NULL);
}
/*
 * Finds the segment that cap belongs to, looking in parent flowers if needed.
 *
 * Returns the cap's own segment when it has one. Otherwise the cap must be on
 * a stub end; the cap of the same name is looked up in the parent flower
 * (given the orientation of cap) and the search repeats from there. Returns
 * NULL when there is no parent group. A missing parent cap trips assert(0)
 * in debug builds and yields NULL otherwise.
 */
Segment *getCapsSegment(Cap *cap) {
    for (;;) {
        Segment *ownSegment = cap_getSegment(cap);
        if (ownSegment != NULL) {
            return ownSegment;
        }
        assert(!end_isBlockEnd(cap_getEnd(cap)));
        assert(end_isStubEnd(cap_getEnd(cap)));

        /* Walk up one level to find the next adjacency. */
        Group *parentGroup = flower_getParentGroup(end_getFlower(cap_getEnd(cap)));
        if (parentGroup == NULL) {
            return NULL;
        }
        Cap *parentCap = flower_getCap(group_getFlower(parentGroup), cap_getName(cap));
        if (parentCap == NULL) {
            /* Would have to be a free stub end; not expected in the current alignments. */
            assert(0);
            assert(end_isFree(cap_getEnd(cap)));
            return NULL;
        }
        assert(cap_getOrientation(parentCap));
        cap = cap_getOrientation(cap) ? parentCap : cap_getReverse(parentCap);
    }
}
/*
 * Writes one <block> XML element describing the block that cap lies on.
 *
 * cap:        a cap on a block end; its coordinate is written as <start>.
 * level:      value written verbatim as <level>.
 * fileHandle: open stream that receives the XML.
 *
 * <end> is the coordinate of the other cap of the same segment plus one.
 * <chainName> is "NA" when the block is not in a chain. The start and end
 * values are also reported through st_logInfo.
 *
 * Coordinates are kept as int64_t and printed with PRIi64; narrowing them to
 * int would truncate positions that do not fit in an int.
 */
void mapBlockToExon(Cap *cap, int level, FILE *fileHandle) {
    fprintf(fileHandle, "\t\t\t<block>\n");
    Block *block = end_getBlock(cap_getEnd(cap));
    Chain *chain = block_getChain(block);

    Cap *otherCap = cap_getOtherSegmentCap(cap);
    assert(otherCap != NULL); /* A cap on a block end is expected to have a segment partner. */
    int64_t start = cap_getCoordinate(cap);
    int64_t end = cap_getCoordinate(otherCap) + 1;

    /* NOTE(review): the strings returned by cactusMisc_nameToString are not
     * released here; confirm who owns them and free them if the caller does. */
    fprintf(fileHandle, "\t\t\t\t<blockName>%s</blockName>\n", cactusMisc_nameToString(block_getName(block)));
    if (chain != NULL) {
        fprintf(fileHandle, "\t\t\t\t<chainName>%s</chainName>\n", cactusMisc_nameToString(chain_getName(chain)));
    } else {
        fprintf(fileHandle, "\t\t\t\t<chainName>NA</chainName>\n");
    }
    fprintf(fileHandle, "\t\t\t\t<level>%d</level>\n", level);
    fprintf(fileHandle, "\t\t\t\t<start>%" PRIi64 "</start>\n", start);
    fprintf(fileHandle, "\t\t\t\t<end>%" PRIi64 "</end>\n", end);
    fprintf(fileHandle, "\t\t\t</block>\n");
    st_logInfo("mapBlockToExon: start: %" PRIi64 ", end: %" PRIi64 "\n", start, end);
}
/*
 * Copies the end of cap into the parent flower and returns the parent-level
 * cap with the same name, in its positive strand form.
 *
 * The copied end is assigned to the parent group of the cap's flower. When
 * cap_getSide is false for the returned cap, it is also appended to
 * recoveredCaps.
 */
static Cap *copyCapToParent(Cap *cap, stList *recoveredCaps) {
    End *childEnd = cap_getEnd(cap);
    assert(childEnd != NULL);
    Group *parentGroup = flower_getParentGroup(end_getFlower(childEnd));
    assert(parentGroup != NULL);

    End *parentEnd = end_copyConstruct(childEnd, group_getFlower(parentGroup));
    end_setGroup(parentEnd, parentGroup);

    Cap *parentCap = end_getInstance(parentEnd, cap_getName(cap));
    assert(parentCap != NULL);
    if (!cap_getStrand(parentCap)) {
        parentCap = cap_getReverse(parentCap);
    }
    if (!cap_getSide(parentCap)) {
        stList_append(recoveredCaps, parentCap);
    }
    return parentCap;
}
/*
 * Sums the base lengths of the threads in the flower.
 *
 * Each thread is found from a cap on a non-block end that, in positive
 * strand form, has cap_getSide false and has a sequence. The thread is
 * followed through its segments to the first adjacent cap that is not on a
 * block end, and contributes the difference of the two cap coordinates
 * minus one.
 *
 * The implementation is very like that of group_getTotalBaseLength, with a
 * few differences; consider merging them.
 */
int64_t flower_getTotalBaseLength(Flower *flower) {
    int64_t totalLength = 0;
    Flower_EndIterator *endIterator = flower_getEndIterator(flower);
    End *end;
    while ((end = flower_getNextEnd(endIterator)) != NULL) {
        if (end_isBlockEnd(end)) {
            continue;
        }
        End_InstanceIterator *instanceIterator = end_getInstanceIterator(end);
        Cap *threadStart;
        while ((threadStart = end_getNext(instanceIterator)) != NULL) {
            if (!cap_getStrand(threadStart)) {
                threadStart = cap_getReverse(threadStart);
            }
            /* Only start from the cap with cap_getSide false, and only if it carries sequence. */
            if (cap_getSide(threadStart) || cap_getSequence(threadStart) == NULL) {
                continue;
            }
            Cap *threadEnd = cap_getAdjacency(threadStart);
            assert(threadEnd != NULL);
            /* Skip over every segment on the thread. */
            while (end_isBlockEnd(cap_getEnd(threadEnd))) {
                Segment *segment = cap_getSegment(threadEnd);
                assert(segment != NULL);
                assert(segment_get5Cap(segment) == threadEnd);
                threadEnd = cap_getAdjacency(segment_get3Cap(segment));
                assert(threadEnd != NULL);
                assert(cap_getStrand(threadEnd));
                assert(cap_getSide(threadEnd));
            }
            assert(cap_getStrand(threadEnd));
            assert(cap_getSide(threadEnd));
            int64_t threadLength = cap_getCoordinate(threadEnd) - cap_getCoordinate(threadStart) - 1;
            assert(threadLength >= 0);
            totalLength += threadLength;
        }
        end_destructInstanceIterator(instanceIterator);
    }
    flower_destructEndIterator(endIterator);
    return totalLength;
}
/*
 * Aligns the adjacency sequences leaving the instances of end and returns the
 * result as a sorted set of AlignedPair objects (each pair is inserted
 * together with its reverse).
 *
 * Steps:
 *  1. For each cap of the end (reversed first if cap_getSide is true) build
 *     an AdjacencySequence and a SeqFrag tagged with the name of the end on
 *     the far side of the adjacency, counting the instances per far end.
 *  2. Run makeAlignment on the sequence fragments.
 *  3. Count, per sequence, the chosen pairwise alignments to sequences that
 *     share its far end and to sequences that do not.
 *  4. Derive per-sequence score adjustments from those counts; INT64_MIN
 *     marks "no alignments of this kind".
 *  5. Convert every aligned pair to cap coordinates, scaling its score by the
 *     adjustments of the two sequences involved.
 *
 * The set is constructed with alignedPair_destruct as its element destructor.
 */
stSortedSet *makeEndAlignment(StateMachine *sM, End *end, int64_t spanningTrees, int64_t maxSequenceLength,
        bool useProgressiveMerging, float gapGamma,
        PairwiseAlignmentParameters *pairwiseAlignmentBandingParameters) {
    /* 1. Gather the adjacency sequences to be aligned. */
    stList *sequences = stList_construct3(0, (void (*)(void *)) adjacencySequence_destruct);
    stList *seqFrags = stList_construct3(0, (void (*)(void *)) seqFrag_destruct);
    stHash *endInstanceNumbers = stHash_construct2(NULL, free);
    End_InstanceIterator *capIt = end_getInstanceIterator(end);
    Cap *cap;
    while ((cap = end_getNext(capIt)) != NULL) {
        if (cap_getSide(cap)) {
            cap = cap_getReverse(cap);
        }
        AdjacencySequence *adjacencySequence = adjacencySequence_construct(cap, maxSequenceLength);
        stList_append(sequences, adjacencySequence);

        assert(cap_getAdjacency(cap) != NULL);
        End *farEnd = end_getPositiveOrientation(cap_getEnd(cap_getAdjacency(cap)));
        stList_append(seqFrags, seqFrag_construct(adjacencySequence->string, 0, end_getName(farEnd)));

        /* Count the fragments that lead to this far end. */
        int64_t *instanceCount = stHash_search(endInstanceNumbers, farEnd);
        if (instanceCount == NULL) {
            instanceCount = st_calloc(1, sizeof(int64_t));
            assert(*instanceCount == 0);
            stHash_insert(endInstanceNumbers, farEnd, instanceCount);
        }
        (*instanceCount)++;
    }
    end_destructInstanceIterator(capIt);

    /* 2. Compute the multiple alignment. */
    MultipleAlignment *mA = makeAlignment(sM, seqFrags, spanningTrees, 100000000, useProgressiveMerging, gapGamma,
            pairwiseAlignmentBandingParameters);

    /* 3. Count pairwise alignments per sequence, split by whether the two
     * sequences share a far end. */
    int64_t *alignmentsNonCommon = st_calloc(stList_length(seqFrags), sizeof(int64_t));
    int64_t *alignmentsCommon = st_calloc(stList_length(seqFrags), sizeof(int64_t));
    for (int64_t p = 0; p < stList_length(mA->chosenPairwiseAlignments); p++) {
        stIntTuple *pairwiseAlignment = stList_get(mA->chosenPairwiseAlignments, p);
        int64_t seq1 = stIntTuple_get(pairwiseAlignment, 1);
        int64_t seq2 = stIntTuple_get(pairwiseAlignment, 2);
        assert(seq1 != seq2);
        SeqFrag *frag1 = stList_get(seqFrags, seq1);
        SeqFrag *frag2 = stList_get(seqFrags, seq2);
        int64_t *counts;
        if (frag1->rightEndId == frag2->rightEndId) {
            counts = alignmentsCommon;
        } else {
            counts = alignmentsNonCommon;
        }
        counts[seq1]++;
        counts[seq2]++;
    }

    /* 4. Turn the counts into score adjustments. */
    double *scoreAdjustmentsNonCommonEnds = st_malloc(stList_length(seqFrags) * sizeof(double));
    double *scoreAdjustmentsCommonEnds = st_malloc(stList_length(seqFrags) * sizeof(double));
    for (int64_t s = 0; s < stList_length(seqFrags); s++) {
        SeqFrag *seqFrag = stList_get(seqFrags, s);
        End *farEnd = flower_getEnd(end_getFlower(end), seqFrag->rightEndId);
        assert(farEnd != NULL);
        assert(stHash_search(endInstanceNumbers, farEnd) != NULL);
        int64_t commonInstanceNumber = *(int64_t *) stHash_search(endInstanceNumbers, farEnd);
        int64_t nonCommonInstanceNumber = stList_length(seqFrags) - commonInstanceNumber;
        assert(commonInstanceNumber > 0 && nonCommonInstanceNumber >= 0);
        assert(alignmentsNonCommon[s] <= nonCommonInstanceNumber);
        assert(alignmentsNonCommon[s] >= 0);
        assert(alignmentsCommon[s] < commonInstanceNumber);
        assert(alignmentsCommon[s] >= 0);

        if (alignmentsNonCommon[s] > 0) {
            scoreAdjustmentsNonCommonEnds[s] = ((double) nonCommonInstanceNumber) / alignmentsNonCommon[s];
            assert(scoreAdjustmentsNonCommonEnds[s] >= 1.0);
            assert(scoreAdjustmentsNonCommonEnds[s] <= nonCommonInstanceNumber);
        } else {
            scoreAdjustmentsNonCommonEnds[s] = INT64_MIN;
        }

        if (alignmentsCommon[s] > 0) {
            scoreAdjustmentsCommonEnds[s] = ((double) commonInstanceNumber - 1) / alignmentsCommon[s];
            assert(scoreAdjustmentsCommonEnds[s] >= 1.0);
            assert(scoreAdjustmentsCommonEnds[s] <= commonInstanceNumber - 1);
        } else {
            scoreAdjustmentsCommonEnds[s] = INT64_MIN;
        }
    }

    /* 5. Convert the alignment pairs into an alignment of the caps. */
    stSortedSet *sortedAlignment = stSortedSet_construct3((int (*)(const void *, const void *)) alignedPair_cmpFn,
            (void (*)(void *)) alignedPair_destruct);
    while (stList_length(mA->alignedPairs) > 0) {
        stIntTuple *rawPair = stList_pop(mA->alignedPairs);
        assert(stIntTuple_length(rawPair) == 5);
        int64_t seqIndex1 = stIntTuple_get(rawPair, 1);
        int64_t seqIndex2 = stIntTuple_get(rawPair, 3);
        AdjacencySequence *leftSeq = stList_get(sequences, seqIndex1);
        AdjacencySequence *rightSeq = stList_get(sequences, seqIndex2);
        assert(leftSeq != rightSeq);
        int64_t offset1 = stIntTuple_get(rawPair, 2);
        int64_t offset2 = stIntTuple_get(rawPair, 4);
        int64_t score = stIntTuple_get(rawPair, 0);
        if (score <= 0) {
            /* Happens when indel probabilities are included; 1 is the minimum. */
            score = 1;
        }
        assert(score > 0 && score <= PAIR_ALIGNMENT_PROB_1);
        SeqFrag *frag1 = stList_get(seqFrags, seqIndex1);
        SeqFrag *frag2 = stList_get(seqFrags, seqIndex2);
        assert(frag1 != frag2);
        double *scoreAdjustments;
        if (frag1->rightEndId == frag2->rightEndId) {
            scoreAdjustments = scoreAdjustmentsCommonEnds;
        } else {
            scoreAdjustments = scoreAdjustmentsNonCommonEnds;
        }
        assert(scoreAdjustments[seqIndex1] != INT64_MIN);
        assert(scoreAdjustments[seqIndex2] != INT64_MIN);

        /* The reweighting happens in the last two arguments. */
        AlignedPair *capPair = alignedPair_construct(
                leftSeq->subsequenceIdentifier, leftSeq->start + (leftSeq->strand ? offset1 : -offset1),
                leftSeq->strand,
                rightSeq->subsequenceIdentifier, rightSeq->start + (rightSeq->strand ? offset2 : -offset2),
                rightSeq->strand,
                score * scoreAdjustments[seqIndex1], score * scoreAdjustments[seqIndex2]);
        assert(stSortedSet_search(sortedAlignment, capPair) == NULL);
        assert(stSortedSet_search(sortedAlignment, capPair->reverse) == NULL);
        stSortedSet_insert(sortedAlignment, capPair);
        stSortedSet_insert(sortedAlignment, capPair->reverse);
        stIntTuple_destruct(rawPair);
    }

    /* Cleanup. */
    stList_destruct(seqFrags);
    stList_destruct(sequences);
    free(alignmentsNonCommon);
    free(alignmentsCommon);
    free(scoreAdjustmentsNonCommonEnds);
    free(scoreAdjustmentsCommonEnds);
    multipleAlignment_destruct(mA);
    stHash_destruct(endInstanceNumbers);

    return sortedAlignment;
}
static stHash *getScaffoldPathsP(stList *haplotypePaths, stHash *haplotypePathToScaffoldPathHash, stList *haplotypeEventStrings, stList *contaminationEventStrings, CapCodeParameters *capCodeParameters) { stHash *haplotypeToMaximalHaplotypeLengthHash = buildContigPathToContigPathLengthHash(haplotypePaths); stHash *segmentToMaximalHaplotypePathHash = buildSegmentToContigPathHash(haplotypePaths); for (int64_t i = 0; i < stList_length(haplotypePaths); i++) { stSortedSet *bucket = stSortedSet_construct(); stHash_insert(haplotypePathToScaffoldPathHash, stList_get(haplotypePaths, i), bucket); stSortedSet_insert(bucket, stList_get(haplotypePaths, i)); } for (int64_t i = 0; i < stList_length(haplotypePaths); i++) { stList *haplotypePath = stList_get(haplotypePaths, i); assert(stList_length(haplotypePath) > 0); Segment *_5Segment = stList_get(haplotypePath, 0); if (!segment_getStrand(_5Segment)) { _5Segment = segment_getReverse(stList_get(haplotypePath, stList_length(haplotypePath) - 1)); } assert(segment_getStrand(_5Segment)); if (getAdjacentCapsSegment(segment_get5Cap(_5Segment)) != NULL) { assert(!trueAdjacency(segment_get5Cap(_5Segment), haplotypeEventStrings)); } int64_t insertLength; int64_t deleteLength; Cap *otherCap; enum CapCode _5CapCode = getCapCode(segment_get5Cap(_5Segment), &otherCap, haplotypeEventStrings, contaminationEventStrings, &insertLength, &deleteLength, capCodeParameters); if (_5CapCode == SCAFFOLD_GAP || _5CapCode == AMBIGUITY_GAP) { assert(stHash_search(haplotypeToMaximalHaplotypeLengthHash, haplotypePath) != NULL); int64_t j = stIntTuple_get(stHash_search(haplotypeToMaximalHaplotypeLengthHash, haplotypePath), 0); Segment *adjacentSegment = getAdjacentCapsSegment(segment_get5Cap(_5Segment)); assert(adjacentSegment != NULL); while (!hasCapInEvents(cap_getEnd(segment_get5Cap(adjacentSegment)), haplotypeEventStrings)) { //is not a haplotype end adjacentSegment = getAdjacentCapsSegment(segment_get5Cap(adjacentSegment)); assert(adjacentSegment != NULL); 
} assert(adjacentSegment != NULL); assert(hasCapInEvents(cap_getEnd(segment_get5Cap(adjacentSegment)), haplotypeEventStrings)); //is a haplotype end stList *adjacentHaplotypePath = stHash_search(segmentToMaximalHaplotypePathHash, adjacentSegment); if (adjacentHaplotypePath == NULL) { adjacentHaplotypePath = stHash_search(segmentToMaximalHaplotypePathHash, segment_getReverse( adjacentSegment)); } assert(adjacentHaplotypePath != NULL); assert(adjacentHaplotypePath != haplotypePath); assert(stHash_search(haplotypeToMaximalHaplotypeLengthHash, adjacentHaplotypePath) != NULL); int64_t k = stIntTuple_get(stHash_search(haplotypeToMaximalHaplotypeLengthHash, adjacentHaplotypePath), 0); //Now merge the buckets and make new int tuples.. stSortedSet *bucket1 = stHash_search(haplotypePathToScaffoldPathHash, haplotypePath); stSortedSet *bucket2 = stHash_search(haplotypePathToScaffoldPathHash, adjacentHaplotypePath); assert(bucket1 != NULL); assert(bucket2 != NULL); assert(bucket1 != bucket2); stSortedSet *bucket3 = stSortedSet_getUnion(bucket1, bucket2); stSortedSetIterator *bucketIt = stSortedSet_getIterator(bucket3); stList *l; while ((l = stSortedSet_getNext(bucketIt)) != NULL) { //Do the bucket first assert(stHash_search(haplotypePathToScaffoldPathHash, l) == bucket1 || stHash_search(haplotypePathToScaffoldPathHash, l) == bucket2); stHash_remove(haplotypePathToScaffoldPathHash, l); stHash_insert(haplotypePathToScaffoldPathHash, l, bucket3); //Now the length stIntTuple *m = stHash_remove(haplotypeToMaximalHaplotypeLengthHash, l); assert(m != NULL); assert(stIntTuple_get(m, 0) == j || stIntTuple_get(m, 0) == k); stHash_insert(haplotypeToMaximalHaplotypeLengthHash, l, stIntTuple_construct1( j + k)); stIntTuple_destruct(m); } assert(stHash_search(haplotypePathToScaffoldPathHash, haplotypePath) == bucket3); assert(stHash_search(haplotypePathToScaffoldPathHash, adjacentHaplotypePath) == bucket3); stSortedSet_destructIterator(bucketIt); } } 
stHash_destruct(segmentToMaximalHaplotypePathHash); return haplotypeToMaximalHaplotypeLengthHash; }
enum CapCode getCapCode(Cap *cap, Cap **otherCap, stList *haplotypeEventStrings,
        stList *contaminationEventStrings, int64_t *insertLength, int64_t *deleteLength,
        CapCodeParameters *capCodeParameters) {
    /*
     * Classifies the adjacency leaving 'cap', whose end must have a cap in
     * haplotypeEventStrings.
     *
     * If the adjacency is a true adjacency the result of getHaplotypeSwitchCode() is
     * returned straight away and none of the out-parameters are written.
     * Otherwise *insertLength and *deleteLength are zeroed, the path from 'cap' is followed
     * with getCapGetAtEndOfPath(), *otherCap is set to the cap at the far end of that path,
     * and the code is chosen from the path length, the number of Ns (including the Ns
     * bounding both caps) and the relationship between the two ends. *insertLength and
     * *deleteLength receive non-zero values only when the two ends are adjacent.
     */
    assert(hasCapInEvents(cap_getEnd(cap), haplotypeEventStrings));
    if (trueAdjacency(cap, haplotypeEventStrings)) {
        return getHaplotypeSwitchCode(cap, haplotypeEventStrings);
    }

    *insertLength = 0;
    *deleteLength = 0;

    End *startEnd = cap_getEnd(cap);
    Cap *farCap = NULL;
    int64_t insertedBases = 0;
    int64_t ambiguousBases = 0;
    bool reachedStub = !getCapGetAtEndOfPath(cap, &farCap, &insertedBases, &ambiguousBases,
            haplotypeEventStrings, contaminationEventStrings);
    *otherCap = farCap;
    assert(insertedBases >= 0);
    assert(ambiguousBases >= 0);
    assert(farCap != NULL);
    End *farEnd = cap_getEnd(farCap);
    ambiguousBases += getBoundingNs(cap) + getBoundingNs(farCap);

    //Case 1: the path runs into a stub, i.e. this is the end of a contig.
    if (reachedStub) {
        assert(!hasCapInEvents(farEnd, contaminationEventStrings));
        //Can not test for hap event strings, as a stub end may contain the reference.
        if (insertedBases == 0) {
            return CONTIG_END;
        }
        if (ambiguousBases < 1) {
            return ERROR_CONTIG_END_WITH_INSERT;
        }
        if (ambiguousBases >= capCodeParameters->minimumNCount) {
            return CONTIG_END_WITH_SCAFFOLD_GAP;
        }
        return CONTIG_END_WITH_AMBIGUITY_GAP;
    }

    //Case 2: the far end is not a haplotype end.
    if (!hasCapInEvents(farEnd, haplotypeEventStrings)) {
        if (insertedBases == 0) {
            return ERROR_HAP_TO_CONTAMINATION;
        }
        return ERROR_HAP_TO_INSERT_TO_CONTAMINATION;
    }

    //Case 3: both ends are haplotype ends but they are not connected.
    if (!endsAreConnected(startEnd, farEnd, haplotypeEventStrings)) {
        return ERROR_HAP_TO_HAP_DIFFERENT_CHROMOSOMES;
    }

    //Case 4: connected; work out whether this is an indel or an order breaking rearrangement.
    int64_t deletedBases;
    if (!endsAreAdjacent(startEnd, farEnd, &deletedBases, haplotypeEventStrings)) {
        return ERROR_HAP_TO_HAP_SAME_CHROMOSOME;
    }
    *insertLength = insertedBases;
    *deleteLength = deletedBases;

    if (ambiguousBases >= capCodeParameters->minimumNCount) {
        //The insertion is a scaffold gap
        return SCAFFOLD_GAP;
    }
    if (ambiguousBases >= 1) {
        return AMBIGUITY_GAP;
    }

    bool insertTooLong = insertedBases >= capCodeParameters->maxInsertionLength;
    bool deleteTooLong = deletedBases >= capCodeParameters->maxDeletionLength;

    if (insertedBases <= 0) {
        //No inserted sequence, so this has to be a deletion
        assert(deletedBases > 0);
        if (deleteTooLong) {
            return ERROR_HAP_TO_HAP_SAME_CHROMOSOME;
        }
        return ERROR_HAP_TO_DELETION;
    }
    if (deletedBases > 0) {
        //Insertion combined with a deletion
        if (deleteTooLong || insertTooLong) {
            return ERROR_HAP_TO_HAP_SAME_CHROMOSOME;
        }
        return ERROR_HAP_TO_INSERT_AND_DELETION;
    }
    //Insertion only
    if (insertTooLong) {
        return ERROR_HAP_TO_HAP_SAME_CHROMOSOME;
    }
    return ERROR_HAP_TO_INSERT;
}
int main(int argc, char *argv[]) { st_setLogLevelFromString(argv[1]); st_logDebug("Set up logging\n"); stKVDatabaseConf *kvDatabaseConf = stKVDatabaseConf_constructFromString(argv[2]); CactusDisk *cactusDisk = cactusDisk_construct(kvDatabaseConf, 0); stKVDatabaseConf_destruct(kvDatabaseConf); st_logDebug("Set up the flower disk\n"); Name flowerName = cactusMisc_stringToName(argv[3]); Flower *flower = cactusDisk_getFlower(cactusDisk, flowerName); int64_t totalBases = flower_getTotalBaseLength(flower); int64_t totalEnds = flower_getEndNumber(flower); int64_t totalFreeEnds = flower_getFreeStubEndNumber(flower); int64_t totalAttachedEnds = flower_getAttachedStubEndNumber(flower); int64_t totalCaps = flower_getCapNumber(flower); int64_t totalBlocks = flower_getBlockNumber(flower); int64_t totalGroups = flower_getGroupNumber(flower); int64_t totalChains = flower_getChainNumber(flower); int64_t totalLinkGroups = 0; int64_t maxEndDegree = 0; int64_t maxAdjacencyLength = 0; int64_t totalEdges = 0; Flower_EndIterator *endIt = flower_getEndIterator(flower); End *end; while((end = flower_getNextEnd(endIt)) != NULL) { assert(end_getOrientation(end)); if(end_getInstanceNumber(end) > maxEndDegree) { maxEndDegree = end_getInstanceNumber(end); } stSortedSet *ends = stSortedSet_construct(); End_InstanceIterator *capIt = end_getInstanceIterator(end); Cap *cap; while((cap = end_getNext(capIt)) != NULL) { if(cap_getSequence(cap) != NULL) { Cap *adjacentCap = cap_getAdjacency(cap); assert(adjacentCap != NULL); End *adjacentEnd = end_getPositiveOrientation(cap_getEnd(adjacentCap)); stSortedSet_insert(ends, adjacentEnd); int64_t adjacencyLength = cap_getCoordinate(cap) - cap_getCoordinate(adjacentCap); if(adjacencyLength < 0) { adjacencyLength *= -1; } assert(adjacencyLength >= 1); if(adjacencyLength >= maxAdjacencyLength) { maxAdjacencyLength = adjacencyLength; } } } end_destructInstanceIterator(capIt); totalEdges += stSortedSet_size(ends); if(stSortedSet_search(ends, end) != NULL) { 
//This ensures we count self edges twice, so that the division works. totalEdges += 1; } stSortedSet_destruct(ends); } assert(totalEdges % 2 == 0); flower_destructEndIterator(endIt); Flower_GroupIterator *groupIt = flower_getGroupIterator(flower); Group *group; while((group = flower_getNextGroup(groupIt)) != NULL) { if(group_getLink(group) != NULL) { totalLinkGroups++; } } flower_destructGroupIterator(groupIt); printf("flower name: %" PRIi64 " total bases: %" PRIi64 " total-ends: %" PRIi64 " total-caps: %" PRIi64 " max-end-degree: %" PRIi64 " max-adjacency-length: %" PRIi64 " total-blocks: %" PRIi64 " total-groups: %" PRIi64 " total-edges: %" PRIi64 " total-free-ends: %" PRIi64 " total-attached-ends: %" PRIi64 " total-chains: %" PRIi64 " total-link groups: %" PRIi64 "\n", flower_getName(flower), totalBases, totalEnds, totalCaps, maxEndDegree, maxAdjacencyLength, totalBlocks, totalGroups, totalEdges/2, totalFreeEnds, totalAttachedEnds, totalChains, totalLinkGroups); return 0; }
int mapGene(Cap *cap, int level, int exon, struct bed *gene, FILE *fileHandle){
    /*
     *Following cactus adjacencies, starting from 'cap', find regions that overlap with
     *exons of input gene. Report chain relations of these regions with the exons.
     *cap: current cap. Level = chain level. exon = exon number. gene = bed record of gene
     *
     *If 'cap' is a stub cap whose group has a nested flower, the nested flower is
     *traversed first (recursively, at level + 1). Blocks are then visited one by one
     *until a stub cap is reached or all exons of the gene have been handled. When an
     *exon is finished, "</exon>" and, if another exon remains, the opening tag of the
     *next exon are written to fileHandle.
     *
     *Returns the number of the exon that was current when the traversal stopped; this
     *equals gene->blockCount once every exon has been handled.
     */
    int64_t exonStart = 0, exonEnd = 0;
    if(isStubCap(cap)){
        Group *group = end_getGroup(cap_getEnd(cap));
        Flower *nestedFlower = group_getNestedFlower(group);
        if(nestedFlower != NULL){//recursive call
            Cap *childCap = flower_getCap(nestedFlower, cap_getName(cap));
            assert(childCap != NULL);
            exon = mapGene(childCap, level + 1, exon, gene, fileHandle);
        }
    }
    cap = cap_getAdjacency(cap);
    Cap *nextcap;
    int64_t capCoor;
    //Only look up the exon bounds while 'exon' is a valid index: the recursive call
    //above may already have consumed the last exon.
    if(exon < gene->blockCount){
        exonStart = gene->chromStarts->list[exon] + gene->chromStart;
        exonEnd = exonStart + gene->blockSizes->list[exon];
    }
    Block *block = end_getBlock(cap_getEnd(cap));
    if(block == NULL){
        moveCapToNextBlock(&cap);
    }
    while(!isStubCap(cap) && exon < gene->blockCount){
        End *cend = cap_getEnd(cap);
        capCoor = cap_getCoordinate(cap);//Cap coordinate is always the coordinate on + strand
        nextcap = cap_getAdjacency(cap_getOtherSegmentCap(cap));
        //The coordinates are 64 bit values, so they must not be printed with %d
        st_logInfo("capCoor: %" PRIi64 ", nextCap: %" PRIi64 ", eStart: %" PRIi64 ", eEnd: %" PRIi64 ". Exon: %d\n",
                (int64_t)capCoor, (int64_t)cap_getCoordinate(nextcap), exonStart, exonEnd, exon);

        //keep moving if nextBlock Start is still upstream of current exon
        if(cap_getCoordinate(nextcap) <= exonStart){
            moveCapToNextBlock(&cap);
            st_logInfo("Still upstream, nextcap <= exonStart. Move to next chainBlock\n");
        }else if(capCoor >= exonEnd){//Done with current exon, move to next
            st_logInfo("Done with current exon, move to next one\n\n");
            fprintf(fileHandle, "\t\t</exon>\n");//end previous exon
            exon++;
            if(exon < gene->blockCount){
                exonStart = gene->chromStarts->list[exon] + gene->chromStart;
                exonEnd = exonStart + gene->blockSizes->list[exon];
                fprintf(fileHandle, "\t\t<exon id=\"%d\" start=\"%" PRIi64 "\" end=\"%" PRIi64 "\">\n", exon, exonStart, exonEnd);
            }
        }else{//current exon overlaps with current block Or with lower level flower
            Cap *oppcap = cap_getOtherSegmentCap(cap);
            st_logInfo("Current exon overlaps with current block or with lower flower\n");
            if(cap_getCoordinate(oppcap) >= exonStart && exonEnd > capCoor){
                mapBlockToExon(cap, level, fileHandle);
                if(exonEnd <= cap_getCoordinate(oppcap) + 1){
                    st_logInfo("Done with current exon, move to next one\n\n");
                    fprintf(fileHandle, "\t\t</exon>\n");//end previous exon
                    exon++;
                    if(exon < gene->blockCount){
                        exonStart = gene->chromStarts->list[exon] + gene->chromStart;
                        exonEnd = exonStart + gene->blockSizes->list[exon];
                        fprintf(fileHandle, "\t\t<exon id=\"%d\" start=\"%" PRIi64 "\" end=\"%" PRIi64 "\">\n", exon, exonStart, exonEnd);
                    }
                    //Stay on this block: the next exon may overlap it as well
                    continue;
                }
            }
            //Traverse lower level flowers if exists
            Group *group = end_getGroup(end_getOtherBlockEnd(cend));
            Flower *nestedFlower = group_getNestedFlower(group);
            if(nestedFlower != NULL){//recursive call
                Cap *childCap = flower_getCap(nestedFlower, cap_getName(cap_getOtherSegmentCap(cap)));
                assert(childCap != NULL);
                exon = mapGene(childCap, level + 1, exon, gene, fileHandle);
                //The recursion may have finished the last exon; in that case there are
                //no further bounds to read and the loop condition ends the traversal.
                if(exon < gene->blockCount){
                    exonStart = gene->chromStarts->list[exon] + gene->chromStart;
                    exonEnd = exonStart + gene->blockSizes->list[exon];
                }
            }
            moveCapToNextBlock(&cap);
        }
    }
    return exon;
}