/*
 * Exercises stSortedSet_equals against the file-level fixtures (sortedSet,
 * sortedSet2, input, size), which are defined outside this function.
 *
 * Covered cases: a set compared with itself (empty and filled), two sets
 * whose contents differ by one element, two sets with matching contents, and
 * a set created with stSortedSet_construct(), which the original author
 * notes has a different comparator from the fixtures.
 */
static void test_stSortedSetEquals(CuTest *testCase) {
    sonLibSortedSetTestSetup();

    /* A set must equal itself while still empty... */
    CuAssertTrue(testCase, stSortedSet_equals(sortedSet, sortedSet));

    for (int32_t idx = 0; idx < size; idx++) {
        stSortedSet_insert(sortedSet, stIntTuple_construct(1, input[idx]));
    }

    /* ...and after it has been filled; it must not equal the other fixture yet. */
    CuAssertTrue(testCase, stSortedSet_equals(sortedSet, sortedSet));
    CuAssertTrue(testCase, !stSortedSet_equals(sortedSet, sortedSet2));

    /*
     * Fill the second fixture with everything except input[0]. Per the
     * original note, the first value is unique within input, so the two
     * sets still differ by exactly that element.
     */
    for (int32_t idx = 1; idx < size; idx++) {
        stSortedSet_insert(sortedSet2, stIntTuple_construct(1, input[idx]));
    }
    CuAssertTrue(testCase, !stSortedSet_equals(sortedSet, sortedSet2));

    /* Adding the missing element makes the sets compare equal. */
    stSortedSet_insert(sortedSet2, stIntTuple_construct(1, input[0]));
    CuAssertTrue(testCase, stSortedSet_equals(sortedSet, sortedSet2));

    /* A set with a different comparator must never compare equal, empty or filled. */
    stSortedSet *otherComparatorSet = stSortedSet_construct();
    CuAssertTrue(testCase, !stSortedSet_equals(sortedSet, otherComparatorSet));
    for (int32_t idx = 0; idx < size; idx++) {
        stSortedSet_insert(otherComparatorSet, stIntTuple_construct(1, input[idx]));
    }
    CuAssertTrue(testCase, !stSortedSet_equals(sortedSet, otherComparatorSet));

    stSortedSet_destruct(otherComparatorSet);
    sonLibSortedSetTestTeardown();
}
/*
 * Deliberately simple re-implementation of the greedy component function,
 * kept only so that typos in the real one are caught.
 *
 * Every node in the file-level list `nodes` (defined outside this function)
 * starts in its own singleton set. Each edge in filteredEdges then joins the
 * sets of its two endpoints, which are read from positions 1 and 2 of the
 * edge tuple.
 *
 * Returns a hash from node tuple to the stSortedSet holding its component.
 * Nodes in the same component map to the same set object. The hash is built
 * with NULL key and value destructors.
 */
static stHash *getComponents(stList *filteredEdges) {
    stHash *nodesToComponents = stHash_construct3(
            (uint64_t(*)(const void *)) stIntTuple_hashKey,
            (int(*)(const void *, const void *)) stIntTuple_equalsFn,
            NULL, NULL);

    /* Seed: one singleton component per node. */
    int64_t nodeCount = stList_length(nodes);
    for (int64_t n = 0; n < nodeCount; n++) {
        stIntTuple *node = stList_get(nodes, n);
        stSortedSet *singleton = stSortedSet_construct();
        stSortedSet_insert(singleton, node);
        stHash_insert(nodesToComponents, node, singleton);
    }

    /* Merge: each edge joins the components of its two endpoints. */
    int64_t edgeCount = stList_length(filteredEdges);
    for (int64_t e = 0; e < edgeCount; e++) {
        stIntTuple *edge = stList_get(filteredEdges, e);

        /* Temporary keys are only needed for the two lookups. */
        stIntTuple *keyA = stIntTuple_construct1(stIntTuple_get(edge, 1));
        stIntTuple *keyB = stIntTuple_construct1(stIntTuple_get(edge, 2));
        stSortedSet *component1 = stHash_search(nodesToComponents, keyA);
        stSortedSet *component2 = stHash_search(nodesToComponents, keyB);
        assert(component1 != NULL && component2 != NULL);
        stIntTuple_destruct(keyA);
        stIntTuple_destruct(keyB);

        if (component1 == component2) {
            continue; /* Endpoints are already in the same component. */
        }

        /* Point every member of the union at the merged set, then drop the old sets. */
        stSortedSet *merged = stSortedSet_getUnion(component1, component2);
        stSortedSetIterator *memberIt = stSortedSet_getIterator(merged);
        for (stIntTuple *member = stSortedSet_getNext(memberIt); member != NULL;
                member = stSortedSet_getNext(memberIt)) {
            stHash_insert(nodesToComponents, member, merged);
        }
        stSortedSet_destructIterator(memberIt);
        stSortedSet_destruct(component1);
        stSortedSet_destruct(component2);
    }

    return nodesToComponents;
}
/*
 * Gets the haplotype sequences in the set.
 *
 * Creates an empty sorted set, hands it to getMetaSequencesForEventsP together
 * with the flower and the event strings so that it can be filled, and returns
 * it. The caller receives the newly constructed set.
 */
stSortedSet *getMetaSequencesForEvents(Flower *flower, stList *eventStrings) {
    stSortedSet *collected = stSortedSet_construct();

    getMetaSequencesForEventsP(collected, flower, eventStrings);

    return collected;
}
/*
 * Checks stList_filterToExclude and stList_filterToInclude.
 *
 * Uses the fixtures `list` and `strings`, which are defined outside this
 * function (presumably populated by setup() - the assertions below expect
 * list to hold strings[0..4] in order). The filter set holds strings[0] and
 * strings[4], so the exclude result must be strings[1..3] and the include
 * result must be strings[0] and strings[4], both in list order.
 */
void test_stList_filter(CuTest *testCase) {
    setup();

    stSortedSet *set = stSortedSet_construct();
    stSortedSet_insert(set, strings[0]);
    stSortedSet_insert(set, strings[4]);

    stList *list2 = stList_filterToExclude(list, set);
    stList *list3 = stList_filterToInclude(list, set);

    CuAssertTrue(testCase, stList_length(list2) == 3);
    CuAssertTrue(testCase, stList_length(list3) == 2);

    /* Results are compared by pointer, so they share elements with the fixture. */
    CuAssertTrue(testCase, stList_get(list2, 0) == strings[1]);
    CuAssertTrue(testCase, stList_get(list2, 1) == strings[2]);
    CuAssertTrue(testCase, stList_get(list2, 2) == strings[3]);
    CuAssertTrue(testCase, stList_get(list3, 0) == strings[0]);
    CuAssertTrue(testCase, stList_get(list3, 1) == strings[4]);

    /*
     * Release the containers created by this test; previously they leaked.
     * The set was built without an element destructor, so the fixture
     * strings are untouched by it.
     * NOTE(review): assumes the filtered lists carry no element destructor
     * either (they only borrow the fixture's elements) - confirm against
     * stList_filterToExclude / stList_filterToInclude.
     */
    stList_destruct(list3);
    stList_destruct(list2);
    stSortedSet_destruct(set);

    teardown();
}
/*
 * Exercises stSortedSet_getDifference against the file-level fixtures
 * (sortedSet, sortedSet2, input, size), which are defined outside this
 * function.
 *
 * Scenarios: empty minus empty, filled minus empty, filled minus an
 * overlapping set, and a pair of sets with different comparators, which is
 * expected to raise SORTED_SET_EXCEPTION_ID.
 */
static void test_stSortedSetDifference(CuTest *testCase) {
    sonLibSortedSetTestSetup();

    /* Empty minus empty is empty. */
    stSortedSet *emptyDifference = stSortedSet_getDifference(sortedSet, sortedSet2);
    CuAssertTrue(testCase, stSortedSet_size(emptyDifference) == 0);
    stSortedSet_destruct(emptyDifference);

    for (int32_t idx = 0; idx < size; idx++) {
        stSortedSet_insert(sortedSet, stIntTuple_construct(1, input[idx]));
    }

    /* Filled minus empty reproduces the filled set. */
    stSortedSet *unchangedDifference = stSortedSet_getDifference(sortedSet, sortedSet2);
    CuAssertTrue(testCase, stSortedSet_equals(sortedSet, unchangedDifference));
    stSortedSet_destruct(unchangedDifference);

    /*
     * Overlapping case: the subtrahend holds 0, 1 and 5. The size assertion
     * expects exactly two elements to be removed, and adding 1 and 5 back
     * must restore equality with the original set.
     */
    static const int subtrahendValues[] = { 0, 1, 5 };
    for (size_t k = 0; k < sizeof(subtrahendValues) / sizeof(subtrahendValues[0]); k++) {
        stSortedSet_insert(sortedSet2, stIntTuple_construct(1, subtrahendValues[k]));
    }
    stSortedSet *overlapDifference = stSortedSet_getDifference(sortedSet, sortedSet2);
    CuAssertTrue(testCase, stSortedSet_size(overlapDifference) == stSortedSet_size(sortedSet) - 2);
    CuAssertTrue(testCase, !stSortedSet_equals(sortedSet, overlapDifference));
    stSortedSet_insert(overlapDifference, stIntTuple_construct(1, 1));
    stSortedSet_insert(overlapDifference, stIntTuple_construct(1, 5));
    CuAssertTrue(testCase, stSortedSet_equals(sortedSet, overlapDifference));
    stSortedSet_destruct(overlapDifference);

    /* Sets with different comparators must raise; reaching the line after the call is a failure. */
    stSortedSet *otherComparatorSet = stSortedSet_construct();
    stTry {
        stSortedSet_getDifference(sortedSet, otherComparatorSet);
        CuAssertTrue(testCase, 0);
    } stCatch(except) {
        CuAssertTrue(testCase, stExcept_getId(except) == SORTED_SET_EXCEPTION_ID);
    } stTryEnd
    stSortedSet_destruct(otherComparatorSet);

    sonLibSortedSetTestTeardown();
}
/*
 * Exercises stSortedSet_getIntersection against the file-level fixtures
 * (sortedSet, sortedSet2, input, size), which are defined outside this
 * function.
 *
 * Scenarios: empty with empty, filled with empty, filled with an overlapping
 * set, and a pair of sets with different comparators, which is expected to
 * raise SORTED_SET_EXCEPTION_ID.
 */
static void test_stSortedSetIntersection(CuTest* testCase) {
    sonLibSortedSetTestSetup();

    //Check intersection of empty sets is okay..
    stSortedSet *sortedSet3 = stSortedSet_getIntersection(sortedSet, sortedSet2);
    CuAssertTrue(testCase, stSortedSet_size(sortedSet3) == 0);
    stSortedSet_destruct(sortedSet3);

    int32_t i;
    for(i=0; i<size; i++) {
        stSortedSet_insert(sortedSet, stIntTuple_construct(1, input[i]));
    }

    //Check intersection of empty and non-empty set is empty.
    sortedSet3 = stSortedSet_getIntersection(sortedSet, sortedSet2);
    CuAssertTrue(testCase, stSortedSet_size(sortedSet3) == 0);
    stSortedSet_destruct(sortedSet3);

    //Check intersection of two non-empty, overlapping sets is correct.
    //The second set holds 0, 1 and 5; the assertions expect only 1 and 5 to be shared.
    stSortedSet_insert(sortedSet2, stIntTuple_construct(1, 0));
    stSortedSet_insert(sortedSet2, stIntTuple_construct(1, 1));
    stSortedSet_insert(sortedSet2, stIntTuple_construct(1, 5));
    sortedSet3 = stSortedSet_getIntersection(sortedSet, sortedSet2);
    CuAssertTrue(testCase, stSortedSet_size(sortedSet3) == 2);
    stIntTuple *intTuple = stIntTuple_construct(1, 1);
    CuAssertTrue(testCase, stSortedSet_search(sortedSet3, intTuple) != NULL);
    stIntTuple_destruct(intTuple);
    intTuple = stIntTuple_construct(1, 5);
    CuAssertTrue(testCase, stSortedSet_search(sortedSet3, intTuple) != NULL);
    stIntTuple_destruct(intTuple);
    stSortedSet_destruct(sortedSet3);

    //Check we get an exception with sorted sets with different comparators.
    stSortedSet *sortedSet4 = stSortedSet_construct();
    stTry {
        stSortedSet_getIntersection(sortedSet, sortedSet4);
        //Reaching this line means no exception was raised, which must fail
        //the test (matches the check in the difference test).
        CuAssertTrue(testCase, 0);
    } stCatch(except) {
        CuAssertTrue(testCase, stExcept_getId(except) == SORTED_SET_EXCEPTION_ID);
    } stTryEnd
    stSortedSet_destruct(sortedSet4);

    sonLibSortedSetTestTeardown();
}
/*
 * Builds the list of maximal haplotype paths for the given event.
 *
 * The paths are produced by getMaximalHaplotypePathsP; the rest of the
 * function is debug logging and assert-based sanity checking. The returned
 * list is constructed with stList_destruct as its element destructor, so
 * destroying it also destroys the contained path lists.
 */
stList *getContigPaths(Flower *flower, const char *eventString, stList *eventStrings) {
    stList *maximalHaplotypePaths = stList_construct3(0, (void(*)(void *)) stList_destruct);
    stSortedSet *segmentSet = stSortedSet_construct();
    getMaximalHaplotypePathsP(flower, maximalHaplotypePaths, segmentSet, eventString, eventStrings);

    /* Everything from here to the cleanup is debug checking only. */
    int64_t pathCount = stList_length(maximalHaplotypePaths);
    st_logDebug("We have %" PRIi64 " maximal haplotype paths\n", pathCount);
    getMaximalHaplotypePathsCheck(flower, segmentSet, eventString, eventStrings);

    for (int64_t pathIndex = 0; pathIndex < pathCount; pathIndex++) {
        stList *path = stList_get(maximalHaplotypePaths, pathIndex);
        int64_t pathLength = stList_length(path);
        st_logDebug("We have a maximal haplotype path with length %" PRIi64 "\n", pathLength);
        assert(pathLength > 0);

        /* Neither outward-facing cap of the path may be a true adjacency when it has a neighbour. */
        Segment *fivePrimeSegment = stList_get(path, 0);
        Segment *threePrimeSegment = stList_get(path, pathLength - 1);
        if (getAdjacentCapsSegment(segment_get5Cap(fivePrimeSegment)) != NULL) {
            assert(!trueAdjacency(segment_get5Cap(fivePrimeSegment), eventStrings));
        }
        if (getAdjacentCapsSegment(segment_get3Cap(threePrimeSegment)) != NULL) {
            assert(!trueAdjacency(segment_get3Cap(threePrimeSegment), eventStrings));
        }

        /*
         * Each consecutive pair inside the path must be joined by a true
         * adjacency, belong to the requested event, and have 5' ends with a
         * cap in the given events.
         */
        int64_t lastIndex = pathLength - 1;
        for (int64_t pairIndex = 0; pairIndex < lastIndex; pairIndex++) {
            fivePrimeSegment = stList_get(path, pairIndex);
            threePrimeSegment = stList_get(path, pairIndex + 1);
            assert(trueAdjacency(segment_get3Cap(fivePrimeSegment), eventStrings));
            assert(trueAdjacency(segment_get5Cap(threePrimeSegment), eventStrings));
            assert(cap_getAdjacency(getTerminalCap(segment_get3Cap(fivePrimeSegment)))
                    == getTerminalCap(segment_get5Cap(threePrimeSegment)));
            assert(strcmp(event_getHeader(segment_getEvent(fivePrimeSegment)), eventString) == 0);
            assert(strcmp(event_getHeader(segment_getEvent(threePrimeSegment)), eventString) == 0);
            assert(hasCapInEvents(cap_getEnd(segment_get5Cap(fivePrimeSegment)), eventStrings));
            assert(hasCapInEvents(cap_getEnd(segment_get5Cap(threePrimeSegment)), eventStrings));
        }
    }

    stSortedSet_destruct(segmentSet);
    return maximalHaplotypePaths;
}
/*
 * Gets a list of connected components, each connected component being
 * represented as a list of the edges, such that each edge is in exactly one
 * connected component. Allows for multi-graphs (multiple edges connecting
 * two nodes).
 *
 * The returned list is constructed with stList_destruct as its element
 * destructor, so destroying it also destroys the per-component lists.
 */
stList *getComponents(stList *edges) {
    stHash *nodesToEdges = getNodesToEdgesHash(edges);
    stList *components = stList_construct3(0, (void(*)(void *)) stList_destruct);

    /* Traverse greedily: work through every node key the hash started with. */
    stList *pendingNodes = stHash_getKeys(nodesToEdges);
    while (stList_length(pendingNodes) > 0) {
        stIntTuple *seedNode = stList_pop(pendingNodes);
        stList *incidentEdges = stHash_search(nodesToEdges, seedNode);

        if (incidentEdges != NULL) {
            /* The node is still in the hash, so it seeds a new component. */
            stSortedSet *component = stSortedSet_construct();
            stHash_remove(nodesToEdges, seedNode);

            /* Hand both endpoints (tuple positions 0 and 1) of every incident edge to the helper. */
            int64_t edgeIndex = 0;
            while (edgeIndex < stList_length(incidentEdges)) {
                stIntTuple *edge = stList_get(incidentEdges, edgeIndex);
                getComponentsP(nodesToEdges, stIntTuple_get(edge, 0), component);
                getComponentsP(nodesToEdges, stIntTuple_get(edge, 1), component);
                edgeIndex++;
            }

            stList_append(components, stSortedSet_getList(component));

            /* The set and this node's edge list are no longer needed. */
            stSortedSet_destruct(component);
            stList_destruct(incidentEdges);
        }

        /* Each popped key is destroyed here whether or not it seeded a component. */
        stIntTuple_destruct(seedNode);
    }

    /* Every node is expected to have been removed from the hash by now. */
    assert(stHash_size(nodesToEdges) == 0);
    stHash_destruct(nodesToEdges);
    stList_destruct(pendingNodes);
    return components;
}
int main(int argc, char *argv[]) { st_setLogLevelFromString(argv[1]); st_logDebug("Set up logging\n"); stKVDatabaseConf *kvDatabaseConf = stKVDatabaseConf_constructFromString(argv[2]); CactusDisk *cactusDisk = cactusDisk_construct(kvDatabaseConf, 0); stKVDatabaseConf_destruct(kvDatabaseConf); st_logDebug("Set up the flower disk\n"); Name flowerName = cactusMisc_stringToName(argv[3]); Flower *flower = cactusDisk_getFlower(cactusDisk, flowerName); int64_t totalBases = flower_getTotalBaseLength(flower); int64_t totalEnds = flower_getEndNumber(flower); int64_t totalFreeEnds = flower_getFreeStubEndNumber(flower); int64_t totalAttachedEnds = flower_getAttachedStubEndNumber(flower); int64_t totalCaps = flower_getCapNumber(flower); int64_t totalBlocks = flower_getBlockNumber(flower); int64_t totalGroups = flower_getGroupNumber(flower); int64_t totalChains = flower_getChainNumber(flower); int64_t totalLinkGroups = 0; int64_t maxEndDegree = 0; int64_t maxAdjacencyLength = 0; int64_t totalEdges = 0; Flower_EndIterator *endIt = flower_getEndIterator(flower); End *end; while((end = flower_getNextEnd(endIt)) != NULL) { assert(end_getOrientation(end)); if(end_getInstanceNumber(end) > maxEndDegree) { maxEndDegree = end_getInstanceNumber(end); } stSortedSet *ends = stSortedSet_construct(); End_InstanceIterator *capIt = end_getInstanceIterator(end); Cap *cap; while((cap = end_getNext(capIt)) != NULL) { if(cap_getSequence(cap) != NULL) { Cap *adjacentCap = cap_getAdjacency(cap); assert(adjacentCap != NULL); End *adjacentEnd = end_getPositiveOrientation(cap_getEnd(adjacentCap)); stSortedSet_insert(ends, adjacentEnd); int64_t adjacencyLength = cap_getCoordinate(cap) - cap_getCoordinate(adjacentCap); if(adjacencyLength < 0) { adjacencyLength *= -1; } assert(adjacencyLength >= 1); if(adjacencyLength >= maxAdjacencyLength) { maxAdjacencyLength = adjacencyLength; } } } end_destructInstanceIterator(capIt); totalEdges += stSortedSet_size(ends); if(stSortedSet_search(ends, end) != NULL) { 
//This ensures we count self edges twice, so that the division works. totalEdges += 1; } stSortedSet_destruct(ends); } assert(totalEdges % 2 == 0); flower_destructEndIterator(endIt); Flower_GroupIterator *groupIt = flower_getGroupIterator(flower); Group *group; while((group = flower_getNextGroup(groupIt)) != NULL) { if(group_getLink(group) != NULL) { totalLinkGroups++; } } flower_destructGroupIterator(groupIt); printf("flower name: %" PRIi64 " total bases: %" PRIi64 " total-ends: %" PRIi64 " total-caps: %" PRIi64 " max-end-degree: %" PRIi64 " max-adjacency-length: %" PRIi64 " total-blocks: %" PRIi64 " total-groups: %" PRIi64 " total-edges: %" PRIi64 " total-free-ends: %" PRIi64 " total-attached-ends: %" PRIi64 " total-chains: %" PRIi64 " total-link groups: %" PRIi64 "\n", flower_getName(flower), totalBases, totalEnds, totalCaps, maxEndDegree, maxAdjacencyLength, totalBlocks, totalGroups, totalEdges/2, totalFreeEnds, totalAttachedEnds, totalChains, totalLinkGroups); return 0; }
static stHash *getScaffoldPathsP(stList *haplotypePaths, stHash *haplotypePathToScaffoldPathHash, stList *haplotypeEventStrings, stList *contaminationEventStrings, CapCodeParameters *capCodeParameters) { stHash *haplotypeToMaximalHaplotypeLengthHash = buildContigPathToContigPathLengthHash(haplotypePaths); stHash *segmentToMaximalHaplotypePathHash = buildSegmentToContigPathHash(haplotypePaths); for (int64_t i = 0; i < stList_length(haplotypePaths); i++) { stSortedSet *bucket = stSortedSet_construct(); stHash_insert(haplotypePathToScaffoldPathHash, stList_get(haplotypePaths, i), bucket); stSortedSet_insert(bucket, stList_get(haplotypePaths, i)); } for (int64_t i = 0; i < stList_length(haplotypePaths); i++) { stList *haplotypePath = stList_get(haplotypePaths, i); assert(stList_length(haplotypePath) > 0); Segment *_5Segment = stList_get(haplotypePath, 0); if (!segment_getStrand(_5Segment)) { _5Segment = segment_getReverse(stList_get(haplotypePath, stList_length(haplotypePath) - 1)); } assert(segment_getStrand(_5Segment)); if (getAdjacentCapsSegment(segment_get5Cap(_5Segment)) != NULL) { assert(!trueAdjacency(segment_get5Cap(_5Segment), haplotypeEventStrings)); } int64_t insertLength; int64_t deleteLength; Cap *otherCap; enum CapCode _5CapCode = getCapCode(segment_get5Cap(_5Segment), &otherCap, haplotypeEventStrings, contaminationEventStrings, &insertLength, &deleteLength, capCodeParameters); if (_5CapCode == SCAFFOLD_GAP || _5CapCode == AMBIGUITY_GAP) { assert(stHash_search(haplotypeToMaximalHaplotypeLengthHash, haplotypePath) != NULL); int64_t j = stIntTuple_get(stHash_search(haplotypeToMaximalHaplotypeLengthHash, haplotypePath), 0); Segment *adjacentSegment = getAdjacentCapsSegment(segment_get5Cap(_5Segment)); assert(adjacentSegment != NULL); while (!hasCapInEvents(cap_getEnd(segment_get5Cap(adjacentSegment)), haplotypeEventStrings)) { //is not a haplotype end adjacentSegment = getAdjacentCapsSegment(segment_get5Cap(adjacentSegment)); assert(adjacentSegment != NULL); 
} assert(adjacentSegment != NULL); assert(hasCapInEvents(cap_getEnd(segment_get5Cap(adjacentSegment)), haplotypeEventStrings)); //is a haplotype end stList *adjacentHaplotypePath = stHash_search(segmentToMaximalHaplotypePathHash, adjacentSegment); if (adjacentHaplotypePath == NULL) { adjacentHaplotypePath = stHash_search(segmentToMaximalHaplotypePathHash, segment_getReverse( adjacentSegment)); } assert(adjacentHaplotypePath != NULL); assert(adjacentHaplotypePath != haplotypePath); assert(stHash_search(haplotypeToMaximalHaplotypeLengthHash, adjacentHaplotypePath) != NULL); int64_t k = stIntTuple_get(stHash_search(haplotypeToMaximalHaplotypeLengthHash, adjacentHaplotypePath), 0); //Now merge the buckets and make new int tuples.. stSortedSet *bucket1 = stHash_search(haplotypePathToScaffoldPathHash, haplotypePath); stSortedSet *bucket2 = stHash_search(haplotypePathToScaffoldPathHash, adjacentHaplotypePath); assert(bucket1 != NULL); assert(bucket2 != NULL); assert(bucket1 != bucket2); stSortedSet *bucket3 = stSortedSet_getUnion(bucket1, bucket2); stSortedSetIterator *bucketIt = stSortedSet_getIterator(bucket3); stList *l; while ((l = stSortedSet_getNext(bucketIt)) != NULL) { //Do the bucket first assert(stHash_search(haplotypePathToScaffoldPathHash, l) == bucket1 || stHash_search(haplotypePathToScaffoldPathHash, l) == bucket2); stHash_remove(haplotypePathToScaffoldPathHash, l); stHash_insert(haplotypePathToScaffoldPathHash, l, bucket3); //Now the length stIntTuple *m = stHash_remove(haplotypeToMaximalHaplotypeLengthHash, l); assert(m != NULL); assert(stIntTuple_get(m, 0) == j || stIntTuple_get(m, 0) == k); stHash_insert(haplotypeToMaximalHaplotypeLengthHash, l, stIntTuple_construct1( j + k)); stIntTuple_destruct(m); } assert(stHash_search(haplotypePathToScaffoldPathHash, haplotypePath) == bucket3); assert(stHash_search(haplotypePathToScaffoldPathHash, adjacentHaplotypePath) == bucket3); stSortedSet_destructIterator(bucketIt); } } 
stHash_destruct(segmentToMaximalHaplotypePathHash); return haplotypeToMaximalHaplotypeLengthHash; }