/*
 * Adds an edge between node1 and node2.
 *
 * The edge is stored as two heap-allocated adjacency records that reference each
 * other through their `inverse` fields. Each record is pushed onto the head of the
 * doubly linked adjacency list kept in `nodesToAdjList` for its source node.
 * The connectivity cache is invalidated before anything is modified.
 */
void stNaiveConnectivity_addEdge(stNaiveConnectivity *connectivity, void *node1, void *node2) {
    invalidateCache(connectivity);

    struct adjacency *forward = malloc(sizeof(struct adjacency));
    struct adjacency *backward = malloc(sizeof(struct adjacency));

    // forward lives in node1's list and points at node2; backward is the mirror image.
    forward->toNode = node2;
    backward->toNode = node1;
    forward->inverse = backward;
    backward->inverse = forward;
    forward->prev = NULL;
    backward->prev = NULL;

    // Push forward onto the head of node1's list (the list may be empty, i.e. NULL).
    struct adjacency *head1 = stHash_search(connectivity->nodesToAdjList, node1);
    forward->next = head1;
    if (head1 != NULL) {
        head1->prev = forward;
    }
    stHash_remove(connectivity->nodesToAdjList, node1);
    stHash_insert(connectivity->nodesToAdjList, node1, forward);

    // Same for node2. The lookup happens after node1's update, so a self-loop
    // sees the record that was just inserted.
    struct adjacency *head2 = stHash_search(connectivity->nodesToAdjList, node2);
    backward->next = head2;
    if (head2 != NULL) {
        head2->prev = backward;
    }
    stHash_remove(connectivity->nodesToAdjList, node2);
    stHash_insert(connectivity->nodesToAdjList, node2, backward);
}
/* Parse XML string into a hash. This parses all attributes of all tags * into values. st_kv_database_conf type is stored as conf_type, * database tag is stores as db_tag. This does minimal error checking * and is really lame. */ static stHash *hackParseXmlString(const char *xmlString) { stHash *hash = stHash_construct3(stHash_stringKey, stHash_stringEqualKey, free, free); char *toReplace[5] = { "</", "<", "/>", ">", "=" }; char *cA = stString_replace(xmlString, toReplace[0], " "), *cA2; for (int64_t i = 1; i < 5; i++) { cA2 = stString_replace(cA, toReplace[i], " "); free(cA); cA = cA2; } getExpectedToken(&cA2, "st_kv_database_conf"); stHash_insert(hash, stString_copy("conf_type"), getKeyValue(&cA2, "type")); stHash_insert(hash, stString_copy("db_tag"), getNextToken(&cA2)); char *key; while (((key = getNextToken(&cA2)) != NULL) && !stString_eq(key, "st_kv_database_conf")) { char *value = getNextToken(&cA2); if (value == NULL) { stThrowNew(ST_KV_DATABASE_EXCEPTION_ID, "failed to to get value for key \"%s\"", key); } if (stHash_search(hash, key) != NULL) { stThrowNew(ST_KV_DATABASE_EXCEPTION_ID, "got a duplicate entry in the database conf string \"%s\"", key); } stHash_insert(hash, key, value); } if(!stString_eq(key, "st_kv_database_conf")) { stThrowNew(ST_KV_DATABASE_EXCEPTION_ID, "got an unexpected final entry \"%s\"", key); } free(key); free(cA); return hash; }
/*
 * Deliberately simple re-implementation of the greedy component builder, kept
 * only as a cross-check against typos in the real one.
 *
 * Starts with each entry of `nodes` in its own sorted set, then for every edge in
 * filteredEdges merges the sets containing the nodes stored at tuple positions 1 and 2.
 * Returns a hash from node to the set that currently contains it; the hash is
 * constructed with no key or value destructors.
 */
static stHash *getComponents(stList *filteredEdges) {
    stHash *nodesToComponents = stHash_construct3(
            (uint64_t(*)(const void *)) stIntTuple_hashKey,
            (int(*)(const void *, const void *)) stIntTuple_equalsFn,
            NULL, NULL);

    // Every node begins as a singleton component.
    for (int64_t nodeIndex = 0; nodeIndex < stList_length(nodes); nodeIndex++) {
        stIntTuple *node = stList_get(nodes, nodeIndex);
        stSortedSet *singleton = stSortedSet_construct();
        stSortedSet_insert(singleton, node);
        stHash_insert(nodesToComponents, node, singleton);
    }

    // Union the components at either end of each edge.
    for (int64_t edgeIndex = 0; edgeIndex < stList_length(filteredEdges); edgeIndex++) {
        stIntTuple *edge = stList_get(filteredEdges, edgeIndex);

        // Temporary single-value tuples used purely as lookup keys.
        stIntTuple *key1 = stIntTuple_construct1(stIntTuple_get(edge, 1));
        stIntTuple *key2 = stIntTuple_construct1(stIntTuple_get(edge, 2));

        stSortedSet *left = stHash_search(nodesToComponents, key1);
        stSortedSet *right = stHash_search(nodesToComponents, key2);
        assert(left != NULL && right != NULL);

        if (left != right) {
            stSortedSet *merged = stSortedSet_getUnion(left, right);

            // Re-point every member of the merged set at the new component.
            stSortedSetIterator *memberIt = stSortedSet_getIterator(merged);
            for (stIntTuple *member = stSortedSet_getNext(memberIt); member != NULL;
                    member = stSortedSet_getNext(memberIt)) {
                stHash_insert(nodesToComponents, member, merged);
            }
            stSortedSet_destructIterator(memberIt);

            stSortedSet_destruct(left);
            stSortedSet_destruct(right);
        }

        stIntTuple_destruct(key1);
        stIntTuple_destruct(key2);
    }
    return nodesToComponents;
}
/*
 * Concatenation with 2 bases of interstitial sequence and a sequence-length breakpoint.
 *
 * The second block places name3.chr1 at start 50, while the first block had it at
 * start 0 with length 13. The expected name3 row joins the two pieces with a run of
 * ten 'N' characters and is flagged as carrying multiple names.
 */
static void test_addBlockToHash_3(CuTest *testCase) {
    options_t *options = options_construct();
    options->breakpointPenalty = 10;
    options->interstitialSequence = 5;

    stList *observedList = stList_construct3(0, free);
    stList *expectedList = stList_construct3(0, free);

    // State before the new block is added.
    stHash *observedHash = createBlockHashFromString(
            "a score=0\n"
            "s reference.chr0 0 13 + 158545518 gcagctgaaaaca\n"
            "s name.chr1 0 10 + 100 ATGT---ATGCCG\n"
            "s name2.chr1 0 10 + 100 ATGT---ATGCCG\n"
            "s name3.chr1 0 13 + 100 GCAGCTGAAAACA\n",
            observedList);

    // The block to append.
    mafBlock_t *block = maf_newMafBlockListFromString(
            "a score=0 test\n"
            "s reference.chr0 13 5 + 158545518 ACGTA\n"
            "s name.chr1 12 5 + 100 gtcGG\n"
            "s name2.chr1 10 5 + 100 ATGTg\n"
            "s name3.chr1 50 5 + 100 CCCCC\n",
            3);

    // State expected afterwards.
    stHash *expectedHash = createBlockHashFromString(
            "a score=0\n"
            "s reference.chr0 0 18 + 158545518 gcagctgaaaaca------------ACGTA\n"
            "s name.chr1 0 17 + 100 ATGT---ATGCCGac----------gtcGG\n"
            "s name2.chr1 0 15 + 100 ATGT---ATGCCG------------ATGTg\n"
            "s name3 0 28 + 28 GCAGCTGAAAACA--NNNNNNNNNNCCCCC\n",
            expectedList);

    // Patch the bookkeeping fields of the expected name3 row by hand.
    row_t *name3Row = stHash_search(expectedHash, "name3");
    name3Row->prevRightPos = 54;
    free(name3Row->prevName);
    name3Row->prevName = stString_copy("name3.chr1");
    name3Row->multipleNames = true;

    // Underlying sequences, one per name in the blocks.
    stHash *seqHash = createSeqHashFromString("name.chr1",
            "ATGTATGCCGacgtc"
            "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG"
            "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG");
    mtfseq_t *referenceSeq = newMtfseqFromString(
            "gcagctgaaaacaACGTA"
            "tttttttttttttttttttttttttttttttt"
            "tttttttttttttttttttttttttttttttttttttttttttttttttt");
    stHash_insert(seqHash, stString_copy("reference.chr0"), referenceSeq);
    mtfseq_t *name2Seq = newMtfseqFromString(
            "ATGTATGCCGATGTg"
            "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"
            "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC");
    stHash_insert(seqHash, stString_copy("name2.chr1"), name2Seq);
    mtfseq_t *name3Seq = newMtfseqFromString(
            "GCAGCTGAAAACAggggggggggggggggggggggggggggggggggggg"
            "CCCCCaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa");
    stHash_insert(seqHash, stString_copy("name3.chr1"), name3Seq);

    addMafBlockToRowHash(observedHash, seqHash, observedList, block, options);

    CuAssertTrue(testCase, hashesAreEqual(observedHash, expectedHash));
    CuAssertTrue(testCase, listsAreEqual(observedList, expectedList));

    // clean up
    stHash_destruct(observedHash);
    stHash_destruct(expectedHash);
    stHash_destruct(seqHash);
    stList_destruct(observedList);
    stList_destruct(expectedList);
    maf_destroyMafBlockList(block);
    destroyOptions(options);
}
/*
 * Concatenation with 2 bases of interstitial sequence AND a previously
 * unobserved sequence.
 *
 * The appended block introduces name3.chr@, which has no row in the starting
 * hash. The expected result gives it a row that is gap-filled ('-') over the
 * columns that precede the new block.
 */
static void test_addBlockToHash_2(CuTest *testCase) {
    options_t *options = options_construct();
    options->breakpointPenalty = 10;
    options->interstitialSequence = 5;

    stList *observedList = stList_construct3(0, free);
    stList *expectedList = stList_construct3(0, free);

    // State before the new block is added.
    stHash *observedHash = createBlockHashFromString(
            "a score=0\n"
            "s reference.chr0 0 13 + 158545518 gcagctgaaaaca\n"
            "s name.chr1 0 10 + 100 ATGT---ATGCCG\n"
            "s name2.chr1 0 10 + 100 ATGT---ATGCCG\n",
            observedList);

    // The block to append.
    mafBlock_t *block = maf_newMafBlockListFromString(
            "a score=0 test\n"
            "s reference.chr0 13 5 + 158545518 ACGTA\n"
            "s name.chr1 12 5 + 100 gTcGG\n"
            "s name2.chr1 10 5 + 100 ATGTg\n"
            "s name3.chr@ 0 5 + 20 aaccg\n",
            3);

    // State expected afterwards.
    stHash *expectedHash = createBlockHashFromString(
            "a score=0\n"
            "s reference.chr0 0 18 + 158545518 gcagctgaaaaca--ACGTA\n"
            "s name.chr1 0 17 + 100 ATGT---ATGCCGacgTcGG\n"
            "s name2.chr1 0 15 + 100 ATGT---ATGCCG--ATGTg\n"
            "s name3.chr@ 0 5 + 20 ---------------aaccg\n",
            expectedList);

    // Underlying sequences, one per name in the blocks.
    stHash *seqHash = createSeqHashFromString("name.chr1",
            "ATGTATGCCGacgTc"
            "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG"
            "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG");
    mtfseq_t *referenceSeq = newMtfseqFromString(
            "gcagctgaaaacaACGTA"
            "tttttttttttttttttttttttttttttttt"
            "tttttttttttttttttttttttttttttttttttttttttttttttttt");
    stHash_insert(seqHash, stString_copy("reference.chr0"), referenceSeq);
    mtfseq_t *name2Seq = newMtfseqFromString(
            "ATGTATGCCGATGTg"
            "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"
            "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC");
    stHash_insert(seqHash, stString_copy("name2.chr1"), name2Seq);
    mtfseq_t *name3Seq = newMtfseqFromString("aaccgTTTTTTTTTTTTTTT");
    stHash_insert(seqHash, stString_copy("name3.chr@"), name3Seq);

    addMafBlockToRowHash(observedHash, seqHash, observedList, block, options);

    CuAssertTrue(testCase, hashesAreEqual(observedHash, expectedHash));
    CuAssertTrue(testCase, listsAreEqual(observedList, expectedList));

    // clean up
    stHash_destruct(observedHash);
    stHash_destruct(expectedHash);
    stHash_destruct(seqHash);
    stList_destruct(observedList);
    stList_destruct(expectedList);
    maf_destroyMafBlockList(block);
    destroyOptions(options);
}
/*
 * Builds a one-entry sequence hash for tests.
 *
 * A new mtfseq sized to strlen(input) is filled from `input` and stored under a
 * copy of `name`. The returned hash frees its keys with free() and its values
 * with destroyMtfseq.
 */
static stHash *createSeqHashFromString(char *name, char *input) {
    stHash *seqHash = stHash_construct3(stHash_stringKey, stHash_stringEqualKey, free, destroyMtfseq);

    mtfseq_t *sequence = newMtfseq(strlen(input));
    seq_copyIn(sequence, input);

    stHash_insert(seqHash, stString_copy(name), sequence);
    return seqHash;
}
// Returns a hash mapping from sequence header to sequence data. static stHash *readFastaFile(char *filename) { FILE *fasta = fopen(filename, "r"); if (fasta == NULL) { st_errnoAbort("Could not open fasta file %s", filename); } stHash *headerToData = stHash_construct3(stHash_stringKey, stHash_stringEqualKey, free, free); struct List *seqs = constructEmptyList(0, NULL); struct List *seqLengths = constructEmptyList(0, free); struct List *headers = constructEmptyList(0, free); fastaRead(fasta, seqs, seqLengths, headers); for (int64_t i = 0; i < seqs->length; i++) { char *fullHeader = headers->list[i]; stList *headerTokens = stString_splitByString(fullHeader, " "); char *usableHeader = stString_copy(stList_get(headerTokens, 0)); stHash_insert(headerToData, usableHeader, seqs->list[i]); stList_destruct(headerTokens); } destructList(seqs); destructList(seqLengths); destructList(headers); return headerToData; }
/*
 * Indexes a list of edges by their endpoints.
 *
 * For every edge tuple, a key is built by constructEdge() from the values at
 * tuple positions 0 and 1, and mapped to the edge itself. The hash destructs its
 * keys with stIntTuple_destruct; the edges (values) are not destructed by the hash.
 */
static stHash *putEdgesInHash(stList *edges) {
    stHash *intsToEdgesHash = stHash_construct3(
            (uint64_t (*)(const void *))stIntTuple_hashKey,
            (int (*)(const void *, const void *))stIntTuple_equalsFn,
            (void (*)(void *))stIntTuple_destruct,
            NULL);

    for (int64_t edgeIndex = 0; edgeIndex < stList_length(edges); edgeIndex++) {
        stIntTuple *edge = stList_get(edges, edgeIndex);
        stIntTuple *endpointsKey = constructEdge(stIntTuple_get(edge, 0), stIntTuple_get(edge, 1));
        stHash_insert(intsToEdgesHash, endpointsKey, edge);
    }
    return intsToEdgesHash;
}
/*
 * Maps every path in maximalHaplotypePaths to a newly constructed single-value
 * stIntTuple holding contigPathLength() of that path.
 *
 * The hash is built with stHash_construct(), so keys are the path pointers themselves.
 */
stHash *buildContigPathToContigPathLengthHash(stList *maximalHaplotypePaths) {
    stHash *pathToLength = stHash_construct();
    for (int64_t pathIndex = 0; pathIndex < stList_length(maximalHaplotypePaths); pathIndex++) {
        stList *path = stList_get(maximalHaplotypePaths, pathIndex);
        stHash_insert(pathToLength, path, stIntTuple_construct1(contigPathLength(path)));
    }
    return pathToLength;
}
/*
 * Builds an adjacency-list structure for the sequences: every (sequence, position)
 * pair maps to the column (a sorted set of such pairs) it is aligned in.
 *
 * Each position in [0, MAX_SEQUENCE_SIZE) of each of the sequenceNumber sequences
 * starts in its own column. Every tuple in `pairs` then merges the column of
 * (values 0, 1) with the column of (values 2, 3).
 */
static stHash *buildAdjacencyList(stList *pairs, int64_t sequenceNumber) {
    stHash *positionToColumn = stHash_construct3(
            (uint64_t (*)(const void *))stIntTuple_hashKey,
            (int (*)(const void *, const void *))stIntTuple_equalsFn,
            (void (*)(void *))stIntTuple_destruct,
            NULL);

    // One singleton column per sequence position.
    for (int64_t seq = 0; seq < sequenceNumber; seq++) {
        for (int64_t position = 0; position < MAX_SEQUENCE_SIZE; position++) {
            stIntTuple *seqPos = stIntTuple_construct2(seq, position);
            stSortedSet *singleton = stSortedSet_construct3(
                    (int (*)(const void *, const void *))stIntTuple_cmpFn, NULL);
            stSortedSet_insert(singleton, seqPos);
            stHash_insert(positionToColumn, seqPos, singleton);
        }
    }

    // Apply the aligned pairs, merging columns as required.
    stListIterator *pairIt = stList_getIterator(pairs);
    for (stIntTuple *pair = stList_getNext(pairIt); pair != NULL; pair = stList_getNext(pairIt)) {
        // Temporary keys for looking up the two columns.
        stIntTuple *leftKey = stIntTuple_construct2(stIntTuple_get(pair, 0), stIntTuple_get(pair, 1));
        stIntTuple *rightKey = stIntTuple_construct2(stIntTuple_get(pair, 2), stIntTuple_get(pair, 3));

        stSortedSet *target = stHash_search(positionToColumn, leftKey);
        assert(target != NULL);
        stSortedSet *source = stHash_search(positionToColumn, rightKey);
        assert(source != NULL);

        if (target != source) {
            // Move every member of the source column into the target column.
            stSortedSetIterator *memberIt = stSortedSet_getIterator(source);
            for (stIntTuple *member = stSortedSet_getNext(memberIt); member != NULL;
                    member = stSortedSet_getNext(memberIt)) {
                assert(stSortedSet_search(target, member) == NULL);
                stSortedSet_insert(target, member);
                assert(stHash_search(positionToColumn, member) == source);
                stHash_insert(positionToColumn, member, target);
                assert(stHash_search(positionToColumn, member) == target);
            }
            stSortedSet_destructIterator(memberIt);
            stSortedSet_destruct(source);
        }

        // The lookup keys are no longer needed.
        stIntTuple_destruct(leftKey);
        stIntTuple_destruct(rightKey);
    }
    stList_destructIterator(pairIt);
    return positionToColumn;
}
/*
 * Builds a hashtable that associates each top node with the list of
 * lifted edges leaving it.
 *
 * For every cap in the flower that has an adjacency, the cap and its adjacent cap
 * are lifted to their top caps. A LiftedEdge from the cap's top cap to the adjacent
 * cap's top cap is recorded, remembering the originating cap as bottomNode.
 * Caps without a top cap are skipped.
 */
static stHash *buildFaces_computeLiftedEdges(Flower * flower) {
    stHash *liftedEdgesTable = stHash_construct3(buildFaces_hashfunction,
            buildFaces_key_eq_fn, NULL, buildFaces_destructValue);
    Flower_CapIterator *capIt = flower_getCapIterator(flower);

    // Each cap is a potential bottom node.
    Cap *cap;
    while ((cap = flower_getNextCap(capIt)) != NULL) {
        // Unconnected caps contribute nothing.
        Cap *adjacency = cap_getAdjacency(cap);
        if (adjacency == NULL) {
            continue;
        }

        // Lift both ends of the adjacency.
        Cap *attachedAncestor = cap_getTopCap(cap);
        Cap *adjacencyAncestor = cap_getTopCap(cap_getPositiveOrientation(adjacency));

#ifndef NDEBUG
        assert((attachedAncestor && adjacencyAncestor) || (!attachedAncestor && !adjacencyAncestor));
#endif

        // Root node: nothing to lift to.
        if (attachedAncestor == NULL) {
            continue;
        }

        LiftedEdge *liftedEdge = st_malloc(sizeof(LiftedEdge));
        liftedEdge->destination = adjacencyAncestor;
        liftedEdge->bottomNode = cap;

#ifndef NDEBUG
        // A lifted self loop is treated as fatal in debug builds.
        if (adjacencyAncestor == attachedAncestor) {
            abort();
        }
#endif

        // Find the list for this top node, creating and registering it on first use.
        stList *edgesAtTop = stHash_search(liftedEdgesTable, attachedAncestor);
        if (edgesAtTop == NULL) {
            edgesAtTop = stList_construct3(2, buildFaces_stList_destructElem);
            stHash_insert(liftedEdgesTable, attachedAncestor, edgesAtTop);
        }
        stList_append(edgesAtTop, liftedEdge);
    }

    flower_destructCapIterator(capIt);
    return liftedEdgesTable;
}
/*
 * Bottom-up pass over a list of flowers for the given reference event.
 *
 * A reference thread between the two caps in a flower f may be broken into two in
 * the children of f. So for each flower f, the attached stub ends present in the
 * children of f but not in f are first identified and copied into f, reattaching
 * the reference caps as needed. Per-flower phylogenetic trees are then built for
 * base calling, and the recursive threads are built. At the top level (isTop) the
 * resulting thread strings are additionally turned into meta sequences and
 * coordinates are set.
 */
void bottomUp(stList *flowers, stKVDatabase *sequenceDatabase, Name referenceEventName, bool isTop,
        stMatrix *(*generateSubstitutionMatrix)(double)) {
    stList *caps = getCaps(flowers, referenceEventName);

    // Iterate from the end, because entries are added to caps during the loop.
    for (int64_t capIndex = stList_length(caps) - 1; capIndex >= 0; capIndex--) {
        setAdjacencyLengthsAndRecoverNewCapsAndBrokenAdjacencies(stList_get(caps, capIndex), caps);
    }
    for (int64_t flowerIndex = 0; flowerIndex < stList_length(flowers); flowerIndex++) {
        recoverBrokenAdjacencies(stList_get(flowers, flowerIndex), caps, referenceEventName);
    }

    // Build the phylogenetic event trees used for base calling, one per flower.
    segmentWriteFn_flowerToPhylogeneticTreeHash = stHash_construct2(NULL,
            (void (*)(void *))cleanupPhylogeneticTree);
    for (int64_t flowerIndex = 0; flowerIndex < stList_length(flowers); flowerIndex++) {
        Flower *flower = stList_get(flowers, flowerIndex);
        Event *refEvent = eventTree_getEvent(flower_getEventTree(flower), referenceEventName);
        assert(refEvent != NULL);
        stHash_insert(segmentWriteFn_flowerToPhylogeneticTreeHash, flower,
                getPhylogeneticTreeRootedAtGivenEvent(refEvent, generateSubstitutionMatrix));
    }

    if (!isTop) {
        buildRecursiveThreads(sequenceDatabase, caps, segmentWriteFn, terminalAdjacencyWriteFn);
    } else {
        stList *threadStrings = buildRecursiveThreadsInList(sequenceDatabase, caps, segmentWriteFn,
                terminalAdjacencyWriteFn);
        assert(stList_length(threadStrings) == stList_length(caps));

        // Running indices used to name the sequences: non-trivial ones count up
        // from zero, trivial ones count up from the number of threads.
        int64_t nonTrivialSeqIndex = 0;
        int64_t trivialSeqIndex = stList_length(threadStrings);

        for (int64_t threadIndex = 0; threadIndex < stList_length(threadStrings); threadIndex++) {
            Cap *cap = stList_get(caps, threadIndex);
            assert(cap_getStrand(cap));
            assert(!cap_getSide(cap));
            Flower *flower = end_getFlower(cap_getEnd(cap));

            char *threadString = stList_get(threadStrings, threadIndex);
            bool trivialString = isTrivialString(&threadString); // This alters the original string

            int64_t sequenceIndex;
            if (trivialString) {
                sequenceIndex = trivialSeqIndex++;
            } else {
                sequenceIndex = nonTrivialSeqIndex++;
            }
            MetaSequence *metaSequence = addMetaSequence(flower, cap, sequenceIndex, threadString,
                    trivialString);
            free(threadString);

            int64_t endCoordinate = setCoordinates(flower, metaSequence, cap,
                    metaSequence_getStart(metaSequence) - 1);
            (void) endCoordinate; // only read by the assert below
            assert(endCoordinate == metaSequence_getLength(metaSequence) + metaSequence_getStart(metaSequence));
        }

        // The strings were freed one by one above, so drop the element destructor first.
        stList_setDestructor(threadStrings, NULL);
        stList_destruct(threadStrings);
    }

    stHash_destruct(segmentWriteFn_flowerToPhylogeneticTreeHash);
    stList_destruct(caps);
}
// Remove and free an edge properly. static void removeEdgeFromAdjList(stNaiveConnectivity *connectivity, void *node, struct adjacency *adj) { invalidateCache(connectivity); if (adj->next != NULL) { adj->next->prev = adj->prev; } if (adj->prev != NULL) { adj->prev->next = adj->next; } else { stHash_remove(connectivity->nodesToAdjList, node); stHash_insert(connectivity->nodesToAdjList, node, adj->next); } free(adj); }
/*
 * Maps every segment appearing in any of the given paths to the path containing it.
 *
 * Debug builds assert that each path is non-empty and that neither a segment nor
 * its reverse has already been registered.
 */
stHash *buildSegmentToContigPathHash(stList *maximalHaplotypePaths) {
    stHash *segmentToPath = stHash_construct();
    for (int64_t pathIndex = 0; pathIndex < stList_length(maximalHaplotypePaths); pathIndex++) {
        stList *path = stList_get(maximalHaplotypePaths, pathIndex);
        assert(stList_length(path) > 0);
        for (int64_t segmentIndex = 0; segmentIndex < stList_length(path); segmentIndex++) {
            Segment *segment = stList_get(path, segmentIndex);
            assert(stHash_search(segmentToPath, segment) == NULL);
            assert(stHash_search(segmentToPath, segment_getReverse(segment)) == NULL);
            stHash_insert(segmentToPath, segment, path);
        }
    }
    return segmentToPath;
}
/*
 * Recursive function which fills a given list with the connected nodes
 * within a module, and fills in their lifted edges in the same pass.
 *
 * cap              - node to visit; it is normalised to its positive orientation.
 * list             - accumulates visited caps, and doubles as the visited set.
 * liftedEdgesTable - receives, for each visited cap with at least one lifted edge,
 *                    the list of those edges.
 */
static void buildFaces_fillTopNodeList2(Cap * cap, stList *list, stHash *liftedEdgesTable) {
    int64_t index;

    // Orientation check
    cap = cap_getPositiveOrientation(cap);

    // Limit of recursion: this cap has already been visited. This check comes
    // before any allocation so the early return does not leak a list.
    if (stList_contains(list, cap))
        return;

    // Actual filling
    st_logInfo("Adding cap %p to face\n", (void *) cap);
    stList_append(list, cap);

    // Compute the lifted edges contributed by each child
    stList *liftedEdges = stList_construct3(2, buildFaces_stList_destructElem);
    for (index = 0; index < cap_getChildNumber(cap); index++)
        buildFaces_computeLiftedEdgesAtTopNode(cap_getChild(cap, index), liftedEdges);

    // If the list is empty it is not recorded, so release it here
    if (stList_length(liftedEdges) == 0)
        stList_destruct(liftedEdges);

    // Otherwise hand it to the table, then recurse through the lifted edges
    else {
        stHash_insert(liftedEdgesTable, cap, liftedEdges);
        for (index = 0; index < stList_length(liftedEdges); index++)
            buildFaces_fillTopNodeList2(
                    ((LiftedEdge *) stList_get(liftedEdges, index))->destination,
                    list, liftedEdgesTable);
    }

    // Recursion through adjacency
    if (cap_getAdjacency(cap))
        buildFaces_fillTopNodeList2(cap_getAdjacency(cap), list, liftedEdgesTable);
}
/*
 * Registers a node with an empty (NULL) adjacency list.
 *
 * The connectivity cache is invalidated first. Debug builds assert that no
 * adjacency list is currently recorded for the node.
 */
void stNaiveConnectivity_addNode(stNaiveConnectivity *connectivity, void *node) {
    invalidateCache(connectivity);

    stHash *adjLists = connectivity->nodesToAdjList;
    assert(stHash_search(adjLists, node) == NULL);
    stHash_insert(adjLists, node, NULL);
}
/*
 * Inserts `key` into the set. Any entry that stSet_search finds for the key is
 * removed first, so the set does not end up with duplicate keys.
 * The backing hash stores the key as both key and value.
 */
void stSet_insert(stSet *set, void *key) {
    void *existing = stSet_search(set, key);
    if (existing != NULL) {
        stSet_remove(set, key);
    }
    stHash_insert(set->hash, key, key);
}
static stHash *getScaffoldPathsP(stList *haplotypePaths, stHash *haplotypePathToScaffoldPathHash, stList *haplotypeEventStrings, stList *contaminationEventStrings, CapCodeParameters *capCodeParameters) { stHash *haplotypeToMaximalHaplotypeLengthHash = buildContigPathToContigPathLengthHash(haplotypePaths); stHash *segmentToMaximalHaplotypePathHash = buildSegmentToContigPathHash(haplotypePaths); for (int64_t i = 0; i < stList_length(haplotypePaths); i++) { stSortedSet *bucket = stSortedSet_construct(); stHash_insert(haplotypePathToScaffoldPathHash, stList_get(haplotypePaths, i), bucket); stSortedSet_insert(bucket, stList_get(haplotypePaths, i)); } for (int64_t i = 0; i < stList_length(haplotypePaths); i++) { stList *haplotypePath = stList_get(haplotypePaths, i); assert(stList_length(haplotypePath) > 0); Segment *_5Segment = stList_get(haplotypePath, 0); if (!segment_getStrand(_5Segment)) { _5Segment = segment_getReverse(stList_get(haplotypePath, stList_length(haplotypePath) - 1)); } assert(segment_getStrand(_5Segment)); if (getAdjacentCapsSegment(segment_get5Cap(_5Segment)) != NULL) { assert(!trueAdjacency(segment_get5Cap(_5Segment), haplotypeEventStrings)); } int64_t insertLength; int64_t deleteLength; Cap *otherCap; enum CapCode _5CapCode = getCapCode(segment_get5Cap(_5Segment), &otherCap, haplotypeEventStrings, contaminationEventStrings, &insertLength, &deleteLength, capCodeParameters); if (_5CapCode == SCAFFOLD_GAP || _5CapCode == AMBIGUITY_GAP) { assert(stHash_search(haplotypeToMaximalHaplotypeLengthHash, haplotypePath) != NULL); int64_t j = stIntTuple_get(stHash_search(haplotypeToMaximalHaplotypeLengthHash, haplotypePath), 0); Segment *adjacentSegment = getAdjacentCapsSegment(segment_get5Cap(_5Segment)); assert(adjacentSegment != NULL); while (!hasCapInEvents(cap_getEnd(segment_get5Cap(adjacentSegment)), haplotypeEventStrings)) { //is not a haplotype end adjacentSegment = getAdjacentCapsSegment(segment_get5Cap(adjacentSegment)); assert(adjacentSegment != NULL); 
} assert(adjacentSegment != NULL); assert(hasCapInEvents(cap_getEnd(segment_get5Cap(adjacentSegment)), haplotypeEventStrings)); //is a haplotype end stList *adjacentHaplotypePath = stHash_search(segmentToMaximalHaplotypePathHash, adjacentSegment); if (adjacentHaplotypePath == NULL) { adjacentHaplotypePath = stHash_search(segmentToMaximalHaplotypePathHash, segment_getReverse( adjacentSegment)); } assert(adjacentHaplotypePath != NULL); assert(adjacentHaplotypePath != haplotypePath); assert(stHash_search(haplotypeToMaximalHaplotypeLengthHash, adjacentHaplotypePath) != NULL); int64_t k = stIntTuple_get(stHash_search(haplotypeToMaximalHaplotypeLengthHash, adjacentHaplotypePath), 0); //Now merge the buckets and make new int tuples.. stSortedSet *bucket1 = stHash_search(haplotypePathToScaffoldPathHash, haplotypePath); stSortedSet *bucket2 = stHash_search(haplotypePathToScaffoldPathHash, adjacentHaplotypePath); assert(bucket1 != NULL); assert(bucket2 != NULL); assert(bucket1 != bucket2); stSortedSet *bucket3 = stSortedSet_getUnion(bucket1, bucket2); stSortedSetIterator *bucketIt = stSortedSet_getIterator(bucket3); stList *l; while ((l = stSortedSet_getNext(bucketIt)) != NULL) { //Do the bucket first assert(stHash_search(haplotypePathToScaffoldPathHash, l) == bucket1 || stHash_search(haplotypePathToScaffoldPathHash, l) == bucket2); stHash_remove(haplotypePathToScaffoldPathHash, l); stHash_insert(haplotypePathToScaffoldPathHash, l, bucket3); //Now the length stIntTuple *m = stHash_remove(haplotypeToMaximalHaplotypeLengthHash, l); assert(m != NULL); assert(stIntTuple_get(m, 0) == j || stIntTuple_get(m, 0) == k); stHash_insert(haplotypeToMaximalHaplotypeLengthHash, l, stIntTuple_construct1( j + k)); stIntTuple_destruct(m); } assert(stHash_search(haplotypePathToScaffoldPathHash, haplotypePath) == bucket3); assert(stHash_search(haplotypePathToScaffoldPathHash, adjacentHaplotypePath) == bucket3); stSortedSet_destructIterator(bucketIt); } } 
stHash_destruct(segmentToMaximalHaplotypePathHash); return haplotypeToMaximalHaplotypeLengthHash; }
/*
 * Makes an alignment of the adjacency sequences attached to the caps of `end`.
 *
 * Steps:
 *   1. Collect one AdjacencySequence / SeqFrag per cap instance, counting how many
 *      instances share each opposite end.
 *   2. Run makeAlignment() over the fragments.
 *   3. For every sequence, count how many chosen pairwise alignments involve it,
 *      split by whether the partner shares the same right end.
 *   4. Derive per-sequence reweighting factors from those counts.
 *   5. Convert each aligned pair into an AlignedPair (plus its reverse), applying
 *      the reweighting factors to the score.
 *
 * Returns a sorted set of AlignedPair ordered by alignedPair_cmpFn; the set
 * destructs its elements with alignedPair_destruct.
 */
stSortedSet *makeEndAlignment(StateMachine *sM, End *end, int64_t spanningTrees, int64_t maxSequenceLength,
        bool useProgressiveMerging, float gapGamma,
        PairwiseAlignmentParameters *pairwiseAlignmentBandingParameters) {
    // Step 1: gather the adjacency sequences to be aligned.
    stList *sequences = stList_construct3(0, (void (*)(void *))adjacencySequence_destruct);
    stList *seqFrags = stList_construct3(0, (void (*)(void *))seqFrag_destruct);
    stHash *endInstanceNumbers = stHash_construct2(NULL, free);

    End_InstanceIterator *instanceIt = end_getInstanceIterator(end);
    Cap *cap;
    while ((cap = end_getNext(instanceIt)) != NULL) {
        if (cap_getSide(cap)) {
            cap = cap_getReverse(cap);
        }
        AdjacencySequence *adjacencySequence = adjacencySequence_construct(cap, maxSequenceLength);
        stList_append(sequences, adjacencySequence);

        assert(cap_getAdjacency(cap) != NULL);
        End *otherEnd = end_getPositiveOrientation(cap_getEnd(cap_getAdjacency(cap)));
        stList_append(seqFrags, seqFrag_construct(adjacencySequence->string, 0, end_getName(otherEnd)));

        // Increase the count of seqFrags sharing this opposite end.
        int64_t *instanceCount = stHash_search(endInstanceNumbers, otherEnd);
        if (instanceCount == NULL) {
            instanceCount = st_calloc(1, sizeof(int64_t));
            assert(*instanceCount == 0);
            stHash_insert(endInstanceNumbers, otherEnd, instanceCount);
        }
        (*instanceCount)++;
    }
    end_destructInstanceIterator(instanceIt);

    // Step 2: get the alignment.
    MultipleAlignment *mA = makeAlignment(sM, seqFrags, spanningTrees, 100000000, useProgressiveMerging,
            gapGamma, pairwiseAlignmentBandingParameters);

    // Step 3: per-sequence counts of chosen pairwise alignments, distinguishing
    // alignments between sequences that share a common end from the others.
    int64_t *nonCommonEndCounts = st_calloc(stList_length(seqFrags), sizeof(int64_t));
    int64_t *commonEndCounts = st_calloc(stList_length(seqFrags), sizeof(int64_t));
    for (int64_t alignmentIndex = 0; alignmentIndex < stList_length(mA->chosenPairwiseAlignments);
            alignmentIndex++) {
        stIntTuple *pairwiseAlignment = stList_get(mA->chosenPairwiseAlignments, alignmentIndex);
        int64_t seq1 = stIntTuple_get(pairwiseAlignment, 1);
        int64_t seq2 = stIntTuple_get(pairwiseAlignment, 2);
        assert(seq1 != seq2);
        SeqFrag *seqFrag1 = stList_get(seqFrags, seq1);
        SeqFrag *seqFrag2 = stList_get(seqFrags, seq2);

        int64_t *counts;
        if (seqFrag1->rightEndId == seqFrag2->rightEndId) {
            counts = commonEndCounts;
        } else {
            counts = nonCommonEndCounts;
        }
        counts[seq1]++;
        counts[seq2]++;
    }

    // Step 4: score adjustments. INT64_MIN marks "no alignments of this kind".
    double *scoreAdjustmentsNonCommonEnds = st_malloc(stList_length(seqFrags) * sizeof(double));
    double *scoreAdjustmentsCommonEnds = st_malloc(stList_length(seqFrags) * sizeof(double));
    for (int64_t fragIndex = 0; fragIndex < stList_length(seqFrags); fragIndex++) {
        SeqFrag *seqFrag = stList_get(seqFrags, fragIndex);
        End *otherEnd = flower_getEnd(end_getFlower(end), seqFrag->rightEndId);
        assert(otherEnd != NULL);
        assert(stHash_search(endInstanceNumbers, otherEnd) != NULL);

        int64_t commonInstanceNumber = *(int64_t *)stHash_search(endInstanceNumbers, otherEnd);
        int64_t nonCommonInstanceNumber = stList_length(seqFrags) - commonInstanceNumber;
        assert(commonInstanceNumber > 0 && nonCommonInstanceNumber >= 0);
        assert(nonCommonEndCounts[fragIndex] <= nonCommonInstanceNumber);
        assert(nonCommonEndCounts[fragIndex] >= 0);
        assert(commonEndCounts[fragIndex] < commonInstanceNumber);
        assert(commonEndCounts[fragIndex] >= 0);

        if (nonCommonEndCounts[fragIndex] > 0) {
            scoreAdjustmentsNonCommonEnds[fragIndex] =
                    ((double)nonCommonInstanceNumber) / nonCommonEndCounts[fragIndex];
            assert(scoreAdjustmentsNonCommonEnds[fragIndex] >= 1.0);
            assert(scoreAdjustmentsNonCommonEnds[fragIndex] <= nonCommonInstanceNumber);
        } else {
            scoreAdjustmentsNonCommonEnds[fragIndex] = INT64_MIN;
        }

        if (commonEndCounts[fragIndex] > 0) {
            scoreAdjustmentsCommonEnds[fragIndex] =
                    ((double)commonInstanceNumber - 1) / commonEndCounts[fragIndex];
            assert(scoreAdjustmentsCommonEnds[fragIndex] >= 1.0);
            assert(scoreAdjustmentsCommonEnds[fragIndex] <= commonInstanceNumber - 1);
        } else {
            scoreAdjustmentsCommonEnds[fragIndex] = INT64_MIN;
        }
    }

    // Step 5: convert the alignment pairs to an alignment of the caps.
    stSortedSet *sortedAlignment = stSortedSet_construct3(
            (int (*)(const void *, const void *))alignedPair_cmpFn,
            (void (*)(void *))alignedPair_destruct);
    while (stList_length(mA->alignedPairs) > 0) {
        stIntTuple *pairTuple = stList_pop(mA->alignedPairs);
        assert(stIntTuple_length(pairTuple) == 5);

        int64_t seqIndex1 = stIntTuple_get(pairTuple, 1);
        int64_t seqIndex2 = stIntTuple_get(pairTuple, 3);
        AdjacencySequence *first = stList_get(sequences, seqIndex1);
        AdjacencySequence *second = stList_get(sequences, seqIndex2);
        assert(first != second);
        int64_t offset1 = stIntTuple_get(pairTuple, 2);
        int64_t offset2 = stIntTuple_get(pairTuple, 4);

        int64_t score = stIntTuple_get(pairTuple, 0);
        if (score <= 0) { // Happens when indel probs are included
            score = 1; // This is the minimum
        }
        assert(score > 0 && score <= PAIR_ALIGNMENT_PROB_1);

        SeqFrag *seqFrag1 = stList_get(seqFrags, seqIndex1);
        SeqFrag *seqFrag2 = stList_get(seqFrags, seqIndex2);
        assert(seqFrag1 != seqFrag2);

        double *scoreAdjustments;
        if (seqFrag1->rightEndId == seqFrag2->rightEndId) {
            scoreAdjustments = scoreAdjustmentsCommonEnds;
        } else {
            scoreAdjustments = scoreAdjustmentsNonCommonEnds;
        }
        assert(scoreAdjustments[seqIndex1] != INT64_MIN);
        assert(scoreAdjustments[seqIndex2] != INT64_MIN);

        // Offsets are applied in the direction of each sequence's strand;
        // the reweighting of the score happens here.
        AlignedPair *alignedPair = alignedPair_construct(
                first->subsequenceIdentifier,
                first->start + (first->strand ? offset1 : -offset1),
                first->strand,
                second->subsequenceIdentifier,
                second->start + (second->strand ? offset2 : -offset2),
                second->strand,
                score*scoreAdjustments[seqIndex1],
                score*scoreAdjustments[seqIndex2]);
        assert(stSortedSet_search(sortedAlignment, alignedPair) == NULL);
        assert(stSortedSet_search(sortedAlignment, alignedPair->reverse) == NULL);
        stSortedSet_insert(sortedAlignment, alignedPair);
        stSortedSet_insert(sortedAlignment, alignedPair->reverse);

        stIntTuple_destruct(pairTuple);
    }

    // Cleanup
    stList_destruct(seqFrags);
    stList_destruct(sequences);
    free(nonCommonEndCounts);
    free(commonEndCounts);
    free(scoreAdjustmentsNonCommonEnds);
    free(scoreAdjustmentsCommonEnds);
    multipleAlignment_destruct(mA);
    stHash_destruct(endInstanceNumbers);
    return sortedAlignment;
}
/*
 * Concatenation with a sequence breakpoint due to *strand* alone.
 *
 * Note that name3 is well within the interstitial boundary; the two blocks
 * essentially look like >>>>>>>>>>>>> <<<<< (strand differences). The appended
 * block has name3.chr1 on the '-' strand, and the expected name3 row joins the
 * two pieces with a run of ten 'N' characters.
 */
static void test_addBlockToHash_4(CuTest *testCase) {
    options_t *options = options_construct();
    options->breakpointPenalty = 10;
    options->interstitialSequence = 5;

    stList *observedList = stList_construct3(0, free);
    stList *expectedList = stList_construct3(0, free);

    // State before the new block is added.
    stHash *observedHash = createBlockHashFromString(
            "a score=0\n"
            "s reference.chr0 0 13 + 158545518 gcagctgaaaaca\n"
            "s name.chr1 0 10 + 100 ATGT---ATGCCG\n"
            "s name2.chr1 0 10 + 100 ATGT---ATGCCG\n"
            "s name3.chr1 0 13 + 100 GCAGCTGAAAACA\n",
            observedList);

    // The block to append.
    mafBlock_t *block = maf_newMafBlockListFromString(
            "a score=0 test\n"
            "s reference.chr0 13 5 + 158545518 ACGTA\n"
            "s name.chr1 12 5 + 100 gtcGG\n"
            "s name2.chr1 10 5 + 100 ATGTg\n"
            "s name3.chr1 82 5 - 100 GGGGG\n",
            3);

    // State expected afterwards.
    stHash *expectedHash = createBlockHashFromString(
            "a score=0\n"
            "s reference.chr0 0 18 + 158545518 gcagctgaaaaca------------ACGTA\n"
            "s name.chr1 0 17 + 100 ATGT---ATGCCGac----------gtcGG\n"
            "s name2.chr1 0 15 + 100 ATGT---ATGCCG------------ATGTg\n"
            "s name3.chr1 0 28 + 100 GCAGCTGAAAACA--NNNNNNNNNNGGGGG\n",
            expectedList);

    // Patch the bookkeeping fields of the expected name3 row by hand.
    row_t *name3Row = stHash_search(expectedHash, "name3");
    name3Row->prevRightPos = 86;
    name3Row->strand = '*';
    name3Row->prevStrand = '-';

    // Underlying sequences, one per name in the blocks.
    stHash *seqHash = createSeqHashFromString("name.chr1",
            "ATGTATGCCGacgtc"
            "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG"
            "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG");
    mtfseq_t *referenceSeq = newMtfseqFromString(
            "gcagctgaaaacaACGTA"
            "tttttttttttttttttttttttttttttttt"
            "tttttttttttttttttttttttttttttttttttttttttttttttttt");
    stHash_insert(seqHash, stString_copy("reference.chr0"), referenceSeq);
    mtfseq_t *name2Seq = newMtfseqFromString(
            "ATGTATGCCGATGTg"
            "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"
            "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC");
    stHash_insert(seqHash, stString_copy("name2.chr1"), name2Seq);
    mtfseq_t *name3Seq = newMtfseqFromString(
            "GCAGCTGAAAACACCCCCgggggggggggggggggggggggggggggggg"
            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa");
    stHash_insert(seqHash, stString_copy("name3.chr1"), name3Seq);

    addMafBlockToRowHash(observedHash, seqHash, observedList, block, options);

    CuAssertTrue(testCase, hashesAreEqual(observedHash, expectedHash));
    CuAssertTrue(testCase, listsAreEqual(observedList, expectedList));

    // clean up
    stHash_destruct(observedHash);
    stHash_destruct(expectedHash);
    stHash_destruct(seqHash);
    stList_destruct(observedList);
    stList_destruct(expectedList);
    maf_destroyMafBlockList(block);
    destroyOptions(options);
}