int main(int argc, char **argv) { options_t *options = options_construct(); stHash *sequenceHash = NULL; // keyed on fasta headers, valued with mtfseq_t pointers stHash *alignmentHash = stHash_construct3(stHash_stringKey, stHash_stringEqualKey, free, destroyRow); // keyed on species names, valued with row_t pointers stList *rowOrder = stList_construct3(0, free); // when adding keys to alignmentHash, append to this list parseOptions(argc, argv, options); // read fastas, populate sequenceHash de_verbose("Creating sequence hash.\n"); sequenceHash = createSequenceHash(options->seqs); mafFileApi_t *mfapi = maf_newMfa(options->maf, "r"); de_verbose("Creating alignment hash.\n"); buildAlignmentHash(mfapi, alignmentHash, sequenceHash, rowOrder, options); if (options->outMfa != NULL) { // fasta output de_verbose("Writing fasta output.\n"); writeFastaOut(alignmentHash, rowOrder, options); } if (options->outMaf != NULL) { // maf output de_verbose("Writing maf output.\n"); writeMafOut(alignmentHash, rowOrder, options); } // cleanup maf_destroyMfa(mfapi); stHash_destruct(alignmentHash); stHash_destruct(sequenceHash); stList_destruct(rowOrder); destroyOptions(options); return(EXIT_SUCCESS); }
/*
 * addMafBlockToRowHash: concatenation with 2 bases of interstitial sequence
 * and a sequence length breakpoint (the name3 row).
 */
static void test_addBlockToHash_3(CuTest *testCase) {
    options_t *options = options_construct();
    options->breakpointPenalty = 10;
    options->interstitialSequence = 5;
    stList *observedList = stList_construct3(0, free);
    stList *expectedList = stList_construct3(0, free);

    // Rows already accumulated before the new block arrives.
    stHash *observedHash = createBlockHashFromString(
        "a score=0\n"
        "s reference.chr0 0 13 + 158545518 gcagctgaaaaca\n"
        "s name.chr1 0 10 + 100 ATGT---ATGCCG\n"
        "s name2.chr1 0 10 + 100 ATGT---ATGCCG\n"
        "s name3.chr1 0 13 + 100 GCAGCTGAAAACA\n",
        observedList);

    // The block to be appended.
    mafBlock_t *mb = maf_newMafBlockListFromString(
        "a score=0 test\n"
        "s reference.chr0 13 5 + 158545518 ACGTA\n"
        "s name.chr1 12 5 + 100 gtcGG\n"
        "s name2.chr1 10 5 + 100 ATGTg\n"
        "s name3.chr1 50 5 + 100 CCCCC\n",
        3);

    // Expected rows after the append.
    stHash *expectedHash = createBlockHashFromString(
        "a score=0\n"
        "s reference.chr0 0 18 + 158545518 gcagctgaaaaca------------ACGTA\n"
        "s name.chr1 0 17 + 100 ATGT---ATGCCGac----------gtcGG\n"
        "s name2.chr1 0 15 + 100 ATGT---ATGCCG------------ATGTg\n"
        "s name3 0 28 + 28 GCAGCTGAAAACA--NNNNNNNNNNCCCCC\n",
        expectedList);
    // Patch the bookkeeping fields of the expected name3 row by hand.
    row_t *name3Row = stHash_search(expectedHash, "name3");
    name3Row->prevRightPos = 54;
    free(name3Row->prevName);
    name3Row->prevName = stString_copy("name3.chr1");
    name3Row->multipleNames = true;

    // Source sequences, keyed on sequence name.
    stHash *seqHash = createSeqHashFromString("name.chr1",
        "ATGTATGCCGacgtc"
        "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG"
        "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG");
    mtfseq_t *referenceSeq = newMtfseqFromString(
        "gcagctgaaaacaACGTA"
        "tttttttttttttttttttttttttttttttt"
        "tttttttttttttttttttttttttttttttttttttttttttttttttt");
    stHash_insert(seqHash, stString_copy("reference.chr0"), referenceSeq);
    mtfseq_t *name2Seq = newMtfseqFromString(
        "ATGTATGCCGATGTg"
        "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"
        "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC");
    stHash_insert(seqHash, stString_copy("name2.chr1"), name2Seq);
    mtfseq_t *name3Seq = newMtfseqFromString(
        "GCAGCTGAAAACAggggggggggggggggggggggggggggggggggggg"
        "CCCCCaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa");
    stHash_insert(seqHash, stString_copy("name3.chr1"), name3Seq);

    addMafBlockToRowHash(observedHash, seqHash, observedList, mb, options);
    CuAssertTrue(testCase, hashesAreEqual(observedHash, expectedHash));
    CuAssertTrue(testCase, listsAreEqual(observedList, expectedList));

    // clean up
    stHash_destruct(observedHash);
    stHash_destruct(expectedHash);
    stHash_destruct(seqHash);
    stList_destruct(observedList);
    stList_destruct(expectedList);
    maf_destroyMafBlockList(mb);
    destroyOptions(options);
}
/*
 * Picks one adjacency switch for the given (non-empty) list of cycles.
 *
 * The search is seeded with the result of getBest2EdgeAdjacencySwitch and
 * then, for every edge in the joined cycles, the candidate returned by
 * getBest4EdgeAdjacencySwitch2 is folded in with
 * getMinimumCostAdjacencySwitch. The result is asserted to be non-NULL.
 */
static AdjacencySwitch *getBestAdjacencySwitch(stList *cycles,
        stList *nonZeroWeightAdjacencyEdges, stSortedSet *allAdjacencyEdges) {
    assert(stList_length(cycles) > 0);

    stHash *nonZeroEdgesByNode = getNodesToEdgesHash(nonZeroWeightAdjacencyEdges);
    stList *componentEdges = stList_join(cycles);
    assert(stList_length(componentEdges) > 0);
    stHash *currentEdgesByNode = getNodesToEdgesHash(componentEdges);

    /*
     * Adjacency edges that bridge between (have a node in two) components.
     */
    stList *bridgingEdges = getEdgesThatBridgeComponents(cycles, nonZeroEdgesByNode);
    stHash *bridgingEdgesByNode = getNodesToEdgesHash(bridgingEdges);

    /*
     * Start from the best 2 edge switch.
     */
    AdjacencySwitch *best = getBest2EdgeAdjacencySwitch(cycles, allAdjacencyEdges);

    /*
     * Then consider the 3 or 4 edge switches, one candidate per component edge.
     */
    for (int64_t edgeIndex = 0; edgeIndex < stList_length(componentEdges); edgeIndex++) {
        AdjacencySwitch *candidate = getBest4EdgeAdjacencySwitch2(
                stList_get(componentEdges, edgeIndex), allAdjacencyEdges,
                currentEdgesByNode, bridgingEdgesByNode);
        best = getMinimumCostAdjacencySwitch(best, candidate);
    }
    assert(best != NULL);

    /*
     * Cleanup
     */
    stList_destruct(componentEdges);
    stList_destruct(bridgingEdges);
    stHash_destruct(currentEdgesByNode);
    stHash_destruct(bridgingEdgesByNode);
    stHash_destruct(nonZeroEdgesByNode);
    return best;
}
/*
 * addMafBlockToRowHash: concatenation with 2 bases of interstitial sequence
 * AND a previously unobserved sequence (name3.chr@).
 */
static void test_addBlockToHash_2(CuTest *testCase) {
    options_t *options = options_construct();
    options->breakpointPenalty = 10;
    options->interstitialSequence = 5;
    stList *observedList = stList_construct3(0, free);
    stList *expectedList = stList_construct3(0, free);

    // Rows already accumulated before the new block arrives.
    stHash *observedHash = createBlockHashFromString(
        "a score=0\n"
        "s reference.chr0 0 13 + 158545518 gcagctgaaaaca\n"
        "s name.chr1 0 10 + 100 ATGT---ATGCCG\n"
        "s name2.chr1 0 10 + 100 ATGT---ATGCCG\n",
        observedList);

    // The block to be appended; its last row names a sequence not seen before.
    mafBlock_t *mb = maf_newMafBlockListFromString(
        "a score=0 test\n"
        "s reference.chr0 13 5 + 158545518 ACGTA\n"
        "s name.chr1 12 5 + 100 gTcGG\n"
        "s name2.chr1 10 5 + 100 ATGTg\n"
        "s name3.chr@ 0 5 + 20 aaccg\n",
        3);

    // Expected rows after the append.
    stHash *expectedHash = createBlockHashFromString(
        "a score=0\n"
        "s reference.chr0 0 18 + 158545518 gcagctgaaaaca--ACGTA\n"
        "s name.chr1 0 17 + 100 ATGT---ATGCCGacgTcGG\n"
        "s name2.chr1 0 15 + 100 ATGT---ATGCCG--ATGTg\n"
        "s name3.chr@ 0 5 + 20 ---------------aaccg\n",
        expectedList);

    // Source sequences, keyed on sequence name.
    stHash *seqHash = createSeqHashFromString("name.chr1",
        "ATGTATGCCGacgTc"
        "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG"
        "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG");
    mtfseq_t *referenceSeq = newMtfseqFromString(
        "gcagctgaaaacaACGTA"
        "tttttttttttttttttttttttttttttttt"
        "tttttttttttttttttttttttttttttttttttttttttttttttttt");
    stHash_insert(seqHash, stString_copy("reference.chr0"), referenceSeq);
    mtfseq_t *name2Seq = newMtfseqFromString(
        "ATGTATGCCGATGTg"
        "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"
        "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC");
    stHash_insert(seqHash, stString_copy("name2.chr1"), name2Seq);
    mtfseq_t *name3Seq = newMtfseqFromString("aaccgTTTTTTTTTTTTTTT");
    stHash_insert(seqHash, stString_copy("name3.chr@"), name3Seq);

    addMafBlockToRowHash(observedHash, seqHash, observedList, mb, options);
    CuAssertTrue(testCase, hashesAreEqual(observedHash, expectedHash));
    CuAssertTrue(testCase, listsAreEqual(observedList, expectedList));

    // clean up
    stHash_destruct(observedHash);
    stHash_destruct(expectedHash);
    stHash_destruct(seqHash);
    stList_destruct(observedList);
    stList_destruct(expectedList);
    maf_destroyMafBlockList(mb);
    destroyOptions(options);
}
static void debugScaffoldPaths(stList *haplotypePaths, stHash *haplotypePathToScaffoldPathHash, stHash *haplotypeToMaximalHaplotypeLengthHash, stList *haplotypeEventStrings, stList *contaminationEventStrings, CapCodeParameters *capCodeParameters) { stHash *segmentToMaximalHaplotypePathHash = buildSegmentToContigPathHash(haplotypePaths); for (int64_t i = 0; i < stList_length(haplotypePaths); i++) { stList *haplotypePath = stList_get(haplotypePaths, i); assert(stList_length(haplotypePath) > 0); //Traversing from 5' end.. Segment *_5Segment = stList_get(haplotypePath, 0); Segment *_3Segment = stList_get(haplotypePath, stList_length(haplotypePath) - 1); assert(segment_getStrand(_5Segment) == segment_getStrand(_3Segment)); if (!segment_getStrand(_5Segment)) { Segment *j = _5Segment; _5Segment = segment_getReverse(_3Segment); _3Segment = segment_getReverse(j); } assert(segment_getStrand(_5Segment)); assert(segment_getStrand(_3Segment)); Cap *_5Cap = segment_get5Cap(_5Segment); Cap *_3Cap = segment_get3Cap(_3Segment); if (getAdjacentCapsSegment(_5Cap) != NULL) { assert(!trueAdjacency(_5Cap, haplotypeEventStrings)); } if (getAdjacentCapsSegment(_3Cap) != NULL) { assert(!trueAdjacency(_3Cap, haplotypeEventStrings)); } debugScaffoldPathsP(_5Cap, haplotypePath, haplotypePathToScaffoldPathHash, haplotypeToMaximalHaplotypeLengthHash, segmentToMaximalHaplotypePathHash, haplotypeEventStrings, contaminationEventStrings, capCodeParameters, 1); debugScaffoldPathsP(_3Cap, haplotypePath, haplotypePathToScaffoldPathHash, haplotypeToMaximalHaplotypeLengthHash, segmentToMaximalHaplotypePathHash, haplotypeEventStrings, contaminationEventStrings, capCodeParameters, 0); } stHash_destruct(segmentToMaximalHaplotypePathHash); }
/*
 * Extends chosenEdges to a perfect matching over nodes.
 *
 * Nodes that no chosen edge touches are visited in the iteration order of
 * the sorted set and paired up two at a time; the edge for each pair is
 * obtained from getEdgeForNodes and appended to chosenEdges.
 */
static void makeMatchingPerfect(stList *chosenEdges, stList *adjacencyEdges,
        stSortedSet *nodes) {
    stSortedSet *matchedNodes = getNodeSetOfEdges(chosenEdges);
    stHash *edgesByNode = getNodesToEdgesHash(adjacencyEdges);

    stIntTuple *unpaired = NULL; // first node of the pair currently being formed
    stSortedSetIterator *nodeIt = stSortedSet_getIterator(nodes);
    for (stIntTuple *node = stSortedSet_getNext(nodeIt); node != NULL;
            node = stSortedSet_getNext(nodeIt)) {
        if (stSortedSet_search(matchedNodes, node) != NULL) {
            continue; // already covered by a chosen edge
        }
        if (unpaired == NULL) {
            unpaired = node;
            continue;
        }
        stList_append(chosenEdges, getEdgeForNodes(stIntTuple_get(unpaired, 0),
                stIntTuple_get(node, 0), edgesByNode));
        unpaired = NULL;
    }
    stSortedSet_destructIterator(nodeIt);

    assert(unpaired == NULL); // the unmatched nodes must pair up exactly
    stSortedSet_destruct(matchedNodes);
    assert(stList_length(chosenEdges) * 2 == stSortedSet_size(nodes));
    stHash_destruct(edgesByNode);
}
/*
 * Test helper: checks the components induced by filteredEdges.
 *
 * 1. Every component has between 1 and maxComponentSize members.
 * 2. Every edge of the file-level `edges` list that was filtered out joins
 *    two distinct components whose combined size exceeds maxComponentSize.
 */
static void checkComponents(CuTest *testCase, stList *filteredEdges) {
    stHash *nodesToComponents = getComponents(filteredEdges);

    // Check all components are within the size bounds.
    stList *components = stHash_getValues(nodesToComponents);
    for (int64_t c = 0; c < stList_length(components); c++) {
        stSortedSet *component = stList_get(components, c);
        CuAssertTrue(testCase, stSortedSet_size(component) <= maxComponentSize);
        CuAssertTrue(testCase, stSortedSet_size(component) >= 1);
    }

    // Check that none of the filtered-out edges could have been kept.
    stSortedSet *keptEdges = stList_getSortedSet(filteredEdges,
            (int(*)(const void *, const void *)) stIntTuple_cmpFn);
    for (int64_t e = 0; e < stList_length(edges); e++) {
        stIntTuple *edge = stList_get(edges, e);
        if (stSortedSet_search(keptEdges, edge) != NULL) {
            continue; // edge survived the filter, nothing to check
        }
        // Positions 1 and 2 of the edge tuple are used as its two node ids.
        stIntTuple *nodeA = stIntTuple_construct1(stIntTuple_get(edge, 1));
        stIntTuple *nodeB = stIntTuple_construct1(stIntTuple_get(edge, 2));
        stSortedSet *componentA = stHash_search(nodesToComponents, nodeA);
        stSortedSet *componentB = stHash_search(nodesToComponents, nodeB);
        CuAssertTrue(testCase, componentA != NULL && componentB != NULL);
        CuAssertTrue(testCase, componentA != componentB);
        CuAssertTrue(testCase,
                stSortedSet_size(componentA) + stSortedSet_size(componentB) > maxComponentSize);
        stIntTuple_destruct(nodeA);
        stIntTuple_destruct(nodeB);
    }
    stSortedSet_destruct(keptEdges);

    // Cleanup the components: collect them into a set so that each distinct
    // component is destructed once, then release the hash.
    stSortedSet *distinctComponents = stList_getSortedSet(components, NULL);
    stList_destruct(components);
    stSortedSet_setDestructor(distinctComponents, (void(*)(void *)) stSortedSet_destruct);
    stSortedSet_destruct(distinctComponents);
    stHash_destruct(nodesToComponents);
}
/*
 * Reads the matching created by Blossom.
 *
 * The first line of fileHandle holds "<nodeNumber> <edgeNumber>"; it is
 * followed by edgeNumber lines, each holding the two node ids of a matched
 * edge. Every matched edge that is also present in originalEdges is appended
 * (as the original stIntTuple, not a copy) to the returned list.
 *
 * Returns a newly constructed list that the caller must destruct; its
 * elements remain owned by originalEdges.
 */
static stList *readMatching(FILE *fileHandle, stList *originalEdges) {
    stHash *originalEdgesHash = putEdgesInHash(originalEdges);

    // Header line: node and edge counts.
    char *line = stFile_getLineFromFile(fileHandle);
    assert(line != NULL);
    // Initialised so that a failed parse cannot leave them indeterminate
    // when the asserts are compiled out.
    int64_t nodeNumber = 0, edgeNumber = 0;
    // SCNi64 is the scanf-family counterpart of PRIi64.
    int fieldsRead = sscanf(line, "%" SCNi64 " %" SCNi64 "\n", &nodeNumber, &edgeNumber);
    (void) fieldsRead; // only inspected by the assert
    assert(fieldsRead == 2);
    free(line);

    stList *chosenEdges = stList_construct();
    for (int64_t j = 0; j < edgeNumber; j++) {
        line = stFile_getLineFromFile(fileHandle);
        // Checked like the header line: a truncated file must not hand
        // NULL to sscanf.
        assert(line != NULL);
        int64_t node1 = -1, node2 = -1;
        fieldsRead = sscanf(line, "%" SCNi64 " %" SCNi64 "", &node1, &node2);
        assert(fieldsRead == 2);
        free(line);
        assert(node1 >= 0);
        assert(node1 < nodeNumber);
        assert(node2 >= 0);
        assert(node2 < nodeNumber);

        // Look the edge up by value; only edges of the original graph are kept.
        stIntTuple *edge = constructEdge(node1, node2);
        stIntTuple *originalEdge = stHash_search(originalEdgesHash, edge);
        if (originalEdge != NULL) {
            stList_append(chosenEdges, originalEdge);
        }
        stIntTuple_destruct(edge);
    }

    stHash_destruct(originalEdgesHash);
    return chosenEdges;
}
static stKVDatabaseConf *constructFromString(const char *xmlString) { stHash *hash = hackParseXmlString(xmlString); stKVDatabaseConf *databaseConf = NULL; const char *type = getXmlValueRequired(hash, "conf_type"); const char *dbTag = getXmlValueRequired(hash, "db_tag"); if (!stString_eq(type, dbTag)) { stThrowNew(ST_KV_DATABASE_EXCEPTION_ID, "Database XML tag \"%s\" did not match st_kv_database_conf type attribute", dbTag, type); } if (stString_eq(type, "tokyo_cabinet")) { databaseConf = stKVDatabaseConf_constructTokyoCabinet(getXmlValueRequired(hash, "database_dir")); } else if (stString_eq(type, "kyoto_tycoon")) { databaseConf = stKVDatabaseConf_constructKyotoTycoon(getXmlValueRequired(hash, "host"), getXmlPort(hash), getXmlTimeout(hash), getXMLMaxKTRecordSize(hash), getXMLMaxKTBulkSetSize(hash), getXMLMaxKTBulkSetNumRecords(hash), getXmlValueRequired(hash, "database_dir"), stHash_search(hash, "database_name")); } else if (stString_eq(type, "mysql")) { databaseConf = stKVDatabaseConf_constructMySql(getXmlValueRequired(hash, "host"), getXmlPort(hash), getXmlValueRequired(hash, "user"), getXmlValueRequired(hash, "password"), getXmlValueRequired(hash, "database_name"), getXmlValueRequired(hash, "table_name")); } else { stThrowNew(ST_KV_DATABASE_EXCEPTION_ID, "invalid database type \"%s\"", type); } stHash_destruct(hash); return databaseConf; }
/*
 * Builds and returns a hash from each haplotype path to its scaffold path
 * bucket. The hash is filled by getScaffoldPathsP, checked with
 * debugScaffoldPaths, and the auxiliary length hash produced along the way
 * is destructed before returning.
 */
stHash *getScaffoldPaths(stList *haplotypePaths, stList *haplotypeEventStrings,
        stList *contaminationEventStrings, CapCodeParameters *capCodeParameters) {
    stHash *pathToScaffoldPath = stHash_construct();

    // Fills pathToScaffoldPath and hands back the per-path length hash.
    stHash *pathToMaximalLength = getScaffoldPathsP(haplotypePaths, pathToScaffoldPath,
            haplotypeEventStrings, contaminationEventStrings, capCodeParameters);

    debugScaffoldPaths(haplotypePaths, pathToScaffoldPath, pathToMaximalLength,
            haplotypeEventStrings, contaminationEventStrings, capCodeParameters);

    stHash_destruct(pathToMaximalLength);
    return pathToScaffoldPath;
}
/*
 * Builds an adjacency list from the aligned pairs, then runs a DFS from the
 * first position of every sequence in an attempt to create a valid
 * topological sort. Returns non-zero if the graph contains a cycle.
 */
static int64_t containsACycle(stList *pairs, int64_t sequenceNumber) {
    // Build an adjacency list structure..
    stHash *adjacencyList = buildAdjacencyList(pairs, sequenceNumber);

    // Do a topological sort of the adjacency list
    stSortedSet *started = stSortedSet_construct3(
            (int (*)(const void *, const void *)) stIntTuple_cmpFn, NULL);
    stSortedSet *done = stSortedSet_construct3(
            (int (*)(const void *, const void *)) stIntTuple_cmpFn, NULL);
    int64_t cyclic = 0;
    for (int64_t seq = 0; seq < sequenceNumber; seq++) {
        stIntTuple *probe = stIntTuple_construct2(seq, 0);
        // The following hacks avoid memory cleanup: the tuple stored in the
        // column is fetched and handed to dfs, so the probe can be freed here.
        stSortedSet *column = stHash_search(adjacencyList, probe);
        assert(column != NULL);
        stIntTuple *storedPos = stSortedSet_search(column, probe);
        assert(storedPos != NULL);
        // Once a cycle has been seen no further DFS is started.
        if (!cyclic && dfs(adjacencyList, storedPos, started, done)) {
            cyclic = 1;
        }
        stIntTuple_destruct(probe);
    }

    // cleanup: gather the columns in a set that owns them, so that each
    // distinct column is destructed once.
    stSortedSet *columns = stSortedSet_construct2((void (*)(void *)) stSortedSet_destruct);
    stHashIterator *keyIt = stHash_getIterator(adjacencyList);
    for (stIntTuple *key = stHash_getNext(keyIt); key != NULL; key = stHash_getNext(keyIt)) {
        stSortedSet *column = stHash_search(adjacencyList, key);
        assert(column != NULL);
        stSortedSet_insert(columns, column);
    }
    stHash_destructIterator(keyIt);
    stHash_destruct(adjacencyList);
    stSortedSet_destruct(columns);
    stSortedSet_destruct(started);
    stSortedSet_destruct(done);
    return cyclic;
}
/*
 * Writes a single-record fasta file (sequence wrapped at 50 columns), loads
 * it with addSequencesToHash and checks that the record can be found under
 * its header name with an unchanged sequence.
 */
static void test_readingFasta_0(CuTest *testCase) {
    char inputName[] = ">simChimp.chrA";
    char inputSequence[] =
        "ATAATACTTGCACACTTCTGCTATTACTTGATGTGTTTTCTATGGGGTGT"
        "CTTTCAGTGCTATGGGCAAGGCCATGGATTAATGGTGCCATAATTGCTCT"
        "AGGCAGTGACTAGAAACAGTTCACAAGTTTTTACTGTATCAAACTATGTT"
        "TTATAGTACGATTCACCCTCCAGGGGACCATCCCAAACTACTGGCCTAAA"
        "AGGACCTGCCATGTTGTAACTCCCCAGCTTAGAAATATAGACGGGAGGAA"
        "TGACaaaaagaagaaaaaaaaaaaaagaaaaaataaaaaaaaaacaaaaa"
        "agatagagaaaaaaaaaagtaaaaacaaaaaaaaataaaaaagggaaaaa"
        "aaataacaaaggaacaaaaaaaaaaaaaaaaaaataaaaagaaaaaCAAG"
        "ATAACCTTCATGCCATTGGAGCTATCTATTATTGTCTTGACCTATGCTTT"
        "ATCAATTTCTTCCTTCCTAGGAAGACATTTTTCTAGAAAGCTAAACGTTT"
        "TTGTAGGCTTGCATGTTCTGTCTGGGCTTGAATGGTTGTGCGTCTACAAG"
        "CCTCATTTACCATAGCACCATGCTTGGGTGGTATCTATCATCATTATCAA"
        "TAGTCAAGTCATTATAATGTTTTGGTGATCAGGCCAGATCCCTTGCACCA"
        "GTGACTTTCTAAATAGCACCTCCTCCATCATTTAAGGATCTCTAGCAACT"
        "TTAATCTGACTCACCTTGCCATGCAGAGTGCATGTTCCTTTTTAACACCC"
        "TGTGATTATGGGTTGGGTCTATTTGTATTTGTTTGATTACATCAGACGAC"
        "CAGGCCAGAGACAGATAAACACAACAGCCACTGGAACCTAAAGCTGTGTT"
        "CAGAATGTCACGGAATGTCTCATTGCACCCAGAGCTAGGGTGGGTATGAG"
        "TATGATCTTCTACATAAGGTACCCCAGGAAAATTAACTTAACAACCAATC"
        "AATTACAGAAGATGAATTCTGCTGTTGTCTCTTATTAGTTGGACTATTCA"
        "GCCTAATGGTTGGCCACTTAGCTTGTCATGAGCATTACTGTACTACTATG"
        "TCTAGTGTTTCCAGTTATTAGTTAGCCCACTGGATAGACAGTTTTGGCTT"
        "GTTTTCTTTCATTTGTATTGCCCACTCACCTAGCAAATCAGACAAAGGGG"
        "CATGTGAAAACTACCTTAGACTCTGCAGTTAGACAAACCATACTTTCCAC"
        "ATAGACCTCAGACATTTGGACATGAATAATTTCCTTCCTCCGGAGTGGTG"
        "GTTCCTCAACACTTATCACTTTCTTCTTCTTTTACCCGTATCACTGTCAA";

    // Write the record: header line, then the sequence with a newline after
    // every 50th base and one more newline at the end.
    FILE *ofp = de_fopen("testFasta.fa", "w");
    fprintf(ofp, "%s\n", inputName);
    const size_t sequenceLength = strlen(inputSequence);
    for (size_t pos = 0; pos < sequenceLength; ++pos) {
        fputc(inputSequence[pos], ofp);
        if ((pos + 1) % 50 == 0) {
            fputc('\n', ofp);
        }
    }
    fputc('\n', ofp);
    fclose(ofp);

    // Read it back.
    stHash *sequenceHash = stHash_construct3(stHash_stringKey, stHash_stringEqualKey,
                                             free, destroyMtfseq);
    addSequencesToHash(sequenceHash, "testFasta.fa");

    mtfseq_t *value = stHash_search(sequenceHash, "not in there");
    CuAssertTrue(testCase, value == NULL);
    value = stHash_search(sequenceHash, "simChimp.chrA");
    CuAssertTrue(testCase, value != NULL);
    if (value != NULL) {
        CuAssertTrue(testCase, strlen(value->seq) == strlen(inputSequence));
        CuAssertTrue(testCase, strcmp(value->seq, inputSequence) == 0);
    }

    if (remove("testFasta.fa") != 0) {
        fprintf(stderr, "Error, unable to remove temporary file testFasta.fa\n");
        exit(EXIT_FAILURE);
    }
    stHash_destruct(sequenceHash);
}
/*
 * Bottom-up pass over the given flowers.
 *
 * A reference thread between the two caps in each flower f may be broken
 * into two in the children of f. Therefore, for each flower f, attached stub
 * ends present in the children of f but not in f are first identified and
 * copied into f, reattaching the reference caps as needed.
 *
 * Afterwards a phylogenetic tree is built per flower (stored in the
 * file-level segmentWriteFn_flowerToPhylogeneticTreeHash for the duration of
 * the call) and the recursive threads are built. When isTop is set the
 * thread strings are additionally turned into meta sequences and the
 * coordinates of each thread are set.
 */
void bottomUp(stList *flowers, stKVDatabase *sequenceDatabase, Name referenceEventName,
        bool isTop, stMatrix *(*generateSubstitutionMatrix)(double)) {
    stList *caps = getCaps(flowers, referenceEventName);
    // Start from end, as we add to this list.
    for (int64_t capIndex = stList_length(caps) - 1; capIndex >= 0; capIndex--) {
        setAdjacencyLengthsAndRecoverNewCapsAndBrokenAdjacencies(stList_get(caps, capIndex), caps);
    }
    for (int64_t flowerIndex = 0; flowerIndex < stList_length(flowers); flowerIndex++) {
        recoverBrokenAdjacencies(stList_get(flowers, flowerIndex), caps, referenceEventName);
    }

    // Build the phylogenetic event trees for base calling.
    segmentWriteFn_flowerToPhylogeneticTreeHash = stHash_construct2(NULL,
            (void (*)(void *)) cleanupPhylogeneticTree);
    for (int64_t flowerIndex = 0; flowerIndex < stList_length(flowers); flowerIndex++) {
        Flower *flower = stList_get(flowers, flowerIndex);
        Event *refEvent = eventTree_getEvent(flower_getEventTree(flower), referenceEventName);
        assert(refEvent != NULL);
        stHash_insert(segmentWriteFn_flowerToPhylogeneticTreeHash, flower,
                getPhylogeneticTreeRootedAtGivenEvent(refEvent, generateSubstitutionMatrix));
    }

    if (!isTop) {
        buildRecursiveThreads(sequenceDatabase, caps, segmentWriteFn, terminalAdjacencyWriteFn);
    } else {
        stList *threadStrings = buildRecursiveThreadsInList(sequenceDatabase, caps,
                segmentWriteFn, terminalAdjacencyWriteFn);
        assert(stList_length(threadStrings) == stList_length(caps));

        // These are used as indices for the names of trivial and non-trivial
        // sequences: non-trivial ones count up from 0, trivial ones from the
        // number of threads.
        int64_t nonTrivialSeqIndex = 0;
        int64_t trivialSeqIndex = stList_length(threadStrings);

        for (int64_t threadIndex = 0; threadIndex < stList_length(threadStrings); threadIndex++) {
            Cap *cap = stList_get(caps, threadIndex);
            assert(cap_getStrand(cap));
            assert(!cap_getSide(cap));
            Flower *flower = end_getFlower(cap_getEnd(cap));

            char *threadString = stList_get(threadStrings, threadIndex);
            bool trivialString = isTrivialString(&threadString); // This alters the original string
            int64_t sequenceIndex;
            if (trivialString) {
                sequenceIndex = trivialSeqIndex++;
            } else {
                sequenceIndex = nonTrivialSeqIndex++;
            }
            MetaSequence *metaSequence = addMetaSequence(flower, cap, sequenceIndex,
                    threadString, trivialString);
            free(threadString);

            int64_t endCoordinate = setCoordinates(flower, metaSequence, cap,
                    metaSequence_getStart(metaSequence) - 1);
            (void) endCoordinate;
            assert(endCoordinate == metaSequence_getLength(metaSequence)
                    + metaSequence_getStart(metaSequence));
        }
        // The strings are already cleaned up by the above loop
        stList_setDestructor(threadStrings, NULL);
        stList_destruct(threadStrings);
    }

    stHash_destruct(segmentWriteFn_flowerToPhylogeneticTreeHash);
    stList_destruct(caps);
}
/*
 * penalize(hash, "name.chr1", 5) is expected to extend the name.chr1 row by
 * five N characters (length 10 -> 15) and pad every other row with five gap
 * characters, leaving the row order untouched.
 */
static void test_penalize_0(CuTest *testCase) {
    stList *observedOrder = stList_construct3(0, free);
    stList *expectedOrder = stList_construct3(0, free);

    stHash *observed = createBlockHashFromString(
        "a score=0\n"
        "s reference.chr0 0 13 + 158545518 gcagctgaaaaca\n"
        "s name.chr1 0 10 + 100 ATGT---ATGCCG\n"
        "s name2.chr1 0 10 + 100 ATGT---ATGCCG\n",
        observedOrder);
    stHash *expected = createBlockHashFromString(
        "a score=0\n"
        "s reference.chr0 0 13 + 158545518 gcagctgaaaaca-----\n"
        "s name.chr1 0 15 + 100 ATGT---ATGCCGNNNNN\n"
        "s name2.chr1 0 10 + 100 ATGT---ATGCCG-----\n",
        expectedOrder);

    penalize(observed, "name.chr1", 5);

    CuAssertTrue(testCase, hashesAreEqual(observed, expected));
    CuAssertTrue(testCase, listsAreEqual(observedOrder, expectedOrder));

    // clean up
    stHash_destruct(observed);
    stHash_destruct(expected);
    stList_destruct(observedOrder);
    stList_destruct(expectedOrder);
}
/*
 * Discards the flower's existing faces and rebuilds them: the lifted edges
 * are computed once, then buildFaces_constructFromCap is invoked for every
 * cap of the flower that has at least one lifted edge.
 */
void flower_reconstructFaces(Flower * flower) {
    flower_destructFaces(flower);

    stHash *liftedEdgesTable = buildFaces_computeLiftedEdges(flower);
    Flower_CapIterator *capIt = flower_getCapIterator(flower);

    for (Cap *cap = flower_getNextCap(capIt); cap != NULL; cap = flower_getNextCap(capIt)) {
        stList *lifted = stHash_search(liftedEdgesTable, cap);
        if (lifted == NULL || stList_length(lifted) < 1) {
            continue; // no lifted edges for this cap
        }
        buildFaces_constructFromCap(cap, liftedEdgesTable, flower);
    }

    stHash_destruct(liftedEdgesTable);
    flower_destructCapIterator(capIt);
}
/*
 * Returns alternating nodes of a simple cycle.
 *
 * Starting at position 0 of the first edge, the cycle is walked one node per
 * step for as many steps as there are edges; every other visited node
 * (beginning with the start node) is added to the returned set. The cycle
 * must have an even number of edges and every node exactly two incident
 * edges. The caller owns the returned set.
 */
static stSortedSet *getOddNodes(stList *cycle) {
    // Set to return
    stSortedSet *nodes = stSortedSet_construct3(
            (int(*)(const void *, const void *)) stIntTuple_cmpFn,
            (void(*)(void *)) stIntTuple_destruct);
    stHash *nodesToEdges = getNodesToEdgesHash(cycle);

    int64_t current = stIntTuple_get(stList_get(cycle, 0), 0);
    int64_t previous = -1;
    bool takeNode = true; // toggled each step to make the selection alternate
    assert(stList_length(cycle) % 2 == 0);

    for (int64_t step = 0; step < stList_length(cycle); step++) {
        if (takeNode) {
            addNodeToSet(nodes, current);
        }
        takeNode = !takeNode;

        stList *incidentEdges = getItemForNode(current, nodesToEdges);
        assert(stList_length(incidentEdges) == 2);

        // Move along whichever incident edge does not lead back to the
        // node we just came from; the first edge is preferred.
        int64_t next = getOtherPosition(stList_get(incidentEdges, 0), current);
        if (next == previous) {
            next = getOtherPosition(stList_get(incidentEdges, 1), current);
            assert(next != previous);
        }
        previous = current;
        current = next;
    }

    stHash_destruct(nodesToEdges);
    assert(stList_length(cycle) / 2 == stSortedSet_size(nodes));
    return nodes;
}
/*
 * interstitialInsert on the name.chr1 row, once for a '+' strand row and
 * once for a '-' strand row. In both scenarios the row is expected to gain
 * the five bases "aaaTa" (length 10 -> 15) while the other rows are padded
 * with five gap characters.
 */
static void test_interstitial_0(CuTest *testCase) {
    /* Scenario 1: positive strand. */
    {
        stList *observedList = stList_construct3(0, free);
        stList *expectedList = stList_construct3(0, free);
        stHash *observedHash = createBlockHashFromString(
            "a score=0\n"
            "s reference.chr0 0 13 + 158545518 gcagctgaaaaca\n"
            "s name.chr1 0 10 + 100 ATGT---ATGCCG\n"
            "s name2.chr1 0 10 + 100 ATGT---ATGCCG\n",
            observedList);
        stHash *expectedHash = createBlockHashFromString(
            "a score=0\n"
            "s reference.chr0 0 13 + 158545518 gcagctgaaaaca-----\n"
            "s name.chr1 0 15 + 100 ATGT---ATGCCGaaaTa\n"
            "s name2.chr1 0 10 + 100 ATGT---ATGCCG-----\n",
            expectedList);
        stHash *seqHash = createSeqHashFromString("name.chr1",
            "ATGTATGCCGaaaTaTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT"
            "TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT");

        interstitialInsert(observedHash, seqHash, "name.chr1", 10, '+', 5);
        CuAssertTrue(testCase, hashesAreEqual(observedHash, expectedHash));
        CuAssertTrue(testCase, listsAreEqual(observedList, expectedList));

        // clean up
        stHash_destruct(observedHash);
        stHash_destruct(expectedHash);
        stHash_destruct(seqHash);
        stList_destruct(observedList);
        stList_destruct(expectedList);
    }

    /* Scenario 2: negative strand. */
    {
        stList *observedList = stList_construct3(0, free);
        stList *expectedList = stList_construct3(0, free);
        stHash *observedHash = createBlockHashFromString(
            "a score=0\n"
            "s reference.chr0 0 13 + 158545518 gcagctgaaaaca\n"
            "s name.chr1 0 10 - 100 ATGT---ATGCCG\n"
            "s name2.chr1 0 10 + 100 ATGT---ATGCCG\n",
            observedList);
        stHash *expectedHash = createBlockHashFromString(
            "a score=0\n"
            "s reference.chr0 0 13 + 158545518 gcagctgaaaaca-----\n"
            "s name.chr1 0 15 - 100 ATGT---ATGCCGaaaTa\n"
            "s name2.chr1 0 10 + 100 ATGT---ATGCCG-----\n",
            expectedList);
        stHash *seqHash = createSeqHashFromString("name.chr1",
            "ggggggggggggTTgggggggggggggggggggggggggggggggggggg" // 50
            "gggaagggGgggCgggTgggAgggcgggtgggagg" // 35
            "tAtttCGGCATACAT");

        interstitialInsert(observedHash, seqHash, "name.chr1", 10, '-', 5);
        CuAssertTrue(testCase, hashesAreEqual(observedHash, expectedHash));
        CuAssertTrue(testCase, listsAreEqual(observedList, expectedList));

        // clean up
        stHash_destruct(observedHash);
        stHash_destruct(expectedHash);
        stHash_destruct(seqHash);
        stList_destruct(observedList);
        stList_destruct(expectedList);
    }
}
/*
 * Gets a list of connected components, each connected component being
 * represented as a list of the edges, such that each edge is in exactly one
 * connected component. Allows for multi-graphs (multiple edges connecting
 * two nodes).
 *
 * The returned list destructs its member lists when it is itself destructed.
 */
stList *getComponents(stList *edges) {
    stHash *nodesToEdges = getNodesToEdgesHash(edges);
    stList *components = stList_construct3(0, (void(*)(void *)) stList_destruct);

    /*
     * Traverse the edges greedily: each node still present in the hash
     * seeds a new component.
     */
    stList *pendingNodes = stHash_getKeys(nodesToEdges);
    while (stList_length(pendingNodes) > 0) {
        stIntTuple *seedNode = stList_pop(pendingNodes);
        stList *seedEdges = stHash_search(nodesToEdges, seedNode);
        if (seedEdges != NULL) {
            // We have a component to build
            stSortedSet *componentEdges = stSortedSet_construct();
            stHash_remove(nodesToEdges, seedNode);
            for (int64_t edgeIndex = 0; edgeIndex < stList_length(seedEdges); edgeIndex++) {
                stIntTuple *edge = stList_get(seedEdges, edgeIndex);
                // Expand from both end points of the edge.
                getComponentsP(nodesToEdges, stIntTuple_get(edge, 0), componentEdges);
                getComponentsP(nodesToEdges, stIntTuple_get(edge, 1), componentEdges);
            }
            stList_append(components, stSortedSet_getList(componentEdges));
            // Cleanup
            stSortedSet_destruct(componentEdges);
            stList_destruct(seedEdges);
        }
        stIntTuple_destruct(seedNode);
    }

    assert(stHash_size(nodesToEdges) == 0);
    stHash_destruct(nodesToEdges);
    stList_destruct(pendingNodes);
    return components;
}
static stHash *getScaffoldPathsP(stList *haplotypePaths, stHash *haplotypePathToScaffoldPathHash, stList *haplotypeEventStrings, stList *contaminationEventStrings, CapCodeParameters *capCodeParameters) { stHash *haplotypeToMaximalHaplotypeLengthHash = buildContigPathToContigPathLengthHash(haplotypePaths); stHash *segmentToMaximalHaplotypePathHash = buildSegmentToContigPathHash(haplotypePaths); for (int64_t i = 0; i < stList_length(haplotypePaths); i++) { stSortedSet *bucket = stSortedSet_construct(); stHash_insert(haplotypePathToScaffoldPathHash, stList_get(haplotypePaths, i), bucket); stSortedSet_insert(bucket, stList_get(haplotypePaths, i)); } for (int64_t i = 0; i < stList_length(haplotypePaths); i++) { stList *haplotypePath = stList_get(haplotypePaths, i); assert(stList_length(haplotypePath) > 0); Segment *_5Segment = stList_get(haplotypePath, 0); if (!segment_getStrand(_5Segment)) { _5Segment = segment_getReverse(stList_get(haplotypePath, stList_length(haplotypePath) - 1)); } assert(segment_getStrand(_5Segment)); if (getAdjacentCapsSegment(segment_get5Cap(_5Segment)) != NULL) { assert(!trueAdjacency(segment_get5Cap(_5Segment), haplotypeEventStrings)); } int64_t insertLength; int64_t deleteLength; Cap *otherCap; enum CapCode _5CapCode = getCapCode(segment_get5Cap(_5Segment), &otherCap, haplotypeEventStrings, contaminationEventStrings, &insertLength, &deleteLength, capCodeParameters); if (_5CapCode == SCAFFOLD_GAP || _5CapCode == AMBIGUITY_GAP) { assert(stHash_search(haplotypeToMaximalHaplotypeLengthHash, haplotypePath) != NULL); int64_t j = stIntTuple_get(stHash_search(haplotypeToMaximalHaplotypeLengthHash, haplotypePath), 0); Segment *adjacentSegment = getAdjacentCapsSegment(segment_get5Cap(_5Segment)); assert(adjacentSegment != NULL); while (!hasCapInEvents(cap_getEnd(segment_get5Cap(adjacentSegment)), haplotypeEventStrings)) { //is not a haplotype end adjacentSegment = getAdjacentCapsSegment(segment_get5Cap(adjacentSegment)); assert(adjacentSegment != NULL); 
} assert(adjacentSegment != NULL); assert(hasCapInEvents(cap_getEnd(segment_get5Cap(adjacentSegment)), haplotypeEventStrings)); //is a haplotype end stList *adjacentHaplotypePath = stHash_search(segmentToMaximalHaplotypePathHash, adjacentSegment); if (adjacentHaplotypePath == NULL) { adjacentHaplotypePath = stHash_search(segmentToMaximalHaplotypePathHash, segment_getReverse( adjacentSegment)); } assert(adjacentHaplotypePath != NULL); assert(adjacentHaplotypePath != haplotypePath); assert(stHash_search(haplotypeToMaximalHaplotypeLengthHash, adjacentHaplotypePath) != NULL); int64_t k = stIntTuple_get(stHash_search(haplotypeToMaximalHaplotypeLengthHash, adjacentHaplotypePath), 0); //Now merge the buckets and make new int tuples.. stSortedSet *bucket1 = stHash_search(haplotypePathToScaffoldPathHash, haplotypePath); stSortedSet *bucket2 = stHash_search(haplotypePathToScaffoldPathHash, adjacentHaplotypePath); assert(bucket1 != NULL); assert(bucket2 != NULL); assert(bucket1 != bucket2); stSortedSet *bucket3 = stSortedSet_getUnion(bucket1, bucket2); stSortedSetIterator *bucketIt = stSortedSet_getIterator(bucket3); stList *l; while ((l = stSortedSet_getNext(bucketIt)) != NULL) { //Do the bucket first assert(stHash_search(haplotypePathToScaffoldPathHash, l) == bucket1 || stHash_search(haplotypePathToScaffoldPathHash, l) == bucket2); stHash_remove(haplotypePathToScaffoldPathHash, l); stHash_insert(haplotypePathToScaffoldPathHash, l, bucket3); //Now the length stIntTuple *m = stHash_remove(haplotypeToMaximalHaplotypeLengthHash, l); assert(m != NULL); assert(stIntTuple_get(m, 0) == j || stIntTuple_get(m, 0) == k); stHash_insert(haplotypeToMaximalHaplotypeLengthHash, l, stIntTuple_construct1( j + k)); stIntTuple_destruct(m); } assert(stHash_search(haplotypePathToScaffoldPathHash, haplotypePath) == bucket3); assert(stHash_search(haplotypePathToScaffoldPathHash, adjacentHaplotypePath) == bucket3); stSortedSet_destructIterator(bucketIt); } } 
stHash_destruct(segmentToMaximalHaplotypePathHash); return haplotypeToMaximalHaplotypeLengthHash; }
/*
 * Builds a row hash from one four-row maf block and checks every field of
 * each resulting row_t.
 *
 * The hash is searched with the part of each sequence name that precedes
 * ".chrN" ("reference", "name", "name2", "name3"). In every row the expected
 * prevRightPos equals start + length - 1, and prevName / prevStrand are
 * expected to mirror name / strand.
 */
static void test_newBlockHashFromBlock_0(CuTest *testCase) {
    stList *orderList = stList_construct3(0, free);
    stHash *observedHash = createBlockHashFromString(
        "a score=0 test=0\n"
        "s reference.chr0 0 13 + 158545518 gcagctgaaaaca\n"
        "s name.chr1 0 10 + 100 ATGT---ATGCCG\n"
        "s name2.chr1 0 10 + 100 ATGT---ATGCCG\n"
        "s name3.chr1 5 13 - 100 ATGTgggATGCCG\n",
        orderList);
    CuAssertTrue(testCase, stHash_size(observedHash) == 4);

    // row 1: the reference row
    row_t *row = stHash_search(observedHash, "reference");
    CuAssertTrue(testCase, row != NULL);
    CuAssertTrue(testCase, strcmp(row->name, "reference.chr0") == 0);
    CuAssertTrue(testCase, strcmp(row->prevName, "reference.chr0") == 0);
    CuAssertTrue(testCase, !row->multipleNames);
    CuAssertTrue(testCase, row->start == 0);
    CuAssertTrue(testCase, row->length == 13);
    CuAssertTrue(testCase, row->sourceLength == 158545518);
    CuAssertTrue(testCase, row->prevRightPos == 12);
    CuAssertTrue(testCase, row->strand == '+');
    CuAssertTrue(testCase, row->prevStrand == '+');
    CuAssertTrue(testCase, strcmp(row->sequence, "gcagctgaaaaca") == 0);

    // row 2: gapped sequence, so length (10) is shorter than the text (13)
    row = stHash_search(observedHash, "name");
    CuAssertTrue(testCase, row != NULL);
    CuAssertTrue(testCase, strcmp(row->name, "name.chr1") == 0);
    CuAssertTrue(testCase, strcmp(row->prevName, "name.chr1") == 0);
    CuAssertTrue(testCase, !row->multipleNames);
    CuAssertTrue(testCase, row->start == 0);
    CuAssertTrue(testCase, row->length == 10);
    CuAssertTrue(testCase, row->sourceLength == 100);
    CuAssertTrue(testCase, row->prevRightPos == 9);
    CuAssertTrue(testCase, row->strand == '+');
    CuAssertTrue(testCase, row->prevStrand == '+');
    CuAssertTrue(testCase, strcmp(row->sequence, "ATGT---ATGCCG") == 0);

    // row 3: same content as row 2 under a different name
    row = stHash_search(observedHash, "name2");
    CuAssertTrue(testCase, row != NULL);
    CuAssertTrue(testCase, strcmp(row->name, "name2.chr1") == 0);
    CuAssertTrue(testCase, strcmp(row->prevName, "name2.chr1") == 0);
    CuAssertTrue(testCase, !row->multipleNames);
    CuAssertTrue(testCase, row->start == 0);
    CuAssertTrue(testCase, row->length == 10);
    CuAssertTrue(testCase, row->sourceLength == 100);
    CuAssertTrue(testCase, row->prevRightPos == 9);
    CuAssertTrue(testCase, row->strand == '+');
    CuAssertTrue(testCase, row->prevStrand == '+');
    CuAssertTrue(testCase, strcmp(row->sequence, "ATGT---ATGCCG") == 0);

    // row 4: negative strand with a non-zero start
    row = stHash_search(observedHash, "name3");
    CuAssertTrue(testCase, row != NULL);
    CuAssertTrue(testCase, strcmp(row->name, "name3.chr1") == 0);
    CuAssertTrue(testCase, strcmp(row->prevName, "name3.chr1") == 0);
    CuAssertTrue(testCase, !row->multipleNames);
    CuAssertTrue(testCase, row->start == 5);
    CuAssertTrue(testCase, row->length == 13);
    CuAssertTrue(testCase, row->sourceLength == 100);
    CuAssertTrue(testCase, row->prevRightPos == 17);
    CuAssertTrue(testCase, row->strand == '-');
    CuAssertTrue(testCase, row->prevStrand == '-');
    CuAssertTrue(testCase, strcmp(row->sequence, "ATGTgggATGCCG") == 0);

    // clean up
    stList_destruct(orderList);
    stHash_destruct(observedHash);
}
int main(int argc, char *argv[]) { // Parse arguments if (argc != 3) { usage(argv); return 1; } // You would load a custom HMM here if you wanted using // hmm_getStateMachine (see the realign code) StateMachine *stateMachine = stateMachine5_construct(fiveState); PairwiseAlignmentParameters *parameters = pairwiseAlignmentBandingParameters_construct(); stHash *targetSequences = readFastaFile(argv[1]); stHash *querySequences = readFastaFile(argv[2]); // For each query sequence, align it against all target sequences. stHashIterator *queryIt = stHash_getIterator(querySequences); char *queryHeader; while ((queryHeader = stHash_getNext(queryIt)) != NULL) { char *querySeq = stHash_search(querySequences, queryHeader); stHashIterator *targetIt = stHash_getIterator(targetSequences); char *targetHeader; while ((targetHeader = stHash_getNext(targetIt)) != NULL) { char *targetSeq = stHash_search(targetSequences, targetHeader); // Here we should try both the target sequence and its // reverse-complemented version // Aligns the sequences. // If you have alignment constraints (anchors) you should // replace this with getAlignedPairsUsingAnchors. stList *alignedPairs = getAlignedPairs(stateMachine, targetSeq, querySeq, parameters, true, true); // Takes into account the probability of aligning to a // gap, by transforming the posterior probability into the // AMAP objective function (see Schwartz & Pachter, 2007). alignedPairs = reweightAlignedPairs2(alignedPairs, strlen(targetSeq), strlen(querySeq), parameters->gapGamma); // I think this calculates the optimal ordered set of // alignments from the unordered set of aligned pairs, not // completely sure. alignedPairs = filterPairwiseAlignmentToMakePairsOrdered(alignedPairs, targetSeq, querySeq, // This parameter says that the minimum posterior probability we will accept has to be at least 0.9. 0.9); // After this the "aligned pairs" data structure changes, // which is a little sketchy. 
It's just so that the // alignment can be printed properly. stList_mapReplace(alignedPairs, convertToAnchorPair, NULL); stList_sort(alignedPairs, (int (*)(const void *, const void *)) stIntTuple_cmpFn); struct PairwiseAlignment *alignment = convertAlignedPairsToPairwiseAlignment(targetHeader, queryHeader, 0, strlen(targetSeq), strlen(querySeq), alignedPairs); // Output the cigar string cigarWrite(stdout, alignment, 0); stList_destruct(alignedPairs); destructPairwiseAlignment(alignment); } stHash_destructIterator(targetIt); } stHash_destructIterator(queryIt); // Clean up stHash_destruct(targetSequences); stHash_destruct(querySequences); pairwiseAlignmentBandingParameters_destruct(parameters); stateMachine_destruct(stateMachine); }
int main(int argc, char *argv[]) { /* * Script for adding alignments to cactus tree. */ int64_t startTime; stKVDatabaseConf *kvDatabaseConf; CactusDisk *cactusDisk; int key, k; bool (*filterFn)(stPinchSegment *, stPinchSegment *) = NULL; stSet *outgroupThreads = NULL; /* * Arguments/options */ char * logLevelString = NULL; char * alignmentsFile = NULL; char * constraintsFile = NULL; char * cactusDiskDatabaseString = NULL; char * lastzArguments = ""; int64_t minimumSequenceLengthForBlast = 1; //Parameters for annealing/melting rounds int64_t *annealingRounds = NULL; int64_t annealingRoundsLength = 0; int64_t *meltingRounds = NULL; int64_t meltingRoundsLength = 0; //Parameters for melting float maximumAdjacencyComponentSizeRatio = 10; int64_t blockTrim = 0; int64_t alignmentTrimLength = 0; int64_t *alignmentTrims = NULL; int64_t chainLengthForBigFlower = 1000000; int64_t longChain = 2; int64_t minLengthForChromosome = 1000000; float proportionOfUnalignedBasesForNewChromosome = 0.8; bool breakChainsAtReverseTandems = 1; int64_t maximumMedianSequenceLengthBetweenLinkedEnds = INT64_MAX; bool realign = 0; char *realignArguments = ""; bool removeRecoverableChains = false; bool (*recoverableChainsFilter)(stCactusEdgeEnd *, Flower *) = NULL; int64_t maxRecoverableChainsIterations = 1; int64_t maxRecoverableChainLength = INT64_MAX; //Parameters for removing ancient homologies bool doPhylogeny = false; int64_t phylogenyNumTrees = 1; enum stCaf_RootingMethod phylogenyRootingMethod = BEST_RECON; enum stCaf_ScoringMethod phylogenyScoringMethod = COMBINED_LIKELIHOOD; double breakpointScalingFactor = 1.0; bool phylogenySkipSingleCopyBlocks = 0; int64_t phylogenyMaxBaseDistance = 1000; int64_t phylogenyMaxBlockDistance = 100; bool phylogenyKeepSingleDegreeBlocks = 0; stList *phylogenyTreeBuildingMethods = stList_construct(); enum stCaf_TreeBuildingMethod defaultMethod = GUIDED_NEIGHBOR_JOINING; stList_append(phylogenyTreeBuildingMethods, &defaultMethod); double 
phylogenyCostPerDupPerBase = 0.2; double phylogenyCostPerLossPerBase = 0.2; const char *debugFileName = NULL; const char *referenceEventHeader = NULL; double phylogenyDoSplitsWithSupportHigherThanThisAllAtOnce = 1.0; int64_t numTreeBuildingThreads = 2; int64_t minimumBlockDegreeToCheckSupport = 10; double minimumBlockHomologySupport = 0.7; double nucleotideScalingFactor = 1.0; HomologyUnitType phylogenyHomologyUnitType = BLOCK; enum stCaf_DistanceCorrectionMethod phylogenyDistanceCorrectionMethod = JUKES_CANTOR; bool sortAlignments = false; /////////////////////////////////////////////////////////////////////////// // (0) Parse the inputs handed by genomeCactus.py / setup stuff. /////////////////////////////////////////////////////////////////////////// while (1) { static struct option long_options[] = { { "logLevel", required_argument, 0, 'a' }, { "alignments", required_argument, 0, 'b' }, { "cactusDisk", required_argument, 0, 'c' }, { "lastzArguments", required_argument, 0, 'd' }, { "help", no_argument, 0, 'h' }, { "annealingRounds", required_argument, 0, 'i' }, { "trim", required_argument, 0, 'k' }, { "trimChange", required_argument, 0, 'l', }, { "minimumTreeCoverage", required_argument, 0, 'm' }, { "blockTrim", required_argument, 0, 'n' }, { "deannealingRounds", required_argument, 0, 'o' }, { "minimumDegree", required_argument, 0, 'p' }, { "minimumIngroupDegree", required_argument, 0, 'q' }, { "minimumOutgroupDegree", required_argument, 0, 'r' }, { "alignmentFilter", required_argument, 0, 't' }, { "minimumSequenceLengthForBlast", required_argument, 0, 'v' }, { "maxAdjacencyComponentSizeRatio", required_argument, 0, 'w' }, { "constraints", required_argument, 0, 'x' }, { "minLengthForChromosome", required_argument, 0, 'y' }, { "proportionOfUnalignedBasesForNewChromosome", required_argument, 0, 'z' }, { "maximumMedianSequenceLengthBetweenLinkedEnds", required_argument, 0, 'A' }, { "realign", no_argument, 0, 'B' }, { "realignArguments", required_argument, 0, 'C' }, 
{ "phylogenyNumTrees", required_argument, 0, 'D' }, { "phylogenyRootingMethod", required_argument, 0, 'E' }, { "phylogenyScoringMethod", required_argument, 0, 'F' }, { "phylogenyBreakpointScalingFactor", required_argument, 0, 'G' }, { "phylogenySkipSingleCopyBlocks", no_argument, 0, 'H' }, { "phylogenyMaxBaseDistance", required_argument, 0, 'I' }, { "phylogenyMaxBlockDistance", required_argument, 0, 'J' }, { "phylogenyDebugFile", required_argument, 0, 'K' }, { "phylogenyKeepSingleDegreeBlocks", no_argument, 0, 'L' }, { "phylogenyTreeBuildingMethod", required_argument, 0, 'M' }, { "phylogenyCostPerDupPerBase", required_argument, 0, 'N' }, { "phylogenyCostPerLossPerBase", required_argument, 0, 'O' }, { "referenceEventHeader", required_argument, 0, 'P' }, { "phylogenyDoSplitsWithSupportHigherThanThisAllAtOnce", required_argument, 0, 'Q' }, { "numTreeBuildingThreads", required_argument, 0, 'R' }, { "phylogeny", no_argument, 0, 'S' }, { "minimumBlockHomologySupport", required_argument, 0, 'T' }, { "phylogenyNucleotideScalingFactor", required_argument, 0, 'U' }, { "minimumBlockDegreeToCheckSupport", required_argument, 0, 'V' }, { "removeRecoverableChains", required_argument, 0, 'W' }, { "minimumNumberOfSpecies", required_argument, 0, 'X' }, { "phylogenyHomologyUnitType", required_argument, 0, 'Y' }, { "phylogenyDistanceCorrectionMethod", required_argument, 0, 'Z' }, { "maxRecoverableChainsIterations", required_argument, 0, '1' }, { "maxRecoverableChainLength", required_argument, 0, '2' }, { 0, 0, 0, 0 } }; int option_index = 0; key = getopt_long(argc, argv, "a:b:c:hi:k:m:n:o:p:q:r:stv:w:x:y:z:A:BC:D:E:", long_options, &option_index); if (key == -1) { break; } switch (key) { case 'a': logLevelString = stString_copy(optarg); st_setLogLevelFromString(logLevelString); break; case 'b': alignmentsFile = stString_copy(optarg); break; case 'c': cactusDiskDatabaseString = stString_copy(optarg); break; case 'd': lastzArguments = stString_copy(optarg); break; case 'h': usage(); 
return 0; case 'i': annealingRounds = getInts(optarg, &annealingRoundsLength); break; case 'o': meltingRounds = getInts(optarg, &meltingRoundsLength); break; case 'k': alignmentTrims = getInts(optarg, &alignmentTrimLength); break; case 'm': k = sscanf(optarg, "%f", &minimumTreeCoverage); assert(k == 1); break; case 'n': k = sscanf(optarg, "%" PRIi64 "", &blockTrim); assert(k == 1); break; case 'p': k = sscanf(optarg, "%" PRIi64 "", &minimumDegree); assert(k == 1); break; case 'q': k = sscanf(optarg, "%" PRIi64 "", &minimumIngroupDegree); assert(k == 1); break; case 'r': k = sscanf(optarg, "%" PRIi64 "", &minimumOutgroupDegree); assert(k == 1); break; case 't': if (strcmp(optarg, "singleCopyOutgroup") == 0) { sortAlignments = true; filterFn = stCaf_filterByOutgroup; } else if (strcmp(optarg, "relaxedSingleCopyOutgroup") == 0) { sortAlignments = true; filterFn = stCaf_relaxedFilterByOutgroup; } else if (strcmp(optarg, "singleCopy") == 0) { sortAlignments = true; filterFn = stCaf_filterByRepeatSpecies; } else if (strcmp(optarg, "relaxedSingleCopy") == 0) { sortAlignments = true; filterFn = stCaf_relaxedFilterByRepeatSpecies; } else if (strcmp(optarg, "singleCopyChr") == 0) { sortAlignments = true; filterFn = stCaf_singleCopyChr; } else if (strcmp(optarg, "singleCopyIngroup") == 0) { sortAlignments = true; filterFn = stCaf_singleCopyIngroup; } else if (strcmp(optarg, "relaxedSingleCopyIngroup") == 0) { sortAlignments = true; filterFn = stCaf_relaxedSingleCopyIngroup; } else if (strcmp(optarg, "none") == 0) { sortAlignments = false; filterFn = NULL; } else { st_errAbort("Could not recognize alignmentFilter option %s", optarg); } break; case 'v': k = sscanf(optarg, "%" PRIi64 "", &minimumSequenceLengthForBlast); assert(k == 1); break; case 'w': k = sscanf(optarg, "%f", &maximumAdjacencyComponentSizeRatio); assert(k == 1); break; case 'x': constraintsFile = stString_copy(optarg); break; case 'y': k = sscanf(optarg, "%" PRIi64 "", &minLengthForChromosome); assert(k == 1); 
break; case 'z': k = sscanf(optarg, "%f", &proportionOfUnalignedBasesForNewChromosome); assert(k == 1); break; case 'A': k = sscanf(optarg, "%" PRIi64 "", &maximumMedianSequenceLengthBetweenLinkedEnds); assert(k == 1); break; case 'B': realign = 1; break; case 'C': realignArguments = stString_copy(optarg); break; case 'D': k = sscanf(optarg, "%" PRIi64, &phylogenyNumTrees); assert(k == 1); break; case 'E': if (!strcmp(optarg, "outgroupBranch")) { phylogenyRootingMethod = OUTGROUP_BRANCH; } else if (!strcmp(optarg, "longestBranch")) { phylogenyRootingMethod = LONGEST_BRANCH; } else if (!strcmp(optarg, "bestRecon")) { phylogenyRootingMethod = BEST_RECON; } else { st_errAbort("Invalid tree rooting method: %s", optarg); } break; case 'F': if (!strcmp(optarg, "reconCost")) { phylogenyScoringMethod = RECON_COST; } else if (!strcmp(optarg, "nucLikelihood")) { phylogenyScoringMethod = NUCLEOTIDE_LIKELIHOOD; } else if (!strcmp(optarg, "reconLikelihood")) { phylogenyScoringMethod = RECON_LIKELIHOOD; } else if (!strcmp(optarg, "combinedLikelihood")) { phylogenyScoringMethod = COMBINED_LIKELIHOOD; } else { st_errAbort("Invalid tree scoring method: %s", optarg); } break; case 'G': k = sscanf(optarg, "%lf", &breakpointScalingFactor); assert(k == 1); break; case 'H': phylogenySkipSingleCopyBlocks = true; break; case 'I': k = sscanf(optarg, "%" PRIi64, &phylogenyMaxBaseDistance); assert(k == 1); break; case 'J': k = sscanf(optarg, "%" PRIi64, &phylogenyMaxBlockDistance); assert(k == 1); break; case 'K': debugFileName = stString_copy(optarg); break; case 'L': phylogenyKeepSingleDegreeBlocks = true; break; case 'M': // clear the default setting of the list stList_destruct(phylogenyTreeBuildingMethods); phylogenyTreeBuildingMethods = stList_construct(); stList *methodStrings = stString_splitByString(optarg, ","); for (int64_t i = 0; i < stList_length(methodStrings); i++) { char *methodString = stList_get(methodStrings, i); enum stCaf_TreeBuildingMethod *method = st_malloc(sizeof(enum 
stCaf_TreeBuildingMethod)); if (strcmp(methodString, "neighborJoining") == 0) { *method = NEIGHBOR_JOINING; } else if (strcmp(methodString, "guidedNeighborJoining") == 0) { *method = GUIDED_NEIGHBOR_JOINING; } else if (strcmp(methodString, "splitDecomposition") == 0) { *method = SPLIT_DECOMPOSITION; } else if (strcmp(methodString, "strictSplitDecomposition") == 0) { *method = STRICT_SPLIT_DECOMPOSITION; } else if (strcmp(methodString, "removeBadChains") == 0) { *method = REMOVE_BAD_CHAINS; } else { st_errAbort("Unknown tree building method: %s", methodString); } stList_append(phylogenyTreeBuildingMethods, method); } stList_destruct(methodStrings); break; case 'N': k = sscanf(optarg, "%lf", &phylogenyCostPerDupPerBase); assert(k == 1); break; case 'O': k = sscanf(optarg, "%lf", &phylogenyCostPerLossPerBase); assert(k == 1); break; case 'P': referenceEventHeader = stString_copy(optarg); break; case 'Q': k = sscanf(optarg, "%lf", &phylogenyDoSplitsWithSupportHigherThanThisAllAtOnce); assert(k == 1); break; case 'R': k = sscanf(optarg, "%" PRIi64, &numTreeBuildingThreads); assert(k == 1); break; case 'S': doPhylogeny = true; break; case 'T': k = sscanf(optarg, "%lf", &minimumBlockHomologySupport); assert(k == 1); assert(minimumBlockHomologySupport <= 1.0); assert(minimumBlockHomologySupport >= 0.0); break; case 'U': k = sscanf(optarg, "%lf", &nucleotideScalingFactor); assert(k == 1); break; case 'V': k = sscanf(optarg, "%" PRIi64, &minimumBlockDegreeToCheckSupport); assert(k == 1); break; case 'W': if (strcmp(optarg, "1") == 0) { removeRecoverableChains = true; recoverableChainsFilter = NULL; } else if (strcmp(optarg, "unequalNumberOfIngroupCopies") == 0) { removeRecoverableChains = true; recoverableChainsFilter = stCaf_chainHasUnequalNumberOfIngroupCopies; } else if (strcmp(optarg, "unequalNumberOfIngroupCopiesOrNoOutgroup") == 0) { removeRecoverableChains = true; recoverableChainsFilter = stCaf_chainHasUnequalNumberOfIngroupCopiesOrNoOutgroup; } else if 
(strcmp(optarg, "0") == 0) { removeRecoverableChains = false; } else { st_errAbort("Could not parse removeRecoverableChains argument"); } break; case 'X': k = sscanf(optarg, "%" PRIi64, &minimumNumberOfSpecies); if (k != 1) { st_errAbort("Error parsing the minimumNumberOfSpecies argument"); } break; case 'Y': if (strcmp(optarg, "chain") == 0) { phylogenyHomologyUnitType = CHAIN; } else if (strcmp(optarg, "block") == 0) { phylogenyHomologyUnitType = BLOCK; } else { st_errAbort("Could not parse the phylogenyHomologyUnitType argument"); } break; case 'Z': if (strcmp(optarg, "jukesCantor") == 0) { phylogenyDistanceCorrectionMethod = JUKES_CANTOR; } else if (strcmp(optarg, "none") == 0 ) { phylogenyDistanceCorrectionMethod = NONE; } else { st_errAbort("Could not parse the phylogenyDistanceCorrectionMethod argument"); } break; case '1': k = sscanf(optarg, "%" PRIi64, &maxRecoverableChainsIterations); if (k != 1) { st_errAbort("Error parsing the maxRecoverableChainsIterations argument"); } break; case '2': k = sscanf(optarg, "%" PRIi64, &maxRecoverableChainLength); if (k != 1) { st_errAbort("Error parsing the maxRecoverableChainLength argument"); } break; default: usage(); return 1; } } /////////////////////////////////////////////////////////////////////////// // (0) Check the inputs. 
/////////////////////////////////////////////////////////////////////////// assert(cactusDiskDatabaseString != NULL); assert(minimumTreeCoverage >= 0.0); assert(minimumTreeCoverage <= 1.0); assert(blockTrim >= 0); assert(annealingRoundsLength >= 0); for (int64_t i = 0; i < annealingRoundsLength; i++) { assert(annealingRounds[i] >= 0); } assert(meltingRoundsLength >= 0); for (int64_t i = 1; i < meltingRoundsLength; i++) { assert(meltingRounds[i - 1] < meltingRounds[i]); assert(meltingRounds[i - 1] >= 1); } assert(alignmentTrimLength >= 0); for (int64_t i = 0; i < alignmentTrimLength; i++) { assert(alignmentTrims[i] >= 0); } assert(minimumOutgroupDegree >= 0); assert(minimumIngroupDegree >= 0); ////////////////////////////////////////////// //Set up logging ////////////////////////////////////////////// st_setLogLevelFromString(logLevelString); ////////////////////////////////////////////// //Log (some of) the inputs ////////////////////////////////////////////// st_logInfo("Flower disk name : %s\n", cactusDiskDatabaseString); ////////////////////////////////////////////// //Load the database ////////////////////////////////////////////// kvDatabaseConf = stKVDatabaseConf_constructFromString(cactusDiskDatabaseString); cactusDisk = cactusDisk_construct(kvDatabaseConf, 0); st_logInfo("Set up the flower disk\n"); /////////////////////////////////////////////////////////////////////////// // Sort the constraints /////////////////////////////////////////////////////////////////////////// stPinchIterator *pinchIteratorForConstraints = NULL; if (constraintsFile != NULL) { pinchIteratorForConstraints = stPinchIterator_constructFromFile(constraintsFile); st_logInfo("Created an iterator for the alignment constaints from file: %s\n", constraintsFile); } /////////////////////////////////////////////////////////////////////////// // Do the alignment /////////////////////////////////////////////////////////////////////////// startTime = time(NULL); stList *flowers = 
flowerWriter_parseFlowersFromStdin(cactusDisk); if (alignmentsFile == NULL) { cactusDisk_preCacheStrings(cactusDisk, flowers); } char *tempFile1 = NULL; for (int64_t i = 0; i < stList_length(flowers); i++) { flower = stList_get(flowers, i); if (!flower_builtBlocks(flower)) { // Do nothing if the flower already has defined blocks st_logDebug("Processing flower: %lli\n", flower_getName(flower)); stCaf_setFlowerForAlignmentFiltering(flower); //Set up the graph and add the initial alignments stPinchThreadSet *threadSet = stCaf_setup(flower); //Build the set of outgroup threads outgroupThreads = stCaf_getOutgroupThreads(flower, threadSet); //Setup the alignments stPinchIterator *pinchIterator; stList *alignmentsList = NULL; if (alignmentsFile != NULL) { assert(i == 0); assert(stList_length(flowers) == 1); if (sortAlignments) { tempFile1 = getTempFile(); stCaf_sortCigarsFileByScoreInDescendingOrder(alignmentsFile, tempFile1); pinchIterator = stPinchIterator_constructFromFile(tempFile1); } else { pinchIterator = stPinchIterator_constructFromFile(alignmentsFile); } } else { if (tempFile1 == NULL) { tempFile1 = getTempFile(); } alignmentsList = stCaf_selfAlignFlower(flower, minimumSequenceLengthForBlast, lastzArguments, realign, realignArguments, tempFile1); if (sortAlignments) { stCaf_sortCigarsByScoreInDescendingOrder(alignmentsList); } st_logDebug("Ran lastz and have %" PRIi64 " alignments\n", stList_length(alignmentsList)); pinchIterator = stPinchIterator_constructFromList(alignmentsList); } for (int64_t annealingRound = 0; annealingRound < annealingRoundsLength; annealingRound++) { int64_t minimumChainLength = annealingRounds[annealingRound]; int64_t alignmentTrim = annealingRound < alignmentTrimLength ? 
alignmentTrims[annealingRound] : 0; st_logDebug("Starting annealing round with a minimum chain length of %" PRIi64 " and an alignment trim of %" PRIi64 "\n", minimumChainLength, alignmentTrim); stPinchIterator_setTrim(pinchIterator, alignmentTrim); //Add back in the constraints if (pinchIteratorForConstraints != NULL) { stCaf_anneal(threadSet, pinchIteratorForConstraints, filterFn); } //Do the annealing if (annealingRound == 0) { stCaf_anneal(threadSet, pinchIterator, filterFn); } else { stCaf_annealBetweenAdjacencyComponents(threadSet, pinchIterator, filterFn); } // Dump the block degree and length distribution to a file if (debugFileName != NULL) { dumpBlockInfo(threadSet, stString_print("%s-blockStats-preMelting", debugFileName)); } printf("Sequence graph statistics after annealing:\n"); printThreadSetStatistics(threadSet, flower, stdout); // Check for poorly-supported blocks--those that have // been transitively aligned together but with very // few homologies supporting the transitive // alignment. These "megablocks" can snarl up the // graph so that a lot of extra gets thrown away in // the first melting step. 
stPinchThreadSetBlockIt blockIt = stPinchThreadSet_getBlockIt(threadSet); stPinchBlock *block; while ((block = stPinchThreadSetBlockIt_getNext(&blockIt)) != NULL) { if (stPinchBlock_getDegree(block) > minimumBlockDegreeToCheckSupport) { uint64_t supportingHomologies = stPinchBlock_getNumSupportingHomologies(block); uint64_t possibleSupportingHomologies = numPossibleSupportingHomologies(block, flower); double support = ((double) supportingHomologies) / possibleSupportingHomologies; if (support < minimumBlockHomologySupport) { fprintf(stdout, "Destroyed a megablock with degree %" PRIi64 " and %" PRIi64 " supporting homologies out of a maximum " "of %" PRIi64 " (%lf%%).\n", stPinchBlock_getDegree(block), supportingHomologies, possibleSupportingHomologies, support); stPinchBlock_destruct(block); } } } //Do the melting rounds for (int64_t meltingRound = 0; meltingRound < meltingRoundsLength; meltingRound++) { int64_t minimumChainLengthForMeltingRound = meltingRounds[meltingRound]; st_logDebug("Starting melting round with a minimum chain length of %" PRIi64 " \n", minimumChainLengthForMeltingRound); if (minimumChainLengthForMeltingRound >= minimumChainLength) { break; } stCaf_melt(flower, threadSet, NULL, 0, minimumChainLengthForMeltingRound, 0, INT64_MAX); } st_logDebug("Last melting round of cycle with a minimum chain length of %" PRIi64 " \n", minimumChainLength); stCaf_melt(flower, threadSet, NULL, 0, minimumChainLength, breakChainsAtReverseTandems, maximumMedianSequenceLengthBetweenLinkedEnds); //This does the filtering of blocks that do not have the required species/tree-coverage/degree. 
stCaf_melt(flower, threadSet, blockFilterFn, blockTrim, 0, 0, INT64_MAX); } if (removeRecoverableChains) { stCaf_meltRecoverableChains(flower, threadSet, breakChainsAtReverseTandems, maximumMedianSequenceLengthBetweenLinkedEnds, recoverableChainsFilter, maxRecoverableChainsIterations, maxRecoverableChainLength); } if (debugFileName != NULL) { dumpBlockInfo(threadSet, stString_print("%s-blockStats-postMelting", debugFileName)); } printf("Sequence graph statistics after melting:\n"); printThreadSetStatistics(threadSet, flower, stdout); // Build a tree for each block, then use each tree to // partition the homologies between the ingroups sequences // into those that occur before the speciation with the // outgroup and those which occur late. if (stSet_size(outgroupThreads) > 0 && doPhylogeny) { st_logDebug("Starting to build trees and partition ingroup homologies\n"); stHash *threadStrings = stCaf_getThreadStrings(flower, threadSet); st_logDebug("Got sets of thread strings and set of threads that are outgroups\n"); stCaf_PhylogenyParameters params; params.distanceCorrectionMethod = phylogenyDistanceCorrectionMethod; params.treeBuildingMethods = phylogenyTreeBuildingMethods; params.rootingMethod = phylogenyRootingMethod; params.scoringMethod = phylogenyScoringMethod; params.breakpointScalingFactor = breakpointScalingFactor; params.nucleotideScalingFactor = nucleotideScalingFactor; params.skipSingleCopyBlocks = phylogenySkipSingleCopyBlocks; params.keepSingleDegreeBlocks = phylogenyKeepSingleDegreeBlocks; params.costPerDupPerBase = phylogenyCostPerDupPerBase; params.costPerLossPerBase = phylogenyCostPerLossPerBase; params.maxBaseDistance = phylogenyMaxBaseDistance; params.maxBlockDistance = phylogenyMaxBlockDistance; params.numTrees = phylogenyNumTrees; params.ignoreUnalignedBases = 1; params.onlyIncludeCompleteFeatureBlocks = 0; params.doSplitsWithSupportHigherThanThisAllAtOnce = phylogenyDoSplitsWithSupportHigherThanThisAllAtOnce; params.numTreeBuildingThreads = 
numTreeBuildingThreads; assert(params.numTreeBuildingThreads >= 1); stCaf_buildTreesToRemoveAncientHomologies( threadSet, phylogenyHomologyUnitType, threadStrings, outgroupThreads, flower, ¶ms, debugFileName == NULL ? NULL : stString_print("%s-phylogeny", debugFileName), referenceEventHeader); stHash_destruct(threadStrings); st_logDebug("Finished building trees\n"); if (removeRecoverableChains) { // We melt recoverable chains after splitting, as // well as before, to alleviate coverage loss // caused by bad splits. stCaf_meltRecoverableChains(flower, threadSet, breakChainsAtReverseTandems, maximumMedianSequenceLengthBetweenLinkedEnds, recoverableChainsFilter, maxRecoverableChainsIterations, maxRecoverableChainLength); } // Enforce the block constraints on minimum degree, // etc. after splitting. stCaf_melt(flower, threadSet, blockFilterFn, 0, 0, 0, INT64_MAX); } //Sort out case when we allow blocks of degree 1 if (minimumDegree < 2) { st_logDebug("Creating degree 1 blocks\n"); stCaf_makeDegreeOneBlocks(threadSet); stCaf_melt(flower, threadSet, blockFilterFn, blockTrim, 0, 0, INT64_MAX); } else if (maximumAdjacencyComponentSizeRatio < INT64_MAX) { //Deal with giant components st_logDebug("Breaking up components greedily\n"); stCaf_breakupComponentsGreedily(threadSet, maximumAdjacencyComponentSizeRatio); } //Finish up stCaf_finish(flower, threadSet, chainLengthForBigFlower, longChain, minLengthForChromosome, proportionOfUnalignedBasesForNewChromosome); //Flower is then destroyed at this point. 
st_logInfo("Ran the cactus core script\n"); //Cleanup stPinchThreadSet_destruct(threadSet); stPinchIterator_destruct(pinchIterator); stSet_destruct(outgroupThreads); if (alignmentsList != NULL) { stList_destruct(alignmentsList); } st_logInfo("Cleaned up from main loop\n"); } else { st_logInfo("We've already built blocks / alignments for this flower\n"); } } stList_destruct(flowers); if (tempFile1 != NULL) { st_system("rm %s", tempFile1); } if (constraintsFile != NULL) { stPinchIterator_destruct(pinchIteratorForConstraints); } /////////////////////////////////////////////////////////////////////////// // Write the flower to disk. /////////////////////////////////////////////////////////////////////////// st_logDebug("Writing the flowers to disk\n"); cactusDisk_write(cactusDisk); st_logInfo("Updated the flower on disk and %" PRIi64 " seconds have elapsed\n", time(NULL) - startTime); /////////////////////////////////////////////////////////////////////////// // Clean up. /////////////////////////////////////////////////////////////////////////// cactusDisk_destruct(cactusDisk); }
/*
 * Destroys a naive connectivity structure: invalidates its cache, destructs
 * the nodesToAdjList hash and then frees the struct itself.
 */
void stNaiveConnectivity_destruct(stNaiveConnectivity *connectivity) {
    invalidateCache(connectivity);

    // Read the member only after the cache has been invalidated, matching
    // the order in which the struct is torn down.
    stHash *adjacencyLists = connectivity->nodesToAdjList;
    stHash_destruct(adjacencyLists);

    free(connectivity);
}
/*
 * Destroys a set: destructs the hash that backs it, then frees the set
 * struct itself.
 */
void stSet_destruct(stSet *set) {
    stHash *backingHash = set->hash;
    stHash_destruct(backingHash);
    free(set);
}
/*
 * Concatenation with a sequence breakpoint caused by *strand* alone.
 *
 * name3 is well within the interstitial boundary; the two blocks essentially
 * look like >>>>>>>>>>>>> <<<<< (strand differences), so the second block is
 * on the '-' strand while the first is on '+'.
 */
static void test_addBlockToHash_4(CuTest *testCase) {
    options_t *opts = options_construct();
    opts->breakpointPenalty = 10;
    opts->interstitialSequence = 5;

    stList *observedList = stList_construct3(0, free);
    stList *expectedList = stList_construct3(0, free);

    // Row hash as it stands before the new block is added.
    stHash *observedHash = createBlockHashFromString(
            "a score=0\n"
            "s reference.chr0 0 13 + 158545518 gcagctgaaaaca\n"
            "s name.chr1 0 10 + 100 ATGT---ATGCCG\n"
            "s name2.chr1 0 10 + 100 ATGT---ATGCCG\n"
            "s name3.chr1 0 13 + 100 GCAGCTGAAAACA\n",
            observedList);

    // The block to be appended; name3 switches to the '-' strand here.
    mafBlock_t *block = maf_newMafBlockListFromString(
            "a score=0 test\n"
            "s reference.chr0 13 5 + 158545518 ACGTA\n"
            "s name.chr1 12 5 + 100 gtcGG\n"
            "s name2.chr1 10 5 + 100 ATGTg\n"
            "s name3.chr1 82 5 - 100 GGGGG\n",
            3);

    // Row hash expected once the block has been added.
    stHash *expectedHash = createBlockHashFromString(
            "a score=0\n"
            "s reference.chr0 0 18 + 158545518 gcagctgaaaaca------------ACGTA\n"
            "s name.chr1 0 17 + 100 ATGT---ATGCCGac----------gtcGG\n"
            "s name2.chr1 0 15 + 100 ATGT---ATGCCG------------ATGTg\n"
            "s name3.chr1 0 28 + 100 GCAGCTGAAAACA--NNNNNNNNNNGGGGG\n",
            expectedList);

    // Patch the bookkeeping fields of the expected name3 row by hand.
    row_t *expectedRow = stHash_search(expectedHash, "name3");
    expectedRow->prevRightPos = 86;
    expectedRow->strand = '*';
    expectedRow->prevStrand = '-';

    // Source sequences, keyed on the full sequence name.
    stHash *seqHash = createSeqHashFromString("name.chr1",
            "ATGTATGCCGacgtc"
            "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG"
            "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG");

    mtfseq_t *referenceSeq = newMtfseqFromString(
            "gcagctgaaaacaACGTA"
            "tttttttttttttttttttttttttttttttt"
            "tttttttttttttttttttttttttttttttttttttttttttttttttt");
    stHash_insert(seqHash, stString_copy("reference.chr0"), referenceSeq);

    mtfseq_t *name2Seq = newMtfseqFromString(
            "ATGTATGCCGATGTg"
            "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC"
            "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC");
    stHash_insert(seqHash, stString_copy("name2.chr1"), name2Seq);

    mtfseq_t *name3Seq = newMtfseqFromString(
            "GCAGCTGAAAACACCCCCgggggggggggggggggggggggggggggggg"
            "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa");
    stHash_insert(seqHash, stString_copy("name3.chr1"), name3Seq);

    // Exercise the code under test and compare against the expectation.
    addMafBlockToRowHash(observedHash, seqHash, observedList, block, opts);
    CuAssertTrue(testCase, hashesAreEqual(observedHash, expectedHash));
    CuAssertTrue(testCase, listsAreEqual(observedList, expectedList));

    // Clean up.
    stHash_destruct(observedHash);
    stHash_destruct(expectedHash);
    stHash_destruct(seqHash);
    stList_destruct(observedList);
    stList_destruct(expectedList);
    maf_destroyMafBlockList(block);
    destroyOptions(opts);
}
/*
 * Builds an alignment of the adjacency sequences attached to the caps of
 * `end` and returns it as a sorted set of AlignedPair objects (each pair is
 * inserted together with its ->reverse).
 *
 * Steps:
 *   1. Collect one AdjacencySequence / SeqFrag per cap of the end and count
 *      how many seqFrags share each opposite end.
 *   2. Run makeAlignment over the seqFrags.
 *   3. Count, per sequence, the chosen pairwise alignments, separately for
 *      partners with the same rightEndId and partners with a different one.
 *   4. Derive a per-sequence score multiplier for each of the two cases.
 *   5. Convert every aligned-pair tuple into a reweighted AlignedPair.
 *
 * The returned set is constructed with alignedPair_destruct as its element
 * destructor; all intermediate structures are released before returning.
 */
stSortedSet *makeEndAlignment(StateMachine *sM, End *end, int64_t spanningTrees, int64_t maxSequenceLength,
        bool useProgressiveMerging, float gapGamma,
        PairwiseAlignmentParameters *pairwiseAlignmentBandingParameters) {
    stList *adjacencySeqs = stList_construct3(0, (void (*)(void *)) adjacencySequence_destruct);
    stList *seqFrags = stList_construct3(0, (void (*)(void *)) seqFrag_destruct);
    // Opposite end -> number of seqFrags whose adjacency leads to that end.
    stHash *endInstanceNumbers = stHash_construct2(NULL, free);

    // Step 1: gather the adjacency sequences to be aligned.
    End_InstanceIterator *capIt = end_getInstanceIterator(end);
    for (Cap *cap = end_getNext(capIt); cap != NULL; cap = end_getNext(capIt)) {
        // Caps for which cap_getSide() is true are replaced by their reverse.
        Cap *orientedCap = cap_getSide(cap) ? cap_getReverse(cap) : cap;

        AdjacencySequence *adjSeq = adjacencySequence_construct(orientedCap, maxSequenceLength);
        stList_append(adjacencySeqs, adjSeq);

        assert(cap_getAdjacency(orientedCap) != NULL);
        End *farEnd = end_getPositiveOrientation(cap_getEnd(cap_getAdjacency(orientedCap)));
        stList_append(seqFrags, seqFrag_construct(adjSeq->string, 0, end_getName(farEnd)));

        // Bump the number of seqFrags seen for this opposite end.
        int64_t *instanceCount = stHash_search(endInstanceNumbers, farEnd);
        if (instanceCount == NULL) {
            instanceCount = st_calloc(1, sizeof(int64_t));
            assert(*instanceCount == 0);
            stHash_insert(endInstanceNumbers, farEnd, instanceCount);
        }
        *instanceCount += 1;
    }
    end_destructInstanceIterator(capIt);

    // Step 2: compute the multiple alignment.
    MultipleAlignment *mA = makeAlignment(sM, seqFrags, spanningTrees, 100000000, useProgressiveMerging,
            gapGamma, pairwiseAlignmentBandingParameters);

    const int64_t seqFragCount = stList_length(seqFrags);

    // Step 3: per-sequence tallies of chosen pairwise alignments, split by
    // whether the two sequences share the same right end.
    int64_t *alignmentsNonCommonEnds = st_calloc(seqFragCount, sizeof(int64_t));
    int64_t *alignmentsCommonEnds = st_calloc(seqFragCount, sizeof(int64_t));
    for (int64_t k = 0; k < stList_length(mA->chosenPairwiseAlignments); k++) {
        stIntTuple *chosenAlignment = stList_get(mA->chosenPairwiseAlignments, k);
        int64_t seq1 = stIntTuple_get(chosenAlignment, 1);
        int64_t seq2 = stIntTuple_get(chosenAlignment, 2);
        assert(seq1 != seq2);
        SeqFrag *frag1 = stList_get(seqFrags, seq1);
        SeqFrag *frag2 = stList_get(seqFrags, seq2);
        if (frag1->rightEndId == frag2->rightEndId) {
            alignmentsCommonEnds[seq1]++;
            alignmentsCommonEnds[seq2]++;
        } else {
            alignmentsNonCommonEnds[seq1]++;
            alignmentsNonCommonEnds[seq2]++;
        }
    }

    // Step 4: score multipliers. INT64_MIN marks "no alignment of this kind
    // for this sequence"; such an entry must never be used in step 5.
    double *weightsNonCommonEnds = st_malloc(seqFragCount * sizeof(double));
    double *weightsCommonEnds = st_malloc(seqFragCount * sizeof(double));
    for (int64_t k = 0; k < seqFragCount; k++) {
        SeqFrag *seqFrag = stList_get(seqFrags, k);
        End *farEnd = flower_getEnd(end_getFlower(end), seqFrag->rightEndId);
        assert(farEnd != NULL);
        int64_t *instanceCount = stHash_search(endInstanceNumbers, farEnd);
        assert(instanceCount != NULL);

        int64_t commonInstanceNumber = *instanceCount;
        int64_t nonCommonInstanceNumber = seqFragCount - commonInstanceNumber;
        int64_t alignedNonCommon = alignmentsNonCommonEnds[k];
        int64_t alignedCommon = alignmentsCommonEnds[k];
        assert(commonInstanceNumber > 0 && nonCommonInstanceNumber >= 0);
        assert(alignedNonCommon <= nonCommonInstanceNumber);
        assert(alignedNonCommon >= 0);
        assert(alignedCommon < commonInstanceNumber);
        assert(alignedCommon >= 0);

        if (alignedNonCommon > 0) {
            weightsNonCommonEnds[k] = ((double) nonCommonInstanceNumber) / alignedNonCommon;
            assert(weightsNonCommonEnds[k] >= 1.0);
            assert(weightsNonCommonEnds[k] <= nonCommonInstanceNumber);
        } else {
            weightsNonCommonEnds[k] = INT64_MIN;
        }

        if (alignedCommon > 0) {
            // The sequence itself is excluded from its own common-end group.
            weightsCommonEnds[k] = ((double) commonInstanceNumber - 1) / alignedCommon;
            assert(weightsCommonEnds[k] >= 1.0);
            assert(weightsCommonEnds[k] <= commonInstanceNumber - 1);
        } else {
            weightsCommonEnds[k] = INT64_MIN;
        }
    }

    // Step 5: convert the alignment pairs to an alignment of the caps.
    // Each tuple is read as (score, seqIndex1, offset1, seqIndex2, offset2).
    stSortedSet *sortedAlignment = stSortedSet_construct3(
            (int (*)(const void *, const void *)) alignedPair_cmpFn,
            (void (*)(void *)) alignedPair_destruct);
    while (stList_length(mA->alignedPairs) > 0) {
        stIntTuple *pairTuple = stList_pop(mA->alignedPairs);
        assert(stIntTuple_length(pairTuple) == 5);

        int64_t seqIndex1 = stIntTuple_get(pairTuple, 1);
        int64_t seqIndex2 = stIntTuple_get(pairTuple, 3);
        AdjacencySequence *adjSeq1 = stList_get(adjacencySeqs, seqIndex1);
        AdjacencySequence *adjSeq2 = stList_get(adjacencySeqs, seqIndex2);
        assert(adjSeq1 != adjSeq2);
        int64_t offset1 = stIntTuple_get(pairTuple, 2);
        int64_t offset2 = stIntTuple_get(pairTuple, 4);

        int64_t score = stIntTuple_get(pairTuple, 0);
        if (score <= 0) {
            // Happens when indel probs are included; 1 is the minimum.
            score = 1;
        }
        assert(score > 0 && score <= PAIR_ALIGNMENT_PROB_1);

        SeqFrag *frag1 = stList_get(seqFrags, seqIndex1);
        SeqFrag *frag2 = stList_get(seqFrags, seqIndex2);
        assert(frag1 != frag2);

        double *weights;
        if (frag1->rightEndId == frag2->rightEndId) {
            weights = weightsCommonEnds;
        } else {
            weights = weightsNonCommonEnds;
        }
        assert(weights[seqIndex1] != INT64_MIN);
        assert(weights[seqIndex2] != INT64_MIN);

        // The reweighting happens here: each side's score is scaled by the
        // multiplier of its own sequence.
        AlignedPair *reweightedPair = alignedPair_construct(
                adjSeq1->subsequenceIdentifier,
                adjSeq1->start + (adjSeq1->strand ? offset1 : -offset1),
                adjSeq1->strand,
                adjSeq2->subsequenceIdentifier,
                adjSeq2->start + (adjSeq2->strand ? offset2 : -offset2),
                adjSeq2->strand,
                score * weights[seqIndex1],
                score * weights[seqIndex2]);
        assert(stSortedSet_search(sortedAlignment, reweightedPair) == NULL);
        assert(stSortedSet_search(sortedAlignment, reweightedPair->reverse) == NULL);
        stSortedSet_insert(sortedAlignment, reweightedPair);
        stSortedSet_insert(sortedAlignment, reweightedPair->reverse);

        stIntTuple_destruct(pairTuple);
    }

    // Cleanup
    stList_destruct(seqFrags);
    stList_destruct(adjacencySeqs);
    free(alignmentsNonCommonEnds);
    free(alignmentsCommonEnds);
    free(weightsNonCommonEnds);
    free(weightsCommonEnds);
    multipleAlignment_destruct(mA);
    stHash_destruct(endInstanceNumbers);

    return sortedAlignment;
}
/*
 * Constructs a face locally from a given Cap but without precomputed
 * liftedEdges.
 *
 * A new Face is created in `flower`. The top nodes reachable from
 * `startingCap`, together with their lifted edges, are collected by
 * buildFaces_fillTopNodeList2; each top node is then registered in the face
 * along with its bottom nodes and, for every bottom node, the derived
 * destination (NULL when the ancestor of the bottom node's adjacency is the
 * top node's own adjacency).
 *
 * The temporary top-node list and lifted-edges table are destroyed before
 * returning; the face itself is not returned to the caller.
 */
void buildFaces_reconstructFromCap(Cap * startingCap, Flower * flower) {
    Face *face = face_construct(flower);
    stList *liftedEdges;
    // The list must start EMPTY. NOTE(review): stList_construct3's first
    // argument is understood to be the initial length (elements set to NULL),
    // not a capacity hint; a non-zero value would seed the list with NULL
    // "top nodes" that the loop below would then process as real caps.
    // Confirm against the sonLib list implementation.
    stList *topNodes = stList_construct3(0, NULL);
    stHash *liftedEdgesTable = stHash_construct3(buildFaces_hashfunction,
            buildFaces_key_eq_fn, NULL, buildFaces_destructValue);
    Cap *cap, *bottomNode, *ancestor;
    int64_t index, index2;

    printf("Constructing new face");

    // Establish the list of top nodes and fill the liftedEdges table
    buildFaces_fillTopNodeList2(startingCap, topNodes, liftedEdgesTable);

    // What, no top nodes!?
    assert(stList_length(topNodes));

    // Initialize data structure
    face_allocateSpace(face, stList_length(topNodes));

    // For every top node
    for (index = 0; index < stList_length(topNodes); index++) {
        cap = stList_get(topNodes, index);
        face_setTopNode(face, index, cap);
        liftedEdges = stHash_search(liftedEdgesTable, cap);

        if (!liftedEdges) {
            // Top node without any lifted edge: it has no bottom nodes.
            face_setBottomNodeNumber(face, index, 0);
            continue;
        }

        face_setBottomNodeNumber(face, index, stList_length(liftedEdges));

        // For every bottom node of that top node
        for (index2 = 0; index2 < stList_length(liftedEdges); index2++) {
            bottomNode = ((LiftedEdge *) stList_get(liftedEdges, index2))->bottomNode;
            face_addBottomNode(face, index, bottomNode);

            assert(cap_getAdjacency(bottomNode));
            ancestor = cap_getTopCap(cap_getPositiveOrientation(
                    cap_getAdjacency(bottomNode)));
            if (cap_getAdjacency(cap) != ancestor) {
                face_setDerivedDestination(face, index, index2, ancestor);
            } else {
                face_setDerivedDestination(face, index, index2, NULL);
            }

#ifndef NDEBUG
            // A bottom node must never also be one of the top nodes
            if (stList_contains(topNodes, cap_getPositiveOrientation(bottomNode))) {
                abort();
            }
#endif
        }
    }

    // Clean up
    stList_destruct(topNodes);
    stHash_destruct(liftedEdgesTable);
}