// Returns a hash mapping from sequence header to sequence data. static stHash *readFastaFile(char *filename) { FILE *fasta = fopen(filename, "r"); if (fasta == NULL) { st_errnoAbort("Could not open fasta file %s", filename); } stHash *headerToData = stHash_construct3(stHash_stringKey, stHash_stringEqualKey, free, free); struct List *seqs = constructEmptyList(0, NULL); struct List *seqLengths = constructEmptyList(0, free); struct List *headers = constructEmptyList(0, free); fastaRead(fasta, seqs, seqLengths, headers); for (int64_t i = 0; i < seqs->length; i++) { char *fullHeader = headers->list[i]; stList *headerTokens = stString_splitByString(fullHeader, " "); char *usableHeader = stString_copy(stList_get(headerTokens, 0)); stHash_insert(headerToData, usableHeader, seqs->list[i]); stList_destruct(headerTokens); } destructList(seqs); destructList(seqLengths); destructList(headers); return headerToData; }
int main(int argc, char **argv) { options_t *options = options_construct(); stHash *sequenceHash = NULL; // keyed on fasta headers, valued with mtfseq_t pointers stHash *alignmentHash = stHash_construct3(stHash_stringKey, stHash_stringEqualKey, free, destroyRow); // keyed on species names, valued with row_t pointers stList *rowOrder = stList_construct3(0, free); // when adding keys to alignmentHash, append to this list parseOptions(argc, argv, options); // read fastas, populate sequenceHash de_verbose("Creating sequence hash.\n"); sequenceHash = createSequenceHash(options->seqs); mafFileApi_t *mfapi = maf_newMfa(options->maf, "r"); de_verbose("Creating alignment hash.\n"); buildAlignmentHash(mfapi, alignmentHash, sequenceHash, rowOrder, options); if (options->outMfa != NULL) { // fasta output de_verbose("Writing fasta output.\n"); writeFastaOut(alignmentHash, rowOrder, options); } if (options->outMaf != NULL) { // maf output de_verbose("Writing maf output.\n"); writeMafOut(alignmentHash, rowOrder, options); } // cleanup maf_destroyMfa(mfapi); stHash_destruct(alignmentHash); stHash_destruct(sequenceHash); stList_destruct(rowOrder); destroyOptions(options); return(EXIT_SUCCESS); }
/* Parse XML string into a hash. This parses all attributes of all tags * into values. st_kv_database_conf type is stored as conf_type, * database tag is stores as db_tag. This does minimal error checking * and is really lame. */ static stHash *hackParseXmlString(const char *xmlString) { stHash *hash = stHash_construct3(stHash_stringKey, stHash_stringEqualKey, free, free); char *toReplace[5] = { "</", "<", "/>", ">", "=" }; char *cA = stString_replace(xmlString, toReplace[0], " "), *cA2; for (int64_t i = 1; i < 5; i++) { cA2 = stString_replace(cA, toReplace[i], " "); free(cA); cA = cA2; } getExpectedToken(&cA2, "st_kv_database_conf"); stHash_insert(hash, stString_copy("conf_type"), getKeyValue(&cA2, "type")); stHash_insert(hash, stString_copy("db_tag"), getNextToken(&cA2)); char *key; while (((key = getNextToken(&cA2)) != NULL) && !stString_eq(key, "st_kv_database_conf")) { char *value = getNextToken(&cA2); if (value == NULL) { stThrowNew(ST_KV_DATABASE_EXCEPTION_ID, "failed to to get value for key \"%s\"", key); } if (stHash_search(hash, key) != NULL) { stThrowNew(ST_KV_DATABASE_EXCEPTION_ID, "got a duplicate entry in the database conf string \"%s\"", key); } stHash_insert(hash, key, value); } if(!stString_eq(key, "st_kv_database_conf")) { stThrowNew(ST_KV_DATABASE_EXCEPTION_ID, "got an unexpected final entry \"%s\"", key); } free(key); free(cA); return hash; }
/*
 * Builds a single-entry hash that maps a copy of `name` to a new mtfseq_t
 * sized by strlen(input) and filled from `input` via seq_copyIn.
 *
 * The hash is constructed with free() as key destructor and destroyMtfseq as
 * value destructor.
 */
static stHash *createSeqHashFromString(char *name, char *input) {
    size_t inputLength = strlen(input);
    mtfseq_t *sequence = newMtfseq(inputLength);
    stHash *seqHash = stHash_construct3(stHash_stringKey, stHash_stringEqualKey,
                                        free, destroyMtfseq);
    seq_copyIn(sequence, input);

    char *keyCopy = stString_copy(name);
    stHash_insert(seqHash, keyCopy, sequence);
    return seqHash;
}
/*
 * Builds a hash from each node to the sorted set of nodes that share its
 * component.
 *
 * Each element of `nodes` (defined outside this function) starts in a
 * singleton set. For every edge in filteredEdges, the endpoints are read from
 * tuple positions 1 and 2; if they currently map to different sets, the two
 * sets are replaced by their union and every member is re-pointed at it.
 *
 * The hash is constructed without key or value destructors.
 * A deliberately simple reimplementation of the greedy function, kept only
 * to trap typos.
 */
static stHash *getComponents(stList *filteredEdges) {
    stHash *nodesToComponents = stHash_construct3(
            (uint64_t(*)(const void *)) stIntTuple_hashKey,
            (int(*)(const void *, const void *)) stIntTuple_equalsFn,
            NULL, NULL);

    /* Seed: every node is alone in its own component. */
    int64_t nodeIdx = 0;
    while (nodeIdx < stList_length(nodes)) {
        stIntTuple *member = stList_get(nodes, nodeIdx);
        stSortedSet *singleton = stSortedSet_construct();
        stSortedSet_insert(singleton, member);
        stHash_insert(nodesToComponents, member, singleton);
        nodeIdx++;
    }

    /* Merge the components joined by each edge. */
    int64_t edgeIdx = 0;
    while (edgeIdx < stList_length(filteredEdges)) {
        stIntTuple *edge = stList_get(filteredEdges, edgeIdx);
        edgeIdx++;

        /* Temporary lookup keys for the two endpoints. */
        stIntTuple *node1 = stIntTuple_construct1(stIntTuple_get(edge, 1));
        stIntTuple *node2 = stIntTuple_construct1(stIntTuple_get(edge, 2));
        stSortedSet *component1 = stHash_search(nodesToComponents, node1);
        stSortedSet *component2 = stHash_search(nodesToComponents, node2);
        assert(component1 != NULL && component2 != NULL);
        stIntTuple_destruct(node1);
        stIntTuple_destruct(node2);

        if (component1 == component2) {
            continue; /* Already in the same component. */
        }

        stSortedSet *merged = stSortedSet_getUnion(component1, component2);
        stSortedSetIterator *mergedIt = stSortedSet_getIterator(merged);
        for (stIntTuple *member = stSortedSet_getNext(mergedIt); member != NULL;
                member = stSortedSet_getNext(mergedIt)) {
            stHash_insert(nodesToComponents, member, merged);
        }
        stSortedSet_destructIterator(mergedIt);
        stSortedSet_destruct(component1);
        stSortedSet_destruct(component2);
    }
    return nodesToComponents;
}
/*
 * Indexes a list of edges by endpoint pair.
 *
 * For each tuple in `edges`, a key is built with constructEdge from tuple
 * positions 0 and 1 and the tuple itself is stored as the value. The hash
 * destroys its keys with stIntTuple_destruct; it has no value destructor, so
 * the edge tuples are not freed by the hash.
 */
static stHash *putEdgesInHash(stList *edges) {
    stHash *intsToEdgesHash = stHash_construct3(
            (uint64_t (*)(const void *))stIntTuple_hashKey,
            (int (*)(const void *, const void *))stIntTuple_equalsFn,
            (void (*)(void *))stIntTuple_destruct,
            NULL);

    int64_t idx = 0;
    while (idx < stList_length(edges)) {
        stIntTuple *edge = stList_get(edges, idx);
        stIntTuple *endpointKey = constructEdge(stIntTuple_get(edge, 0), stIntTuple_get(edge, 1));
        stHash_insert(intsToEdgesHash, endpointKey, edge);
        idx++;
    }
    return intsToEdgesHash;
}
/*
 * Round-trip test for fasta reading.
 *
 * Writes a single-record fasta file (header line, then the sequence with a
 * newline after every 50th character) to testFasta.fa, loads it with
 * addSequencesToHash, and checks that an unknown key is absent and that the
 * record is found under the header text without the leading '>' with the
 * sequence unchanged. The temporary file is removed afterwards; failure to
 * remove it terminates the process.
 */
static void test_readingFasta_0(CuTest *testCase) {
    char inputName[] = ">simChimp.chrA";
    char inputSequence[] =
        "ATAATACTTGCACACTTCTGCTATTACTTGATGTGTTTTCTATGGGGTGT"
        "CTTTCAGTGCTATGGGCAAGGCCATGGATTAATGGTGCCATAATTGCTCT"
        "AGGCAGTGACTAGAAACAGTTCACAAGTTTTTACTGTATCAAACTATGTT"
        "TTATAGTACGATTCACCCTCCAGGGGACCATCCCAAACTACTGGCCTAAA"
        "AGGACCTGCCATGTTGTAACTCCCCAGCTTAGAAATATAGACGGGAGGAA"
        "TGACaaaaagaagaaaaaaaaaaaaagaaaaaataaaaaaaaaacaaaaa"
        "agatagagaaaaaaaaaagtaaaaacaaaaaaaaataaaaaagggaaaaa"
        "aaataacaaaggaacaaaaaaaaaaaaaaaaaaataaaaagaaaaaCAAG"
        "ATAACCTTCATGCCATTGGAGCTATCTATTATTGTCTTGACCTATGCTTT"
        "ATCAATTTCTTCCTTCCTAGGAAGACATTTTTCTAGAAAGCTAAACGTTT"
        "TTGTAGGCTTGCATGTTCTGTCTGGGCTTGAATGGTTGTGCGTCTACAAG"
        "CCTCATTTACCATAGCACCATGCTTGGGTGGTATCTATCATCATTATCAA"
        "TAGTCAAGTCATTATAATGTTTTGGTGATCAGGCCAGATCCCTTGCACCA"
        "GTGACTTTCTAAATAGCACCTCCTCCATCATTTAAGGATCTCTAGCAACT"
        "TTAATCTGACTCACCTTGCCATGCAGAGTGCATGTTCCTTTTTAACACCC"
        "TGTGATTATGGGTTGGGTCTATTTGTATTTGTTTGATTACATCAGACGAC"
        "CAGGCCAGAGACAGATAAACACAACAGCCACTGGAACCTAAAGCTGTGTT"
        "CAGAATGTCACGGAATGTCTCATTGCACCCAGAGCTAGGGTGGGTATGAG"
        "TATGATCTTCTACATAAGGTACCCCAGGAAAATTAACTTAACAACCAATC"
        "AATTACAGAAGATGAATTCTGCTGTTGTCTCTTATTAGTTGGACTATTCA"
        "GCCTAATGGTTGGCCACTTAGCTTGTCATGAGCATTACTGTACTACTATG"
        "TCTAGTGTTTCCAGTTATTAGTTAGCCCACTGGATAGACAGTTTTGGCTT"
        "GTTTTCTTTCATTTGTATTGCCCACTCACCTAGCAAATCAGACAAAGGGG"
        "CATGTGAAAACTACCTTAGACTCTGCAGTTAGACAAACCATACTTTCCAC"
        "ATAGACCTCAGACATTTGGACATGAATAATTTCCTTCCTCCGGAGTGGTG"
        "GTTCCTCAACACTTATCACTTTCTTCTTCTTTTACCCGTATCACTGTCAA";

    /* Write the fixture file. */
    FILE *ofp = de_fopen("testFasta.fa", "w");
    fprintf(ofp, "%s\n", inputName);
    const size_t sequenceLength = strlen(inputSequence);
    size_t pos = 0;
    while (pos < sequenceLength) {
        fprintf(ofp, "%c", inputSequence[pos]);
        pos++;
        /* Break the line after every 50th character written. */
        if ((pos % 50) == 0) {
            fprintf(ofp, "\n");
        }
    }
    fprintf(ofp, "\n");
    fclose(ofp);

    /* Load the fixture. */
    stHash *sequenceHash = stHash_construct3(stHash_stringKey, stHash_stringEqualKey,
                                             free, destroyMtfseq);
    addSequencesToHash(sequenceHash, "testFasta.fa");

    /* A key that was never written must not be found. */
    mtfseq_t *value = stHash_search(sequenceHash, "not in there");
    CuAssertTrue(testCase, value == NULL);

    /* The record is expected under the header without its '>'. */
    value = stHash_search(sequenceHash, "simChimp.chrA");
    CuAssertTrue(testCase, value != NULL);
    if (value != NULL) {
        CuAssertTrue(testCase, strlen(value->seq) == strlen(inputSequence));
        CuAssertTrue(testCase, strcmp(value->seq, inputSequence) == 0);
    }

    if (remove("testFasta.fa") != 0) {
        fprintf(stderr, "Error, unable to remove temporary file testFasta.fa\n");
        exit(EXIT_FAILURE);
    }
    stHash_destruct(sequenceHash);
}
/*
 * Builds a hashtable that associates a top cap with the list of lifted edges
 * attached to it.
 *
 * Every cap of the flower that has an adjacency is treated as a potential
 * bottom node. Its own top cap is the table key; the top cap of the
 * positively oriented adjacent cap is the destination of the lifted edge.
 * Caps whose top cap is NULL are skipped.
 *
 * The table is constructed without a key destructor and with
 * buildFaces_destructValue as value destructor.
 */
static stHash *buildFaces_computeLiftedEdges(Flower * flower) {
    stHash *liftedEdgesTable = stHash_construct3(buildFaces_hashfunction,
            buildFaces_key_eq_fn, NULL, buildFaces_destructValue);
    Flower_CapIterator *capIt = flower_getCapIterator(flower);
    Cap *cap;

    // Walk over the potential bottom nodes
    while ((cap = flower_getNextCap(capIt)) != NULL) {
        // Only connected caps can be lifted
        Cap *adjacency = cap_getAdjacency(cap);
        if (adjacency == NULL) {
            continue;
        }

        // Lift both ends
        Cap *attachedAncestor = cap_getTopCap(cap);
        Cap *adjacencyAncestor = cap_getTopCap(cap_getPositiveOrientation(adjacency));
#ifndef NDEBUG
        assert((attachedAncestor && adjacencyAncestor) || (!attachedAncestor && !adjacencyAncestor));
#endif

        // Root node: nothing to lift to
        if (attachedAncestor == NULL) {
            continue;
        }

        // Create the lifted edge
        LiftedEdge *newEdge = st_malloc(sizeof(LiftedEdge));
        newEdge->destination = adjacencyAncestor;
        newEdge->bottomNode = cap;

#ifndef NDEBUG
        // Self loop
        if (adjacencyAncestor == attachedAncestor) {
            abort();
        }
#endif

        // File it under its top cap, creating the list on first use
        stList *edgesForTop = stHash_search(liftedEdgesTable, attachedAncestor);
        if (edgesForTop == NULL) {
            edgesForTop = stList_construct3(2, buildFaces_stList_destructElem);
            stHash_insert(liftedEdgesTable, attachedAncestor, edgesForTop);
        }
        stList_append(edgesForTop, newEdge);
    }

    flower_destructCapIterator(capIt);
    return liftedEdgesTable;
}
/*
 * Builds an adjacency-list style hash for the sequences: every
 * (sequence, position) tuple maps to the column (sorted set of tuples) it is
 * aligned in.
 *
 * A singleton column is created for every sequence in [0, sequenceNumber)
 * and every position in [0, MAX_SEQUENCE_SIZE). Each element of `pairs` is
 * then read as (seq1, pos1, seq2, pos2); when the two positions sit in
 * different columns, the second column is folded into the first and
 * destroyed.
 *
 * The hash destroys its keys with stIntTuple_destruct and has no value
 * destructor.
 */
static stHash *buildAdjacencyList(stList *pairs, int64_t sequenceNumber) {
    stHash *hash = stHash_construct3(
            (uint64_t (*)(const void *))stIntTuple_hashKey,
            (int (*)(const void *, const void *))stIntTuple_equalsFn,
            (void (*)(void *))stIntTuple_destruct,
            NULL);

    /* One singleton column per sequence position. */
    for (int64_t seqIdx = 0; seqIdx < sequenceNumber; seqIdx++) {
        for (int64_t posIdx = 0; posIdx < MAX_SEQUENCE_SIZE; posIdx++) {
            stIntTuple *cell = stIntTuple_construct2(seqIdx, posIdx);
            stSortedSet *singleton = stSortedSet_construct3(
                    (int (*)(const void *, const void *))stIntTuple_cmpFn, NULL);
            stSortedSet_insert(singleton, cell);
            stHash_insert(hash, cell, singleton);
        }
    }

    stListIterator *pairIt = stList_getIterator(pairs);
    for (stIntTuple *pair = stList_getNext(pairIt); pair != NULL; pair = stList_getNext(pairIt)) {
        /* Temporary keys used only to look up the two columns. */
        stIntTuple *seqPos1 = stIntTuple_construct2(stIntTuple_get(pair, 0), stIntTuple_get(pair, 1));
        stIntTuple *seqPos2 = stIntTuple_construct2(stIntTuple_get(pair, 2), stIntTuple_get(pair, 3));
        stSortedSet *column1 = stHash_search(hash, seqPos1);
        assert(column1 != NULL);
        stSortedSet *column2 = stHash_search(hash, seqPos2);
        assert(column2 != NULL);
        stIntTuple_destruct(seqPos1);
        stIntTuple_destruct(seqPos2);

        if (column1 == column2) {
            continue; /* Already aligned together. */
        }

        /* Fold column2 into column1 and re-point its members. */
        stSortedSetIterator *memberIt = stSortedSet_getIterator(column2);
        stIntTuple *seqPos3;
        while ((seqPos3 = stSortedSet_getNext(memberIt)) != NULL) {
            assert(stSortedSet_search(column1, seqPos3) == NULL);
            stSortedSet_insert(column1, seqPos3);
            assert(stHash_search(hash, seqPos3) == column2);
            stHash_insert(hash, seqPos3, column1);
            assert(stHash_search(hash, seqPos3) == column1);
        }
        stSortedSet_destructIterator(memberIt);
        stSortedSet_destruct(column2);
    }
    stList_destructIterator(pairIt);
    return hash;
}
stSet *stSet_construct3(uint64_t(*hashKey)(const void *), int(*hashEqualsKey)(const void *, const void *), void(*destructKeys)(void *)) { stSet *set = st_malloc(sizeof(*set)); set->hash = stHash_construct3(hashKey, hashEqualsKey, destructKeys, NULL); return set; }
/*
 * Constructs a face locally, starting from the given cap, without relying on
 * a precomputed lifted-edges table.
 *
 * The top nodes and a private lifted-edges table are gathered with
 * buildFaces_fillTopNodeList2. Each top node is then registered on the face
 * together with the bottom nodes of its lifted edges. For every bottom node
 * the derived destination is the top cap of its positively oriented
 * adjacency, or NULL when that cap equals the top node's own adjacency.
 */
void buildFaces_reconstructFromCap(Cap * startingCap, Flower * flower) {
    Face *face = face_construct(flower);
    stList *topNodes = stList_construct3(16, NULL);
    stHash *liftedEdgesTable = stHash_construct3(buildFaces_hashfunction,
            buildFaces_key_eq_fn, NULL, buildFaces_destructValue);

    printf("Constructing new face");

    // Collect the top nodes and populate the lifted-edges table
    buildFaces_fillTopNodeList2(startingCap, topNodes, liftedEdgesTable);

#ifndef NDEBUG
    // What, no top nodes!?
    assert(stList_length(topNodes));
#endif

    // Size the face for the top nodes found
    face_allocateSpace(face, stList_length(topNodes));

    // Visit each top node
    for (int64_t topIdx = 0; topIdx < stList_length(topNodes); topIdx++) {
        Cap *cap = stList_get(topNodes, topIdx);
        face_setTopNode(face, topIdx, cap);

        // A top node without lifted edges has no bottom nodes
        stList *liftedEdges = stHash_search(liftedEdgesTable, cap);
        int64_t bottomCount = liftedEdges ? stList_length(liftedEdges) : 0;
        face_setBottomNodeNumber(face, topIdx, bottomCount);

        // Visit each bottom node under this top node
        for (int64_t bottomIdx = 0; bottomIdx < bottomCount; bottomIdx++) {
            LiftedEdge *edge = stList_get(liftedEdges, bottomIdx);
            Cap *bottomNode = edge->bottomNode;
            face_addBottomNode(face, topIdx, bottomNode);

            assert(cap_getAdjacency(bottomNode));
            Cap *ancestor = cap_getTopCap(cap_getPositiveOrientation(
                    cap_getAdjacency(bottomNode)));
            Cap *derived = (cap_getAdjacency(cap) != ancestor) ? ancestor : NULL;
            face_setDerivedDestination(face, topIdx, bottomIdx, derived);

#ifndef NDEBUG
            // A bottom node must not also be one of the top nodes
            if (stList_contains(topNodes, cap_getPositiveOrientation(bottomNode))) {
                abort();
            }
#endif
        }
    }

    // Clean up
    stList_destruct(topNodes);
    stHash_destruct(liftedEdgesTable);
}