// Returns a hash mapping from sequence header to sequence data. static stHash *readFastaFile(char *filename) { FILE *fasta = fopen(filename, "r"); if (fasta == NULL) { st_errnoAbort("Could not open fasta file %s", filename); } stHash *headerToData = stHash_construct3(stHash_stringKey, stHash_stringEqualKey, free, free); struct List *seqs = constructEmptyList(0, NULL); struct List *seqLengths = constructEmptyList(0, free); struct List *headers = constructEmptyList(0, free); fastaRead(fasta, seqs, seqLengths, headers); for (int64_t i = 0; i < seqs->length; i++) { char *fullHeader = headers->list[i]; stList *headerTokens = stString_splitByString(fullHeader, " "); char *usableHeader = stString_copy(stList_get(headerTokens, 0)); stHash_insert(headerToData, usableHeader, seqs->list[i]); stList_destruct(headerTokens); } destructList(seqs); destructList(seqLengths); destructList(headers); return headerToData; }
/*
 * Reads every sequence from the FASTA file named by argv[1] and writes them,
 * in the same order, to the FASTA file named by argv[2].
 *
 * Returns 0 on success and 1 on a usage error or when either file cannot be
 * opened or the output cannot be flushed. Argument and file errors are
 * reported on stderr instead of relying on assert(), which is compiled out
 * under NDEBUG and would otherwise let a NULL stream reach the FASTA routines.
 */
int main(int argc, char *argv[]) {
    FILE *fileHandle;
    struct List *seqs;
    struct List *seqLengths;
    struct List *seqNames;
    int32_t i;

    if (argc != 3) {
        fprintf(stderr, "Usage: %s inputFastaFile outputFastaFile\n",
                argc > 0 ? argv[0] : "fastaCopy");
        return 1;
    }

    // Open the input before allocating anything so the early exit is clean.
    fileHandle = fopen(argv[1], "r");
    if (fileHandle == NULL) {
        fprintf(stderr, "Could not open input fasta file %s\n", argv[1]);
        return 1;
    }

    seqs = constructEmptyList(0, free);
    seqLengths = constructEmptyList(0, (void (*)(void *))destructInt);
    seqNames = constructEmptyList(0, free);

    fastaRead(fileHandle, seqs, seqLengths, seqNames);
    fclose(fileHandle);

    fileHandle = fopen(argv[2], "w");
    if (fileHandle == NULL) {
        fprintf(stderr, "Could not open output fasta file %s\n", argv[2]);
        return 1;
    }

    for (i = 0; i < seqs->length; i++) {
        // Sanity check: the length recorded by fastaRead matches the string.
        assert((int32_t)strlen(seqs->list[i]) == *((int32_t *)seqLengths->list[i]));
        fastaWrite(seqs->list[i], seqNames->list[i], fileHandle);
    }

    // fclose flushes buffered output, so a failure here means data was lost.
    if (fclose(fileHandle) != 0) {
        fprintf(stderr, "Could not finish writing output fasta file %s\n", argv[2]);
        return 1;
    }
    return 0;
}
int main(int argc, char *argv[]) { /* * Arguments/options */ char *logLevelString = NULL; char *mfaFile = NULL; char *outputFile = NULL; char *treeFile = NULL; /////////////////////////////////////////////////////////////////////////// // (0) Parse the inputs handed by genomeCactus.py / setup stuff. /////////////////////////////////////////////////////////////////////////// while (1) { static struct option long_options[] = { { "logLevel", required_argument, 0, 'a' }, { "mfaFile", required_argument, 0, 'b' }, { "outputFile", required_argument, 0, 'd' }, { "treeFile", optional_argument, 0, 't' }, { "help", no_argument, 0, 'h' }, { 0, 0, 0, 0 } }; int option_index = 0; int key = getopt_long(argc, argv, "a:b:d:t:h", long_options, &option_index); if (key == -1) { break; } switch (key) { case 'a': logLevelString = stString_copy(optarg); break; case 'b': mfaFile = stString_copy(optarg); break; case 'd': outputFile = stString_copy(optarg); break; case 't': treeFile = stString_copy(optarg); break; case 'h': usage(); return 0; default: usage(); return 1; } } /////////////////////////////////////////////////////////////////////////// // (0) Check the inputs. /////////////////////////////////////////////////////////////////////////// if (argc == 1) { usage(); exit(1); } assert(mfaFile != NULL); assert(outputFile != NULL); ////////////////////////////////////////////// //Set up logging ////////////////////////////////////////////// st_setLogLevelFromString(logLevelString); ////////////////////////////////////////////// //Log (some of) the inputs ////////////////////////////////////////////// st_logInfo("MFA file name : %s\n", mfaFile); st_logInfo("Output MAF file : %s\n", outputFile); st_logInfo("Tree file name: %s\n", treeFile == NULL ? 
"null" : treeFile); ////////////////////////////////////////////// //Get the MFA alignment ////////////////////////////////////////////// //get the alignment struct List *sequences = constructEmptyList(0, free); struct List *seqLengths = constructEmptyList(0, (void (*)(void *))destructInt); struct List *fastaNames = constructEmptyList(0, free); FILE *fileHandle = fopen(mfaFile, "r"); if (fileHandle == NULL) { usage(); exit(1); } fastaRead(fileHandle, sequences, seqLengths, fastaNames); fclose(fileHandle); ////////////////////////////////////////////// //Get the tree alignment ////////////////////////////////////////////// stTree *tree = NULL; LeafPtrArray *leafArray = NULL; int32_t leafCount = 0; if (treeFile != NULL) { tree = eTreeX_getTreeFromFile(treeFile); eTreeX_postOrderTraversal(tree, eTreeX_countLeaves, &leafCount); leafArray = eTreeX_constructLeafPtrArray(leafCount); eTreeX_postOrderTraversal(tree, eTreeX_getLeafArray, (void *) leafArray); } ////////////////////////////////////////////// //Write the MFA alignment. ////////////////////////////////////////////// fileHandle = fopen(outputFile, "w"); //write the header. 
fprintf(fileHandle, "##maf version=1 scoring=NULL\n"); fprintf(fileHandle, "# converted_from_MFA\n\n"); //write the score line char *treeString = NULL; if (treeFile != NULL) { treeString = stTree_getNewickTreeString(tree); fprintf(fileHandle, "a score=0 tree=\"%s\"\n", treeString); } else { fprintf(fileHandle, "a score=0\n"); leafCount = sequences->length; } //write the alignment int32_t i, j; int32_t ii; const char *label; for (ii=0; ii<leafCount; ii++) { if (treeFile != NULL) { label = stTree_getLabel((stTree *) leafArray->ptrArray[ii]); /* Do a brute force search to find the appropriate sequence that matches "label" */ for (i=0; i<sequences->length; i++) { char *fastaHeader = fastaNames->list[i]; char *sequenceName = st_malloc(sizeof(char) *(1 + strlen(fastaHeader))); sscanf(fastaHeader, "%s", sequenceName); //take the sequence name to be the first word of the sequence. if (strcmp(label, sequenceName) == 0) { free(sequenceName); break; } free(sequenceName); } } else { i = ii; } char *sequence = sequences->list[i]; int32_t seqLength = *((int32_t *)seqLengths->list[i]); assert(seqLength == (int32_t)strlen(sequence)); char *fastaHeader = fastaNames->list[i]; char *sequenceName = st_malloc(sizeof(char) *(1 + strlen(fastaHeader))); sscanf(fastaHeader, "%s", sequenceName); //take the sequence name to be the first word of the sequence. int32_t length = 0; for (j=0; j<(int32_t)strlen(sequence); j++) { if (sequence[j] != '-') { length++; } } fprintf(fileHandle, "s\t%s\t%i\t%i\t%s\t%i\t%s\n", sequenceName, 0, length, "+", length, sequence); free(sequenceName); } fclose(fileHandle); ////////////////////////////////////////////// //Clean up. ////////////////////////////////////////////// free(mfaFile); free(outputFile); free(treeFile); if (treeFile != NULL) { stTree_destruct(tree); free(treeString); eTreeX_destructLeafPtrArray(leafArray); } destructList(sequences); destructList(seqLengths); destructList(fastaNames); return 0; }