/*
 * Round-trips a FASTA file: reads every record from argv[1] with fastaRead()
 * and writes each one to argv[2] with fastaWrite().
 *
 * Usage: <program> inputFasta outputFasta
 *
 * Returns 0 on success, or 1 if the argument count is wrong or either file
 * cannot be opened.
 */
int main(int argc, char *argv[]) {
    /* A real check rather than an assert, so that release (NDEBUG) builds do
     * not dereference a missing argv entry. */
    if (argc != 3) {
        fprintf(stderr, "Usage: %s inputFasta outputFasta\n", argc > 0 ? argv[0] : "program");
        return 1;
    }

    struct List *seqs = constructEmptyList(0, free);
    struct List *seqLengths = constructEmptyList(0, (void (*)(void *))destructInt);
    struct List *seqNames = constructEmptyList(0, free);

    /* Read all records from the input file. */
    FILE *fileHandle = fopen(argv[1], "r");
    if (fileHandle == NULL) {
        fprintf(stderr, "Could not open input file '%s'\n", argv[1]);
        return 1;
    }
    fastaRead(fileHandle, seqs, seqLengths, seqNames);
    fclose(fileHandle);

    /* Write them back out, checking that the reported length of each
     * sequence agrees with the string that was actually read. */
    fileHandle = fopen(argv[2], "w");
    if (fileHandle == NULL) {
        fprintf(stderr, "Could not open output file '%s'\n", argv[2]);
        return 1;
    }
    for (int32_t i = 0; i < seqs->length; i++) {
        assert((int32_t)strlen(seqs->list[i]) == *((int32_t *)seqLengths->list[i]));
        fastaWrite(seqs->list[i], seqNames->list[i], fileHandle);
    }
    fclose(fileHandle);

    return 0;
}
// Returns a hash mapping from sequence header to sequence data. static stHash *readFastaFile(char *filename) { FILE *fasta = fopen(filename, "r"); if (fasta == NULL) { st_errnoAbort("Could not open fasta file %s", filename); } stHash *headerToData = stHash_construct3(stHash_stringKey, stHash_stringEqualKey, free, free); struct List *seqs = constructEmptyList(0, NULL); struct List *seqLengths = constructEmptyList(0, free); struct List *headers = constructEmptyList(0, free); fastaRead(fasta, seqs, seqLengths, headers); for (int64_t i = 0; i < seqs->length; i++) { char *fullHeader = headers->list[i]; stList *headerTokens = stString_splitByString(fullHeader, " "); char *usableHeader = stString_copy(stList_get(headerTokens, 0)); stHash_insert(headerToData, usableHeader, seqs->list[i]); stList_destruct(headerTokens); } destructList(seqs); destructList(seqLengths); destructList(headers); return headerToData; }
/*
 * Finds the closest event that is an ancestor of (or equal to) both `event`
 * and `event2`. Both events must be non-NULL and belong to the same event
 * tree (checked with asserts).
 *
 * The chain from `event` up to the root is recorded first; if `event2` is
 * met on the way it is the answer. Otherwise the chain above `event2` is
 * walked until it hits a recorded event.
 *
 * If no shared ancestor exists an assertion fires; with assertions disabled
 * NULL is returned.
 */
Event *eventTree_getCommonAncestor(Event *event, Event *event2) {
    assert(event != NULL);
    assert(event2 != NULL);
    assert(event_getEventTree(event) == event_getEventTree(event2));

    struct List *lineage = constructEmptyList(0, NULL);
    Event *common = NULL;

    /* Record `event` and all of its ancestors, stopping early if event2 is one of them. */
    for (Event *cursor = event; cursor != NULL; cursor = event_getParent(cursor)) {
        if (cursor == event2) {
            common = event2;
            break;
        }
        listAppend(lineage, cursor);
    }

    /* event2 itself was ruled out above, so start from its parent. */
    if (common == NULL) {
        Event *cursor = event_getParent(event2);
        while (cursor != NULL && !listContains(lineage, cursor)) {
            cursor = event_getParent(cursor);
        }
        common = cursor;
    }

    destructList(lineage);
    if (common == NULL) {
        assert(FALSE);
    }
    return common;
}
static stList *getRandomPairwiseAlignments() { stList *pairwiseAlignments = stList_construct3(0, (void(*)(void *)) destructPairwiseAlignment); int64_t randomAlignmentNumber = st_randomInt(0, 10); for (int64_t i = 0; i < randomAlignmentNumber; i++) { char *contig1 = stString_print("%" PRIi64 "", i); char *contig2 = stString_print("%" PRIi64 "", i * 10); int64_t start1 = st_randomInt(100000, 1000000); int64_t start2 = st_randomInt(100000, 1000000); int64_t strand1 = st_random() > 0.5; int64_t strand2 = st_random() > 0.5; int64_t end1 = start1; int64_t end2 = start2; struct List *operationList = constructEmptyList(0, NULL); while (st_random() > 0.1) { int64_t length = st_randomInt(0, 10); int64_t type = st_randomInt(0, 3); assert(type < 3); listAppend(operationList, constructAlignmentOperation(type, length, 0)); if (type != PAIRWISE_INDEL_Y) { end1 += strand1 ? length : -length; } if (type != PAIRWISE_INDEL_X) { end2 += strand2 ? length : -length; } } stList_append(pairwiseAlignments, constructPairwiseAlignment(contig1, start1, end1, strand1, contig2, start2, end2, strand2, 0.0, operationList)); free(contig1); free(contig2); } return pairwiseAlignments; }
struct List *parseTrioFile(char *trioFile) { FILE *fileHandle = fopen(trioFile, "r"); int bytesRead; int nBytes = 100; char *cA; int j; char *species[3]; struct List *speciesList = NULL; speciesList = constructEmptyList(0, freeTrioNames); cA = st_malloc(nBytes + 1); bytesRead = benLine(&cA, &nBytes, fileHandle); while(bytesRead != -1) { if (bytesRead > 0) { species[0] = st_malloc(sizeof(char) * (1 + (bytesRead))); species[1] = st_malloc(sizeof(char) * (1 + (bytesRead))); species[2] = st_malloc(sizeof(char) * (1 + (bytesRead))); j = sscanf(cA, "%s\t%s\t%s", species[0], species[1], species[2]); if (j != 3) { fprintf(stderr, "Invalid triple line '%s' in '%s'\n", cA, trioFile); exit(1); } cStr_lowerCase(species[0]); cStr_lowerCase(species[1]); cStr_lowerCase(species[2]); qsort(species, 3, sizeof(char *), cStr_compare); TrioNames *trio = st_malloc(sizeof(TrioNames)); trio->speciesA = species[0]; trio->speciesB = species[1]; trio->speciesC = species[2]; listAppend(speciesList, trio); } bytesRead = benLine(&cA, &nBytes, fileHandle); } fclose(fileHandle); free(cA); return speciesList; }
/*
 * Allocates a new Event, links it beneath `parentEvent` (when one is given)
 * and registers it with `eventTree`.
 *
 * name         - identifier for the event; must not already be present in
 *                the tree (asserted).
 * header       - label to copy into the event; NULL is stored as "".
 * branchLength - negative values are clamped to zero.
 * parentEvent  - parent in the tree, or NULL.
 * eventTree    - tree that the event is added to.
 *
 * Returns the newly constructed event.
 */
Event *event_construct(Name name, const char *header, float branchLength, Event *parentEvent, EventTree *eventTree) {
    /* The event must not already exist in the tree. */
    assert(eventTree_getEvent(eventTree, name) == NULL);

    Event *newEvent = st_malloc(sizeof(Event));
    newEvent->name = name;
    newEvent->parent = parentEvent;
    newEvent->children = constructEmptyList(0, NULL);

    if (header == NULL) {
        newEvent->header = stString_copy("");
    } else {
        newEvent->header = stString_copy(header);
    }

    if (branchLength < 0.0) {
        newEvent->branchLength = 0.0;
    } else {
        newEvent->branchLength = branchLength;
    }
    newEvent->isOutgroup = 0;

    /* Hook the new event into its parent's child list before registering it. */
    if (parentEvent != NULL) {
        listAppend(parentEvent->children, newEvent);
    }
    newEvent->eventTree = eventTree;
    eventTree_addEvent(eventTree, newEvent);
    return newEvent;
}
// copied from cPecanRealign struct PairwiseAlignment *convertAlignedPairsToPairwiseAlignment(char *seqName1, char *seqName2, double score, int64_t length1, int64_t length2, stList *alignedPairs) { //Make pairwise alignment int64_t pX = -1, pY = -1, mL = 0; //Create an end matched pair, which is used to ensure the alignment has the correct end indels. struct List *opList = constructEmptyList(0, (void (*)(void *)) destructAlignmentOperation); stList_append(alignedPairs, stIntTuple_construct2(length1, length2)); for (int64_t i = 0; i < stList_length(alignedPairs); i++) { stIntTuple *alignedPair = stList_get(alignedPairs, i); int64_t x = stIntTuple_get(alignedPair, 0); int64_t y = stIntTuple_get(alignedPair, 1); assert(x - pX > 0); assert(y - pY > 0); if (x - pX > 0 && y - pY > 0) { //This is a hack for filtering if (x - pX > 1) { //There is an indel. if (mL > 0) { listAppend(opList, constructAlignmentOperation(PAIRWISE_MATCH, mL, 0)); mL = 0; } listAppend(opList, constructAlignmentOperation(PAIRWISE_INDEL_X, x - pX - 1, 0)); } if (y - pY > 1) { if (mL > 0) { listAppend(opList, constructAlignmentOperation(PAIRWISE_MATCH, mL, 0)); mL = 0; } listAppend(opList, constructAlignmentOperation(PAIRWISE_INDEL_Y, y - pY - 1, 0)); } mL++; pX = x; pY = y; } } //Deal with a trailing match, but exclude the final match if (mL > 1) { listAppend(opList, constructAlignmentOperation(PAIRWISE_MATCH, mL - 1, 0)); } stIntTuple_destruct(stList_pop(alignedPairs)); //Construct the alignment struct PairwiseAlignment *pA = constructPairwiseAlignment(seqName1, 0, length1, 1, seqName2, 0, length2, 1, score, opList); return pA; }
int main(int argc, char *argv[]) { /* * Arguments/options */ char *logLevelString = NULL; char *mfaFile = NULL; char *outputFile = NULL; char *treeFile = NULL; /////////////////////////////////////////////////////////////////////////// // (0) Parse the inputs handed by genomeCactus.py / setup stuff. /////////////////////////////////////////////////////////////////////////// while (1) { static struct option long_options[] = { { "logLevel", required_argument, 0, 'a' }, { "mfaFile", required_argument, 0, 'b' }, { "outputFile", required_argument, 0, 'd' }, { "treeFile", optional_argument, 0, 't' }, { "help", no_argument, 0, 'h' }, { 0, 0, 0, 0 } }; int option_index = 0; int key = getopt_long(argc, argv, "a:b:d:t:h", long_options, &option_index); if (key == -1) { break; } switch (key) { case 'a': logLevelString = stString_copy(optarg); break; case 'b': mfaFile = stString_copy(optarg); break; case 'd': outputFile = stString_copy(optarg); break; case 't': treeFile = stString_copy(optarg); break; case 'h': usage(); return 0; default: usage(); return 1; } } /////////////////////////////////////////////////////////////////////////// // (0) Check the inputs. /////////////////////////////////////////////////////////////////////////// if (argc == 1) { usage(); exit(1); } assert(mfaFile != NULL); assert(outputFile != NULL); ////////////////////////////////////////////// //Set up logging ////////////////////////////////////////////// st_setLogLevelFromString(logLevelString); ////////////////////////////////////////////// //Log (some of) the inputs ////////////////////////////////////////////// st_logInfo("MFA file name : %s\n", mfaFile); st_logInfo("Output MAF file : %s\n", outputFile); st_logInfo("Tree file name: %s\n", treeFile == NULL ? 
"null" : treeFile); ////////////////////////////////////////////// //Get the MFA alignment ////////////////////////////////////////////// //get the alignment struct List *sequences = constructEmptyList(0, free); struct List *seqLengths = constructEmptyList(0, (void (*)(void *))destructInt); struct List *fastaNames = constructEmptyList(0, free); FILE *fileHandle = fopen(mfaFile, "r"); if (fileHandle == NULL) { usage(); exit(1); } fastaRead(fileHandle, sequences, seqLengths, fastaNames); fclose(fileHandle); ////////////////////////////////////////////// //Get the tree alignment ////////////////////////////////////////////// stTree *tree = NULL; LeafPtrArray *leafArray = NULL; int32_t leafCount = 0; if (treeFile != NULL) { tree = eTreeX_getTreeFromFile(treeFile); eTreeX_postOrderTraversal(tree, eTreeX_countLeaves, &leafCount); leafArray = eTreeX_constructLeafPtrArray(leafCount); eTreeX_postOrderTraversal(tree, eTreeX_getLeafArray, (void *) leafArray); } ////////////////////////////////////////////// //Write the MFA alignment. ////////////////////////////////////////////// fileHandle = fopen(outputFile, "w"); //write the header. 
fprintf(fileHandle, "##maf version=1 scoring=NULL\n"); fprintf(fileHandle, "# converted_from_MFA\n\n"); //write the score line char *treeString = NULL; if (treeFile != NULL) { treeString = stTree_getNewickTreeString(tree); fprintf(fileHandle, "a score=0 tree=\"%s\"\n", treeString); } else { fprintf(fileHandle, "a score=0\n"); leafCount = sequences->length; } //write the alignment int32_t i, j; int32_t ii; const char *label; for (ii=0; ii<leafCount; ii++) { if (treeFile != NULL) { label = stTree_getLabel((stTree *) leafArray->ptrArray[ii]); /* Do a brute force search to find the appropriate sequence that matches "label" */ for (i=0; i<sequences->length; i++) { char *fastaHeader = fastaNames->list[i]; char *sequenceName = st_malloc(sizeof(char) *(1 + strlen(fastaHeader))); sscanf(fastaHeader, "%s", sequenceName); //take the sequence name to be the first word of the sequence. if (strcmp(label, sequenceName) == 0) { free(sequenceName); break; } free(sequenceName); } } else { i = ii; } char *sequence = sequences->list[i]; int32_t seqLength = *((int32_t *)seqLengths->list[i]); assert(seqLength == (int32_t)strlen(sequence)); char *fastaHeader = fastaNames->list[i]; char *sequenceName = st_malloc(sizeof(char) *(1 + strlen(fastaHeader))); sscanf(fastaHeader, "%s", sequenceName); //take the sequence name to be the first word of the sequence. int32_t length = 0; for (j=0; j<(int32_t)strlen(sequence); j++) { if (sequence[j] != '-') { length++; } } fprintf(fileHandle, "s\t%s\t%i\t%i\t%s\t%i\t%s\n", sequenceName, 0, length, "+", length, sequence); free(sequenceName); } fclose(fileHandle); ////////////////////////////////////////////// //Clean up. ////////////////////////////////////////////// free(mfaFile); free(outputFile); free(treeFile); if (treeFile != NULL) { stTree_destruct(tree); free(treeString); eTreeX_destructLeafPtrArray(leafArray); } destructList(sequences); destructList(seqLengths); destructList(fastaNames); return 0; }