/*
 * Recursively builds the Newick text for the subtree rooted at `tree`.
 *
 * The result is "(child1,child2,...)" when the node has children (nothing
 * otherwise), followed by the node label when it is non-NULL, followed by
 * ":<branch length>" when the branch length is not INFINITY.  No trailing
 * ';' is appended here.
 *
 * Returns a heap string produced by stString_copy/stString_print; the
 * intermediate strings are released with free() below, so the caller is
 * presumably expected to free() the result as well.
 */
static char *tree_getNewickTreeStringP(stTree *tree) {
    char *newick;
    char *extended;

    if (stTree_getChildNumber(tree) > 0) {
        int32_t childIndex;
        newick = stString_copy("(");
        for (childIndex = 0; childIndex < stTree_getChildNumber(tree); childIndex++) {
            char *childNewick = tree_getNewickTreeStringP(stTree_getChild(tree, childIndex));
            // Every child but the last is followed by a separating comma.
            if (childIndex + 1 < stTree_getChildNumber(tree)) {
                extended = stString_print("%s%s,", newick, childNewick);
            } else {
                extended = stString_print("%s%s", newick, childNewick);
            }
            free(newick);
            free(childNewick);
            newick = extended;
        }
        extended = stString_print("%s)", newick);
        free(newick);
        newick = extended;
    } else {
        newick = stString_copy("");
    }

    if (stTree_getLabel(tree) != NULL) {
        extended = stString_print("%s%s", newick, stTree_getLabel(tree));
        free(newick);
        newick = extended;
    }

    // INFINITY marks a node without a branch length; nothing is printed for it.
    if (stTree_getBranchLength(tree) != INFINITY) {
        extended = stString_print("%s:%g", newick, stTree_getBranchLength(tree));
        free(newick);
        newick = extended;
    }

    return newick;
}
/*
 * Recursively walks `tree`, constructing an event for each node and reading
 * the sequences of every node that has associated sequence (leaves, and
 * labelled nodes whose label is found in outgroupNameSet).
 *
 * parentEvent      event passed as the parent of the event built for `tree`.
 * tree             node being processed; must not be NULL.
 * outgroupNameSet  set searched with the node's label.
 * argv             array of paths (files, or directories of files) holding
 *                  the sequences, in the order nodes are visited.
 * j                in/out index into argv of the next path to use; it is
 *                  advanced by one for each node that has sequence.
 *                  NOTE(review): no bound on *j is available here, so the
 *                  caller must supply enough entries in argv.
 *
 * Side effects: increments the global totalEventNumber, passes the global
 * eventTree to event_construct3, and sets the global "event" before the
 * sequences are read.  Aborts via st_errAbort when a path does not exist or
 * a file cannot be opened for reading.
 */
static void assignEventsAndSequences(Event *parentEvent, stTree *tree, stSet *outgroupNameSet, char *argv[], int64_t *j) {
    Event *myEvent = NULL; // To distinguish from the global "event" variable.
    assert(tree != NULL);
    totalEventNumber++;
    if (stTree_getChildNumber(tree) > 0) {
        myEvent = event_construct3(stTree_getLabel(tree), stTree_getBranchLength(tree), parentEvent, eventTree);
        for (int64_t i = 0; i < stTree_getChildNumber(tree); i++) {
            assignEventsAndSequences(myEvent, stTree_getChild(tree, i), outgroupNameSet, argv, j);
        }
    }

    if (stTree_getChildNumber(tree) == 0
            || (stTree_getLabel(tree) != NULL
                    && (stSet_search(outgroupNameSet, (char *)stTree_getLabel(tree)) != NULL))) {
        // This event is a leaf and/or an outgroup, so it has
        // associated sequence.
        assert(stTree_getLabel(tree) != NULL);
        assert(stTree_getBranchLength(tree) != INFINITY);

        if (stTree_getChildNumber(tree) == 0) {
            // Construct the leaf event
            myEvent = event_construct3(stTree_getLabel(tree), stTree_getBranchLength(tree), parentEvent, eventTree);
        }

        char *fileName = argv[*j];

        if (!stFile_exists(fileName)) {
            st_errAbort("File does not exist: %s\n", fileName);
        }

        // Set the global "event" variable, which is needed for the
        // function provided to fastaReadToFunction.
        event = myEvent;
        if (stFile_isDir(fileName)) {
            st_logInfo("Processing directory: %s\n", fileName);
            stList *filesInDir = stFile_getFileNamesInDirectory(fileName);
            for (int64_t i = 0; i < stList_length(filesInDir); i++) {
                char *absChildFileName = stFile_pathJoin(fileName, stList_get(filesInDir, i));
                assert(stFile_exists(absChildFileName));
                setCompleteStatus(absChildFileName); // decide if the sequences in the file should be free or attached.
                FILE *fileHandle = fopen(absChildFileName, "r");
                if (fileHandle == NULL) {
                    // An existing but unreadable entry must not reach the reader as a NULL stream.
                    st_errAbort("Could not open file for reading: %s\n", absChildFileName);
                }
                fastaReadToFunction(fileHandle, processSequence);
                fclose(fileHandle);
                free(absChildFileName);
            }
            stList_destruct(filesInDir);
        } else {
            st_logInfo("Processing file: %s\n", fileName);
            setCompleteStatus(fileName); // decide if the sequences in the file should be free or attached.
            FILE *fileHandle = fopen(fileName, "r");
            if (fileHandle == NULL) {
                // Existence was checked above, but the file may still be unreadable.
                st_errAbort("Could not open file for reading: %s\n", fileName);
            }
            fastaReadToFunction(fileHandle, processSequence);
            fclose(fileHandle);
        }
        (*j)++;
    }
}
/*
 * Comparison function used to enforce the order of children.
 *
 * Nodes are ordered by label first.  When the labels are equal and both
 * nodes have a node/component link, the tie is broken with malnComp_cmp on
 * the linked components; otherwise the nodes compare as equal.
 */
static int sortChildrenCmpFn(stTree *a, stTree *b) {
    int labelDiff = strcmp(stTree_getLabel(a), stTree_getLabel(b));
    if (labelDiff != 0) {
        return labelDiff;
    }

    // same names, sort by seq and location, if available
    struct mafTreeNodeCompLink *linkA = getNodeCompLink(a);
    struct mafTreeNodeCompLink *linkB = getNodeCompLink(b);
    if ((linkA == NULL) || (linkB == NULL)) {
        return labelDiff;
    }
    return malnComp_cmp(linkA->comp, linkB->comp);
}
/* assert sanity of nodeCompLink */ void mafTreeNodeCompLink_assert(struct mafTreeNodeCompLink *ncLink) { #ifndef NDEBUG if (ncLink != NULL) { assert(stString_eq(stTree_getLabel(ncLink->node), ncLink->comp->seq->orgSeqName)); } #endif }
/*
 * Clone a single node.
 *
 * Builds a fresh node with stTree_construct() and gives it the branch
 * length, client data value and label of `node`.  The client data is
 * whatever stTree_getClientData returns, handed over as-is.  Nothing here
 * touches the parent or the children of either node.
 */
stTree *stTree_cloneNode(stTree *node) {
    stTree *copy = stTree_construct();

    stTree_setBranchLength(copy, stTree_getBranchLength(node));
    stTree_setClientData(copy, stTree_getClientData(node));
    stTree_setLabel(copy, stTree_getLabel(node));

    return copy;
}
/*
 * Replaces the label of `tree`, and of every node below it, with the result
 * of passing that label through makeAlphaNumeric().
 */
void makeEventHeadersAlphaNumericFn(stTree *tree) {
    char *converted = makeAlphaNumeric(stTree_getLabel(tree));
    stTree_setLabel(tree, converted);
    // The temporary is released right after being set, so stTree_setLabel
    // presumably keeps its own copy.
    free(converted);

    int64_t childIndex = 0;
    while (childIndex < stTree_getChildNumber(tree)) {
        makeEventHeadersAlphaNumericFn(stTree_getChild(tree, childIndex));
        childIndex++;
    }
}
/*
 * Structural equality of two trees.
 *
 * Two nodes are equal when their branch lengths compare equal, their labels
 * are equal according to stString_eq, they have the same number of children,
 * and their children are pairwise equal in order.  Client data is not
 * compared.
 */
bool stTree_equals(stTree *tree1, stTree *tree2) {
    if (stTree_getBranchLength(tree1) != stTree_getBranchLength(tree2)
            || !stString_eq(stTree_getLabel(tree1), stTree_getLabel(tree2))) {
        return false;
    }

    int childCount = stTree_getChildNumber(tree1);
    if (stTree_getChildNumber(tree2) != childCount) {
        return false;
    }

    // Children are compared position by position; the first mismatch decides.
    for (int childIndex = 0; childIndex < childCount; childIndex++) {
        bool sameSubtree = stTree_equals(stTree_getChild(tree1, childIndex),
                                         stTree_getChild(tree2, childIndex));
        if (!sameSubtree) {
            return false;
        }
    }
    return true;
}
/*
 * Depth-first (children before parent) construction of the node/component
 * links.
 *
 * Each visited node consumes the next entry of treeComps, indexed by
 * *treeOrder, and *treeOrder is then advanced.  The link itself is built by
 * mafTreeNodeCompLink_construct, which presumably also records it in the
 * table and in the node's client data.  Aborts via errAbort when the
 * component's original sequence name differs from the node label.
 */
static void fillNodeCompLinksDFS(mafTree *mTree, stTree *node, int *treeOrder, struct malnComp *treeComps[]) {
    int childIndex = 0;
    while (childIndex < stTree_getChildNumber(node)) {
        fillNodeCompLinksDFS(mTree, stTree_getChild(node, childIndex), treeOrder, treeComps);
        childIndex++;
    }

    int order = *treeOrder;
    struct mafTreeNodeCompLink *link = mafTreeNodeCompLink_construct(order, node, treeComps[order]);
    (*treeOrder)++;

    if (sameString(link->comp->seq->orgSeqName, stTree_getLabel(node))) {
        return;
    }
    errAbort("tree component name \"%s\" doesn't match tree node name \"%s\"", link->comp->seq->orgSeqName, stTree_getLabel(node));
}
/*
 * Recursive dump of a subtree to fh, one node per line.
 *
 * Each line is indented by four spaces per level.  A node with a component
 * is printed with malnComp_prInfo; a node without one is printed as its
 * label.
 */
static void dumpSubtree(stTree *root, FILE *fh, int indent) {
    // "%*s" with an empty string emits exactly 4*indent spaces.
    fprintf(fh, "%*s", 4*indent, "");

    struct malnComp *nodeComp = getNodeComp(root);
    if (nodeComp != NULL) {
        malnComp_prInfo(nodeComp, fh);
    } else {
        fprintf(fh, "%s", stTree_getLabel(root));
    }
    fputc('\n', fh);

    int childIndex = 0;
    while (childIndex < stTree_getChildNumber(root)) {
        dumpSubtree(stTree_getChild(root, childIndex), fh, indent + 1);
        childIndex++;
    }
}
/*
 * Depth-first (children before parent) pass that sets or checks the tree
 * order after a join.
 *
 * *treeOrder is the running counter and is advanced once per node.  When
 * `check` is false the counter value is stored in the node's link; when it
 * is true the stored value must already equal the counter.  Aborts via
 * errAbort on an order mismatch, or when the component's original sequence
 * name differs from the node label.
 */
static void setCheckTreeOrderDFS(mafTree *mTree, stTree *node, bool check, int *treeOrder) {
    int childIndex = 0;
    while (childIndex < stTree_getChildNumber(node)) {
        setCheckTreeOrderDFS(mTree, stTree_getChild(node, childIndex), check, treeOrder);
        childIndex++;
    }

    struct mafTreeNodeCompLink *link = getNodeCompLink(node);
    if (!sameString(link->comp->seq->orgSeqName, stTree_getLabel(node))) {
        errAbort("tree component name \"%s\" doesn't match tree node name \"%s\"", link->comp->seq->orgSeqName, stTree_getLabel(node));
    }

    if (check) {
        if (link->treeOrder != *treeOrder) {
            errAbort("expected tree order (%d) doesn't match actual tree node order (%d) for \"%s\"", *treeOrder, link->treeOrder, stTree_getLabel(node));
        }
    } else {
        link->treeOrder = *treeOrder;
    }
    (*treeOrder)++;
}
/*
 * Join two trees at the specified nodes, returning the new root.
 *
 * At least one of the two join nodes must be parentless:
 *   - node2 parentless (node1 parentless or not): the result is
 *     joinTrees(root1, node1, node2, srcDestCompMap);
 *   - only node1 parentless: the result is
 *     joinTrees(root2, node2, node1, srcDestCompMap).
 * A parentless join node is asserted to be the root passed for its tree.
 * When both nodes have parents, errAbort is called.
 */
static stTree *joinAtNodes(stTree *root1, stTree *node1, stTree *root2, stTree *node2, struct malnCompCompMap *srcDestCompMap) {
    stTree *parent1 = stTree_getParent(node1);
    stTree *parent2 = stTree_getParent(node2);

    if ((parent1 != NULL) && (parent2 != NULL)) {
        errAbort("join nodes don't obey rules: node1: %s node2: %s", stTree_getLabel(node1), stTree_getLabel(node2));
        return NULL;
    }

    if (parent2 != NULL) {
        // Only node1 is parentless.
        assert(root1 == node1);
        return joinTrees(root2, node2, node1, srcDestCompMap);
    }

    // node2 is parentless; node1 may or may not be.
    if (parent1 == NULL) {
        assert((root1 == node1) && (root2 == node2));
    } else {
        assert(root2 == node2);
    }
    return joinTrees(root1, node1, node2, srcDestCompMap);
}
/*
 * Add genome objects as client data: every node of the species tree gets,
 * as its client data, the result of looking its label up in `genomes` with
 * genomesGetGenome().  The node itself is handled first, then its children.
 */
static void speciesTreeAddLinks(stTree *speciesNode, struct Genomes *genomes) {
    stTree_setClientData(speciesNode,
                         genomesGetGenome(genomes, stTree_getLabel(speciesNode)));

    int childIndex = 0;
    while (childIndex < stTree_getChildNumber(speciesNode)) {
        speciesTreeAddLinks(stTree_getChild(speciesNode, childIndex), genomes);
        childIndex++;
    }
}
/*
 * Clone the root of a tree for a subrange.
 *
 * subrangeCloneNode() does the cloning and may return NULL for the root, in
 * which case the subtrees it left in the pending list decide the outcome:
 * exactly one pending subtree becomes the new root, more than one is a fatal
 * error (errAbort), and none leaves the result NULL.
 */
static stTree *subrangeCloneRoot(stTree *srcRoot, struct malnCompCompMap *srcDestCompMap) {
    // If the root is deleted, there must be only one child (due to the way
    // the trees are constructed).
    stList *pendingSubtrees = stList_construct();
    stTree *destRoot = subrangeCloneNode(srcRoot, srcDestCompMap, pendingSubtrees);

    if (destRoot == NULL) {
        if (stList_length(pendingSubtrees) == 1) {
            destRoot = stList_pop(pendingSubtrees);
        } else if (stList_length(pendingSubtrees) > 1) {
            struct mafTreeNodeCompLink *srcLink = getNodeCompLink(srcRoot);
            errAbort("deleted tree root %s (component: %s:%d-%d/%c)) has more that one child", stTree_getLabel(srcRoot), srcLink->comp->seq->orgSeqName, srcLink->comp->start, srcLink->comp->end, srcLink->comp->strand);
        }
    }

    stList_destruct(pendingSubtrees);
    return destRoot;
}
int main(int argc, char *argv[]) { /* * Arguments/options */ char *logLevelString = NULL; char *mfaFile = NULL; char *outputFile = NULL; char *treeFile = NULL; /////////////////////////////////////////////////////////////////////////// // (0) Parse the inputs handed by genomeCactus.py / setup stuff. /////////////////////////////////////////////////////////////////////////// while (1) { static struct option long_options[] = { { "logLevel", required_argument, 0, 'a' }, { "mfaFile", required_argument, 0, 'b' }, { "outputFile", required_argument, 0, 'd' }, { "treeFile", optional_argument, 0, 't' }, { "help", no_argument, 0, 'h' }, { 0, 0, 0, 0 } }; int option_index = 0; int key = getopt_long(argc, argv, "a:b:d:t:h", long_options, &option_index); if (key == -1) { break; } switch (key) { case 'a': logLevelString = stString_copy(optarg); break; case 'b': mfaFile = stString_copy(optarg); break; case 'd': outputFile = stString_copy(optarg); break; case 't': treeFile = stString_copy(optarg); break; case 'h': usage(); return 0; default: usage(); return 1; } } /////////////////////////////////////////////////////////////////////////// // (0) Check the inputs. /////////////////////////////////////////////////////////////////////////// if (argc == 1) { usage(); exit(1); } assert(mfaFile != NULL); assert(outputFile != NULL); ////////////////////////////////////////////// //Set up logging ////////////////////////////////////////////// st_setLogLevelFromString(logLevelString); ////////////////////////////////////////////// //Log (some of) the inputs ////////////////////////////////////////////// st_logInfo("MFA file name : %s\n", mfaFile); st_logInfo("Output MAF file : %s\n", outputFile); st_logInfo("Tree file name: %s\n", treeFile == NULL ? 
"null" : treeFile); ////////////////////////////////////////////// //Get the MFA alignment ////////////////////////////////////////////// //get the alignment struct List *sequences = constructEmptyList(0, free); struct List *seqLengths = constructEmptyList(0, (void (*)(void *))destructInt); struct List *fastaNames = constructEmptyList(0, free); FILE *fileHandle = fopen(mfaFile, "r"); if (fileHandle == NULL) { usage(); exit(1); } fastaRead(fileHandle, sequences, seqLengths, fastaNames); fclose(fileHandle); ////////////////////////////////////////////// //Get the tree alignment ////////////////////////////////////////////// stTree *tree = NULL; LeafPtrArray *leafArray = NULL; int32_t leafCount = 0; if (treeFile != NULL) { tree = eTreeX_getTreeFromFile(treeFile); eTreeX_postOrderTraversal(tree, eTreeX_countLeaves, &leafCount); leafArray = eTreeX_constructLeafPtrArray(leafCount); eTreeX_postOrderTraversal(tree, eTreeX_getLeafArray, (void *) leafArray); } ////////////////////////////////////////////// //Write the MFA alignment. ////////////////////////////////////////////// fileHandle = fopen(outputFile, "w"); //write the header. 
fprintf(fileHandle, "##maf version=1 scoring=NULL\n"); fprintf(fileHandle, "# converted_from_MFA\n\n"); //write the score line char *treeString = NULL; if (treeFile != NULL) { treeString = stTree_getNewickTreeString(tree); fprintf(fileHandle, "a score=0 tree=\"%s\"\n", treeString); } else { fprintf(fileHandle, "a score=0\n"); leafCount = sequences->length; } //write the alignment int32_t i, j; int32_t ii; const char *label; for (ii=0; ii<leafCount; ii++) { if (treeFile != NULL) { label = stTree_getLabel((stTree *) leafArray->ptrArray[ii]); /* Do a brute force search to find the appropriate sequence that matches "label" */ for (i=0; i<sequences->length; i++) { char *fastaHeader = fastaNames->list[i]; char *sequenceName = st_malloc(sizeof(char) *(1 + strlen(fastaHeader))); sscanf(fastaHeader, "%s", sequenceName); //take the sequence name to be the first word of the sequence. if (strcmp(label, sequenceName) == 0) { free(sequenceName); break; } free(sequenceName); } } else { i = ii; } char *sequence = sequences->list[i]; int32_t seqLength = *((int32_t *)seqLengths->list[i]); assert(seqLength == (int32_t)strlen(sequence)); char *fastaHeader = fastaNames->list[i]; char *sequenceName = st_malloc(sizeof(char) *(1 + strlen(fastaHeader))); sscanf(fastaHeader, "%s", sequenceName); //take the sequence name to be the first word of the sequence. int32_t length = 0; for (j=0; j<(int32_t)strlen(sequence); j++) { if (sequence[j] != '-') { length++; } } fprintf(fileHandle, "s\t%s\t%i\t%i\t%s\t%i\t%s\n", sequenceName, 0, length, "+", length, sequence); free(sequenceName); } fclose(fileHandle); ////////////////////////////////////////////// //Clean up. ////////////////////////////////////////////// free(mfaFile); free(outputFile); free(treeFile); if (treeFile != NULL) { stTree_destruct(tree); free(treeString); eTreeX_destructLeafPtrArray(leafArray); } destructList(sequences); destructList(seqLengths); destructList(fastaNames); return 0; }