StateMachine *stateMachine3_construct(StateMachineType type) {
    StateMachine3 *sM3 = st_malloc(sizeof(StateMachine3));
    sM3->TRANSITION_MATCH_CONTINUE = -0.030064059121770816; //0.9703833696510062f
    sM3->TRANSITION_MATCH_FROM_GAP_X = -1.272871422049609; //1.0 - gapExtend - gapSwitch = 0.280026392297485
    sM3->TRANSITION_MATCH_FROM_GAP_Y = -1.272871422049609; //1.0 - gapExtend - gapSwitch = 0.280026392297485
    sM3->TRANSITION_GAP_OPEN_X = -4.21256642; //0.0129868352330243
    sM3->TRANSITION_GAP_OPEN_Y = -4.21256642; //0.0129868352330243
    sM3->TRANSITION_GAP_EXTEND_X = -0.3388262689231553; //0.7126062401851738f;
    sM3->TRANSITION_GAP_EXTEND_Y = -0.3388262689231553; //0.7126062401851738f;
    sM3->TRANSITION_GAP_SWITCH_TO_X = -4.910694825551255; //0.0073673675173412815f;
    sM3->TRANSITION_GAP_SWITCH_TO_Y = -4.910694825551255; //0.0073673675173412815f;
    emissions_setMatchProbsToDefaults(sM3->EMISSION_MATCH_PROBS);
    emissions_setGapProbsToDefaults(sM3->EMISSION_GAP_X_PROBS);
    emissions_setGapProbsToDefaults(sM3->EMISSION_GAP_Y_PROBS);
    if (type != threeState && type != threeStateAsymmetric) {
        st_errAbort("Tried to create a three state state-machine with the wrong type");
    }
    sM3->model.type = type;
    sM3->model.stateNumber = 3;
    sM3->model.matchState = match;
    sM3->model.startStateProb = stateMachine3_startStateProb;
    sM3->model.endStateProb = stateMachine3_endStateProb;
    sM3->model.raggedStartStateProb = stateMachine3_raggedStartStateProb;
    sM3->model.raggedEndStateProb = stateMachine3_raggedEndStateProb;
    sM3->model.cellCalculate = stateMachine3_cellCalculate;
    return (StateMachine *) sM3;
}
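/*
 * Added note (not in the original source): the hard-coded TRANSITION_* constants in
 * stateMachine3_construct() and stateMachine5_construct() are natural-log probabilities, and the
 * trailing comments give the intended linear-space values. A minimal sanity-check sketch,
 * assuming <math.h> and <assert.h> are already included by this file (log() and assert() are
 * used elsewhere in it); the function name is illustrative only.
 */
static void checkLogSpaceTransitionConstantsExample(void) {
    //e.g. exp(-0.030064059121770816) should recover the 0.9703833696510062 match-continue probability.
    assert(fabs(exp(-0.030064059121770816) - 0.9703833696510062) < 1e-4);
    assert(fabs(exp(-0.3388262689231553) - 0.7126062401851738) < 1e-4);
}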
Hmm *hmm_constructEmpty(double pseudoExpectation, StateMachineType type) {
    Hmm *hmm = st_malloc(sizeof(Hmm));
    hmm->type = type;
    switch (type) {
    case fiveState:
    case fiveStateAsymmetric:
        hmm->stateNumber = 5;
        break;
    case threeState:
    case threeStateAsymmetric:
        hmm->stateNumber = 3;
        break;
    default:
        st_errAbort("Unrecognised state type: %i\n", type);
    }
    hmm->transitions = st_malloc(hmm->stateNumber * hmm->stateNumber * sizeof(double));
    for (int64_t i = 0; i < hmm->stateNumber * hmm->stateNumber; i++) {
        hmm->transitions[i] = pseudoExpectation;
    }
    hmm->emissions = st_malloc(hmm->stateNumber * SYMBOL_NUMBER_NO_N * SYMBOL_NUMBER_NO_N * sizeof(double));
    for (int64_t i = 0; i < hmm->stateNumber * SYMBOL_NUMBER_NO_N * SYMBOL_NUMBER_NO_N; i++) {
        hmm->emissions[i] = pseudoExpectation;
    }
    hmm->likelihood = 0.0;
    return hmm;
}
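/*
 * Added sketch (an assumption, not the original accessor, which lives elsewhere): the transitions
 * array built by hmm_constructEmpty() is a flattened stateNumber x stateNumber matrix, and the
 * stateMachine*_load* functions below call hmm_getTransition() with (from-state, to-state)
 * arguments. The most natural layout for such a flat array is row-major by source state, roughly:
 */
static inline double hmm_getTransitionSketch(const Hmm *hmm, int64_t from, int64_t to) {
    //Row = source state, column = destination state; this indexing is assumed, not taken from the source.
    return hmm->transitions[from * hmm->stateNumber + to];
}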
void checkBranchLengthsAreDefined(stTree *tree) {
    if (isinf(stTree_getBranchLength(tree))) {
        st_errAbort("Got an undefined branch length in the input tree: %s.\n", stTree_getNewickTreeString(tree));
    }
    for (int64_t i = 0; i < stTree_getChildNumber(tree); i++) {
        checkBranchLengthsAreDefined(stTree_getChild(tree, i));
    }
}
static void assignEventsAndSequences(Event *parentEvent, stTree *tree, stSet *outgroupNameSet, char *argv[], int64_t *j) { Event *myEvent = NULL; // To distinguish from the global "event" variable. assert(tree != NULL); totalEventNumber++; if (stTree_getChildNumber(tree) > 0) { myEvent = event_construct3(stTree_getLabel(tree), stTree_getBranchLength(tree), parentEvent, eventTree); for (int64_t i = 0; i < stTree_getChildNumber(tree); i++) { assignEventsAndSequences(myEvent, stTree_getChild(tree, i), outgroupNameSet, argv, j); } } if (stTree_getChildNumber(tree) == 0 || (stTree_getLabel(tree) != NULL && (stSet_search(outgroupNameSet, (char *)stTree_getLabel(tree)) != NULL))) { // This event is a leaf and/or an outgroup, so it has // associated sequence. assert(stTree_getLabel(tree) != NULL); assert(stTree_getBranchLength(tree) != INFINITY); if (stTree_getChildNumber(tree) == 0) { // Construct the leaf event myEvent = event_construct3(stTree_getLabel(tree), stTree_getBranchLength(tree), parentEvent, eventTree); } char *fileName = argv[*j]; if (!stFile_exists(fileName)) { st_errAbort("File does not exist: %s\n", fileName); } // Set the global "event" variable, which is needed for the // function provided to fastaReadToFunction. event = myEvent; if (stFile_isDir(fileName)) { st_logInfo("Processing directory: %s\n", fileName); stList *filesInDir = stFile_getFileNamesInDirectory(fileName); for (int64_t i = 0; i < stList_length(filesInDir); i++) { char *absChildFileName = stFile_pathJoin(fileName, stList_get(filesInDir, i)); assert(stFile_exists(absChildFileName)); setCompleteStatus(absChildFileName); //decide if the sequences in the file should be free or attached. FILE *fileHandle = fopen(absChildFileName, "r"); fastaReadToFunction(fileHandle, processSequence); fclose(fileHandle); free(absChildFileName); } stList_destruct(filesInDir); } else { st_logInfo("Processing file: %s\n", fileName); setCompleteStatus(fileName); //decide if the sequences in the file should be free or attached. FILE *fileHandle = fopen(fileName, "r"); fastaReadToFunction(fileHandle, processSequence); fclose(fileHandle); } (*j)++; } }
Hmm *hmm_loadFromFile(const char *fileName) { FILE *fH = fopen(fileName, "r"); char *string = stFile_getLineFromFile(fH); stList *tokens = stString_split(string); if (stList_length(tokens) < 2) { st_errAbort("Got an empty line in the input state machine file %s\n", fileName); } int type; int64_t j = sscanf(stList_get(tokens, 0), "%i", &type); if (j != 1) { st_errAbort("Failed to parse state number (int) from string: %s\n", string); } Hmm *hmm = hmm_constructEmpty(0.0, type); if (stList_length(tokens) != hmm->stateNumber * hmm->stateNumber + 2) { st_errAbort( "Got the wrong number of transitions in the input state machine file %s, got %" PRIi64 " instead of %" PRIi64 "\n", fileName, stList_length(tokens), hmm->stateNumber * hmm->stateNumber + 2); } for (int64_t i = 0; i < hmm->stateNumber * hmm->stateNumber; i++) { j = sscanf(stList_get(tokens, i + 1), "%lf", &(hmm->transitions[i])); if (j != 1) { st_errAbort("Failed to parse transition prob (float) from string: %s\n", string); } } j = sscanf(stList_get(tokens, stList_length(tokens) - 1), "%lf", &(hmm->likelihood)); if (j != 1) { st_errAbort("Failed to parse likelihood (float) from string: %s\n", string); } //Cleanup transitions line free(string); stList_destruct(tokens); //Now parse the emissions line string = stFile_getLineFromFile(fH); tokens = stString_split(string); if (stList_length(tokens) != hmm->stateNumber * SYMBOL_NUMBER_NO_N * SYMBOL_NUMBER_NO_N) { st_errAbort( "Got the wrong number of emissions in the input state machine file %s, got %" PRIi64 " instead of %" PRIi64 "\n", fileName, stList_length(tokens), hmm->stateNumber * SYMBOL_NUMBER_NO_N * SYMBOL_NUMBER_NO_N); } for (int64_t i = 0; i < hmm->stateNumber * SYMBOL_NUMBER_NO_N * SYMBOL_NUMBER_NO_N; i++) { j = sscanf(stList_get(tokens, i), "%lf", &(hmm->emissions[i])); if (j != 1) { st_errAbort("Failed to parse emission prob (float) from string: %s\n", string); } } //Final cleanup free(string); stList_destruct(tokens); fclose(fH); return hmm; }
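/*
 * Added sketch, derived only from the parser above: hmm_loadFromFile() expects two
 * whitespace-separated lines, (1) the state machine type code, the stateNumber^2 transition
 * probabilities and the likelihood, then (2) the stateNumber * SYMBOL_NUMBER_NO_N^2 emission
 * probabilities. A hypothetical writer producing that layout (the real counterpart, if any,
 * lives elsewhere in the codebase):
 */
static void hmm_writeExample(Hmm *hmm, FILE *fileHandle) {
    fprintf(fileHandle, "%i ", (int) hmm->type); //First field of line one is the StateMachineType code.
    for (int64_t i = 0; i < hmm->stateNumber * hmm->stateNumber; i++) {
        fprintf(fileHandle, "%f ", hmm->transitions[i]);
    }
    fprintf(fileHandle, "%f\n", hmm->likelihood); //Last field of line one is the likelihood.
    for (int64_t i = 0; i < hmm->stateNumber * SYMBOL_NUMBER_NO_N * SYMBOL_NUMBER_NO_N; i++) {
        fprintf(fileHandle, "%f ", hmm->emissions[i]);
    }
    fprintf(fileHandle, "\n");
}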
static char *getStringFromDisk(FILE *fileHandle, int64_t name, int64_t start, int64_t length) {
    int64_t k = fseek(fileHandle, name + start, SEEK_SET);
    if (k != 0) {
        st_errAbort("Could not fseek to start of desired sequence: %" PRIi64 "\n", name + start);
    }
    char *string = st_malloc(sizeof(char) * (length + 1));
    int64_t bytesRead = fread(string, sizeof(char), length, fileHandle);
    if (bytesRead != length) {
        st_errAbort("Read only %" PRIi64 " bytes of string of length %" PRIi64 " when caching substrings from DB\n",
                bytesRead, length);
    }
    string[length] = '\0';
#ifndef NDEBUG
    for (int64_t j = 0; j < length; j++) {
        assert(string[j] != '>');
        assert(string[j] != ' ');
    }
#endif
    return string;
}
static stList *chooseAdjacencyPairing_externalProgram(stList *edges, int64_t nodeNumber, const char *programName) { /* * We create temp files to hold stuff. */ if(nodeNumber <= 1) { assert(stList_length(edges) == 0); return stList_construct(); } char *tempInputFile = getTempFile(), *tempOutputFile = getTempFile(); /* * We write the graph to a temp file. */ FILE *fileHandle = fopen(tempInputFile, "w"); if(strcmp(programName, "blossom5") == 0) { //Must be all connected as //generates perfect matchings. writeCliqueGraph(fileHandle, edges, nodeNumber, 1); } else { writeGraph(fileHandle, edges, nodeNumber); } fclose(fileHandle); /* * We run the external program. */ char *command = stString_print("%s -e %s -w %s >& /dev/null", programName, tempInputFile, tempOutputFile); int64_t i = st_system(command); if(i != 0) { st_errAbort("Something went wrong with the command: %s", command); //For some reason this causes a seg fault //stThrowNew(MATCHING_EXCEPTION, "Something went wrong with the command: %s", command); } free(command); /* * We get back the matching. */ fileHandle = fopen(tempOutputFile, "r"); stList *matching = readMatching(fileHandle, edges); fclose(fileHandle); st_logDebug("The adjacency matching for %" PRIi64 " nodes with %" PRIi64 " initial edges contains %" PRIi64 " edges\n", nodeNumber, stList_length(edges), stList_length(matching)); /* * Get rid of the temp files.. */ st_system("rm -rf %s %s", tempInputFile, tempOutputFile); free(tempInputFile); free(tempOutputFile); return matching; }
stSortedSet *loadEndAlignmentFromDisk(Flower *flower, FILE *fileHandle, End **end) {
    char *line = stFile_getLineFromFile(fileHandle);
    if (line == NULL) {
        *end = NULL;
        return NULL;
    }
    stSortedSet *endAlignment = stSortedSet_construct3((int (*)(const void *, const void *)) alignedPair_cmpFn,
            (void (*)(void *)) alignedPair_destruct);
    Name endName;
    int64_t lineNumber;
    int64_t i = sscanf(line, "%" PRIi64 " %" PRIi64 "", &endName, &lineNumber);
    if (i != 2 || lineNumber < 0) {
        st_errAbort("We encountered a mis-specified name in loading the first line of an end alignment from the disk: '%s'\n", line);
    }
    *end = flower_getEnd(flower, endName);
    if (*end == NULL) {
        st_errAbort("We encountered an end name that is not in the database: '%s'\n", line);
    }
    free(line); //Only free the header line after the checks above, which report it on error.
    for (int64_t j = 0; j < lineNumber; j++) {
        line = stFile_getLineFromFile(fileHandle);
        if (line == NULL) {
            st_errAbort("Got a null line when parsing an end alignment\n");
        }
        int64_t sI1, sI2;
        int64_t p1, st1, p2, st2, score1, score2;
        i = sscanf(line, "%" PRIi64 " %" PRIi64 " %" PRIi64 " %" PRIi64 " %" PRIi64 " %" PRIi64 " %" PRIi64 " %" PRIi64 "",
                &sI1, &p1, &st1, &score1, &sI2, &p2, &st2, &score2);
        if (i != 8) {
            st_errAbort("We encountered a mis-specified line in loading an end alignment from the disk: '%s'\n", line);
        }
        stSortedSet_insert(endAlignment, alignedPair_construct(sI1, p1, st1, sI2, p2, st2, score1, score2));
        free(line);
    }
    return endAlignment;
}
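/*
 * Added sketch, inferred purely from the sscanf calls in loadEndAlignmentFromDisk() above: each
 * end alignment block starts with a header line holding the end's name and the number of
 * aligned-pair lines that follow, and each pair line holds eight integers in the order read
 * above. A hypothetical writer for one pair line (the parameter names are illustrative only):
 */
static void writeAlignedPairLineExample(FILE *fileHandle, int64_t seq1, int64_t pos1, int64_t strand1,
        int64_t score1, int64_t seq2, int64_t pos2, int64_t strand2, int64_t score2) {
    //Field order mirrors the per-pair sscanf in loadEndAlignmentFromDisk().
    fprintf(fileHandle, "%" PRIi64 " %" PRIi64 " %" PRIi64 " %" PRIi64 " %" PRIi64 " %" PRIi64 " %" PRIi64 " %" PRIi64 "\n",
            seq1, pos1, strand1, score1, seq2, pos2, strand2, score2);
}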
int main(int argc, char *argv[]) { char *cactusDiskString = NULL; stKVDatabaseConf *kvDatabaseConf; CactusDisk *cactusDisk; Flower *flower; Flower_SequenceIterator *flowerIt; Sequence *sequence; struct option longopts[] = { {"cactusDisk", required_argument, NULL, 'c' }, {0, 0, 0, 0} }; int flag; while((flag = getopt_long(argc, argv, "", longopts, NULL)) != -1) { switch(flag) { case 'c': cactusDiskString = stString_copy(optarg); break; case '?': default: usage(); return 1; } } if (cactusDiskString == NULL) { st_errAbort("--cactusDisk option must be provided"); } kvDatabaseConf = stKVDatabaseConf_constructFromString(cactusDiskString); cactusDisk = cactusDisk_construct(kvDatabaseConf, 0); // Get top-level flower. flower = cactusDisk_getFlower(cactusDisk, 0); flowerIt = flower_getSequenceIterator(flower); while((sequence = flower_getNextSequence(flowerIt)) != NULL) { MetaSequence *metaSequence = sequence_getMetaSequence(sequence); const char *header; char *firstToken, *newHeader; stList *tokens; // Strip the ID token from the header (should be the first // |-separated token) and complain if there isn't one. header = metaSequence_getHeader(metaSequence); tokens = fastaDecodeHeader(header); assert(stList_length(tokens) > 1); firstToken = stList_removeFirst(tokens); assert(!strncmp(firstToken, "id=", 3)); free(firstToken); newHeader = fastaEncodeHeader(tokens); metaSequence_setHeader(metaSequence, newHeader); } cactusDisk_write(cactusDisk); }
static void stateMachine5_loadAsymmetric(StateMachine5 *sM5, Hmm *hmm) { if (hmm->type != fiveStateAsymmetric) { st_errAbort("Wrong hmm type"); } sM5->TRANSITION_MATCH_CONTINUE = log(hmm_getTransition(hmm, match, match)); //0.9703833696510062f sM5->TRANSITION_MATCH_FROM_SHORT_GAP_X = log(hmm_getTransition(hmm, shortGapX, match)); sM5->TRANSITION_MATCH_FROM_LONG_GAP_X = log(hmm_getTransition(hmm, longGapX, match)); sM5->TRANSITION_GAP_SHORT_OPEN_X = log(hmm_getTransition(hmm, match, shortGapX)); sM5->TRANSITION_GAP_SHORT_EXTEND_X = log(hmm_getTransition(hmm, shortGapX, shortGapX)); sM5->TRANSITION_GAP_SHORT_SWITCH_TO_X = log(hmm_getTransition(hmm, shortGapY, shortGapX)); sM5->TRANSITION_GAP_LONG_OPEN_X = log(hmm_getTransition(hmm, match, longGapX)); sM5->TRANSITION_GAP_LONG_EXTEND_X = log(hmm_getTransition(hmm, longGapX, longGapX)); sM5->TRANSITION_GAP_LONG_SWITCH_TO_X = log(hmm_getTransition(hmm, longGapY, longGapX)); if(sM5->TRANSITION_GAP_SHORT_EXTEND_X > sM5->TRANSITION_GAP_LONG_EXTEND_X) { //Switch the long and short gap parameters if one the "long states" have a smaller extend probability than the "short states", as can randomly happen during EM training. switchDoubles(&(sM5->TRANSITION_GAP_SHORT_EXTEND_X), &(sM5->TRANSITION_GAP_LONG_EXTEND_X)); switchDoubles(&(sM5->TRANSITION_MATCH_FROM_SHORT_GAP_X), &(sM5->TRANSITION_MATCH_FROM_LONG_GAP_X)); switchDoubles(&(sM5->TRANSITION_GAP_SHORT_OPEN_X), &(sM5->TRANSITION_GAP_LONG_OPEN_X)); switchDoubles(&(sM5->TRANSITION_GAP_SHORT_SWITCH_TO_X), &(sM5->TRANSITION_GAP_LONG_SWITCH_TO_X)); } sM5->TRANSITION_MATCH_FROM_SHORT_GAP_Y = log(hmm_getTransition(hmm, shortGapY, match)); sM5->TRANSITION_MATCH_FROM_LONG_GAP_Y = log(hmm_getTransition(hmm, longGapY, match)); sM5->TRANSITION_GAP_SHORT_OPEN_Y = log(hmm_getTransition(hmm, match, shortGapY)); sM5->TRANSITION_GAP_SHORT_EXTEND_Y = log(hmm_getTransition(hmm, shortGapY, shortGapY)); sM5->TRANSITION_GAP_SHORT_SWITCH_TO_Y = log(hmm_getTransition(hmm, shortGapX, shortGapY)); sM5->TRANSITION_GAP_LONG_OPEN_Y = log(hmm_getTransition(hmm, match, longGapY)); sM5->TRANSITION_GAP_LONG_EXTEND_Y = log(hmm_getTransition(hmm, longGapY, longGapY)); sM5->TRANSITION_GAP_LONG_SWITCH_TO_Y = log(hmm_getTransition(hmm, longGapX, longGapY)); if(sM5->TRANSITION_GAP_SHORT_EXTEND_Y > sM5->TRANSITION_GAP_LONG_EXTEND_Y) { //Switch the long and short gap parameters if one the "long states" have a smaller extend probability than the "short states", as can randomly happen during EM training. switchDoubles(&(sM5->TRANSITION_GAP_SHORT_EXTEND_Y), &(sM5->TRANSITION_GAP_LONG_EXTEND_Y)); switchDoubles(&(sM5->TRANSITION_MATCH_FROM_SHORT_GAP_Y), &(sM5->TRANSITION_MATCH_FROM_LONG_GAP_Y)); switchDoubles(&(sM5->TRANSITION_GAP_SHORT_OPEN_Y), &(sM5->TRANSITION_GAP_LONG_OPEN_Y)); switchDoubles(&(sM5->TRANSITION_GAP_SHORT_SWITCH_TO_Y), &(sM5->TRANSITION_GAP_LONG_SWITCH_TO_Y)); } emissions_loadMatchProbs(sM5->EMISSION_MATCH_PROBS, hmm, match); int64_t xGapStates[2] = { shortGapX, longGapX }; int64_t yGapStates[2] = { shortGapY, longGapY }; emissions_loadGapProbs(sM5->EMISSION_GAP_X_PROBS, hmm, xGapStates, 2, NULL, 0); emissions_loadGapProbs(sM5->EMISSION_GAP_Y_PROBS, hmm, NULL, 0, yGapStates, 2); }
static void stateMachine5_loadSymmetric(StateMachine5 *sM5, Hmm *hmm) { if (hmm->type != fiveState) { st_errAbort("Wrong hmm type"); } sM5->TRANSITION_MATCH_CONTINUE = log(hmm_getTransition(hmm, match, match)); //0.9703833696510062f sM5->TRANSITION_MATCH_FROM_SHORT_GAP_X = log( (hmm_getTransition(hmm, shortGapX, match) + hmm_getTransition(hmm, shortGapY, match)) / 2); //1.0 - gapExtend - gapSwitch = 0.280026392297485 sM5->TRANSITION_MATCH_FROM_LONG_GAP_X = log( (hmm_getTransition(hmm, longGapX, match) + hmm_getTransition(hmm, longGapY, match)) / 2); //1.0 - gapExtend = 0.00343657420938 sM5->TRANSITION_GAP_SHORT_OPEN_X = log( (hmm_getTransition(hmm, match, shortGapX) + hmm_getTransition(hmm, match, shortGapY)) / 2); //0.0129868352330243 sM5->TRANSITION_GAP_SHORT_EXTEND_X = log( (hmm_getTransition(hmm, shortGapX, shortGapX) + hmm_getTransition(hmm, shortGapY, shortGapY)) / 2); //0.7126062401851738f; sM5->TRANSITION_GAP_SHORT_SWITCH_TO_X = log( (hmm_getTransition(hmm, shortGapX, shortGapY) + hmm_getTransition(hmm, shortGapY, shortGapX)) / 2); //0.0073673675173412815f; sM5->TRANSITION_GAP_LONG_OPEN_X = log( (hmm_getTransition(hmm, match, longGapX) + hmm_getTransition(hmm, match, longGapY)) / 2); //(1.0 - match - 2*gapOpenShort)/2 = 0.001821479941473 sM5->TRANSITION_GAP_LONG_EXTEND_X = log( (hmm_getTransition(hmm, longGapX, longGapX) + hmm_getTransition(hmm, longGapY, longGapY)) / 2); sM5->TRANSITION_GAP_LONG_SWITCH_TO_X = log( (hmm_getTransition(hmm, longGapX, longGapY) + hmm_getTransition(hmm, longGapY, longGapX)) / 2); //0.0073673675173412815f; if(sM5->TRANSITION_GAP_SHORT_EXTEND_X > sM5->TRANSITION_GAP_LONG_EXTEND_X) { //Switch the long and short gap parameters if one the "long states" have a smaller extend probability than the "short states", as can randomly happen during EM training. switchDoubles(&(sM5->TRANSITION_GAP_SHORT_EXTEND_X), &(sM5->TRANSITION_GAP_LONG_EXTEND_X)); switchDoubles(&(sM5->TRANSITION_MATCH_FROM_SHORT_GAP_X), &(sM5->TRANSITION_MATCH_FROM_LONG_GAP_X)); switchDoubles(&(sM5->TRANSITION_GAP_SHORT_OPEN_X), &(sM5->TRANSITION_GAP_LONG_OPEN_X)); switchDoubles(&(sM5->TRANSITION_GAP_SHORT_SWITCH_TO_X), &(sM5->TRANSITION_GAP_LONG_SWITCH_TO_X)); } sM5->TRANSITION_MATCH_FROM_SHORT_GAP_Y = sM5->TRANSITION_MATCH_FROM_SHORT_GAP_X; sM5->TRANSITION_MATCH_FROM_LONG_GAP_Y = sM5->TRANSITION_MATCH_FROM_LONG_GAP_X; sM5->TRANSITION_GAP_SHORT_OPEN_Y = sM5->TRANSITION_GAP_SHORT_OPEN_X; sM5->TRANSITION_GAP_SHORT_EXTEND_Y = sM5->TRANSITION_GAP_SHORT_EXTEND_X; sM5->TRANSITION_GAP_SHORT_SWITCH_TO_Y = sM5->TRANSITION_GAP_SHORT_SWITCH_TO_X; sM5->TRANSITION_GAP_LONG_OPEN_Y = sM5->TRANSITION_GAP_LONG_OPEN_X; sM5->TRANSITION_GAP_LONG_EXTEND_Y = sM5->TRANSITION_GAP_LONG_EXTEND_X; sM5->TRANSITION_GAP_LONG_SWITCH_TO_Y = sM5->TRANSITION_GAP_LONG_SWITCH_TO_X; emissions_loadMatchProbsSymmetrically(sM5->EMISSION_MATCH_PROBS, hmm, match); int64_t xGapStates[2] = { shortGapX, longGapX }; int64_t yGapStates[2] = { shortGapY, longGapY }; emissions_loadGapProbs(sM5->EMISSION_GAP_X_PROBS, hmm, xGapStates, 2, yGapStates, 2); emissions_loadGapProbs(sM5->EMISSION_GAP_Y_PROBS, hmm, xGapStates, 2, yGapStates, 2); }
static void stateMachine3_loadAsymmetric(StateMachine3 *sM3, Hmm *hmm) { if (hmm->type != threeStateAsymmetric) { st_errAbort("Wrong hmm type"); } sM3->TRANSITION_MATCH_CONTINUE = log(hmm_getTransition(hmm, match, match)); sM3->TRANSITION_MATCH_FROM_GAP_X = log(hmm_getTransition(hmm, shortGapX, match)); sM3->TRANSITION_MATCH_FROM_GAP_Y = log(hmm_getTransition(hmm, shortGapY, match)); sM3->TRANSITION_GAP_OPEN_X = log(hmm_getTransition(hmm, match, shortGapX)); sM3->TRANSITION_GAP_OPEN_Y = log(hmm_getTransition(hmm, match, shortGapY)); sM3->TRANSITION_GAP_EXTEND_X = log(hmm_getTransition(hmm, shortGapX, shortGapX)); sM3->TRANSITION_GAP_EXTEND_Y = log(hmm_getTransition(hmm, shortGapY, shortGapY)); sM3->TRANSITION_GAP_SWITCH_TO_X = log(hmm_getTransition(hmm, shortGapY, shortGapX)); sM3->TRANSITION_GAP_SWITCH_TO_Y = log(hmm_getTransition(hmm, shortGapX, shortGapY)); emissions_loadMatchProbs(sM3->EMISSION_MATCH_PROBS, hmm, match); int64_t xGapStates[1] = { shortGapX }; int64_t yGapStates[1] = { shortGapY }; emissions_loadGapProbs(sM3->EMISSION_GAP_X_PROBS, hmm, xGapStates, 1, NULL, 0); emissions_loadGapProbs(sM3->EMISSION_GAP_Y_PROBS, hmm, NULL, 0, yGapStates, 1); }
StateMachine *stateMachine5_construct(StateMachineType type) {
    StateMachine5 *sM5 = st_malloc(sizeof(StateMachine5));
    sM5->TRANSITION_MATCH_CONTINUE = -0.030064059121770816; //0.9703833696510062f
    sM5->TRANSITION_MATCH_FROM_SHORT_GAP_X = -1.272871422049609; //1.0 - gapExtend - gapSwitch = 0.280026392297485
    sM5->TRANSITION_MATCH_FROM_LONG_GAP_X = -5.673280173170473; //1.0 - gapExtend = 0.00343657420938
    sM5->TRANSITION_GAP_SHORT_OPEN_X = -4.34381910900448; //0.0129868352330243
    sM5->TRANSITION_GAP_SHORT_EXTEND_X = -0.3388262689231553; //0.7126062401851738f;
    sM5->TRANSITION_GAP_SHORT_SWITCH_TO_X = -4.910694825551255; //0.0073673675173412815f;
    sM5->TRANSITION_GAP_LONG_OPEN_X = -6.30810595366929; //(1.0 - match - 2*gapOpenShort)/2 = 0.001821479941473
    sM5->TRANSITION_GAP_LONG_EXTEND_X = -0.003442492794189331; //0.99656342579062f;
    sM5->TRANSITION_GAP_LONG_SWITCH_TO_X = -6.30810595366929; //0.001821479941473
    sM5->TRANSITION_MATCH_FROM_SHORT_GAP_Y = sM5->TRANSITION_MATCH_FROM_SHORT_GAP_X;
    sM5->TRANSITION_MATCH_FROM_LONG_GAP_Y = sM5->TRANSITION_MATCH_FROM_LONG_GAP_X;
    sM5->TRANSITION_GAP_SHORT_OPEN_Y = sM5->TRANSITION_GAP_SHORT_OPEN_X;
    sM5->TRANSITION_GAP_SHORT_EXTEND_Y = sM5->TRANSITION_GAP_SHORT_EXTEND_X;
    sM5->TRANSITION_GAP_SHORT_SWITCH_TO_Y = sM5->TRANSITION_GAP_SHORT_SWITCH_TO_X;
    sM5->TRANSITION_GAP_LONG_OPEN_Y = sM5->TRANSITION_GAP_LONG_OPEN_X;
    sM5->TRANSITION_GAP_LONG_EXTEND_Y = sM5->TRANSITION_GAP_LONG_EXTEND_X;
    sM5->TRANSITION_GAP_LONG_SWITCH_TO_Y = sM5->TRANSITION_GAP_LONG_SWITCH_TO_X;
    emissions_setMatchProbsToDefaults(sM5->EMISSION_MATCH_PROBS);
    emissions_setGapProbsToDefaults(sM5->EMISSION_GAP_X_PROBS);
    emissions_setGapProbsToDefaults(sM5->EMISSION_GAP_Y_PROBS);
    if (type != fiveState && type != fiveStateAsymmetric) {
        st_errAbort("Tried to create a five state state-machine with the wrong type: %i", type);
    }
    sM5->model.type = type;
    sM5->model.stateNumber = 5;
    sM5->model.matchState = match;
    sM5->model.startStateProb = stateMachine5_startStateProb;
    sM5->model.endStateProb = stateMachine5_endStateProb;
    sM5->model.raggedStartStateProb = stateMachine5_raggedStartStateProb;
    sM5->model.raggedEndStateProb = stateMachine5_raggedEndStateProb;
    sM5->model.cellCalculate = stateMachine5_cellCalculate;
    return (StateMachine *) sM5;
}
int main(int argc, char *argv[]) { int64_t j = 0; char *npReadFile = NULL; char *templateModelFile = stString_print("../models/testModelR9_template.model"); char *complementModelFile = stString_print("../models/testModelR9_complement_pop2.model"); double threshold = 0.8; int key; while (1) { static struct option long_options[] = { {"help", no_argument, 0, 'h'}, {"templateModel", required_argument, 0, 'T'}, {"complementModel", required_argument, 0, 'C'}, {"npRead", required_argument, 0, 'q'}, {"threshold", required_argument, 0, 'D'}, {0, 0, 0, 0} }; int option_index = 0; key = getopt_long(argc, argv, "h:T:C:q:f:b:D:m:", long_options, &option_index); if (key == -1) { //usage(); break; } switch (key) { case 'h': usage(); return 1; case 'T': templateModelFile = stString_copy(optarg); break; case 'C': complementModelFile = stString_copy(optarg); break; case 'q': npReadFile = stString_copy(optarg); break; case 'D': j = sscanf(optarg, "%lf", &threshold); assert (j == 1); assert (threshold >= 0); break; default: usage(); return 1; } } if (!stFile_exists(npReadFile)) { st_errAbort("Could not find npRead here: %s\n", npReadFile); } // read in the .npRead file NanoporeRead *npRead = nanopore_loadNanoporeReadFromFile(npReadFile); // build state machines (to use the look up table) StateMachine *sMt = getStateMachine3(templateModelFile); //StateMachine *sMc = getStateMachine3(complementModelFile); // make 1D map of events (mean, noise) to kmers stList *templateMap = signalUtils_templateOneDAssignmentsFromRead(npRead, sMt, ASSIGNMENT_THRESHOLD); //stList *complementMap = signalUtils_complementOneDAssignmentsFromRead(npRead, sMc, ASSIGNMENT_THRESHOLD); // convert template to log normal // NB only need this if you're estimating the NOISE parameteres //nanopore_convert_to_lognormal_params(sMt->alphabetSize, sMt->kmerLength, sMt->EMISSION_MATCH_MATRIX, templateMap); // convert complement to log normal //nanopore_convert_to_lognormal_params(sMc->alphabetSize, sMc->kmerLength, sMc->EMISSION_MATCH_MATRIX, complementMap); // error log report st_uglyf("SENTINEL - Before: shift: %f scale: %f var: %f [template]\n", npRead->templateParams.shift, npRead->templateParams.scale, npRead->templateParams.var); // compute template params //nanopore_compute_noise_scale_params(sMt->EMISSION_MATCH_MATRIX, templateMap, &npRead->templateParams); // compute complement params //nanopore_compute_noise_scale_params(sMc->EMISSION_MATCH_MATRIX, complementMap, &npRead->complementParams); // error log report signalUtils_estimateNanoporeParams(sMt, npRead, &npRead->templateParams, ASSIGNMENT_THRESHOLD, signalUtils_templateOneDAssignmentsFromRead, nanopore_dontAdjustEvents); //signalUtils_estimateNanoporeParams(sMc, npRead, &npRead->complementParams, ASSIGNMENT_THRESHOLD, // signalUtils_complementOneDAssignmentsFromRead, nanopore_dontAdjustEvents); st_uglyf("SENTINEL - After: shift: %f scale: %f var: %f [template]\n", npRead->templateParams.shift, npRead->templateParams.scale, npRead->templateParams.var); //st_uglyf("SENTINEL - After: shift_sd: %f scale_sd: %f var_sd: %f [template]\n", // npRead->complementParams.shift_sd, npRead->complementParams.scale_sd, npRead->complementParams.var_sd); stList *templateKmers = lineTokensFromFile(npReadFile, 10); //stList *complementKmers = lineTokensFromFile(npReadFile, 12); //printEventNoisesAndParams(npRead, templateKmers, complementKmers); printEventMeansAndParams(npRead, templateKmers, NULL); stList_destruct(templateKmers); //stList_destruct(complementKmers); stList_destruct(templateMap); 
//stList_destruct(complementMap); nanopore_nanoporeReadDestruct(npRead); stateMachine_destruct(sMt); //stateMachine_destruct(sMc); (void) j; // silence unused variable warning. return 0; }
int main(int argc, char *argv[]) { /* * Script for adding a reference genome to a flower. */ /* * Arguments/options */ char * logLevelString = NULL; char * cactusDiskDatabaseString = NULL; char * secondaryDatabaseString = NULL; char *referenceEventString = (char *) cactusMisc_getDefaultReferenceEventHeader(); bool bottomUpPhase = 0; /////////////////////////////////////////////////////////////////////////// // (0) Parse the inputs handed by genomeCactus.py / setup stuff. /////////////////////////////////////////////////////////////////////////// while (1) { static struct option long_options[] = { { "logLevel", required_argument, 0, 'a' }, { "cactusDisk", required_argument, 0, 'b' }, { "secondaryDisk", required_argument, 0, 'd' }, { "referenceEventString", required_argument, 0, 'g' }, { "help", no_argument, 0, 'h' }, { "bottomUpPhase", no_argument, 0, 'j' }, { 0, 0, 0, 0 } }; int option_index = 0; int key = getopt_long(argc, argv, "a:b:c:d:e:g:hi:j", long_options, &option_index); if (key == -1) { break; } switch (key) { case 'a': logLevelString = stString_copy(optarg); break; case 'b': cactusDiskDatabaseString = stString_copy(optarg); break; case 'd': secondaryDatabaseString = stString_copy(optarg); break; case 'g': referenceEventString = stString_copy(optarg); break; case 'h': usage(); return 0; case 'j': bottomUpPhase = 1; break; default: usage(); return 1; } } /////////////////////////////////////////////////////////////////////////// // (0) Check the inputs. /////////////////////////////////////////////////////////////////////////// assert(cactusDiskDatabaseString != NULL); ////////////////////////////////////////////// //Set up logging ////////////////////////////////////////////// st_setLogLevelFromString(logLevelString); ////////////////////////////////////////////// //Load the database ////////////////////////////////////////////// st_logInfo("referenceEventString = %s\n", referenceEventString); st_logInfo("bottomUpPhase = %i\n", bottomUpPhase); stKVDatabaseConf *kvDatabaseConf = stKVDatabaseConf_constructFromString(cactusDiskDatabaseString); CactusDisk *cactusDisk = cactusDisk_construct(kvDatabaseConf, false, true); stKVDatabaseConf_destruct(kvDatabaseConf); st_logInfo("Set up the flower disk\n"); stKVDatabase *sequenceDatabase = NULL; if (secondaryDatabaseString != NULL) { kvDatabaseConf = stKVDatabaseConf_constructFromString(secondaryDatabaseString); sequenceDatabase = stKVDatabase_construct(kvDatabaseConf, 0); stKVDatabaseConf_destruct(kvDatabaseConf); } FlowerStream *flowerStream = flowerWriter_getFlowerStream(cactusDisk, stdin); Flower *flower; while ((flower = flowerStream_getNext(flowerStream)) != NULL) { st_logDebug("Processing flower %" PRIi64 "\n", flower_getName(flower)); /////////////////////////////////////////////////////////////////////////// // Get the appropriate event names /////////////////////////////////////////////////////////////////////////// st_logInfo("%s\n", eventTree_makeNewickString(flower_getEventTree(flower))); Event *referenceEvent = eventTree_getEventByHeader(flower_getEventTree(flower), referenceEventString); if (referenceEvent == NULL) { st_errAbort("Reference event %s not found in tree. 
Check your " "--referenceEventString option", referenceEventString); } Name referenceEventName = event_getName(referenceEvent); /////////////////////////////////////////////////////////////////////////// // Now do bottom up or top down, depending /////////////////////////////////////////////////////////////////////////// stList *flowers = stList_construct(); stList_append(flowers, flower); preCacheNestedFlowers(cactusDisk, flowers); if (bottomUpPhase) { assert(sequenceDatabase != NULL); cactusDisk_preCacheSegmentStrings(cactusDisk, flowers); bottomUp(flowers, sequenceDatabase, referenceEventName, !flower_hasParentGroup(flower), generateJukesCantorMatrix); // Unload the nested flowers to save memory. They haven't // been changed, so we don't write them to the cactus // disk. Flower_GroupIterator *groupIt = flower_getGroupIterator(flower); Group *group; while ((group = flower_getNextGroup(groupIt)) != NULL) { if (!group_isLeaf(group)) { flower_unload(group_getNestedFlower(group)); } } flower_destructGroupIterator(groupIt); assert(!flower_isParentLoaded(flower)); // Write this flower to disk. cactusDisk_addUpdateRequest(cactusDisk, flower); } else { topDown(flower, referenceEventName); // We've changed the nested flowers, but not this // flower. We write the nested flowers to disk, then // unload them to save memory. This flower will be // unloaded by the flower-stream code. Flower_GroupIterator *groupIt = flower_getGroupIterator(flower); Group *group; while ((group = flower_getNextGroup(groupIt)) != NULL) { if (!group_isLeaf(group)) { cactusDisk_addUpdateRequest(cactusDisk, group_getNestedFlower(group)); flower_unload(group_getNestedFlower(group)); } } flower_destructGroupIterator(groupIt); } stList_destruct(flowers); } /////////////////////////////////////////////////////////////////////////// // Write the flower(s) back to disk. /////////////////////////////////////////////////////////////////////////// cactusDisk_write(cactusDisk); st_logInfo("Updated the flower on disk\n"); /////////////////////////////////////////////////////////////////////////// //Clean up. /////////////////////////////////////////////////////////////////////////// if (sequenceDatabase != NULL) { stKVDatabase_destruct(sequenceDatabase); } cactusDisk_destruct(cactusDisk); return 0; //Exit without clean up is quicker, enable cleanup when doing memory leak detection. free(cactusDiskDatabaseString); free(referenceEventString); free(logLevelString); st_logInfo("Cleaned stuff up and am finished\n"); return 0; }
int main(int argc, char *argv[]) { /* * Script for adding alignments to cactus tree. */ int64_t startTime; stKVDatabaseConf *kvDatabaseConf; CactusDisk *cactusDisk; int key, k; bool (*filterFn)(stPinchSegment *, stPinchSegment *) = NULL; stSet *outgroupThreads = NULL; /* * Arguments/options */ char * logLevelString = NULL; char * alignmentsFile = NULL; char * constraintsFile = NULL; char * cactusDiskDatabaseString = NULL; char * lastzArguments = ""; int64_t minimumSequenceLengthForBlast = 1; //Parameters for annealing/melting rounds int64_t *annealingRounds = NULL; int64_t annealingRoundsLength = 0; int64_t *meltingRounds = NULL; int64_t meltingRoundsLength = 0; //Parameters for melting float maximumAdjacencyComponentSizeRatio = 10; int64_t blockTrim = 0; int64_t alignmentTrimLength = 0; int64_t *alignmentTrims = NULL; int64_t chainLengthForBigFlower = 1000000; int64_t longChain = 2; int64_t minLengthForChromosome = 1000000; float proportionOfUnalignedBasesForNewChromosome = 0.8; bool breakChainsAtReverseTandems = 1; int64_t maximumMedianSequenceLengthBetweenLinkedEnds = INT64_MAX; bool realign = 0; char *realignArguments = ""; bool removeRecoverableChains = false; bool (*recoverableChainsFilter)(stCactusEdgeEnd *, Flower *) = NULL; int64_t maxRecoverableChainsIterations = 1; int64_t maxRecoverableChainLength = INT64_MAX; //Parameters for removing ancient homologies bool doPhylogeny = false; int64_t phylogenyNumTrees = 1; enum stCaf_RootingMethod phylogenyRootingMethod = BEST_RECON; enum stCaf_ScoringMethod phylogenyScoringMethod = COMBINED_LIKELIHOOD; double breakpointScalingFactor = 1.0; bool phylogenySkipSingleCopyBlocks = 0; int64_t phylogenyMaxBaseDistance = 1000; int64_t phylogenyMaxBlockDistance = 100; bool phylogenyKeepSingleDegreeBlocks = 0; stList *phylogenyTreeBuildingMethods = stList_construct(); enum stCaf_TreeBuildingMethod defaultMethod = GUIDED_NEIGHBOR_JOINING; stList_append(phylogenyTreeBuildingMethods, &defaultMethod); double phylogenyCostPerDupPerBase = 0.2; double phylogenyCostPerLossPerBase = 0.2; const char *debugFileName = NULL; const char *referenceEventHeader = NULL; double phylogenyDoSplitsWithSupportHigherThanThisAllAtOnce = 1.0; int64_t numTreeBuildingThreads = 2; int64_t minimumBlockDegreeToCheckSupport = 10; double minimumBlockHomologySupport = 0.7; double nucleotideScalingFactor = 1.0; HomologyUnitType phylogenyHomologyUnitType = BLOCK; enum stCaf_DistanceCorrectionMethod phylogenyDistanceCorrectionMethod = JUKES_CANTOR; bool sortAlignments = false; /////////////////////////////////////////////////////////////////////////// // (0) Parse the inputs handed by genomeCactus.py / setup stuff. 
/////////////////////////////////////////////////////////////////////////// while (1) { static struct option long_options[] = { { "logLevel", required_argument, 0, 'a' }, { "alignments", required_argument, 0, 'b' }, { "cactusDisk", required_argument, 0, 'c' }, { "lastzArguments", required_argument, 0, 'd' }, { "help", no_argument, 0, 'h' }, { "annealingRounds", required_argument, 0, 'i' }, { "trim", required_argument, 0, 'k' }, { "trimChange", required_argument, 0, 'l', }, { "minimumTreeCoverage", required_argument, 0, 'm' }, { "blockTrim", required_argument, 0, 'n' }, { "deannealingRounds", required_argument, 0, 'o' }, { "minimumDegree", required_argument, 0, 'p' }, { "minimumIngroupDegree", required_argument, 0, 'q' }, { "minimumOutgroupDegree", required_argument, 0, 'r' }, { "alignmentFilter", required_argument, 0, 't' }, { "minimumSequenceLengthForBlast", required_argument, 0, 'v' }, { "maxAdjacencyComponentSizeRatio", required_argument, 0, 'w' }, { "constraints", required_argument, 0, 'x' }, { "minLengthForChromosome", required_argument, 0, 'y' }, { "proportionOfUnalignedBasesForNewChromosome", required_argument, 0, 'z' }, { "maximumMedianSequenceLengthBetweenLinkedEnds", required_argument, 0, 'A' }, { "realign", no_argument, 0, 'B' }, { "realignArguments", required_argument, 0, 'C' }, { "phylogenyNumTrees", required_argument, 0, 'D' }, { "phylogenyRootingMethod", required_argument, 0, 'E' }, { "phylogenyScoringMethod", required_argument, 0, 'F' }, { "phylogenyBreakpointScalingFactor", required_argument, 0, 'G' }, { "phylogenySkipSingleCopyBlocks", no_argument, 0, 'H' }, { "phylogenyMaxBaseDistance", required_argument, 0, 'I' }, { "phylogenyMaxBlockDistance", required_argument, 0, 'J' }, { "phylogenyDebugFile", required_argument, 0, 'K' }, { "phylogenyKeepSingleDegreeBlocks", no_argument, 0, 'L' }, { "phylogenyTreeBuildingMethod", required_argument, 0, 'M' }, { "phylogenyCostPerDupPerBase", required_argument, 0, 'N' }, { "phylogenyCostPerLossPerBase", required_argument, 0, 'O' }, { "referenceEventHeader", required_argument, 0, 'P' }, { "phylogenyDoSplitsWithSupportHigherThanThisAllAtOnce", required_argument, 0, 'Q' }, { "numTreeBuildingThreads", required_argument, 0, 'R' }, { "phylogeny", no_argument, 0, 'S' }, { "minimumBlockHomologySupport", required_argument, 0, 'T' }, { "phylogenyNucleotideScalingFactor", required_argument, 0, 'U' }, { "minimumBlockDegreeToCheckSupport", required_argument, 0, 'V' }, { "removeRecoverableChains", required_argument, 0, 'W' }, { "minimumNumberOfSpecies", required_argument, 0, 'X' }, { "phylogenyHomologyUnitType", required_argument, 0, 'Y' }, { "phylogenyDistanceCorrectionMethod", required_argument, 0, 'Z' }, { "maxRecoverableChainsIterations", required_argument, 0, '1' }, { "maxRecoverableChainLength", required_argument, 0, '2' }, { 0, 0, 0, 0 } }; int option_index = 0; key = getopt_long(argc, argv, "a:b:c:hi:k:m:n:o:p:q:r:stv:w:x:y:z:A:BC:D:E:", long_options, &option_index); if (key == -1) { break; } switch (key) { case 'a': logLevelString = stString_copy(optarg); st_setLogLevelFromString(logLevelString); break; case 'b': alignmentsFile = stString_copy(optarg); break; case 'c': cactusDiskDatabaseString = stString_copy(optarg); break; case 'd': lastzArguments = stString_copy(optarg); break; case 'h': usage(); return 0; case 'i': annealingRounds = getInts(optarg, &annealingRoundsLength); break; case 'o': meltingRounds = getInts(optarg, &meltingRoundsLength); break; case 'k': alignmentTrims = getInts(optarg, &alignmentTrimLength); break; case 'm': k = 
sscanf(optarg, "%f", &minimumTreeCoverage); assert(k == 1); break; case 'n': k = sscanf(optarg, "%" PRIi64 "", &blockTrim); assert(k == 1); break; case 'p': k = sscanf(optarg, "%" PRIi64 "", &minimumDegree); assert(k == 1); break; case 'q': k = sscanf(optarg, "%" PRIi64 "", &minimumIngroupDegree); assert(k == 1); break; case 'r': k = sscanf(optarg, "%" PRIi64 "", &minimumOutgroupDegree); assert(k == 1); break; case 't': if (strcmp(optarg, "singleCopyOutgroup") == 0) { sortAlignments = true; filterFn = stCaf_filterByOutgroup; } else if (strcmp(optarg, "relaxedSingleCopyOutgroup") == 0) { sortAlignments = true; filterFn = stCaf_relaxedFilterByOutgroup; } else if (strcmp(optarg, "singleCopy") == 0) { sortAlignments = true; filterFn = stCaf_filterByRepeatSpecies; } else if (strcmp(optarg, "relaxedSingleCopy") == 0) { sortAlignments = true; filterFn = stCaf_relaxedFilterByRepeatSpecies; } else if (strcmp(optarg, "singleCopyChr") == 0) { sortAlignments = true; filterFn = stCaf_singleCopyChr; } else if (strcmp(optarg, "singleCopyIngroup") == 0) { sortAlignments = true; filterFn = stCaf_singleCopyIngroup; } else if (strcmp(optarg, "relaxedSingleCopyIngroup") == 0) { sortAlignments = true; filterFn = stCaf_relaxedSingleCopyIngroup; } else if (strcmp(optarg, "none") == 0) { sortAlignments = false; filterFn = NULL; } else { st_errAbort("Could not recognize alignmentFilter option %s", optarg); } break; case 'v': k = sscanf(optarg, "%" PRIi64 "", &minimumSequenceLengthForBlast); assert(k == 1); break; case 'w': k = sscanf(optarg, "%f", &maximumAdjacencyComponentSizeRatio); assert(k == 1); break; case 'x': constraintsFile = stString_copy(optarg); break; case 'y': k = sscanf(optarg, "%" PRIi64 "", &minLengthForChromosome); assert(k == 1); break; case 'z': k = sscanf(optarg, "%f", &proportionOfUnalignedBasesForNewChromosome); assert(k == 1); break; case 'A': k = sscanf(optarg, "%" PRIi64 "", &maximumMedianSequenceLengthBetweenLinkedEnds); assert(k == 1); break; case 'B': realign = 1; break; case 'C': realignArguments = stString_copy(optarg); break; case 'D': k = sscanf(optarg, "%" PRIi64, &phylogenyNumTrees); assert(k == 1); break; case 'E': if (!strcmp(optarg, "outgroupBranch")) { phylogenyRootingMethod = OUTGROUP_BRANCH; } else if (!strcmp(optarg, "longestBranch")) { phylogenyRootingMethod = LONGEST_BRANCH; } else if (!strcmp(optarg, "bestRecon")) { phylogenyRootingMethod = BEST_RECON; } else { st_errAbort("Invalid tree rooting method: %s", optarg); } break; case 'F': if (!strcmp(optarg, "reconCost")) { phylogenyScoringMethod = RECON_COST; } else if (!strcmp(optarg, "nucLikelihood")) { phylogenyScoringMethod = NUCLEOTIDE_LIKELIHOOD; } else if (!strcmp(optarg, "reconLikelihood")) { phylogenyScoringMethod = RECON_LIKELIHOOD; } else if (!strcmp(optarg, "combinedLikelihood")) { phylogenyScoringMethod = COMBINED_LIKELIHOOD; } else { st_errAbort("Invalid tree scoring method: %s", optarg); } break; case 'G': k = sscanf(optarg, "%lf", &breakpointScalingFactor); assert(k == 1); break; case 'H': phylogenySkipSingleCopyBlocks = true; break; case 'I': k = sscanf(optarg, "%" PRIi64, &phylogenyMaxBaseDistance); assert(k == 1); break; case 'J': k = sscanf(optarg, "%" PRIi64, &phylogenyMaxBlockDistance); assert(k == 1); break; case 'K': debugFileName = stString_copy(optarg); break; case 'L': phylogenyKeepSingleDegreeBlocks = true; break; case 'M': // clear the default setting of the list stList_destruct(phylogenyTreeBuildingMethods); phylogenyTreeBuildingMethods = stList_construct(); stList *methodStrings = 
stString_splitByString(optarg, ","); for (int64_t i = 0; i < stList_length(methodStrings); i++) { char *methodString = stList_get(methodStrings, i); enum stCaf_TreeBuildingMethod *method = st_malloc(sizeof(enum stCaf_TreeBuildingMethod)); if (strcmp(methodString, "neighborJoining") == 0) { *method = NEIGHBOR_JOINING; } else if (strcmp(methodString, "guidedNeighborJoining") == 0) { *method = GUIDED_NEIGHBOR_JOINING; } else if (strcmp(methodString, "splitDecomposition") == 0) { *method = SPLIT_DECOMPOSITION; } else if (strcmp(methodString, "strictSplitDecomposition") == 0) { *method = STRICT_SPLIT_DECOMPOSITION; } else if (strcmp(methodString, "removeBadChains") == 0) { *method = REMOVE_BAD_CHAINS; } else { st_errAbort("Unknown tree building method: %s", methodString); } stList_append(phylogenyTreeBuildingMethods, method); } stList_destruct(methodStrings); break; case 'N': k = sscanf(optarg, "%lf", &phylogenyCostPerDupPerBase); assert(k == 1); break; case 'O': k = sscanf(optarg, "%lf", &phylogenyCostPerLossPerBase); assert(k == 1); break; case 'P': referenceEventHeader = stString_copy(optarg); break; case 'Q': k = sscanf(optarg, "%lf", &phylogenyDoSplitsWithSupportHigherThanThisAllAtOnce); assert(k == 1); break; case 'R': k = sscanf(optarg, "%" PRIi64, &numTreeBuildingThreads); assert(k == 1); break; case 'S': doPhylogeny = true; break; case 'T': k = sscanf(optarg, "%lf", &minimumBlockHomologySupport); assert(k == 1); assert(minimumBlockHomologySupport <= 1.0); assert(minimumBlockHomologySupport >= 0.0); break; case 'U': k = sscanf(optarg, "%lf", &nucleotideScalingFactor); assert(k == 1); break; case 'V': k = sscanf(optarg, "%" PRIi64, &minimumBlockDegreeToCheckSupport); assert(k == 1); break; case 'W': if (strcmp(optarg, "1") == 0) { removeRecoverableChains = true; recoverableChainsFilter = NULL; } else if (strcmp(optarg, "unequalNumberOfIngroupCopies") == 0) { removeRecoverableChains = true; recoverableChainsFilter = stCaf_chainHasUnequalNumberOfIngroupCopies; } else if (strcmp(optarg, "unequalNumberOfIngroupCopiesOrNoOutgroup") == 0) { removeRecoverableChains = true; recoverableChainsFilter = stCaf_chainHasUnequalNumberOfIngroupCopiesOrNoOutgroup; } else if (strcmp(optarg, "0") == 0) { removeRecoverableChains = false; } else { st_errAbort("Could not parse removeRecoverableChains argument"); } break; case 'X': k = sscanf(optarg, "%" PRIi64, &minimumNumberOfSpecies); if (k != 1) { st_errAbort("Error parsing the minimumNumberOfSpecies argument"); } break; case 'Y': if (strcmp(optarg, "chain") == 0) { phylogenyHomologyUnitType = CHAIN; } else if (strcmp(optarg, "block") == 0) { phylogenyHomologyUnitType = BLOCK; } else { st_errAbort("Could not parse the phylogenyHomologyUnitType argument"); } break; case 'Z': if (strcmp(optarg, "jukesCantor") == 0) { phylogenyDistanceCorrectionMethod = JUKES_CANTOR; } else if (strcmp(optarg, "none") == 0 ) { phylogenyDistanceCorrectionMethod = NONE; } else { st_errAbort("Could not parse the phylogenyDistanceCorrectionMethod argument"); } break; case '1': k = sscanf(optarg, "%" PRIi64, &maxRecoverableChainsIterations); if (k != 1) { st_errAbort("Error parsing the maxRecoverableChainsIterations argument"); } break; case '2': k = sscanf(optarg, "%" PRIi64, &maxRecoverableChainLength); if (k != 1) { st_errAbort("Error parsing the maxRecoverableChainLength argument"); } break; default: usage(); return 1; } } /////////////////////////////////////////////////////////////////////////// // (0) Check the inputs. 
/////////////////////////////////////////////////////////////////////////// assert(cactusDiskDatabaseString != NULL); assert(minimumTreeCoverage >= 0.0); assert(minimumTreeCoverage <= 1.0); assert(blockTrim >= 0); assert(annealingRoundsLength >= 0); for (int64_t i = 0; i < annealingRoundsLength; i++) { assert(annealingRounds[i] >= 0); } assert(meltingRoundsLength >= 0); for (int64_t i = 1; i < meltingRoundsLength; i++) { assert(meltingRounds[i - 1] < meltingRounds[i]); assert(meltingRounds[i - 1] >= 1); } assert(alignmentTrimLength >= 0); for (int64_t i = 0; i < alignmentTrimLength; i++) { assert(alignmentTrims[i] >= 0); } assert(minimumOutgroupDegree >= 0); assert(minimumIngroupDegree >= 0); ////////////////////////////////////////////// //Set up logging ////////////////////////////////////////////// st_setLogLevelFromString(logLevelString); ////////////////////////////////////////////// //Log (some of) the inputs ////////////////////////////////////////////// st_logInfo("Flower disk name : %s\n", cactusDiskDatabaseString); ////////////////////////////////////////////// //Load the database ////////////////////////////////////////////// kvDatabaseConf = stKVDatabaseConf_constructFromString(cactusDiskDatabaseString); cactusDisk = cactusDisk_construct(kvDatabaseConf, 0); st_logInfo("Set up the flower disk\n"); /////////////////////////////////////////////////////////////////////////// // Sort the constraints /////////////////////////////////////////////////////////////////////////// stPinchIterator *pinchIteratorForConstraints = NULL; if (constraintsFile != NULL) { pinchIteratorForConstraints = stPinchIterator_constructFromFile(constraintsFile); st_logInfo("Created an iterator for the alignment constaints from file: %s\n", constraintsFile); } /////////////////////////////////////////////////////////////////////////// // Do the alignment /////////////////////////////////////////////////////////////////////////// startTime = time(NULL); stList *flowers = flowerWriter_parseFlowersFromStdin(cactusDisk); if (alignmentsFile == NULL) { cactusDisk_preCacheStrings(cactusDisk, flowers); } char *tempFile1 = NULL; for (int64_t i = 0; i < stList_length(flowers); i++) { flower = stList_get(flowers, i); if (!flower_builtBlocks(flower)) { // Do nothing if the flower already has defined blocks st_logDebug("Processing flower: %lli\n", flower_getName(flower)); stCaf_setFlowerForAlignmentFiltering(flower); //Set up the graph and add the initial alignments stPinchThreadSet *threadSet = stCaf_setup(flower); //Build the set of outgroup threads outgroupThreads = stCaf_getOutgroupThreads(flower, threadSet); //Setup the alignments stPinchIterator *pinchIterator; stList *alignmentsList = NULL; if (alignmentsFile != NULL) { assert(i == 0); assert(stList_length(flowers) == 1); if (sortAlignments) { tempFile1 = getTempFile(); stCaf_sortCigarsFileByScoreInDescendingOrder(alignmentsFile, tempFile1); pinchIterator = stPinchIterator_constructFromFile(tempFile1); } else { pinchIterator = stPinchIterator_constructFromFile(alignmentsFile); } } else { if (tempFile1 == NULL) { tempFile1 = getTempFile(); } alignmentsList = stCaf_selfAlignFlower(flower, minimumSequenceLengthForBlast, lastzArguments, realign, realignArguments, tempFile1); if (sortAlignments) { stCaf_sortCigarsByScoreInDescendingOrder(alignmentsList); } st_logDebug("Ran lastz and have %" PRIi64 " alignments\n", stList_length(alignmentsList)); pinchIterator = stPinchIterator_constructFromList(alignmentsList); } for (int64_t annealingRound = 0; annealingRound < 
annealingRoundsLength; annealingRound++) { int64_t minimumChainLength = annealingRounds[annealingRound]; int64_t alignmentTrim = annealingRound < alignmentTrimLength ? alignmentTrims[annealingRound] : 0; st_logDebug("Starting annealing round with a minimum chain length of %" PRIi64 " and an alignment trim of %" PRIi64 "\n", minimumChainLength, alignmentTrim); stPinchIterator_setTrim(pinchIterator, alignmentTrim); //Add back in the constraints if (pinchIteratorForConstraints != NULL) { stCaf_anneal(threadSet, pinchIteratorForConstraints, filterFn); } //Do the annealing if (annealingRound == 0) { stCaf_anneal(threadSet, pinchIterator, filterFn); } else { stCaf_annealBetweenAdjacencyComponents(threadSet, pinchIterator, filterFn); } // Dump the block degree and length distribution to a file if (debugFileName != NULL) { dumpBlockInfo(threadSet, stString_print("%s-blockStats-preMelting", debugFileName)); } printf("Sequence graph statistics after annealing:\n"); printThreadSetStatistics(threadSet, flower, stdout); // Check for poorly-supported blocks--those that have // been transitively aligned together but with very // few homologies supporting the transitive // alignment. These "megablocks" can snarl up the // graph so that a lot of extra gets thrown away in // the first melting step. stPinchThreadSetBlockIt blockIt = stPinchThreadSet_getBlockIt(threadSet); stPinchBlock *block; while ((block = stPinchThreadSetBlockIt_getNext(&blockIt)) != NULL) { if (stPinchBlock_getDegree(block) > minimumBlockDegreeToCheckSupport) { uint64_t supportingHomologies = stPinchBlock_getNumSupportingHomologies(block); uint64_t possibleSupportingHomologies = numPossibleSupportingHomologies(block, flower); double support = ((double) supportingHomologies) / possibleSupportingHomologies; if (support < minimumBlockHomologySupport) { fprintf(stdout, "Destroyed a megablock with degree %" PRIi64 " and %" PRIi64 " supporting homologies out of a maximum " "of %" PRIi64 " (%lf%%).\n", stPinchBlock_getDegree(block), supportingHomologies, possibleSupportingHomologies, support); stPinchBlock_destruct(block); } } } //Do the melting rounds for (int64_t meltingRound = 0; meltingRound < meltingRoundsLength; meltingRound++) { int64_t minimumChainLengthForMeltingRound = meltingRounds[meltingRound]; st_logDebug("Starting melting round with a minimum chain length of %" PRIi64 " \n", minimumChainLengthForMeltingRound); if (minimumChainLengthForMeltingRound >= minimumChainLength) { break; } stCaf_melt(flower, threadSet, NULL, 0, minimumChainLengthForMeltingRound, 0, INT64_MAX); } st_logDebug("Last melting round of cycle with a minimum chain length of %" PRIi64 " \n", minimumChainLength); stCaf_melt(flower, threadSet, NULL, 0, minimumChainLength, breakChainsAtReverseTandems, maximumMedianSequenceLengthBetweenLinkedEnds); //This does the filtering of blocks that do not have the required species/tree-coverage/degree. 
stCaf_melt(flower, threadSet, blockFilterFn, blockTrim, 0, 0, INT64_MAX); } if (removeRecoverableChains) { stCaf_meltRecoverableChains(flower, threadSet, breakChainsAtReverseTandems, maximumMedianSequenceLengthBetweenLinkedEnds, recoverableChainsFilter, maxRecoverableChainsIterations, maxRecoverableChainLength); } if (debugFileName != NULL) { dumpBlockInfo(threadSet, stString_print("%s-blockStats-postMelting", debugFileName)); } printf("Sequence graph statistics after melting:\n"); printThreadSetStatistics(threadSet, flower, stdout); // Build a tree for each block, then use each tree to // partition the homologies between the ingroups sequences // into those that occur before the speciation with the // outgroup and those which occur late. if (stSet_size(outgroupThreads) > 0 && doPhylogeny) { st_logDebug("Starting to build trees and partition ingroup homologies\n"); stHash *threadStrings = stCaf_getThreadStrings(flower, threadSet); st_logDebug("Got sets of thread strings and set of threads that are outgroups\n"); stCaf_PhylogenyParameters params; params.distanceCorrectionMethod = phylogenyDistanceCorrectionMethod; params.treeBuildingMethods = phylogenyTreeBuildingMethods; params.rootingMethod = phylogenyRootingMethod; params.scoringMethod = phylogenyScoringMethod; params.breakpointScalingFactor = breakpointScalingFactor; params.nucleotideScalingFactor = nucleotideScalingFactor; params.skipSingleCopyBlocks = phylogenySkipSingleCopyBlocks; params.keepSingleDegreeBlocks = phylogenyKeepSingleDegreeBlocks; params.costPerDupPerBase = phylogenyCostPerDupPerBase; params.costPerLossPerBase = phylogenyCostPerLossPerBase; params.maxBaseDistance = phylogenyMaxBaseDistance; params.maxBlockDistance = phylogenyMaxBlockDistance; params.numTrees = phylogenyNumTrees; params.ignoreUnalignedBases = 1; params.onlyIncludeCompleteFeatureBlocks = 0; params.doSplitsWithSupportHigherThanThisAllAtOnce = phylogenyDoSplitsWithSupportHigherThanThisAllAtOnce; params.numTreeBuildingThreads = numTreeBuildingThreads; assert(params.numTreeBuildingThreads >= 1); stCaf_buildTreesToRemoveAncientHomologies( threadSet, phylogenyHomologyUnitType, threadStrings, outgroupThreads, flower, ¶ms, debugFileName == NULL ? NULL : stString_print("%s-phylogeny", debugFileName), referenceEventHeader); stHash_destruct(threadStrings); st_logDebug("Finished building trees\n"); if (removeRecoverableChains) { // We melt recoverable chains after splitting, as // well as before, to alleviate coverage loss // caused by bad splits. stCaf_meltRecoverableChains(flower, threadSet, breakChainsAtReverseTandems, maximumMedianSequenceLengthBetweenLinkedEnds, recoverableChainsFilter, maxRecoverableChainsIterations, maxRecoverableChainLength); } // Enforce the block constraints on minimum degree, // etc. after splitting. stCaf_melt(flower, threadSet, blockFilterFn, 0, 0, 0, INT64_MAX); } //Sort out case when we allow blocks of degree 1 if (minimumDegree < 2) { st_logDebug("Creating degree 1 blocks\n"); stCaf_makeDegreeOneBlocks(threadSet); stCaf_melt(flower, threadSet, blockFilterFn, blockTrim, 0, 0, INT64_MAX); } else if (maximumAdjacencyComponentSizeRatio < INT64_MAX) { //Deal with giant components st_logDebug("Breaking up components greedily\n"); stCaf_breakupComponentsGreedily(threadSet, maximumAdjacencyComponentSizeRatio); } //Finish up stCaf_finish(flower, threadSet, chainLengthForBigFlower, longChain, minLengthForChromosome, proportionOfUnalignedBasesForNewChromosome); //Flower is then destroyed at this point. 
st_logInfo("Ran the cactus core script\n"); //Cleanup stPinchThreadSet_destruct(threadSet); stPinchIterator_destruct(pinchIterator); stSet_destruct(outgroupThreads); if (alignmentsList != NULL) { stList_destruct(alignmentsList); } st_logInfo("Cleaned up from main loop\n"); } else { st_logInfo("We've already built blocks / alignments for this flower\n"); } } stList_destruct(flowers); if (tempFile1 != NULL) { st_system("rm %s", tempFile1); } if (constraintsFile != NULL) { stPinchIterator_destruct(pinchIteratorForConstraints); } /////////////////////////////////////////////////////////////////////////// // Write the flower to disk. /////////////////////////////////////////////////////////////////////////// st_logDebug("Writing the flowers to disk\n"); cactusDisk_write(cactusDisk); st_logInfo("Updated the flower on disk and %" PRIi64 " seconds have elapsed\n", time(NULL) - startTime); /////////////////////////////////////////////////////////////////////////// // Clean up. /////////////////////////////////////////////////////////////////////////// cactusDisk_destruct(cactusDisk); }
int main(int argc, char *argv[]) {
    /*
     * Open the database.
     * Construct a flower.
     * Construct an event tree representing the species tree.
     * For each sequence construct two ends, each containing a cap.
     * Make a file for the sequence.
     * Link the two caps.
     * Finish!
     */
    int64_t key, j;
    Group *group;
    Flower_EndIterator *endIterator;
    End *end;
    bool makeEventHeadersAlphaNumeric = 0;

    /*
     * Arguments/options
     */
    char *logLevelString = NULL;
    char *speciesTree = NULL;
    char *outgroupEvents = NULL;

    ///////////////////////////////////////////////////////////////////////////
    // (0) Parse the inputs handed by genomeCactus.py / setup stuff.
    ///////////////////////////////////////////////////////////////////////////

    while (1) {
        static struct option long_options[] = {
            { "logLevel", required_argument, 0, 'a' },
            { "cactusDisk", required_argument, 0, 'b' },
            { "speciesTree", required_argument, 0, 'f' },
            { "outgroupEvents", required_argument, 0, 'g' },
            { "help", no_argument, 0, 'h' },
            { "makeEventHeadersAlphaNumeric", no_argument, 0, 'i' },
            { 0, 0, 0, 0 } };
        int option_index = 0;
        key = getopt_long(argc, argv, "a:b:f:hg:i", long_options, &option_index);
        if (key == -1) {
            break;
        }
        switch (key) {
            case 'a':
                logLevelString = optarg;
                break;
            case 'b':
                cactusDiskDatabaseString = optarg;
                break;
            case 'f':
                speciesTree = optarg;
                break;
            case 'g':
                outgroupEvents = optarg;
                break;
            case 'h':
                usage();
                return 0;
            case 'i':
                makeEventHeadersAlphaNumeric = 1;
                break;
            default:
                usage();
                return 1;
        }
    }

    ///////////////////////////////////////////////////////////////////////////
    // (0) Check the inputs.
    ///////////////////////////////////////////////////////////////////////////

    //assert(logLevelString == NULL || strcmp(logLevelString, "CRITICAL") == 0 || strcmp(logLevelString, "INFO") == 0 || strcmp(logLevelString, "DEBUG") == 0);
    assert(cactusDiskDatabaseString != NULL);
    assert(speciesTree != NULL);

    //////////////////////////////////////////////
    //Set up logging
    //////////////////////////////////////////////

    st_setLogLevelFromString(logLevelString);

    //////////////////////////////////////////////
    //Log (some of) the inputs
    //////////////////////////////////////////////

    st_logInfo("Flower disk name : %s\n", cactusDiskDatabaseString);
    for (j = optind; j < argc; j++) {
        st_logInfo("Sequence file/directory %s\n", argv[j]);
    }

    //////////////////////////////////////////////
    //Load the database
    //////////////////////////////////////////////

    stKVDatabaseConf *kvDatabaseConf = stKVDatabaseConf_constructFromString(cactusDiskDatabaseString);
    if (stKVDatabaseConf_getType(kvDatabaseConf) == stKVDatabaseTypeTokyoCabinet
            || stKVDatabaseConf_getType(kvDatabaseConf) == stKVDatabaseTypeKyotoTycoon) {
        assert(stKVDatabaseConf_getDir(kvDatabaseConf) != NULL);
        cactusDisk = cactusDisk_construct2(kvDatabaseConf, "cactusSequences");
    } else {
        cactusDisk = cactusDisk_construct(kvDatabaseConf, 1);
    }
    st_logInfo("Set up the flower disk\n");

    //////////////////////////////////////////////
    //Construct the flower
    //////////////////////////////////////////////

    if (cactusDisk_getFlower(cactusDisk, 0) != NULL) {
        cactusDisk_destruct(cactusDisk);
        st_logInfo("The first flower already exists\n");
        return 0;
    }
    flower = flower_construct2(0, cactusDisk);
    assert(flower_getName(flower) == 0);
    st_logInfo("Constructed the flower\n");

    //////////////////////////////////////////////
    //Construct the event tree
    //////////////////////////////////////////////

    st_logInfo("Going to build the event tree with newick string: %s\n", speciesTree);
    stTree *tree = stTree_parseNewickString(speciesTree);
    st_logInfo("Parsed the tree\n");
    if (makeEventHeadersAlphaNumeric) {
        makeEventHeadersAlphaNumericFn(tree);
    }
    stTree_setBranchLength(tree, INT64_MAX);
    checkBranchLengthsAreDefined(tree);
    eventTree = eventTree_construct2(flower); //creates the event tree and the root event
    totalEventNumber = 1;
    st_logInfo("Constructed the basic event tree\n");

    // Construct a set of outgroup names so that ancestral outgroups
    // get recognized.
    stSet *outgroupNameSet = stSet_construct3(stHash_stringKey, stHash_stringEqualKey, free);
    if (outgroupEvents != NULL) {
        stList *outgroupNames = stString_split(outgroupEvents);
        for (int64_t i = 0; i < stList_length(outgroupNames); i++) {
            char *outgroupName = stList_get(outgroupNames, i);
            stSet_insert(outgroupNameSet, stString_copy(outgroupName));
        }
        stList_destruct(outgroupNames);
    }

    //now traverse the tree
    j = optind;
    assignEventsAndSequences(eventTree_getRootEvent(eventTree), tree, outgroupNameSet, argv, &j);

    char *eventTreeString = eventTree_makeNewickString(eventTree);
    st_logInfo("Constructed the initial flower with %" PRIi64 " sequences and %" PRIi64 " events with string: %s\n",
            totalSequenceNumber, totalEventNumber, eventTreeString);
    assert(event_getSubTreeBranchLength(eventTree_getRootEvent(eventTree)) >= 0.0);
    free(eventTreeString);
    //assert(0);

    //////////////////////////////////////////////
    //Label any outgroup events.
    //////////////////////////////////////////////

    if (outgroupEvents != NULL) {
        stList *outgroupEventsList = stString_split(outgroupEvents);
        for (int64_t i = 0; i < stList_length(outgroupEventsList); i++) {
            char *outgroupEvent = makeEventHeadersAlphaNumeric ? makeAlphaNumeric(stList_get(outgroupEventsList, i))
                    : stString_copy(stList_get(outgroupEventsList, i));
            Event *event = eventTree_getEventByHeader(eventTree, outgroupEvent);
            if (event == NULL) {
                st_errAbort("Got an outgroup string that does not match an event, outgroup string %s", outgroupEvent);
            }
            assert(!event_isOutgroup(event));
            event_setOutgroupStatus(event, 1);
            assert(event_isOutgroup(event));
            free(outgroupEvent);
        }
        stList_destruct(outgroupEventsList);
    }

    //////////////////////////////////////////////
    //Construct the terminal group.
    //////////////////////////////////////////////

    if (flower_getEndNumber(flower) > 0) {
        group = group_construct2(flower);
        endIterator = flower_getEndIterator(flower);
        while ((end = flower_getNextEnd(endIterator)) != NULL) {
            end_setGroup(end, group);
        }
        flower_destructEndIterator(endIterator);
        assert(group_isLeaf(group));
        // Create a one-link chain if there is only one pair of attached ends.
        group_constructChainForLink(group);
        assert(!flower_builtBlocks(flower));
    } else {
        flower_setBuiltBlocks(flower, 1);
    }

    ///////////////////////////////////////////////////////////////////////////
    // Write the flower to disk.
    ///////////////////////////////////////////////////////////////////////////

    //flower_check(flower);
    cactusDisk_write(cactusDisk);
    st_logInfo("Updated the flower on disk\n");

    ///////////////////////////////////////////////////////////////////////////
    // Cleanup.
    ///////////////////////////////////////////////////////////////////////////

    cactusDisk_destruct(cactusDisk);

    return 0; //Exit without clean up is quicker, enable cleanup when doing memory leak detection.
    stSet_destruct(outgroupNameSet);
    stTree_destruct(tree);
    stKVDatabaseConf_destruct(kvDatabaseConf);
    return 0;
}
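/*
 * The outgroup handling above appears twice: once to build a set of outgroup
 * names before the tree traversal, and once to flag the matching events. A
 * small sketch of the same set-building pattern, factored into a helper, is
 * shown below. buildOutgroupNameSet is an illustrative name rather than an
 * existing cactus function; the sonLib calls are the ones already used above.
 */
static stSet *buildOutgroupNameSet(char *outgroupEvents) {
    // Keys are heap-allocated copies owned (and eventually freed) by the set.
    stSet *outgroupNameSet = stSet_construct3(stHash_stringKey, stHash_stringEqualKey, free);
    if (outgroupEvents != NULL) {
        stList *outgroupNames = stString_split(outgroupEvents);
        for (int64_t i = 0; i < stList_length(outgroupNames); i++) {
            stSet_insert(outgroupNameSet, stString_copy(stList_get(outgroupNames, i)));
        }
        stList_destruct(outgroupNames);
    }
    return outgroupNameSet;
}

// Membership is then a single lookup, e.g.:
//   if (stSet_search(outgroupNameSet, (char *) stTree_getLabel(tree)) != NULL) { ... }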
int main(int argc, char *argv[]) { /* * Script for adding a reference genome to a flower. */ /* * Arguments/options */ char * logLevelString = NULL; char * cactusDiskDatabaseString = NULL; char *referenceEventString = (char *) cactusMisc_getDefaultReferenceEventHeader(); char *outputFile = NULL; Name flowerName = NULL_NAME; /////////////////////////////////////////////////////////////////////////// // (0) Parse the inputs handed by genomeCactus.py / setup stuff. /////////////////////////////////////////////////////////////////////////// while (1) { static struct option long_options[] = { { "logLevel", required_argument, 0, 'a' }, { "cactusDisk", required_argument, 0, 'c' }, { "flowerName", required_argument, 0, 'd' }, { "referenceEventString", required_argument, 0, 'g' }, { "help", no_argument, 0, 'h' }, { "outputFile", required_argument, 0, 'k' }, { 0, 0, 0, 0 } }; int option_index = 0; int key = getopt_long(argc, argv, "a:c:d:g:hk:", long_options, &option_index); if (key == -1) { break; } switch (key) { case 'a': logLevelString = stString_copy(optarg); break; case 'c': cactusDiskDatabaseString = stString_copy(optarg); break; case 'd': flowerName = cactusMisc_stringToName(optarg); break; case 'g': referenceEventString = stString_copy(optarg); break; case 'h': usage(); return 0; case 'k': outputFile = stString_copy(optarg); break; default: usage(); return 1; } } /////////////////////////////////////////////////////////////////////////// // (0) Check the inputs. /////////////////////////////////////////////////////////////////////////// assert(cactusDiskDatabaseString != NULL); ////////////////////////////////////////////// //Set up logging ////////////////////////////////////////////// st_setLogLevelFromString(logLevelString); ////////////////////////////////////////////// //Load the database ////////////////////////////////////////////// stKVDatabaseConf *kvDatabaseConf = stKVDatabaseConf_constructFromString( cactusDiskDatabaseString); CactusDisk *cactusDisk = cactusDisk_construct(kvDatabaseConf, 0); stKVDatabaseConf_destruct(kvDatabaseConf); st_logInfo("Set up the flower disk\n"); /////////////////////////////////////////////////////////////////////////// // Get the set of flowers to manipulate /////////////////////////////////////////////////////////////////////////// Flower *flower = cactusDisk_getFlower(cactusDisk, flowerName); /////////////////////////////////////////////////////////////////////////// // Get the reference event name /////////////////////////////////////////////////////////////////////////// Event *referenceEvent = eventTree_getEventByHeader( flower_getEventTree(flower), referenceEventString); assert(referenceEvent != NULL); Name referenceEventName = event_getName(referenceEvent); /////////////////////////////////////////////////////////////////////////// // Now process each flower in turn. /////////////////////////////////////////////////////////////////////////// if(outputFile == NULL) { st_errAbort("No output file specified\n"); } FILE *fileHandle = fopen(outputFile, "w"); printFastaSequences(flower, fileHandle, referenceEventName); if(fileHandle != NULL) { fclose(fileHandle); } /////////////////////////////////////////////////////////////////////////// //Clean up memory /////////////////////////////////////////////////////////////////////////// cactusDisk_destruct(cactusDisk); //return 0; //Exit without clean up is quicker, enable cleanup when doing memory leak detection. 
free(cactusDiskDatabaseString); free(referenceEventString); free(logLevelString); st_logInfo("Cleaned stuff up and am finished\n"); //while(1); return 0; }
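/*
 * In the program above the output file is opened with fopen() and only tested
 * for NULL just before fclose(); if the open fails, printFastaSequences would
 * already have received a NULL handle. A minimal sketch of a guarded open,
 * following the st_errnoAbort convention used elsewhere in these tools, is
 * given below. openOrAbort is an illustrative helper name, not part of the
 * cactus code base.
 */
#include <stdio.h>

static FILE *openOrAbort(const char *path, const char *mode) {
    FILE *fileHandle = fopen(path, mode);
    if (fileHandle == NULL) {
        //fopen() sets errno, so the abort message reports the reason.
        st_errnoAbort("Could not open %s (mode %s)", path, mode);
    }
    return fileHandle;
}

// e.g. FILE *fileHandle = openOrAbort(outputFile, "w");
//      printFastaSequences(flower, fileHandle, referenceEventName);
//      fclose(fileHandle);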
int main(int argc, char *argv[]) { char * logLevelString = NULL; char * cactusDiskDatabaseString = NULL; int64_t i, j; int64_t spanningTrees = 10; int64_t maximumLength = 1500; bool useProgressiveMerging = 0; float matchGamma = 0.5; bool useBanding = 0; int64_t k; stList *listOfEndAlignmentFiles = NULL; char *endAlignmentsToPrecomputeOutputFile = NULL; bool calculateWhichEndsToComputeSeparately = 0; int64_t largeEndSize = 1000000; int64_t chainLengthForBigFlower = 1000000; int64_t longChain = 2; char *ingroupCoverageFilePath = NULL; int64_t minimumSizeToRescue = 1; double minimumCoverageToRescue = 0.0; PairwiseAlignmentParameters *pairwiseAlignmentBandingParameters = pairwiseAlignmentBandingParameters_construct(); /* * Setup the input parameters for cactus core. */ bool pruneOutStubAlignments = 0; /* * Parse the options. */ while (1) { static struct option long_options[] = { { "logLevel", required_argument, 0, 'a' }, { "cactusDisk", required_argument, 0, 'b' }, { "help", no_argument, 0, 'h' }, { "spanningTrees", required_argument, 0, 'i' }, { "maximumLength", required_argument, 0, 'j' }, { "useBanding", no_argument, 0, 'k' }, { "gapGamma", required_argument, 0, 'l' }, { "matchGamma", required_argument, 0, 'L' }, { "splitMatrixBiggerThanThis", required_argument, 0, 'o' }, { "anchorMatrixBiggerThanThis", required_argument, 0, 'p' }, { "repeatMaskMatrixBiggerThanThis", required_argument, 0, 'q' }, { "diagonalExpansion", required_argument, 0, 'r' }, { "constraintDiagonalTrim", required_argument, 0, 't' }, { "minimumDegree", required_argument, 0, 'u' }, { "alignAmbiguityCharacters", no_argument, 0, 'w' }, { "pruneOutStubAlignments", no_argument, 0, 'y' }, { "minimumIngroupDegree", required_argument, 0, 'A' }, { "minimumOutgroupDegree", required_argument, 0, 'B' }, { "precomputedAlignments", required_argument, 0, 'D' }, { "endAlignmentsToPrecomputeOutputFile", required_argument, 0, 'E' }, { "useProgressiveMerging", no_argument, 0, 'F' }, { "calculateWhichEndsToComputeSeparately", no_argument, 0, 'G' }, { "largeEndSize", required_argument, 0, 'I' }, {"ingroupCoverageFile", required_argument, 0, 'J'}, {"minimumSizeToRescue", required_argument, 0, 'K'}, {"minimumCoverageToRescue", required_argument, 0, 'M'}, { "minimumNumberOfSpecies", required_argument, 0, 'N' }, { 0, 0, 0, 0 } }; int option_index = 0; int key = getopt_long(argc, argv, "a:b:hi:j:kl:o:p:q:r:t:u:wy:A:B:D:E:FGI:J:K:L:M:N:", long_options, &option_index); if (key == -1) { break; } switch (key) { case 'a': logLevelString = stString_copy(optarg); st_setLogLevelFromString(logLevelString); break; case 'b': cactusDiskDatabaseString = stString_copy(optarg); break; case 'h': usage(); return 0; case 'i': i = sscanf(optarg, "%" PRIi64 "", &spanningTrees); (void) i; assert(i == 1); assert(spanningTrees >= 0); break; case 'j': i = sscanf(optarg, "%" PRIi64 "", &maximumLength); assert(i == 1); assert(maximumLength >= 0); break; case 'k': useBanding = !useBanding; break; case 'l': i = sscanf(optarg, "%f", &pairwiseAlignmentBandingParameters->gapGamma); assert(i == 1); assert(pairwiseAlignmentBandingParameters->gapGamma >= 0.0); break; case 'L': i = sscanf(optarg, "%f", &matchGamma); assert(i == 1); assert(matchGamma >= 0.0); break; case 'o': i = sscanf(optarg, "%" PRIi64 "", &k); assert(i == 1); assert(k >= 0); pairwiseAlignmentBandingParameters->splitMatrixBiggerThanThis = (int64_t) k * k; break; case 'p': i = sscanf(optarg, "%" PRIi64 "", &k); assert(i == 1); assert(k >= 0); pairwiseAlignmentBandingParameters->anchorMatrixBiggerThanThis = 
(int64_t) k * k; break; case 'q': i = sscanf(optarg, "%" PRIi64 "", &k); assert(i == 1); assert(k >= 0); pairwiseAlignmentBandingParameters->repeatMaskMatrixBiggerThanThis = (int64_t) k * k; break; case 'r': i = sscanf(optarg, "%" PRIi64 "", &pairwiseAlignmentBandingParameters->diagonalExpansion); assert(i == 1); assert(pairwiseAlignmentBandingParameters->diagonalExpansion >= 0); assert(pairwiseAlignmentBandingParameters->diagonalExpansion % 2 == 0); break; case 't': i = sscanf(optarg, "%" PRIi64 "", &pairwiseAlignmentBandingParameters->constraintDiagonalTrim); assert(i == 1); assert(pairwiseAlignmentBandingParameters->constraintDiagonalTrim >= 0); break; case 'u': i = sscanf(optarg, "%" PRIi64 "", &minimumDegree); assert(i == 1); break; case 'w': pairwiseAlignmentBandingParameters->alignAmbiguityCharacters = 1; break; case 'y': pruneOutStubAlignments = 1; break; case 'A': i = sscanf(optarg, "%" PRIi64 "", &minimumIngroupDegree); assert(i == 1); break; case 'B': i = sscanf(optarg, "%" PRIi64 "", &minimumOutgroupDegree); assert(i == 1); break; case 'D': listOfEndAlignmentFiles = stString_split(optarg); break; case 'E': endAlignmentsToPrecomputeOutputFile = stString_copy(optarg); break; case 'F': useProgressiveMerging = 1; break; case 'G': calculateWhichEndsToComputeSeparately = 1; break; case 'I': i = sscanf(optarg, "%" PRIi64 "", &largeEndSize); assert(i == 1); break; case 'J': ingroupCoverageFilePath = stString_copy(optarg); break; case 'K': i = sscanf(optarg, "%" PRIi64, &minimumSizeToRescue); assert(i == 1); break; case 'M': i = sscanf(optarg, "%lf", &minimumCoverageToRescue); assert(i == 1); break; case 'N': i = sscanf(optarg, "%" PRIi64, &minimumNumberOfSpecies); if (i != 1) { st_errAbort("Error parsing minimumNumberOfSpecies parameter"); } break; default: usage(); return 1; } } st_setLogLevelFromString(logLevelString); /* * Load the flowerdisk */ stKVDatabaseConf *kvDatabaseConf = stKVDatabaseConf_constructFromString(cactusDiskDatabaseString); CactusDisk *cactusDisk = cactusDisk_construct(kvDatabaseConf, 0); //We precache the sequences st_logInfo("Set up the flower disk\n"); /* * Load the hmm */ StateMachine *sM = stateMachine5_construct(fiveState); /* * For each flower. */ if (calculateWhichEndsToComputeSeparately) { stList *flowers = flowerWriter_parseFlowersFromStdin(cactusDisk); if (stList_length(flowers) != 1) { st_errAbort("We are breaking up a flower's end alignments for precomputation but we have %" PRIi64 " flowers.\n", stList_length(flowers)); } stSortedSet *endsToAlignSeparately = getEndsToAlignSeparately(stList_get(flowers, 0), maximumLength, largeEndSize); assert(stSortedSet_size(endsToAlignSeparately) != 1); stSortedSetIterator *it = stSortedSet_getIterator(endsToAlignSeparately); End *end; while ((end = stSortedSet_getNext(it)) != NULL) { fprintf(stdout, "%s\t%" PRIi64 "\t%" PRIi64 "\n", cactusMisc_nameToStringStatic(end_getName(end)), end_getInstanceNumber(end), getTotalAdjacencyLength(end)); } return 0; //avoid cleanup costs stSortedSet_destructIterator(it); stSortedSet_destruct(endsToAlignSeparately); } else if (endAlignmentsToPrecomputeOutputFile != NULL) { /* * In this case we will align a set of end and save the alignments in a file. 
*/ stList *names = flowerWriter_parseNames(stdin); Flower *flower = cactusDisk_getFlower(cactusDisk, *((Name *)stList_get(names, 0))); FILE *fileHandle = fopen(endAlignmentsToPrecomputeOutputFile, "w"); for(int64_t i=1; i<stList_length(names); i++) { End *end = flower_getEnd(flower, *((Name *)stList_get(names, i))); if (end == NULL) { st_errAbort("The end %" PRIi64 " was not found in the flower\n", *((Name *)stList_get(names, i))); } stSortedSet *endAlignment = makeEndAlignment(sM, end, spanningTrees, maximumLength, useProgressiveMerging, matchGamma, pairwiseAlignmentBandingParameters); writeEndAlignmentToDisk(end, endAlignment, fileHandle); stSortedSet_destruct(endAlignment); } fclose(fileHandle); return 0; //avoid cleanup costs stList_destruct(names); st_logInfo("Finished precomputing end alignments\n"); } else { /* * Compute complete flower alignments, possibly loading some precomputed alignments. */ bedRegion *bedRegions = NULL; size_t numBeds = 0; if (ingroupCoverageFilePath != NULL) { // Pre-load the mmap for the coverage file. FILE *coverageFile = fopen(ingroupCoverageFilePath, "rb"); if (coverageFile == NULL) { st_errnoAbort("Opening coverage file %s failed", ingroupCoverageFilePath); } fseek(coverageFile, 0, SEEK_END); int64_t coverageFileLen = ftell(coverageFile); assert(coverageFileLen >= 0); assert(coverageFileLen % sizeof(bedRegion) == 0); if (coverageFileLen == 0) { // mmap doesn't like length-0 mappings, for obvious // reasons. Pretend that the coverage file doesn't // exist in this case, since it contains no data. ingroupCoverageFilePath = NULL; } else { // Establish a memory mapping for the file. bedRegions = mmap(NULL, coverageFileLen, PROT_READ, MAP_SHARED, fileno(coverageFile), 0); if (bedRegions == MAP_FAILED) { st_errnoAbort("Failure mapping coverage file"); } numBeds = coverageFileLen / sizeof(bedRegion); } fclose(coverageFile); } stList *flowers = flowerWriter_parseFlowersFromStdin(cactusDisk); if (listOfEndAlignmentFiles != NULL && stList_length(flowers) != 1) { st_errAbort("We have precomputed alignments but %" PRIi64 " flowers to align.\n", stList_length(flowers)); } cactusDisk_preCacheStrings(cactusDisk, flowers); for (j = 0; j < stList_length(flowers); j++) { flower = stList_get(flowers, j); st_logInfo("Processing a flower\n"); stSortedSet *alignedPairs = makeFlowerAlignment3(sM, flower, listOfEndAlignmentFiles, spanningTrees, maximumLength, useProgressiveMerging, matchGamma, pairwiseAlignmentBandingParameters, pruneOutStubAlignments); st_logInfo("Created the alignment: %" PRIi64 " pairs\n", stSortedSet_size(alignedPairs)); stPinchIterator *pinchIterator = stPinchIterator_constructFromAlignedPairs(alignedPairs, getNextAlignedPairAlignment); /* * Run the cactus caf functions to build cactus. */ stPinchThreadSet *threadSet = stCaf_setup(flower); stCaf_anneal(threadSet, pinchIterator, NULL); if (minimumDegree < 2) { stCaf_makeDegreeOneBlocks(threadSet); } if (minimumIngroupDegree > 0 || minimumOutgroupDegree > 0 || minimumDegree > 1) { stCaf_melt(flower, threadSet, blockFilterFn, 0, 0, 0, INT64_MAX); } if (ingroupCoverageFilePath != NULL) { // Rescue any sequence that is covered by outgroups // but currently unaligned into single-degree blocks. 
stPinchThreadSetIt pinchIt = stPinchThreadSet_getIt(threadSet); stPinchThread *thread; while ((thread = stPinchThreadSetIt_getNext(&pinchIt)) != NULL) { Cap *cap = flower_getCap(flower, stPinchThread_getName(thread)); assert(cap != NULL); Sequence *sequence = cap_getSequence(cap); assert(sequence != NULL); rescueCoveredRegions(thread, bedRegions, numBeds, sequence_getName(sequence), minimumSizeToRescue, minimumCoverageToRescue); } stCaf_joinTrivialBoundaries(threadSet); } stCaf_finish(flower, threadSet, chainLengthForBigFlower, longChain, INT64_MAX, INT64_MAX); //Flower now destroyed. stPinchThreadSet_destruct(threadSet); st_logInfo("Ran the cactus core script.\n"); /* * Cleanup */ //Clean up the sorted set after cleaning up the iterator stPinchIterator_destruct(pinchIterator); stSortedSet_destruct(alignedPairs); st_logInfo("Finished filling in the alignments for the flower\n"); } stList_destruct(flowers); //st_errAbort("Done\n"); /* * Write and close the cactusdisk. */ cactusDisk_write(cactusDisk); return 0; //Exit without clean up is quicker, enable cleanup when doing memory leak detection. if (bedRegions != NULL) { // Clean up our mapping. munmap(bedRegions, numBeds * sizeof(bedRegion)); } } /////////////////////////////////////////////////////////////////////////// // Cleanup /////////////////////////////////////////////////////////////////////////// stateMachine_destruct(sM); cactusDisk_destruct(cactusDisk); stKVDatabaseConf_destruct(kvDatabaseConf); //destructCactusCoreInputParameters(cCIP); free(cactusDiskDatabaseString); if (listOfEndAlignmentFiles != NULL) { stList_destruct(listOfEndAlignmentFiles); } if (logLevelString != NULL) { free(logLevelString); } st_logInfo("Finished with the flower disk for this flower.\n"); //while(1); return 0; }
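/*
 * The getopt_long block above validates most numeric options with sscanf()
 * followed by assert(); with NDEBUG defined those asserts compile away, so a
 * malformed value would pass through silently. The sketch below wraps the same
 * idiom with an explicit st_errAbort, mirroring how the minimumNumberOfSpecies
 * option is already handled, and uses SCNi64 (the scanf-side counterpart of
 * the PRIi64 macro used above). parseInt64Option and its non-negative variant
 * are illustrative names, not existing cactus functions.
 */
#include <inttypes.h>

static int64_t parseInt64Option(const char *optionName, const char *optionValue) {
    int64_t value;
    if (sscanf(optionValue, "%" SCNi64, &value) != 1) {
        st_errAbort("Error parsing %s parameter from '%s'", optionName, optionValue);
    }
    return value;
}

static int64_t parseNonNegativeInt64Option(const char *optionName, const char *optionValue) {
    int64_t value = parseInt64Option(optionName, optionValue);
    if (value < 0) {
        st_errAbort("The %s parameter must be non-negative, got %" PRIi64, optionName, value);
    }
    return value;
}

// e.g. case 'j': maximumLength = parseNonNegativeInt64Option("maximumLength", optarg); break;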