static stList *getRandomPairwiseAlignments() {
    stList *pairwiseAlignments = stList_construct3(0, (void (*)(void *)) destructPairwiseAlignment);
    int64_t randomAlignmentNumber = st_randomInt(0, 10);
    for (int64_t i = 0; i < randomAlignmentNumber; i++) {
        char *contig1 = stString_print("%" PRIi64 "", i);
        char *contig2 = stString_print("%" PRIi64 "", i * 10);
        int64_t start1 = st_randomInt(100000, 1000000);
        int64_t start2 = st_randomInt(100000, 1000000);
        int64_t strand1 = st_random() > 0.5;
        int64_t strand2 = st_random() > 0.5;
        int64_t end1 = start1;
        int64_t end2 = start2;
        struct List *operationList = constructEmptyList(0, NULL);
        while (st_random() > 0.1) {
            int64_t length = st_randomInt(0, 10);
            int64_t type = st_randomInt(0, 3);
            assert(type < 3);
            listAppend(operationList, constructAlignmentOperation(type, length, 0));
            if (type != PAIRWISE_INDEL_Y) {
                end1 += strand1 ? length : -length;
            }
            if (type != PAIRWISE_INDEL_X) {
                end2 += strand2 ? length : -length;
            }
        }
        stList_append(pairwiseAlignments,
                constructPairwiseAlignment(contig1, start1, end1, strand1, contig2, start2, end2, strand2, 0.0,
                        operationList));
        free(contig1);
        free(contig2);
    }
    return pairwiseAlignments;
}
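/*
 * A minimal sketch (not part of the original test) showing how the list returned above
 * might be consumed. It assumes only the sonLib stList/logging calls already used in this
 * section: because destructPairwiseAlignment was registered as the element destructor in
 * stList_construct3, a single stList_destruct call also frees every alignment.
 */
static void exampleConsumeRandomAlignments() {
    stList *pairwiseAlignments = getRandomPairwiseAlignments();
    st_logInfo("Generated %" PRIi64 " random pairwise alignments\n",
            stList_length(pairwiseAlignments));
    stList_destruct(pairwiseAlignments); //also destructs each pairwise alignment
}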
static char *tree_getNewickTreeStringP(stTree *tree) {
    char *cA, *cA2;
    if (stTree_getChildNumber(tree) > 0) {
        int32_t i;
        cA = stString_copy("(");
        for (i = 0; i < stTree_getChildNumber(tree); i++) {
            cA2 = tree_getNewickTreeStringP(stTree_getChild(tree, i));
            char *cA3 = stString_print((i + 1 < stTree_getChildNumber(tree) ? "%s%s," : "%s%s"), cA, cA2);
            free(cA);
            free(cA2);
            cA = cA3;
        }
        cA2 = stString_print("%s)", cA);
        free(cA);
        cA = cA2;
    } else {
        cA = stString_copy("");
    }
    if (stTree_getLabel(tree) != NULL) {
        cA2 = stString_print("%s%s", cA, stTree_getLabel(tree));
        free(cA);
        cA = cA2;
    }
    if (stTree_getBranchLength(tree) != INFINITY) {
        cA2 = stString_print("%s:%g", cA, stTree_getBranchLength(tree));
        free(cA);
        cA = cA2;
    }
    return cA;
}
static char *segmentWriteFn(Segment *segment) {
    stTree *phylogeneticTree = stHash_search(segmentWriteFn_flowerToPhylogeneticTreeHash,
            block_getFlower(segment_getBlock(segment)));
    assert(phylogeneticTree != NULL);
    char *segmentString = getMaximumLikelihoodString(phylogeneticTree, segment_getBlock(segment));
    //We append a '0' to the segment string if it is part of a block containing only a reference segment,
    //else we append a '1'. These boolean flags let us determine whether a sequence contains only such
    //trivial strings, and is therefore itself trivial.
    char *appendedSegmentString = stString_print("%s%c ", segmentString,
            block_getInstanceNumber(segment_getBlock(segment)) == 1 ? '0' : '1');
    free(segmentString);
    return appendedSegmentString;
}
static stTree *eventTree_getStTree_R(Event *event) {
    stTree *ret = stTree_construct();
    stTree_setLabel(ret, stString_print("%" PRIi64, event_getName(event)));
    stTree_setBranchLength(ret, event_getBranchLength(event));
    for (int64_t i = 0; i < event_getChildNumber(event); i++) {
        Event *child = event_getChild(event, i);
        stTree *childStTree = eventTree_getStTree_R(child);
        stTree_setParent(childStTree, ret);
    }
    return ret;
}
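/*
 * A usage sketch, not from the original source: convert a whole event tree to Newick by
 * starting the recursion above at the root event and rendering the result with
 * stTree_getNewickTreeString (defined later in this section). eventTree_getRootEvent is
 * assumed to be the usual cactus accessor for the root event; the caller frees the string.
 */
static char *exampleEventTreeToNewick(EventTree *eventTree) {
    stTree *stTreeRoot = eventTree_getStTree_R(eventTree_getRootEvent(eventTree));
    char *newickString = stTree_getNewickTreeString(stTreeRoot);
    stTree_destruct(stTreeRoot); //recursively frees the converted tree
    return newickString;
}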
static stList *chooseAdjacencyPairing_externalProgram(stList *edges, int64_t nodeNumber, const char *programName) {
    /*
     * We create temp files to hold stuff.
     */
    if (nodeNumber <= 1) {
        assert(stList_length(edges) == 0);
        return stList_construct();
    }
    char *tempInputFile = getTempFile(), *tempOutputFile = getTempFile();
    /*
     * We write the graph to a temp file.
     */
    FILE *fileHandle = fopen(tempInputFile, "w");
    if (strcmp(programName, "blossom5") == 0) {
        //The graph must be fully connected, as blossom5 generates perfect matchings.
        writeCliqueGraph(fileHandle, edges, nodeNumber, 1);
    } else {
        writeGraph(fileHandle, edges, nodeNumber);
    }
    fclose(fileHandle);
    /*
     * We run the external program.
     */
    char *command = stString_print("%s -e %s -w %s >& /dev/null", programName, tempInputFile, tempOutputFile);
    int64_t i = st_system(command);
    if (i != 0) {
        st_errAbort("Something went wrong with the command: %s", command);
        //For some reason this causes a seg fault
        //stThrowNew(MATCHING_EXCEPTION, "Something went wrong with the command: %s", command);
    }
    free(command);
    /*
     * We get back the matching.
     */
    fileHandle = fopen(tempOutputFile, "r");
    stList *matching = readMatching(fileHandle, edges);
    fclose(fileHandle);
    st_logDebug("The adjacency matching for %" PRIi64 " nodes with %" PRIi64 " initial edges contains %" PRIi64 " edges\n",
            nodeNumber, stList_length(edges), stList_length(matching));
    /*
     * Get rid of the temp files.
     */
    st_system("rm -rf %s %s", tempInputFile, tempOutputFile);
    free(tempInputFile);
    free(tempOutputFile);
    return matching;
}
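/*
 * A stripped-down sketch of the temp-file / external-program pattern used above, with a
 * generic shell command standing in for the matching program. Everything here is an
 * illustration rather than part of the original code; it assumes only getTempFile,
 * stString_print, st_system and st_errAbort from the surrounding codebase plus standard C I/O.
 */
static void exampleRunExternalProgram() {
    char *tempInputFile = getTempFile(), *tempOutputFile = getTempFile();
    FILE *fileHandle = fopen(tempInputFile, "w");
    fprintf(fileHandle, "3\n1\n2\n"); //toy input
    fclose(fileHandle);
    char *command = stString_print("sort -n %s > %s", tempInputFile, tempOutputFile);
    if (st_system(command) != 0) {
        st_errAbort("Something went wrong with the command: %s", command);
    }
    free(command);
    //...parse tempOutputFile here...
    st_system("rm -rf %s %s", tempInputFile, tempOutputFile);
    free(tempInputFile);
    free(tempOutputFile);
}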
char *block_makeNewickStringP(Segment *segment, bool includeInternalNames, bool includeUnaryEvents) {
    if (!includeUnaryEvents && segment_getChildNumber(segment) == 1) {
        return block_makeNewickStringP(segment_getChild(segment, 0), includeInternalNames, includeUnaryEvents);
    }
    if (segment_getChildNumber(segment) > 0) {
        char *left = stString_print("(");
        int64_t i;
        bool comma = 0;
        for (i = 0; i < segment_getChildNumber(segment); i++) {
            Segment *childSegment = segment_getChild(segment, i);
            char *cA = block_makeNewickStringP(childSegment, includeInternalNames, includeUnaryEvents);
            char *cA2 = stString_print(comma ? "%s,%s" : "%s%s", left, cA);
            free(cA);
            free(left); //free the previous prefix before replacing it
            left = cA2;
            comma = 1;
        }
        char *final = includeInternalNames ? stString_print("%s)%s", left,
                cactusMisc_nameToStringStatic(segment_getName(segment))) : stString_print("%s)", left);
        free(left);
        return final;
    }
    //Leaf case (assumed completion; the original snippet was truncated here): a leaf segment
    //is represented by its name alone.
    return stString_copy(cactusMisc_nameToStringStatic(segment_getName(segment)));
}
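/*
 * Hypothetical wrapper sketch (not in the original excerpt) showing how the recursive
 * helper above is typically driven: start from the block's root segment and terminate the
 * Newick string with a semicolon. block_getRootInstance is assumed to be the cactus
 * accessor for a block's root segment; the caller frees the returned string.
 */
char *exampleBlockMakeNewickString(Block *block, bool includeInternalNames, bool includeUnaryEvents) {
    Segment *rootSegment = block_getRootInstance(block);
    assert(rootSegment != NULL);
    char *cA = block_makeNewickStringP(rootSegment, includeInternalNames, includeUnaryEvents);
    char *newickString = stString_print("%s;", cA);
    free(cA);
    return newickString;
}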
static MetaSequence *addMetaSequence(Flower *flower, Cap *cap, int64_t index, char *string, bool trivialString) {
    /*
     * Adds a meta sequence representing a top level thread to the cactus disk.
     * The sequence is all 'N's at this stage.
     */
    Event *referenceEvent = cap_getEvent(cap);
    assert(referenceEvent != NULL);
    char *sequenceName = stString_print("%srefChr%" PRIi64 "", event_getHeader(referenceEvent), index);
    //char *sequenceName = stString_print("refChr%" PRIi64 "", index);
    MetaSequence *metaSequence = metaSequence_construct3(1, strlen(string), string, sequenceName,
            event_getName(referenceEvent), trivialString, flower_getCactusDisk(flower));
    free(sequenceName);
    return metaSequence;
}
static void testGibbsSamplingWithSimulatedAnnealing(CuTest *testCase, double (*temperatureFn)(double),
        bool pureGreedy, int64_t maxNumberOfChainsBeforeSwitchingToFast) {
    for (int64_t i = 0; i < 100; i++) {
        setup();
        double totalScore;
        time_t startTime = time(NULL);
        bool fast = maxNumberOfChainsBeforeSwitchingToFast > nodeNumber;
        stList *reference = makeReferenceGreedily(stubs, chains, zMatrix, nodeNumber, &totalScore, fast);
        checkIsValidReference(testCase, reference, totalScore);
        time_t totalTime = time(NULL) - startTime;
        char *cA = stString_print("Pre-annealing greedy (fast:%" PRIi64 "), %" PRIi64 " nodes, it took %" PRIi64 " seconds, score: %f\n",
                fast, nodeNumber, totalTime, totalScore);
        st_logInfo(cA);
        logReference(reference, nodeNumber, zMatrix, totalScore, cA);
        free(cA);
        startTime = time(NULL);
        int64_t permutations = st_randomInt(0, 100);
        if (pureGreedy) {
            greedyImprovement(reference, chains, zMatrix, permutations);
        } else {
            gibbsSamplingWithSimulatedAnnealing(reference, chains, zMatrix, permutations, temperatureFn);
        }
        double totalScoreAfterAnnealing = calculateZScoreOfReference(reference, nodeNumber, zMatrix);
        checkIsValidReference(testCase, reference, totalScoreAfterAnnealing);
        totalTime = time(NULL) - startTime;
        cA = stString_print("Post-annealing permutations:%" PRIi64 ", %" PRIi64 " nodes, it took %" PRIi64 " seconds, score: %f\n",
                permutations, nodeNumber, totalTime, totalScoreAfterAnnealing);
        st_logInfo(cA);
        logReference(reference, nodeNumber, zMatrix, totalScoreAfterAnnealing, cA);
        free(cA);
        if (pureGreedy) {
            CuAssertTrue(testCase, totalScoreAfterAnnealing + 0.001 >= totalScore);
        }
        teardown();
    }
}
static void testMakeReferenceGreedily(CuTest *testCase) {
    for (int64_t i = 0; i < 100; i++) {
        setup();
        double totalScore;
        time_t startTime = time(NULL);
        bool fast = 1; //st_random() > 0.5;
        stList *reference = makeReferenceGreedily(stubs, chains, zMatrix, nodeNumber, &totalScore, fast);
        checkIsValidReference(testCase, reference, totalScore);
        time_t totalTime = time(NULL) - startTime;
        char *cA = stString_print("Greedy (fast:%" PRIi64 "), %" PRIi64 " nodes, it took %" PRIi64 " seconds, score: %f\n",
                fast, nodeNumber, totalTime, totalScore);
        st_logInfo(cA);
        logReference(reference, nodeNumber, zMatrix, totalScore, cA);
        free(cA);
        teardown();
    }
}
static void cactusDisk_loadFromBinaryRepresentation(void **binaryString, CactusDisk *cactusDisk, stKVDatabaseConf *conf) {
    cactusDisk->sequencesReadFileHandle = NULL;
    cactusDisk->sequencesWriteFileHandle = NULL; //I think these lines are not needed.
    cactusDisk->sequencesFileName = NULL;
    cactusDisk->absSequencesFileName = NULL;
    assert(binaryRepresentation_peekNextElementType(*binaryString) == CODE_CACTUS_DISK);
    binaryRepresentation_popNextElementType(binaryString);
    cactusDisk->storeSequencesInAFile = binaryRepresentation_getBool(binaryString);
    if (cactusDisk->storeSequencesInAFile) {
        cactusDisk->sequencesFileName = binaryRepresentation_getString(binaryString);
        if (stKVDatabaseConf_getDir(conf) == NULL) {
            stThrowNew(CACTUS_DISK_EXCEPTION_ID,
                    "The database conf does not contain a directory in which the sequence file is to be found!\n");
        }
        cactusDisk->absSequencesFileName = stString_print("%s/%s", stKVDatabaseConf_getDir(conf),
                cactusDisk->sequencesFileName);
    }
    assert(binaryRepresentation_peekNextElementType(*binaryString) == CODE_CACTUS_DISK);
    binaryRepresentation_popNextElementType(binaryString);
}
static CactusDisk *cactusDisk_constructPrivate(stKVDatabaseConf *conf, bool create, const char *sequencesFileName) {
    //sequencesFileName = NULL; //Disable the ability to store the sequences on disk.
    CactusDisk *cactusDisk = st_calloc(1, sizeof(CactusDisk));

    //construct lists of in memory objects
    cactusDisk->metaSequences = stSortedSet_construct3(cactusDisk_constructMetaSequencesP, NULL);
    cactusDisk->flowers = stSortedSet_construct3(cactusDisk_constructFlowersP, NULL);
    cactusDisk->flowerNamesMarkedForDeletion = stSortedSet_construct3((int (*)(const void *, const void *)) strcmp, free);
    cactusDisk->updateRequests = stList_construct3(0, (void (*)(void *)) stKVDatabaseBulkRequest_destruct);

    //Now open the database
    cactusDisk->database = stKVDatabase_construct(conf, create);
    cactusDisk->cache = stCache_construct();
    cactusDisk->stringCache = stCache_construct();

    //initialise the unique ids.
    int64_t seed = (clock() << 24) | (time(NULL) << 16) | (getpid() & 65535); //Likely to be unique
    st_logDebug("The cactus disk is seeding the random number generator with the value %" PRIi64 "\n", seed);
    st_randomSeed(seed);
    cactusDisk->uniqueNumber = 0;
    cactusDisk->maxUniqueNumber = 0;

    //Now load any stuff..
    if (containsRecord(cactusDisk, CACTUS_DISK_PARAMETER_KEY)) {
        if (create) {
            stThrowNew(CACTUS_DISK_EXCEPTION_ID, "Tried to create a cactus disk, but the cactus disk already exists");
        }
        if (sequencesFileName != NULL) {
            stThrowNew(CACTUS_DISK_EXCEPTION_ID, "A sequences file name is specified, but the cactus disk is not being created");
        }
        void *record = getRecord(cactusDisk, CACTUS_DISK_PARAMETER_KEY, "cactus_disk parameters");
        void *record2 = record;
        cactusDisk_loadFromBinaryRepresentation(&record, cactusDisk, conf);
        free(record2);
    } else {
        assert(create);
        if (sequencesFileName == NULL) {
            cactusDisk->storeSequencesInAFile = 0;
            cactusDisk->sequencesFileName = NULL;
            cactusDisk->sequencesReadFileHandle = NULL;
            cactusDisk->sequencesWriteFileHandle = NULL;
            cactusDisk->absSequencesFileName = NULL;
        } else {
            if (stKVDatabaseConf_getDir(conf) == NULL) {
                stThrowNew(CACTUS_DISK_EXCEPTION_ID,
                        "The database conf does not contain a directory in which the sequence file is to be found!\n");
            }
            cactusDisk->storeSequencesInAFile = 1;
            cactusDisk->sequencesFileName = stString_copy(sequencesFileName);
            cactusDisk->absSequencesFileName = stString_print("%s/%s", stKVDatabaseConf_getDir(conf),
                    cactusDisk->sequencesFileName);
            //Make sure the file exists
            cactusDisk->sequencesReadFileHandle = fopen(cactusDisk->absSequencesFileName, "w");
            assert(cactusDisk->sequencesReadFileHandle != NULL);
            fclose(cactusDisk->sequencesReadFileHandle); //Flush it first time.
            cactusDisk->sequencesReadFileHandle = NULL;
            cactusDisk->sequencesWriteFileHandle = NULL;
        }
    }
    return cactusDisk;
}
char *stTree_getNewickTreeString(stTree *tree) {
    char *cA = tree_getNewickTreeStringP(tree), *cA2;
    cA2 = stString_print("%s;", cA);
    free(cA);
    return cA2;
}
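/*
 * A small usage sketch (not part of the original source): build a two-leaf stTree with the
 * sonLib tree calls already used in eventTree_getStTree_R above and render it as Newick.
 * The exact output format is determined by tree_getNewickTreeStringP; the string shown in
 * the comment is only an illustration.
 */
static void exampleNewickRoundTrip() {
    stTree *root = stTree_construct();
    stTree_setLabel(root, stString_print("root"));
    for (int64_t i = 0; i < 2; i++) {
        stTree *leaf = stTree_construct();
        stTree_setLabel(leaf, stString_print("leaf%" PRIi64, i));
        stTree_setBranchLength(leaf, 0.5 * (i + 1));
        stTree_setParent(leaf, root);
    }
    char *newickString = stTree_getNewickTreeString(root); //e.g. "(leaf0:0.5,leaf1:1)root;"
    st_logInfo("Example tree: %s\n", newickString);
    free(newickString);
    stTree_destruct(root); //recursively frees root and its children
}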
int main(int argc, char *argv[]) { /* * Script for adding alignments to cactus tree. */ int64_t startTime; stKVDatabaseConf *kvDatabaseConf; CactusDisk *cactusDisk; int key, k; bool (*filterFn)(stPinchSegment *, stPinchSegment *) = NULL; stSet *outgroupThreads = NULL; /* * Arguments/options */ char * logLevelString = NULL; char * alignmentsFile = NULL; char * constraintsFile = NULL; char * cactusDiskDatabaseString = NULL; char * lastzArguments = ""; int64_t minimumSequenceLengthForBlast = 1; //Parameters for annealing/melting rounds int64_t *annealingRounds = NULL; int64_t annealingRoundsLength = 0; int64_t *meltingRounds = NULL; int64_t meltingRoundsLength = 0; //Parameters for melting float maximumAdjacencyComponentSizeRatio = 10; int64_t blockTrim = 0; int64_t alignmentTrimLength = 0; int64_t *alignmentTrims = NULL; int64_t chainLengthForBigFlower = 1000000; int64_t longChain = 2; int64_t minLengthForChromosome = 1000000; float proportionOfUnalignedBasesForNewChromosome = 0.8; bool breakChainsAtReverseTandems = 1; int64_t maximumMedianSequenceLengthBetweenLinkedEnds = INT64_MAX; bool realign = 0; char *realignArguments = ""; bool removeRecoverableChains = false; bool (*recoverableChainsFilter)(stCactusEdgeEnd *, Flower *) = NULL; int64_t maxRecoverableChainsIterations = 1; int64_t maxRecoverableChainLength = INT64_MAX; //Parameters for removing ancient homologies bool doPhylogeny = false; int64_t phylogenyNumTrees = 1; enum stCaf_RootingMethod phylogenyRootingMethod = BEST_RECON; enum stCaf_ScoringMethod phylogenyScoringMethod = COMBINED_LIKELIHOOD; double breakpointScalingFactor = 1.0; bool phylogenySkipSingleCopyBlocks = 0; int64_t phylogenyMaxBaseDistance = 1000; int64_t phylogenyMaxBlockDistance = 100; bool phylogenyKeepSingleDegreeBlocks = 0; stList *phylogenyTreeBuildingMethods = stList_construct(); enum stCaf_TreeBuildingMethod defaultMethod = GUIDED_NEIGHBOR_JOINING; stList_append(phylogenyTreeBuildingMethods, &defaultMethod); double phylogenyCostPerDupPerBase = 0.2; double phylogenyCostPerLossPerBase = 0.2; const char *debugFileName = NULL; const char *referenceEventHeader = NULL; double phylogenyDoSplitsWithSupportHigherThanThisAllAtOnce = 1.0; int64_t numTreeBuildingThreads = 2; int64_t minimumBlockDegreeToCheckSupport = 10; double minimumBlockHomologySupport = 0.7; double nucleotideScalingFactor = 1.0; HomologyUnitType phylogenyHomologyUnitType = BLOCK; enum stCaf_DistanceCorrectionMethod phylogenyDistanceCorrectionMethod = JUKES_CANTOR; bool sortAlignments = false; /////////////////////////////////////////////////////////////////////////// // (0) Parse the inputs handed by genomeCactus.py / setup stuff. 
/////////////////////////////////////////////////////////////////////////// while (1) { static struct option long_options[] = { { "logLevel", required_argument, 0, 'a' }, { "alignments", required_argument, 0, 'b' }, { "cactusDisk", required_argument, 0, 'c' }, { "lastzArguments", required_argument, 0, 'd' }, { "help", no_argument, 0, 'h' }, { "annealingRounds", required_argument, 0, 'i' }, { "trim", required_argument, 0, 'k' }, { "trimChange", required_argument, 0, 'l', }, { "minimumTreeCoverage", required_argument, 0, 'm' }, { "blockTrim", required_argument, 0, 'n' }, { "deannealingRounds", required_argument, 0, 'o' }, { "minimumDegree", required_argument, 0, 'p' }, { "minimumIngroupDegree", required_argument, 0, 'q' }, { "minimumOutgroupDegree", required_argument, 0, 'r' }, { "alignmentFilter", required_argument, 0, 't' }, { "minimumSequenceLengthForBlast", required_argument, 0, 'v' }, { "maxAdjacencyComponentSizeRatio", required_argument, 0, 'w' }, { "constraints", required_argument, 0, 'x' }, { "minLengthForChromosome", required_argument, 0, 'y' }, { "proportionOfUnalignedBasesForNewChromosome", required_argument, 0, 'z' }, { "maximumMedianSequenceLengthBetweenLinkedEnds", required_argument, 0, 'A' }, { "realign", no_argument, 0, 'B' }, { "realignArguments", required_argument, 0, 'C' }, { "phylogenyNumTrees", required_argument, 0, 'D' }, { "phylogenyRootingMethod", required_argument, 0, 'E' }, { "phylogenyScoringMethod", required_argument, 0, 'F' }, { "phylogenyBreakpointScalingFactor", required_argument, 0, 'G' }, { "phylogenySkipSingleCopyBlocks", no_argument, 0, 'H' }, { "phylogenyMaxBaseDistance", required_argument, 0, 'I' }, { "phylogenyMaxBlockDistance", required_argument, 0, 'J' }, { "phylogenyDebugFile", required_argument, 0, 'K' }, { "phylogenyKeepSingleDegreeBlocks", no_argument, 0, 'L' }, { "phylogenyTreeBuildingMethod", required_argument, 0, 'M' }, { "phylogenyCostPerDupPerBase", required_argument, 0, 'N' }, { "phylogenyCostPerLossPerBase", required_argument, 0, 'O' }, { "referenceEventHeader", required_argument, 0, 'P' }, { "phylogenyDoSplitsWithSupportHigherThanThisAllAtOnce", required_argument, 0, 'Q' }, { "numTreeBuildingThreads", required_argument, 0, 'R' }, { "phylogeny", no_argument, 0, 'S' }, { "minimumBlockHomologySupport", required_argument, 0, 'T' }, { "phylogenyNucleotideScalingFactor", required_argument, 0, 'U' }, { "minimumBlockDegreeToCheckSupport", required_argument, 0, 'V' }, { "removeRecoverableChains", required_argument, 0, 'W' }, { "minimumNumberOfSpecies", required_argument, 0, 'X' }, { "phylogenyHomologyUnitType", required_argument, 0, 'Y' }, { "phylogenyDistanceCorrectionMethod", required_argument, 0, 'Z' }, { "maxRecoverableChainsIterations", required_argument, 0, '1' }, { "maxRecoverableChainLength", required_argument, 0, '2' }, { 0, 0, 0, 0 } }; int option_index = 0; key = getopt_long(argc, argv, "a:b:c:hi:k:m:n:o:p:q:r:stv:w:x:y:z:A:BC:D:E:", long_options, &option_index); if (key == -1) { break; } switch (key) { case 'a': logLevelString = stString_copy(optarg); st_setLogLevelFromString(logLevelString); break; case 'b': alignmentsFile = stString_copy(optarg); break; case 'c': cactusDiskDatabaseString = stString_copy(optarg); break; case 'd': lastzArguments = stString_copy(optarg); break; case 'h': usage(); return 0; case 'i': annealingRounds = getInts(optarg, &annealingRoundsLength); break; case 'o': meltingRounds = getInts(optarg, &meltingRoundsLength); break; case 'k': alignmentTrims = getInts(optarg, &alignmentTrimLength); break; case 'm': k = 
sscanf(optarg, "%f", &minimumTreeCoverage); assert(k == 1); break; case 'n': k = sscanf(optarg, "%" PRIi64 "", &blockTrim); assert(k == 1); break; case 'p': k = sscanf(optarg, "%" PRIi64 "", &minimumDegree); assert(k == 1); break; case 'q': k = sscanf(optarg, "%" PRIi64 "", &minimumIngroupDegree); assert(k == 1); break; case 'r': k = sscanf(optarg, "%" PRIi64 "", &minimumOutgroupDegree); assert(k == 1); break; case 't': if (strcmp(optarg, "singleCopyOutgroup") == 0) { sortAlignments = true; filterFn = stCaf_filterByOutgroup; } else if (strcmp(optarg, "relaxedSingleCopyOutgroup") == 0) { sortAlignments = true; filterFn = stCaf_relaxedFilterByOutgroup; } else if (strcmp(optarg, "singleCopy") == 0) { sortAlignments = true; filterFn = stCaf_filterByRepeatSpecies; } else if (strcmp(optarg, "relaxedSingleCopy") == 0) { sortAlignments = true; filterFn = stCaf_relaxedFilterByRepeatSpecies; } else if (strcmp(optarg, "singleCopyChr") == 0) { sortAlignments = true; filterFn = stCaf_singleCopyChr; } else if (strcmp(optarg, "singleCopyIngroup") == 0) { sortAlignments = true; filterFn = stCaf_singleCopyIngroup; } else if (strcmp(optarg, "relaxedSingleCopyIngroup") == 0) { sortAlignments = true; filterFn = stCaf_relaxedSingleCopyIngroup; } else if (strcmp(optarg, "none") == 0) { sortAlignments = false; filterFn = NULL; } else { st_errAbort("Could not recognize alignmentFilter option %s", optarg); } break; case 'v': k = sscanf(optarg, "%" PRIi64 "", &minimumSequenceLengthForBlast); assert(k == 1); break; case 'w': k = sscanf(optarg, "%f", &maximumAdjacencyComponentSizeRatio); assert(k == 1); break; case 'x': constraintsFile = stString_copy(optarg); break; case 'y': k = sscanf(optarg, "%" PRIi64 "", &minLengthForChromosome); assert(k == 1); break; case 'z': k = sscanf(optarg, "%f", &proportionOfUnalignedBasesForNewChromosome); assert(k == 1); break; case 'A': k = sscanf(optarg, "%" PRIi64 "", &maximumMedianSequenceLengthBetweenLinkedEnds); assert(k == 1); break; case 'B': realign = 1; break; case 'C': realignArguments = stString_copy(optarg); break; case 'D': k = sscanf(optarg, "%" PRIi64, &phylogenyNumTrees); assert(k == 1); break; case 'E': if (!strcmp(optarg, "outgroupBranch")) { phylogenyRootingMethod = OUTGROUP_BRANCH; } else if (!strcmp(optarg, "longestBranch")) { phylogenyRootingMethod = LONGEST_BRANCH; } else if (!strcmp(optarg, "bestRecon")) { phylogenyRootingMethod = BEST_RECON; } else { st_errAbort("Invalid tree rooting method: %s", optarg); } break; case 'F': if (!strcmp(optarg, "reconCost")) { phylogenyScoringMethod = RECON_COST; } else if (!strcmp(optarg, "nucLikelihood")) { phylogenyScoringMethod = NUCLEOTIDE_LIKELIHOOD; } else if (!strcmp(optarg, "reconLikelihood")) { phylogenyScoringMethod = RECON_LIKELIHOOD; } else if (!strcmp(optarg, "combinedLikelihood")) { phylogenyScoringMethod = COMBINED_LIKELIHOOD; } else { st_errAbort("Invalid tree scoring method: %s", optarg); } break; case 'G': k = sscanf(optarg, "%lf", &breakpointScalingFactor); assert(k == 1); break; case 'H': phylogenySkipSingleCopyBlocks = true; break; case 'I': k = sscanf(optarg, "%" PRIi64, &phylogenyMaxBaseDistance); assert(k == 1); break; case 'J': k = sscanf(optarg, "%" PRIi64, &phylogenyMaxBlockDistance); assert(k == 1); break; case 'K': debugFileName = stString_copy(optarg); break; case 'L': phylogenyKeepSingleDegreeBlocks = true; break; case 'M': // clear the default setting of the list stList_destruct(phylogenyTreeBuildingMethods); phylogenyTreeBuildingMethods = stList_construct(); stList *methodStrings = 
stString_splitByString(optarg, ","); for (int64_t i = 0; i < stList_length(methodStrings); i++) { char *methodString = stList_get(methodStrings, i); enum stCaf_TreeBuildingMethod *method = st_malloc(sizeof(enum stCaf_TreeBuildingMethod)); if (strcmp(methodString, "neighborJoining") == 0) { *method = NEIGHBOR_JOINING; } else if (strcmp(methodString, "guidedNeighborJoining") == 0) { *method = GUIDED_NEIGHBOR_JOINING; } else if (strcmp(methodString, "splitDecomposition") == 0) { *method = SPLIT_DECOMPOSITION; } else if (strcmp(methodString, "strictSplitDecomposition") == 0) { *method = STRICT_SPLIT_DECOMPOSITION; } else if (strcmp(methodString, "removeBadChains") == 0) { *method = REMOVE_BAD_CHAINS; } else { st_errAbort("Unknown tree building method: %s", methodString); } stList_append(phylogenyTreeBuildingMethods, method); } stList_destruct(methodStrings); break; case 'N': k = sscanf(optarg, "%lf", &phylogenyCostPerDupPerBase); assert(k == 1); break; case 'O': k = sscanf(optarg, "%lf", &phylogenyCostPerLossPerBase); assert(k == 1); break; case 'P': referenceEventHeader = stString_copy(optarg); break; case 'Q': k = sscanf(optarg, "%lf", &phylogenyDoSplitsWithSupportHigherThanThisAllAtOnce); assert(k == 1); break; case 'R': k = sscanf(optarg, "%" PRIi64, &numTreeBuildingThreads); assert(k == 1); break; case 'S': doPhylogeny = true; break; case 'T': k = sscanf(optarg, "%lf", &minimumBlockHomologySupport); assert(k == 1); assert(minimumBlockHomologySupport <= 1.0); assert(minimumBlockHomologySupport >= 0.0); break; case 'U': k = sscanf(optarg, "%lf", &nucleotideScalingFactor); assert(k == 1); break; case 'V': k = sscanf(optarg, "%" PRIi64, &minimumBlockDegreeToCheckSupport); assert(k == 1); break; case 'W': if (strcmp(optarg, "1") == 0) { removeRecoverableChains = true; recoverableChainsFilter = NULL; } else if (strcmp(optarg, "unequalNumberOfIngroupCopies") == 0) { removeRecoverableChains = true; recoverableChainsFilter = stCaf_chainHasUnequalNumberOfIngroupCopies; } else if (strcmp(optarg, "unequalNumberOfIngroupCopiesOrNoOutgroup") == 0) { removeRecoverableChains = true; recoverableChainsFilter = stCaf_chainHasUnequalNumberOfIngroupCopiesOrNoOutgroup; } else if (strcmp(optarg, "0") == 0) { removeRecoverableChains = false; } else { st_errAbort("Could not parse removeRecoverableChains argument"); } break; case 'X': k = sscanf(optarg, "%" PRIi64, &minimumNumberOfSpecies); if (k != 1) { st_errAbort("Error parsing the minimumNumberOfSpecies argument"); } break; case 'Y': if (strcmp(optarg, "chain") == 0) { phylogenyHomologyUnitType = CHAIN; } else if (strcmp(optarg, "block") == 0) { phylogenyHomologyUnitType = BLOCK; } else { st_errAbort("Could not parse the phylogenyHomologyUnitType argument"); } break; case 'Z': if (strcmp(optarg, "jukesCantor") == 0) { phylogenyDistanceCorrectionMethod = JUKES_CANTOR; } else if (strcmp(optarg, "none") == 0 ) { phylogenyDistanceCorrectionMethod = NONE; } else { st_errAbort("Could not parse the phylogenyDistanceCorrectionMethod argument"); } break; case '1': k = sscanf(optarg, "%" PRIi64, &maxRecoverableChainsIterations); if (k != 1) { st_errAbort("Error parsing the maxRecoverableChainsIterations argument"); } break; case '2': k = sscanf(optarg, "%" PRIi64, &maxRecoverableChainLength); if (k != 1) { st_errAbort("Error parsing the maxRecoverableChainLength argument"); } break; default: usage(); return 1; } } /////////////////////////////////////////////////////////////////////////// // (0) Check the inputs. 
/////////////////////////////////////////////////////////////////////////// assert(cactusDiskDatabaseString != NULL); assert(minimumTreeCoverage >= 0.0); assert(minimumTreeCoverage <= 1.0); assert(blockTrim >= 0); assert(annealingRoundsLength >= 0); for (int64_t i = 0; i < annealingRoundsLength; i++) { assert(annealingRounds[i] >= 0); } assert(meltingRoundsLength >= 0); for (int64_t i = 1; i < meltingRoundsLength; i++) { assert(meltingRounds[i - 1] < meltingRounds[i]); assert(meltingRounds[i - 1] >= 1); } assert(alignmentTrimLength >= 0); for (int64_t i = 0; i < alignmentTrimLength; i++) { assert(alignmentTrims[i] >= 0); } assert(minimumOutgroupDegree >= 0); assert(minimumIngroupDegree >= 0); ////////////////////////////////////////////// //Set up logging ////////////////////////////////////////////// st_setLogLevelFromString(logLevelString); ////////////////////////////////////////////// //Log (some of) the inputs ////////////////////////////////////////////// st_logInfo("Flower disk name : %s\n", cactusDiskDatabaseString); ////////////////////////////////////////////// //Load the database ////////////////////////////////////////////// kvDatabaseConf = stKVDatabaseConf_constructFromString(cactusDiskDatabaseString); cactusDisk = cactusDisk_construct(kvDatabaseConf, 0); st_logInfo("Set up the flower disk\n"); /////////////////////////////////////////////////////////////////////////// // Sort the constraints /////////////////////////////////////////////////////////////////////////// stPinchIterator *pinchIteratorForConstraints = NULL; if (constraintsFile != NULL) { pinchIteratorForConstraints = stPinchIterator_constructFromFile(constraintsFile); st_logInfo("Created an iterator for the alignment constaints from file: %s\n", constraintsFile); } /////////////////////////////////////////////////////////////////////////// // Do the alignment /////////////////////////////////////////////////////////////////////////// startTime = time(NULL); stList *flowers = flowerWriter_parseFlowersFromStdin(cactusDisk); if (alignmentsFile == NULL) { cactusDisk_preCacheStrings(cactusDisk, flowers); } char *tempFile1 = NULL; for (int64_t i = 0; i < stList_length(flowers); i++) { flower = stList_get(flowers, i); if (!flower_builtBlocks(flower)) { // Do nothing if the flower already has defined blocks st_logDebug("Processing flower: %lli\n", flower_getName(flower)); stCaf_setFlowerForAlignmentFiltering(flower); //Set up the graph and add the initial alignments stPinchThreadSet *threadSet = stCaf_setup(flower); //Build the set of outgroup threads outgroupThreads = stCaf_getOutgroupThreads(flower, threadSet); //Setup the alignments stPinchIterator *pinchIterator; stList *alignmentsList = NULL; if (alignmentsFile != NULL) { assert(i == 0); assert(stList_length(flowers) == 1); if (sortAlignments) { tempFile1 = getTempFile(); stCaf_sortCigarsFileByScoreInDescendingOrder(alignmentsFile, tempFile1); pinchIterator = stPinchIterator_constructFromFile(tempFile1); } else { pinchIterator = stPinchIterator_constructFromFile(alignmentsFile); } } else { if (tempFile1 == NULL) { tempFile1 = getTempFile(); } alignmentsList = stCaf_selfAlignFlower(flower, minimumSequenceLengthForBlast, lastzArguments, realign, realignArguments, tempFile1); if (sortAlignments) { stCaf_sortCigarsByScoreInDescendingOrder(alignmentsList); } st_logDebug("Ran lastz and have %" PRIi64 " alignments\n", stList_length(alignmentsList)); pinchIterator = stPinchIterator_constructFromList(alignmentsList); } for (int64_t annealingRound = 0; annealingRound < 
annealingRoundsLength; annealingRound++) { int64_t minimumChainLength = annealingRounds[annealingRound]; int64_t alignmentTrim = annealingRound < alignmentTrimLength ? alignmentTrims[annealingRound] : 0; st_logDebug("Starting annealing round with a minimum chain length of %" PRIi64 " and an alignment trim of %" PRIi64 "\n", minimumChainLength, alignmentTrim); stPinchIterator_setTrim(pinchIterator, alignmentTrim); //Add back in the constraints if (pinchIteratorForConstraints != NULL) { stCaf_anneal(threadSet, pinchIteratorForConstraints, filterFn); } //Do the annealing if (annealingRound == 0) { stCaf_anneal(threadSet, pinchIterator, filterFn); } else { stCaf_annealBetweenAdjacencyComponents(threadSet, pinchIterator, filterFn); } // Dump the block degree and length distribution to a file if (debugFileName != NULL) { dumpBlockInfo(threadSet, stString_print("%s-blockStats-preMelting", debugFileName)); } printf("Sequence graph statistics after annealing:\n"); printThreadSetStatistics(threadSet, flower, stdout); // Check for poorly-supported blocks--those that have // been transitively aligned together but with very // few homologies supporting the transitive // alignment. These "megablocks" can snarl up the // graph so that a lot of extra gets thrown away in // the first melting step. stPinchThreadSetBlockIt blockIt = stPinchThreadSet_getBlockIt(threadSet); stPinchBlock *block; while ((block = stPinchThreadSetBlockIt_getNext(&blockIt)) != NULL) { if (stPinchBlock_getDegree(block) > minimumBlockDegreeToCheckSupport) { uint64_t supportingHomologies = stPinchBlock_getNumSupportingHomologies(block); uint64_t possibleSupportingHomologies = numPossibleSupportingHomologies(block, flower); double support = ((double) supportingHomologies) / possibleSupportingHomologies; if (support < minimumBlockHomologySupport) { fprintf(stdout, "Destroyed a megablock with degree %" PRIi64 " and %" PRIi64 " supporting homologies out of a maximum " "of %" PRIi64 " (%lf%%).\n", stPinchBlock_getDegree(block), supportingHomologies, possibleSupportingHomologies, support); stPinchBlock_destruct(block); } } } //Do the melting rounds for (int64_t meltingRound = 0; meltingRound < meltingRoundsLength; meltingRound++) { int64_t minimumChainLengthForMeltingRound = meltingRounds[meltingRound]; st_logDebug("Starting melting round with a minimum chain length of %" PRIi64 " \n", minimumChainLengthForMeltingRound); if (minimumChainLengthForMeltingRound >= minimumChainLength) { break; } stCaf_melt(flower, threadSet, NULL, 0, minimumChainLengthForMeltingRound, 0, INT64_MAX); } st_logDebug("Last melting round of cycle with a minimum chain length of %" PRIi64 " \n", minimumChainLength); stCaf_melt(flower, threadSet, NULL, 0, minimumChainLength, breakChainsAtReverseTandems, maximumMedianSequenceLengthBetweenLinkedEnds); //This does the filtering of blocks that do not have the required species/tree-coverage/degree. 
stCaf_melt(flower, threadSet, blockFilterFn, blockTrim, 0, 0, INT64_MAX); } if (removeRecoverableChains) { stCaf_meltRecoverableChains(flower, threadSet, breakChainsAtReverseTandems, maximumMedianSequenceLengthBetweenLinkedEnds, recoverableChainsFilter, maxRecoverableChainsIterations, maxRecoverableChainLength); } if (debugFileName != NULL) { dumpBlockInfo(threadSet, stString_print("%s-blockStats-postMelting", debugFileName)); } printf("Sequence graph statistics after melting:\n"); printThreadSetStatistics(threadSet, flower, stdout); // Build a tree for each block, then use each tree to // partition the homologies between the ingroups sequences // into those that occur before the speciation with the // outgroup and those which occur late. if (stSet_size(outgroupThreads) > 0 && doPhylogeny) { st_logDebug("Starting to build trees and partition ingroup homologies\n"); stHash *threadStrings = stCaf_getThreadStrings(flower, threadSet); st_logDebug("Got sets of thread strings and set of threads that are outgroups\n"); stCaf_PhylogenyParameters params; params.distanceCorrectionMethod = phylogenyDistanceCorrectionMethod; params.treeBuildingMethods = phylogenyTreeBuildingMethods; params.rootingMethod = phylogenyRootingMethod; params.scoringMethod = phylogenyScoringMethod; params.breakpointScalingFactor = breakpointScalingFactor; params.nucleotideScalingFactor = nucleotideScalingFactor; params.skipSingleCopyBlocks = phylogenySkipSingleCopyBlocks; params.keepSingleDegreeBlocks = phylogenyKeepSingleDegreeBlocks; params.costPerDupPerBase = phylogenyCostPerDupPerBase; params.costPerLossPerBase = phylogenyCostPerLossPerBase; params.maxBaseDistance = phylogenyMaxBaseDistance; params.maxBlockDistance = phylogenyMaxBlockDistance; params.numTrees = phylogenyNumTrees; params.ignoreUnalignedBases = 1; params.onlyIncludeCompleteFeatureBlocks = 0; params.doSplitsWithSupportHigherThanThisAllAtOnce = phylogenyDoSplitsWithSupportHigherThanThisAllAtOnce; params.numTreeBuildingThreads = numTreeBuildingThreads; assert(params.numTreeBuildingThreads >= 1); stCaf_buildTreesToRemoveAncientHomologies( threadSet, phylogenyHomologyUnitType, threadStrings, outgroupThreads, flower, ¶ms, debugFileName == NULL ? NULL : stString_print("%s-phylogeny", debugFileName), referenceEventHeader); stHash_destruct(threadStrings); st_logDebug("Finished building trees\n"); if (removeRecoverableChains) { // We melt recoverable chains after splitting, as // well as before, to alleviate coverage loss // caused by bad splits. stCaf_meltRecoverableChains(flower, threadSet, breakChainsAtReverseTandems, maximumMedianSequenceLengthBetweenLinkedEnds, recoverableChainsFilter, maxRecoverableChainsIterations, maxRecoverableChainLength); } // Enforce the block constraints on minimum degree, // etc. after splitting. stCaf_melt(flower, threadSet, blockFilterFn, 0, 0, 0, INT64_MAX); } //Sort out case when we allow blocks of degree 1 if (minimumDegree < 2) { st_logDebug("Creating degree 1 blocks\n"); stCaf_makeDegreeOneBlocks(threadSet); stCaf_melt(flower, threadSet, blockFilterFn, blockTrim, 0, 0, INT64_MAX); } else if (maximumAdjacencyComponentSizeRatio < INT64_MAX) { //Deal with giant components st_logDebug("Breaking up components greedily\n"); stCaf_breakupComponentsGreedily(threadSet, maximumAdjacencyComponentSizeRatio); } //Finish up stCaf_finish(flower, threadSet, chainLengthForBigFlower, longChain, minLengthForChromosome, proportionOfUnalignedBasesForNewChromosome); //Flower is then destroyed at this point. 
st_logInfo("Ran the cactus core script\n"); //Cleanup stPinchThreadSet_destruct(threadSet); stPinchIterator_destruct(pinchIterator); stSet_destruct(outgroupThreads); if (alignmentsList != NULL) { stList_destruct(alignmentsList); } st_logInfo("Cleaned up from main loop\n"); } else { st_logInfo("We've already built blocks / alignments for this flower\n"); } } stList_destruct(flowers); if (tempFile1 != NULL) { st_system("rm %s", tempFile1); } if (constraintsFile != NULL) { stPinchIterator_destruct(pinchIteratorForConstraints); } /////////////////////////////////////////////////////////////////////////// // Write the flower to disk. /////////////////////////////////////////////////////////////////////////// st_logDebug("Writing the flowers to disk\n"); cactusDisk_write(cactusDisk); st_logInfo("Updated the flower on disk and %" PRIi64 " seconds have elapsed\n", time(NULL) - startTime); /////////////////////////////////////////////////////////////////////////// // Clean up. /////////////////////////////////////////////////////////////////////////// cactusDisk_destruct(cactusDisk); }
int main(int argc, char *argv[]) { int64_t j = 0; char *npReadFile = NULL; char *templateModelFile = stString_print("../models/testModelR9_template.model"); char *complementModelFile = stString_print("../models/testModelR9_complement_pop2.model"); double threshold = 0.8; int key; while (1) { static struct option long_options[] = { {"help", no_argument, 0, 'h'}, {"templateModel", required_argument, 0, 'T'}, {"complementModel", required_argument, 0, 'C'}, {"npRead", required_argument, 0, 'q'}, {"threshold", required_argument, 0, 'D'}, {0, 0, 0, 0} }; int option_index = 0; key = getopt_long(argc, argv, "h:T:C:q:f:b:D:m:", long_options, &option_index); if (key == -1) { //usage(); break; } switch (key) { case 'h': usage(); return 1; case 'T': templateModelFile = stString_copy(optarg); break; case 'C': complementModelFile = stString_copy(optarg); break; case 'q': npReadFile = stString_copy(optarg); break; case 'D': j = sscanf(optarg, "%lf", &threshold); assert (j == 1); assert (threshold >= 0); break; default: usage(); return 1; } } if (!stFile_exists(npReadFile)) { st_errAbort("Could not find npRead here: %s\n", npReadFile); } // read in the .npRead file NanoporeRead *npRead = nanopore_loadNanoporeReadFromFile(npReadFile); // build state machines (to use the look up table) StateMachine *sMt = getStateMachine3(templateModelFile); //StateMachine *sMc = getStateMachine3(complementModelFile); // make 1D map of events (mean, noise) to kmers stList *templateMap = signalUtils_templateOneDAssignmentsFromRead(npRead, sMt, ASSIGNMENT_THRESHOLD); //stList *complementMap = signalUtils_complementOneDAssignmentsFromRead(npRead, sMc, ASSIGNMENT_THRESHOLD); // convert template to log normal // NB only need this if you're estimating the NOISE parameteres //nanopore_convert_to_lognormal_params(sMt->alphabetSize, sMt->kmerLength, sMt->EMISSION_MATCH_MATRIX, templateMap); // convert complement to log normal //nanopore_convert_to_lognormal_params(sMc->alphabetSize, sMc->kmerLength, sMc->EMISSION_MATCH_MATRIX, complementMap); // error log report st_uglyf("SENTINEL - Before: shift: %f scale: %f var: %f [template]\n", npRead->templateParams.shift, npRead->templateParams.scale, npRead->templateParams.var); // compute template params //nanopore_compute_noise_scale_params(sMt->EMISSION_MATCH_MATRIX, templateMap, &npRead->templateParams); // compute complement params //nanopore_compute_noise_scale_params(sMc->EMISSION_MATCH_MATRIX, complementMap, &npRead->complementParams); // error log report signalUtils_estimateNanoporeParams(sMt, npRead, &npRead->templateParams, ASSIGNMENT_THRESHOLD, signalUtils_templateOneDAssignmentsFromRead, nanopore_dontAdjustEvents); //signalUtils_estimateNanoporeParams(sMc, npRead, &npRead->complementParams, ASSIGNMENT_THRESHOLD, // signalUtils_complementOneDAssignmentsFromRead, nanopore_dontAdjustEvents); st_uglyf("SENTINEL - After: shift: %f scale: %f var: %f [template]\n", npRead->templateParams.shift, npRead->templateParams.scale, npRead->templateParams.var); //st_uglyf("SENTINEL - After: shift_sd: %f scale_sd: %f var_sd: %f [template]\n", // npRead->complementParams.shift_sd, npRead->complementParams.scale_sd, npRead->complementParams.var_sd); stList *templateKmers = lineTokensFromFile(npReadFile, 10); //stList *complementKmers = lineTokensFromFile(npReadFile, 12); //printEventNoisesAndParams(npRead, templateKmers, complementKmers); printEventMeansAndParams(npRead, templateKmers, NULL); stList_destruct(templateKmers); //stList_destruct(complementKmers); stList_destruct(templateMap); 
//stList_destruct(complementMap); nanopore_nanoporeReadDestruct(npRead); stateMachine_destruct(sMt); //stateMachine_destruct(sMc); (void) j; // silence unused variable warning. return 0; }