Пример #1
0
/*
 * Randomized smoke test for stCaf_breakupComponentsGreedily.
 *
 * Each of the 10000 iterations builds a random pinch graph, logs the size of its
 * adjacency components, calls stCaf_breakupComponentsGreedily with a ratio drawn as
 * st_random() * 10, and then logs the adjacency component sizes again.
 *
 * Note that no CuAssert check is made here (testCase is not used), so the test
 * can only fail by crashing or aborting inside the code under test. The
 * maximumAdjacencyComponentSize computed below is used for logging only; it is
 * not passed to the function under test.
 */
static void testBreakUpPinchGraphAdjacencyComponentsGreedily(CuTest *testCase) {
    for (int64_t test = 0; test < 10000; test++) {
        st_logInfo("Starting break up giant pinch graph components random test %" PRIi64 "\n", test);
        stPinchThreadSet *threadSet = stPinchThreadSet_getRandomGraph();
        // Two nodes are counted per block.
        int64_t totalNodes = 2 * stPinchThreadSet_getTotalBlockNumber(threadSet);
        float maximumAdjacencyComponentSizeRatio = st_random() * 10;
        // The log of a ratio <= 1 is zero or negative, and a ratio of exactly 0 would
        // give -infinity (or NaN once multiplied by zero nodes). Converting a
        // non-finite value to int64_t is undefined, so clamp to the minimum of 2 in
        // floating point first; the comparison is false for NaN, which also yields 2.
        double unclampedMaximumAdjacencyComponentSize = log(maximumAdjacencyComponentSizeRatio) * totalNodes;
        int64_t maximumAdjacencyComponentSize =
                unclampedMaximumAdjacencyComponentSize >= 2.0 ? (int64_t) unclampedMaximumAdjacencyComponentSize : 2;
        stList *adjacencyComponents = stPinchThreadSet_getAdjacencyComponents(threadSet);
        int64_t largestAdjacencyComponentSizeInGraph = getSizeOfLargestAdjacencyComponent(adjacencyComponents);
        // The comparison yields an int; cast it so it matches the final PRIi64 conversion.
        st_logInfo(
                "We have a random pinch graph with %" PRIi64 " nodes and %" PRIi64 " adjacency components, the largest adjacency component has %" PRIi64 " nodes, with a ratio of %f we will break up adjacency components larger than %" PRIi64 " in size, this will result in a breakup: %" PRIi64 "\n",
                totalNodes, stList_length(adjacencyComponents), largestAdjacencyComponentSizeInGraph, maximumAdjacencyComponentSizeRatio,
                maximumAdjacencyComponentSize, (int64_t) (largestAdjacencyComponentSizeInGraph > maximumAdjacencyComponentSize));
        stList_destruct(adjacencyComponents);
        //Now do the actual breaking up
        stCaf_breakupComponentsGreedily(threadSet, maximumAdjacencyComponentSizeRatio);
        // Recompute the components and node count on the modified graph for the second log line.
        adjacencyComponents = stPinchThreadSet_getAdjacencyComponents(threadSet);
        int64_t largestAdjacencyComponentSizeInGraphAfterBreakup = getSizeOfLargestAdjacencyComponent(adjacencyComponents);
        totalNodes = 2 * stPinchThreadSet_getTotalBlockNumber(threadSet);
        st_logInfo(
                "After splitting we have a pinch graph with %" PRIi64 " nodes and %" PRIi64 " adjacency components, the largest adjacency component has %" PRIi64 " nodes, with a ratio of %f that broke up adjacency components larger than %" PRIi64 " in size\n",
                totalNodes, stList_length(adjacencyComponents), largestAdjacencyComponentSizeInGraphAfterBreakup,
                maximumAdjacencyComponentSizeRatio, maximumAdjacencyComponentSize);
        //Cleanup
        stList_destruct(adjacencyComponents);
        stPinchThreadSet_destruct(threadSet);
    }
}
Пример #2
0
int main(int argc, char** argv) {
    
    if(argc == 1) {
        // Print the help
        help_main(argv);
        return 1;
    }
    
    size_t kmerSize = 0;
    size_t edgeMax = 0;
    
    // Should we only merge on kmers and skip paths?
    bool kmersOnly = false;
    
    optind = 1; // Start at first real argument
    bool optionsRemaining = true;
    while(optionsRemaining) {
        static struct option longOptions[] = {
            {"kmer-size", required_argument, 0, 'k'},
            {"edge-max", required_argument, 0, 'e'},
            {"kmers-only", no_argument, 0, 'o'},
            {"threads", required_argument, 0, 't'},
            {"help", no_argument, 0, 'h'},
            {0, 0, 0, 0}
        };

        int optionIndex = 0;

        switch(getopt_long(argc, argv, "k:e:t:h", longOptions, &optionIndex)) {
        // Option value is in global optarg
        case -1:
            optionsRemaining = false;
            break;
        case 'k': // Set the kmer size
            kmerSize = atol(optarg);
            break;
        case 'e': // Set the edge max parameter for kmer enumeration
            edgeMax = atol(optarg);
            break;
        case 'o': // Only merge on kmers
            kmersOnly = true;
            break;
        case 't': // Set the openmp threads
            omp_set_num_threads(atoi(optarg));
            break;
        case 'h': // When the user asks for help
        case '?': // When we get options we can't parse
            help_main(argv);
            exit(1);
            break;
        default:
            // TODO: keep track of the option
            std::cerr << "Illegal option" << std::endl;
            exit(1);
        }
    }
    
    if(argc - optind < 2) {
        // We don't have two positional arguments
        // Print the help
        help_main(argv);
        return 1;
    }
    
    if(kmersOnly && kmerSize == 0) {
        // We need a kmer size to use kmers
        throw std::runtime_error("Can't merge only on kmers with no kmer size");
    }
    
    // Pull out the VG file names
    std::string vgFile1 = argv[optind++];
    std::string vgFile2 = argv[optind++];
    
    // Guess index names (TODO: add options)
    std::string indexDir1 = vgFile1 + ".index";
    std::string indexDir2 = vgFile2 + ".index";
    
    // Open the files
    std::ifstream vgStream1(vgFile1);
    if(!vgStream1.good()) {
        std::cerr << "Could not read " << vgFile1 << std::endl;
        exit(1);
    }
    
    std::ifstream vgStream2(vgFile2);
    if(!vgStream2.good()) {
        std::cerr << "Could not read " << vgFile2 << std::endl;
        exit(1);
    }
    
    // We may have indexes. We need to use pointers because destructing an index
    // that was never opened segfaults. TODO: fix vg
    vg::Index* index1 = nullptr;
    vg::Index* index2 = nullptr;
    
    if(kmerSize) {
        // Only go looking for indexes if we want to merge on kmers.
        index1 = new vg::Index();
        index1->open_read_only(indexDir1);
        index2 = new vg::Index();
        index2->open_read_only(indexDir2);
    }
    
    
    // Load up the first VG file
    vg::VG vg1(vgStream1);
    // And the second
    vg::VG vg2(vgStream2);
    
    
    // Make a way to track IDs
    int64_t nextId = 1;
    std::function<int64_t(void)> getId = [&]() {
        return nextId++;
    };
    
    // Make a thread set
    auto threadSet = stPinchThreadSet_construct();
    
    // Make a place to keep track of the thread sequences.
    // This will only contain sequences for threads that aren't staples.
    // TODO: should this be by pointer instead?
    std::map<int64_t, std::string> threadSequences;
    
    // Add in each vg graph to the thread set
    coregraph::EmbeddedGraph embedding1(vg1, threadSet, threadSequences, getId, vgFile1);
    coregraph::EmbeddedGraph embedding2(vg2, threadSet, threadSequences, getId, vgFile2);
    
    if(!kmersOnly) {
        // We want to merge on shared paths in addition to kmers
    
        // Complain if any of the graphs is not completely covered by paths
        if(!embedding1.isCoveredByPaths()) {
            std::cerr << "WARNING: " << embedding1.getName() << " contains nodes with no paths!" << std::endl;
        }
        if(!embedding2.isCoveredByPaths()) {
            std::cerr << "WARNING: " << embedding2.getName() << " contains nodes with no paths!" << std::endl;
        }
        
        // Trace the paths and merge the embedded graphs.
        std::cerr << "Pinching graphs on shared paths..." << std::endl;
        embedding1.pinchWith(embedding2);
    }
    
    if(kmerSize > 0) {
        // Merge on kmers that are unique in both graphs.
        std::cerr << "Pinching graphs on shared " << kmerSize << "-mers..." << std::endl;
        embedding1.pinchOnKmers(*index1, embedding2, *index2, kmerSize, edgeMax);
    }
    
    // Fix trivial joins so we don't produce more vg nodes than we really need to.
    stPinchThreadSet_joinTrivialBoundaries(threadSet);
    
    // Make another vg graph from the thread set
    vg::VG core = pinchToVG(threadSet, threadSequences);
    
    // Spit it out to standard output
    core.serialize_to_ostream(std::cout);
    
    // Tear everything down. TODO: can we somehow run this destruction function
    // after all our other, potentially depending locals are destructed?
    stPinchThreadSet_destruct(threadSet);
    
    return 0;
}
Пример #3
0
int main(int argc, char *argv[]) {
    /*
     * Script for adding alignments to cactus tree.
     */
    int64_t startTime;
    stKVDatabaseConf *kvDatabaseConf;
    CactusDisk *cactusDisk;
    int key, k;

    bool (*filterFn)(stPinchSegment *, stPinchSegment *) = NULL;
    stSet *outgroupThreads = NULL;

    /*
     * Arguments/options
     */
    char * logLevelString = NULL;
    char * alignmentsFile = NULL;
    char * constraintsFile = NULL;
    char * cactusDiskDatabaseString = NULL;
    char * lastzArguments = "";
    int64_t minimumSequenceLengthForBlast = 1;

    //Parameters for annealing/melting rounds
    int64_t *annealingRounds = NULL;
    int64_t annealingRoundsLength = 0;
    int64_t *meltingRounds = NULL;
    int64_t meltingRoundsLength = 0;

    //Parameters for melting
    float maximumAdjacencyComponentSizeRatio = 10;
    int64_t blockTrim = 0;
    int64_t alignmentTrimLength = 0;
    int64_t *alignmentTrims = NULL;
    int64_t chainLengthForBigFlower = 1000000;
    int64_t longChain = 2;
    int64_t minLengthForChromosome = 1000000;
    float proportionOfUnalignedBasesForNewChromosome = 0.8;
    bool breakChainsAtReverseTandems = 1;
    int64_t maximumMedianSequenceLengthBetweenLinkedEnds = INT64_MAX;
    bool realign = 0;
    char *realignArguments = "";
    bool removeRecoverableChains = false;
    bool (*recoverableChainsFilter)(stCactusEdgeEnd *, Flower *) = NULL;
    int64_t maxRecoverableChainsIterations = 1;
    int64_t maxRecoverableChainLength = INT64_MAX;

    //Parameters for removing ancient homologies
    bool doPhylogeny = false;
    int64_t phylogenyNumTrees = 1;
    enum stCaf_RootingMethod phylogenyRootingMethod = BEST_RECON;
    enum stCaf_ScoringMethod phylogenyScoringMethod = COMBINED_LIKELIHOOD;
    double breakpointScalingFactor = 1.0;
    bool phylogenySkipSingleCopyBlocks = 0;
    int64_t phylogenyMaxBaseDistance = 1000;
    int64_t phylogenyMaxBlockDistance = 100;
    bool phylogenyKeepSingleDegreeBlocks = 0;
    stList *phylogenyTreeBuildingMethods = stList_construct();
    enum stCaf_TreeBuildingMethod defaultMethod = GUIDED_NEIGHBOR_JOINING;
    stList_append(phylogenyTreeBuildingMethods, &defaultMethod);
    double phylogenyCostPerDupPerBase = 0.2;
    double phylogenyCostPerLossPerBase = 0.2;
    const char *debugFileName = NULL;
    const char *referenceEventHeader = NULL;
    double phylogenyDoSplitsWithSupportHigherThanThisAllAtOnce = 1.0;
    int64_t numTreeBuildingThreads = 2;
    int64_t minimumBlockDegreeToCheckSupport = 10;
    double minimumBlockHomologySupport = 0.7;
    double nucleotideScalingFactor = 1.0;
    HomologyUnitType phylogenyHomologyUnitType = BLOCK;
    enum stCaf_DistanceCorrectionMethod phylogenyDistanceCorrectionMethod = JUKES_CANTOR;
    bool sortAlignments = false;

    ///////////////////////////////////////////////////////////////////////////
    // (0) Parse the inputs handed by genomeCactus.py / setup stuff.
    ///////////////////////////////////////////////////////////////////////////

    while (1) {
        static struct option long_options[] = { { "logLevel", required_argument, 0, 'a' }, { "alignments", required_argument, 0, 'b' }, {
                "cactusDisk", required_argument, 0, 'c' }, { "lastzArguments", required_argument, 0, 'd' },
                { "help", no_argument, 0, 'h' }, { "annealingRounds", required_argument, 0, 'i' }, { "trim", required_argument, 0, 'k' }, {
                        "trimChange", required_argument, 0, 'l', }, { "minimumTreeCoverage", required_argument, 0, 'm' }, { "blockTrim",
                        required_argument, 0, 'n' }, { "deannealingRounds", required_argument, 0, 'o' }, { "minimumDegree",
                        required_argument, 0, 'p' }, { "minimumIngroupDegree", required_argument, 0, 'q' }, {
                        "minimumOutgroupDegree", required_argument, 0, 'r' }, { "alignmentFilter", required_argument, 0, 't' }, {
                        "minimumSequenceLengthForBlast", required_argument, 0, 'v' }, { "maxAdjacencyComponentSizeRatio",
                        required_argument, 0, 'w' }, { "constraints", required_argument, 0, 'x' }, { "minLengthForChromosome",
                        required_argument, 0, 'y' }, { "proportionOfUnalignedBasesForNewChromosome", required_argument, 0, 'z' },
                        { "maximumMedianSequenceLengthBetweenLinkedEnds", required_argument, 0, 'A' },
                        { "realign", no_argument, 0, 'B' }, { "realignArguments", required_argument, 0, 'C' },
                        { "phylogenyNumTrees", required_argument, 0, 'D' },
                        { "phylogenyRootingMethod", required_argument, 0, 'E' },
                        { "phylogenyScoringMethod", required_argument, 0, 'F' },
                        { "phylogenyBreakpointScalingFactor", required_argument, 0, 'G' },
                        { "phylogenySkipSingleCopyBlocks", no_argument, 0, 'H' },
                        { "phylogenyMaxBaseDistance", required_argument, 0, 'I' },
                        { "phylogenyMaxBlockDistance", required_argument, 0, 'J' },
                        { "phylogenyDebugFile", required_argument, 0, 'K' },
                        { "phylogenyKeepSingleDegreeBlocks", no_argument, 0, 'L' },
                        { "phylogenyTreeBuildingMethod", required_argument, 0, 'M' },
                        { "phylogenyCostPerDupPerBase", required_argument, 0, 'N' },
                        { "phylogenyCostPerLossPerBase", required_argument, 0, 'O' },
                        { "referenceEventHeader", required_argument, 0, 'P' },
                        { "phylogenyDoSplitsWithSupportHigherThanThisAllAtOnce", required_argument, 0, 'Q' },
                        { "numTreeBuildingThreads", required_argument, 0, 'R' },
                        { "phylogeny", no_argument, 0, 'S' },
                        { "minimumBlockHomologySupport", required_argument, 0, 'T' },
                        { "phylogenyNucleotideScalingFactor", required_argument, 0, 'U' },
                        { "minimumBlockDegreeToCheckSupport", required_argument, 0, 'V' },
                        { "removeRecoverableChains", required_argument, 0, 'W' },
                        { "minimumNumberOfSpecies", required_argument, 0, 'X' },
                        { "phylogenyHomologyUnitType", required_argument, 0, 'Y' },
                        { "phylogenyDistanceCorrectionMethod", required_argument, 0, 'Z' },
                        { "maxRecoverableChainsIterations", required_argument, 0, '1' },
                        { "maxRecoverableChainLength", required_argument, 0, '2' },
                        { 0, 0, 0, 0 } };

        int option_index = 0;

        key = getopt_long(argc, argv, "a:b:c:hi:k:m:n:o:p:q:r:stv:w:x:y:z:A:BC:D:E:", long_options, &option_index);

        if (key == -1) {
            break;
        }

        switch (key) {
            case 'a':
                logLevelString = stString_copy(optarg);
                st_setLogLevelFromString(logLevelString);
                break;
            case 'b':
                alignmentsFile = stString_copy(optarg);
                break;
            case 'c':
                cactusDiskDatabaseString = stString_copy(optarg);
                break;
            case 'd':
                lastzArguments = stString_copy(optarg);
                break;
            case 'h':
                usage();
                return 0;
            case 'i':
                annealingRounds = getInts(optarg, &annealingRoundsLength);
                break;
            case 'o':
                meltingRounds = getInts(optarg, &meltingRoundsLength);
                break;
            case 'k':
                alignmentTrims = getInts(optarg, &alignmentTrimLength);
                break;
            case 'm':
                k = sscanf(optarg, "%f", &minimumTreeCoverage);
                assert(k == 1);
                break;
            case 'n':
                k = sscanf(optarg, "%" PRIi64 "", &blockTrim);
                assert(k == 1);
                break;
            case 'p':
                k = sscanf(optarg, "%" PRIi64 "", &minimumDegree);
                assert(k == 1);
                break;
            case 'q':
                k = sscanf(optarg, "%" PRIi64 "", &minimumIngroupDegree);
                assert(k == 1);
                break;
            case 'r':
                k = sscanf(optarg, "%" PRIi64 "", &minimumOutgroupDegree);
                assert(k == 1);
                break;
            case 't':
                if (strcmp(optarg, "singleCopyOutgroup") == 0) {
                    sortAlignments = true;
                    filterFn = stCaf_filterByOutgroup;
                } else if (strcmp(optarg, "relaxedSingleCopyOutgroup") == 0) {
                    sortAlignments = true;
                    filterFn = stCaf_relaxedFilterByOutgroup;
                } else if (strcmp(optarg, "singleCopy") == 0) {
                    sortAlignments = true;
                    filterFn = stCaf_filterByRepeatSpecies;
                } else if (strcmp(optarg, "relaxedSingleCopy") == 0) {
                    sortAlignments = true;
                    filterFn = stCaf_relaxedFilterByRepeatSpecies;
                } else if (strcmp(optarg, "singleCopyChr") == 0) {
                    sortAlignments = true;
                    filterFn = stCaf_singleCopyChr;
                } else if (strcmp(optarg, "singleCopyIngroup") == 0) {
                    sortAlignments = true;
                    filterFn = stCaf_singleCopyIngroup;
                } else if (strcmp(optarg, "relaxedSingleCopyIngroup") == 0) {
                    sortAlignments = true;
                    filterFn = stCaf_relaxedSingleCopyIngroup;
                } else if (strcmp(optarg, "none") == 0) {
                    sortAlignments = false;
                    filterFn = NULL;
                } else {
                    st_errAbort("Could not recognize alignmentFilter option %s", optarg);
                }
                break;
            case 'v':
                k = sscanf(optarg, "%" PRIi64 "", &minimumSequenceLengthForBlast);
                assert(k == 1);
                break;
            case 'w':
                k = sscanf(optarg, "%f", &maximumAdjacencyComponentSizeRatio);
                assert(k == 1);
                break;
            case 'x':
                constraintsFile = stString_copy(optarg);
                break;
            case 'y':
                k = sscanf(optarg, "%" PRIi64 "", &minLengthForChromosome);
                assert(k == 1);
                break;
            case 'z':
                k = sscanf(optarg, "%f", &proportionOfUnalignedBasesForNewChromosome);
                assert(k == 1);
                break;
            case 'A':
                k = sscanf(optarg, "%" PRIi64 "", &maximumMedianSequenceLengthBetweenLinkedEnds);
                assert(k == 1);
                break;
            case 'B':
                realign = 1;
                break;
            case 'C':
                realignArguments = stString_copy(optarg);
                break;
            case 'D':
                k = sscanf(optarg, "%" PRIi64, &phylogenyNumTrees);
                assert(k == 1);
                break;
            case 'E':
                if (!strcmp(optarg, "outgroupBranch")) {
                    phylogenyRootingMethod = OUTGROUP_BRANCH;
                } else if (!strcmp(optarg, "longestBranch")) {
                    phylogenyRootingMethod = LONGEST_BRANCH;
                } else if (!strcmp(optarg, "bestRecon")) {
                    phylogenyRootingMethod = BEST_RECON;
                } else {
                    st_errAbort("Invalid tree rooting method: %s", optarg);
                }
                break;
            case 'F':
                if (!strcmp(optarg, "reconCost")) {
                    phylogenyScoringMethod = RECON_COST;
                } else if (!strcmp(optarg, "nucLikelihood")) {
                    phylogenyScoringMethod = NUCLEOTIDE_LIKELIHOOD;
                } else if (!strcmp(optarg, "reconLikelihood")) {
                    phylogenyScoringMethod = RECON_LIKELIHOOD;
                } else if (!strcmp(optarg, "combinedLikelihood")) {
                    phylogenyScoringMethod = COMBINED_LIKELIHOOD;
                } else {
                    st_errAbort("Invalid tree scoring method: %s", optarg);
                }
                break;
            case 'G':
                k = sscanf(optarg, "%lf", &breakpointScalingFactor);
                assert(k == 1);
                break;
            case 'H':
                phylogenySkipSingleCopyBlocks = true;
                break;
            case 'I':
                k = sscanf(optarg, "%" PRIi64, &phylogenyMaxBaseDistance);
                assert(k == 1);
                break;
            case 'J':
                k = sscanf(optarg, "%" PRIi64, &phylogenyMaxBlockDistance);
                assert(k == 1);
                break;
            case 'K':
                debugFileName = stString_copy(optarg);
                break;
            case 'L':
                phylogenyKeepSingleDegreeBlocks = true;
                break;
            case 'M':
                // clear the default setting of the list
                stList_destruct(phylogenyTreeBuildingMethods);
                phylogenyTreeBuildingMethods = stList_construct();
                stList *methodStrings = stString_splitByString(optarg, ",");

                for (int64_t i = 0; i < stList_length(methodStrings); i++) {
                    char *methodString = stList_get(methodStrings, i);
                    enum stCaf_TreeBuildingMethod *method = st_malloc(sizeof(enum stCaf_TreeBuildingMethod));
                    if (strcmp(methodString, "neighborJoining") == 0) {
                        *method = NEIGHBOR_JOINING;
                    } else if (strcmp(methodString, "guidedNeighborJoining") == 0) {
                        *method = GUIDED_NEIGHBOR_JOINING;
                    } else if (strcmp(methodString, "splitDecomposition") == 0) {
                        *method = SPLIT_DECOMPOSITION;
                    } else if (strcmp(methodString, "strictSplitDecomposition") == 0) {
                        *method = STRICT_SPLIT_DECOMPOSITION;
                    } else if (strcmp(methodString, "removeBadChains") == 0) {
                        *method = REMOVE_BAD_CHAINS;
                    } else {
                        st_errAbort("Unknown tree building method: %s", methodString);
                    }
                    stList_append(phylogenyTreeBuildingMethods, method);
                }
                stList_destruct(methodStrings);
                break;
            case 'N':
                k = sscanf(optarg, "%lf", &phylogenyCostPerDupPerBase);
                assert(k == 1);
                break;
            case 'O':
                k = sscanf(optarg, "%lf", &phylogenyCostPerLossPerBase);
                assert(k == 1);
                break;
            case 'P':
                referenceEventHeader = stString_copy(optarg);
                break;
            case 'Q':
                k = sscanf(optarg, "%lf", &phylogenyDoSplitsWithSupportHigherThanThisAllAtOnce);
                assert(k == 1);
                break;
            case 'R':
                k = sscanf(optarg, "%" PRIi64, &numTreeBuildingThreads);
                assert(k == 1);
                break;
            case 'S':
                doPhylogeny = true;
                break;
            case 'T':
                k = sscanf(optarg, "%lf", &minimumBlockHomologySupport);
                assert(k == 1);
                assert(minimumBlockHomologySupport <= 1.0);
                assert(minimumBlockHomologySupport >= 0.0);
                break;
            case 'U':
                k = sscanf(optarg, "%lf", &nucleotideScalingFactor);
                assert(k == 1);
                break;
            case 'V':
                k = sscanf(optarg, "%" PRIi64, &minimumBlockDegreeToCheckSupport);
                assert(k == 1);
                break;
            case 'W':
                if (strcmp(optarg, "1") == 0) {
                    removeRecoverableChains = true;
                    recoverableChainsFilter = NULL;
                } else if (strcmp(optarg, "unequalNumberOfIngroupCopies") == 0) {
                    removeRecoverableChains = true;
                    recoverableChainsFilter = stCaf_chainHasUnequalNumberOfIngroupCopies;
                } else if (strcmp(optarg, "unequalNumberOfIngroupCopiesOrNoOutgroup") == 0) {
                    removeRecoverableChains = true;
                    recoverableChainsFilter = stCaf_chainHasUnequalNumberOfIngroupCopiesOrNoOutgroup;
                } else if (strcmp(optarg, "0") == 0) {
                    removeRecoverableChains = false;
                } else {
                    st_errAbort("Could not parse removeRecoverableChains argument");
                }
                break;
            case 'X':
                k = sscanf(optarg, "%" PRIi64, &minimumNumberOfSpecies);
                if (k != 1) {
                    st_errAbort("Error parsing the minimumNumberOfSpecies argument");
                }
                break;
            case 'Y':
                if (strcmp(optarg, "chain") == 0) {
                    phylogenyHomologyUnitType = CHAIN;
                } else if (strcmp(optarg, "block") == 0) {
                    phylogenyHomologyUnitType = BLOCK;
                } else {
                    st_errAbort("Could not parse the phylogenyHomologyUnitType argument");
                }
                break;
            case 'Z':
                if (strcmp(optarg, "jukesCantor") == 0) {
                    phylogenyDistanceCorrectionMethod = JUKES_CANTOR;
                } else if (strcmp(optarg, "none") == 0 ) {
                    phylogenyDistanceCorrectionMethod = NONE;
                } else {
                    st_errAbort("Could not parse the phylogenyDistanceCorrectionMethod argument");
                }
                break;
            case '1':
                k = sscanf(optarg, "%" PRIi64, &maxRecoverableChainsIterations);
                if (k != 1) {
                    st_errAbort("Error parsing the maxRecoverableChainsIterations argument");
                }
                break;
            case '2':
                k = sscanf(optarg, "%" PRIi64, &maxRecoverableChainLength);
                if (k != 1) {
                    st_errAbort("Error parsing the maxRecoverableChainLength argument");
                }
                break;
            default:
                usage();
                return 1;
        }
    }

    ///////////////////////////////////////////////////////////////////////////
    // (0) Check the inputs.
    ///////////////////////////////////////////////////////////////////////////

    assert(cactusDiskDatabaseString != NULL);
    assert(minimumTreeCoverage >= 0.0);
    assert(minimumTreeCoverage <= 1.0);
    assert(blockTrim >= 0);
    assert(annealingRoundsLength >= 0);
    for (int64_t i = 0; i < annealingRoundsLength; i++) {
        assert(annealingRounds[i] >= 0);
    }
    assert(meltingRoundsLength >= 0);
    for (int64_t i = 1; i < meltingRoundsLength; i++) {
        assert(meltingRounds[i - 1] < meltingRounds[i]);
        assert(meltingRounds[i - 1] >= 1);
    }
    assert(alignmentTrimLength >= 0);
    for (int64_t i = 0; i < alignmentTrimLength; i++) {
        assert(alignmentTrims[i] >= 0);
    }
    assert(minimumOutgroupDegree >= 0);
    assert(minimumIngroupDegree >= 0);

    //////////////////////////////////////////////
    //Set up logging
    //////////////////////////////////////////////

    st_setLogLevelFromString(logLevelString);

    //////////////////////////////////////////////
    //Log (some of) the inputs
    //////////////////////////////////////////////

    st_logInfo("Flower disk name : %s\n", cactusDiskDatabaseString);

    //////////////////////////////////////////////
    //Load the database
    //////////////////////////////////////////////

    kvDatabaseConf = stKVDatabaseConf_constructFromString(cactusDiskDatabaseString);
    cactusDisk = cactusDisk_construct(kvDatabaseConf, 0);
    st_logInfo("Set up the flower disk\n");

    ///////////////////////////////////////////////////////////////////////////
    // Sort the constraints
    ///////////////////////////////////////////////////////////////////////////

    stPinchIterator *pinchIteratorForConstraints = NULL;
    if (constraintsFile != NULL) {
        pinchIteratorForConstraints = stPinchIterator_constructFromFile(constraintsFile);
        st_logInfo("Created an iterator for the alignment constaints from file: %s\n", constraintsFile);
    }

    ///////////////////////////////////////////////////////////////////////////
    // Do the alignment
    ///////////////////////////////////////////////////////////////////////////

    startTime = time(NULL);

    stList *flowers = flowerWriter_parseFlowersFromStdin(cactusDisk);
    if (alignmentsFile == NULL) {
        cactusDisk_preCacheStrings(cactusDisk, flowers);
    }
    char *tempFile1 = NULL;
    for (int64_t i = 0; i < stList_length(flowers); i++) {
        flower = stList_get(flowers, i);
        if (!flower_builtBlocks(flower)) { // Do nothing if the flower already has defined blocks
            st_logDebug("Processing flower: %lli\n", flower_getName(flower));

            stCaf_setFlowerForAlignmentFiltering(flower);

            //Set up the graph and add the initial alignments
            stPinchThreadSet *threadSet = stCaf_setup(flower);

            //Build the set of outgroup threads
            outgroupThreads = stCaf_getOutgroupThreads(flower, threadSet);

            //Setup the alignments
            stPinchIterator *pinchIterator;
            stList *alignmentsList = NULL;
            if (alignmentsFile != NULL) {
                assert(i == 0);
                assert(stList_length(flowers) == 1);
                if (sortAlignments) {
                    tempFile1 = getTempFile();
                    stCaf_sortCigarsFileByScoreInDescendingOrder(alignmentsFile, tempFile1);
                    pinchIterator = stPinchIterator_constructFromFile(tempFile1);
                } else {
                    pinchIterator = stPinchIterator_constructFromFile(alignmentsFile);
                }
            } else {
                if (tempFile1 == NULL) {
                    tempFile1 = getTempFile();
                }
                alignmentsList = stCaf_selfAlignFlower(flower, minimumSequenceLengthForBlast, lastzArguments, realign, realignArguments, tempFile1);
                if (sortAlignments) {
                    stCaf_sortCigarsByScoreInDescendingOrder(alignmentsList);
                }
                st_logDebug("Ran lastz and have %" PRIi64 " alignments\n", stList_length(alignmentsList));
                pinchIterator = stPinchIterator_constructFromList(alignmentsList);
            }

            for (int64_t annealingRound = 0; annealingRound < annealingRoundsLength; annealingRound++) {
                int64_t minimumChainLength = annealingRounds[annealingRound];
                int64_t alignmentTrim = annealingRound < alignmentTrimLength ? alignmentTrims[annealingRound] : 0;
                st_logDebug("Starting annealing round with a minimum chain length of %" PRIi64 " and an alignment trim of %" PRIi64 "\n", minimumChainLength, alignmentTrim);
                stPinchIterator_setTrim(pinchIterator, alignmentTrim);

                //Add back in the constraints
                if (pinchIteratorForConstraints != NULL) {
                    stCaf_anneal(threadSet, pinchIteratorForConstraints, filterFn);
                }

                //Do the annealing
                if (annealingRound == 0) {
                    stCaf_anneal(threadSet, pinchIterator, filterFn);
                } else {
                    stCaf_annealBetweenAdjacencyComponents(threadSet, pinchIterator, filterFn);
                }

                // Dump the block degree and length distribution to a file
                if (debugFileName != NULL) {
                    dumpBlockInfo(threadSet, stString_print("%s-blockStats-preMelting", debugFileName));
                }

                printf("Sequence graph statistics after annealing:\n");
                printThreadSetStatistics(threadSet, flower, stdout);

                // Check for poorly-supported blocks--those that have
                // been transitively aligned together but with very
                // few homologies supporting the transitive
                // alignment. These "megablocks" can snarl up the
                // graph so that a lot of extra gets thrown away in
                // the first melting step.
                stPinchThreadSetBlockIt blockIt = stPinchThreadSet_getBlockIt(threadSet);
                stPinchBlock *block;
                while ((block = stPinchThreadSetBlockIt_getNext(&blockIt)) != NULL) {
                    if (stPinchBlock_getDegree(block) > minimumBlockDegreeToCheckSupport) {
                        uint64_t supportingHomologies = stPinchBlock_getNumSupportingHomologies(block);
                        uint64_t possibleSupportingHomologies = numPossibleSupportingHomologies(block, flower);
                        double support = ((double) supportingHomologies) / possibleSupportingHomologies;
                        if (support < minimumBlockHomologySupport) {
                            fprintf(stdout, "Destroyed a megablock with degree %" PRIi64
                                    " and %" PRIi64 " supporting homologies out of a maximum "
                                    "of %" PRIi64 " (%lf%%).\n", stPinchBlock_getDegree(block),
                                    supportingHomologies, possibleSupportingHomologies, support);
                            stPinchBlock_destruct(block);
                        }
                    }
                }

                //Do the melting rounds
                for (int64_t meltingRound = 0; meltingRound < meltingRoundsLength; meltingRound++) {
                    int64_t minimumChainLengthForMeltingRound = meltingRounds[meltingRound];
                    st_logDebug("Starting melting round with a minimum chain length of %" PRIi64 " \n", minimumChainLengthForMeltingRound);
                    if (minimumChainLengthForMeltingRound >= minimumChainLength) {
                        break;
                    }
                    stCaf_melt(flower, threadSet, NULL, 0, minimumChainLengthForMeltingRound, 0, INT64_MAX);
                } st_logDebug("Last melting round of cycle with a minimum chain length of %" PRIi64 " \n", minimumChainLength);
                stCaf_melt(flower, threadSet, NULL, 0, minimumChainLength, breakChainsAtReverseTandems, maximumMedianSequenceLengthBetweenLinkedEnds);
                //This does the filtering of blocks that do not have the required species/tree-coverage/degree.
                stCaf_melt(flower, threadSet, blockFilterFn, blockTrim, 0, 0, INT64_MAX);
            }

            if (removeRecoverableChains) {
                stCaf_meltRecoverableChains(flower, threadSet, breakChainsAtReverseTandems, maximumMedianSequenceLengthBetweenLinkedEnds, recoverableChainsFilter, maxRecoverableChainsIterations, maxRecoverableChainLength);
            }
            if (debugFileName != NULL) {
                dumpBlockInfo(threadSet, stString_print("%s-blockStats-postMelting", debugFileName));
            }

            printf("Sequence graph statistics after melting:\n");
            printThreadSetStatistics(threadSet, flower, stdout);

            // Build a tree for each block, then use each tree to
            // partition the homologies between the ingroups sequences
            // into those that occur before the speciation with the
            // outgroup and those which occur late.

            if (stSet_size(outgroupThreads) > 0 && doPhylogeny) {
                st_logDebug("Starting to build trees and partition ingroup homologies\n");
                stHash *threadStrings = stCaf_getThreadStrings(flower, threadSet);
                st_logDebug("Got sets of thread strings and set of threads that are outgroups\n");
                stCaf_PhylogenyParameters params;
                params.distanceCorrectionMethod = phylogenyDistanceCorrectionMethod;
                params.treeBuildingMethods = phylogenyTreeBuildingMethods;
                params.rootingMethod = phylogenyRootingMethod;
                params.scoringMethod = phylogenyScoringMethod;
                params.breakpointScalingFactor = breakpointScalingFactor;
                params.nucleotideScalingFactor = nucleotideScalingFactor;
                params.skipSingleCopyBlocks = phylogenySkipSingleCopyBlocks;
                params.keepSingleDegreeBlocks = phylogenyKeepSingleDegreeBlocks;
                params.costPerDupPerBase = phylogenyCostPerDupPerBase;
                params.costPerLossPerBase = phylogenyCostPerLossPerBase;
                params.maxBaseDistance = phylogenyMaxBaseDistance;
                params.maxBlockDistance = phylogenyMaxBlockDistance;
                params.numTrees = phylogenyNumTrees;
                params.ignoreUnalignedBases = 1;
                params.onlyIncludeCompleteFeatureBlocks = 0;
                params.doSplitsWithSupportHigherThanThisAllAtOnce = phylogenyDoSplitsWithSupportHigherThanThisAllAtOnce;
                params.numTreeBuildingThreads = numTreeBuildingThreads;

                assert(params.numTreeBuildingThreads >= 1);

                stCaf_buildTreesToRemoveAncientHomologies(
                    threadSet, phylogenyHomologyUnitType, threadStrings, outgroupThreads, flower, &params,
                    debugFileName == NULL ? NULL : stString_print("%s-phylogeny", debugFileName), referenceEventHeader);
                stHash_destruct(threadStrings);
                st_logDebug("Finished building trees\n");

                if (removeRecoverableChains) {
                    // We melt recoverable chains after splitting, as
                    // well as before, to alleviate coverage loss
                    // caused by bad splits.
                    stCaf_meltRecoverableChains(flower, threadSet, breakChainsAtReverseTandems, maximumMedianSequenceLengthBetweenLinkedEnds, recoverableChainsFilter, maxRecoverableChainsIterations, maxRecoverableChainLength);
                }

                // Enforce the block constraints on minimum degree,
                // etc. after splitting.
                stCaf_melt(flower, threadSet, blockFilterFn, 0, 0, 0, INT64_MAX);
            }

            //Sort out case when we allow blocks of degree 1
            if (minimumDegree < 2) {
                st_logDebug("Creating degree 1 blocks\n");
                stCaf_makeDegreeOneBlocks(threadSet);
                stCaf_melt(flower, threadSet, blockFilterFn, blockTrim, 0, 0, INT64_MAX);
            } else if (maximumAdjacencyComponentSizeRatio < INT64_MAX) { //Deal with giant components
                st_logDebug("Breaking up components greedily\n");
                stCaf_breakupComponentsGreedily(threadSet, maximumAdjacencyComponentSizeRatio);
            }

            //Finish up
            stCaf_finish(flower, threadSet, chainLengthForBigFlower, longChain, minLengthForChromosome,
                    proportionOfUnalignedBasesForNewChromosome); //Flower is then destroyed at this point.
            st_logInfo("Ran the cactus core script\n");

            //Cleanup
            stPinchThreadSet_destruct(threadSet);
            stPinchIterator_destruct(pinchIterator);
            stSet_destruct(outgroupThreads);

            if (alignmentsList != NULL) {
                stList_destruct(alignmentsList);
            }
            st_logInfo("Cleaned up from main loop\n");
        } else {
            st_logInfo("We've already built blocks / alignments for this flower\n");
        }
    }
    stList_destruct(flowers);
    if (tempFile1 != NULL) {
        st_system("rm %s", tempFile1);
    }

    if (constraintsFile != NULL) {
        stPinchIterator_destruct(pinchIteratorForConstraints);
    }

    ///////////////////////////////////////////////////////////////////////////
    // Write the flower to disk.
    ///////////////////////////////////////////////////////////////////////////
    st_logDebug("Writing the flowers to disk\n");
    cactusDisk_write(cactusDisk);
    st_logInfo("Updated the flower on disk and %" PRIi64 " seconds have elapsed\n", time(NULL) - startTime);

    ///////////////////////////////////////////////////////////////////////////
    // Clean up.
    ///////////////////////////////////////////////////////////////////////////

    cactusDisk_destruct(cactusDisk);
}
Пример #4
0
/*
 * Entry point.
 *
 * Parses the command line options, opens the cactus disk named by --cactusDisk and
 * then runs in one of three modes:
 *
 *  - --calculateWhichEndsToComputeSeparately: exactly one flower is read from stdin and,
 *    for each end returned by getEndsToAlignSeparately, a tab separated line with the
 *    end's name, instance number and total adjacency length is printed to stdout.
 *
 *  - --endAlignmentsToPrecomputeOutputFile FILE: a list of names is read from stdin, the
 *    first being the name of a flower and the remainder the names of ends in that flower.
 *    An end alignment is made for each end and written to FILE.
 *
 *  - otherwise: each flower read from stdin is aligned (optionally using the precomputed
 *    end alignment files given by --precomputedAlignments), the aligned pairs are annealed
 *    into a pinch graph, the blocks are optionally filtered, covered regions are optionally
 *    rescued using the ingroup coverage file, and the result is written to the cactus disk.
 *
 * Returns 0 on success (and after printing the usage for --help), 1 if an unrecognised
 * option is given. Fatal errors abort via st_errAbort / st_errnoAbort.
 */
int main(int argc, char *argv[]) {

    char * logLevelString = NULL;
    char * cactusDiskDatabaseString = NULL;
    int64_t i, j;
    int64_t spanningTrees = 10;
    int64_t maximumLength = 1500;
    bool useProgressiveMerging = 0;
    float matchGamma = 0.5;
    bool useBanding = 0;
    int64_t k;
    stList *listOfEndAlignmentFiles = NULL;
    char *endAlignmentsToPrecomputeOutputFile = NULL;
    bool calculateWhichEndsToComputeSeparately = 0;
    int64_t largeEndSize = 1000000;
    int64_t chainLengthForBigFlower = 1000000;
    int64_t longChain = 2;
    char *ingroupCoverageFilePath = NULL;
    int64_t minimumSizeToRescue = 1;
    double minimumCoverageToRescue = 0.0;

    PairwiseAlignmentParameters *pairwiseAlignmentBandingParameters = pairwiseAlignmentBandingParameters_construct();

    /*
     * Setup the input parameters for cactus core.
     */
    bool pruneOutStubAlignments = 0;

    /*
     * Parse the options.
     */
    while (1) {
        static struct option long_options[] = { { "logLevel", required_argument, 0, 'a' }, { "cactusDisk", required_argument, 0, 'b' }, {
                "help", no_argument, 0, 'h' }, { "spanningTrees", required_argument, 0, 'i' },
                { "maximumLength", required_argument, 0, 'j' }, { "useBanding", no_argument, 0, 'k' },
                { "gapGamma", required_argument, 0, 'l' }, { "matchGamma", required_argument, 0, 'L' },
                { "splitMatrixBiggerThanThis", required_argument, 0, 'o' }, { "anchorMatrixBiggerThanThis",
                        required_argument, 0, 'p' }, { "repeatMaskMatrixBiggerThanThis", required_argument, 0, 'q' }, {
                        "diagonalExpansion", required_argument, 0, 'r' }, { "constraintDiagonalTrim", required_argument, 0, 't' }, {
                        "minimumDegree", required_argument, 0, 'u' }, { "alignAmbiguityCharacters", no_argument, 0, 'w' }, {
                        "pruneOutStubAlignments", no_argument, 0, 'y' }, {
                        "minimumIngroupDegree", required_argument, 0, 'A' }, { "minimumOutgroupDegree", required_argument, 0, 'B' },
                { "precomputedAlignments", required_argument, 0, 'D' }, {
                        "endAlignmentsToPrecomputeOutputFile", required_argument, 0, 'E' }, { "useProgressiveMerging",
                        no_argument, 0, 'F' }, { "calculateWhichEndsToComputeSeparately", no_argument, 0, 'G' }, { "largeEndSize",
                        required_argument, 0, 'I' },
                        {"ingroupCoverageFile", required_argument, 0, 'J'},
                        {"minimumSizeToRescue", required_argument, 0, 'K'},
                        {"minimumCoverageToRescue", required_argument, 0, 'M'},
                        { "minimumNumberOfSpecies", required_argument, 0, 'N' },
                        { 0, 0, 0, 0 } };

        int option_index = 0;

        // The short option string must agree with long_options: 'y' (pruneOutStubAlignments)
        // is a flag that takes no argument, so it is not followed by ':'. Otherwise a short
        // "-y" would swallow the next command line token as its (ignored) argument.
        int key = getopt_long(argc, argv, "a:b:hi:j:kl:o:p:q:r:t:u:wyA:B:D:E:FGI:J:K:L:M:N:", long_options, &option_index);

        if (key == -1) {
            break;
        }

        switch (key) {
            case 'a':
                logLevelString = stString_copy(optarg);
                st_setLogLevelFromString(logLevelString);
                break;
            case 'b':
                cactusDiskDatabaseString = stString_copy(optarg);
                break;
            case 'h':
                usage();
                return 0;
            case 'i':
                i = sscanf(optarg, "%" PRIi64 "", &spanningTrees);
                (void) i;
                assert(i == 1);
                assert(spanningTrees >= 0);
                break;
            case 'j':
                i = sscanf(optarg, "%" PRIi64 "", &maximumLength);
                assert(i == 1);
                assert(maximumLength >= 0);
                break;
            case 'k':
                // Toggles rather than sets, so giving the flag twice restores the default.
                useBanding = !useBanding;
                break;
            case 'l':
                i = sscanf(optarg, "%f", &pairwiseAlignmentBandingParameters->gapGamma);
                assert(i == 1);
                assert(pairwiseAlignmentBandingParameters->gapGamma >= 0.0);
                break;
            case 'L':
                i = sscanf(optarg, "%f", &matchGamma);
                assert(i == 1);
                assert(matchGamma >= 0.0);
                break;
            case 'o':
                // The matrix size thresholds are given as a side length and stored squared.
                i = sscanf(optarg, "%" PRIi64 "", &k);
                assert(i == 1);
                assert(k >= 0);
                pairwiseAlignmentBandingParameters->splitMatrixBiggerThanThis = (int64_t) k * k;
                break;
            case 'p':
                i = sscanf(optarg, "%" PRIi64 "", &k);
                assert(i == 1);
                assert(k >= 0);
                pairwiseAlignmentBandingParameters->anchorMatrixBiggerThanThis = (int64_t) k * k;
                break;
            case 'q':
                i = sscanf(optarg, "%" PRIi64 "", &k);
                assert(i == 1);
                assert(k >= 0);
                pairwiseAlignmentBandingParameters->repeatMaskMatrixBiggerThanThis = (int64_t) k * k;
                break;
            case 'r':
                i = sscanf(optarg, "%" PRIi64 "", &pairwiseAlignmentBandingParameters->diagonalExpansion);
                assert(i == 1);
                assert(pairwiseAlignmentBandingParameters->diagonalExpansion >= 0);
                assert(pairwiseAlignmentBandingParameters->diagonalExpansion % 2 == 0);
                break;
            case 't':
                i = sscanf(optarg, "%" PRIi64 "", &pairwiseAlignmentBandingParameters->constraintDiagonalTrim);
                assert(i == 1);
                assert(pairwiseAlignmentBandingParameters->constraintDiagonalTrim >= 0);
                break;
            case 'u':
                i = sscanf(optarg, "%" PRIi64 "", &minimumDegree);
                assert(i == 1);
                break;
            case 'w':
                pairwiseAlignmentBandingParameters->alignAmbiguityCharacters = 1;
                break;
            case 'y':
                pruneOutStubAlignments = 1;
                break;
            case 'A':
                i = sscanf(optarg, "%" PRIi64 "", &minimumIngroupDegree);
                assert(i == 1);
                break;
            case 'B':
                i = sscanf(optarg, "%" PRIi64 "", &minimumOutgroupDegree);
                assert(i == 1);
                break;
            case 'D':
                listOfEndAlignmentFiles = stString_split(optarg);
                break;
            case 'E':
                endAlignmentsToPrecomputeOutputFile = stString_copy(optarg);
                break;
            case 'F':
                useProgressiveMerging = 1;
                break;
            case 'G':
                calculateWhichEndsToComputeSeparately = 1;
                break;
            case 'I':
                i = sscanf(optarg, "%" PRIi64 "", &largeEndSize);
                assert(i == 1);
                break;
            case 'J':
                ingroupCoverageFilePath = stString_copy(optarg);
                break;
            case 'K':
                i = sscanf(optarg, "%" PRIi64, &minimumSizeToRescue);
                assert(i == 1);
                break;
            case 'M':
                i = sscanf(optarg, "%lf", &minimumCoverageToRescue);
                assert(i == 1);
                break;
            case 'N':
                i = sscanf(optarg, "%" PRIi64, &minimumNumberOfSpecies);
                if (i != 1) {
                    st_errAbort("Error parsing minimumNumberOfSpecies parameter");
                }
                break;
            default:
                usage();
                return 1;
        }
    }

    st_setLogLevelFromString(logLevelString);

    /*
     * Load the flowerdisk
     */
    stKVDatabaseConf *kvDatabaseConf = stKVDatabaseConf_constructFromString(cactusDiskDatabaseString);
    CactusDisk *cactusDisk = cactusDisk_construct(kvDatabaseConf, 0); //We precache the sequences
    st_logInfo("Set up the flower disk\n");

    /*
     * Load the hmm
     */
    StateMachine *sM = stateMachine5_construct(fiveState);

    /*
     * For each flower.
     */
    if (calculateWhichEndsToComputeSeparately) {
        /*
         * In this case we just report which ends of the (single) flower should be aligned separately.
         */
        stList *flowers = flowerWriter_parseFlowersFromStdin(cactusDisk);
        if (stList_length(flowers) != 1) {
            st_errAbort("We are breaking up a flower's end alignments for precomputation but we have %" PRIi64 " flowers.\n", stList_length(flowers));
        }
        stSortedSet *endsToAlignSeparately = getEndsToAlignSeparately(stList_get(flowers, 0), maximumLength, largeEndSize);
        assert(stSortedSet_size(endsToAlignSeparately) != 1);
        stSortedSetIterator *it = stSortedSet_getIterator(endsToAlignSeparately);
        End *end;
        while ((end = stSortedSet_getNext(it)) != NULL) {
            fprintf(stdout, "%s\t%" PRIi64 "\t%" PRIi64 "\n", cactusMisc_nameToStringStatic(end_getName(end)), end_getInstanceNumber(end), getTotalAdjacencyLength(end));
        }
        return 0; //avoid cleanup costs
        stSortedSet_destructIterator(it);
        stSortedSet_destruct(endsToAlignSeparately);
    } else if (endAlignmentsToPrecomputeOutputFile != NULL) {
        /*
         * In this case we will align a set of end and save the alignments in a file.
         */
        stList *names = flowerWriter_parseNames(stdin);
        // The first name identifies the flower, so at least one name is required.
        if (stList_length(names) == 0) {
            st_errAbort("Expected a flower name followed by end names on stdin, but got no names\n");
        }
        Flower *flower = cactusDisk_getFlower(cactusDisk, *((Name *)stList_get(names, 0)));
        FILE *fileHandle = fopen(endAlignmentsToPrecomputeOutputFile, "w");
        // Fail with a clear message rather than passing a NULL handle on to the writer.
        if (fileHandle == NULL) {
            st_errnoAbort("Opening end alignments output file %s failed",
                          endAlignmentsToPrecomputeOutputFile);
        }
        // The remaining names (index 1 onwards) are the ends to align.
        for(int64_t nameIndex=1; nameIndex<stList_length(names); nameIndex++) {
            End *end = flower_getEnd(flower, *((Name *)stList_get(names, nameIndex)));
            if (end == NULL) {
                st_errAbort("The end %" PRIi64 " was not found in the flower\n", *((Name *)stList_get(names, nameIndex)));
            }
            stSortedSet *endAlignment = makeEndAlignment(sM, end, spanningTrees, maximumLength, useProgressiveMerging,
                            matchGamma, pairwiseAlignmentBandingParameters);
            writeEndAlignmentToDisk(end, endAlignment, fileHandle);
            stSortedSet_destruct(endAlignment);
        }
        fclose(fileHandle);
        return 0; //avoid cleanup costs
        stList_destruct(names);
        st_logInfo("Finished precomputing end alignments\n");
    } else {
        /*
         * Compute complete flower alignments, possibly loading some precomputed alignments.
         */
        bedRegion *bedRegions = NULL;
        size_t numBeds = 0;
        if (ingroupCoverageFilePath != NULL) {
            // Pre-load the mmap for the coverage file.
            FILE *coverageFile = fopen(ingroupCoverageFilePath, "rb");
            if (coverageFile == NULL) {
                st_errnoAbort("Opening coverage file %s failed",
                              ingroupCoverageFilePath);
            }
            // Seek to the end to find the file's length, which must be a whole number of bedRegion records.
            fseek(coverageFile, 0, SEEK_END);
            int64_t coverageFileLen = ftell(coverageFile);
            assert(coverageFileLen >= 0);
            assert(coverageFileLen % sizeof(bedRegion) == 0);
            if (coverageFileLen == 0) {
                // mmap doesn't like length-0 mappings, for obvious
                // reasons. Pretend that the coverage file doesn't
                // exist in this case, since it contains no data.
                ingroupCoverageFilePath = NULL;
            } else {
                // Establish a memory mapping for the file.
                bedRegions = mmap(NULL, coverageFileLen, PROT_READ, MAP_SHARED,
                                  fileno(coverageFile), 0);
                if (bedRegions == MAP_FAILED) {
                    st_errnoAbort("Failure mapping coverage file");
                }

                numBeds = coverageFileLen / sizeof(bedRegion);
            }
            fclose(coverageFile);
        }

        stList *flowers = flowerWriter_parseFlowersFromStdin(cactusDisk);
        if (listOfEndAlignmentFiles != NULL && stList_length(flowers) != 1) {
            st_errAbort("We have precomputed alignments but %" PRIi64 " flowers to align.\n", stList_length(flowers));
        }
        cactusDisk_preCacheStrings(cactusDisk, flowers);
        for (j = 0; j < stList_length(flowers); j++) {
            // Note: "flower" is not a local of this function; it is declared outside it.
            flower = stList_get(flowers, j);
            st_logInfo("Processing a flower\n");

            stSortedSet *alignedPairs = makeFlowerAlignment3(sM, flower, listOfEndAlignmentFiles, spanningTrees, maximumLength,
                    useProgressiveMerging, matchGamma, pairwiseAlignmentBandingParameters, pruneOutStubAlignments);
            st_logInfo("Created the alignment: %" PRIi64 " pairs\n", stSortedSet_size(alignedPairs));
            stPinchIterator *pinchIterator = stPinchIterator_constructFromAlignedPairs(alignedPairs, getNextAlignedPairAlignment);

            /*
             * Run the cactus caf functions to build cactus.
             */
            stPinchThreadSet *threadSet = stCaf_setup(flower);
            stCaf_anneal(threadSet, pinchIterator, NULL);
            if (minimumDegree < 2) {
                stCaf_makeDegreeOneBlocks(threadSet);
            }
            // Only run the block filter when at least one of the degree constraints is active.
            if (minimumIngroupDegree > 0 || minimumOutgroupDegree > 0 || minimumDegree > 1) {
                stCaf_melt(flower, threadSet, blockFilterFn, 0, 0, 0, INT64_MAX);
            }

            if (ingroupCoverageFilePath != NULL) {
                // Rescue any sequence that is covered by outgroups
                // but currently unaligned into single-degree blocks.
                stPinchThreadSetIt pinchIt = stPinchThreadSet_getIt(threadSet);
                stPinchThread *thread;
                while ((thread = stPinchThreadSetIt_getNext(&pinchIt)) != NULL) {
                    Cap *cap = flower_getCap(flower,
                                             stPinchThread_getName(thread));
                    assert(cap != NULL);
                    Sequence *sequence = cap_getSequence(cap);
                    assert(sequence != NULL);
                    rescueCoveredRegions(thread, bedRegions, numBeds,
                                         sequence_getName(sequence),
                                         minimumSizeToRescue,
                                         minimumCoverageToRescue);
                }
                stCaf_joinTrivialBoundaries(threadSet);
            }

            stCaf_finish(flower, threadSet, chainLengthForBigFlower, longChain, INT64_MAX, INT64_MAX); //Flower now destroyed.
            stPinchThreadSet_destruct(threadSet);
            st_logInfo("Ran the cactus core script.\n");

            /*
             * Cleanup
             */
            //Clean up the sorted set after cleaning up the iterator
            stPinchIterator_destruct(pinchIterator);
            stSortedSet_destruct(alignedPairs);

            st_logInfo("Finished filling in the alignments for the flower\n");
        }
        stList_destruct(flowers);
        //st_errAbort("Done\n");
        /*
         * Write and close the cactusdisk.
         */
        cactusDisk_write(cactusDisk);
        return 0; //Exit without clean up is quicker, enable cleanup when doing memory leak detection.
        if (bedRegions != NULL) {
            // Clean up our mapping.
            munmap(bedRegions, numBeds * sizeof(bedRegion));
        }
    }


    ///////////////////////////////////////////////////////////////////////////
    // Cleanup
    ///////////////////////////////////////////////////////////////////////////

    // Every branch above currently returns before reaching this point (see the early
    // "return 0" statements); remove those returns to run this cleanup.
    stateMachine_destruct(sM);
    cactusDisk_destruct(cactusDisk);
    stKVDatabaseConf_destruct(kvDatabaseConf);
    //destructCactusCoreInputParameters(cCIP);
    free(cactusDiskDatabaseString);
    if (listOfEndAlignmentFiles != NULL) {
        stList_destruct(listOfEndAlignmentFiles);
    }
    if (logLevelString != NULL) {
        free(logLevelString);
    }
    st_logInfo("Finished with the flower disk for this flower.\n");

    //while(1);

    return 0;
}
Пример #5
0
/**
 * cactusMerge.cpp: merge two pairs of c2h and FASTA files into one pair. The
 * files must be star trees with the same root sequence.
 */
int 
main(
    int argc, 
    char** argv
) {

    // Register ctrl+c handler. See
    // <http://www.yolinux.com/TUTORIALS/C++Signals.html>
    signal(SIGINT, stacktraceOnSignal);
    
    // Register segfaults with the stack trace handler
    signal(SIGSEGV, stacktraceOnSignal);
    
    // Parse options with boost::programOptions. See
    // <http://www.radmangames.com/programming/how-to-use-boost-program_options>

    std::string appDescription = 
        std::string("Merge c2h/FASTA file pairs.\n" 
        "Usage: cactusMerge <c2hOut> <fastaOut> --c2h <c2h files...> "
            "--fasta <fasta files...> --suffix <suffixes...>");

    // Make an options description for our program's options.
    boost::program_options::options_description description("Options");
    // Add all the options
    description.add_options() 
        ("help", "Print help messages")
        ("c2h", boost::program_options::value<std::vector<std::string>>(),
            "List of c2h files to merge")
        ("fasta", boost::program_options::value<std::vector<std::string>>(),
            "List of FASTA files for the given c2h files")
        ("suffix", boost::program_options::value<std::vector<std::string>>(),
            "List of suffixes to add on to event names")
        ("mergeOn", boost::program_options::value<std::string>()->required(), 
            "An event on which to merge the files")
        ("c2hOut", boost::program_options::value<std::string>()->required(), 
            "File to save .c2h-format alignment in")
        ("fastaOut", boost::program_options::value<std::string>()->required(), 
            "File in which to save FASTA records for building HAL from .c2h");
        
        
        
    // And set up our positional arguments
    boost::program_options::positional_options_description positionals;
    positionals.add("mergeOn", 1);
    positionals.add("c2hOut", 1);
    positionals.add("fastaOut", 1);
    
    // Add a variables map to hold option variables.
    boost::program_options::variables_map options;
    
    try {
        // Parse options into the variable map, or throw an error if there's
        // something wrong with them.
        boost::program_options::store(
            // Build the command line parser.
            boost::program_options::command_line_parser(argc, argv)
                .options(description)
                .positional(positionals)
                .run(),
            options);
        boost::program_options::notify(options);
            
        if(options.count("help")) {
            // The help option was given. Print program help.
            std::cout << appDescription << std::endl;
            std::cout << description << std::endl;
            
            // Don't do the actual program.
            return 0; 
        }
        
        if(!options.count("mergeOn") || !options.count("c2h") ||
            !options.count("fasta")) {
            
            // We need both of these
            throw boost::program_options::error("Missing important arguments!");
        }
        
        if(options["c2h"].as<std::vector<std::string>>().size() != 
            options["fasta"].as<std::vector<std::string>>().size()) {
            
            // Counts need to match up here, because these are pairs
            throw boost::program_options::error(
                "c2h/fasta counts don't match!");
        }
        
        if(options.count("suffix") && 
            options["c2h"].as<std::vector<std::string>>().size() != 
            options["suffix"].as<std::vector<std::string>>().size()) {
        
            // If we have any suffixes we must have the right number
            throw boost::program_options::error(
                "c2h/suffix counts don't match!");
        }
            
    } catch(boost::program_options::error& error) {
        // Something is bad about our options. Complain on stderr
        std::cerr << "Option parsing error: " << error.what() << std::endl;
        std::cerr << std::endl; 
        // Talk about our app.
        std::cerr << appDescription << std::endl;
        // Show all the actually available options.
        std::cerr << description << std::endl; 
        
        // Stop the program.
        return -1; 
    }
    
    // If we get here, we have the right arguments.
    
    // Make a list of the c2h files to use
    std::vector<std::string> c2hFiles(
        options["c2h"].as<std::vector<std::string>>());
    
    // This holds the suffix applied to all the top sequences and events in each
    // file.
    std::vector<std::string> suffixes(
        options["suffix"].as<std::vector<std::string>>());
        
    // Make a list of the FASTA files to use
    std::vector<std::string> fastaFiles(
        options["fasta"].as<std::vector<std::string>>());
    
    // This will hold all of the renames that have to happen for each file.
    // These are generated when we go through the file by renaming top and
    // bottom sequences with suffixes.
    std::vector<std::map<std::string, std::string>> renames;
    
    for(size_t i = 0; i < c2hFiles.size(); i++) {
        // Make sure it has an empty map of renames for each file.
        renames.push_back(std::map<std::string, std::string>());
    }
    
    // This will hold the event names for the c2h files in order
    std::vector<std::string> eventNames;
    
    // And this will hold the sequence names
    std::vector<std::string> sequenceNames;
    
    // This will hold bottom (1) and top (0) flags for each sequence.
    std::vector<bool> isBottom;
    
    // And this will hold the sequence lengths
    std::vector<size_t> sequenceLengths;
    
    // This will hold the first sequence number for any event and sequence name
    std::map<std::pair<std::string, std::string>, size_t> firstSequenceNumber;
    
    // Holds Merge structs to be executed later.
    std::vector<C2hMerge> merges;
    
    // We're going to throw out all of the events that are old rootSeqs, and
    // just keep the actual leaves. This holds the list of renamed event names
    // we are keeping.
    std::set<std::string> eventsToKeep;
    
    for(size_t fileIndex = 0; fileIndex < c2hFiles.size(); fileIndex++) {
        // Scan through the c2h files to get the event, sequence, and length of
        // each thread, and to collect merges.
        
        Log::output() << "Reading alignment " << c2hFiles[fileIndex] << 
            std::endl;
        
        // Open the file
        std::ifstream c2h(c2hFiles[fileIndex]);
        
        // This maps block name to (sequence number, start location) pairs for
        // this file. We use it to compose merges for our list in global
        // sequence number space.
        std::map<size_t, std::pair<size_t, size_t>> nameMap;
        
        for(std::string line; std::getline(c2h, line);) {
            // This is a new sequence. Split it up on \t.
            std::vector<std::string> parts;
            boost::split(parts, line, boost::is_any_of("\t"));
        
            if(parts.size() < 1) {
                // Skip lines that have nothing on them.
                continue;
            }
        
            // For each line
            if(parts[0] == "s") {
                
                // It's a sequence line. Start a new sequence.
                
                if(parts.size() != 4) {
                    // Not the right number of fields.
                    throw std::runtime_error(
                        std::string("Invalid field count in ") + line);
                }
                
                // Grab the parts
                std::string eventName = unquote(parts[1]);
                std::string sequenceName = unquote(parts[2]);
                bool bottomFlag = std::stoi(parts[3]);
                
                Log::info() << "Read sequence " << eventName << "." << 
                    sequenceName << (bottomFlag ? " (bottom)" : " (top)") << 
                    std::endl;
                
                if(eventName != options["mergeOn"].as<std::string>()) {
                    // We aren't merging on this sequence, so we may have to
                    // apply a suffix.
                
                    // We need to rename this event (possibly to the same thing)
                    renames[fileIndex][eventName] = eventName + 
                        suffixes[fileIndex];
                        
                    if(bottomFlag) {
                        // All the bottom events (that aren't being merged
                        // on) need to be renamed apart manually since the names
                        // may be reused.
                        renames[fileIndex][eventName] += "-" + 
                            std::to_string(fileIndex);
                    }
                    eventName = renames[fileIndex][eventName];
                    
                    if(!bottomFlag) {
                        // Keep this event when we do our final output.
                        eventsToKeep.insert(eventName);
                    }
                    
                    // And the sequence
                    renames[fileIndex][sequenceName] = sequenceName + 
                        suffixes[fileIndex];
                        
                    if(bottomFlag) {
                        // All the bottom sequences (that aren't being merged
                        // on) need to be renamed apart manually since the names
                        // may be reused.
                        renames[fileIndex][sequenceName] += "-" + 
                            std::to_string(fileIndex);
                    }
                    sequenceName = renames[fileIndex][sequenceName];
                    
                    Log::info() << "Canonical name: " << eventName << "." << 
                        sequenceName << std::endl;
                    
                } else {
                    // If we are going to merge on it, we keep its name the same
                    // and then later we just make one thread for that name. We
                    // do definitely need it in the output though.
                    eventsToKeep.insert(eventName);
                }
                
                
                // Save the names
                eventNames.push_back(eventName);
                sequenceNames.push_back(sequenceName);
                
                // Save the bottomness flag.
                isBottom.push_back(bottomFlag);
                
                // Initialize the total length to 0
                sequenceLengths.push_back(0);
                
                
                auto namePair = std::make_pair(eventName, sequenceName);
                if(!firstSequenceNumber.count(namePair)) {
                    // This is the first time we have seen a sequence
                    // for this event and sequence name. Everything should
                    // merge against this one thread and not make more
                    // threads.
                    
                    // If this is the mergeOn event, we'll only make this
                    // once across all the files.
                    
                    // Later instances of this event and sequence name should
                    // redirect here.
                    firstSequenceNumber[namePair] = sequenceNames.size() - 1;
                    
                    Log::info() << "This is the first time we have seen "
                        "this sequence." << std::endl;
                }
                
            } else if(parts[0] == "a") {
                // This is an alignment block
                
                if(sequenceNames.size() == 0) {
                    throw std::runtime_error(
                        "Found alignmet block before sequence");
                }
                
                // Which sequence are we working on?
                size_t sequenceNumber = sequenceNames.size() - 1;
                
                if(isBottom[sequenceNumber]) {
                    // Parse it as a bottom block: "a" name start length
                    
                    if(parts.size() != 4) {
                        // Not the right number of fields.
                        throw std::runtime_error(
                            std::string("Invalid field count in ") + line);
                    }
                    
                    size_t blockName = std::stoll(parts[1]);
                    size_t blockStart = std::stoll(parts[2]);
                    size_t blockLength = std::stoll(parts[3]);
                    
                    // Look up the sequence number we actually want to merge
                    // against when we come get this block.
                    auto namePair = std::make_pair(eventNames[sequenceNumber], 
                        sequenceNames[sequenceNumber]);
                    size_t mergeSequenceNumber = firstSequenceNumber[namePair];
                    
                    // We need to associate the block name with the thread
                    // number for the sequence we want it to merge into, and the
                    // start location it specifies, for merging later.
                    nameMap[blockName] = std::make_pair(mergeSequenceNumber,
                        blockStart);
                    
                    Log::debug() << "Bottom block " << blockName << " is " << 
                        blockStart << " on sequence " << mergeSequenceNumber << 
                        std::endl;
                    
                    // Also record the additional length on this sequence
                    sequenceLengths[sequenceNumber] += blockLength;
                    
                } else {
                    // Parse it as a top block: 
                    // "a" start length [name orientation]
                    
                    if(parts.size() < 3) {
                        // Not the right number of fields.
                        throw std::runtime_error(
                            std::string("Invalid field count in ") + line);
                    }
                    
                    // Parse out the start and length
                    size_t segmentStart = std::stoll(parts[1]);
                    size_t segmentLength = std::stoll(parts[2]);
                    
                    // Add in the length
                    sequenceLengths[sequenceNumber] += segmentLength;
                    
                    if(parts.size() == 5) {
                        // If it has a name and orientation, remember a merge.
                        
                        size_t blockName = std::stoll(parts[3]);
                        bool orientation = std::stoi(parts[4]);
                        
                        // Get the sequence number that canonically represents
                        // all sequences with this event/sequence name
                        // combination.
                        auto namePair = std::make_pair(
                            eventNames[sequenceNumber], 
                            sequenceNames[sequenceNumber]);
                        size_t mergeSequenceNumber = firstSequenceNumber[
                            namePair];
                        
                        // Make a merge and populate it with everything we can
                        // get from this segment.
                        C2hMerge merge;
                        merge.sequence1 = mergeSequenceNumber;
                        merge.start1 = segmentStart;
                        merge.length = segmentLength;
                        // TODO: error-check length
                        merge.orientation = orientation;
                        
                        // Grab the info from the bottom segment we are talking
                        // about earlier in this file.
                        merge.sequence2 = nameMap[blockName].first;
                        merge.start2 = nameMap[blockName].second;
                        
                        Log::debug() << "Going to merge " << segmentStart << 
                            " length " << segmentLength << " to " <<
                            blockName << " orientation " << orientation << 
                            std::endl;
                        
                        // Save the merge for doing later.
                        merges.push_back(merge);
                        
                    }
                    
                }
            }
        }
    }
    
    // Make a thread set with all those threads
    stPinchThreadSet* threadSet = stPinchThreadSet_construct();
    
    // Make all the threads. Be 1-based internally since the serialization code
    // wants that.
    for(size_t i = 0; i < sequenceLengths.size(); i++) {
        
        auto namePair = std::make_pair(eventNames[i], sequenceNames[i]);
        if(firstSequenceNumber[namePair] != i) {
            // This sequence is not the first; it is getting merged into another
            // one that is the same length and structure and name (i.e. it
            // appears in two files). Don't make a thread for it.
            continue;
        }
        
        
        // Make threads for all the top sequences and the first bottom sequence
        // for every event and sequence name pair.
        stPinchThreadSet_addThread(threadSet, i, 1, sequenceLengths[i]); 
    }
    
    for(auto merge : merges) {
        // Apply all the merges, converting merges to 1-based
        stPinchThread_pinch(
            stPinchThreadSet_getThread(threadSet, merge.sequence1),
            stPinchThreadSet_getThread(threadSet, merge.sequence2),
            merge.start1 + 1, merge.start2 + 1, merge.length, 
            merge.orientation);
            
        Log::trace() << "Applied merge between threads " << merge.sequence1 << 
            ":" << merge.start1 << "-" << merge.start1 + merge.length << 
            " and " << merge.sequence2 << ":" << merge.start2 << "-" << 
            merge.start2 + merge.length << " orientation " << 
            merge.orientation << std::endl;
    }
    
    // Write out a new c2h file, with a new rootSeq.
    size_t newRootLength = writeAlignment(threadSet, sequenceNames, eventNames, 
        options["c2hOut"].as<std::string>(), &eventsToKeep);
        
    // Clean up thread set.
    stPinchThreadSet_destruct(threadSet);
    
    // Merge the FASTAs, applying any renaming that needs to happen.
    
    // We'll do the FASTA output ourselves. Open the file.
    std::ofstream fastaOut(options["fastaOut"].as<std::string>());
    
    // Write the newly synthesized rootSeq. TODO: unify with writeAlignmentFasta
    // by moving support for renames over there.
    fastaOut << ">rootSeq" << std::endl;
    for(size_t i = 0; i < newRootLength; i++) {
        // Write an N for every base
        fastaOut << "N";
    }
    fastaOut << std::endl;
    
    // This holds the IDs of all the sequences we already wrote. Only write
    // sequences if they aren't duplicates after renaming (which is how we
    // deduplicate the shared root)
    std::unordered_set<std::string> alreadyWritten;
    
    for(size_t fileIndex = 0; fileIndex < fastaFiles.size(); fileIndex++) {
        // Open up the FASTA for reading
        Fasta fasta(fastaFiles[fileIndex]);
        
        Log::info() << "Copying over FASTA records from " <<
            fastaFiles[fileIndex] << std::endl;
        
        while(fasta.hasNext()) {
            // Go through all the FASTA records.
            // TODO: assumes FASTA headers have nothing but IDs.
            std::pair<std::string, std::string> record = fasta.getNextRecord();
            
            if(renames[fileIndex].count(record.first)) {
                // Rename them if necessary
                record.first = renames[fileIndex][record.first];
            }
            
            if(!eventsToKeep.count(record.first)) {
                // This event wasn't on the list of events to actually output,
                // so don't output it.
                Log::info() << "Skipped event " << record.first << std::endl;
                continue;
            }
            
            if(!alreadyWritten.count(record.first)) {
            
                // Save the record to the output FASTA file.
                fastaOut << ">" << record.first << std::endl << record.second <<
                    std::endl;
                
                // Remember that we have written a record by this name.
                alreadyWritten.insert(record.first);
            }
            
        }
    }
    
    fastaOut.close();
    
    // Now we're done!
    return 0;
}