Exemplo n.º 1
0
/**
 * Entry point of the "multimerge" tool (name taken from the --version banner).
 *
 * Stage 1 parses the command line into local settings. Stage 2 registers
 * every input given as "file[:sample]" with a VariantReader, runs the records
 * through the VariantInput processing chain and writes them with a
 * VariantWriter. When --homref-vcf-out is given, a second writer receives the
 * records of the separate homref processor.
 *
 * @param argc number of command line arguments
 * @param argv command line arguments
 * @return 0 on success and after --version; 1 after --help, when required
 *         arguments are missing or malformed, and when an exception is
 *         reported.
 */
int main(int argc, char* argv[]) {
    namespace po = boost::program_options;
    namespace bf = boost::filesystem;

    std::vector<std::string> files;
    std::string output;
    std::string ref_fasta;
    std::string regions_bed = "";
    std::string targets_bed = "";

    // limits (-1 means "not set")
    std::string chr;
    int64_t start = -1;
    int64_t end = -1;
    int64_t rlimit = -1;

    // print a progress record every `message` iterations (disabled if <= 0)
    int64_t message = -1;

    bool apply_filters = false;
    bool leftshift = false;
    bool trimalleles = false;
    bool splitalleles = false;
    int mergebylocation = false;
    bool uniqalleles = false;
    bool calls_only = true;
    bool homref_split = false;
    bool primitives = false;
    std::string homref_vcf = "";

    bool process_formats = false;

    // ---------------------------------------------------------------------
    // Stage 1: command line parsing
    // ---------------------------------------------------------------------
    try
    {
        // Declare the supported options.
        po::options_description desc("Allowed options");
        desc.add_options()
            ("help,h", "produce help message")
            ("version", "Show version")
            ("input-file", po::value<std::vector< std::string> >(), "The input files")
            ("output-file,o", po::value<std::string>(), "The output file name.")
            ("reference,r", po::value<std::string>(), "The reference fasta file.")
            ("location,l", po::value<std::string>(), "Start location.")
            ("regions,R", po::value<std::string>(), "Use a bed file for getting a subset of regions (traversal via tabix).")
            ("targets,T", po::value<std::string>(), "Use a bed file for getting a subset of targets (streaming the whole file, ignoring things outside the bed regions).")
            ("limit-records", po::value<int64_t>(), "Maximum umber of records to process")
            ("message-every", po::value<int64_t>(), "Print a message every N records.")
            ("apply-filters,f", po::value<bool>(), "Apply filtering in VCF.")
            ("leftshift", po::value<bool>(), "Leftshift variant alleles.")
            ("trimalleles", po::value<bool>(), "Remove unused variant alleles.")
            ("splitalleles", po::value<bool>(), "Split and sort variant alleles.")
            ("merge-by-location", po::value<int>(), "Merge calls at the same location.")
            ("unique-alleles", po::value<bool>(), "Make alleles unique across a single line.")
            ("homref-split", po::value<bool>(), "Split homref blocks into per-nucleotide blocks.")
            ("homref-vcf-out", po::value<std::string>(), "Output split homref blocks as BCF/VCF.")
            ("calls-only", po::value<bool>(), "Remove homref blocks.")
            ("primitives", po::value<bool>(), "Split complex alleles into primitives via realignment.")
            ("process-split", po::value<bool>(), "Enables splitalleles, trimalleles, unique-alleles, leftshift.")
            ("process-full", po::value<bool>(), "Enables splitalleles, trimalleles, unique-alleles, leftshift, mergebylocation.")
            ("process-formats", po::value<bool>(), "Process GQ/DP/AD format fields.")
        ;

        // every positional argument is treated as an input file
        po::positional_options_description popts;
        popts.add("input-file", -1);

        po::options_description cmdline_options;
        cmdline_options
            .add(desc)
        ;

        po::variables_map vm;

        po::store(po::command_line_parser(argc, argv).
                  options(cmdline_options).positional(popts).run(), vm);
        po::notify(vm);

        if (vm.count("version"))
        {
            std::cout << "multimerge version " << HAPLOTYPES_VERSION << "\n";
            return 0;
        }

        if (vm.count("help"))
        {
            std::cout << desc << "\n";
            return 1;
        }

        if (vm.count("input-file"))
        {
            files = vm["input-file"].as< std::vector<std::string> >();
        }

        if (vm.count("output-file"))
        {
            output = vm["output-file"].as< std::string >();
        }

        if (vm.count("reference"))
        {
            ref_fasta = vm["reference"].as< std::string >();
        }
        else
        {
            error("Please specify a reference file name.");
        }

        if (vm.count("location"))
        {
            stringutil::parsePos(vm["location"].as< std::string >(), chr, start, end);
        }

        if (vm.count("regions"))
        {
            regions_bed = vm["regions"].as< std::string >();
        }

        if (vm.count("targets"))
        {
            targets_bed = vm["targets"].as< std::string >();
        }

        if (vm.count("limit-records"))
        {
            rlimit = vm["limit-records"].as< int64_t >();
        }

        if (vm.count("message-every"))
        {
            message = vm["message-every"].as< int64_t >();
        }

        if (vm.count("apply-filters"))
        {
            apply_filters = vm["apply-filters"].as< bool >();
        }

        if (vm.count("leftshift"))
        {
            leftshift = vm["leftshift"].as< bool >();
        }

        if (vm.count("trimalleles"))
        {
            trimalleles = vm["trimalleles"].as< bool >();
        }

        if (vm.count("splitalleles"))
        {
            splitalleles = vm["splitalleles"].as< bool >();
        }

        if (vm.count("merge-by-location"))
        {
            mergebylocation = vm["merge-by-location"].as< int >();
        }

        if (vm.count("unique-alleles"))
        {
            uniqalleles = vm["unique-alleles"].as< bool >();
        }

        if (vm.count("calls-only"))
        {
            calls_only = vm["calls-only"].as< bool >();
        }

        if (vm.count("homref-split"))
        {
            homref_split = vm["homref-split"].as< bool >();
        }

        if (vm.count("primitives"))
        {
            primitives = vm["primitives"].as< bool >();
        }

        if (vm.count("homref-vcf-out"))
        {
            // writing a homref VCF implies splitting homref blocks
            homref_split = true;
            homref_vcf = vm["homref-vcf-out"].as< std::string >();
        }

        // The two presets are declared as bool-valued options, so honour the
        // value: "--process-split 0" must not switch the preset on.
        if (vm.count("process-split") && vm["process-split"].as< bool >())
        {
            homref_split = true;
            trimalleles = true;
            splitalleles = true;
            uniqalleles = true;
            leftshift = true;
            calls_only = true;
        }

        if (vm.count("process-full") && vm["process-full"].as< bool >())
        {
            homref_split = true;
            trimalleles = true;
            splitalleles = true;
            uniqalleles = true;
            leftshift = true;
            calls_only = true;
            mergebylocation = 2;
            primitives = true;
        }

        if (vm.count("process-formats"))
        {
            process_formats = vm["process-formats"].as< bool >();
        }

        if(files.empty())
        {
            std::cerr << "Please specify at least one input file / sample.\n";
            return 1;
        }

        if (output.empty())
        {
            std::cerr << "Please specify an output file.\n";
            return 1;
        }
    }
    catch (const po::error & e)
    {
        std::cerr << e.what() << "\n";
        return 1;
    }
    // NOTE(review): error() and stringutil::parsePos() are defined elsewhere
    // and presumably report failures by throwing (stage 2 below catches these
    // same types). Without the handlers here such an exception would leave
    // main() uncaught instead of producing a message and exit code 1.
    catch (const std::runtime_error & e)
    {
        std::cerr << e.what() << std::endl;
        return 1;
    }
    catch (const std::logic_error & e)
    {
        std::cerr << e.what() << std::endl;
        return 1;
    }

    // ---------------------------------------------------------------------
    // Stage 2: read, process and write variants
    // ---------------------------------------------------------------------
    try
    {
        VariantReader r;
        if(!regions_bed.empty())
        {
            r.setRegions(regions_bed.c_str(), true);
        }
        if(!targets_bed.empty())
        {
            r.setTargets(targets_bed.c_str(), true);
        }

        VariantWriter w(output.c_str(), ref_fasta.c_str());

        // optional second writer, only created when --homref-vcf-out was given
        std::shared_ptr<VariantWriter> p_homref_writer;
        if (!homref_vcf.empty())
        {
            p_homref_writer = std::make_shared<VariantWriter>(homref_vcf.c_str(), ref_fasta.c_str());
        }

        w.setWriteFormats(process_formats);
        if (p_homref_writer)
        {
            p_homref_writer->setWriteFormats(process_formats);
        }

        r.setApplyFilters(apply_filters);

        // each input is "filename" or "filename:sample"
        for(std::string const & f : files)
        {
            std::vector<std::string> parts;
            stringutil::split(f, parts, ":");

            // In case someone passes a ":" -- checked explicitly because an
            // assert() disappears under NDEBUG and parts[0] would then be
            // read from an empty vector.
            if (parts.empty())
            {
                throw std::runtime_error("Invalid input file specification '" + f + "'");
            }

            const std::string filename = parts[0];
            std::string sample = "";
            if(parts.size() > 1)
            {
                sample = parts[1];
            }
            std::cerr << "Adding file '" << filename << "' / sample '" << sample << "'" << "\n";
            r.addSample(filename.c_str(), sample.c_str());
        }

        std::list< std::pair<std::string, std::string> > samples;
        r.getSampleList(samples);

        // Derive a unique output column name for every (file, sample) pair:
        // the sample name if given, otherwise the file's stem; collisions
        // get a ".1", ".2", ... suffix.
        std::set<std::string> samplenames;
        for (auto const & p : samples)
        {
            const std::string base_name =
                p.second.empty() ? bf::path(p.first).stem().string() : p.second;

            std::string sname = base_name;
            int i = 1;
            while (samplenames.count(sname))
            {
                sname = base_name + "." + std::to_string(i++);
            }
            samplenames.insert(sname);
            std::cerr << "Writing '" << p.first << ":" << p.second << "' as sample '" << sname << "'" << "\n";
            w.addSample(sname.c_str());
            if (p_homref_writer)
            {
                p_homref_writer->addSample(sname.c_str());
            }
        }

        w.addHeader(r);
        if (p_homref_writer)
        {
            p_homref_writer->addHeader(r);
        }
        r.rewind(chr.c_str(), start);

        VariantInput vi(
            ref_fasta.c_str(),
            leftshift,           // bool leftshift
            true,                // bool refpadding
            trimalleles,         // bool trimalleles = false,
            splitalleles,        // bool splitalleles = false,
            mergebylocation,     // int mergebylocation = false,
            uniqalleles,         // bool uniqalleles = false,
            calls_only,          // bool calls_only = true,
            homref_split,        // bool homref_split = false
            primitives,          // bool primitives = false
            static_cast<bool>(p_homref_writer) // homref output
            );

        vi.getProcessor().setReader(r, VariantBufferMode::buffer_block, 100);

        VariantProcessor & proc = vi.getProcessor();
        VariantProcessor & proc_homref = vi.getProcessor(VariantInput::homref);

        // number of completed loop iterations, compared against --limit-records
        int64_t rcount = 0;

        bool advance1 = proc.advance();
        // the homref processor is only consumed when there is a writer for it
        bool advance2 = p_homref_writer ? proc_homref.advance() : false;

        while(advance1 || advance2)
        {
            if(rlimit != -1 && rcount >= rlimit)
            {
                break;
            }

            if (advance1)
            {
                Variants & v = proc.current();
                if (!chr.empty() && chr != v.chr)
                {
                    // chromosome changed and location was given => abort,
                    // but first write out what the homref processor still has
                    while(advance2)
                    {
                        Variants & hv = proc_homref.current();
                        p_homref_writer->put(hv);
                        advance2 = homref_split ? proc_homref.advance() : false;
                    }
                    break;
                }
                if(end != -1 && v.pos > end)
                {
                    break;
                }
                w.put(v);
                if(message > 0 && (rcount % message) == 0)
                {
                    std::cout << stringutil::formatPos(v.chr.c_str(), v.pos) << ": " << v << "\n";
                }
            }

            // make sure our homref variant output doesn't fill up all memory
            while(advance2 && p_homref_writer)
            {
                Variants & hv = proc_homref.current();
                p_homref_writer->put(hv);
                advance2 = proc_homref.advance();
            }

            advance1 = proc.advance();
            ++rcount;
        }
    }
    catch(const std::runtime_error & e)
    {
        std::cerr << e.what() << std::endl;
        return 1;
    }
    catch(const std::logic_error & e)
    {
        std::cerr << e.what() << std::endl;
        return 1;
    }

    return 0;
}
Exemplo n.º 2
0
int main(int argc, char* argv[]) {
    namespace po = boost::program_options;

    std::string ref_fasta;

    std::string chr = "";
    int64_t start = -1;
    int64_t end = -1;

    std::string file1;
    std::string sample1;
    std::string file2;
    std::string sample2;

    std::string regions_bed = "";
    std::string targets_bed = "";

    std::string out_vcf = "";
    std::string out_errors = "";

    // = max 12 unphased hets in segment
    int64_t blimit = -1;
    bool progress = false;
    int progress_seconds = 10;
    int max_n_haplotypes = 4096;
    int64_t hb_window = 30;
    int64_t hb_expand = 30;

    bool apply_filters_query = false;
    bool apply_filters_truth = true;
    bool preprocess = false;
    bool leftshift = false;
    bool always_hapcmp = false;
    bool no_hapcmp = false;
    bool compare_raw = false;
    bool output_roc_vals = false;

    try
    {
        // Declare the supported options.
        po::options_description desc("Allowed options");
        desc.add_options()
            ("help,h", "produce help message")
            ("version", "Show version")
            ("input-vcfs", po::value<std::vector<std::string> >(), "Two VCF files to compare (use file:sample for a specific sample column).")
            ("output-vcf,o", po::value<std::string>(), "Output variant comparison results to VCF.")
            ("output-errors,e", po::value<std::string>(), "Output failure information.")
            ("reference,r", po::value<std::string>(), "The reference fasta file.")
            ("location,l", po::value<std::string>(), "The location to start at.")
            ("regions,R", po::value<std::string>(), "Use a bed file for getting a subset of regions (traversal via tabix).")
            ("targets,T", po::value<std::string>(), "Use a bed file for getting a subset of targets (streaming the whole file, ignoring things outside the bed regions).")
            ("progress", po::value<bool>(), "Set to true to output progress information.")
            ("progress-seconds", po::value<int>(), "Output progress information every n seconds.")
            ("window,w", po::value<int64_t>(), "Overlap window to create haplotype blocks.")
            ("max-n-haplotypes,n", po::value<int>(), "Maximum number of haplotypes to enumerate.")
            ("expand-hapblocks", po::value<int64_t>(), "Number of bases to expand around each haplotype block.")
            ("limit", po::value<int64_t>(), "Maximum number of haplotype blocks to process.")
            ("apply-filters-truth", po::value<bool>(), "Apply filtering in truth VCF (on by default).")
            ("apply-filters-query,f", po::value<bool>(), "Apply filtering in query VCF (off by default).")
            ("preprocess-variants,V", po::value<bool>(), "Apply variant normalisations, trimming, realignment for complex variants (off by default).")
            ("leftshift", po::value<bool>(), "Left-shift indel alleles (off by default).")
            ("always-hapcmp", po::value<bool>(), "Always compare haplotype blocks (even if they match). Testing use only/slow.")
            ("compare-raw", po::value<bool>(), "Compare raw calls also to maximize chances of matching difficult regions.")
            ("no-hapcmp", po::value<bool>(), "Disable haplotype comparison. This overrides all other haplotype comparison options.")
            ("roc-vals", po::value<bool>(), "Output GQX and qual values for truth and query in INFO (which gets preserved through quantify).")
        ;

        po::positional_options_description popts;
        popts.add("input-vcfs", 2);

        po::options_description cmdline_options;
        cmdline_options
            .add(desc)
        ;

        po::variables_map vm;

        po::store(po::command_line_parser(argc, argv).
                  options(cmdline_options).positional(popts).run(), vm);
        po::notify(vm);

        if (vm.count("version"))
        {
            std::cout << "xcmp version " << HAPLOTYPES_VERSION << "\n";
            return 0;
        }

        if (vm.count("help"))
        {
            std::cout << desc << "\n";
            return 1;
        }

        if (vm.count("input-vcfs"))
        {
            std::vector<std::string> vr = vm["input-vcfs"].as< std::vector<std::string> >();

            if(vr.size() != 2)
            {
                error("Please pass exactly two vcf file names for comparison.");
            }

            std::vector<std::string> v;
            stringutil::split(vr[0], v, ":");
            // in case someone passes a ":"
            assert(v.size() > 0);

            file1 = v[0];
            sample1 = "";
            if(v.size() > 1)
            {
                sample1 = v[1];
            }

            v.clear();
            stringutil::split(vr[1], v, ":");
            // in case someone passes a ":"
            assert(v.size() > 0);

            file2 = v[0];
            sample2 = "";
            if(v.size() > 1)
            {
                sample2 = v[1];
            }
        }

        if (vm.count("output-vcf"))
        {
            out_vcf = vm["output-vcf"].as< std::string >();
        }

        if (vm.count("output-errors"))
        {
            out_errors = vm["output-errors"].as< std::string >();
        }

        if (vm.count("preprocess-variants"))
        {
            preprocess = vm["preprocess-variants"].as< bool >();
        }

        if (vm.count("leftshift"))
        {
            leftshift = vm["leftshift"].as< bool >();
        }

        if (vm.count("location"))
        {
            stringutil::parsePos(vm["location"].as< std::string >(), chr, start, end);
        }

        if (vm.count("regions"))
        {
            regions_bed = vm["regions"].as< std::string >();
        }

        if (vm.count("targets"))
        {
            targets_bed = vm["targets"].as< std::string >();
        }

        if (vm.count("reference"))
        {
            ref_fasta = vm["reference"].as< std::string >();
        }
        else if(preprocess)
        {
            error("Please specify a reference file name.");
        }

        if (vm.count("max-n-haplotypes"))
        {
            max_n_haplotypes = vm["max-n-haplotypes"].as< int >();
        }

        if (vm.count("limit"))
        {
            blimit = vm["limit"].as< int64_t >();
        }

        if (vm.count("expand-hapblocks"))
        {
            hb_expand = vm["expand-hapblocks"].as< int64_t >();
        }

        if (vm.count("window"))
        {
            hb_window = vm["window"].as< int64_t >();
        }

        if (vm.count("progress"))
        {
            progress = vm["progress"].as< bool >();
        }

        if (vm.count("progress-seconds"))
        {
            progress_seconds = vm["progress-seconds"].as< int >();
        }

        if (vm.count("apply-filters-truth"))
        {
            apply_filters_truth = vm["apply-filters-truth"].as< bool >();
        }

        if (vm.count("apply-filters-query"))
        {
            apply_filters_query = vm["apply-filters-query"].as< bool >();
        }

        if (vm.count("always-hapcmp"))
        {
            always_hapcmp = vm["always-hapcmp"].as< bool >();
        }

        if (vm.count("no-hapcmp"))
        {
            no_hapcmp = vm["no-hapcmp"].as< bool >();
        }

        if (vm.count("compare-raw"))
        {
            compare_raw = vm["compare-raw"].as< bool >();
        }

        if (vm.count("roc-vals"))
        {
            output_roc_vals = vm["roc-vals"].as< bool >();
        }
    }
    catch (po::error & e)
    {
        std::cerr << e.what() << "\n";
        return 1;
    }
    catch(std::runtime_error & e)
    {
        std::cerr << e.what() << std::endl;
        return 1;
    }
    catch(std::logic_error & e)
    {
        std::cerr << e.what() << std::endl;
        return 1;
    }

    try
    {
        VariantReader vr;
        vr.setReturnHomref(false);

        if(regions_bed != "")
        {
            vr.setRegions(regions_bed.c_str(), true);
        }
        if(targets_bed != "")
        {
            vr.setTargets(targets_bed.c_str(), true);
        }

        int r1 = vr.addSample(file1.c_str(), sample1.c_str());
        int r2 = vr.addSample(file2.c_str(), sample2.c_str());

        vr.setApplyFilters(apply_filters_truth, r1);
        vr.setApplyFilters(apply_filters_query, r2);

        VariantInput vi(
            ref_fasta.c_str(),
            preprocess || leftshift,          // bool leftshift
            false,          // bool refpadding
            true,                // bool trimalleles = false, (remove unused alleles)
            preprocess || leftshift,      // bool splitalleles = false,
            ( preprocess || leftshift ) ? 2 : 0,  // int mergebylocation = false,
            true,                // bool uniqalleles = false,
            true,                // bool calls_only = true,
            false,               // bool homref_split = false // this is handled by calls_only
            preprocess,          // bool primitives = false
            false,               // bool homref_output
            leftshift ? hb_window-1 : 0,   // int64_t leftshift_limit
            compare_raw         // collect_raw
            );

        VariantProcessor & vp = vi.getProcessor();

        std::unique_ptr<VariantAlleleRemover> p_raw_allele_remover;
        std::unique_ptr<VariantAlleleSplitter> p_raw_allele_splitter;
        std::unique_ptr<VariantAlleleNormalizer> p_raw_allele_normalizer;
        std::unique_ptr<VariantLocationAggregator> p_raw_aggregator;
        std::unique_ptr<VariantAlleleUniq> p_raw_allele_uniq;
        std::unique_ptr<VariantCallsOnly> p_raw_callsonly;
        if(compare_raw)
        {
            p_raw_allele_remover = std::move(std::unique_ptr<VariantAlleleRemover>(new VariantAlleleRemover()));
            vi.getProcessor(VariantInput::raw).addStep(*p_raw_allele_remover);

            p_raw_allele_splitter = std::move(std::unique_ptr<VariantAlleleSplitter>(new VariantAlleleSplitter()));
            vi.getProcessor(VariantInput::raw).addStep(*p_raw_allele_splitter);

            if(leftshift)
            {
                p_raw_allele_normalizer = std::move(std::unique_ptr<VariantAlleleNormalizer>(new VariantAlleleNormalizer()));
                p_raw_allele_normalizer->setReference(ref_fasta);
                p_raw_allele_normalizer->setEnableRefPadding(false);
                p_raw_allele_normalizer->setLeftshiftLimit(hb_window - 1);
                p_raw_allele_normalizer->setEnableHomrefVariants(false);
                vi.getProcessor(VariantInput::raw).addStep(*p_raw_allele_normalizer);
            }

            p_raw_aggregator = std::move(std::unique_ptr<VariantLocationAggregator>(new VariantLocationAggregator()));
            vi.getProcessor(VariantInput::raw).addStep(*p_raw_aggregator);

            p_raw_allele_uniq = std::move(std::unique_ptr<VariantAlleleUniq>(new VariantAlleleUniq()));
            vi.getProcessor(VariantInput::raw).addStep(*p_raw_allele_uniq);

            p_raw_callsonly = std::move(std::unique_ptr<VariantCallsOnly>(new VariantCallsOnly()));
            vi.getProcessor(VariantInput::raw).addStep(*p_raw_callsonly);
        }

        vp.setReader(vr, VariantBufferMode::buffer_block, 10*hb_window);

        bool stop_after_chr_change = false;
        if(chr != "")
        {
            vp.rewind(chr.c_str(), start);
            stop_after_chr_change = true;
        }

        std::unique_ptr<VariantWriter> pvw;
        if (out_vcf != "")
        {
            pvw = std::move(std::unique_ptr<VariantWriter> (new VariantWriter(out_vcf.c_str(), ref_fasta.c_str())));
            pvw->addHeader(vr);
            pvw->addHeader("##INFO=<ID=gtt1,Number=1,Type=String,Description=\"GT of truth call\">");
            pvw->addHeader("##INFO=<ID=gtt2,Number=1,Type=String,Description=\"GT of query call\">");
            pvw->addHeader("##INFO=<ID=type,Number=1,Type=String,Description=\"Decision for call (TP/FP/FN/N)\">");
            pvw->addHeader("##INFO=<ID=kind,Number=1,Type=String,Description=\"Sub-type for decision (match/mismatch type)\">");
            pvw->addHeader("##INFO=<ID=ctype,Number=1,Type=String,Description=\"Type of comparison performed\">");
            pvw->addHeader("##INFO=<ID=HapMatch,Number=0,Type=Flag,Description=\"Variant is in matching haplotype block\">");
            if(output_roc_vals)
            {
                pvw->addHeader("##INFO=<ID=T_GQ,Number=1,Type=Float,Description=\"GQ field in truth VCF.\">");
                pvw->addHeader("##INFO=<ID=Q_GQ,Number=1,Type=Float,Description=\"GQ field in query VCF.\">");
                pvw->addHeader("##INFO=<ID=T_DP,Number=1,Type=Float,Description=\"DP field in truth VCF.\">");
                pvw->addHeader("##INFO=<ID=Q_DP,Number=1,Type=Float,Description=\"DP field in query VCF.\">");
                pvw->addHeader("##INFO=<ID=T_QUAL,Number=1,Type=Float,Description=\"Qual column in truth VCF.\">");
                pvw->addHeader("##INFO=<ID=Q_QUAL,Number=1,Type=Float,Description=\"Qual column in query VCF.\">");
            }
            pvw->addSample("TRUTH");
            pvw->addSample("QUERY");
        }

        std::ostream * error_out_stream = NULL;
        if(out_errors == "-")
        {
            error_out_stream = &std::cerr;
        }
        else if(out_errors != "")
        {
            error_out_stream = new std::ofstream(out_errors.c_str());
        }
        DiploidCompare hc(ref_fasta.c_str());
        hc.setMaxHapEnum(max_n_haplotypes);
        hc.setDoAlignments(false);

        int64_t nhb = 0;
        int64_t last_pos = std::numeric_limits<int64_t>::max();

        // hap-block status + update
        std::list<Variants> block_variants;
        int64_t block_start = -1;
        int64_t block_end = -1;
        int n_nonsnp = 0, calls_1 = 0, calls_2 = 0;
        bool has_mismatch = false;

        const auto finish_block = [&vi,
                                   &block_variants, r1, r2,
                                   &chr,
                                   &block_start,
                                   &block_end,
                                   &n_nonsnp, &calls_1, &calls_2,
                                   &has_mismatch,
                                   &pvw, &error_out_stream,
                                   &hc,
                                   hb_expand,
                                   no_hapcmp,
                                   always_hapcmp,
                                   compare_raw] ()
        {
            bool hap_match = false, hap_fail = false, hap_run = false, raw_match = false;
            // try HC if we have mismatches, and if the number of calls is > 0
            if (!no_hapcmp && (always_hapcmp || (has_mismatch && calls_1 > 0 && calls_2 > 0 && n_nonsnp > 0)))
            {
                if(compare_raw)
                {
                    std::list<Variants> raw_variants;
                    while(vi.getProcessor(VariantInput::raw).advance())
                    {
                        Variants & vars = vi.getProcessor(VariantInput::raw).current();
                        raw_variants.push_back(vars);
                    }

                    try
                    {
                        hap_run = true;
                        hap_fail = true;
                        hc.setRegion(chr.c_str(), std::max(int64_t(0), block_start-hb_expand), block_end + hb_expand,
                                     raw_variants, r1, r2);
                        DiploidComparisonResult const & hcr = hc.getResult();
#ifdef DEBUG_XCMP
                        std::cerr << chr << ":" << block_start << "-" << block_end << " variants: " << "\n";
                        for(auto const & x : block_variants)
                        {
                            std::cerr << x << "\n";
                        }
                        std::cerr << "Block result: " << "\n";
                        std::cerr << hcr << "\n";
#endif
                        raw_match = hap_match = hcr.outcome == dco_match;
                        hap_fail = !(hcr.outcome == dco_match || hcr.outcome == dco_mismatch);
                    }
                    catch(std::runtime_error &e)
                    {
                    }
                    catch(std::logic_error &e)
                    {
                    }
                }
                if(!hap_match)
                {
                    try
                    {
                        hap_run = true;
                        hap_fail = true;
                        hc.setRegion(chr.c_str(), std::max(int64_t(0), block_start-hb_expand), block_end + hb_expand,
                                     block_variants, r1, r2);
                        DiploidComparisonResult const & hcr = hc.getResult();
#ifdef DEBUG_XCMP
                        std::cerr << chr << ":" << block_start << "-" << block_end << " variants: " << "\n";
                        for(auto const & x : block_variants)
                        {
                            std::cerr << x << "\n";
                        }
                        std::cerr << "Block result: " << "\n";
                        std::cerr << hcr << "\n";
#endif
                        hap_match = hcr.outcome == dco_match;
                        hap_fail = !(hcr.outcome == dco_match || hcr.outcome == dco_mismatch);
                    }
                    catch(std::runtime_error &e)
                    {
                        if (error_out_stream)
                        {
                            *error_out_stream << chr << "\t" << block_start << "\t" << block_end+1 << "\t" << "hap_error\t" << e.what() << "\n";
                        }
                    }
                    catch(std::logic_error &e)
                    {
                        if (error_out_stream)
                        {
                            *error_out_stream << chr << "\t" << block_start << "\t" << block_end+1 << "\t" << "hap_error\t" << e.what() << "\n";
                        }
                    }
                }
            }
            std::string result;
            if(hap_run)
            {
                if (hap_fail)
                {
                    result = "hapfail:";
                }
                else if(has_mismatch)
                {
                    result = "hap:";
                }
                else
                {
                    result = "simple:";
                }
            }
            else
            {
                result = "simple:";
            }

            if(hap_run && !has_mismatch && !hap_match)
            {
                result += "suspicious_simple_match";
            }
            else if(always_hapcmp && hap_match && ((calls_1 == 0 && calls_2 > 0) || (calls_1 > 0 && calls_2 == 0)))
            {
                bool any_filtered = false;
                for (Variants const & v : block_variants)
                {
                    for (Call const & c : v.calls)
                    {
                        for (size_t i = 0; i < c.nfilter; ++i)
                        {
                            if(c.filter[i] != "PASS" && c.filter[i] != ".")
                            {
                                any_filtered = true;
                                break;
                            }
                        }
                    }
                    if(any_filtered)
                    {
                        break;
                    }
                }

                if(any_filtered)
                {
                    result += "match_ignoring_filtered";
                }
                else
                {
                    result += "suspicious_hap_match";
                }
            }
            else if(hap_match || !has_mismatch)
            {
                result += "match";
            }
            else
            {
                result += "mismatch";
            }

            if(raw_match)
            {
                result += "_raw";
            }

            if(error_out_stream)
            {
                *error_out_stream << chr << "\t" << block_start << "\t" << block_end+1 << "\t" << result << "\t"
                                  << has_mismatch << ":" << hap_match << ":" << hap_fail << ":"
                                  << calls_1 << ":" << calls_2 << ":" << n_nonsnp << "\n";
            }
            if (pvw)
            {
                for (Variants & v : block_variants)
                {
                    if(v.info != "")
                    {
                        v.info += ";";
                    }
                    v.info += std::string("ctype=") + result;
                    if (hap_match)
                    {
                        v.info += ";HapMatch";
                    }
                    pvw->put(v);
                }
            }

            block_variants.clear();
            block_start = -1;
            block_end = -1;
            n_nonsnp = 0;
            calls_1 = 0;
            calls_2 = 0;
            has_mismatch = false;
        };

        auto start_time = std::chrono::high_resolution_clock::now();
        auto last_time = std::chrono::high_resolution_clock::now();
        while(vp.advance())
        {
            if(blimit > 0 && nhb++ > blimit)
            {
                // reached record limit
                break;
            }
            Variants & v = vp.current();

            if(end != -1 && (v.pos > end || (chr.size() != 0 && chr != v.chr)))
            {
                // reached end
                break;
            }

            if(stop_after_chr_change && chr.size() != 0 && chr != v.chr)
            {
                // reached end of chr
                break;
            }

            if(chr.size() == 0)
            {
                chr = v.chr;
            }

            if (v.chr != chr || (block_end > 0 && block_end + hb_window < v.pos))
            {
                finish_block();
            }
            chr = v.chr;

            if (block_start < 0)
            {
                block_start = v.pos;
            }
            else
            {
                block_start = std::min(block_start, v.pos);
            }

            if (block_end < 0)
            {
                block_end = v.pos + v.len - 1;
            }
            else
            {
                block_end = std::max(v.pos + v.len - 1, block_end);
            }

            if(compareVariants(v, r1, r2, n_nonsnp, calls_1, calls_2) != dco_match)
            {
                has_mismatch = true;
            }

            if(output_roc_vals)
            {
                if(!v.info.empty()) { v.info += ";"; }
                v.info += "T_GQ=" + std::to_string(v.calls[r1].gq);
                v.info += ";Q_GQ=" + std::to_string(v.calls[r2].gq);
                v.info += ";T_DP=" + std::to_string(v.calls[r1].dp);
                v.info += ";Q_DP=" + std::to_string(v.calls[r2].dp);
                v.info += ";T_QUAL=" + std::to_string(v.calls[r1].qual);
                v.info += ";Q_QUAL=" + std::to_string(v.calls[r2].qual);
            }

            block_variants.push_back(v);

#ifdef DEBUG_XCMP
            std::cerr << v << "\n";
            std::cerr << "block_start : " << block_start << "\t"
                      << "block_end : " << block_end << "\t"
                      << "block_size : " << block_variants.size() << "\t"
                      << "n_nonsnp : " << n_nonsnp << "\t"
                      << "calls_1 : " << calls_1 << "\t"
                      << "calls_2 : " << calls_2 << "\t"
                      << "\n";
#endif

            if(progress)
            {
                using namespace std;
                auto end_time = chrono::high_resolution_clock::now();
                auto secs = chrono::duration_cast<chrono::seconds>(end_time - last_time).count();

                if(secs > progress_seconds)
                {
                    auto secs_since_start = chrono::duration_cast<chrono::seconds>(end_time - start_time).count();
                    std::string mbps = "";
                    if(last_pos < v.pos)
                    {
                        mbps = " mpbs: ";
                        mbps += std::to_string(double(v.pos - last_pos) / double(secs_since_start) * 1e-6);
                    }
                    else
                    {
                        last_pos = v.pos;
                    }
                    last_time = end_time;

                    std::cerr << "[PROGRESS] Total time: " << secs_since_start << "s Pos: " << v.pos << mbps << "\n";
                }
            }
        }
#ifdef DEBUG_XCMP
        std::cerr << "END\n";
        std::cerr << "block_start : " << block_start << "\t"
                  << "block_end : " << block_end << "\t"
                  << "block_size : " << block_variants.size() << "\t"
                  << "n_nonsnp : " << n_nonsnp << "\t"
                  << "calls_1 : " << calls_1 << "\t"
                  << "calls_2 : " << calls_2 << "\t"
                  << "\n";
#endif
        finish_block();
        if(error_out_stream && out_errors != "-")
        {
            delete error_out_stream;
        }
    }
    catch(std::runtime_error &e)
    {
        std::cerr << e.what() << std::endl;
        return 1;
    }
    catch(std::logic_error &e)
    {
        std::cerr << e.what() << std::endl;
        return 1;
    }
    return 0;
}
Exemplo n.º 3
0
int main(int argc, char* argv[]) {
    namespace po = boost::program_options;
    namespace bf = boost::filesystem;

    std::string input_dir;
    std::string output_vcf;
    std::string ref_fasta;

    try
    {
        // Declare the supported options.
        po::options_description desc("Allowed options");
        desc.add_options()
        ("help,h", "produce help message")
        ("version", "Show version")
        ("input-dir", po::value<std::string>(), "Path to a vcfeval output directory.")
        ("output-vcf", po::value<std::string>(), "Annotated VCF output file.")
        ("reference,r", po::value<std::string>(), "The reference fasta file.")
        ;

        po::positional_options_description popts;
        popts.add("input-dir", 1);
        popts.add("output-vcf", 1);

        po::options_description cmdline_options;
        cmdline_options
        .add(desc)
        ;

        po::variables_map vm;

        po::store(po::command_line_parser(argc, argv).
                  options(cmdline_options).positional(popts).run(), vm);
        po::notify(vm);

        if (vm.count("version"))
        {
            std::cout << "postvcfeval version " << HAPLOTYPES_VERSION << "\n";
            return 0;
        }

        if (vm.count("help"))
        {
            std::cout << desc << "\n";
            return 1;
        }

        if (vm.count("input-dir"))
        {
            input_dir = vm["input-dir"].as<std::string>();
        }
        else
        {
            error("Please specify an input directory");
        }

        if (vm.count("output-vcf"))
        {
            output_vcf = vm["output-vcf"].as< std::string >();
        }

        if (vm.count("reference"))
        {
            ref_fasta = vm["reference"].as< std::string >();
        }
        else
        {
            error("To write an output VCF, you need to specify a reference file, too.");
        }

        if (output_vcf == "")
        {
            std::cerr << "Please specify an output file.\n";
            return 1;
        }
    }
    catch (po::error & e)
    {
        std::cerr << e.what() << "\n";
        return 1;
    }

    try
    {
        VariantReader r;
        r.setApplyFilters(false);

        boost::filesystem::path p(input_dir);
        int in_ix_fp  = r.addSample((p / "fp.vcf.gz").c_str(), "");
        int in_ix_fn  = r.addSample((p / "fn.vcf.gz").c_str(), "");
        int in_ix_tp  = r.addSample((p / "tp.vcf.gz").c_str(), "");
        int in_ix_tpb = r.addSample((p / "tp-baseline.vcf.gz").c_str(), "");

        std::unique_ptr<VariantWriter> writer = std::make_unique<VariantWriter>(output_vcf.c_str(), ref_fasta.c_str());
        writer->addSample("TRUTH");
        writer->addSample("QUERY");
        writer->addHeader(r);
        writer->addHeader("##INFO=<ID=type,Number=1,Type=String,Description=\"Decision for call (TP/FP/FN/N)\">");
        writer->addHeader("##INFO=<ID=kind,Number=1,Type=String,Description=\"Sub-type for decision (match/mismatch type)\">");

        int64_t rcount = 0;

        while(r.advance())
        {
            Variants & v = r.current();

            if(!v.calls[in_ix_fp].isNocall())
            {
                Variants out_vars;
                out_vars = v;
                out_vars.calls.clear();
                out_vars.calls.resize(2);

                out_vars.calls[1] = v.calls[in_ix_fp];
                if(!out_vars.info.empty()) out_vars.info += ";";
                out_vars.info += "type=FP";
                out_vars.info += ";kind=missing";
                writer->put(out_vars);
            }

            if(!v.calls[in_ix_fn].isNocall())
            {
                Variants out_vars;
                out_vars = v;
                out_vars.calls.clear();
                out_vars.calls.resize(2);

                out_vars.calls[0] = v.calls[in_ix_fn];
                if(!out_vars.info.empty()) out_vars.info += ";";
                out_vars.info += "type=FN";
                out_vars.info += ";kind=missing";
                writer->put(out_vars);
            }

            if(!v.calls[in_ix_tp].isNocall() || !v.calls[in_ix_tpb].isNocall())
            {
                Variants out_vars;
                out_vars = v;
                out_vars.calls.clear();
                out_vars.calls.resize(2);

                out_vars.calls[0] = v.calls[in_ix_tp];
                out_vars.calls[1] = v.calls[in_ix_tpb];
                if(!out_vars.info.empty()) out_vars.info += ";";
                out_vars.info += "type=TP";
                out_vars.info += ";kind=vcfeval";
                writer->put(out_vars);
            }
            ++rcount;
        }
    }
    catch(std::runtime_error & e)
    {
        std::cerr << e.what() << std::endl;
        return 1;
    }
    catch(std::logic_error & e)
    {
        std::cerr << e.what() << std::endl;
        return 1;
    }

    return 0;
}
Exemplo n.º 4
0
/**
 * blocksplit entry point.
 *
 * Streams the variant records of one or more input files, collects candidate
 * break points at gaps longer than the overlap window between positions that
 * carry a call, and writes tab-separated "chr, start, end" lines describing
 * the resulting blocks to a file or to stdout.
 *
 * @return 0 on success (and for --version); 1 for --help, for invalid
 *         options, and when an exception is caught during processing.
 */
int main(int argc, char* argv[]) {
    namespace po = boost::program_options;

    std::vector<std::string> files, samples;

    std::string regions_bed = "";
    std::string targets_bed = "";
    std::string out_bed = "";

    // limits
    std::string chr = "";
    int64_t start = -1;
    int64_t end = -1;

    int64_t rlimit = -1;
    int64_t message = -1;
    int64_t window = 30;
    bool apply_filters = false;

    int nblocks = 32;
    int nvars = 100;

    bool verbose = false;

    try
    {
        // Declare the supported options.
        po::options_description desc("Allowed options");
        desc.add_options()
            ("help,h", "produce help message")
            ("version", "Show version")
            ("input-file", po::value<std::vector<std::string> >(), "The input VCF/BCF file(s) (use file:sample to specify a sample)")
            ("output,o", po::value<std::string>(), "Write a bed file giving the locations of overlapping blocks (use - for stdout).")
            ("regions,R", po::value<std::string>(), "Use a bed file for getting a subset of regions (traversal via tabix).")
            ("targets,T", po::value<std::string>(), "Use a bed file for getting a subset of targets (streaming the whole file, ignoring things outside the bed regions).")
            ("location,l", po::value<std::string>(), "The location / subset.")
            ("limit-records,L", po::value<int64_t>(), "Maximum number of records to process")
            ("message-every,m", po::value<int64_t>(), "Print a message every N records.")
            ("window,w", po::value<int64_t>(), "Overlap window length.")
            ("nblocks", po::value<int>(), "Maximum number of blocks to break into (32).")
            ("nvars", po::value<int>(), "Minimum number of variants per block (100).")
            ("apply-filters,f", po::value<bool>(), "Apply filtering in VCF.")
            ("verbose", po::value<bool>(), "Verbose output.")
        ;

        // All positional arguments are input files.
        po::positional_options_description popts;
        popts.add("input-file", -1);

        po::options_description cmdline_options;
        cmdline_options
            .add(desc)
        ;

        po::variables_map vm;

        po::store(po::command_line_parser(argc, argv).
                  options(cmdline_options).positional(popts).run(), vm);
        po::notify(vm);

        if (vm.count("version"))
        {
            std::cout << "blocksplit version " << HAPLOTYPES_VERSION << "\n";
            return 0;
        }

        if (vm.count("help"))
        {
            std::cout << desc << "\n";
            return 1;
        }

        if (vm.count("input-file"))
        {
            std::vector<std::string> fs = vm["input-file"].as< std::vector<std::string> >();

            for(std::string const & s : fs)
            {
                // Each argument is "file" or "file:sample".
                std::vector<std::string> v;
                stringutil::split(s, v, ":");

                // In case someone passes a bare ":" -- this is user input, so
                // it is checked at runtime rather than with an assert that
                // disappears when NDEBUG is defined.
                if(v.empty())
                {
                    error("Invalid input file specification (expected file or file:sample).");
                }

                std::string sample = "";
                if(v.size() > 1)
                {
                    sample = v[1];
                }

                files.push_back(v[0]);
                samples.push_back(sample);
            }
        }

        if(files.size() == 0)
        {
            error("Please specify at least one input file.");
        }

        if (vm.count("output"))
        {
            out_bed = vm["output"].as< std::string >();
        }
        else
        {
            out_bed = "-";
        }

        if (vm.count("regions"))
        {
            regions_bed = vm["regions"].as< std::string >();
        }

        if (vm.count("targets"))
        {
            targets_bed = vm["targets"].as< std::string >();
        }

        if (vm.count("verbose"))
        {
            verbose = vm["verbose"].as<bool>();
        }

        if (vm.count("location"))
        {
            stringutil::parsePos(vm["location"].as< std::string >(), chr, start, end);
        }

        if (vm.count("limit-records"))
        {
            rlimit = vm["limit-records"].as< int64_t >();
        }

        if (vm.count("message-every"))
        {
            message = vm["message-every"].as< int64_t >();
        }

        if (vm.count("apply-filters"))
        {
            apply_filters = vm["apply-filters"].as< bool >();
        }

        if (vm.count("window"))
        {
            window = vm["window"].as< int64_t >();
        }

        if (vm.count("nblocks"))
        {
            nblocks = vm["nblocks"].as< int >();
        }

        // nblocks is used as a divisor when the per-block target is computed below.
        if (nblocks < 1)
        {
            error("The maximum number of blocks (nblocks) must be at least 1.");
        }

        if (vm.count("nvars"))
        {
            nvars = vm["nvars"].as< int >();
        }

    }
    catch (po::error & e)
    {
        std::cerr << e.what() << "\n";
        return 1;
    }
    catch(std::runtime_error & e)
    {
        std::cerr << e.what() << std::endl;
        return 1;
    }
    catch(std::logic_error & e)
    {
        std::cerr << e.what() << std::endl;
        return 1;
    }

    try
    {
        VariantReader r;

        if(regions_bed != "")
        {
            r.setRegions(regions_bed.c_str(), true);
        }
        if(targets_bed != "")
        {
            r.setTargets(targets_bed.c_str(), true);
        }

        // Sample indices returned by the reader, one per input file.
        std::vector<int> sids;
        sids.reserve(files.size());
        for(size_t i = 0; i < files.size(); ++i)
        {
            sids.push_back(r.addSample(files[i].c_str(), samples[i].c_str()));
        }

        r.setApplyFilters(apply_filters);

        bool stop_after_chr_change = false;
        if(chr != "")
        {
            r.rewind(chr.c_str(), start);
            stop_after_chr_change = true;
        }

        int64_t rcount = 0;
        int64_t last_end = -1;
        int64_t vars = 0, total_vars = 0;

        struct Breakpoint
        {
            std::string chr;
            int64_t pos;
            int64_t vars;
        };

        std::vector< Breakpoint > breakpoints;

        // Records a break point at `bp` on the current chromosome once more
        // than nvars call positions have accumulated since the previous one.
        const auto add_bp = [&breakpoints, nvars, &chr, &vars, verbose](int64_t bp)
        {
            if (vars > int64_t(nvars))
            {
                if(verbose)
                {
                    std::cerr << "Break point at " << chr << ":" << bp << " (" << vars << " variants)" << "\n";
                }
                breakpoints.push_back(Breakpoint{chr, bp, vars});
                vars = 0;
            }
        };

        std::string firstchr;

        while(r.advance(true, false))
        {
            if(rlimit != -1 && rcount >= rlimit)
            {
                break;
            }

            Variants & v = r.current();

            // Must be evaluated before chr is updated below; otherwise the
            // comparison can never be true.
            const bool chr_changed = (chr != "" && v.chr != chr);

            if(end != -1 && (v.pos > end || chr_changed))
            {
                break;
            }
            if(stop_after_chr_change && chr_changed)
            {
                break;
            }

            if(chr_changed)
            {
                // Positions on the previous chromosome must not suppress
                // break points on the new one.
                last_end = -1;
            }

            chr = v.chr;
            if (firstchr.size() == 0)
            {
                firstchr = chr;
            }

            if(message > 0 && (rcount % message) == 0)
            {
                std::cerr << "From " << chr << ":" << last_end << " ("
                          << breakpoints.size() << " bps, " << vars << " vars)"
                          << " -- " << v << "\n";
            }

            // A position counts when any sample has a genotype other than
            // hom-ref / unknown, or has ambiguous alleles recorded.
            bool call_this_pos = false;

            for(int s : sids)
            {
                Call & c = v.calls[s];
                gttype gtt = getGTType(c);
                if(!(gtt == gt_homref || gtt == gt_unknown))
                {
                    call_this_pos = true;
                    break;
                }
                if (int(v.ambiguous_alleles.size()) > s && !v.ambiguous_alleles[s].empty())
                {
                    call_this_pos = true;
                    break;
                }
            }

            if(!call_this_pos)
            {
                continue;
            }
            vars++;
            total_vars++;

            if(last_end >= 0 && v.pos > last_end + window) // can split here
            {
                add_bp(last_end);
            }
            last_end = std::max(last_end, v.pos + v.len - 1);

            ++rcount;
        }

        // write blocks
        // The file stream lives in this scope so it is closed on every exit
        // path, including exceptions.
        std::ofstream out_file;
        std::ostream * outputfile = &std::cout;

        if(out_bed != "-" && out_bed != "")
        {
            if(verbose)
            {
                std::cerr << "Writing to " << out_bed << "\n";
            }
            out_file.open(out_bed.c_str());
            if(!out_file)
            {
                throw std::runtime_error("Cannot open output file for writing: " + out_bed);
            }
            outputfile = &out_file;
        }

        chr = firstchr;
        // TODO - the correct thing to do here would be to use start = 0
        // but bcftools / htslib don't like bed coordinates with start 0
        // We should fix this in htslib, and then change it here (currently,
        // this will miss variants starting at the first coordinate of the
        // chromosome)
        int64_t block_begin = 1;
        int64_t vpb = 0;
        // Emit a block once it holds more than this many call positions.
        int64_t target_vpb = std::max(int64_t(nvars), total_vars / (2*nblocks));

        if(end <= 0)
        {
            end = std::numeric_limits<int>::max();
        }

        // NOTE(review): a chromosome that contributed no break point gets no
        // output line from this loop -- confirm whether that is intended.
        for (auto const & b : breakpoints)
        {
            if (chr != b.chr)
            {
                // Close the remainder of the previous chromosome.
                *outputfile << chr << "\t" << block_begin << "\t" << std::max(block_begin + window + 1, end) << "\n";
                chr = b.chr;
                block_begin = 1;
                vpb = 0;
            }
            vpb += b.vars;
            if(vpb > target_vpb)
            {
                *outputfile << chr << "\t" << block_begin << "\t" << b.pos + window + 1 << "\n";
                block_begin = b.pos + window + 1;
                vpb = 0;
            }
        }
        if(chr != "")
        {
            *outputfile << chr << "\t" << block_begin << "\t" << std::max(block_begin + window + 1, end) << "\n";
        }
    }
    catch(std::runtime_error & e)
    {
        std::cerr << e.what() << std::endl;
        return 1;
    }
    catch(std::logic_error & e)
    {
        std::cerr << e.what() << std::endl;
        return 1;
    }
    return 0;
}