int main(int argc, char* argv[]) { namespace po = boost::program_options; namespace bf = boost::filesystem; std::vector<std::string> files; std::string output; std::string ref_fasta; std::string regions_bed = ""; std::string targets_bed = ""; // limits std::string chr; int64_t start = -1; int64_t end = -1; int64_t rlimit = -1; int64_t message = -1; bool apply_filters = false; bool leftshift = false; bool trimalleles = false; bool splitalleles = false; int mergebylocation = false; bool uniqalleles = false; bool calls_only = true; bool homref_split = false; bool primitives = false; std::string homref_vcf = ""; bool process_formats = false; try { // Declare the supported options. po::options_description desc("Allowed options"); desc.add_options() ("help,h", "produce help message") ("version", "Show version") ("input-file", po::value<std::vector< std::string> >(), "The input files") ("output-file,o", po::value<std::string>(), "The output file name.") ("reference,r", po::value<std::string>(), "The reference fasta file.") ("location,l", po::value<std::string>(), "Start location.") ("regions,R", po::value<std::string>(), "Use a bed file for getting a subset of regions (traversal via tabix).") ("targets,T", po::value<std::string>(), "Use a bed file for getting a subset of targets (streaming the whole file, ignoring things outside the bed regions).") ("limit-records", po::value<int64_t>(), "Maximum umber of records to process") ("message-every", po::value<int64_t>(), "Print a message every N records.") ("apply-filters,f", po::value<bool>(), "Apply filtering in VCF.") ("leftshift", po::value<bool>(), "Leftshift variant alleles.") ("trimalleles", po::value<bool>(), "Remove unused variant alleles.") ("splitalleles", po::value<bool>(), "Split and sort variant alleles.") ("merge-by-location", po::value<int>(), "Merge calls at the same location.") ("unique-alleles", po::value<bool>(), "Make alleles unique across a single line.") ("homref-split", po::value<bool>(), "Split homref blocks into per-nucleotide blocks.") ("homref-vcf-out", po::value<std::string>(), "Output split homref blocks as BCF/VCF.") ("calls-only", po::value<bool>(), "Remove homref blocks.") ("primitives", po::value<bool>(), "Split complex alleles into primitives via realignment.") ("process-split", po::value<bool>(), "Enables splitalleles, trimalleles, unique-alleles, leftshift.") ("process-full", po::value<bool>(), "Enables splitalleles, trimalleles, unique-alleles, leftshift, mergebylocation.") ("process-formats", po::value<bool>(), "Process GQ/DP/AD format fields.") ; po::positional_options_description popts; popts.add("input-file", -1); po::options_description cmdline_options; cmdline_options .add(desc) ; po::variables_map vm; po::store(po::command_line_parser(argc, argv). options(cmdline_options).positional(popts).run(), vm); po::notify(vm); if (vm.count("version")) { std::cout << "multimerge version " << HAPLOTYPES_VERSION << "\n"; return 0; } if (vm.count("help")) { std::cout << desc << "\n"; return 1; } if (vm.count("input-file")) { files = vm["input-file"].as< std::vector<std::string> >(); } if (vm.count("output-file")) { output = vm["output-file"].as< std::string >(); } if (vm.count("reference")) { ref_fasta = vm["reference"].as< std::string >(); } else { error("Please specify a reference file name."); } if (vm.count("location")) { stringutil::parsePos(vm["location"].as< std::string >(), chr, start, end); } if (vm.count("regions")) { regions_bed = vm["regions"].as< std::string >(); } if (vm.count("targets")) { targets_bed = vm["targets"].as< std::string >(); } if (vm.count("limit-records")) { rlimit = vm["limit-records"].as< int64_t >(); } if (vm.count("message-every")) { message = vm["message-every"].as< int64_t >(); } if (vm.count("apply-filters")) { apply_filters = vm["apply-filters"].as< bool >(); } if (vm.count("leftshift")) { leftshift = vm["leftshift"].as< bool >(); } if (vm.count("trimalleles")) { trimalleles = vm["trimalleles"].as< bool >(); } if (vm.count("splitalleles")) { splitalleles = vm["splitalleles"].as< bool >(); } if (vm.count("merge-by-location")) { mergebylocation = vm["merge-by-location"].as< int >(); } if (vm.count("unique-alleles")) { uniqalleles = vm["unique-alleles"].as< bool >(); } if (vm.count("calls-only")) { calls_only = vm["calls-only"].as< bool >(); } if (vm.count("homref-split")) { homref_split = vm["homref-split"].as< bool >(); } if (vm.count("primitives")) { primitives = vm["primitives"].as< bool >(); } if (vm.count("homref-vcf-out")) { homref_split = 1; homref_vcf = vm["homref-vcf-out"].as< std::string >(); } if (vm.count("process-split")) { homref_split = true; trimalleles = true; splitalleles = true; uniqalleles = true; leftshift = true; calls_only = true; } if (vm.count("process-full")) { homref_split = true; trimalleles = true; splitalleles = true; uniqalleles = true; leftshift = true; calls_only = true; mergebylocation = 2; primitives = true; } if (vm.count("process-formats")) { process_formats = vm["process-formats"].as< bool >(); } if(files.size() == 0) { std::cerr << "Please specify at least one input file / sample.\n"; return 1; } if (output == "") { std::cerr << "Please specify an output file.\n"; return 1; } } catch (po::error & e) { std::cerr << e.what() << "\n"; return 1; } try { VariantReader r; if(regions_bed != "") { r.setRegions(regions_bed.c_str(), true); } if(targets_bed != "") { r.setTargets(targets_bed.c_str(), true); } VariantWriter w(output.c_str(), ref_fasta.c_str()); std::shared_ptr<VariantWriter> p_homref_writer; if (homref_vcf.size() != 0) { p_homref_writer = std::make_shared<VariantWriter>(homref_vcf.c_str(), ref_fasta.c_str()); } w.setWriteFormats(process_formats); if (p_homref_writer) { p_homref_writer->setWriteFormats(process_formats); } r.setApplyFilters(apply_filters); for(std::string const & f : files) { std::vector<std::string> v; stringutil::split(f, v, ":"); std::string filename, sample = ""; // in case someone passes a ":" assert(v.size() > 0); filename = v[0]; if(v.size() > 1) { sample = v[1]; } std::cerr << "Adding file '" << filename << "' / sample '" << sample << "'" << "\n"; r.addSample(filename.c_str(), sample.c_str()); } std::list< std::pair<std::string, std::string> > samples; r.getSampleList(samples); std::set<std::string> samplenames; for (auto const & p : samples) { std::string sname = p.second; if (sname == "") { sname = boost::filesystem::path(p.first).stem().string(); } int i = 1; while (samplenames.count(sname)) { if(p.second == "") { sname = boost::filesystem::path(p.first).stem().string() + "." + std::to_string(i++); } else { sname = p.second + "." + std::to_string(i++); } } samplenames.insert(sname); std::cerr << "Writing '" << p.first << ":" << p.second << "' as sample '" << sname << "'" << "\n"; w.addSample(sname.c_str()); if (p_homref_writer) { p_homref_writer->addSample(sname.c_str()); } } w.addHeader(r); if (p_homref_writer) { p_homref_writer->addHeader(r); } r.rewind(chr.c_str(), start); VariantInput vi( ref_fasta.c_str(), leftshift, // bool leftshift true, // bool refpadding trimalleles, // bool trimalleles = false, splitalleles, // bool splitalleles = false, mergebylocation, // int mergebylocation = false, uniqalleles, // bool uniqalleles = false, calls_only, // bool calls_only = true, homref_split, // bool homref_split = false primitives, // bool primitives = false (bool)p_homref_writer// homref output ); vi.getProcessor().setReader(r, VariantBufferMode::buffer_block, 100); VariantProcessor & proc = vi.getProcessor(); VariantProcessor & proc_homref = vi.getProcessor(VariantInput::homref); int64_t rcount = 0; bool advance1 = proc.advance(); bool advance2 = p_homref_writer ? proc_homref.advance() : false; while(advance1 || advance2) { if(rlimit != -1) { if(rcount >= rlimit) { break; } } if (advance1) { Variants & v = proc.current(); if (chr.size() != 0 && chr != v.chr) { // chromosome changed and location was given => abort while(advance2) { Variants & v = proc_homref.current(); p_homref_writer->put(v); advance2 = homref_split ? proc_homref.advance() : false; } break; } if(end != -1 && v.pos > end) { break; } w.put(v); if(message > 0 && (rcount % message) == 0) { std::cout << stringutil::formatPos(v.chr.c_str(), v.pos) << ": " << v << "\n"; } } // make sure our homref variant output doesn't fill up all memory while(advance2 && p_homref_writer) { Variants & v = proc_homref.current(); p_homref_writer->put(v); advance2 = proc_homref.advance(); } advance1 = proc.advance(); ++rcount; } } catch(std::runtime_error & e) { std::cerr << e.what() << std::endl; return 1; } catch(std::logic_error & e) { std::cerr << e.what() << std::endl; return 1; } return 0; }
int main(int argc, char* argv[]) { namespace po = boost::program_options; std::string ref_fasta; std::string chr = ""; int64_t start = -1; int64_t end = -1; std::string file1; std::string sample1; std::string file2; std::string sample2; std::string regions_bed = ""; std::string targets_bed = ""; std::string out_vcf = ""; std::string out_errors = ""; // = max 12 unphased hets in segment int64_t blimit = -1; bool progress = false; int progress_seconds = 10; int max_n_haplotypes = 4096; int64_t hb_window = 30; int64_t hb_expand = 30; bool apply_filters_query = false; bool apply_filters_truth = true; bool preprocess = false; bool leftshift = false; bool always_hapcmp = false; bool no_hapcmp = false; bool compare_raw = false; bool output_roc_vals = false; try { // Declare the supported options. po::options_description desc("Allowed options"); desc.add_options() ("help,h", "produce help message") ("version", "Show version") ("input-vcfs", po::value<std::vector<std::string> >(), "Two VCF files to compare (use file:sample for a specific sample column).") ("output-vcf,o", po::value<std::string>(), "Output variant comparison results to VCF.") ("output-errors,e", po::value<std::string>(), "Output failure information.") ("reference,r", po::value<std::string>(), "The reference fasta file.") ("location,l", po::value<std::string>(), "The location to start at.") ("regions,R", po::value<std::string>(), "Use a bed file for getting a subset of regions (traversal via tabix).") ("targets,T", po::value<std::string>(), "Use a bed file for getting a subset of targets (streaming the whole file, ignoring things outside the bed regions).") ("progress", po::value<bool>(), "Set to true to output progress information.") ("progress-seconds", po::value<int>(), "Output progress information every n seconds.") ("window,w", po::value<int64_t>(), "Overlap window to create haplotype blocks.") ("max-n-haplotypes,n", po::value<int>(), "Maximum number of haplotypes to enumerate.") ("expand-hapblocks", po::value<int64_t>(), "Number of bases to expand around each haplotype block.") ("limit", po::value<int64_t>(), "Maximum number of haplotype blocks to process.") ("apply-filters-truth", po::value<bool>(), "Apply filtering in truth VCF (on by default).") ("apply-filters-query,f", po::value<bool>(), "Apply filtering in query VCF (off by default).") ("preprocess-variants,V", po::value<bool>(), "Apply variant normalisations, trimming, realignment for complex variants (off by default).") ("leftshift", po::value<bool>(), "Left-shift indel alleles (off by default).") ("always-hapcmp", po::value<bool>(), "Always compare haplotype blocks (even if they match). Testing use only/slow.") ("compare-raw", po::value<bool>(), "Compare raw calls also to maximize chances of matching difficult regions.") ("no-hapcmp", po::value<bool>(), "Disable haplotype comparison. This overrides all other haplotype comparison options.") ("roc-vals", po::value<bool>(), "Output GQX and qual values for truth and query in INFO (which gets preserved through quantify).") ; po::positional_options_description popts; popts.add("input-vcfs", 2); po::options_description cmdline_options; cmdline_options .add(desc) ; po::variables_map vm; po::store(po::command_line_parser(argc, argv). options(cmdline_options).positional(popts).run(), vm); po::notify(vm); if (vm.count("version")) { std::cout << "xcmp version " << HAPLOTYPES_VERSION << "\n"; return 0; } if (vm.count("help")) { std::cout << desc << "\n"; return 1; } if (vm.count("input-vcfs")) { std::vector<std::string> vr = vm["input-vcfs"].as< std::vector<std::string> >(); if(vr.size() != 2) { error("Please pass exactly two vcf file names for comparison."); } std::vector<std::string> v; stringutil::split(vr[0], v, ":"); // in case someone passes a ":" assert(v.size() > 0); file1 = v[0]; sample1 = ""; if(v.size() > 1) { sample1 = v[1]; } v.clear(); stringutil::split(vr[1], v, ":"); // in case someone passes a ":" assert(v.size() > 0); file2 = v[0]; sample2 = ""; if(v.size() > 1) { sample2 = v[1]; } } if (vm.count("output-vcf")) { out_vcf = vm["output-vcf"].as< std::string >(); } if (vm.count("output-errors")) { out_errors = vm["output-errors"].as< std::string >(); } if (vm.count("preprocess-variants")) { preprocess = vm["preprocess-variants"].as< bool >(); } if (vm.count("leftshift")) { leftshift = vm["leftshift"].as< bool >(); } if (vm.count("location")) { stringutil::parsePos(vm["location"].as< std::string >(), chr, start, end); } if (vm.count("regions")) { regions_bed = vm["regions"].as< std::string >(); } if (vm.count("targets")) { targets_bed = vm["targets"].as< std::string >(); } if (vm.count("reference")) { ref_fasta = vm["reference"].as< std::string >(); } else if(preprocess) { error("Please specify a reference file name."); } if (vm.count("max-n-haplotypes")) { max_n_haplotypes = vm["max-n-haplotypes"].as< int >(); } if (vm.count("limit")) { blimit = vm["limit"].as< int64_t >(); } if (vm.count("expand-hapblocks")) { hb_expand = vm["expand-hapblocks"].as< int64_t >(); } if (vm.count("window")) { hb_window = vm["window"].as< int64_t >(); } if (vm.count("progress")) { progress = vm["progress"].as< bool >(); } if (vm.count("progress-seconds")) { progress_seconds = vm["progress-seconds"].as< int >(); } if (vm.count("apply-filters-truth")) { apply_filters_truth = vm["apply-filters-truth"].as< bool >(); } if (vm.count("apply-filters-query")) { apply_filters_query = vm["apply-filters-query"].as< bool >(); } if (vm.count("always-hapcmp")) { always_hapcmp = vm["always-hapcmp"].as< bool >(); } if (vm.count("no-hapcmp")) { no_hapcmp = vm["no-hapcmp"].as< bool >(); } if (vm.count("compare-raw")) { compare_raw = vm["compare-raw"].as< bool >(); } if (vm.count("roc-vals")) { output_roc_vals = vm["roc-vals"].as< bool >(); } } catch (po::error & e) { std::cerr << e.what() << "\n"; return 1; } catch(std::runtime_error & e) { std::cerr << e.what() << std::endl; return 1; } catch(std::logic_error & e) { std::cerr << e.what() << std::endl; return 1; } try { VariantReader vr; vr.setReturnHomref(false); if(regions_bed != "") { vr.setRegions(regions_bed.c_str(), true); } if(targets_bed != "") { vr.setTargets(targets_bed.c_str(), true); } int r1 = vr.addSample(file1.c_str(), sample1.c_str()); int r2 = vr.addSample(file2.c_str(), sample2.c_str()); vr.setApplyFilters(apply_filters_truth, r1); vr.setApplyFilters(apply_filters_query, r2); VariantInput vi( ref_fasta.c_str(), preprocess || leftshift, // bool leftshift false, // bool refpadding true, // bool trimalleles = false, (remove unused alleles) preprocess || leftshift, // bool splitalleles = false, ( preprocess || leftshift ) ? 2 : 0, // int mergebylocation = false, true, // bool uniqalleles = false, true, // bool calls_only = true, false, // bool homref_split = false // this is handled by calls_only preprocess, // bool primitives = false false, // bool homref_output leftshift ? hb_window-1 : 0, // int64_t leftshift_limit compare_raw // collect_raw ); VariantProcessor & vp = vi.getProcessor(); std::unique_ptr<VariantAlleleRemover> p_raw_allele_remover; std::unique_ptr<VariantAlleleSplitter> p_raw_allele_splitter; std::unique_ptr<VariantAlleleNormalizer> p_raw_allele_normalizer; std::unique_ptr<VariantLocationAggregator> p_raw_aggregator; std::unique_ptr<VariantAlleleUniq> p_raw_allele_uniq; std::unique_ptr<VariantCallsOnly> p_raw_callsonly; if(compare_raw) { p_raw_allele_remover = std::move(std::unique_ptr<VariantAlleleRemover>(new VariantAlleleRemover())); vi.getProcessor(VariantInput::raw).addStep(*p_raw_allele_remover); p_raw_allele_splitter = std::move(std::unique_ptr<VariantAlleleSplitter>(new VariantAlleleSplitter())); vi.getProcessor(VariantInput::raw).addStep(*p_raw_allele_splitter); if(leftshift) { p_raw_allele_normalizer = std::move(std::unique_ptr<VariantAlleleNormalizer>(new VariantAlleleNormalizer())); p_raw_allele_normalizer->setReference(ref_fasta); p_raw_allele_normalizer->setEnableRefPadding(false); p_raw_allele_normalizer->setLeftshiftLimit(hb_window - 1); p_raw_allele_normalizer->setEnableHomrefVariants(false); vi.getProcessor(VariantInput::raw).addStep(*p_raw_allele_normalizer); } p_raw_aggregator = std::move(std::unique_ptr<VariantLocationAggregator>(new VariantLocationAggregator())); vi.getProcessor(VariantInput::raw).addStep(*p_raw_aggregator); p_raw_allele_uniq = std::move(std::unique_ptr<VariantAlleleUniq>(new VariantAlleleUniq())); vi.getProcessor(VariantInput::raw).addStep(*p_raw_allele_uniq); p_raw_callsonly = std::move(std::unique_ptr<VariantCallsOnly>(new VariantCallsOnly())); vi.getProcessor(VariantInput::raw).addStep(*p_raw_callsonly); } vp.setReader(vr, VariantBufferMode::buffer_block, 10*hb_window); bool stop_after_chr_change = false; if(chr != "") { vp.rewind(chr.c_str(), start); stop_after_chr_change = true; } std::unique_ptr<VariantWriter> pvw; if (out_vcf != "") { pvw = std::move(std::unique_ptr<VariantWriter> (new VariantWriter(out_vcf.c_str(), ref_fasta.c_str()))); pvw->addHeader(vr); pvw->addHeader("##INFO=<ID=gtt1,Number=1,Type=String,Description=\"GT of truth call\">"); pvw->addHeader("##INFO=<ID=gtt2,Number=1,Type=String,Description=\"GT of query call\">"); pvw->addHeader("##INFO=<ID=type,Number=1,Type=String,Description=\"Decision for call (TP/FP/FN/N)\">"); pvw->addHeader("##INFO=<ID=kind,Number=1,Type=String,Description=\"Sub-type for decision (match/mismatch type)\">"); pvw->addHeader("##INFO=<ID=ctype,Number=1,Type=String,Description=\"Type of comparison performed\">"); pvw->addHeader("##INFO=<ID=HapMatch,Number=0,Type=Flag,Description=\"Variant is in matching haplotype block\">"); if(output_roc_vals) { pvw->addHeader("##INFO=<ID=T_GQ,Number=1,Type=Float,Description=\"GQ field in truth VCF.\">"); pvw->addHeader("##INFO=<ID=Q_GQ,Number=1,Type=Float,Description=\"GQ field in query VCF.\">"); pvw->addHeader("##INFO=<ID=T_DP,Number=1,Type=Float,Description=\"DP field in truth VCF.\">"); pvw->addHeader("##INFO=<ID=Q_DP,Number=1,Type=Float,Description=\"DP field in query VCF.\">"); pvw->addHeader("##INFO=<ID=T_QUAL,Number=1,Type=Float,Description=\"Qual column in truth VCF.\">"); pvw->addHeader("##INFO=<ID=Q_QUAL,Number=1,Type=Float,Description=\"Qual column in query VCF.\">"); } pvw->addSample("TRUTH"); pvw->addSample("QUERY"); } std::ostream * error_out_stream = NULL; if(out_errors == "-") { error_out_stream = &std::cerr; } else if(out_errors != "") { error_out_stream = new std::ofstream(out_errors.c_str()); } DiploidCompare hc(ref_fasta.c_str()); hc.setMaxHapEnum(max_n_haplotypes); hc.setDoAlignments(false); int64_t nhb = 0; int64_t last_pos = std::numeric_limits<int64_t>::max(); // hap-block status + update std::list<Variants> block_variants; int64_t block_start = -1; int64_t block_end = -1; int n_nonsnp = 0, calls_1 = 0, calls_2 = 0; bool has_mismatch = false; const auto finish_block = [&vi, &block_variants, r1, r2, &chr, &block_start, &block_end, &n_nonsnp, &calls_1, &calls_2, &has_mismatch, &pvw, &error_out_stream, &hc, hb_expand, no_hapcmp, always_hapcmp, compare_raw] () { bool hap_match = false, hap_fail = false, hap_run = false, raw_match = false; // try HC if we have mismatches, and if the number of calls is > 0 if (!no_hapcmp && (always_hapcmp || (has_mismatch && calls_1 > 0 && calls_2 > 0 && n_nonsnp > 0))) { if(compare_raw) { std::list<Variants> raw_variants; while(vi.getProcessor(VariantInput::raw).advance()) { Variants & vars = vi.getProcessor(VariantInput::raw).current(); raw_variants.push_back(vars); } try { hap_run = true; hap_fail = true; hc.setRegion(chr.c_str(), std::max(int64_t(0), block_start-hb_expand), block_end + hb_expand, raw_variants, r1, r2); DiploidComparisonResult const & hcr = hc.getResult(); #ifdef DEBUG_XCMP std::cerr << chr << ":" << block_start << "-" << block_end << " variants: " << "\n"; for(auto const & x : block_variants) { std::cerr << x << "\n"; } std::cerr << "Block result: " << "\n"; std::cerr << hcr << "\n"; #endif raw_match = hap_match = hcr.outcome == dco_match; hap_fail = !(hcr.outcome == dco_match || hcr.outcome == dco_mismatch); } catch(std::runtime_error &e) { } catch(std::logic_error &e) { } } if(!hap_match) { try { hap_run = true; hap_fail = true; hc.setRegion(chr.c_str(), std::max(int64_t(0), block_start-hb_expand), block_end + hb_expand, block_variants, r1, r2); DiploidComparisonResult const & hcr = hc.getResult(); #ifdef DEBUG_XCMP std::cerr << chr << ":" << block_start << "-" << block_end << " variants: " << "\n"; for(auto const & x : block_variants) { std::cerr << x << "\n"; } std::cerr << "Block result: " << "\n"; std::cerr << hcr << "\n"; #endif hap_match = hcr.outcome == dco_match; hap_fail = !(hcr.outcome == dco_match || hcr.outcome == dco_mismatch); } catch(std::runtime_error &e) { if (error_out_stream) { *error_out_stream << chr << "\t" << block_start << "\t" << block_end+1 << "\t" << "hap_error\t" << e.what() << "\n"; } } catch(std::logic_error &e) { if (error_out_stream) { *error_out_stream << chr << "\t" << block_start << "\t" << block_end+1 << "\t" << "hap_error\t" << e.what() << "\n"; } } } } std::string result; if(hap_run) { if (hap_fail) { result = "hapfail:"; } else if(has_mismatch) { result = "hap:"; } else { result = "simple:"; } } else { result = "simple:"; } if(hap_run && !has_mismatch && !hap_match) { result += "suspicious_simple_match"; } else if(always_hapcmp && hap_match && ((calls_1 == 0 && calls_2 > 0) || (calls_1 > 0 && calls_2 == 0))) { bool any_filtered = false; for (Variants const & v : block_variants) { for (Call const & c : v.calls) { for (size_t i = 0; i < c.nfilter; ++i) { if(c.filter[i] != "PASS" && c.filter[i] != ".") { any_filtered = true; break; } } } if(any_filtered) { break; } } if(any_filtered) { result += "match_ignoring_filtered"; } else { result += "suspicious_hap_match"; } } else if(hap_match || !has_mismatch) { result += "match"; } else { result += "mismatch"; } if(raw_match) { result += "_raw"; } if(error_out_stream) { *error_out_stream << chr << "\t" << block_start << "\t" << block_end+1 << "\t" << result << "\t" << has_mismatch << ":" << hap_match << ":" << hap_fail << ":" << calls_1 << ":" << calls_2 << ":" << n_nonsnp << "\n"; } if (pvw) { for (Variants & v : block_variants) { if(v.info != "") { v.info += ";"; } v.info += std::string("ctype=") + result; if (hap_match) { v.info += ";HapMatch"; } pvw->put(v); } } block_variants.clear(); block_start = -1; block_end = -1; n_nonsnp = 0; calls_1 = 0; calls_2 = 0; has_mismatch = false; }; auto start_time = std::chrono::high_resolution_clock::now(); auto last_time = std::chrono::high_resolution_clock::now(); while(vp.advance()) { if(blimit > 0 && nhb++ > blimit) { // reached record limit break; } Variants & v = vp.current(); if(end != -1 && (v.pos > end || (chr.size() != 0 && chr != v.chr))) { // reached end break; } if(stop_after_chr_change && chr.size() != 0 && chr != v.chr) { // reached end of chr break; } if(chr.size() == 0) { chr = v.chr; } if (v.chr != chr || (block_end > 0 && block_end + hb_window < v.pos)) { finish_block(); } chr = v.chr; if (block_start < 0) { block_start = v.pos; } else { block_start = std::min(block_start, v.pos); } if (block_end < 0) { block_end = v.pos + v.len - 1; } else { block_end = std::max(v.pos + v.len - 1, block_end); } if(compareVariants(v, r1, r2, n_nonsnp, calls_1, calls_2) != dco_match) { has_mismatch = true; } if(output_roc_vals) { if(!v.info.empty()) { v.info += ";"; } v.info += "T_GQ=" + std::to_string(v.calls[r1].gq); v.info += ";Q_GQ=" + std::to_string(v.calls[r2].gq); v.info += ";T_DP=" + std::to_string(v.calls[r1].dp); v.info += ";Q_DP=" + std::to_string(v.calls[r2].dp); v.info += ";T_QUAL=" + std::to_string(v.calls[r1].qual); v.info += ";Q_QUAL=" + std::to_string(v.calls[r2].qual); } block_variants.push_back(v); #ifdef DEBUG_XCMP std::cerr << v << "\n"; std::cerr << "block_start : " << block_start << "\t" << "block_end : " << block_end << "\t" << "block_size : " << block_variants.size() << "\t" << "n_nonsnp : " << n_nonsnp << "\t" << "calls_1 : " << calls_1 << "\t" << "calls_2 : " << calls_2 << "\t" << "\n"; #endif if(progress) { using namespace std; auto end_time = chrono::high_resolution_clock::now(); auto secs = chrono::duration_cast<chrono::seconds>(end_time - last_time).count(); if(secs > progress_seconds) { auto secs_since_start = chrono::duration_cast<chrono::seconds>(end_time - start_time).count(); std::string mbps = ""; if(last_pos < v.pos) { mbps = " mpbs: "; mbps += std::to_string(double(v.pos - last_pos) / double(secs_since_start) * 1e-6); } else { last_pos = v.pos; } last_time = end_time; std::cerr << "[PROGRESS] Total time: " << secs_since_start << "s Pos: " << v.pos << mbps << "\n"; } } } #ifdef DEBUG_XCMP std::cerr << "END\n"; std::cerr << "block_start : " << block_start << "\t" << "block_end : " << block_end << "\t" << "block_size : " << block_variants.size() << "\t" << "n_nonsnp : " << n_nonsnp << "\t" << "calls_1 : " << calls_1 << "\t" << "calls_2 : " << calls_2 << "\t" << "\n"; #endif finish_block(); if(error_out_stream && out_errors != "-") { delete error_out_stream; } } catch(std::runtime_error &e) { std::cerr << e.what() << std::endl; return 1; } catch(std::logic_error &e) { std::cerr << e.what() << std::endl; return 1; } return 0; }
int main(int argc, char* argv[]) { namespace po = boost::program_options; namespace bf = boost::filesystem; std::string input_dir; std::string output_vcf; std::string ref_fasta; try { // Declare the supported options. po::options_description desc("Allowed options"); desc.add_options() ("help,h", "produce help message") ("version", "Show version") ("input-dir", po::value<std::string>(), "Path to a vcfeval output directory.") ("output-vcf", po::value<std::string>(), "Annotated VCF output file.") ("reference,r", po::value<std::string>(), "The reference fasta file.") ; po::positional_options_description popts; popts.add("input-dir", 1); popts.add("output-vcf", 1); po::options_description cmdline_options; cmdline_options .add(desc) ; po::variables_map vm; po::store(po::command_line_parser(argc, argv). options(cmdline_options).positional(popts).run(), vm); po::notify(vm); if (vm.count("version")) { std::cout << "postvcfeval version " << HAPLOTYPES_VERSION << "\n"; return 0; } if (vm.count("help")) { std::cout << desc << "\n"; return 1; } if (vm.count("input-dir")) { input_dir = vm["input-dir"].as<std::string>(); } else { error("Please specify an input directory"); } if (vm.count("output-vcf")) { output_vcf = vm["output-vcf"].as< std::string >(); } if (vm.count("reference")) { ref_fasta = vm["reference"].as< std::string >(); } else { error("To write an output VCF, you need to specify a reference file, too."); } if (output_vcf == "") { std::cerr << "Please specify an output file.\n"; return 1; } } catch (po::error & e) { std::cerr << e.what() << "\n"; return 1; } try { VariantReader r; r.setApplyFilters(false); boost::filesystem::path p(input_dir); int in_ix_fp = r.addSample((p / "fp.vcf.gz").c_str(), ""); int in_ix_fn = r.addSample((p / "fn.vcf.gz").c_str(), ""); int in_ix_tp = r.addSample((p / "tp.vcf.gz").c_str(), ""); int in_ix_tpb = r.addSample((p / "tp-baseline.vcf.gz").c_str(), ""); std::unique_ptr<VariantWriter> writer = std::make_unique<VariantWriter>(output_vcf.c_str(), ref_fasta.c_str()); writer->addSample("TRUTH"); writer->addSample("QUERY"); writer->addHeader(r); writer->addHeader("##INFO=<ID=type,Number=1,Type=String,Description=\"Decision for call (TP/FP/FN/N)\">"); writer->addHeader("##INFO=<ID=kind,Number=1,Type=String,Description=\"Sub-type for decision (match/mismatch type)\">"); int64_t rcount = 0; while(r.advance()) { Variants & v = r.current(); if(!v.calls[in_ix_fp].isNocall()) { Variants out_vars; out_vars = v; out_vars.calls.clear(); out_vars.calls.resize(2); out_vars.calls[1] = v.calls[in_ix_fp]; if(!out_vars.info.empty()) out_vars.info += ";"; out_vars.info += "type=FP"; out_vars.info += ";kind=missing"; writer->put(out_vars); } if(!v.calls[in_ix_fn].isNocall()) { Variants out_vars; out_vars = v; out_vars.calls.clear(); out_vars.calls.resize(2); out_vars.calls[0] = v.calls[in_ix_fn]; if(!out_vars.info.empty()) out_vars.info += ";"; out_vars.info += "type=FN"; out_vars.info += ";kind=missing"; writer->put(out_vars); } if(!v.calls[in_ix_tp].isNocall() || !v.calls[in_ix_tpb].isNocall()) { Variants out_vars; out_vars = v; out_vars.calls.clear(); out_vars.calls.resize(2); out_vars.calls[0] = v.calls[in_ix_tp]; out_vars.calls[1] = v.calls[in_ix_tpb]; if(!out_vars.info.empty()) out_vars.info += ";"; out_vars.info += "type=TP"; out_vars.info += ";kind=vcfeval"; writer->put(out_vars); } ++rcount; } } catch(std::runtime_error & e) { std::cerr << e.what() << std::endl; return 1; } catch(std::logic_error & e) { std::cerr << e.what() << std::endl; return 1; } return 0; }
int main(int argc, char* argv[]) { namespace po = boost::program_options; std::vector<std::string> files, samples; std::string regions_bed = ""; std::string targets_bed = ""; std::string out_bed = ""; // limits std::string chr = ""; int64_t start = -1; int64_t end = -1; int64_t rlimit = -1; int64_t message = -1; int64_t window = 30; bool apply_filters = false; int nblocks = 32; int nvars = 100; bool verbose = false; try { // Declare the supported options. po::options_description desc("Allowed options"); desc.add_options() ("help,h", "produce help message") ("version", "Show version") ("input-file", po::value<std::vector<std::string> >(), "The input VCF/BCF file(s) (use file:sample to specify a sample)") ("output,o", po::value<std::string>(), "Write a bed file giving the locations of overlapping blocks (use - for stdout).") ("regions,R", po::value<std::string>(), "Use a bed file for getting a subset of regions (traversal via tabix).") ("targets,T", po::value<std::string>(), "Use a bed file for getting a subset of targets (streaming the whole file, ignoring things outside the bed regions).") ("location,l", po::value<std::string>(), "The location / subset.") ("limit-records,L", po::value<int64_t>(), "Maximum number of records to process") ("message-every,m", po::value<int64_t>(), "Print a message every N records.") ("window,w", po::value<int64_t>(), "Overlap window length.") ("nblocks", po::value<int>(), "Maximum number of blocks to break into (32).") ("nvars", po::value<int>(), "Minimum number of variants per block (100).") ("apply-filters,f", po::value<bool>(), "Apply filtering in VCF.") ("verbose", po::value<bool>(), "Verbose output.") ; po::positional_options_description popts; popts.add("input-file", -1); po::options_description cmdline_options; cmdline_options .add(desc) ; po::variables_map vm; po::store(po::command_line_parser(argc, argv). options(cmdline_options).positional(popts).run(), vm); po::notify(vm); if (vm.count("version")) { std::cout << "blocksplit version " << HAPLOTYPES_VERSION << "\n"; return 0; } if (vm.count("help")) { std::cout << desc << "\n"; return 1; } if (vm.count("input-file")) { std::vector<std::string> fs = vm["input-file"].as< std::vector<std::string> >(); for(std::string const & s : fs) { std::vector<std::string> v; stringutil::split(s, v, ":"); std::string filename, sample = ""; // in case someone passes a ":" assert(v.size() > 0); filename = v[0]; if(v.size() > 1) { sample = v[1]; } files.push_back(filename); samples.push_back(sample); } } if(files.size() == 0) { error("Please specify at least one input file."); } if (vm.count("output")) { out_bed = vm["output"].as< std::string >(); } else { out_bed = "-"; } if (vm.count("regions")) { regions_bed = vm["regions"].as< std::string >(); } if (vm.count("targets")) { targets_bed = vm["targets"].as< std::string >(); } if (vm.count("verbose")) { verbose = vm["verbose"].as<bool>(); } if (vm.count("location")) { stringutil::parsePos(vm["location"].as< std::string >(), chr, start, end); } if (vm.count("limit-records")) { rlimit = vm["limit-records"].as< int64_t >(); } if (vm.count("message-every")) { message = vm["message-every"].as< int64_t >(); } if (vm.count("apply-filters")) { apply_filters = vm["apply-filters"].as< bool >(); } if (vm.count("window")) { window = vm["window"].as< int64_t >(); } if (vm.count("nblocks")) { nblocks = vm["nblocks"].as< int >(); } if (vm.count("nvars")) { nvars = vm["nvars"].as< int >(); } } catch (po::error & e) { std::cerr << e.what() << "\n"; return 1; } catch(std::runtime_error & e) { std::cerr << e.what() << std::endl; return 1; } catch(std::logic_error & e) { std::cerr << e.what() << std::endl; return 1; } try { VariantReader r; if(regions_bed != "") { r.setRegions(regions_bed.c_str(), true); } if(targets_bed != "") { r.setTargets(targets_bed.c_str(), true); } std::list<int> sids; for(size_t i = 0; i < files.size(); ++i) { sids.push_back(r.addSample(files[i].c_str(), samples[i].c_str())); } r.setApplyFilters(apply_filters); bool stop_after_chr_change = false; if(chr != "") { r.rewind(chr.c_str(), start); stop_after_chr_change = true; } int64_t rcount = 0; int64_t last_end = -1; int64_t vars = 0, total_vars = 0; struct Breakpoint { std::string chr; int64_t pos; int64_t vars; }; std::list< Breakpoint > breakpoints; const auto add_bp = [&breakpoints, nvars, nblocks, &chr, &vars, verbose](int64_t bp) { if (vars > int64_t(nvars)) { if(verbose) { std::cerr << "Break point at " << chr << ":" << bp << " (" << vars << " variants)" << "\n"; } breakpoints.push_back(Breakpoint{chr, bp, vars}); vars = 0; } }; std::string firstchr; while(r.advance(true, false)) { if(rlimit != -1) { if(rcount >= rlimit) { break; } } Variants & v = r.current(); if(end != -1 && ( (v.pos > end) || (chr != "" && v.chr != chr)) ) { break; } if(stop_after_chr_change && chr != "" && v.chr != chr) { break; } chr = v.chr; if (firstchr.size() == 0) { firstchr = chr; } // if(chr != "" && v.chr != chr) { last_end = -1; } if(message > 0 && (rcount % message) == 0) { std::cerr << "From " << chr << ":" << last_end << " (" << breakpoints.size() << " bps, " << vars << " vars)" << " -- " << v << "\n"; } bool call_this_pos = false; for(int s : sids) { Call & c = v.calls[s]; gttype gtt = getGTType(c); if(!(gtt == gt_homref || gtt == gt_unknown)) { call_this_pos = true; break; } if (int(v.ambiguous_alleles.size()) > s && !v.ambiguous_alleles[s].empty()) { call_this_pos = true; break; } } if(!call_this_pos) { continue; } vars++; total_vars++; if(last_end >= 0 && v.pos > last_end + window) // can split here { add_bp(last_end); } last_end = std::max(last_end, v.pos + v.len - 1); ++rcount; } // write blocks std::ostream * outputfile = NULL; if(out_bed == "-" || out_bed == "") { outputfile = &std::cout; } else { if(verbose) { std::cerr << "Writing to " << out_bed << "\n"; } outputfile = new std::ofstream(out_bed.c_str()); } chr = firstchr; // TODO - the correct thing to do here would be to use start = 0 // but bcftools / htslib don't like bed coordinates with start 0 // We should fix this in htslib, and then change it here (currently, // this will miss variants starting at the first coordinate of the // chromosome) int64_t start = 1; int64_t vpb = 0; int64_t target_vpb = std::max(int64_t(nvars), total_vars / (2*nblocks)); if(end <= 0) { end = std::numeric_limits<int>::max(); } for (auto & b : breakpoints) { if (chr != b.chr) { *outputfile << chr << "\t" << start << "\t" << std::max(start + window + 1, end) << "\n"; chr = b.chr; start = 1; vpb = 0; } vpb += b.vars; if(vpb > target_vpb) { *outputfile << chr << "\t" << start << "\t" << b.pos + window + 1 << "\n"; start = b.pos + window + 1; vpb = 0; } } if(chr != "") { *outputfile << chr << "\t" << start << "\t" << std::max(start + window + 1, end) << "\n"; } if(out_bed != "-" && out_bed != "") { delete outputfile; } } catch(std::runtime_error & e) { std::cerr << e.what() << std::endl; return 1; } catch(std::logic_error & e) { std::cerr << e.what() << std::endl; return 1; } return 0; }