int main(int argc, char* argv[]) { namespace po = boost::program_options; namespace bf = boost::filesystem; std::string input_dir; std::string output_vcf; std::string ref_fasta; try { // Declare the supported options. po::options_description desc("Allowed options"); desc.add_options() ("help,h", "produce help message") ("version", "Show version") ("input-dir", po::value<std::string>(), "Path to a vcfeval output directory.") ("output-vcf", po::value<std::string>(), "Annotated VCF output file.") ("reference,r", po::value<std::string>(), "The reference fasta file.") ; po::positional_options_description popts; popts.add("input-dir", 1); popts.add("output-vcf", 1); po::options_description cmdline_options; cmdline_options .add(desc) ; po::variables_map vm; po::store(po::command_line_parser(argc, argv). options(cmdline_options).positional(popts).run(), vm); po::notify(vm); if (vm.count("version")) { std::cout << "postvcfeval version " << HAPLOTYPES_VERSION << "\n"; return 0; } if (vm.count("help")) { std::cout << desc << "\n"; return 1; } if (vm.count("input-dir")) { input_dir = vm["input-dir"].as<std::string>(); } else { error("Please specify an input directory"); } if (vm.count("output-vcf")) { output_vcf = vm["output-vcf"].as< std::string >(); } if (vm.count("reference")) { ref_fasta = vm["reference"].as< std::string >(); } else { error("To write an output VCF, you need to specify a reference file, too."); } if (output_vcf == "") { std::cerr << "Please specify an output file.\n"; return 1; } } catch (po::error & e) { std::cerr << e.what() << "\n"; return 1; } try { VariantReader r; r.setApplyFilters(false); boost::filesystem::path p(input_dir); int in_ix_fp = r.addSample((p / "fp.vcf.gz").c_str(), ""); int in_ix_fn = r.addSample((p / "fn.vcf.gz").c_str(), ""); int in_ix_tp = r.addSample((p / "tp.vcf.gz").c_str(), ""); int in_ix_tpb = r.addSample((p / "tp-baseline.vcf.gz").c_str(), ""); std::unique_ptr<VariantWriter> writer = std::make_unique<VariantWriter>(output_vcf.c_str(), ref_fasta.c_str()); writer->addSample("TRUTH"); writer->addSample("QUERY"); writer->addHeader(r); writer->addHeader("##INFO=<ID=type,Number=1,Type=String,Description=\"Decision for call (TP/FP/FN/N)\">"); writer->addHeader("##INFO=<ID=kind,Number=1,Type=String,Description=\"Sub-type for decision (match/mismatch type)\">"); int64_t rcount = 0; while(r.advance()) { Variants & v = r.current(); if(!v.calls[in_ix_fp].isNocall()) { Variants out_vars; out_vars = v; out_vars.calls.clear(); out_vars.calls.resize(2); out_vars.calls[1] = v.calls[in_ix_fp]; if(!out_vars.info.empty()) out_vars.info += ";"; out_vars.info += "type=FP"; out_vars.info += ";kind=missing"; writer->put(out_vars); } if(!v.calls[in_ix_fn].isNocall()) { Variants out_vars; out_vars = v; out_vars.calls.clear(); out_vars.calls.resize(2); out_vars.calls[0] = v.calls[in_ix_fn]; if(!out_vars.info.empty()) out_vars.info += ";"; out_vars.info += "type=FN"; out_vars.info += ";kind=missing"; writer->put(out_vars); } if(!v.calls[in_ix_tp].isNocall() || !v.calls[in_ix_tpb].isNocall()) { Variants out_vars; out_vars = v; out_vars.calls.clear(); out_vars.calls.resize(2); out_vars.calls[0] = v.calls[in_ix_tp]; out_vars.calls[1] = v.calls[in_ix_tpb]; if(!out_vars.info.empty()) out_vars.info += ";"; out_vars.info += "type=TP"; out_vars.info += ";kind=vcfeval"; writer->put(out_vars); } ++rcount; } } catch(std::runtime_error & e) { std::cerr << e.what() << std::endl; return 1; } catch(std::logic_error & e) { std::cerr << e.what() << std::endl; return 1; } return 0; }
int main(int argc, char* argv[]) { namespace po = boost::program_options; std::vector<std::string> files, samples; std::string regions_bed = ""; std::string targets_bed = ""; std::string out_bed = ""; // limits std::string chr = ""; int64_t start = -1; int64_t end = -1; int64_t rlimit = -1; int64_t message = -1; int64_t window = 30; bool apply_filters = false; int nblocks = 32; int nvars = 100; bool verbose = false; try { // Declare the supported options. po::options_description desc("Allowed options"); desc.add_options() ("help,h", "produce help message") ("version", "Show version") ("input-file", po::value<std::vector<std::string> >(), "The input VCF/BCF file(s) (use file:sample to specify a sample)") ("output,o", po::value<std::string>(), "Write a bed file giving the locations of overlapping blocks (use - for stdout).") ("regions,R", po::value<std::string>(), "Use a bed file for getting a subset of regions (traversal via tabix).") ("targets,T", po::value<std::string>(), "Use a bed file for getting a subset of targets (streaming the whole file, ignoring things outside the bed regions).") ("location,l", po::value<std::string>(), "The location / subset.") ("limit-records,L", po::value<int64_t>(), "Maximum number of records to process") ("message-every,m", po::value<int64_t>(), "Print a message every N records.") ("window,w", po::value<int64_t>(), "Overlap window length.") ("nblocks", po::value<int>(), "Maximum number of blocks to break into (32).") ("nvars", po::value<int>(), "Minimum number of variants per block (100).") ("apply-filters,f", po::value<bool>(), "Apply filtering in VCF.") ("verbose", po::value<bool>(), "Verbose output.") ; po::positional_options_description popts; popts.add("input-file", -1); po::options_description cmdline_options; cmdline_options .add(desc) ; po::variables_map vm; po::store(po::command_line_parser(argc, argv). options(cmdline_options).positional(popts).run(), vm); po::notify(vm); if (vm.count("version")) { std::cout << "blocksplit version " << HAPLOTYPES_VERSION << "\n"; return 0; } if (vm.count("help")) { std::cout << desc << "\n"; return 1; } if (vm.count("input-file")) { std::vector<std::string> fs = vm["input-file"].as< std::vector<std::string> >(); for(std::string const & s : fs) { std::vector<std::string> v; stringutil::split(s, v, ":"); std::string filename, sample = ""; // in case someone passes a ":" assert(v.size() > 0); filename = v[0]; if(v.size() > 1) { sample = v[1]; } files.push_back(filename); samples.push_back(sample); } } if(files.size() == 0) { error("Please specify at least one input file."); } if (vm.count("output")) { out_bed = vm["output"].as< std::string >(); } else { out_bed = "-"; } if (vm.count("regions")) { regions_bed = vm["regions"].as< std::string >(); } if (vm.count("targets")) { targets_bed = vm["targets"].as< std::string >(); } if (vm.count("verbose")) { verbose = vm["verbose"].as<bool>(); } if (vm.count("location")) { stringutil::parsePos(vm["location"].as< std::string >(), chr, start, end); } if (vm.count("limit-records")) { rlimit = vm["limit-records"].as< int64_t >(); } if (vm.count("message-every")) { message = vm["message-every"].as< int64_t >(); } if (vm.count("apply-filters")) { apply_filters = vm["apply-filters"].as< bool >(); } if (vm.count("window")) { window = vm["window"].as< int64_t >(); } if (vm.count("nblocks")) { nblocks = vm["nblocks"].as< int >(); } if (vm.count("nvars")) { nvars = vm["nvars"].as< int >(); } } catch (po::error & e) { std::cerr << e.what() << "\n"; return 1; } catch(std::runtime_error & e) { std::cerr << e.what() << std::endl; return 1; } catch(std::logic_error & e) { std::cerr << e.what() << std::endl; return 1; } try { VariantReader r; if(regions_bed != "") { r.setRegions(regions_bed.c_str(), true); } if(targets_bed != "") { r.setTargets(targets_bed.c_str(), true); } std::list<int> sids; for(size_t i = 0; i < files.size(); ++i) { sids.push_back(r.addSample(files[i].c_str(), samples[i].c_str())); } r.setApplyFilters(apply_filters); bool stop_after_chr_change = false; if(chr != "") { r.rewind(chr.c_str(), start); stop_after_chr_change = true; } int64_t rcount = 0; int64_t last_end = -1; int64_t vars = 0, total_vars = 0; struct Breakpoint { std::string chr; int64_t pos; int64_t vars; }; std::list< Breakpoint > breakpoints; const auto add_bp = [&breakpoints, nvars, nblocks, &chr, &vars, verbose](int64_t bp) { if (vars > int64_t(nvars)) { if(verbose) { std::cerr << "Break point at " << chr << ":" << bp << " (" << vars << " variants)" << "\n"; } breakpoints.push_back(Breakpoint{chr, bp, vars}); vars = 0; } }; std::string firstchr; while(r.advance(true, false)) { if(rlimit != -1) { if(rcount >= rlimit) { break; } } Variants & v = r.current(); if(end != -1 && ( (v.pos > end) || (chr != "" && v.chr != chr)) ) { break; } if(stop_after_chr_change && chr != "" && v.chr != chr) { break; } chr = v.chr; if (firstchr.size() == 0) { firstchr = chr; } // if(chr != "" && v.chr != chr) { last_end = -1; } if(message > 0 && (rcount % message) == 0) { std::cerr << "From " << chr << ":" << last_end << " (" << breakpoints.size() << " bps, " << vars << " vars)" << " -- " << v << "\n"; } bool call_this_pos = false; for(int s : sids) { Call & c = v.calls[s]; gttype gtt = getGTType(c); if(!(gtt == gt_homref || gtt == gt_unknown)) { call_this_pos = true; break; } if (int(v.ambiguous_alleles.size()) > s && !v.ambiguous_alleles[s].empty()) { call_this_pos = true; break; } } if(!call_this_pos) { continue; } vars++; total_vars++; if(last_end >= 0 && v.pos > last_end + window) // can split here { add_bp(last_end); } last_end = std::max(last_end, v.pos + v.len - 1); ++rcount; } // write blocks std::ostream * outputfile = NULL; if(out_bed == "-" || out_bed == "") { outputfile = &std::cout; } else { if(verbose) { std::cerr << "Writing to " << out_bed << "\n"; } outputfile = new std::ofstream(out_bed.c_str()); } chr = firstchr; // TODO - the correct thing to do here would be to use start = 0 // but bcftools / htslib don't like bed coordinates with start 0 // We should fix this in htslib, and then change it here (currently, // this will miss variants starting at the first coordinate of the // chromosome) int64_t start = 1; int64_t vpb = 0; int64_t target_vpb = std::max(int64_t(nvars), total_vars / (2*nblocks)); if(end <= 0) { end = std::numeric_limits<int>::max(); } for (auto & b : breakpoints) { if (chr != b.chr) { *outputfile << chr << "\t" << start << "\t" << std::max(start + window + 1, end) << "\n"; chr = b.chr; start = 1; vpb = 0; } vpb += b.vars; if(vpb > target_vpb) { *outputfile << chr << "\t" << start << "\t" << b.pos + window + 1 << "\n"; start = b.pos + window + 1; vpb = 0; } } if(chr != "") { *outputfile << chr << "\t" << start << "\t" << std::max(start + window + 1, end) << "\n"; } if(out_bed != "-" && out_bed != "") { delete outputfile; } } catch(std::runtime_error & e) { std::cerr << e.what() << std::endl; return 1; } catch(std::logic_error & e) { std::cerr << e.what() << std::endl; return 1; } return 0; }