int main(int argc, char* argv[]) { namespace po = boost::program_options; namespace bf = boost::filesystem; std::vector<std::string> files; std::string output; std::string ref_fasta; std::string regions_bed = ""; std::string targets_bed = ""; // limits std::string chr; int64_t start = -1; int64_t end = -1; int64_t rlimit = -1; int64_t message = -1; bool apply_filters = false; bool leftshift = false; bool trimalleles = false; bool splitalleles = false; int mergebylocation = false; bool uniqalleles = false; bool calls_only = true; bool homref_split = false; bool primitives = false; std::string homref_vcf = ""; bool process_formats = false; try { // Declare the supported options. po::options_description desc("Allowed options"); desc.add_options() ("help,h", "produce help message") ("version", "Show version") ("input-file", po::value<std::vector< std::string> >(), "The input files") ("output-file,o", po::value<std::string>(), "The output file name.") ("reference,r", po::value<std::string>(), "The reference fasta file.") ("location,l", po::value<std::string>(), "Start location.") ("regions,R", po::value<std::string>(), "Use a bed file for getting a subset of regions (traversal via tabix).") ("targets,T", po::value<std::string>(), "Use a bed file for getting a subset of targets (streaming the whole file, ignoring things outside the bed regions).") ("limit-records", po::value<int64_t>(), "Maximum umber of records to process") ("message-every", po::value<int64_t>(), "Print a message every N records.") ("apply-filters,f", po::value<bool>(), "Apply filtering in VCF.") ("leftshift", po::value<bool>(), "Leftshift variant alleles.") ("trimalleles", po::value<bool>(), "Remove unused variant alleles.") ("splitalleles", po::value<bool>(), "Split and sort variant alleles.") ("merge-by-location", po::value<int>(), "Merge calls at the same location.") ("unique-alleles", po::value<bool>(), "Make alleles unique across a single line.") ("homref-split", po::value<bool>(), "Split homref blocks into per-nucleotide blocks.") ("homref-vcf-out", po::value<std::string>(), "Output split homref blocks as BCF/VCF.") ("calls-only", po::value<bool>(), "Remove homref blocks.") ("primitives", po::value<bool>(), "Split complex alleles into primitives via realignment.") ("process-split", po::value<bool>(), "Enables splitalleles, trimalleles, unique-alleles, leftshift.") ("process-full", po::value<bool>(), "Enables splitalleles, trimalleles, unique-alleles, leftshift, mergebylocation.") ("process-formats", po::value<bool>(), "Process GQ/DP/AD format fields.") ; po::positional_options_description popts; popts.add("input-file", -1); po::options_description cmdline_options; cmdline_options .add(desc) ; po::variables_map vm; po::store(po::command_line_parser(argc, argv). options(cmdline_options).positional(popts).run(), vm); po::notify(vm); if (vm.count("version")) { std::cout << "multimerge version " << HAPLOTYPES_VERSION << "\n"; return 0; } if (vm.count("help")) { std::cout << desc << "\n"; return 1; } if (vm.count("input-file")) { files = vm["input-file"].as< std::vector<std::string> >(); } if (vm.count("output-file")) { output = vm["output-file"].as< std::string >(); } if (vm.count("reference")) { ref_fasta = vm["reference"].as< std::string >(); } else { error("Please specify a reference file name."); } if (vm.count("location")) { stringutil::parsePos(vm["location"].as< std::string >(), chr, start, end); } if (vm.count("regions")) { regions_bed = vm["regions"].as< std::string >(); } if (vm.count("targets")) { targets_bed = vm["targets"].as< std::string >(); } if (vm.count("limit-records")) { rlimit = vm["limit-records"].as< int64_t >(); } if (vm.count("message-every")) { message = vm["message-every"].as< int64_t >(); } if (vm.count("apply-filters")) { apply_filters = vm["apply-filters"].as< bool >(); } if (vm.count("leftshift")) { leftshift = vm["leftshift"].as< bool >(); } if (vm.count("trimalleles")) { trimalleles = vm["trimalleles"].as< bool >(); } if (vm.count("splitalleles")) { splitalleles = vm["splitalleles"].as< bool >(); } if (vm.count("merge-by-location")) { mergebylocation = vm["merge-by-location"].as< int >(); } if (vm.count("unique-alleles")) { uniqalleles = vm["unique-alleles"].as< bool >(); } if (vm.count("calls-only")) { calls_only = vm["calls-only"].as< bool >(); } if (vm.count("homref-split")) { homref_split = vm["homref-split"].as< bool >(); } if (vm.count("primitives")) { primitives = vm["primitives"].as< bool >(); } if (vm.count("homref-vcf-out")) { homref_split = 1; homref_vcf = vm["homref-vcf-out"].as< std::string >(); } if (vm.count("process-split")) { homref_split = true; trimalleles = true; splitalleles = true; uniqalleles = true; leftshift = true; calls_only = true; } if (vm.count("process-full")) { homref_split = true; trimalleles = true; splitalleles = true; uniqalleles = true; leftshift = true; calls_only = true; mergebylocation = 2; primitives = true; } if (vm.count("process-formats")) { process_formats = vm["process-formats"].as< bool >(); } if(files.size() == 0) { std::cerr << "Please specify at least one input file / sample.\n"; return 1; } if (output == "") { std::cerr << "Please specify an output file.\n"; return 1; } } catch (po::error & e) { std::cerr << e.what() << "\n"; return 1; } try { VariantReader r; if(regions_bed != "") { r.setRegions(regions_bed.c_str(), true); } if(targets_bed != "") { r.setTargets(targets_bed.c_str(), true); } VariantWriter w(output.c_str(), ref_fasta.c_str()); std::shared_ptr<VariantWriter> p_homref_writer; if (homref_vcf.size() != 0) { p_homref_writer = std::make_shared<VariantWriter>(homref_vcf.c_str(), ref_fasta.c_str()); } w.setWriteFormats(process_formats); if (p_homref_writer) { p_homref_writer->setWriteFormats(process_formats); } r.setApplyFilters(apply_filters); for(std::string const & f : files) { std::vector<std::string> v; stringutil::split(f, v, ":"); std::string filename, sample = ""; // in case someone passes a ":" assert(v.size() > 0); filename = v[0]; if(v.size() > 1) { sample = v[1]; } std::cerr << "Adding file '" << filename << "' / sample '" << sample << "'" << "\n"; r.addSample(filename.c_str(), sample.c_str()); } std::list< std::pair<std::string, std::string> > samples; r.getSampleList(samples); std::set<std::string> samplenames; for (auto const & p : samples) { std::string sname = p.second; if (sname == "") { sname = boost::filesystem::path(p.first).stem().string(); } int i = 1; while (samplenames.count(sname)) { if(p.second == "") { sname = boost::filesystem::path(p.first).stem().string() + "." + std::to_string(i++); } else { sname = p.second + "." + std::to_string(i++); } } samplenames.insert(sname); std::cerr << "Writing '" << p.first << ":" << p.second << "' as sample '" << sname << "'" << "\n"; w.addSample(sname.c_str()); if (p_homref_writer) { p_homref_writer->addSample(sname.c_str()); } } w.addHeader(r); if (p_homref_writer) { p_homref_writer->addHeader(r); } r.rewind(chr.c_str(), start); VariantInput vi( ref_fasta.c_str(), leftshift, // bool leftshift true, // bool refpadding trimalleles, // bool trimalleles = false, splitalleles, // bool splitalleles = false, mergebylocation, // int mergebylocation = false, uniqalleles, // bool uniqalleles = false, calls_only, // bool calls_only = true, homref_split, // bool homref_split = false primitives, // bool primitives = false (bool)p_homref_writer// homref output ); vi.getProcessor().setReader(r, VariantBufferMode::buffer_block, 100); VariantProcessor & proc = vi.getProcessor(); VariantProcessor & proc_homref = vi.getProcessor(VariantInput::homref); int64_t rcount = 0; bool advance1 = proc.advance(); bool advance2 = p_homref_writer ? proc_homref.advance() : false; while(advance1 || advance2) { if(rlimit != -1) { if(rcount >= rlimit) { break; } } if (advance1) { Variants & v = proc.current(); if (chr.size() != 0 && chr != v.chr) { // chromosome changed and location was given => abort while(advance2) { Variants & v = proc_homref.current(); p_homref_writer->put(v); advance2 = homref_split ? proc_homref.advance() : false; } break; } if(end != -1 && v.pos > end) { break; } w.put(v); if(message > 0 && (rcount % message) == 0) { std::cout << stringutil::formatPos(v.chr.c_str(), v.pos) << ": " << v << "\n"; } } // make sure our homref variant output doesn't fill up all memory while(advance2 && p_homref_writer) { Variants & v = proc_homref.current(); p_homref_writer->put(v); advance2 = proc_homref.advance(); } advance1 = proc.advance(); ++rcount; } } catch(std::runtime_error & e) { std::cerr << e.what() << std::endl; return 1; } catch(std::logic_error & e) { std::cerr << e.what() << std::endl; return 1; } return 0; }