/** Construct a GenomicRegion from string-valued chromosome and positions.
 *
 * The strand is set to '*'. Positions are parsed from tpos1 / tpos2.
 * The chromosome id is resolved through hdr when hdr is non-empty; otherwise
 * a "standard" naming is assumed, where "X"/"chrX" maps to 22, "Y"/"chrY"
 * maps to 23, and any other name is parsed as a 1-based number and stored
 * 0-based (so "1" / "chr1" becomes 0).
 *
 * @param tchr  Chromosome name (e.g. "1", "chr1", "X")
 * @param tpos1 Start position, as a decimal string
 * @param tpos2 End position, as a decimal string
 * @param hdr   Header used to map tchr to an id; may be empty
 * @exception std::invalid_argument / std::out_of_range from std::stoi when
 *            built with HAVE_C11 and a string cannot be converted. Without
 *            HAVE_C11, std::atoi is used, which does not throw and yields 0
 *            when no conversion can be performed.
 */
GenomicRegion::GenomicRegion(const std::string& tchr, const std::string& tpos1, const std::string& tpos2, const SeqLib::BamHeader& hdr) {

  strand = '*';

  // convert the pos strings
#ifdef HAVE_C11
  // throws invalid_argument if conversion can't be performed
  // or throws an out_of_range if it is too big for result
  pos1 = std::stoi(tpos1);
  pos2 = std::stoi(tpos2);
#else
  pos1 = std::atoi(tpos1.c_str());
  pos2 = std::atoi(tpos2.c_str());
#endif

  // a header is available: let it resolve the chromosome name
  if (!hdr.isEmpty()) {
    chr = hdr.Name2ID(tchr); //bam_name2id(hdr.get(), tchr.c_str());
    return;
  }

  // if no header, assume that it is "standard"
  if (tchr == "X" || tchr == "chrX") {
    chr = 22;
  } else if (tchr == "Y" || tchr == "chrY") {
    chr = 23;
  } else {
    // NOTE(review): scrubString presumably removes the "chr" prefix - confirm
    const std::string chr_number = SeqLib::scrubString(tchr, "chr");
    // Ids are 0-based (X = 22, Y = 23), so the 1-based chromosome number
    // must be shifted down by one in BOTH build configurations. The atoi
    // fallback previously omitted the "- 1" and was off by one relative to
    // the HAVE_C11 build.
#ifdef HAVE_C11
    chr = std::stoi(chr_number) - 1;
#else
    chr = std::atoi(chr_number.c_str()) - 1;
#endif
  }
}
int runFishhook(int argc, char** argv) { parseFishOptions(argc, argv); if (opt::verbose) { std::cerr << "FishHook Params: " << std::endl << "\tWidth: " << SeqLib::AddCommas(opt::width) << std::endl << "\tEvents: " << opt::events << std::endl << "\tCoverage Mask: " << opt::coverage << std::endl << "\tSlop: " << SeqLib::AddCommas(opt::slop) << std::endl << "\tInterval Tracks: " << std::endl; for (auto& i : opt::interval_files) std::cerr << "\t-- " << i << std::endl; std::cerr << "\tScored Tracks: " << std::endl; for (auto& i : opt::scored_files) std::cerr << "\t-- " << i << std::endl; std::cerr << "\tSequence Features: " << std::endl; for (auto& i : opt::seq_features) std::cerr << "\t-- " << i << std::endl; } // read in the covariate tracks SeqHashMap<std::string, Fractions> intervals; // read a header for info SeqLib::BamReader rdr; if (!rdr.Open(opt::bam)) { std::cerr << "Error: Could not read BAM supplied by -b: " << opt::bam << std::endl; exit(EXIT_FAILURE); } hdr = rdr.Header(); // read in the reference genome SeqLib::RefGenome ref; if (!ref.LoadIndex(opt::refgenome)) { if (opt::seq_features.size()) { std::cerr << "Error: Could not read referene genome supplied by -G: " << opt::refgenome << std::endl; exit(EXIT_FAILURE); } } // read in the events if (opt::verbose) std::cerr << "...reading events " << opt::events << std::endl; EventList events; if (!events.readFromBed(opt::events, hdr)) { std::cerr << "Error: Could not read events BED: " << opt::events << std::endl; exit(EXIT_FAILURE); } if (opt::verbose) std::cerr << "...read in " << SeqLib::AddCommas(events.size()) << " events" << std::endl; events.CreateTreeMap(); // create the tiled regions FishHookTiles fish(opt::width, opt::slop, hdr.GetHeaderSequenceVector()); if (opt::verbose) std::cerr << "...constructed " << SeqLib::AddCommas(fish.size()) << " fishhook intervals" << std::endl; fish.CreateTreeMap(); // read the coverage mask SeqLib::GRC cov; if (!opt::coverage.empty()) { if (opt::verbose) std::cerr << 
"...reading coverage mask " << opt::coverage << std::endl; cov.ReadBED(opt::coverage, hdr); if (opt::verbose) std::cerr << "...read in " << SeqLib::AddCommas(cov.size()) << " covered regions " << std::endl; if (!cov.size()) { std::cerr << "Non-empty coverage track read with 0 regions. Check that is non-empty BED" << std::endl; exit(EXIT_FAILURE); } if (opt::verbose) std::cerr << "...creating interval tree map on covered regions and overlapping with tiles" << std::endl; cov.CreateTreeMap(); // find covered amount per tile std::vector<int32_t> q, s; SeqLib::GRC ovlp; // fish is subject if (fish.size() > cov.size()) // do in most efficient order ovlp = cov.FindOverlaps(fish, q, s, false); else ovlp = fish.FindOverlaps(cov, s, q, false); if (opt::verbose) std::cerr << "..." << SeqLib::AddCommas(ovlp.size()) << " regions are covered" << std::endl; // set the amount covered by each for (size_t i = 0; i < ovlp.size(); ++i) { fish[s[i]].covered += (double)ovlp[i].Width() / fish[s[i]].Width(); } // mask the events q.clear(); s.clear(); ovlp.clear(); // events is subject if (events.size() > cov.size()) // do in most efficient order ovlp = cov.FindOverlaps(events, q, s, false); else ovlp = events.FindOverlaps(cov, s, q, false); EventList newe; // set the amount covered by each for (size_t i = 0; i < ovlp.size(); ++i) { newe.add(Event(ovlp[i], events.at(s[i]).id)); } events = newe; events.CreateTreeMap(); if (opt::verbose) std::cerr << "...kept " << SeqLib::AddCommas(events.size()) << " events after mask" << std::endl; } else { for (auto& i : fish) i.covered = 1; // the entire thing is covered if no mask provided } // read in the interval tracks for (auto& i : opt::interval_files) read_track(i, intervals, cov, false); for (auto& i : opt::scored_files) read_track(i, intervals, cov, true); // count events per tile (also de-dupes on patient per bin) fish.CountEvents(events); // overlap the covariates with the tiles for (auto& i : intervals) { fish.AddIntervalCovariate(i.first, 
i.second); } // make the matrix FishModel fm; fm.AddTiles(fish); fm.SetNumThreads(opt::num_threads); fm.EstimateOLS(); fm.CooksDistance(fm.GetOLS()); // write the covariates fish.PrintBEDHeader(std::cout); fish.PrintBED(std::cout, hdr); return 0; }
int main(int argc, char** argv) { int c; FastaReference reference; bool has_ref = false; bool suppress_output = false; bool debug = false; bool isuncompressed = true; int maxiterations = 50; if (argc < 2) { printUsage(argv); exit(1); } while (true) { static struct option long_options[] = { {"help", no_argument, 0, 'h'}, {"debug", no_argument, 0, 'd'}, {"fasta-reference", required_argument, 0, 'f'}, {"max-iterations", required_argument, 0, 'm'}, {"suppress-output", no_argument, 0, 's'}, {"compressed", no_argument, 0, 'c'}, {0, 0, 0, 0} }; int option_index = 0; c = getopt_long (argc, argv, "hdcsf:m:", long_options, &option_index); /* Detect the end of the options. */ if (c == -1) break; switch (c) { case 'f': reference.open(optarg); // will exit on open failure has_ref = true; break; case 'm': maxiterations = atoi(optarg); break; case 'd': debug = true; break; case 's': suppress_output = true; break; case 'c': isuncompressed = false; break; case 'h': printUsage(argv); exit(0); break; case '?': printUsage(argv); exit(1); break; default: abort(); break; } } if (!has_ref) { cerr << "no FASTA reference provided, cannot realign" << endl; exit(1); } BAMSINGLEREADER reader; if (!reader.Open(STDIN)) { cerr << "could not open stdin for reading" << endl; exit(1); } #ifdef HAVE_BAMTOOLS BamWriter writer; if (isuncompressed) { writer.SetCompressionMode(BamWriter::Uncompressed); } if (!suppress_output && !writer.Open("stdout", reader.GetHeaderText(), reader.GetReferenceData())) { cerr << "could not open stdout for writing" << endl; exit(1); } #else SeqLib::BamWriter writer(isuncompressed ? 
SeqLib::SAM : SeqLib::BAM); SeqLib::BamHeader hdr = reader.Header(); if (hdr.isEmpty()) { cerr << "could not open header for input" << endl; exit(1); } writer.SetHeader(hdr); if (!suppress_output && !writer.Open("-")) { cerr << "could not open stdout for writing" << endl; exit(1); } #endif // store the names of all the reference sequences in the BAM file map<int, string> referenceIDToName; REFVEC referenceSequences = reader.GETREFDATA; int i = 0; for (REFVEC::iterator r = referenceSequences.begin(); r != referenceSequences.end(); ++r) { referenceIDToName[i] = r->REFNAME; ++i; } BAMALIGN alignment; while (GETNEXT(reader, alignment)) { DEBUG("--------------------------- read --------------------------" << endl); DEBUG("| " << referenceIDToName[alignment.REFID] << ":" << alignment.POSITION << endl); DEBUG("| " << alignment.QNAME << ":" << alignment.ENDPOSITION << endl); DEBUG("| " << alignment.QNAME << ":" << (alignment.ISMAPPED ? " mapped" : " unmapped") << endl); DEBUG("| " << alignment.QNAME << ":" << " cigar data size: " << alignment.GETCIGAR.size() << endl); DEBUG("--------------------------- realigned --------------------------" << endl); // skip unmapped alignments, as they cannot be left-realigned without CIGAR data if (alignment.ISMAPPED) { int endpos = alignment.ENDPOSITION; int length = endpos - alignment.POSITION + 1; if (alignment.POSITION >= 0 && length > 0) { if (!stablyLeftAlign(alignment, reference.getSubSequence( referenceIDToName[alignment.REFID], alignment.POSITION, length), maxiterations, debug)) { cerr << "unstable realignment of " << alignment.QNAME << " at " << referenceIDToName[alignment.REFID] << ":" << alignment.POSITION << endl << alignment.QUERYBASES << endl; } } } DEBUG("----------------------------------------------------------------" << endl); DEBUG(endl); if (!suppress_output) WRITEALIGNMENT(writer, alignment); } reader.Close(); if (!suppress_output) writer.Close(); return 0; }