/** Construct a GenomicRegion from string-valued chromosome and positions.
 *
 * @param tchr  Chromosome name, e.g. "1", "chr1", "X" or "chrY".
 * @param tpos1 Start position as a decimal string.
 * @param tpos2 End position as a decimal string.
 * @param hdr   Header used to translate tchr into a chromosome ID via
 *              Name2ID(). If it is empty, a "standard" layout is assumed:
 *              numeric chromosomes map to (number - 1), X maps to 22 and
 *              Y maps to 23.
 *
 * The strand is always set to '*'.
 *
 * With HAVE_C11 the conversions use std::stoi, which throws
 * std::invalid_argument if no conversion can be performed and
 * std::out_of_range if the value does not fit in an int. Without HAVE_C11
 * the conversions use std::atoi, which does not throw and yields 0 for
 * unparsable input (so an unparsable chromosome name becomes -1).
 */
GenomicRegion::GenomicRegion(const std::string& tchr, const std::string& tpos1, const std::string& tpos2, const SeqLib::BamHeader& hdr)
  {
    strand = '*';

    // convert the position strings
#ifdef HAVE_C11
    pos1 = std::stoi(tpos1);
    pos2 = std::stoi(tpos2);
#else
    pos1 = std::atoi(tpos1.c_str());
    pos2 = std::atoi(tpos2.c_str());
#endif

    // a header is available: let it resolve the chromosome name
    if (!hdr.isEmpty()) {
      chr = hdr.Name2ID(tchr);
      return;
    }

    // no header, so assume that the chromosome naming is "standard"
    if (tchr == "X" || tchr == "chrX") {
      chr = 22;
    } else if (tchr == "Y" || tchr == "chrY") {
      chr = 23;
    } else {
      // Numeric chromosomes are stored 0-based ("1" -> 0). Both build
      // configurations must subtract 1, otherwise "22" would collide
      // with the ID used for X above.
#ifdef HAVE_C11
      chr = std::stoi(SeqLib::scrubString(tchr, "chr")) - 1;
#else
      chr = std::atoi(SeqLib::scrubString(tchr, "chr").c_str()) - 1;
#endif
    }
  }
Exemple #2
0
int runFishhook(int argc, char** argv) {

  parseFishOptions(argc, argv);

  if (opt::verbose) {
    std::cerr << "FishHook Params: " << std::endl 
	      << "\tWidth: " << SeqLib::AddCommas(opt::width) << std::endl
	      << "\tEvents: " << opt::events << std::endl
	      << "\tCoverage Mask: " << opt::coverage << std::endl
	      << "\tSlop: " << SeqLib::AddCommas(opt::slop) << std::endl
	      << "\tInterval Tracks: " << std::endl;
    for (auto& i : opt::interval_files)
      std::cerr << "\t-- " << i << std::endl;
    std::cerr << "\tScored Tracks: " << std::endl;
    for (auto& i : opt::scored_files)
      std::cerr << "\t-- " << i << std::endl;
    std::cerr << "\tSequence Features: " << std::endl;
    for (auto& i : opt::seq_features)
      std::cerr << "\t-- " << i << std::endl;

  }

  // read in the covariate tracks
  SeqHashMap<std::string, Fractions> intervals;
  
  // read a header for info
  SeqLib::BamReader rdr;
  if (!rdr.Open(opt::bam)) {
    std::cerr << "Error: Could not read BAM supplied by -b: " << opt::bam << std::endl;
    exit(EXIT_FAILURE);
  }
  hdr = rdr.Header();
  
  // read in the reference genome
  SeqLib::RefGenome ref;
  if (!ref.LoadIndex(opt::refgenome)) {
      if (opt::seq_features.size()) {
	std::cerr << "Error: Could not read referene genome supplied by -G: " << opt::refgenome << std::endl;
	exit(EXIT_FAILURE);
      }
  }

  // read in the events
  if (opt::verbose) std::cerr << "...reading events " << opt::events << std::endl;
  EventList events;
  if (!events.readFromBed(opt::events, hdr)) {
    std::cerr << "Error: Could not read events BED: " << opt::events << std::endl;
    exit(EXIT_FAILURE);
  }
  if (opt::verbose) std::cerr << "...read in " << SeqLib::AddCommas(events.size()) << " events" << std::endl;
  events.CreateTreeMap();
  
  // create the tiled regions
  FishHookTiles fish(opt::width, opt::slop, hdr.GetHeaderSequenceVector());
  if (opt::verbose)
    std::cerr << "...constructed " << SeqLib::AddCommas(fish.size()) << " fishhook intervals" << std::endl;
  fish.CreateTreeMap();

  // read the coverage mask
  SeqLib::GRC cov;
  if (!opt::coverage.empty()) {
    if (opt::verbose) std::cerr << "...reading coverage mask " << opt::coverage << std::endl;
    cov.ReadBED(opt::coverage, hdr);
    if (opt::verbose) std::cerr << "...read in " << SeqLib::AddCommas(cov.size()) << " covered regions " << std::endl;
    if (!cov.size()) {
      std::cerr << "Non-empty coverage track read with 0 regions. Check that is non-empty BED" << std::endl;
      exit(EXIT_FAILURE);
    }
    if (opt::verbose) std::cerr << "...creating interval tree map on covered regions and overlapping with tiles" << std::endl;
    cov.CreateTreeMap();

    // find covered amount per tile
    std::vector<int32_t> q, s;
    SeqLib::GRC ovlp;
    // fish is subject
    if (fish.size() > cov.size()) // do in most efficient order
      ovlp = cov.FindOverlaps(fish, q, s, false);
    else
      ovlp = fish.FindOverlaps(cov, s, q, false);
    if (opt::verbose) std::cerr << "..." << SeqLib::AddCommas(ovlp.size()) << " regions are covered" << std::endl;

    // set the amount covered by each
    for (size_t i = 0; i < ovlp.size(); ++i) {
      fish[s[i]].covered += (double)ovlp[i].Width() / fish[s[i]].Width();
    }

    // mask the events
    q.clear(); s.clear(); ovlp.clear();
    // events is subject
    if (events.size() > cov.size()) // do in most efficient order
      ovlp = cov.FindOverlaps(events, q, s, false);
    else
      ovlp = events.FindOverlaps(cov, s, q, false);

    EventList newe;
    // set the amount covered by each
    for (size_t i = 0; i < ovlp.size(); ++i) {
      newe.add(Event(ovlp[i], events.at(s[i]).id));
    }
    events = newe;
    events.CreateTreeMap();

    if (opt::verbose) std::cerr << "...kept " << SeqLib::AddCommas(events.size()) << " events after mask" << std::endl;
    
  } else {
    for (auto& i : fish)
      i.covered = 1; // the entire thing is covered if no mask provided
  }

  // read in the interval tracks
  for (auto& i : opt::interval_files)
    read_track(i, intervals, cov, false);
  for (auto& i : opt::scored_files)
    read_track(i, intervals, cov, true);

  // count events per tile (also de-dupes on patient per bin)
  fish.CountEvents(events);
  // overlap the covariates with the tiles
  for (auto& i : intervals) {
    fish.AddIntervalCovariate(i.first, i.second);
  }

  // make the matrix
  FishModel fm;
  fm.AddTiles(fish);
  
  fm.SetNumThreads(opt::num_threads);
  fm.EstimateOLS();
  fm.CooksDistance(fm.GetOLS());

  // write the covariates
  fish.PrintBEDHeader(std::cout);
  fish.PrintBED(std::cout, hdr);

  

  return 0;
}
Exemple #3
0
/**
 * Reads alignments from stdin, attempts to left-align each mapped one
 * against the supplied FASTA reference, and writes every alignment (mapped
 * or not) to stdout unless output is suppressed.
 *
 * Options:
 *   -h, --help              print usage and exit(0)
 *   -d, --debug             enable debug tracing
 *   -f, --fasta-reference   FASTA reference (required)
 *   -m, --max-iterations    iteration limit handed to stablyLeftAlign (default 50)
 *   -s, --suppress-output   do not write alignments
 *   -c, --compressed        clear the "uncompressed output" flag
 *
 * @return 0 on completion; exits with status 1 on usage or I/O errors.
 */
int main(int argc, char** argv) {

    FastaReference reference;
    bool has_ref = false;
    bool suppress_output = false;
    // NOTE(review): DEBUG(...) is a macro defined elsewhere and presumably
    // reads this flag by name -- confirm before renaming it.
    bool debug = false;
    bool isuncompressed = true;
    int maxiterations = 50;

    if (argc < 2) {
        printUsage(argv);
        exit(1);
    }

    static struct option long_options[] =
    {
        {"help", no_argument, 0, 'h'},
        {"debug", no_argument, 0, 'd'},
        {"fasta-reference", required_argument, 0, 'f'},
        {"max-iterations", required_argument, 0, 'm'},
        {"suppress-output", no_argument, 0, 's'},
        {"compressed", no_argument, 0, 'c'},
        {0, 0, 0, 0}
    };

    // consume options until getopt_long reports the end (-1)
    int c;
    int option_index = 0;
    while ((c = getopt_long(argc, argv, "hdcsf:m:",
                            long_options, &option_index)) != -1) {

        switch (c) {

            case 'h':
                printUsage(argv);
                exit(0);
                break;

            case '?':
                printUsage(argv);
                exit(1);
                break;

            case 'd':
                debug = true;
                break;

            case 's':
                suppress_output = true;
                break;

            case 'c':
                isuncompressed = false;
                break;

            case 'm':
                maxiterations = atoi(optarg);
                break;

            case 'f':
                reference.open(optarg); // will exit on open failure
                has_ref = true;
                break;

            default:
                abort();
                break;
        }
    }

    if (!has_ref) {
        cerr << "no FASTA reference provided, cannot realign" << endl;
        exit(1);
    }

    BAMSINGLEREADER reader;
    if (!reader.Open(STDIN)) {
        cerr << "could not open stdin for reading" << endl;
        exit(1);
    }

#ifdef HAVE_BAMTOOLS

    BamWriter writer;

    if (isuncompressed) {
        writer.SetCompressionMode(BamWriter::Uncompressed);
    }

    // the writer is only opened when output is wanted
    if (!suppress_output && !writer.Open("stdout", reader.GetHeaderText(), reader.GetReferenceData())) {
        cerr << "could not open stdout for writing" << endl;
        exit(1);
    }
#else

    SeqLib::BamWriter writer(isuncompressed ? SeqLib::SAM : SeqLib::BAM);
    SeqLib::BamHeader hdr = reader.Header();
    if (hdr.isEmpty()) {
        cerr << "could not open header for input" << endl;
        exit(1);
    }
    writer.SetHeader(hdr);

    // the writer is only opened when output is wanted
    if (!suppress_output && !writer.Open("-")) {
        cerr << "could not open stdout for writing" << endl;
        exit(1);
    }
#endif

    // map each reference ID (its position in the reader's reference list)
    // to the reference name
    map<int, string> referenceIDToName;
    REFVEC referenceSequences = reader.GETREFDATA;
    int refid = 0;
    for (REFVEC::iterator r = referenceSequences.begin(); r != referenceSequences.end(); ++r, ++refid) {
        referenceIDToName[refid] = r->REFNAME;
    }

    BAMALIGN alignment;

    while (GETNEXT(reader, alignment)) {

        DEBUG("---------------------------   read    --------------------------" << endl);
        DEBUG("| " << referenceIDToName[alignment.REFID] << ":" << alignment.POSITION << endl);
        DEBUG("| " << alignment.QNAME << ":" << alignment.ENDPOSITION << endl);
        DEBUG("| " << alignment.QNAME << ":" << (alignment.ISMAPPED ? " mapped" : " unmapped") << endl);
        DEBUG("| " << alignment.QNAME << ":" << " cigar data size: " << alignment.GETCIGAR.size() << endl);
        DEBUG("--------------------------- realigned --------------------------" << endl);

        // skip unmapped alignments, as they cannot be left-realigned without CIGAR data
        if (alignment.ISMAPPED) {

            int endpos = alignment.ENDPOSITION;
            int length = endpos - alignment.POSITION + 1;

            // only alignments with a non-negative start and a positive
            // reference span are realigned
            if (alignment.POSITION >= 0 && length > 0) {
                string refseq = reference.getSubSequence(
                                    referenceIDToName[alignment.REFID],
                                    alignment.POSITION,
                                    length);
                const bool stable = stablyLeftAlign(alignment, refseq, maxiterations, debug);
                if (!stable) {
                    cerr << "unstable realignment of " << alignment.QNAME
                         << " at " << referenceIDToName[alignment.REFID] << ":" << alignment.POSITION << endl
                         << alignment.QUERYBASES << endl;
                }
            }
        }

        DEBUG("----------------------------------------------------------------" << endl);
        DEBUG(endl);

        // every alignment is written, whether or not it was realigned
        if (!suppress_output) {
            WRITEALIGNMENT(writer, alignment);
        }
    }

    reader.Close();
    if (!suppress_output) {
        writer.Close();
    }

    return 0;
}