Beispiel #1
0
// Intersects the paired-end alignments of a BAM file with the "B" BED file.
//
// The B file is loaded into a map, then the BAM file is consumed two records
// at a time: every alignment flagged as paired must be immediately followed
// by a record with the same query name, and each such pair is handed to
// ProcessBamBlock().  Alignments not flagged as paired are skipped.
//
// bamFile: path of the BAM file to read.
//
// The program exits with status 1 when the BAM file cannot be opened or when
// a paired alignment is not followed by its mate (i.e. the file is not
// sorted/grouped by query name, or ends in the middle of a pair).
void BedIntersectPE::IntersectBamPE(string bamFile) {

    // load the "B" bed file into a map so
    // that we can easily compare "A" to it for overlaps
    _bedB->loadBedFileIntoMap();

    // open the BAM file; continuing with an unopened reader would silently
    // produce empty output, so fail loudly instead
    BamReader reader;
    BamWriter writer;
    if (!reader.Open(bamFile)) {
        cerr << "*****ERROR: could not open BAM file " << bamFile << ". Exiting." << endl;
        exit(1);
    }

    // get header & reference information
    string bamHeader = reader.GetHeaderText();
    RefVector refs   = reader.GetReferenceData();

    // open a BAM output to stdout if we are writing BAM
    if (_bamOutput == true) {
        // set compression mode
        BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
        if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed;
        writer.SetCompressionMode(compressionMode);
        // open our BAM writer
        writer.Open("stdout", bamHeader, refs);
    }

    _bedA->bedType = 10;                    // it's a full BEDPE given it's BAM

    // rip through the BAM file and convert each mapped entry to BEDPE
    BamAlignment bam1, bam2;
    while (reader.GetNextAlignment(bam1)) {
        // the alignment must be paired
        if (bam1.IsPaired() == false)
            continue;

        // grab the second alignment for the pair.  A failed read means the
        // file ended mid-pair; bam2 would otherwise still hold stale data
        // from the previous iteration.
        const bool haveMate = reader.GetNextAlignment(bam2);

        // require that the alignments are from the same query
        if (!haveMate || bam1.Name != bam2.Name) {
            cerr << "*****ERROR: -bedpe requires BAM to be sorted or grouped by query name. " << endl;
            exit(1);
        }
        ProcessBamBlock(bam1, bam2, refs, writer);
    }

    // close up
    reader.Close();
    if (_bamOutput == true) {
        writer.Close();
    }
}
Beispiel #2
0
bool RevertTool::RevertToolPrivate::Run(void) {
  
    // opens the BAM file without checking for indexes
    BamReader reader;
    if ( !reader.Open(m_settings->InputFilename) ) {
        cerr << "Could not open input BAM file... quitting." << endl;
        return false;
    }

    // get BAM file metadata
    const string& headerText = reader.GetHeaderText();
    const RefVector& references = reader.GetReferenceData();
    
    // open writer
    BamWriter writer;
    bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() && !m_settings->IsForceCompression );
    if ( !writer.Open(m_settings->OutputFilename, headerText, references, writeUncompressed) ) {
        cerr << "Could not open " << m_settings->OutputFilename << " for writing." << endl;
        return false;
    }

    // plow through file, reverting alignments
    BamAlignment al;
    while ( reader.GetNextAlignment(al) ) {
        RevertAlignment(al);
        writer.SaveAlignment(al);
    }
    
    // clean and exit
    reader.Close();
    writer.Close();
    return true; 
}
Beispiel #3
0
// ValidateReaders checks that all the readers point to BAM files representing
// alignments against the same set of reference sequences, and that the
// sequences are identically ordered.  If these checks fail the operation of
// the multireader is undefined, so we force program exit.
void BamMultiReader::ValidateReaders(void) const {
    int firstRefCount = readers.front().first->GetReferenceCount();
    BamTools::RefVector firstRefData = readers.front().first->GetReferenceData();
    for (vector<pair<BamReader*, BamAlignment*> >::const_iterator it = readers.begin(); it != readers.end(); ++it) {
        BamReader* reader = it->first;
        BamTools::RefVector currentRefData = reader->GetReferenceData();
        BamTools::RefVector::const_iterator f = firstRefData.begin();
        BamTools::RefVector::const_iterator c = currentRefData.begin();
        if (reader->GetReferenceCount() != firstRefCount || firstRefData.size() != currentRefData.size()) {
            cerr << "ERROR: mismatched number of references in " << reader->GetFilename()
                      << " expected " << firstRefCount 
                      << " reference sequences but only found " << reader->GetReferenceCount() << endl;
            exit(1);
        }
        // this will be ok; we just checked above that we have identically-sized sets of references
        // here we simply check if they are all, in fact, equal in content
        while (f != firstRefData.end()) {
            if (f->RefName != c->RefName || f->RefLength != c->RefLength) {
                cerr << "ERROR: mismatched references found in " << reader->GetFilename()
                          << " expected: " << endl;
                for (BamTools::RefVector::const_iterator a = firstRefData.begin(); a != firstRefData.end(); ++a)
                    cerr << a->RefName << " " << a->RefLength << endl;
                cerr << "but found: " << endl;
                for (BamTools::RefVector::const_iterator a = currentRefData.begin(); a != currentRefData.end(); ++a)
                    cerr << a->RefName << " " << a->RefLength << endl;
                exit(1);
            }
            ++f; ++c;
        }
    }
}
Beispiel #4
0
// Computes the coverage of the "B" BED file by the mapped alignments of a
// BAM file and reports it.
//
// Each mapped alignment is either counted as one interval
// (_obeySplits == false) or first broken into blocks by GetBamBlocks() and
// counted via countSplitHits().  Afterwards ReportCounts() or
// ReportCoverage() is called depending on _countsOnly.
//
// bamFile: path of the BAM file to read.
//
// The program exits with status 1 when the BAM file cannot be opened.
void BedCoverage::CollectCoverageBam(string bamFile) {

    // load the "B" bed file into a map so
    // that we can easily compare "A" to it for overlaps
    _bedB->loadBedCovFileIntoMap();

    // open the BAM file; an unopened reader would yield an all-zero report
    // without any hint that the input was never read
    BamReader reader;
    if (!reader.Open(bamFile)) {
        cerr << "*****ERROR: could not open BAM file " << bamFile << ". Exiting." << endl;
        exit(1);
    }

    // get reference information (needed to translate RefID into a chrom name)
    RefVector refs = reader.GetReferenceData();

    // convert each aligned BAM entry to BED
    // and compute coverage on B
    BamAlignment bam;
    while (reader.GetNextAlignment(bam)) {
        // unmapped alignments contribute no coverage
        if (!bam.IsMapped())
            continue;

        // treat the BAM alignment as a single "block"
        if (_obeySplits == false) {
            // construct a new BED entry from the current BAM alignment.
            BED a;
            a.chrom  = refs.at(bam.RefID).RefName;
            a.start  = bam.Position;
            a.end    = bam.GetEndPosition(false, false);
            a.strand = bam.IsReverseStrand() ? "-" : "+";

            _bedB->countHits(a, _sameStrand, _diffStrand, _countsOnly);
        }
        // split the BAM alignment into discrete blocks and
        // look for overlaps only within each block.
        else {
            // vec to store the discrete BED "blocks" from a
            bedVector bedBlocks;
            // since we are counting coverage, we do want to split blocks when a
            // deletion (D) CIGAR op is encountered (hence the true for the last parm)
            GetBamBlocks(bam, refs.at(bam.RefID).RefName, bedBlocks, false, true);
            // use countSplitHits to avoid over-counting each split chunk
            // as distinct read coverage.
            _bedB->countSplitHits(bedBlocks, _sameStrand, _diffStrand, _countsOnly);
        }
    }

    // report the coverage (summary or histogram) for BED B.
    if (_countsOnly == true)
        ReportCounts();
    else
        ReportCoverage();

    // close the BAM file
    reader.Close();
}
Beispiel #5
0
// Appends up to n alignments read from the start of a BAM file to result.
//
// result: receives the alignments, in file order (existing content is kept).
// n:      maximum number of alignments to append; 0 appends nothing.
// file:   path of the BAM file to read.
//
// The outcome of opening the file is handed to Guarded<FileNotGood>
// (presumably raising FileNotGood on failure -- confirm against Guarded).
void parser::fetchLines(vector<BamAlignment>& result, uint32_t n,
        const std::string& file) {
    BamReader bam;
    BamAlignment read;
    Guarded<FileNotGood> g(!(bam.Open(file)), file.c_str());
    // test the remaining quota first, so that no alignment is decoded and
    // thrown away once the quota is exhausted (or when n is 0 to begin with)
    while (n && bam.GetNextAlignment(read)) {
        result.push_back(read);
        n--;
    }
}
Beispiel #6
0
bool CoverageTool::CoverageToolPrivate::Run(void) {  
  
    // if output filename given
    ofstream outFile;
    if ( m_settings->HasOutputFile ) {
      
        // open output file stream
        outFile.open(m_settings->OutputFilename.c_str());
        if ( !outFile ) {
            cerr << "bamtools coverage ERROR: could not open " << m_settings->OutputFilename
                 << " for output" << endl;
            return false; 
        }
        
        // set m_out to file's streambuf
        m_out.rdbuf(outFile.rdbuf()); 
    } 
    
    //open our BAM reader
    BamReader reader;
    if ( !reader.Open(m_settings->InputBamFilename) ) {
        cerr << "bamtools coverage ERROR: could not open input BAM file: " << m_settings->InputBamFilename << endl;
        return false;
    }

    // retrieve references
    m_references = reader.GetReferenceData();
    
    // set up our output 'visitor'
    CoverageVisitor* cv = new CoverageVisitor(m_references, &m_out);
    
    // set up pileup engine with 'visitor'
    PileupEngine pileup;
    pileup.AddVisitor(cv);
    
    // process input data
    BamAlignment al;    
    while ( reader.GetNextAlignment(al) ) 
        pileup.AddAlignment(al);
    
    // clean up 
    reader.Close();
    if ( m_settings->HasOutputFile )
        outFile.close();
    delete cv;
    cv = 0;
    
    // return success
    return true;
}
Beispiel #7
0
// setAsUnpaired: copies a BAM file while clearing the "paired" flag of every
// alignment.
//
// Usage: setAsUnpaired [in bam] [outbam]
//
// Returns 0 on success.  Returns 1 when fewer than two file arguments are
// given (the usage text is printed; this includes the -h / -help / --help
// invocations), when either file cannot be opened, or as soon as a mapped
// read is encountered (mapped reads are not supported).
int main (int argc, char *argv[]) {

    // Both the input and the output path are required.  Checking argc < 3
    // also covers a lone help flag, and it prevents argv[2] (a null pointer
    // when only one argument is given) from being turned into a std::string.
    if (argc < 3) {
        cout<<"Usage:setAsUnpaired [in bam] [outbam]"<<endl<<"this program takes flags all paired sequences as singles"<<endl;
        return 1;
    }

    string bamfiletopen = string(argv[1]);
    string bamFileOUT   = string(argv[2]);

    BamReader reader;
    BamWriter writer;

    if ( !reader.Open(bamfiletopen) ) {
        cerr << "Could not open input BAM files." << endl;
        return 1;
    }

    // header and references are passed through unchanged
    const SamHeader header = reader.GetHeader();
    const RefVector references = reader.GetReferenceData();
    if ( !writer.Open(bamFileOUT,header,references) ) {
        cerr << "Could not open output BAM file "<<bamFileOUT << endl;
        return 1;
    }

    BamAlignment al;

    while ( reader.GetNextAlignment(al) ) {
        // only unmapped reads are handled; abort on the first mapped one
        if(al.IsMapped()){
            cerr << "Cannot yet handle mapped reads " << endl;
            return 1;
        }

        al.SetIsPaired (false);

        writer.SaveAlignment(al);

    } //while al

    reader.Close();
    writer.Close();

    return 0;
}
Beispiel #8
0
// Down-samples an indexed, coordinate-sorted BAM file region by region.
//
// Command line (getopt): -t <file> target/region file (required),
// -o <file> output BAM, -s <int> random seed (defaults to the current clock
// value), -q <int> minimum mapping quality, -f / -F flag bits that must be
// set / must be clear, -u sets out_format (not used further in this
// function).  The remaining accepted option letters are parsed and ignored.
// The first positional argument is the input BAM file.
//
// For every region obtained from read_region_depth(), reads passing the
// filters are collected; reads whose name was already sampled in an earlier
// region are kept first, then randomly shuffled read names (pairs before
// singletons) are added until the accumulated overlap reaches the region's
// target depth.  Kept reads are written to the output BAM; they are emitted
// in the iteration order of an unordered_map, i.e. in no particular order.
//
// Returns 0 on success, 1 on any open/index/sort-order failure or when
// read_region_depth() reports an error, and usage() for bad arguments.
int main(const int argc, char* const argv[]) {
    int c, min_mapQ=0, seed=chrono::system_clock::now().time_since_epoch().count();
    unsigned int flag_on=0, flag_off=0;
    string fn_tgt, fn_in, fn_out="", out_format="b";

    while ((c = getopt(argc, argv, "SbBcCt:h1Ho:q:f:F:ul:r:?T:R:L:s:@:m:x:U:")) >= 0) {
        switch (c) {
            case 's': seed = atoi(optarg); break;
            case 'm': break;
            case 'c': break;
            case 'S': break;
            case 'b': break;
            case 'C': break;
            case 'h': break;
            case 'H': break;
            case 'o': fn_out = optarg; break;
            case 'U': break;
            case 'f': flag_on |= strtol(optarg, 0, 0); break;
            case 'F': flag_off |= strtol(optarg, 0, 0); break;
            case 'q': min_mapQ = atoi(optarg); break;
            case 'u': out_format = "u"; break;
            case '1': break;
            case 'l': break;
            case 'r': break;
            case 't': fn_tgt = optarg; break;
            case 'R': break;
            case '?': return usage();
            case 'T': break;
            case 'B': break;
            case '@': break;
            case 'x': break;
            default: return usage();
        }
    }
    // the target file and one positional input file are mandatory
    if (fn_tgt.compare("") == 0) return usage();
    if (argc == optind) return usage();
    fn_in = argv[optind];

    BamReader reader;
    if (!reader.Open(fn_in)) {
        cerr << "ERROR: cannot open [" << fn_in << "] for reading\n";
        return 1;
    }
    // SetRegion() below needs an index
    if (!reader.LocateIndex()) {
        cerr << "ERROR: cannot find BAM index for [" << fn_in << "]\n";
        return 1;
    }

    const SamHeader header = reader.GetHeader();
    if (header.SortOrder.compare("coordinate") != 0) {
        cerr << "ERROR: [" << fn_in << "] not sorted by coordinate\n";
        return 1;
    }
    const RefVector refseq = reader.GetReferenceData();

    vector<BamRegion> regions;
    vector<unsigned int> src_depths, tgt_depths;
    if (read_region_depth(fn_tgt.c_str(), reader, regions, src_depths, tgt_depths) != 0) return 1;

    BamWriter writer;
    if (!writer.Open(fn_out, header, refseq)) {
        cerr << "ERROR: cannot open [" << fn_out << "] for writing\n";
        return 1;
    }

    BamAlignment aln;
    vector<BamAlignment> reads;                       // filtered reads of the current region
    vector<string> paired, unpaired;                  // unseen read names with >1 / exactly 1 read in the region
    unordered_map<int, int> kept;                     // indices into reads chosen for output
    unordered_map<string, unsigned int> seen, sampled; // read name -> last flag seen / flag of the sampled read
    unordered_map<string, vector<int> > pool;         // unseen read name -> indices into reads
    for (size_t i=0; i<regions.size(); ++i) {
        reads.clear();
        paired.clear();
        unpaired.clear();
        kept.clear();
        pool.clear();

        // snprintf bounds the write: a reference name is of arbitrary length
        // and must not be allowed to overrun the fixed-size buffer
        char region_string[256];
        snprintf(region_string, sizeof(region_string), "%s:%d-%d", refseq[regions[i].LeftRefID].RefName.c_str(), regions[i].LeftPosition, regions[i].RightPosition);
        if (!reader.SetRegion(regions[i])) {
            cerr << "WARNING: failed to locate [" << region_string << "]\n";
            continue;
        }
        // collect the reads of this region that pass the flag / mapQ filters
        while (reader.GetNextAlignment(aln)) {
            if ((aln.AlignmentFlag & flag_on) == flag_on && !(aln.AlignmentFlag & flag_off) && aln.MapQuality >= min_mapQ)
                reads.push_back(aln);
        }
        if (reads.size() == 0) continue;

        // first pass: account for reads already encountered in earlier regions
        unsigned int depth = 0;
        for (size_t k=0; k<reads.size(); ++k) {
            aln = reads[k];
            string rn = aln.Name;
            if (seen.find(rn) != seen.end()) { // if seen in previous regions
                if (sampled.find(rn) != sampled.end()) { // if self or mate sampled before, sample it
                    if (sampled[rn] != aln.AlignmentFlag) kept[k] = 1; // if mate sampled before, keep it
                    depth += get_overlap(aln, regions[i]);
                }
                if (seen[rn] != aln.AlignmentFlag) seen[rn] = aln.AlignmentFlag;
            }
            else { // if not seen in previous regions
                pool[rn].push_back(k);
            }
            if (depth > tgt_depths[i]) break;
        }
        // second pass: top up from the pool of unseen reads, in random order
        if (depth < tgt_depths[i]) {
            for (auto it=pool.begin(); it!=pool.end(); ++it) {
                if (it->second.size()>1)
                    paired.push_back(it->first);
                else
                    unpaired.push_back(it->first);
            }
            shuffle(paired.begin(), paired.end(), default_random_engine(seed));
            shuffle(unpaired.begin(), unpaired.end(), default_random_engine(seed));
            int n1=paired.size(), n2=unpaired.size(), k1, k2, k3;
            while (depth < tgt_depths[i] && n1+n2 > 0) {
                // names with two or more reads are consumed first (first two reads kept)
                if (n1>0) {
                    k1 = pool[paired[--n1]][0];
                    k2 = pool[paired[n1]][1];
                    depth += get_overlap(reads[k1], regions[i]);
                    depth += get_overlap(reads[k2], regions[i]);
                    kept[k1] = 1; kept[k2] = 1;
                    continue;
                }
                if (n2>0) {
                    k3 = pool[unpaired[--n2]][0];
                    depth += get_overlap(reads[k3], regions[i]);
                    kept[k3] = 1;
                    continue;
                }
            }
        }
        // remember every pooled name as seen, with the flag of its last read
        for (auto it=pool.begin(); it!=pool.end(); ++it) {
            string rn = it->first;
            seen[rn] = reads[pool[rn].back()].AlignmentFlag;
        }
        // record and write the kept reads
        for (auto it=kept.begin(); it!=kept.end(); ++it) {
            int k = it->first;
            string rn = reads[k].Name;
            sampled[rn] = reads[k].AlignmentFlag;
            writer.SaveAlignment(reads[k]);
        }
        cerr << "INFO: target=[" << tgt_depths[i] << "], actual=[" << depth << "], N(reads)=[" << reads.size() << "], N(kept)=[" << kept.size() << "] at [" << region_string << "]\n";
    }

    reader.Close();
    // close the output explicitly rather than relying on the destructor
    writer.Close();

    return 0;
}
void BedIntersect::IntersectBam(string bamFile) {

	// load the "B" bed file into a map so
	// that we can easily compare "A" to it for overlaps
	_bedB->loadBedFileIntoMap();
	
	// open the BAM file
	BamReader reader;
	BamWriter writer;
	reader.Open(bamFile);

	// get header & reference information
	string header  = reader.GetHeaderText();
	RefVector refs = reader.GetReferenceData();

	// open a BAM output to stdout if we are writing BAM
	if (_bamOutput == true) {
		// open our BAM writer
        writer.Open("stdout", header, refs, _isUncompressedBam);
	}

	vector<BED> hits;
	// reserve some space
	hits.reserve(100);
	
	_bedA->bedType = 6;
	BamAlignment bam;	
	// get each set of alignments for each pair.
	while (reader.GetNextAlignment(bam)) {
		
		if (bam.IsMapped()) {	
			BED a;
			a.chrom = refs.at(bam.RefID).RefName;
			a.start = bam.Position;
			a.end   = bam.GetEndPosition(false);

			// build the name field from the BAM alignment.
			a.name = bam.Name;
			if (bam.IsFirstMate()) a.name += "/1";
			if (bam.IsSecondMate()) a.name += "/2";

			a.score  = ToString(bam.MapQuality);
			
			a.strand = "+"; 
			if (bam.IsReverseStrand()) a.strand = "-"; 
	
			if (_bamOutput == true) {
			    bool overlapsFound = false;
			    // treat the BAM alignment as a single "block"
			    if (_obeySplits == false) {
				    overlapsFound = FindOneOrMoreOverlap(a);
				}
				// split the BAM alignment into discrete blocks and
				// look for overlaps only within each block.
				else {
                    bool overlapFoundForBlock;
				    bedVector bedBlocks;  // vec to store the discrete BED "blocks" from a
				    // we don't want to split on "D" ops, hence the "false"
                    getBamBlocks(bam, refs, bedBlocks, false);
                    
                    vector<BED>::const_iterator bedItr  = bedBlocks.begin();
                	vector<BED>::const_iterator bedEnd  = bedBlocks.end();
                	for (; bedItr != bedEnd; ++bedItr) {
            	        overlapFoundForBlock = FindOneOrMoreOverlap(a);
            	        if (overlapFoundForBlock == true)
                            overlapsFound = true;
            	    }
				}
				if (overlapsFound == true) {
					if (_noHit == false)
						writer.SaveAlignment(bam);
				}
				else {
					if (_noHit == true) {
						writer.SaveAlignment(bam);
					}	
				}
			}
			else {
			    // treat the BAM alignment as a single BED "block"
			    if (_obeySplits == false) {
				    FindOverlaps(a, hits);
				    hits.clear();
			    }
			    // split the BAM alignment into discrete BED blocks and
				// look for overlaps only within each block.
			    else {
			        bedVector bedBlocks;  // vec to store the discrete BED "blocks" from a
                    getBamBlocks(bam, refs, bedBlocks, false);

                    vector<BED>::const_iterator bedItr  = bedBlocks.begin();
                	vector<BED>::const_iterator bedEnd  = bedBlocks.end();
                	for (; bedItr != bedEnd; ++bedItr) {
            	        FindOverlaps(*bedItr, hits);
                        hits.clear();
            	    }
			    }
			}
		}
	}
	
	// close the relevant BAM files.
	reader.Close();
	if (_bamOutput == true) {
		writer.Close();
	}
}
Beispiel #10
0
// Intersects the paired-end alignments of a BAM file with the "B" BED file.
//
// The BAM file is read two records at a time.  Two consecutive records with
// the same query name that are both flagged as paired are handed to
// ProcessBamBlock().  When the names differ, the reader is re-synchronised
// by sliding forward one record at a time; every skipped record that is
// flagged as paired triggers a warning on stderr.  A pair found after such a
// re-synchronisation is processed like any other.
//
// bamFile: path of the BAM file to read.
//
// The program exits with status 1 when the BAM file cannot be opened.
void BedIntersectPE::IntersectBamPE(string bamFile) {

    // load the "B" bed file into a map so
    // that we can easily compare "A" to it for overlaps
    _bedB->loadBedFileIntoMap();

    // open the BAM file; an unopened reader would silently yield no output
    BamReader reader;
    BamWriter writer;
    if (!reader.Open(bamFile)) {
        cerr << "*****ERROR: could not open BAM file " << bamFile << ". Exiting." << endl;
        exit(1);
    }

    // get header & reference information
    string bamHeader = reader.GetHeaderText();
    RefVector refs   = reader.GetReferenceData();

    // open a BAM output to stdout if we are writing BAM
    if (_bamOutput == true) {
        // set compression mode
        BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
        if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed;
        writer.SetCompressionMode(compressionMode);
        // open our BAM writer
        writer.Open("stdout", bamHeader, refs);
    }

    _bedA->bedType = 10;                    // it's a full BEDPE given it's BAM

    // rip through the BAM file and convert each mapped entry to BEDPE
    BamAlignment bam1, bam2;
    while (reader.GetNextAlignment(bam1)) {
        // track whether bam2 really holds a freshly read record; after a
        // failed read it would still contain data from an earlier iteration
        bool haveMate = reader.GetNextAlignment(bam2);

        // slide forward until two neighbouring records share a query name
        while (haveMate && bam1.Name != bam2.Name) {
            if (bam1.IsPaired())
            {
                cerr << "*****WARNING: Query " << bam1.Name
                     << " is marked as paired, but it's mate does not occur"
                     << " next to it in your BAM file.  Skipping. " << endl;
            }
            bam1 = bam2;
            haveMate = reader.GetNextAlignment(bam2);
        }

        // the file ended while bam1 was still waiting for its mate
        if (!haveMate) {
            if (bam1.IsPaired())
            {
                cerr << "*****WARNING: Query " << bam1.Name
                     << " is marked as paired, but it's mate does not occur"
                     << " next to it in your BAM file.  Skipping. " << endl;
            }
            break;
        }

        // both records of the pair must carry the paired flag
        if (bam1.IsPaired() && bam2.IsPaired()) {
            ProcessBamBlock(bam1, bam2, refs, writer);
        }
    }

    // close up
    reader.Close();
    if (_bamOutput == true) {
        writer.Close();
    }
}
Beispiel #11
0
// Entry point of bamrealignment.
//
// Reads every alignment of the input BAM file, tries to realign each mapped
// read with a Realigner, and writes every alignment (modified or not) to the
// output BAM file.  Optionally writes one line per processed mapped read to
// a log file and, in verbose mode, prompts on stdin after each read.
// A summary of the counters is printed to stdout at the end.
//
// Options read below: -i/--input, -o/--output (both required),
// -s/--scores, -c/--clipping, -a/--anchors, -b/--bandwidth, -v/--verbose,
// -d/--debug, -f/--format, -t/--threads, -l/--log.
//
// Returns 0 after processing all reads; 1 when the log, input or output
// file cannot be opened or when the user answers 'q' at a verbose prompt;
// the result of PrintHelp() when input or output is missing.
int main (int argc, const char *argv[])
{
  printf ("------------- bamrealignment --------------\n");

  OptArgs opts;
  opts.ParseCmdLine(argc, argv);
  vector<int> score_vals(4);

  string input_bam  = opts.GetFirstString  ('i', "input", "");
  string output_bam = opts.GetFirstString  ('o', "output", "");
  opts.GetOption(score_vals, "4,-6,-5,-2", 's', "scores");
  int    clipping   = opts.GetFirstInt     ('c', "clipping", 2);
  bool   anchors    = opts.GetFirstBoolean ('a', "anchors", true);
  int    bandwidth  = opts.GetFirstInt     ('b', "bandwidth", 10);
  bool   verbose    = opts.GetFirstBoolean ('v', "verbose", false);
  bool   debug      = opts.GetFirstBoolean ('d', "debug", false);
  int    format     = opts.GetFirstInt     ('f', "format", 1);
  int  num_threads  = opts.GetFirstInt     ('t', "threads", 8);
  string log_fname  = opts.GetFirstString  ('l', "log", "");
  

  if (input_bam.empty() or output_bam.empty())
    return PrintHelp();

  opts.CheckNoLeftovers();

  // the log file is optional; it is only opened when a name was given
  std::ofstream logf;
  if (log_fname.size ())
  {
    logf.open (log_fname.c_str ());
    if (!logf.is_open ())
    {
      fprintf (stderr, "bamrealignment: Failed to open log file %s\n", log_fname.c_str());
      return 1;
    }
  }

  BamReader reader;
  if (!reader.Open(input_bam)) {
    fprintf(stderr, "bamrealignment: Failed to open input file %s\n", input_bam.c_str());
    return 1;
  }

  SamHeader header = reader.GetHeader();
  RefVector refs   = reader.GetReferenceData();

  // format 1 (the default) selects uncompressed output, any other value compressed
  BamWriter writer;
  writer.SetNumThreads(num_threads);
  if (format == 1)
    writer.SetCompressionMode(BamWriter::Uncompressed);
  else
    writer.SetCompressionMode(BamWriter::Compressed);

  if (!writer.Open(output_bam, header, refs)) {
    fprintf(stderr, "bamrealignment: Failed to open output file %s\n", output_bam.c_str());
    return 1;
  }


  // The meat starts here ------------------------------------

  if (verbose)
    cout << "Verbose option is activated, each alignment will print to screen." << endl
         << "  After a read hit RETURN to continue to the next one," << endl
         << "  or press q RETURN to quit the program," << endl
         << "  or press s Return to silence verbose," << endl
         << "  or press c RETURN to continue printing without further prompt." << endl << endl;

  // counters reported in the summary at the end of the run
  unsigned int readcounter = 0;
  unsigned int mapped_readcounter = 0;
  unsigned int realigned_readcounter = 0;
  unsigned int modified_alignment_readcounter = 0;
  unsigned int pos_update_readcounter = 0;
  unsigned int failed_clip_realigned_readcount = 0;
  
  unsigned int already_perfect_readcount = 0;
  
  unsigned int bad_md_tag_readcount = 0;
  unsigned int error_recreate_ref_readcount = 0;
  unsigned int error_clip_anchor_readcount = 0;
  unsigned int error_sw_readcount = 0;
  unsigned int error_unclip_readcount = 0;
  
  // start_position_shift is passed to computeSWalignment() below before it is read
  unsigned int start_position_shift;
  int orig_position;
  int new_position;

  // input holds the last line typed at the verbose prompt ("x" = nothing typed yet)
  string  md_tag, new_md_tag, input = "x";
  vector<CigarOp>    new_cigar_data;
  vector<MDelement>  new_md_data;
  bool position_shift = false;
  time_t start_time = time(NULL);

  Realigner aligner;
  aligner.verbose_ = verbose;
  aligner.debug_   = debug;
  // note: a rejected score vector only prints this message; processing continues
  if (!aligner.SetScores(score_vals))
    cout << "bamrealignment: Four scores need to be provided: match, mismatch, gap open, gap extend score!" << endl;

  aligner.SetAlignmentBandwidth(bandwidth);

  BamAlignment alignment;
  while(reader.GetNextAlignment(alignment)){
    readcounter ++;
    position_shift = false;
    
    // progress report every 100000 reads
    if ( (readcounter % 100000) == 0 )
       cout << "Processed " << readcounter << " reads. Elapsed time: " << (time(NULL) - start_time) << endl;

    // unmapped reads skip this whole branch and are written unchanged below
    if (alignment.IsMapped()) {
      
      
      
      // remembered for the log lines, which report the position before any shift
      orig_position = alignment.Position;
      mapped_readcounter++;
      aligner.SetClipping(clipping, !alignment.IsReverseStrand());
      if (aligner.verbose_) {
    	cout << endl;
        if (alignment.IsReverseStrand())
          cout << "The read is from the reverse strand." << endl;
        else
          cout << "The read is from the forward strand." << endl;
      }

      // case 1: no MD tag -- the read is counted and left as it is
      if (!alignment.GetTag("MD", md_tag)) {
    	if (aligner.verbose_)
          cout << "Warning: Skipping read " << alignment.Name << ". It is mapped but missing MD tag." << endl;
	if (logf.is_open ())
	  logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "MISSMD" << '\n';
	bad_md_tag_readcount++;
      // case 2: CreateRefFromQueryBases() returned true -- attempt the realignment
      } else if (aligner.CreateRefFromQueryBases(alignment.QueryBases, alignment.CigarData, md_tag, anchors)) {
	bool clipfail = false;
	if (Realigner::CR_ERR_CLIP_ANCHOR == aligner.GetCreateRefError ())
	{
	  clipfail = true;
	  failed_clip_realigned_readcount ++;
	}

        if (!aligner.computeSWalignment(new_cigar_data, new_md_data, start_position_shift)) {
          if (aligner.verbose_)
            cout << "Error in the alignment! Not updating read information." << endl;
	  if (logf.is_open ())
	    logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "SWERR" << '\n';
	  error_sw_readcount++;
          writer.SaveAlignment(alignment);  // Write alignment unchanged
          continue;
        }

        if (!aligner.addClippedBasesToTags(new_cigar_data, new_md_data, alignment.QueryBases.size())) {
          if (aligner.verbose_)
            cout << "Error when adding clipped anchors back to tags! Not updating read information." << endl;
	  if (logf.is_open ())
	    logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "UNCLIPERR" << '\n';
          writer.SaveAlignment(alignment);  // Write alignment unchanged
	  error_unclip_readcount ++;
          continue;
        }
        new_md_tag = aligner.GetMDstring(new_md_data);
        realigned_readcounter++;

        // adjust start position of read
        if (!aligner.LeftAnchorClipped() and start_position_shift != 0) {
          new_position = aligner.updateReadPosition(alignment.CigarData, (int)start_position_shift, alignment.Position);
          if (new_position != alignment.Position) {
            pos_update_readcounter++;
            position_shift = true;
            alignment.Position = new_position;
          }
        }
        
        // a read counts as modified when its position shifted, the number of
        // cigar operations changed, or the MD string differs
        if (position_shift || alignment.CigarData.size () != new_cigar_data.size () || md_tag != new_md_tag)
	{
	  if (logf.is_open ())
	  {
	    logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "MOD";
	    if (position_shift)
	      logf << "-SHIFT";
	    if (clipfail)
	      logf << " NOCLIP";
	    logf << '\n';
	  }
	  modified_alignment_readcounter++;
	}
	else
	{
            if (logf.is_open ())
	    {
	      logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "UNMOD";
              if (clipfail)
	        logf << " NOCLIP";
	      logf << '\n';
	    }
	}

        if (aligner.verbose_){
          cout << alignment.Name << endl;
          cout << "------------------------------------------" << endl;
          // Wait for input to continue or quit program
          // (no new line is read once the last answer started with 'c'/'C')
          if (input.size() == 0)
            input = 'x';
          else if (input[0] != 'c' and input[0] != 'C')
            getline(cin, input);
          if (input.size()>0){
            if (input[0] == 'q' or input[0] == 'Q')
              return 1;
            else if (input[0] == 's' or input[0] == 'S')
              aligner.verbose_ = false;
          }
        }

        // Finally update alignment information
        alignment.CigarData = new_cigar_data;
        alignment.EditTag("MD", "Z" , new_md_tag);

      } // end of CreateRef else if
      // case 3: CreateRefFromQueryBases() returned false -- classify by error code
      else {
	switch (aligner.GetCreateRefError ())
	{
	  case Realigner::CR_ERR_RECREATE_REF:
            if (logf.is_open ())
	      logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "RECRERR" << '\n';
	    error_recreate_ref_readcount++;
	    break;
	  case Realigner::CR_ERR_CLIP_ANCHOR:
            if (logf.is_open ())
	      logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "CLIPERR" << '\n';
	    error_clip_anchor_readcount++;
	    break;
	  default:
		  //  On a good run this writes way too many reads to the log file - don't want to create a too large txt file
          //  if (logf.is_open ())
	      //logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "PERFECT" << '\n';
	    already_perfect_readcount++;
	    break;
	}
	
	// same interactive prompt as in the realignment branch above
	if (aligner.verbose_) {
	  cout << alignment.Name << endl;
	  cout << "------------------------------------------" << endl;
	  // Wait for input to continue or quit program
	  if (input.size() == 0)
	    input = 'x';
	  else if (input[0] != 'c' and input[0] != 'C')
	    getline(cin, input);
	  if (input.size()>0){
	    if (input[0] == 'q' or input[0] == 'Q')
	      return 1;
	    else if (input[0] == 's' or input[0] == 'S')
	      aligner.verbose_ = false;
	  }
	}
      }

      // --- Debug output for Rajesh ---
      if (debug && aligner.invalid_cigar_in_input) {
        aligner.verbose_ = true;
        cout << "Invalid cigar string / md tag pair in read " << alignment.Name << endl;
        // Rerun reference generation to display error
        aligner.CreateRefFromQueryBases(alignment.QueryBases, alignment.CigarData, md_tag, anchors);

        // restore the verbosity requested on the command line and clear the flag
        aligner.verbose_ = verbose;
        aligner.invalid_cigar_in_input = false;
      }
      // --- --- ---


    } // end of if isMapped

    // reached by every read that did not take one of the two error paths
    // above (those write the unchanged alignment themselves and continue)
    writer.SaveAlignment(alignment);

  } // end while loop over reads

  if (aligner.invalid_cigar_in_input)
    cerr << "WARNING bamrealignment: There were invalid cigar string / md tag pairs in the input bam file." << endl;

  // ----------------------------------------------------------------
  // program end -- output summary information
  cout   << "                            File: " << input_bam    << endl
         << "                     Total reads: " << readcounter  << endl
         << "                    Mapped reads: " << mapped_readcounter << endl;
  if (bad_md_tag_readcount)
    cout << "            Skipped: bad MD tags: " << bad_md_tag_readcount << endl;
  if (error_recreate_ref_readcount)
    cout << " Skipped: unable to recreate ref: " << error_recreate_ref_readcount << endl;
  if (error_clip_anchor_readcount)
    cout << "  Skipped: error clipping anchor: " << error_clip_anchor_readcount << endl;
  // "Total reads realigned" is derived: mapped reads minus all skipped categories
  cout  <<  "       Skipped:  already perfect: " << already_perfect_readcount << endl
        <<  "           Total reads realigned: " << mapped_readcounter - already_perfect_readcount - bad_md_tag_readcount - error_recreate_ref_readcount - error_clip_anchor_readcount << endl;
  if (failed_clip_realigned_readcount)
    cout << "                      (including  " << failed_clip_realigned_readcount << " that failed to clip)" << endl;
  if (error_sw_readcount)
    cout << " Failed to complete SW alignment: " << error_sw_readcount << endl;
  if (error_unclip_readcount)
    cout << "         Failed to unclip anchor: " << error_unclip_readcount << endl;
  cout   << "           Succesfully realigned: " << realigned_readcounter << endl
         << "             Modified alignments: " << modified_alignment_readcounter << endl
         << "                Shifted position: " << pos_update_readcounter << endl;
  
  cout << "Processing time: " << (time(NULL)-start_time) << " seconds." << endl;
  cout << "INFO: The output BAM file may be unsorted." << endl;
  cout << "------------------------------------------" << endl;
  return 0;
}
Beispiel #12
0
/*
    Reads every alignment of a BAM file and tests the mapped ones against
    the "B" BED file using the window-overlap helpers of this class.

    bamFile: path handed to BamReader::Open.

    With _bamOutput set, alignments are written as BAM to "stdout":
    an alignment is kept when it has at least one window overlap and _noHit
    is false, or when it has none and _noHit is true.  Unmapped alignments
    are kept only when _noHit is true.
    Without _bamOutput, FindWindowOverlaps() is called for each mapped
    alignment and nothing is written through the BAM writer.

    Exits the process if the BAM file cannot be opened.
*/
void BedWindow::WindowIntersectBam(string bamFile) {

    // load the "B" bed file into a map so
    // that we can easily compare "A" to it for overlaps
    _bedB->loadBedFileIntoMap();

    // open the BAM file; bail out early instead of silently reading nothing
    BamReader reader;
    BamWriter writer;
    if (!reader.Open(bamFile)) {
        cerr << "Failed to open BAM file " << bamFile << endl;
        exit(1);
    }

    // get header & reference information
    string bamHeader  = reader.GetHeaderText();
    RefVector refs    = reader.GetReferenceData();

    // open a BAM output to stdout if we are writing BAM
    if (_bamOutput == true) {
        // set compression mode
        BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
        if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed;
        writer.SetCompressionMode(compressionMode);
        // open our BAM writer
        writer.Open("stdout", bamHeader, refs);
    }

    vector<BED> hits;                   // vector of potential hits
    // reserve some space
    hits.reserve(100);

    _bedA->bedType = 6;
    BamAlignment bam;

    while (reader.GetNextAlignment(bam)) {

        if (bam.IsMapped()) {
            // describe the alignment as a BED record
            BED a;
            a.chrom = refs.at(bam.RefID).RefName;
            a.start = bam.Position;
            a.end   = bam.GetEndPosition(false, false);

            // build the name field from the BAM alignment.
            a.name = bam.Name;
            if (bam.IsFirstMate()) a.name += "/1";
            if (bam.IsSecondMate()) a.name += "/2";

            a.score  = ToString(bam.MapQuality);
            a.strand = "+"; if (bam.IsReverseStrand()) a.strand = "-";

            if (_bamOutput == true) {
                const bool overlapsFound = FindOneOrMoreWindowOverlaps(a);
                // keep hits normally, keep non-hits when _noHit is requested
                if ( ((overlapsFound == true)  && (_noHit == false)) ||
                     ((overlapsFound == false) && (_noHit == true)) ) {
                    writer.SaveAlignment(bam);
                }
            }
            else {
                FindWindowOverlaps(a, hits);
                hits.clear();
            }
        }
        // BAM IsMapped() is false: the writer only exists in BAM output mode,
        // so never touch it otherwise.
        else if ((_noHit == true) && (_bamOutput == true)) {
            writer.SaveAlignment(bam);
        }
    }

    // close the relevant BAM files.
    reader.Close();
    if (_bamOutput == true) {
        writer.Close();
    }
}
int main(int argc, char** argv) {
    FILE *fsp = 0, *fep = 0;

    int flag;       // 0
    char chrom[255];    // 1
    int pos;        // 2
    int quality;    // 3
    char *cigar;    // 4

    size_t len = 0;
    
    char chromcopy[255] = { 0 };

    char fn[255];

    int i;
    int forward, reverse;

    bool direction; // 0
    bool leftflag;

    const char *prefix = "";

    int cnt = 0;
    BamReader reader;

    //if (argc < 2 || argc > 4) usage();

    for (i=1; i<argc; i++) {
        if ((strcmp(argv[i], "-p") == 0) && (i != argc-1)) {
            prefix = argv[++i];
        } else {
            if (!reader.IsOpen()) {
                try {
                    reader.Open(argv[i]);
                } catch (exception& e) {
                    cout << e.what();
                    throw;
                }
            } else {
                usage();
            }
        }
    }
    if (!reader.IsOpen()) usage();

    const SamHeader header = reader.GetHeader();
    const RefVector references = reader.GetReferenceData();
    
    BamAlignment al;
    while(reader.GetNextAlignmentCore(al)) {
        flag = al.AlignmentFlag;
        if ((flag & 256) || (flag & 4) || (flag & 512) || (flag & 1024)) continue;

        direction = ( (flag & 16) == 16);
        strcpy(chrom, references[al.RefID].RefName.c_str());
        pos = al.Position+1;
        quality = al.MapQuality;
        if (quality == 0) continue;

        if (strcmp(chromcopy, chrom) != 0) {
            if (fsp) fclose(fsp);
            if (fep) fclose(fep);
            fn[0] = 0;
            if (prefix != "") {
                strcat(fn, prefix);
            }
            strcat(fn, chrom);
            strcat(fn, "_forward.txt");
            fsp = fopen(fn, "w");
            fn[0] = 0;
            if (prefix != "") {
                strcat(fn, prefix);
            }
            strcat(fn, chrom);
            strcat(fn, "_reverse.txt");
            fep = fopen(fn, "w");
            strcpy(chromcopy, chrom);
        }

        vector<CigarOp> cigars = al.CigarData;
        forward = pos;
        reverse = pos;
        leftflag = true;
        for (i = 0; i < cigars.size(); i++) {
            if (cigars[i].Type == 'S' || cigars[i].Type == 'H') {
                if (leftflag == false) break;
                continue;
            } else {
                if (cigars[i].Type != 'I') reverse += cigars[i].Length;
                leftflag = false;
            }
        }
        if (direction == 0)
            fprintf(fsp, "%d\n", forward);
        else
            fprintf(fep, "%d\n", reverse-1);

        cnt++;
        if (cnt % 500000 == 0)
            printf("%d ", cnt);
    }
    reader.Close();
    printf("\n");
    if (fsp) fclose(fsp);
    if (fep) fclose(fep);
}
Beispiel #14
0
int main (int argc, char *argv[]) {

     if( (argc!= 4 && argc !=5 && argc !=6) ||
    	(argc== 2 && string(argv[1]) == "-h") ||
    	(argc== 2 && string(argv[1]) == "-help") ||
    	(argc== 2 && string(argv[1]) == "--help") ){
	 cerr<<"Usage:splitByRG [in bam] [rg Tally] [out prefix] (optional target)"<<endl<<"this program will subsample a BAM file per read group for a certain target\nFor example splitByRG in.bam tally.txt out will create\nout.rg1.bam\nout.rg2.bam\n"<<endl;
    	return 1;
    }


     string bamfiletopen      = string(argv[1]);
     string rgTally           = string(argv[2]);
     string bamDirOutPrefix   = string(argv[3]);
     
     int target            =  200000;
     int maxTarget         = 1000000;

     if(argc==5){
	 target    = destringify<int> ( string(argv[4]) );	 
     }

     if(argc==6){
	 target    = destringify<int> ( string(argv[4]) );	 
	 maxTarget = destringify<int> ( string(argv[5]) );	 
     }


     cerr<<"minimum fragments:\t"<<target<<endl;
     cerr<<"target  fragments:\t"<<maxTarget<<endl;

     string line;
     ifstream myFileTally;
     map<string,double> rg2Fraction;

     myFileTally.open(rgTally.c_str(), ios::in);
     cerr<<"Retained groups:\n"<<endl;
     cerr<<"RG\t#mapped\tfraction retained"<<endl;
     cerr<<"-----------------------------------"<<endl;

     if (myFileTally.is_open()){
	 while ( getline (myFileTally,line)){
	     vector<string> tokens = allTokens(line,'\t');
	     if(tokens.size() > 6)
		 if( tokens[1] == "pass" && 
		    (tokens[0] != "\"\""    && 
		     tokens[0] != "control" && 
		     tokens[0] != "TOTAL") ){
		     //cout<<tokens[0]<<"\t"<<tokens[5]<<endl;
		     int count = destringify<int>(tokens[5]);

		     if(count>target){

			 if(count>=maxTarget){
			     rg2Fraction[  tokens[0] ] = double(maxTarget)/double(count);
			     cout<<tokens[0]<<"\t"<<count<<"\t"<<double(maxTarget)/double(count)<<endl;
			 }else{
			     cout<<tokens[0]<<"\t"<<count<<"\t"<<1.0<<endl;
			     rg2Fraction[  tokens[0] ] = 1.0;
			 }
		     }
		 }
	 }
	 myFileTally.close();
     }else{
	 cerr << "Unable to open file "<<rgTally<<endl;
	 return 1;
     }



     map<string,BamWriter *> rg2BamWriter;
     
     // if(!isDirectory(bamDirOut)){
     // 	 cerr<<"ERROR: the out directory does not exist"<<endl;
     // 	return 1;
     // }

     BamReader reader;

     if ( !reader.Open(bamfiletopen) ) {
    	cerr << "Could not open input BAM files." << endl;
    	return 1;
     }
    const SamHeader header = reader.GetHeader();
    const RefVector references = reader.GetReferenceData();
    vector<RefData>  refData=reader.GetReferenceData();

    SamReadGroupDictionary 	srgd=header.ReadGroups;
    for(SamReadGroupConstIterator srgci=srgd.ConstBegin();
	srgci<srgd.ConstEnd();
	srgci++){
	//cout<<*srgci<<endl;
	const SamReadGroup rg = (*srgci);
	//cout<<rg.ID<<endl;
	if( rg2Fraction.find(rg.ID) != rg2Fraction.end() ){
	    rg2BamWriter[rg.ID] = new  BamWriter();
	    rg2BamWriter[rg.ID]->Open(bamDirOutPrefix+"."+rg.ID+".bam",header,references); 
	}
	//cout<<bamDirOutPrefix+"."+rg.ID+".bam"<<endl;
    }
    //    return 1;

    //    BamWriter unmapped;

    // cout<<header.ToString()<<endl;
    // return 1;

    // if ( !unmapped.Open(bamDirOutPrefix+".unmapped.bam",header,references) ) {
    // 	cerr << "Could not open output BAM file "<< bamDirOutPrefix+".unmapped.bam" << endl;
    // 	return 1;
    // }

    //    cout<<"reading"<<endl;

    BamAlignment al;
    unsigned int total=0;
    while ( reader.GetNextAlignment(al) ) {


	if(al.HasTag("RG") &&
	   al.IsMapped() ){
	    string rgTag;
	    al.GetTag("RG",rgTag);
	    //cout<<rgTag<<endl;
	    if(rg2BamWriter.find(rgTag) == rg2BamWriter.end()){ //new: ignore completely
	
		
	    }else{
		if( randomProb() <= rg2Fraction[  rgTag ] ){
		    rg2BamWriter[rgTag]->SaveAlignment(al);	 
		    //cout<<"wrote "<<rgTag<<endl;
		}   else{
		    //cout<<"skipped "<<rgTag<<endl;
		}	   
	    }
	}// else{
	//     string rgTag="unknown";	    
	//     //cout<<rgTag<<endl;
	//     if(rg2BamWriter.find(rgTag) == rg2BamWriter.end()){ //new
	// 	cerr<<"Found new RG "<<rgTag<<endl;
	// 	rg2BamWriter[rgTag] = new  BamWriter();
	//  	if ( !rg2BamWriter[rgTag]->Open(bamDirOutPrefix+"."+rgTag+".bam",header,references) ) {
	//  	    cerr     << "Could not open output BAM file "<< bamDirOutPrefix<<"."<<rgTag<<".bam" << endl;
	//  	    return 1;
	//  	}
	// 	rg2BamWriter[rgTag]->SaveAlignment(al);	    	   
	//     }else{
	// 	rg2BamWriter[rgTag]->SaveAlignment(al);	    	   
	//     }

	//     // cerr << "Cannot get RG tag for " << al.Name<<endl;
	//     // return 1;
	// }

	total++;
    } //while al

    reader.Close();
    // writer.Close();
    
    // unmapped.Close();

    map<string,BamWriter *>::iterator rg2BamWriterIt;
    for (rg2BamWriterIt =rg2BamWriter.begin(); 
	 rg2BamWriterIt!=rg2BamWriter.end(); 
	 rg2BamWriterIt++){
	rg2BamWriterIt->second->Close();
    }
    cerr<<"Wrote succesfully "<<total<<" reads"<<endl;


    return 0;
}
// Calls SNPs block by block over the region of interest.
//
// For each genome block produced by setupGenomeBlock() the reference sequence
// is fetched, candidate alleles are profiled with simpleSnpCall(), merged into
// loci with mergeSnpSitesToBlocks(), and each locus is dispatched by width:
//   width 1 -> simpleBayesianSnpCall on the locus,
//   width 2 -> simpleBayesianSnpCall on each of the two positions separately,
//   wider   -> PyroHMMsnp.
// Called variants are appended to variantSet; the return value is
// variantSet.size() after the last block.
int GenericIndividualSnpCall::call(Fasta &fastaObj, BamReader &bamObj, BamRegion &roi, GenericProbabilisticAlignment &probAligner, VariantCallSetting& snpCallSettings, vector<GenericVariant> &variantSet)
{
    RefVector chromosomes = bamObj.GetReferenceData();

    // split the region into genome blocks
    vector<int> BlockChrID, BlockLeftPos, BlockRightPos;
    int blockCount = setupGenomeBlock(chromosomes, roi, BlockChrID, BlockLeftPos, BlockRightPos);

    // number of variants present before the current block started
    int numSNP = 0;

    for (int blockIdx = 0; blockIdx < blockCount; ++blockIdx)
    {
        const bool verbose = (m_verbosity >= 1);
        if (verbose)
        {
            cout << "processing " << chromosomes[BlockChrID[blockIdx]].RefName << ":" << BlockLeftPos[blockIdx]+1 << "-" << BlockRightPos[blockIdx] << endl;
        }

        clock_t startTime = clock();

        // reference sequence of this block
        string BlockGenome;
        fastaObj.GetSequence(BlockChrID[blockIdx], BlockLeftPos[blockIdx], BlockRightPos[blockIdx], BlockGenome);

        // first pass: candidate alleles and per-position read data
        map<int,list<tuple<char,int,int,double>>> BlockBamData;
        AlleleSet BlockSnpAlleleCandidates;
        simpleSnpCall(BlockGenome, bamObj, BlockChrID[blockIdx], BlockLeftPos[blockIdx], BlockRightPos[blockIdx], BlockSnpAlleleCandidates, BlockBamData);

        // group candidate sites into loci
        vector<tuple<int,int,list<Allele>>> BlockSnpLoci;
        mergeSnpSitesToBlocks(BlockSnpAlleleCandidates, BlockSnpLoci);

        for (size_t locusIdx = 0; locusIdx < BlockSnpLoci.size(); ++locusIdx)
        {
            tuple<int,int,list<Allele>> &locus = BlockSnpLoci[locusIdx];
            int locusLeft  = get<0>(locus);
            int locusRight = get<1>(locus);
            list<Allele> &locusAlleles = get<2>(locus);

            if (locusRight == locusLeft+1)
            {
                // single site
                simpleBayesianSnpCall(fastaObj, bamObj, BlockChrID[blockIdx], locusLeft, locusRight, locusAlleles, BlockBamData[locusLeft], snpCallSettings, variantSet);
                continue;
            }

            if (locusRight != locusLeft+2)
            {
                // three or more positions: MNP site
                PyroHMMsnp(fastaObj, bamObj, BlockChrID[blockIdx], locusLeft, locusRight, probAligner, locusAlleles, snpCallSettings, variantSet);
                continue;
            }

            // two adjacent positions: call each one on its own alleles
            for (int pos = locusLeft; pos < locusRight; ++pos)
            {
                list<Allele> siteAlleles;
                for (Allele &candidate : locusAlleles)
                {
                    if (candidate.m_chrPosition == pos)
                        siteAlleles.emplace_back(candidate);
                }

                if (siteAlleles.empty())
                    continue;

                simpleBayesianSnpCall(fastaObj, bamObj, BlockChrID[blockIdx], pos, pos+1, siteAlleles, BlockBamData[pos], snpCallSettings, variantSet);
            }
        }

        clock_t endTime = clock();
        if (verbose)
        {
            cout << "time elapsed " << ((endTime-startTime)/(double)CLOCKS_PER_SEC/60.) << " minutes";
            cout << ", ";
            cout << "call " << variantSet.size()-numSNP << " SNPs" << endl;
        }

        numSNP = variantSet.size();
    }

    return variantSet.size();
}
Beispiel #16
0
/*
    Computes genome coverage from the mapped alignments of a BAM file.

    bamFile: path handed to BamReader::Open; the process exits if it
             cannot be opened.

    The reference list of the BAM header becomes the genome file (_genome).
    For each mapped alignment that passes the optional strand filter, coverage
    is added according to the first mode that applies:
      _pair_chip_   : span from the first mate towards its mate,
      _haveSize     : a window of _fragmentSize anchored on the read,
      _only_5p_end  : a single position,
      _only_3p_end  : a single position,
      otherwise     : the blocks returned by GetBamBlocks.
    Results are reported once all alignments have been consumed.
*/
void BedGenomeCoverage::CoverageBam(string bamFile) {

    ResetChromCoverage();

    // open the BAM file
    BamReader reader;
    if (!reader.Open(bamFile)) {
        cerr << "Failed to open BAM file " << bamFile << endl;
        exit(1);
    }

    // header text and reference sequences
    string header = reader.GetHeaderText();
    RefVector refs = reader.GetReferenceData();

    // the BAM references act as the "genome file"
    _genome = new GenomeFile(refs);

    BamAlignment bam;
    while (reader.GetNextAlignment(bam)) {
        // unaligned reads contribute nothing
        if (!bam.IsMapped())
            continue;

        // with _dUTP, a second mate whose mate is mapped counts on the opposite strand
        const bool flipStrand = _dUTP && bam.IsPaired() && bam.IsMateMapped() && bam.IsSecondMate();
        const bool onReverse  = flipStrand ? !bam.IsReverseStrand() : bam.IsReverseStrand();

        // honour the requested strand, if any
        if (_filterByStrand == true) {
            const bool wantReverse = (_requestedStrand == "-");
            if (wantReverse != onReverse)
                continue;
        }

        // chrom, start and (inclusive) end of the alignment
        string chrom(refs.at(bam.RefID).RefName);
        CHRPOS start = bam.Position;
        CHRPOS end = bam.GetEndPosition(false, false) - 1;

        // switch chromosome bookkeeping when the name changes
        if ( chrom != _currChromName )
            StartNewChrom(chrom);

        if (_pair_chip_) {
            // paired reads must be proper pairs with a mapped mate
            if (bam.IsPaired() && (!bam.IsProperPair() || !bam.IsMateMapped()))
                continue;

            // the leftmost read of a pair must not be on the reverse strand
            const bool readLeftButReverse = (bam.Position < bam.MatePosition) && bam.IsReverseStrand();
            const bool mateLeftButReverse = (bam.MatePosition < bam.Position) && bam.IsMateReverseStrand();
            if (readLeftButReverse || mateLeftButReverse)
                continue;

            // only the first mate adds coverage
            if (bam.IsFirstMate()) {
                if (bam.IsReverseStrand()) {
                    // extend to the mate on the left
                    AddCoverage(bam.MatePosition, end);
                }
                else if (bam.IsMateReverseStrand()) {
                    // extend to the mate on the right
                    AddCoverage(start, start + abs(bam.InsertSize) - 1);
                }
            }
            continue;
        }

        if (_haveSize) {
            if (!bam.IsReverseStrand()) {
                AddCoverage(start,start+_fragmentSize - 1);
            }
            else if (end<_fragmentSize) {
                // the fragment would start before position 0
                AddCoverage(0, end);
            }
            else {
                AddCoverage(end + 1 - _fragmentSize, end );
            }
            continue;
        }

        if (_only_5p_end) {
            CHRPOS pos = bam.IsReverseStrand() ? end : start;
            AddCoverage(pos,pos);
            continue;
        }

        if (_only_3p_end) {
            CHRPOS pos = bam.IsReverseStrand() ? start : end;
            AddCoverage(pos,pos);
            continue;
        }

        // default: block-wise coverage. Blocks are always split on "D" CIGAR
        // ops; "N" ops split as well when _obeySplits is set.
        bedVector bedBlocks;
        if (_obeySplits) {
            GetBamBlocks(bam, refs.at(bam.RefID).RefName, bedBlocks, true, true);
        }
        else {
            GetBamBlocks(bam, refs.at(bam.RefID).RefName, bedBlocks, true, false);
        }
        AddBlockedCoverage(bedBlocks);
    }
    // close the BAM
    reader.Close();

    // report the chromosome that was still being accumulated
    ReportChromCoverage(_currChromCoverage, _currChromSize,
            _currChromName, _currChromDepthHist);

    // report chromosomes without any coverage
    PrintEmptyChromosomes();

    // report the overall coverage if asked.
    PrintFinalCoverage();
}
Beispiel #17
0
void TagBam::Tag() {

    // open the annotations files for processing;
    OpenAnnoFiles();

    // open the BAM file
    BamReader reader;
    BamWriter writer;
	if (!reader.Open(_bamFile)) {
        cerr << "Failed to open BAM file " << _bamFile << endl;
        exit(1);
    }
    
    // get header & reference information
    string bamHeader  = reader.GetHeaderText();
    RefVector refs = reader.GetReferenceData();

    // set compression mode
    BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
//    if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed;
    writer.SetCompressionMode(compressionMode);
    // open our BAM writer
    writer.Open("stdout", bamHeader, refs);

    // rip through the BAM file and test for overlaps with each annotation file.
    BamAlignment al;
    vector<BED> hits;

    while (reader.GetNextAlignment(al)) {
        if (al.IsMapped() == true) {
            BED a;
            a.chrom = refs.at(al.RefID).RefName;
            a.start = al.Position;
            a.end   = al.GetEndPosition(false, false);
            a.strand = "+";
            if (al.IsReverseStrand()) a.strand = "-";
            
            ostringstream annotations;
            // annotate the BAM file based on overlaps with the annotation files.
            for (size_t i = 0; i < _annoFiles.size(); ++i) 
            {
                // grab the current annotation file.
                BedFile *anno = _annoFiles[i];
                
                if (!_useNames && !_useScores && !_useIntervals) {
                    // add the label for this annotation file to tag if there is overlap
                    if (anno->anyHits(a.chrom, a.start, a.end, a.strand, 
                                      _sameStrand, _diffStrand, _overlapFraction, false))
                    {
                        annotations << _annoLabels[i] << ";";
                    }
                }
                // use the score field
                else if (!_useNames && _useScores && !_useIntervals) {
                    anno->allHits(a.chrom, a.start, a.end, a.strand, 
                                  hits, _sameStrand, _diffStrand, 0.0, false);
                    for (size_t i = 0; i < hits.size(); ++i) {
                        annotations << hits[i].score;
                        if (i < hits.size() - 1) annotations << ",";
                    }
                    if (hits.size() > 0) annotations << ";";
                    hits.clear();
                }
                // use the name field from the annotation files to populate tag
                else if (_useNames && !_useScores && !_useIntervals) {
                    anno->allHits(a.chrom, a.start, a.end, a.strand, 
                                  hits, _sameStrand, _diffStrand, 0.0, false);
                    for (size_t j = 0; j < hits.size(); ++j) {
                        annotations << hits[j].name;
                        if (j < hits.size() - 1) annotations << ",";
                    }
                    if (hits.size() > 0) annotations << ";";
                    hits.clear();
                }
                // use the full interval information annotation files to populate tag
                else if (!_useNames && !_useScores && _useIntervals) {
                    anno->allHits(a.chrom, a.start, a.end, a.strand, 
                                  hits, _sameStrand, _diffStrand,  0.0, false);
                    for (size_t j = 0; j < hits.size(); ++j) {
                        annotations << _annoLabels[i]  << ":" << 
                                        hits[j].chrom  << ":" <<
                                        hits[j].start  << "-" <<
                                        hits[j].end    << "," <<
                                        hits[j].name   << "," <<
                                        hits[j].score  << "," <<
                                        hits[j].strand;
                        if (j < hits.size() - 1) annotations << ",";
                    }
                    if (hits.size() > 0) annotations << ";";
                    hits.clear();
                }
            }
            // were there any overlaps with which to make a tag?
            if (annotations.str().size() > 0) {
                al.AddTag(_tag, "Z", annotations.str().substr(0, annotations.str().size() - 1)); // get rid of the last ";"
            }
        }
        writer.SaveAlignment(al);
    }
    reader.Close();
    writer.Close();
    // close the annotations files;
    CloseAnnoFiles();
}
int 
main(int argc, char* argv[]) {

	// validate argument count
	if( argc != 2 ) {
		cerr << "USAGE: " << argv[0] << " <input BAM file> " << endl;
		return EXIT_FAILURE;
	}

	string filename = argv[1];
	//cerr << "Printing alignments from file: " << filename << endl;
	
	BamReader reader;
	if (!reader.Open(filename)) {
        cerr << "could not open filename " << filename << endl;
        return EXIT_FAILURE;
    }
    cerr << filename << ": Done opening" << endl;

    // Header can't be used to accurately determine sort order because samtools never
    // changes it; instead, check after loading each read as is done with "samtools index"

    // We don't need to load an index (right?)
	// if (!reader.LocateIndex()) {
    //     const string index_filename = filename + ".bai";
	//     if (!reader.OpenIndex(index_filename)) {
    //         cerr << "could not open index" << endl;
    //     }
    // }


    const SamHeader header = reader.GetHeader();
    cerr << filename << ": Done getting header" << endl;
    const RefVector refs = reader.GetReferenceData();
    cerr << filename << ": Done getting reference data" << endl;
	
    BamWriter writer;
    if (! output_bam_filename.empty()) {
        if (! writer.Open(output_bam_filename, header, refs)) {
            cerr << "Could not open BAM output file " << output_bam_filename << endl;
            return EXIT_FAILURE;
        }
        cerr << filename << ": Done opening BAM output file " << output_bam_filename << endl;
    }

    alignmentMap read1Map;  // a single map, for all reads awaiting their mate
    typedef map<string,int32_t> stringMap;
    typedef stringMap::iterator stringMapI;
    stringMap ref_mates;
    // alignmentMap read1Map, read2Map;

	BamAlignment full_al;
    int32_t count = 0;
    uint32_t max_reads_in_map = 0;
    int32_t n_reads_skipped_unmapped = 0;
    int32_t n_reads_skipped_mate_unmapped = 0;
    int32_t n_reads_skipped_wont_see_mate = 0;
    int32_t n_reads_skipped_mate_tail_est = 0;
    int32_t n_reads_skipped_ref_mate = 0;
    int32_t n_reads = 0;
    int32_t n_singleton_reads = 0;
    int32_t last_RefID = -1;
    int32_t last_Position = -1;

    cerr << filename << ": Looking for up to " << pairs_to_process << " link pairs,"
        << " total tail = " << link_pair_total_tail 
        << " critical tail = " << link_pair_crit_tail 
        << ", must be on diff chromosome = " << link_pair_diff_chrom << endl;

	while (reader.GetNextAlignment(full_al) 
           && (! pairs_to_process || count < pairs_to_process)) {

        BamAlignment al = full_al;

        //printAlignmentInfo(al, refs);
        //++count;
        ++n_reads;

        if (last_RefID < 0) last_RefID = al.RefID;
        if (last_Position < 0) last_Position = al.Position;
        if (al.RefID > last_RefID) {
            // We've moved to the next reference sequence
            // Clean up reads with mates expected here that haven't been seen
            if (debug_ref_mate) {
                cerr << "MISSED " << ref_mates.size() << " ref_mates on this reference "
                    << last_RefID << " " << refs[last_RefID].RefName << endl;
            }
            for (stringMapI rmI = ref_mates.begin(); rmI != ref_mates.end(); ++rmI) {
                ++n_reads_skipped_ref_mate;
                read1Map.erase(read1Map.find(rmI->first));
                ref_mates.erase(ref_mates.find(rmI->first));
            }
            last_RefID = al.RefID;
            last_Position = al.Position;
        } else if (! isCoordinateSorted(al.RefID, al.Position, last_RefID, last_Position)) {
            cerr << filename << " is not sorted, " << al.Name << " out of position" << endl;
            return EXIT_FAILURE;
        }

        if (! al.IsMapped()) { ++n_reads_skipped_unmapped; continue; }

        if (! al.IsMateMapped()) { ++n_reads_skipped_mate_unmapped; continue; }

        alignmentMapI mI = read1Map.find(al.Name);

        if (mI == read1Map.end()) {
            // the read name has not been seen before

            if (al.MateRefID < al.RefID
                || (al.MateRefID == al.RefID && al.MatePosition < al.Position)) {
                // we should have seen its mate earlier, so skip it
                ++n_reads_skipped_wont_see_mate;
                continue;
            }

            // If the mate likely to also be a link pair candidate, add the read
            int32_t mate_tail_est = readTailS(al.IsMateMapped(), al.IsMateReverseStrand(),
                            al.MatePosition, refs[al.MateRefID].RefLength, max_read_length);
            if (mate_tail_est <= mate_tail_est_crit) {
                // the mate tail estimate suggests it might be a link pair candidate
                read1Map[al.Name] = al;  // add the read to the map
            } else {
                // the mate tail estimate appears too long for the mate to be a candidate
                ++n_reads_skipped_mate_tail_est;
                continue;
            }

            if (read1Map.size() > max_reads_in_map) max_reads_in_map = read1Map.size();
            if (al.MateRefID == al.RefID && al.MatePosition >= al.Position) {
                // the mate is expected later on this contig
                ref_mates[al.Name] = al.MateRefID;
            }

        } else {
            // get the mate's alignment, and process the pair

            const BamAlignment& al_mate = mI->second;

            if (processReadPair(al, al_mate, refs, link_pair_total_tail, 
                                link_pair_crit_tail, link_pair_diff_chrom)) {
                ++count;

                // write to the new BAM file, if the string is not empty
                if (! output_bam_filename.empty()) {
                    writer.SaveAlignment(al_mate);  // the first one seen
                    writer.SaveAlignment(al);  // the second one seen
                }
            }

            read1Map.erase(mI);

            if (al.MateRefID == al.RefID) {
                stringMapI rmI = ref_mates.find(al.Name);
                if (rmI == ref_mates.end()) {
                    cerr << "expected a ref_mate, couldn't find its name: " << al.Name << endl;
                    return EXIT_FAILURE;
                }
                ref_mates.erase(rmI);
            }

        }

	}

	cerr << "===============================" << endl;
    cerr << read1Map.size() << " alignments left in read1Map" << endl;
    cerr << max_reads_in_map << " maximum number of reads in read1Map" << endl;
    cerr << count << " pairs processed" << endl;
	cerr << "===============================" << endl;
    cerr << n_reads << " total reads" << endl;
    cerr << n_singleton_reads << " singleton reads" << endl;
    cerr << n_reads_skipped_unmapped << " reads skipped because unmapped" << endl;
    cerr << n_reads_skipped_mate_unmapped << " reads skipped because mate unmapped" << endl;
    cerr << n_reads_skipped_wont_see_mate << " reads skipped because mate won't be seen" << endl;
    cerr << n_reads_skipped_mate_tail_est << " reads skipped because mate tail appears too long" << endl;
    cerr << n_reads_skipped_ref_mate << " reads skipped because mate not on reference" << endl;

	reader.Close();
    if (! output_bam_filename.empty()) {
	    writer.Close();
    }
	return EXIT_SUCCESS;
}
Beispiel #19
0
/*
    Intersects the alignments of a BAM file ("A") with the "B" BED file.

    bamFile: path handed to BamReader::Open; the process exits if it
             cannot be opened.

    With _bamOutput set, alignments are written as BAM to "stdout": a mapped
    alignment is kept when overlaps were found and _noHit is false, or when
    none were found and _noHit is true; unmapped alignments are kept only
    when _noHit is true.
    Without _bamOutput, a BedFile for "A" (bedType 12) is created and the
    overlap helpers (FindOverlaps / FindBlockedOverlaps) are called for each
    mapped alignment; nothing is written through the BAM writer.
    With _obeySplits set, overlaps are evaluated per alignment block.
*/
void BedIntersect::IntersectBam(string bamFile) {

    // load the "B" bed file into a map so
    // that we can easily compare "A" to it for overlaps
    _bedB = new BedFile(_bedBFile);
    _bedB->loadBedFileIntoMap();

    // create a dummy BED A file for printing purposes if not
    // using BAM output.
    if (_bamOutput == false) {
        _bedA = new BedFile(_bedAFile);
        _bedA->bedType = 12;
    }

    // open the BAM file; bail out early instead of silently reading nothing
    BamReader reader;
    BamWriter writer;
    if (!reader.Open(bamFile)) {
        cerr << "Failed to open BAM file " << bamFile << endl;
        exit(1);
    }

    // get header & reference information
    string bamHeader  = reader.GetHeaderText();
    RefVector refs    = reader.GetReferenceData();

    // open a BAM output to stdout if we are writing BAM
    if (_bamOutput == true) {
        // set compression mode
        BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
        if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed;
        writer.SetCompressionMode(compressionMode);
        // open our BAM writer
        writer.Open("stdout", bamHeader, refs);
    }

    vector<BED> hits;
    // reserve some space
    hits.reserve(100);

    BamAlignment bam;
    while (reader.GetNextAlignment(bam)) {

        // save an unaligned read if -v; the writer only exists in BAM
        // output mode, so never touch it otherwise.
        if (!bam.IsMapped()) {
            if ((_noHit == true) && (_bamOutput == true))
                writer.SaveAlignment(bam);
            continue;
        }

        // break alignment into discrete blocks,
        bedVector bed_blocks;
        string chrom = refs.at(bam.RefID).RefName;
        GetBamBlocks(bam, chrom, bed_blocks, false, true);

        // create a basic BED entry from the BAM alignment
        BED bed;
        MakeBedFromBam(bam, chrom, bed_blocks, bed);

        bool overlapsFound = false;
        if (_obeySplits == true)
        {
            // find the hits that overlap with the full span of the blocked BED
            _bedB->allHits(bed.chrom, bed.start, bed.end, bed.strand,
                           hits, _sameStrand, _diffStrand,
                           _overlapFraction, _reciprocal);
            // find the overlaps between the block in A and B
            overlapsFound = FindBlockedOverlaps(bed, bed_blocks, hits, _bamOutput);
        }
        else if (_bamOutput == true)
        {
            // BAM output only needs a yes/no answer
            overlapsFound = _bedB->anyHits(bed.chrom, bed.start, bed.end,
                                           bed.strand, _sameStrand, _diffStrand,
                                           _overlapFraction, _reciprocal);
        }
        else
        {
            FindOverlaps(bed, hits);
        }

        // save the BAM alignment if overlap reqs. were met
        if (_bamOutput == true) {
            if ( ((overlapsFound == true)  && (_noHit == false)) ||
                 ((overlapsFound == false) && (_noHit == true)) )
                writer.SaveAlignment(bam);
        }
        hits.clear();
    }

    // close the relevant BAM files.
    reader.Close();
    if (_bamOutput == true) {
        writer.Close();
    }
}
Beispiel #20
0
// Entry point of the window-based realigner.
//
// Steps performed, in order:
//   1. parse the command line (see long_options below);
//   2. redirect std::cerr to "bonsaiReport.txt" (when that file can be opened);
//   3. open the input BAM (-b FILE, or stdin with -s) and a BAM writer on stdout;
//   4. build the list of regions to visit (-r, -R, or one region per reference);
//   5. slide a window of `windowlen` bases over each selected region, hand the
//      reads gathered for the window to RealignFunction, and write alignments
//      to stdout through a position-keyed multimap;
//   6. print the run counters and restore std::cerr.
//
// Returns 0 on completion. Calls exit(1) on usage errors, a missing reference,
// an input that cannot be opened, or an output that cannot be opened.
int main (int argc, char** argv)
{

    // Command line echo that is written to the report log later on
    string ss(argv[0]);   // convert Char to String
    string commandline = "##Print Command line " + ss;

    int c;

    FastaReference* reference = NULL;
    int minbaseQ        = 10;   //default
    int windowlen       = 40;  //by default
    string regionstr;
    string RegionFile;
    string bamfile;
    bool STdin          = false;
    bool has_region     = false;
    bool has_regionFile = false;
    bool has_bamfile    = false;
    bool has_ref        = false;
    int ploidy         = 2;
    bool SetLowComplexityRegionSWGapExt = false;
    bool SetLowComplexityRegion = false;


    if (argc < 2)
    {
        printSummary(argv);
        exit(1);
    }

    while (true)
    {
        static struct option long_options[] =
        {
            {"help", no_argument, 0, 'h'},
            {"ploidy", required_argument, 0, 'p'},
            {"window-size", required_argument, 0, 'w'},
            {"reference", required_argument, 0, 'f'},
            {"min-base-quality", required_argument, 0,'q'},
            {"Region", required_argument, 0, 'R'},
            {"STdin", no_argument, 0, 's'},
            {"bam", required_argument, 0, 'b'},
            {"Repeat-Extgap", no_argument, 0, 'E'},
            {"LowCompex", no_argument, 0, 'l'},
            {0, 0, 0, 0}
        };

        int option_index = 0;

        // 's' is a plain flag; it is listed exactly once (a second "s:" entry
        // would contradict the no_argument long option above).
        c = getopt_long (argc, argv, "hslEf:q:w:r:R:p:b:", long_options, &option_index);

        /* Detect the end of the options. */
        if (c == -1)
            break;

        switch (c)
        {
            case 'f':
                reference = new FastaReference(optarg); // will exit on open failure
                commandline = commandline + " -f " + optarg;
                has_ref = true;
                break;

            case 'b':
                has_bamfile = true;
                bamfile = optarg;
                commandline = commandline + " -b " + optarg;
                break;

            case 'r':
                regionstr = optarg;
                has_region = true;
                commandline = commandline + " -r " + optarg;
                break;

            case 'R':
                RegionFile = optarg;
                has_regionFile = true;
                commandline = commandline + " -R " + optarg;
                break;

            case 's':
                STdin = true;
                commandline = commandline + " -s ";
                break;

            case 'q':
                minbaseQ = atoi(optarg);
                commandline = commandline + " -q " + optarg;
                break;

            case 'w':
                windowlen = atoi(optarg);
                commandline = commandline + " -w " + optarg;
                break;

            case 'p':
                ploidy = atoi(optarg);
                commandline = commandline + " -p " + optarg;
                break;

            case 'E':
                SetLowComplexityRegionSWGapExt = true;
                commandline = commandline + " -E ";
                break;

            case 'l':
                SetLowComplexityRegion = true;
                commandline = commandline + " -l ";
                break;

            case 'h':
                printSummary(argv);
                commandline = commandline + " -h ";
                exit(0);
                break;

            case '?':
                printSummary(argv);
                exit(1);
                break;

            default:
                abort();
                break;
        }
    }

    //// Open Error log files
    ofstream cerrlog("bonsaiReport.txt");
    streambuf *cerrsave = std::cerr.rdbuf();

    // Redirect stream buffers: from here on everything sent to cerr lands in the report file
    if (cerrlog.is_open())
        cerr.rdbuf(cerrlog.rdbuf());

    cerr << commandline << endl;


    //Check for Reference Fasta sequence
    if (!has_ref)
    {
        cerr << "no FASTA reference provided, cannot realign" << endl;
        exit(1);
    }

    ////Check for reader
    BamReader reader;
    if (STdin == true)
    {
        if (!reader.Open("stdin"))
        {
            cerr << "could not open stdin bam for reading" << endl;
            cerr << reader.GetErrorString() << endl;
            reader.Close();
            printSummary(argv);
            exit(1);    // nothing can be done without an input stream
        }
    }
    else
    {
        if (has_bamfile == true)
        {
            if (!reader.Open(bamfile))
            {
                cerr << "ERROR: could not open bam file " << bamfile << " ... Aborting" << endl;
                cerr << reader.GetErrorString() << endl;
                reader.Close();
                printSummary(argv);
                exit(1);    // do not continue with an unopened reader
            }

            if ( !reader.LocateIndex() )
                reader.CreateIndex();
        }
        else
        {
            cerr << "--bam flag is set but no bamfile is provided... Aborting" << endl;
            reader.Close();
            printSummary(argv);
            exit(1);    // neither -s nor -b was given
        }
    }

    //// Check Region Tags
    if ( (has_regionFile == true) && (has_region == true) )
    {
        cerr << "ERROR: You provide both region and has provide a Set Region List... Aborting" << endl;
        exit(1);
    }

    //// store the names of all the reference sequences in the BAM file
    vector<RefData> referencedata = reader.GetReferenceData();

    //// Store Region LIST
    vector<BamRegion> regionlist;
    if (has_region == true)
    {
        // single region from -r
        BamRegion region;
        ParseRegionString(regionstr, reader, region);
        regionlist.push_back(region);
    }
    else if (has_regionFile == true)
    {
        // one region per line of the -R file
        ifstream RG(RegionFile.c_str(), ios_base::in);
        string line;
        while(getline(RG,line))
        {
            BamRegion region;
            ParseRegionString(line, reader, region);
            regionlist.push_back(region);
        }
        RG.close();
    }
    else if ( (has_regionFile == false) && (has_region == false) )
    {
        // no region requested: one region per reference sequence
        for (int i= 0; i < (int)referencedata.size(); i++)
        {
            string wholeRefRegion = referencedata.at(i).RefName;
            BamRegion region;
            ParseRegionString(wholeRefRegion, reader, region);
            if (!reader.SetRegion(region)) // Bam region will get [0,101) = 0 to 100 => [closed, half-opened)
            {
                cerr << "ERROR: set region " << wholeRefRegion << " failed. Check that REGION describes a valid range... Aborting" << endl;
                reader.Close();
                exit(1);
            }
            else
                regionlist.push_back(region);
        }
    }

    //// Output BAM goes to stdout and reuses the input header and references
    BamWriter writer;
    if (!writer.Open("stdout", reader.GetHeaderText(), reader.GetReferenceData()))
    {
        cerr << "could not open stdout for writing" << endl;
        exit(1);
    }

    //// Smallest start position and Largest end position for Req Seq
    vector<RefData>::iterator refdataIter = referencedata.begin();
    vector<BamRegion>::iterator regionListIter = regionlist.begin();


    // CLASS
    RealignFunctionsClass RealignFunction;

    map<int, string> RefIDRedName;                                  // reference id -> reference name
    vector<SalRealignInfo> AlGroups;                                // reads tracked for the current window(s)
    multimap<int, BamAlignment> SortRealignedAlignmentsMultimap;    // alignments waiting to be written, keyed by Position

    int refid               = 0;
    BamAlignment alignment;
    bool IsNextAlignment = reader.GetNextAlignment(alignment);

    int windowrealigned     = 0;
    int TotalWindowDetected = 0;
    int TotalReadsAligned   = 0;
    int TotalWindow         = 0;
    int TotalReads          = 0;

    while (refdataIter != referencedata.end() )
    {
        string refname = refdataIter->RefName;
        RefIDRedName[refid] = refname;
        int reflength = refdataIter->RefLength;
        int winstartpos, winendpos;
        int AllowableBasesInWindow = 1;
        bool nextChrName = false;

        cerr << "##HeaderINFO: RefID = " << refdataIter->RefName << "\t" << "RefLen = " << reflength << endl;

        while (nextChrName == false )
        {
            // The region list is exhausted (or was empty): there is nothing
            // left to dereference, so move on to the next reference.
            if (regionListIter == regionlist.end())
                break;

            // [0] = smallest start, [1] = largest end; (-1, 0) is the reset state
            vector<int> minmaxRefSeqPos;
            minmaxRefSeqPos.push_back(-1);
            minmaxRefSeqPos.push_back(0);

            // NOTE(review): windows are only processed for the LAST reference and only
            // when -r or -R was given, so the whole-reference branch below is
            // unreachable as written -- confirm this condition is intended.
            if ((refid == (int)referencedata.size() - 1) && ((*regionListIter).LeftRefID == refid) && ((has_region==true) || (has_regionFile==true)) )
            {
                ////
                if ( (has_region == true) || (has_regionFile == true) )
                {
                    winstartpos = (*regionListIter).LeftPosition;
                    winendpos   = winstartpos + windowlen - 1;
                    reflength = (*regionListIter).RightPosition;
                    if (reflength < winendpos)
                        reflength = winendpos;

                    // Get Next Alignment First
                    if ( (refid == alignment.RefID) && (winstartpos == (*regionListIter).LeftPosition) && (IsNextAlignment == false) )
                        IsNextAlignment = reader.GetNextAlignment(alignment);
                }
                else if (has_region == false)
                {
                    winstartpos = 0;
                    winendpos   = winstartpos + windowlen - 1;

                    // Get Next Alignment First
                    if ( (refid == alignment.RefID) && (winstartpos == 0) && (IsNextAlignment == false) )
                        IsNextAlignment = reader.GetNextAlignment(alignment);
                }

                //// Slide the window over [winstartpos, reflength)
                while ((winstartpos < reflength))
                {
                    //// Check window end position
                    if (winendpos > reflength)
                        winendpos = reflength;

                    // Reinitialized
                    unsigned int NewReadMappedcount = 0;

                    //// Save and Erase alignments that are outside of window (Deque?)
                    if (!AlGroups.empty())
                    {
                        minmaxRefSeqPos.at(0) = -1;
                        minmaxRefSeqPos.at(1) = 0;

                        vector<SalRealignInfo>::iterator Iter = AlGroups.begin();

                        while (Iter != AlGroups.end())
                        {
                            // Alignment ends before the window: queue it for output and drop it
                            if ((*Iter).al.GetEndPosition() < winstartpos)
                            {
                                SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > ((*Iter).al.Position, (*Iter).al));
                                // erase() invalidates Iter; continue from the iterator it returns
                                Iter = AlGroups.erase(Iter);
                            }
                            else
                            {
                                string referenceSequence = reference->getSubSequence(RefIDRedName[(*Iter).al.RefID], (*Iter).al.Position, 2*(*Iter).al.Length);

                                // a previously realigned read is re-parsed from its start
                                if ((*Iter).HasRealign == true )
                                {
                                    (*Iter).currentReadPosition = 0;
                                    (*Iter).currentGenomeSeqPosition = 0;
                                    (*Iter).currentAlPosition = (*Iter).al.Position;
                                    (*Iter).cigarindex = 0;
                                }

                                (*Iter).CigarSoftclippingLength = 0;
                                SalRealignInfo talr = (*Iter);
                                RealignFunction.ParseAlignmentsAndExtractVariantsByBaseQualv(AlGroups, talr, Iter, (*Iter).al, referenceSequence, minmaxRefSeqPos, winstartpos, winendpos, (float) minbaseQ, false);

                                ++Iter; //Increment iterator
                            }
                        }
                    }


                    // Write Sorted Alignments that are outside of window
                    if (!SortRealignedAlignmentsMultimap.empty())
                    {
                        multimap<int, BamAlignment>::iterator sraIter = SortRealignedAlignmentsMultimap.begin();

                        while (sraIter != SortRealignedAlignmentsMultimap.end())
                        {
                            // Only flush entries that start well before the window (90% of a
                            // window length) and before the smallest tracked start position.
                            if (((float) (*sraIter).first < floor((float) (winstartpos - ((windowlen - 1)*0.9)))) && ((minmaxRefSeqPos.at(0) > 0) && ((*sraIter).first < minmaxRefSeqPos.at(0)))) {
                                if (!writer.SaveAlignment((*sraIter).second))
                                    cerr << writer.GetErrorString() << endl;

                                SortRealignedAlignmentsMultimap.erase(sraIter++);
                            } else {
                                ++sraIter;
                            }
                        }
                    }

                    // Gather Reads within a window frame.
                    // IsWithinWindow() result as used here: 0 -> parse the read into
                    // AlGroups (when mapped), 1 -> only queue it for output,
                    // anything else -> stop gathering for this window.
                    while ((IsNextAlignment) && (refid == alignment.RefID)) // Neeed more conditions
                    {
                        if (SetLowComplexityRegion == true)
                        {
                            string sequenceInWindow = reference->getSubSequence(RefIDRedName[alignment.RefID], winstartpos, (winendpos-winstartpos+1) );

                            if (IsWindowInRepeatRegion(sequenceInWindow) == true)
                            {
                                if ((IsWithinWindow(alignment, winstartpos, winendpos, AllowableBasesInWindow)) == 0)
                                {
                                    TotalReads++;
                                    if (alignment.IsMapped())
                                    {
                                        string referenceSequence = reference->getSubSequence(RefIDRedName[alignment.RefID], alignment.Position, 2*alignment.Length);

                                        vector<SalRealignInfo>::iterator tIter;
                                        SalRealignInfo alr;
                                        alr.al = alignment;
                                        alr.currentReadPosition = 0;
                                        alr.currentGenomeSeqPosition = 0;
                                        alr.currentAlPosition = alignment.Position;
                                        alr.cigarindex = 0;
                                        alr.HasRealign = false;
                                        alr.CigarSoftclippingLength = 0;

                                        // debug tracing for reads whose name contains this marker
                                        string str = "ZZZZZZZZZZZZZZZZZ";
                                        if (alignment.Name.find(str) != string::npos) {
                                            stringstream cigar;
                                            for (vector<CigarOp>::const_iterator cigarIter = alignment.CigarData.begin(); cigarIter != alignment.CigarData.end(); ++cigarIter)
                                                cigar << cigarIter->Length << cigarIter->Type;

                                            string cigarstr = cigar.str();
                                            cerr << "   TRACKING: " << alignment.RefID << " " << alignment.Name << " pos: " << alignment.Position << " cigar: " << cigarstr << endl;
                                        }

                                        RealignFunction.ParseAlignmentsAndExtractVariantsByBaseQualv(AlGroups, alr, tIter, alignment, referenceSequence, minmaxRefSeqPos, winstartpos, winendpos, (float) minbaseQ, true);
                                        NewReadMappedcount++;
                                    }
                                    else
                                    {
                                        SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > (alignment.Position, alignment));
                                        cerr << "UNmapped : " << alignment.Name << endl;
                                    }
                                }
                                else if ((IsWithinWindow(alignment, winstartpos, winendpos, AllowableBasesInWindow)) == 1)
                                {
                                    SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > (alignment.Position, alignment));
                                }
                                else
                                    break;
                            } else {
                                // window not flagged by IsWindowInRepeatRegion(): pass reads through untouched
                                if ((IsWithinWindow(alignment, winstartpos, winendpos, AllowableBasesInWindow)) < 2)
                                    SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > (alignment.Position, alignment));
                                else
                                    break;
                            }
                        }
                        else // (SetLowComplexityRegion == false)
                        {
                            if ((IsWithinWindow(alignment, winstartpos, winendpos, AllowableBasesInWindow)) == 0)
                            {
                                TotalReads++;
                                if (alignment.IsMapped())
                                {
                                    string referenceSequence = reference->getSubSequence(RefIDRedName[alignment.RefID], alignment.Position, 2 * alignment.Length);

                                    vector<SalRealignInfo>::iterator tIter;
                                    SalRealignInfo alr;
                                    alr.al = alignment;
                                    alr.currentReadPosition = 0;
                                    alr.currentGenomeSeqPosition = 0;
                                    alr.currentAlPosition = alignment.Position;
                                    alr.cigarindex = 0;
                                    alr.HasRealign = false;
                                    alr.CigarSoftclippingLength = 0;

                                    // debug tracing for reads whose name contains this marker
                                    string str = "ZZZZZZZZZZZZZZZZZ";
                                    if (alignment.Name.find(str) != string::npos)
                                    {
                                        stringstream cigar;
                                        for (vector<CigarOp>::const_iterator cigarIter = alignment.CigarData.begin(); cigarIter != alignment.CigarData.end(); ++cigarIter)
                                            cigar << cigarIter->Length << cigarIter->Type;

                                        string cigarstr = cigar.str();
                                        cerr << "   TRACKING: " << alignment.RefID << " " << alignment.Name << " pos: " << alignment.Position << " cigar: " << cigarstr << endl;
                                    }

                                    RealignFunction.ParseAlignmentsAndExtractVariantsByBaseQualv(AlGroups, alr, tIter, alignment, referenceSequence, minmaxRefSeqPos, winstartpos, winendpos, (float) minbaseQ, true);

                                    NewReadMappedcount++;
                                }
                                else
                                {
                                    SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > (alignment.Position, alignment));
                                    cerr << "UNmapped : " << alignment.Name << endl;
                                }
                            }
                            else if ((IsWithinWindow(alignment, winstartpos, winendpos, AllowableBasesInWindow)) == 1) {
                                SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > (alignment.Position, alignment));
                            }
                            else
                                break;
                        }

                        ////Get next alignment
                        IsNextAlignment = reader.GetNextAlignment(alignment);
                    }

                    //// Detector Corner
                    bool ToRealign = MeetIndelDetectorThresholdv(AlGroups);
                    cerr << "MeetIndelDetectorThresholdv(AlGroups).size()= " << AlGroups.size() << endl;

                    // **************
                    if (ToRealign)
                    {
                        ////// Perform Realign routines
                        int TotalAlR = 0; // Total number of alignments to be realigned
                        int NumAlR = 0; // Now many alignments are aligned
                        TotalWindowDetected++;

                        cerr << "#Start: Meet Threshold, Realigning ... " << endl;

                        // make sure the tracked span covers at least the current window
                        if (minmaxRefSeqPos.at(1) < winendpos)
                            minmaxRefSeqPos.at(1) = winendpos;

                        if (minmaxRefSeqPos.at(0) > winstartpos)
                            minmaxRefSeqPos.at(0) = winstartpos;

                        bool IsToRealign = RealignFunction.PruningByNaiveSelectionProcedureAndConstructHaplotypes2(winstartpos, winendpos, refid, refname, minmaxRefSeqPos, reference);

                        if (IsToRealign == true)
                        {
                            RealignFunction.SelectHaplotypeCandidates_SmithWatermanBSv(AlGroups, minmaxRefSeqPos, SetLowComplexityRegionSWGapExt);

                            minmaxRefSeqPos.at(0) = -1;
                            minmaxRefSeqPos.at(1) = 0;

                            int nextwinstartpos = winendpos + 1;
                            // NOTE(review): this is derived from winstartpos, not nextwinstartpos,
                            // unlike the window move at the bottom of the loop -- confirm intent.
                            int nextwinendpos = winstartpos + windowlen - 1;
                            if (nextwinendpos > reflength)
                                nextwinendpos = reflength;

                            RealignFunction.AdjustCigarsWRTChosenMultipleHaplotypesAndPrepareAlignmentsTobeWrittenOut(AlGroups, SortRealignedAlignmentsMultimap, reference, RefIDRedName, minmaxRefSeqPos, nextwinstartpos, nextwinendpos, minbaseQ, TotalAlR, NumAlR, ploidy);

                            TotalReadsAligned += NumAlR;

                            if (NumAlR > 0) // Realignment done
                                windowrealigned++;
                        }

                        // Always pair the "#Start" line above with a "#Done" line
                        // (a stray `else` used to swallow it after a realignment).
                        cerr << "#Done: Meet Threshold, Realigning ... " << endl;
                    }


                    if (NewReadMappedcount > 0)
                        TotalWindow++;

                    RealignFunction.Clear();

                    //// Move the window frame
                    winstartpos = winendpos + 1;
                    winendpos = winstartpos + windowlen - 1;
                }

                //// Save and Erase remaining alignments that are outside of window (Deque?)
                if ((!AlGroups.empty())) {
                    cerr << "#Start: Write Remaining alignments and delete all alignments" << endl;

                    for (vector<SalRealignInfo>::iterator Iter = AlGroups.begin(); Iter != AlGroups.end(); ++Iter) {
                        SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > ((*Iter).al.Position, (*Iter).al));
                    }

                    cerr << "#Done: Write Remaining alignments and delete all alignments" << endl;
                }

                AlGroups.clear();


                // Write Sorted remaining Alignments that are outside of window
                if (!SortRealignedAlignmentsMultimap.empty())
                {
                    for (multimap<int, BamAlignment>::iterator sraIter = SortRealignedAlignmentsMultimap.begin(); sraIter != SortRealignedAlignmentsMultimap.end(); ++sraIter)
                    {
                        if (!writer.SaveAlignment((*sraIter).second))
                            cerr << writer.GetErrorString() << endl;
                    }
                    SortRealignedAlignmentsMultimap.clear();
                }

            }

            // Advance to the next region; leave this reference when the list
            // ends or the next region belongs to a later reference.
            ++regionListIter;
            if ((regionListIter == regionlist.end()) || ((*regionListIter).LeftRefID > refid))
                nextChrName = true;
        }

        //// If End of the chromosome position
        //// increament iterator
        ++refdataIter;
        ++refid;
    }


    reader.Close();
    writer.Close();

    cerr << "##-Completed- " << endl;
    cerr << " Total Reads processed =  " << TotalReads << endl;
    cerr << " Total Reads Aligned =    " << TotalReadsAligned << endl;
    cerr << " Total Window processed = " << TotalWindow << endl;
    cerr << " Total Window Detected =  " << TotalWindowDetected << endl;
    cerr << " Total Windows Aligned =  " << windowrealigned << endl;


    // Restore cerr's stream buffer before terminating
    if (cerrlog.is_open())
        cerr.rdbuf(cerrsave);

    commandline.clear();
    return 0;
}
Beispiel #21
0
// Splits a single unsorted BAM file into multiple sorted temporary BAM files.
// Returns false when the input file cannot be opened, true otherwise.
bool SortTool::SortToolPrivate::GenerateSortedRuns(void) {

    // open the input; bail out early if that fails
    BamReader reader;
    if ( !reader.Open(m_settings->InputBamFilename) ) {
        cerr << "bamtools sort ERROR: could not open " << m_settings->InputBamFilename
             << " for reading... Aborting." << endl;
        return false;
    }

    // header text and reference data are shared by every temp/output file;
    // the header's sort order reflects the requested sorting mode
    SamHeader header = reader.GetHeader();
    if ( m_settings->IsSortingByName )
        header.SortOrder = Constants::SAM_HD_SORTORDER_QUERYNAME;
    else
        header.SortOrder = Constants::SAM_HD_SORTORDER_COORDINATE;
    m_headerText = header.ToString();
    m_references = reader.GetReferenceData();

    // buffer of alignments that make up the current run
    BamAlignment al;
    vector<BamAlignment> buffer;
    buffer.reserve( (size_t)(m_settings->MaxBufferCount*1.1) );

    // sorting by name needs the full character data, so the core-only
    // reader (faster) is used only when sorting by position
    const bool needFullRecord = m_settings->IsSortingByName;

    for (;;) {

        const bool gotAlignment = needFullRecord ? reader.GetNextAlignment(al)
                                                 : reader.GetNextAlignmentCore(al);
        if ( !gotAlignment )
            break;

        // a full buffer is written out as a sorted temp file only when a
        // mapped read arrives; unmapped reads are appended to the current
        // buffer so they are not split off into a separate temp file
        if ( buffer.size() >= m_settings->MaxBufferCount && al.IsMapped() )
            CreateSortedTempFile(buffer);

        // in every case the alignment ends up in the buffer
        buffer.push_back(al);
    }

    // whatever is left forms the final run
    if ( !buffer.empty() )
        CreateSortedTempFile(buffer);

    // done with the input
    reader.Close();
    return true;
}
Beispiel #22
0
//{{{bool sort_inter_chrom_bam(string in_file_name,
// Reads every alignment of the BAM file `in_file_name`, hands the records to
// create_sorted_temp_file() in batches, and finally returns the result of
// merge_sorted_files() for the batches produced.
//
// A batch is flushed whenever the in-memory buffer already holds
// SORT_DEFAULT_MAX_BUFFER_COUNT records when the next record arrives; each
// flush is tagged with a running batch index that starts at 0.
//
// Returns false (after reporting on stderr) if the input cannot be opened.
//
// NOTE(review): the buffer is never cleared in this function; presumably
// create_sorted_temp_file() empties it after writing - confirm.
// NOTE(review): the return value of create_sorted_temp_file() is ignored, and
// the per-batch files (presumably named out_file_name + index) are not removed
// here - confirm merge_sorted_files() takes care of cleanup.
bool sort_inter_chrom_bam(string in_file_name,
						  string out_file_name)
{
    BamReader reader;
    const bool opened = reader.Open(in_file_name);
    if ( !opened ) {
        cerr << "sort ERROR: could not open " << in_file_name
             << " for reading... Aborting." << endl;
        return false;
    }

    // make sure the header carries a version before it is serialised
    SamHeader header = reader.GetHeader();
    if ( !header.HasVersion() )
        header.Version = Constants::SAM_CURRENT_VERSION;

    string headerText = header.ToString();
    RefVector references = reader.GetReferenceData();

    // reserve a little more than the flush threshold
    vector<BamAlignment> pending;
    pending.reserve( static_cast<size_t>(SORT_DEFAULT_MAX_BUFFER_COUNT*1.1) );

    int batchCount = 0;
    BamAlignment record;
    while ( reader.GetNextAlignment(record) ) {

        // threshold reached: flush what we have before storing the new record
        if ( pending.size() >= SORT_DEFAULT_MAX_BUFFER_COUNT ) {
            create_sorted_temp_file(pending,
                                    out_file_name,
                                    batchCount,
                                    headerText,
                                    references);
            ++batchCount;
        }

        pending.push_back(record);
    }

    // flush whatever is left after the last full batch
    if ( !pending.empty() ) {
        create_sorted_temp_file(pending,
                                out_file_name,
                                batchCount,
                                headerText,
                                references);
        ++batchCount;
    }

    reader.Close();

    return merge_sorted_files(out_file_name, batchCount, headerText, references);
}
Beispiel #23
0
// Entry point of the read trimming / merging tool.
//
// After parsing the command line, one of two modes runs:
//   * fastq mode, selected by any of -fq1 / -fq2 / -fqo: records are read with
//     FastQParser, passed to MergeTrimReads::process_SR / process_PE and
//     written to the fastq output streams of `onereadgroup`;
//   * BAM mode otherwise: the last argument is opened as the input BAM, each
//     record is passed to MergeTrimReads::processSingle / processPair and
//     written to the BAM named by -o.
// A one-line tally is always printed to stderr; with --log a multi-line tally
// is also written to the given file.
//
// Returns 0 on success and 1 on usage, argument or file errors.
int main (int argc, char *argv[]) {

    bool produceUnCompressedBAM=false;
    bool verbose=false;
    bool ancientDNA=false;
    bool keepOrig=false;

    string adapter_F=options_adapter_F_BAM;
    string adapter_S=options_adapter_S_BAM;
    string adapter_chimera=options_adapter_chimera_BAM;
    string key="";
    bool allowMissing=false;
    int trimCutoff=1;

    bool allowAligned=false;
    bool printLog=false;
    string logFileName;

    BamReader reader;
    BamWriter writer;

    string bamFile;
    string bamFileOUT="";

    string key1;
    string key2;

    bool useDist=false;
    double location=-1.0;   // -1.0 means "not given on the command line"
    double scale   =-1.0;   // -1.0 means "not given on the command line"

    bool fastqFormat=false;
    string fastqfile1   = "";
    string fastqfile2   = "";
    string fastqoutfile = "";
    bool singleEndModeFQ=true;

    const string usage=string(string(argv[0])+

                              " [options] BAMfile"+"\n"+
                              "\nThis program takes an unaligned BAM where mates are consecutive\nor fastq files and trims and merges reads\n"+

                              "\n\tYou can specify a unaligned bam file or one or two fastq :\n"+
                              "\t\t"+"-fq1" +"\t\t"+"First fastq"+"\n"+
                              "\t\t"+"-fq2" +"\t\t"+"Second  fastq file (for paired-end)"+"\n"+
                              "\t\t"+"-fqo" +"\t\t"+"Output fastq prefix"+"\n\n"+
                              "\t"+"-o , --outfile" +"\t\t"+"Output (BAM format)."+"\n"+

                              "\t"+"-u            " +"\t\t"+"Produce uncompressed bam (good for pipe)"+"\n"+

                              "\t"+"--aligned" +"\t\t"+"Allow reads to be aligned (default "+boolStringify(allowAligned)+")"+"\n"+
                              "\t"+"-v , --verbose" +"\t\t"+"Turn all messages on (default "+boolStringify(verbose)+")"+"\n"+
                              "\t"+"--log [log file]" +"\t"+"Print a tally of merged reads to this log file (default only to stderr)"+"\n"+

                              "\n\t"+"Paired End merging/Single Read trimming  options"+"\n"+
                              "\t\t"+"You can specify either:"+"\n"+
                              "\t\t\t"+"--ancientdna"+"\t\t\t"+"ancient DNA (default "+boolStringify(ancientDNA)+")"+"\n"+
                              "\t\t"+"            "+"\t\t\t\t"+"this allows for partial overlap"+"\n"+
                              "\n\t\t"+"or if you know your size length distribution:"+"\n"+
                              "\t\t\t"+"--loc"+"\t\t\t\t"+"Location for lognormal dist. (default none)"+"\n"+
                              "\t\t\t"+"--scale"+"\t\t\t\t"+"Scale for lognormal dist. (default none)"+"\n"+
                              "\n\t\t"+"--keepOrig"+"\t\t\t\t"+"Write original reads if they are trimmed or merged  (default "+boolStringify(keepOrig)+")"+"\n"+
                              "\t\t\t\t\t\t\tSuch reads will be marked as PCR duplicates\n\n"+
                              "\t\t"+"-f , --adapterFirstRead" +"\t\t\t"+"Adapter that is observed after the forward read (def. Multiplex: "+options_adapter_F_BAM.substr(0,30)+")"+"\n"+
                              "\t\t"+"-s , --adapterSecondRead" +"\t\t"+"Adapter that is observed after the reverse read (def. Multiplex: "+options_adapter_S_BAM.substr(0,30)+")"+"\n"+
                              "\t\t"+"-c , --FirstReadChimeraFilter" +"\t\t"+"If the forward read looks like this sequence, the cluster is filtered out.\n\t\t\t\t\t\t\tProvide several sequences separated by comma (def. Multiplex: "+options_adapter_chimera_BAM.substr(0,30)+")"+"\n"+
                              "\t\t"+"-k , --key"+"\t\t\t\t"+"Key sequence with which each sequence starts. Comma separate for forward and reverse reads. (default '"+key+"')"+"\n"+
                              "\t\t"+"-i , --allowMissing"+"\t\t\t"+"Allow one base in one key to be missing or wrong. (default "+boolStringify(allowMissing)+")"+"\n"+
                              "\t\t"+"-t , --trimCutoff"+"\t\t\t"+"Lowest number of adapter bases to be observed for single Read trimming (default "+stringify(trimCutoff)+")");

    if( (argc== 1) ||
        (argc== 2 && string(argv[1]) == "-h") ||
        (argc== 2 && string(argv[1]) == "-help") ||
        (argc== 2 && string(argv[1]) == "--help") ){
        cout<<"Usage:"<<endl;
        cout<<""<<endl;
        cout<<usage<<endl;
        return 1;
    }

    // Options are looked for in every argument but the last one, so argv[i+1]
    // is always a valid index when an option takes a value.
    for(int i=1;i<(argc-1);i++){

        if(strcmp(argv[i],"-fq1") == 0 ){
            fastqfile1=string(argv[i+1]);
            fastqFormat=true;
            i++;
            continue;
        }

        if(strcmp(argv[i],"-fq2") == 0 ){
            fastqfile2=string(argv[i+1]);
            fastqFormat=true;
            singleEndModeFQ=false;
            i++;
            continue;
        }

        if(strcmp(argv[i],"-fqo") == 0 ){
            fastqoutfile=string(argv[i+1]);
            fastqFormat=true;
            i++;
            continue;
        }

        if(strcmp(argv[i],"--log") == 0 ){
            logFileName =string(argv[i+1]);
            printLog=true;
            i++;
            continue;
        }

        if(strcmp(argv[i],"-p") == 0 || strcmp(argv[i],"--PIPE") == 0 ){
            cerr<<"This version no longer works with pipe, exiting"<<endl;
            return 1;
        }

        if(strcmp(argv[i],"-u") == 0  ){
            produceUnCompressedBAM=true;
            continue;
        }

        if(strcmp(argv[i],"--aligned") == 0  ){
            allowAligned=true;
            continue;
        }

        if(strcmp(argv[i],"-o") == 0 || strcmp(argv[i],"--outfile") == 0 ){
            bamFileOUT =string(argv[i+1]);
            i++;
            continue;
        }

        if(strcmp(argv[i],"-v") == 0 || strcmp(argv[i],"--verbose") == 0 ){
            verbose=true;
            continue;
        }

        if(strcmp(argv[i],"--ancientdna") == 0 ){
            ancientDNA=true;
            continue;
        }

        if(strcmp(argv[i],"--keepOrig") == 0 ){
            keepOrig=true;
            continue;
        }

        if(strcmp(argv[i],"--loc") == 0 ){
            location =destringify<double>(argv[i+1]);
            i++;
            continue;
        }

        if(strcmp(argv[i],"--scale") == 0 ){
            scale =destringify<double>(argv[i+1]);
            i++;
            continue;
        }

        if(strcmp(argv[i],"-f") == 0 || strcmp(argv[i],"--adapterFirstRead") == 0 ){
            adapter_F =string(argv[i+1]);
            i++;
            continue;
        }

        if(strcmp(argv[i],"-s") == 0 || strcmp(argv[i],"--adapterSecondRead") == 0 ){
            adapter_S =string(argv[i+1]);
            i++;
            continue;
        }

        if(strcmp(argv[i],"-c") == 0 || strcmp(argv[i],"--FirstReadChimeraFilter") == 0 ){
            adapter_chimera =string(argv[i+1]);
            i++;
            continue;
        }

        // The usage text advertises "--key"; "--keys" is kept for backward
        // compatibility because it is what used to be accepted.
        if(strcmp(argv[i],"-k") == 0 ||
           strcmp(argv[i],"--key") == 0 ||
           strcmp(argv[i],"--keys") == 0 ){
            key =string(argv[i+1]);
            i++;
            continue;
        }

        if(strcmp(argv[i],"-i") == 0 || strcmp(argv[i],"--allowMissing") == 0 ){
            allowMissing=true;
            continue;
        }

        if(strcmp(argv[i],"-t") == 0 || strcmp(argv[i],"--trimCutoff") == 0 ){
            trimCutoff=atoi(argv[i+1]);
            i++;
            continue;
        }

        cerr<<"Unknown option "<<argv[i] <<" exiting"<<endl;
        return 1;
    }

    // Only meaningful in BAM mode; in fastq mode the last argument may be the
    // value of an option.
    bamFile=argv[argc-1];

    // --loc and --scale only make sense together
    if( (location != -1.0 && scale == -1.0) ||
        (location == -1.0 && scale != -1.0) ){
        cerr<<"Cannot specify only one of --loc and --scale, both are required"<<endl;
        return 1;
    }

    if( (location != -1.0 && scale != -1.0) ){
        useDist=true;

        if(ancientDNA){
            cerr<<"Cannot specify --loc/--scale and --ancientdna"<<endl;
            return 1;
        }
    }

    // Split the key BEFORE building the MergeTrimReads object: the object is
    // handed key1/key2 at construction, so splitting afterwards (as was done
    // previously, and only in BAM mode) left both keys empty.
    if(key != ""){
        const size_t found=key.find(",");
        if (found == string::npos){ //single key, forward reads only
            key1=key;
            key2="";
        } else{                     //forward,reverse
            key1=key.substr(0,found);
            key2=key.substr(found+1);
        }
    }

    MergeTrimReads mtr (adapter_F,adapter_S,adapter_chimera,
                        key1,key2,
                        trimCutoff,allowMissing,ancientDNA,location,scale,useDist);

    fqwriters onereadgroup;

    if(fastqFormat){

        if( bamFileOUT != ""  || produceUnCompressedBAM || allowAligned){
            cerr<<"ERROR : Cannot specify options like -o, -u or --aligned for fastq"<<endl;
            return 1;
        }

        if(fastqfile1 == ""){
            cerr<<"ERROR : Must specify as least the first file for fastq"<<endl;
            return 1;
        }

        // fqp2 stays null in single-end mode
        FastQParser * fqp1 = 0;
        FastQParser * fqp2 = 0;

        if(singleEndModeFQ){
            fqp1 = new FastQParser (fastqfile1);

            string outdirs   = fastqoutfile+".fq.gz";
            string outdirsf  = fastqoutfile+".fail.fq.gz";

            onereadgroup.single.open(outdirs.c_str(), ios::out);
            onereadgroup.singlef.open(outdirsf.c_str(), ios::out);

            if(!onereadgroup.single.good()){       cerr<<"Cannot write to file "<<outdirs<<endl; return 1; }
            if(!onereadgroup.singlef.good()){      cerr<<"Cannot write to file "<<outdirsf<<endl; return 1; }

        }else{
            fqp1 = new FastQParser (fastqfile1);
            fqp2 = new FastQParser (fastqfile2);

            string outdirs   = fastqoutfile+".fq.gz";
            string outdir1   = fastqoutfile+"_r1.fq.gz";
            string outdir2   = fastqoutfile+"_r2.fq.gz";

            string outdirsf  = fastqoutfile+".fail.fq.gz";
            string outdir1f  = fastqoutfile+"_r1.fail.fq.gz";
            string outdir2f  = fastqoutfile+"_r2.fail.fq.gz";

            onereadgroup.single.open(outdirs.c_str(), ios::out);
            onereadgroup.pairr1.open(outdir1.c_str(), ios::out);
            onereadgroup.pairr2.open(outdir2.c_str(), ios::out);

            onereadgroup.singlef.open(outdirsf.c_str(), ios::out);
            onereadgroup.pairr1f.open(outdir1f.c_str(), ios::out);
            onereadgroup.pairr2f.open(outdir2f.c_str(), ios::out);

            if(!onereadgroup.single.good()){       cerr<<"Cannot write to file "<<outdirs<<endl; return 1; }
            if(!onereadgroup.pairr1.good()){       cerr<<"Cannot write to file "<<outdir1<<endl; return 1; }
            if(!onereadgroup.pairr2.good()){       cerr<<"Cannot write to file "<<outdir2<<endl; return 1; }

            if(!onereadgroup.singlef.good()){      cerr<<"Cannot write to file "<<outdirsf<<endl; return 1; }
            if(!onereadgroup.pairr1f.good()){      cerr<<"Cannot write to file "<<outdir1f<<endl; return 1; }
            if(!onereadgroup.pairr2f.good()){      cerr<<"Cannot write to file "<<outdir2f<<endl; return 1; }

        }

        // NOTE(review): ownership of the FastQObj returned by getData() is not
        // visible here; presumably the parser owns it - confirm.
        while(fqp1->hasData()){

            FastQObj * fo1=fqp1->getData();
            vector<string> def1=allTokens( *(fo1->getID()), ' '  );
            string def1s=def1[0];

            if(!singleEndModeFQ){
                // the second file must supply a mate for every record of the first
                if(!fqp2->hasData()){
                    cerr << "ERROR: Discrepency between fastq files at record " <<  *(fo1->getID()) <<endl;
                    return 1;
                }

                FastQObj * fo2=fqp2->getData();
                vector<string> def2=allTokens( *(fo2->getID()), ' ' );
                string def2s=def2[0];

                // normalise both names (drop "/1", "/2" and the leading "@")
                // so that mates can be compared
                if(strEndsWith(def1s,"/1")){
                    def1s=def1s.substr(0,def1s.size()-2);
                }
                if(strEndsWith(def2s,"/2")){
                    def2s=def2s.substr(0,def2s.size()-2);
                }

                if(strBeginsWith(def1s,"@")){
                    def1s=def1s.substr(1,def1s.size()-1);
                }
                if(strBeginsWith(def2s,"@")){
                    def2s=def2s.substr(1,def2s.size()-1);
                }

                if(def1s != def2s){
                    cerr << "ERROR: Discrepency between fastq files, different names " << *(fo1->getID()) <<" and "<< *(fo2->getID()) <<endl;
                    return 1;
                }

                merged result=  mtr.process_PE(*(fo1->getSeq()),*(fo1->getQual()),
                                               *(fo2->getSeq()),*(fo2->getQual()));

                mtr.incrementCountall();

                if(result.code != ' '){ //keys or chimeras: pair goes to the "fail" files

                    if(result.code == 'K'){
                        mtr.incrementCountfkey();
                    }else{
                        if(result.code == 'D'){
                            mtr.incrementCountchimera();
                        }else{
                            cerr << "leehom: Wrong return code =\""<<result.code<<"\""<<endl;
                            exit(1);
                        }
                    }

                    onereadgroup.pairr2f<<"@"<<def2s<<"/2" <<endl <<*(fo2->getSeq())<<endl<<"+"<<endl <<*(fo2->getQual())<<endl;
                    onereadgroup.pairr1f<<"@"<<def1s<<"/1" <<endl <<*(fo1->getSeq())<<endl<<"+"<<endl <<*(fo1->getQual())<<endl;
                    continue;

                }else{
                    if(result.sequence != ""){ //mates were merged into one sequence
                        onereadgroup.single<<"@"<<def1s<<"" <<endl << result.sequence<<endl<<"+"<<endl <<result.quality<<endl;

                        // a merged sequence longer than both mates is tallied separately
                        if( result.sequence.length() > max(fo1->getSeq()->length(),fo2->getSeq()->length()) ){
                            mtr.incrementCountmergedoverlap();
                        }else{
                            mtr.incrementCountmerged();
                        }

                    }else{ //keep the pair as is
                        mtr.incrementCountnothing();

                        onereadgroup.pairr2<<"@"<<def2s<<"/2" <<endl <<*(fo2->getSeq())<<endl<<"+"<<endl <<*(fo2->getQual())<<endl;
                        onereadgroup.pairr1<<"@"<<def1s<<"/1" <<endl <<*(fo1->getSeq())<<endl<<"+"<<endl <<*(fo1->getQual())<<endl;
                    }
                }

            }else{

                merged result=mtr.process_SR(*(fo1->getSeq()),*(fo1->getQual()));
                mtr.incrementCountall();

                if(result.code != ' '){ //either chimera or missing key

                    if(result.code == 'K'){
                        mtr.incrementCountfkey();
                    }else{
                        if(result.code == 'D'){
                            mtr.incrementCountchimera();
                        }else{
                            cerr << "leehom: Wrong return code =\""<<result.code<<"\""<<endl;
                            exit(1);
                        }
                    }

                    onereadgroup.singlef<<*(fo1->getID()) <<endl << *(fo1->getSeq())<<endl<<"+"<<endl <<*(fo1->getQual())<<endl;
                    continue;
                }

                if(result.sequence != ""){ //trimmed sequence
                    mtr.incrementCounttrimmed();
                    onereadgroup.single<<*(fo1->getID()) <<endl << result.sequence<<endl<<"+"<<endl <<result.quality<<endl;
                }else{
                    mtr.incrementCountnothing();
                    onereadgroup.single<<*(fo1->getID()) <<endl << *(fo1->getSeq())<<endl<<"+"<<endl <<*(fo1->getQual())<<endl;
                }

            }

        }

        delete fqp1;
        if(!singleEndModeFQ){
            delete fqp2;
        }

        if(singleEndModeFQ){

            onereadgroup.single.close();
            onereadgroup.singlef.close();

        }else{
            onereadgroup.single.close();
            onereadgroup.pairr1.close();
            onereadgroup.pairr2.close();

            onereadgroup.singlef.close();
            onereadgroup.pairr1f.close();
            onereadgroup.pairr2f.close();
        }

        //fastq
    }else{
        //else BAM

        if( bamFileOUT == ""  ){
            cerr<<"The output must be a be specified, exiting"<<endl;
            return 1;
        }

        if ( !reader.Open(bamFile) ) {
            cerr << "Could not open input BAM file  "<<bamFile << endl;
            return 1;
        }
        SamHeader header = reader.GetHeader();

        // record this invocation in the output header
        string pID          = "mergeTrimReadsBAM";
        string pName        = "mergeTrimReadsBAM";
        string pCommandLine = "";
        for(int i=0;i<(argc);i++){
            pCommandLine += (string(argv[i])+" ");
        }
        putProgramInHeader(&header,pID,pName,pCommandLine,returnGitHubVersion(string(argv[0]),".."));

        const RefVector references = reader.GetReferenceData();
        //we will not call bgzip with full compression, good for piping into another program to
        //lessen the load on the CPU
        if(produceUnCompressedBAM)
            writer.SetCompressionMode(BamWriter::Uncompressed);

        if ( !writer.Open(bamFileOUT,header,references) ) {
            cerr << "Could not open output BAM file "<<bamFileOUT << endl;
            return 1;
        }

        //Up to the user to be sure that a sequence is followed by his mate;
        //the sort order declared in the header is not checked.

        BamAlignment al;
        BamAlignment al2;      // first mate of a pair, held until the second arrives
        bool al2Null=true;     // true while no first mate is being held

        while ( reader.GetNextAlignment(al) ) {

            if(al.IsMapped() || al.HasTag("NM") || al.HasTag("MD")  ){
                if(!allowAligned){
                    cerr << "Reads should not be aligned" << endl;
                    return 1;
                }else{
                    //should we remove tags ?
                }
            }

            if(al.IsPaired() &&
               al2Null ){
                // first mate: hold it and wait for the next record
                al2=al;
                al2Null=false;
                continue;
            }else{
                if(al.IsPaired() &&
                   !al2Null){

                    // Take the copies BEFORE processPair() touches the records,
                    // as the single-end branch does; copying afterwards would
                    // save the processed records instead of the original ones.
                    BamAlignment orig;
                    BamAlignment orig2;
                    if(keepOrig){
                        orig2 = al2;
                        orig  = al;
                    }

                    bool  result =  mtr.processPair(al,al2);

                    if( result ){//was merged; only "al" is written, the second record is empty

                        writer.SaveAlignment(al);

                        if(keepOrig){
                            orig.SetIsDuplicate(true);
                            orig2.SetIsDuplicate(true);
                            writer.SaveAlignment(orig2);
                            writer.SaveAlignment(orig);
                        }

                    }else{
                        //keep the sequences as pairs

                        writer.SaveAlignment(al2);
                        writer.SaveAlignment(al);
                    }

                    //
                    //  SINGLE END
                    //
                }else{
                    BamAlignment orig;
                    if(keepOrig){
                        orig =al;
                    }
                    mtr.processSingle(al);

                    if(keepOrig){
                        //write the original as a duplicate only if the read length changed
                        if(orig.QueryBases.length()  != al.QueryBases.length()){
                            orig.SetIsDuplicate(true);
                            writer.SaveAlignment(orig);
                        }
                    }
                    writer.SaveAlignment(al);

                } //end single end
                al2Null=true;
            }//second pair

        } //while al

        // NOTE(review): a first mate still held in al2 when the input ends (or
        // when an unpaired record follows it) is never written - confirm
        // whether such orphans should be reported or saved.
        reader.Close();
        writer.Close();

    } //else BAM

    cerr <<mtr.reportSingleLine()<<endl;

    if(printLog){
        ofstream fileLog;
        fileLog.open(logFileName.c_str());

        if (fileLog.is_open()){
            fileLog <<mtr.reportMultipleLines() <<endl;

        }else{
            cerr << "Unable to print to file "<<logFileName<<endl;
        }
        fileLog.close();
    }
    return 0;
}
Beispiel #24
0
int main (int argc, char *argv[]) {

    // bool mapped  =false;
    // bool unmapped=false;
    int bpToDecrease5=1;
    int bpToDecrease3=2;

    const string usage=string(string(argv[0])+" [options] input.bam out.bam"+"\n\n"+
			      "\tThis program takes a BAM file as input and produces\n"+
			      "\tanother where the putative deaminated bases have\n"+
			      "\ta base quality score of "+intStringify(baseQualForDeam)+"\n"+
			      "\tgiven an "+intStringify(offset)+" offset \n"+
			      "\n"+
			      "\tOptions:\n"+
			      "\t\t"+"-n5" +"\t\t\t"+"Decrease the nth bases surrounding the 5' ends (Default:"+stringify(bpToDecrease5)+") "+"\n"+
			      "\t\t"+"-n3" +"\t\t\t"+"Decrease the nth bases surrounding the 3' ends (Default:"+stringify(bpToDecrease3)+") "+"\n"
			      );
			      // "\t"+"-u , --unmapped" +"\n\t\t"+"For an unmapped bam file"+"\n"+
			      // "\t"+"-m , --mapped"   +"\n\t\t"+"For an mapped bam file"+"\n");
			      
			      

    if( (argc== 1) ||
	(argc== 2 && string(argv[1]) == "-h") ||
	(argc== 2 && string(argv[1]) == "-help") ||
	(argc== 2 && string(argv[1]) == "--help") ){
	cout<<"Usage:"<<endl;
	cout<<usage<<endl;
	cout<<""<<endl;
	return 1;
    }

    for(int i=1;i<(argc-2);i++){ //all but the last arg

    	if( string(argv[i]) == "-n5" ){
	    bpToDecrease5 = destringify<int>(argv[i+1]);
            i++;
    	    continue;
    	}

    	if( string(argv[i]) == "-n3" ){
	    bpToDecrease3 = destringify<int>(argv[i+1]);
            i++;
    	    continue;
    	}
       
    	cerr<<"Unknown option "<<argv[i] <<" exiting"<<endl;
    	return 1;
    }

    if(argc < 3){
	cerr<<"Error: Must specify the input and output BAM files";
	return 1;
    }

    string inbamFile =argv[argc-2];
    string outbamFile=argv[argc-1];

    if(inbamFile == outbamFile){
	cerr<<"Input and output files are the same"<<endl;
	return 1;    
    }
    // if(!mapped && !unmapped){
    // 	cerr << "Please specify whether you reads are mapped or unmapped" << endl;
    // 	return 1;
    // }
    // if(mapped && unmapped){
    // 	cerr << "Please specify either mapped or unmapped but not both" << endl;
    // 	return 1;
    // }

    BamReader reader;

    if ( !reader.Open(inbamFile) ) {
    	cerr << "Could not open input BAM files." << endl;
    	return 1;
    }


    vector<RefData>  testRefData=reader.GetReferenceData();
    SamHeader header = reader.GetHeader();
    string pID          = "decrQualDeaminated";   
    string pName        = "decrQualDeaminated";   
    string pCommandLine = "";
    for(int i=0;i<(argc);i++){
        pCommandLine += (string(argv[i])+" ");
    }
    putProgramInHeader(&header,pID,pName,pCommandLine);


    const RefVector references = reader.GetReferenceData();

    BamWriter writer;
    if ( !writer.Open(outbamFile, header, references) ) {
	cerr << "Could not open output BAM file" << endl;
	return 1;
    }

    BamAlignment al;
    // BamAlignment al2;
    // bool al2Null=true;
    
    while ( reader.GetNextAlignment(al) ) {

	    if(al.IsPaired() ){  

		if(al.IsFirstMate() ){ //5' end, need to check first base only
		    if(al.IsReverseStrand()){ //
			if(!al.IsMapped()){
			    cerr << "Cannot have reverse complemented unmapped reads :" <<al.Name<< endl;
			    //return 1;
			}
			int indexToCheck;

			//5' of first mate reversed
			indexToCheck=al.QueryBases.length()-1;
			for(int i=0;i<bpToDecrease5;i++){			
			    if(toupper(al.QueryBases[indexToCheck]) == 'A'){
				al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			    }
			    indexToCheck=max(indexToCheck-1,0);
			}

			
		    }else{
			int indexToCheck;
			//5' of first mate
			indexToCheck=0;
			for(int i=0;i<bpToDecrease5;i++){ //first base			
			    if(toupper(al.QueryBases[indexToCheck]) == 'T'){
				al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			    }
			    indexToCheck=min(indexToCheck+1,int(al.Qualities.size()));
			}

		    }


		}else{ //3' end, need to check last two bases only
		    if( al.IsSecondMate() ){
			if(al.IsReverseStrand()){ //
			    if(!al.IsMapped()){
				cerr << "Cannot have reverse complemented unmapped reads :" <<al.Name<< endl;
				//return 1;
			    }
			    int indexToCheck;

			    //3' of second mate reversed
			    indexToCheck=al.QueryBases.length()-1;
			    for(int i=0;i<bpToDecrease3;i++){			
				if(toupper(al.QueryBases[indexToCheck]) == 'T'){
				    al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
				}
				indexToCheck=max(indexToCheck-1,0);
			    }
			    

			}else{
			    int indexToCheck;
			    
			    //3' of second mate forward
			    indexToCheck=0;
			    for(int i=0;i<bpToDecrease3;i++){ //first base			
				if(toupper(al.QueryBases[indexToCheck]) == 'A'){
				    al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
				}
				indexToCheck=min(indexToCheck+1,int(al.Qualities.size()));
			    }

			}
		    }else{
			cerr << "Wrong state" << endl;
			return 1;
		    }
		}

	    }//end of paired end
	    else{//we consider single reads to have been sequenced from 5' to 3'

		if(al.IsReverseStrand()){ //need to consider 
		    if(!al.IsMapped()){
			cerr << "Cannot have reverse complemented unmapped reads :" <<al.Name<< endl;
			//return 1;
		    }

		    int indexToCheck;

		    //5' of single read reversed
		    indexToCheck=al.QueryBases.length()-1;
		    for(int i=0;i<bpToDecrease5;i++){			
			if(toupper(al.QueryBases[indexToCheck]) == 'A'){
			    al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			}
			indexToCheck=max(indexToCheck-1,0);
		    }

		    //3' of single read reversed
		    indexToCheck=0;
		    for(int i=0;i<bpToDecrease3;i++){ //first base			
			if(toupper(al.QueryBases[indexToCheck]) == 'A'){
			    al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			}
			indexToCheck=min(indexToCheck+1,int(al.Qualities.size()));
		    }

		    

		}else{

		    int indexToCheck;
		    
		    //5' of single read
		    indexToCheck=0;
		    for(int i=0;i<bpToDecrease5;i++){ //first base			
			if(toupper(al.QueryBases[indexToCheck]) == 'T'){
			    al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			}
			indexToCheck=min(indexToCheck+1,int(al.Qualities.size()));
		    }


		    //3' of single read
		    indexToCheck=al.QueryBases.length()-1;
		    for(int i=0;i<bpToDecrease3;i++){			
			if(toupper(al.QueryBases[indexToCheck]) == 'T'){
			    al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			}
			indexToCheck=max(indexToCheck-1,0);
		    }



		}
	    }//end of single end

	    writer.SaveAlignment(al);		


    }//    while ( reader.GetNextAlignment(al) ) {

    reader.Close();
    writer.Close();

    cerr<<"Program terminated gracefully"<<endl;   
    return 0;
}
Beispiel #25
0
// Parses a textual region specification into `region`, validating it against
// the references known to `reader` (a single BamReader).
//
// Forms handled:
//   "chrom"                      whole reference
//   "chrom:start"                from start to the end of chrom
//   "chrom:start..stop"          range on one reference
//   "chrom:start..chrom2:stop"   range running from chrom into chrom2
//
// Positions are converted with atoi(), so non-numeric text reads as 0.
//
// Returns true and fills `region` on success. Returns false, without touching
// `region`, when the string is empty, a reference name is unknown to `reader`,
// the start is negative or not below the start reference's length, or the stop
// is below -1 or above the stop reference's length. A stop of -1 (also used
// internally when no stop is given) selects the end of the stop reference.
bool ParseRegionString(const string& regionString,
                                  const BamReader& reader,
                                  BamRegion& region)
{
    // -------------------------------
    // parse region string

    // check first for empty string
    if ( regionString.empty() )
        return false;

    // non-empty string, look for a colon
    const size_t foundFirstColon = regionString.find(':');

    // chrom names and numeric positions;
    // the defaults describe "whole reference" (start 0, stop unspecified)
    string startChrom;
    string stopChrom;
    int startPos = 0;
    int stopPos  = -1;

    // no colon found
    // the entire string is the reference name and the whole reference is requested
    if ( foundFirstColon == string::npos ) {
        startChrom = regionString;
        stopChrom  = regionString;
    }

    // colon found, so we at least have some sort of startPos requested
    else {
        // store start chrom from beginning to first colon
        startChrom = regionString.substr(0,foundFirstColon);

        // look for ".." after the colon
        const size_t foundRangeDots = regionString.find("..", foundFirstColon+1);

        // no dots found
        // so we have a startPos but no range
        if ( foundRangeDots == string::npos ) {
            startPos   = atoi( regionString.substr(foundFirstColon+1).c_str() );
            stopChrom  = startChrom;
        }

        // ".." found, so we have some sort of range selected
        else {

            // startPos sits between the first colon and the range dots ".."
            startPos = atoi( regionString.substr(foundFirstColon+1, foundRangeDots-foundFirstColon-1).c_str() );

            // look for second colon
            const size_t foundSecondColon = regionString.find(':', foundRangeDots+1);

            // no second colon found
            // so we have a "standard" chrom:start..stop input format (on single chrom)
            if ( foundSecondColon == string::npos ) {
                stopChrom  = startChrom;
                stopPos    = atoi( regionString.substr(foundRangeDots+2).c_str() );
            }

            // second colon found
            // so we have a range requested across 2 chrom's
            else {
                stopChrom  = regionString.substr(foundRangeDots+2, foundSecondColon-(foundRangeDots+2));
                stopPos    = atoi( regionString.substr(foundSecondColon+1).c_str() );
            }
        }
    }

    // -------------------------------
    // validate reference IDs & genomic positions

    // negative coordinates are never valid; -1 is reserved for "no stop given"
    if ( startPos < 0 )  return false;
    if ( stopPos  < -1 ) return false;

    const RefVector& references = reader.GetReferenceData();

    // if startRefID not found, return false
    const int startRefID = reader.GetReferenceID(startChrom);
    if ( startRefID == -1 ) return false;

    // startPos cannot be greater than or equal to reference length
    const RefData& startReference = references.at(startRefID);
    if ( startPos >= startReference.RefLength ) return false;

    // if stopRefID not found, return false
    const int stopRefID = reader.GetReferenceID(stopChrom);
    if ( stopRefID == -1 ) return false;

    // stopPosition cannot be larger than reference length
    const RefData& stopReference = references.at(stopRefID);
    if ( stopPos > stopReference.RefLength ) return false;

    // if no stopPosition specified, set to reference end
    if ( stopPos == -1 ) stopPos = stopReference.RefLength;

    // -------------------------------
    // set up Region struct & return

    region.LeftRefID     = startRefID;
    region.LeftPosition  = startPos;
    region.RightRefID    = stopRefID;
    region.RightPosition = stopPos;

    return true;
}
Beispiel #26
0
int main (int argc, char *argv[]) {

    bool mapped  =false;
    bool unmapped=false;

    const string usage=string(string(argv[0])+" [options] input.bam out.bam"+"\n\n"+
			      "This program takes a BAM file as input and produces\n"+
			      "another where the putative deaminated bases have\n"+
			      "have been cut\n"+
			      "\n"+
			      "Options:\n");
			      // "\t"+"-u , --unmapped" +"\n\t\t"+"For an unmapped bam file"+"\n"+
			      // "\t"+"-m , --mapped"   +"\n\t\t"+"For an mapped bam file"+"\n");
			      
			      

    if( (argc== 1) ||
	(argc== 2 && string(argv[1]) == "-h") ||
	(argc== 2 && string(argv[1]) == "-help") ||
	(argc== 2 && string(argv[1]) == "--help") ){
	cout<<"Usage:"<<endl;
	cout<<usage<<endl;
	cout<<""<<endl;
	return 1;
    }

    // for(int i=1;i<(argc-1);i++){ //all but the last arg

    // 	if(strcmp(argv[i],"-m") == 0 || strcmp(argv[i],"--mapped") == 0 ){
    // 	    mapped=true;
    // 	    continue;
    // 	}

    // 	if(strcmp(argv[i],"-u") == 0 || strcmp(argv[i],"--unmapped") == 0 ){
    // 	    unmapped=true;
    // 	    continue;
    // 	}
       
    // 	cerr<<"Unknown option "<<argv[i] <<" exiting"<<endl;
    // 	return 1;
    // }

    if(argc != 3){
	cerr<<"Error: Must specify the input and output BAM files";
	return 1;
    }

    string inbamFile =argv[argc-2];
    string outbamFile=argv[argc-1];

    // if(!mapped && !unmapped){
    // 	cerr << "Please specify whether you reads are mapped or unmapped" << endl;
    // 	return 1;
    // }
    // if(mapped && unmapped){
    // 	cerr << "Please specify either mapped or unmapped but not both" << endl;
    // 	return 1;
    // }

    BamReader reader;

    if ( !reader.Open(inbamFile) ) {
    	cerr << "Could not open input BAM files." << endl;
    	return 1;
    }


    vector<RefData>  testRefData=reader.GetReferenceData();
    const SamHeader header = reader.GetHeader();
    const RefVector references = reader.GetReferenceData();

    BamWriter writer;
    if ( !writer.Open(outbamFile, header, references) ) {
	cerr << "Could not open output BAM file" << endl;
	return 1;
    }

    BamAlignment al;
    // BamAlignment al2;
    // bool al2Null=true;
    
    while ( reader.GetNextAlignment(al) ) {

	    if(al.IsPaired() ){  

		if(al.IsFirstMate() ){ //5' end, need to check first base only
		    if(al.IsReverseStrand()){ //
			if(!al.IsMapped()){
			    cerr << "Cannot have reverse complemented unmapped reads: " <<al.Name<< endl;
			    //return 1;
			}
			int indexToCheck;
			//last
			indexToCheck=al.QueryBases.length()-1;
			if(toupper(al.QueryBases[indexToCheck]) == 'A'){
			    //al.Qualities[indexToCheck]=char(offset+baseQualForDeam);			 
			    al.QueryBases = al.QueryBases.substr(0,indexToCheck);
			    al.Qualities  = al.Qualities.substr(0, indexToCheck);
			}
		    }else{
			int indexToCheck;
			//first base
			indexToCheck=0;
			if(toupper(al.QueryBases[indexToCheck]) == 'T'){
			    //al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			    al.QueryBases = al.QueryBases.substr(indexToCheck+1);
			    al.Qualities  = al.Qualities.substr(indexToCheck+1);
			}
		    }
		}else{ //3' end, need to check last two bases only
		    if( al.IsSecondMate() ){
			if(al.IsReverseStrand()){ //
			    if(!al.IsMapped()){
				cerr << "Cannot have reverse complemented unmapped reads: " <<al.Name<< endl;
				//return 1;
			    }
			    int indexToCheck;

			    //second to last
			    indexToCheck=al.QueryBases.length()-2;
			    if(toupper(al.QueryBases[indexToCheck]) == 'T'){
				//al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
				al.QueryBases = al.QueryBases.substr(0,indexToCheck);
				al.Qualities  = al.Qualities.substr(0, indexToCheck);
			    }else{
				//last
				indexToCheck=al.QueryBases.length()-1;
				if(toupper(al.QueryBases[indexToCheck]) == 'T'){
				    //al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
				    al.QueryBases = al.QueryBases.substr(0,indexToCheck);
				    al.Qualities  = al.Qualities.substr(0, indexToCheck);
				}
			    }
			}else{
			    int indexToCheck;
			    //second base
			    indexToCheck=1;
			    if(toupper(al.QueryBases[indexToCheck]) == 'A'){
				//al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
				al.QueryBases = al.QueryBases.substr(indexToCheck+1);
				al.Qualities  = al.Qualities.substr(indexToCheck+1);
			    }else{
				//first base
				indexToCheck=0;
				if(toupper(al.QueryBases[indexToCheck]) == 'A'){
				    //al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
				    al.QueryBases = al.QueryBases.substr(indexToCheck+1);
				    al.Qualities  = al.Qualities.substr(indexToCheck+1);
				}
			    }

			}
		    }else{
			cerr << "Wrong state" << endl;
			return 1;
		    }
		}

	    }//end of paired end
	    else{//we consider single reads to have been sequenced from 5' to 3'

		if(al.IsReverseStrand()){ //need to consider 
		    if(!al.IsMapped()){
			cerr << "Cannot have reverse complemented unmapped reads: " <<al.Name<< endl;
			//return 1;
		    }

		    int indexToCheck;



		    //second base
		    indexToCheck=1;
		    if(toupper(al.QueryBases[indexToCheck]) == 'A'){
			// al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			// cout<<"51 "<<al.QueryBases<<endl;
			// cout<<"51 "<<al.Qualities<<endl;
			al.QueryBases = al.QueryBases.substr(indexToCheck+1);
			al.Qualities  = al.Qualities.substr(indexToCheck+1);
			// cout<<"52 "<<al.QueryBases<<endl;
			// cout<<"52 "<<al.Qualities<<endl;
		    }else{
			//first base
			indexToCheck=0;
			if(toupper(al.QueryBases[indexToCheck]) == 'A'){
			    // al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			    // cout<<"61 "<<al.QueryBases<<endl;
			    // cout<<"61 "<<al.Qualities<<endl;
			    al.QueryBases = al.QueryBases.substr(indexToCheck+1);
			    al.Qualities  = al.Qualities.substr(indexToCheck+1);
			    // cout<<"62 "<<al.QueryBases<<endl;
			    // cout<<"62 "<<al.Qualities<<endl;
			}

		    }


		    //last
		    indexToCheck=al.QueryBases.length()-1;
		    if(toupper(al.QueryBases[indexToCheck]) == 'A'){
			// al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			// cout<<"21 "<<al.QueryBases<<endl;
			// cout<<"21 "<<al.Qualities<<endl;
			al.QueryBases = al.QueryBases.substr(0,indexToCheck);
			al.Qualities  = al.Qualities.substr(0, indexToCheck);
			// cout<<"22 "<<al.QueryBases<<endl;
			// cout<<"22 "<<al.Qualities<<endl;
		    }
		}else{

		    int indexToCheck;
		    //first base
		    indexToCheck=0;
		    if(toupper(al.QueryBases[indexToCheck]) == 'T'){
			// al.Qualities[indexToCheck]=char(offset+baseQualForDeam);

			// cout<<"11 "<<al.QueryBases<<endl;
			// cout<<"11 "<<al.Qualities<<endl;
			al.QueryBases = al.QueryBases.substr(indexToCheck+1);
			al.Qualities  = al.Qualities.substr(indexToCheck+1);
			// cout<<"12 "<<al.QueryBases<<endl;
			// cout<<"12 "<<al.Qualities<<endl;

		    }

		    //second to last
		    indexToCheck=al.QueryBases.length()-2;
		    if(toupper(al.QueryBases[indexToCheck]) == 'T'){
			// al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			// cout<<"31 "<<al.QueryBases<<endl;
			// cout<<"31 "<<al.Qualities<<endl;
			al.QueryBases = al.QueryBases.substr(0,indexToCheck);
			al.Qualities  = al.Qualities.substr(0, indexToCheck);
			// cout<<"32 "<<al.QueryBases<<endl;
			// cout<<"32 "<<al.Qualities<<endl;
		    }else{

			//last
			indexToCheck=al.QueryBases.length()-1;
			if(toupper(al.QueryBases[indexToCheck]) == 'T'){
			    // al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			    // cout<<"41 "<<al.QueryBases<<endl;
			    // cout<<"41 "<<al.Qualities<<endl;
			    al.QueryBases = al.QueryBases.substr(0,indexToCheck);
			    al.Qualities  = al.Qualities.substr(0, indexToCheck);
			    // cout<<"42 "<<al.QueryBases<<endl;
			    // cout<<"42 "<<al.Qualities<<endl;
			}

		    }
		}
	    }//end of single end

	    writer.SaveAlignment(al);		


    }//    while ( reader.GetNextAlignment(al) ) {

    reader.Close();
    writer.Close();
   
    return 0;
}
Beispiel #27
0
int main (int argc, char *argv[]) {

    int  minBaseQuality = 0;

    string usage=string(""+string(argv[0])+"  [in BAM file] [in VCF file] [chr name] [deam out BAM] [not deam out BAM]"+
			"\nThis program divides aligned single end reads into potentially deaminated\n"+
			"\nreads and the puts the rest into another bam file if the deaminated positions are not called as the alternative base in the VCF.\n"+
			"\nTip: if you do not need one of them, use /dev/null as your output\n"+
			"arguments:\n"+
			"\t"+"--bq  [base qual]   : Minimum base quality to flag a deaminated site (Default: "+stringify(minBaseQuality)+")\n"+
			"\n");

    if(argc == 1 ||
       argc < 4  ||
       (argc == 2 && (string(argv[0]) == "-h" || string(argv[0]) == "--help") )
       ){
	cerr << "Usage "<<usage<<endl;
	return 1;       
    }


    for(int i=1;i<(argc-2);i++){ 

	
        if(string(argv[i]) == "--bq"){
	    minBaseQuality=destringify<int>(argv[i+1]);
            i++;
            continue;
	}

    }

    string bamfiletopen = string( argv[ argc-5 ] );
    string vcffiletopen = string( argv[ argc-4 ] );
    string chrname      = string( argv[ argc-3 ] );
    string deambam      = string( argv[ argc-2 ] );
    string nondeambam   = string( argv[ argc-1 ] );

    //dummy reader, will need to reposition anyway
    VCFreader vcfr (vcffiletopen,
 		    vcffiletopen+".tbi",
 		    chrname,
 		    1,
 		    1,
 		    0);

    BamReader reader;
    
    if ( !reader.Open(bamfiletopen) ) {
    	cerr << "Could not open input BAM file"<< bamfiletopen << endl;
    	return 1;
    }

    // if ( !reader.LocateIndex()  ) {
    // 	cerr << "The index for the BAM file cannot be located" << endl;
    // 	return 1;
    // }

    // if ( !reader.HasIndex()  ) {
    // 	cerr << "The BAM file has not been indexed." << endl;
    // 	return 1;
    // }

    //positioning the bam file
    int refid=reader.GetReferenceID(chrname);
    if(refid < 0){
	cerr << "Cannot retrieve the reference ID for "<< chrname << endl;
	return 1;
    }
    //cout<<"redif "<<refid<<endl;	    

    //setting the BAM reader at that position
    reader.SetRegion(refid,
		     0,
		     refid,
		     -1); 	



    vector<RefData>  testRefData=reader.GetReferenceData();
    const SamHeader header = reader.GetHeader();
    const RefVector references = reader.GetReferenceData();

    BamWriter writerDeam;
    if ( !writerDeam.Open(deambam,      header, references) ) {
	cerr << "Could not open output BAM file" << endl;
	return 1;
    }

    BamWriter writerNoDeam;
    if ( !writerNoDeam.Open(nondeambam, header, references) ) {
	cerr << "Could not open output BAM file" << endl;
	return 1;
    }



    unsigned int totalReads      =0;
    unsigned int deaminatedReads =0;
    unsigned int ndeaminatedReads =0;
    unsigned int skipped      =0;



    //iterating over the alignments for these regions
    BamAlignment al;
    int i;

    while ( reader.GetNextAlignment(al) ) {
	// cerr<<al.Name<<endl;

	//skip unmapped
	if(!al.IsMapped()){
	    skipped++;
	    continue;
	}

	//skip paired end !
	if(al.IsPaired() ){  
	    continue;
	    // cerr<<"Paired end not yet coded"<<endl;
	    // return 1;
	}


	string reconstructedReference = reconstructRef(&al);



	char refeBase;
	char readBase;
	bool isDeaminated;
	if(al.Qualities.size() != reconstructedReference.size()){
	    cerr<<"Quality line is not the same size as the reconstructed reference"<<endl;
	    return 1;
	}

	isDeaminated=false;

	if(al.IsReverseStrand()){

	    //first base next to 3'
	    i = 0 ;
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);

	    if(  readBase  == 'A' && int(al.Qualities[i]-offset) >= minBaseQuality){  //isDeaminated=true; }

		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		transformRef(&refeBase,&readBase);

		vcfr.repositionIterator(chrname,al.Position+1,al.Position+1);
		while(vcfr.hasData()){
		    SimpleVCF * toprint=vcfr.getData();
		    // cout<<*toprint<<endl;
		    //skip deletions in the alt
		    if(toprint->getRef().length() != 1 )
			continue;

		    if(toprint->getRef()[0] != refeBase){
			cerr<<reconstructedReference<<endl;
			cerr<<al.Position<<endl;			
			cerr<<numberOfDeletions(&al)<<endl;			
			cerr<<"Problem1 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
			exit(1);
		    }
		    
		    //if the VCF has a at least one G but no A
		    if(  toprint->hasAtLeastOneG() && 
			!toprint->hasAtLeastOneA() ){
			isDeaminated=true; 
		    }
		}

	    }


	    //second base next to 3'
	    i = 1;
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);

	    //refeBase  == 'G'  &&
	    if( readBase  == 'A' &&  int(al.Qualities[i]-offset) >= minBaseQuality){  //isDeaminated=true; }

		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		transformRef(&refeBase,&readBase);

		vcfr.repositionIterator(chrname,al.Position+2,al.Position+2);

		while(vcfr.hasData()){
		    SimpleVCF * toprint=vcfr.getData();
		    // cout<<*toprint<<endl;
		    //skip deletions in the alt
		    if(toprint->getRef().length() != 1 )
			continue;

		    if(toprint->getRef()[0] != refeBase){
			cerr<<reconstructedReference<<endl;
			cerr<<al.Position<<endl;
			cerr<<numberOfDeletions(&al)<<endl;
			cerr<<"Problem2 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
			exit(1);
		    }

		    //if the VCF has at least one G but no A 
		    // if(toprint->hasAtLeastOneG() &&
		    //    toprint->getAlt().find("A") == string::npos){
		    if(  toprint->hasAtLeastOneG() && 
			!toprint->hasAtLeastOneA() ){
			isDeaminated=true; 
		    }
		}
	    }

	    //last  base next to 5'
	    i = (al.QueryBases.length()-1) ;
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);
	    //refeBase  == 'G'  &&
	    if( readBase  == 'A' &&  int(al.Qualities[i]-offset) >= minBaseQuality){  //isDeaminated=true; }

		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		transformRef(&refeBase,&readBase);


		int lengthMatches=countMatchesRecons(reconstructedReference,0);		
		int positionJump = al.Position+lengthMatches+numberOfDeletions(&al);
		vcfr.repositionIterator(chrname,positionJump,positionJump);
		while(vcfr.hasData()){
		    SimpleVCF * toprint=vcfr.getData();

		    //skip deletions in the alt
		    if(toprint->getRef().length() != 1 )
			continue;
		    
		    if(toprint->getRef()[0] != refeBase){
			cerr<<reconstructedReference<<endl;
			cerr<<al.Position<<endl;
			cerr<<lengthMatches<<endl;
			cerr<<numberOfDeletions(&al)<<endl;
			cerr<<positionJump<<endl;
			cerr<<"Problem3 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
			exit(1);
		    }


		    //if the VCF has at least one G but no A
		    if(  toprint->hasAtLeastOneG() && 
			!toprint->hasAtLeastOneA() ){
			isDeaminated=true; 
		    }
		}

	    }

	}else{

		
	    //first base next to 5'
	    i = 0;
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);
	    //refeBase  == 'C' 
	    if( readBase  == 'T' &&  int(al.Qualities[i]-offset) >= minBaseQuality){ 

		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		transformRef(&refeBase,&readBase);

		vcfr.repositionIterator(chrname,al.Position+1,al.Position+1);
		while(vcfr.hasData()){
		    SimpleVCF * toprint=vcfr.getData();
		    //cout<<*toprint<<endl;
		    //skip deletions in the alt
		    if(toprint->getRef().length() != 1 )
			continue;
		    
		    if(toprint->getRef()[0] != refeBase){
			cerr<<reconstructedReference<<endl;
			cerr<<al.Position<<endl;
			cerr<<numberOfDeletions(&al)<<endl;			
			cerr<<"Problem4 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
			exit(1);
		    }

		    //if the VCF has at least one C but no T
		    if(  toprint->hasAtLeastOneC() && 
			!toprint->hasAtLeastOneT() ){
			isDeaminated=true; 
		    }

		}

		//cout<<al.Position+
		 
	    }

	    //second last base next to 3'
	    i = (al.QueryBases.length()-2);
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);
	    //refeBase  == 'C'  &&
	    if( readBase  == 'T' &&  int(al.Qualities[i]-offset) >= minBaseQuality){  



		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		transformRef(&refeBase,&readBase);		
		int lengthMatches=countMatchesRecons(reconstructedReference,1);	
		int positionJump = al.Position+lengthMatches+numberOfDeletions(&al);

		vcfr.repositionIterator(chrname,positionJump,positionJump);
		while(vcfr.hasData()){
		    SimpleVCF * toprint=vcfr.getData();
		    //skip deletions in the alt
		    if(toprint->getRef().length() != 1 )
			continue;

		    if(toprint->getRef()[0] != refeBase){
			cerr<<reconstructedReference<<endl;
			cerr<<al.Position<<endl;
			cerr<<lengthMatches<<endl;
			cerr<<numberOfDeletions(&al)<<endl;
			cerr<<positionJump<<endl;
			cerr<<"Problem5 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
			exit(1);
		    }

		    if(  toprint->hasAtLeastOneC() && 
			!toprint->hasAtLeastOneT() ){
			isDeaminated=true; 
		    }
		}

		 

	    }

	    //last base next to 3'
	    i = (al.QueryBases.length()-1);
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);
	    //&& refeBase  == 'C' 
	    if( readBase  == 'T'  && int(al.Qualities[i]-offset) >= minBaseQuality){  
		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		transformRef(&refeBase,&readBase);		

		int lengthMatches=countMatchesRecons(reconstructedReference,0);	
		int positionJump = al.Position+lengthMatches+numberOfDeletions(&al);

		vcfr.repositionIterator(chrname,positionJump,positionJump);
		while(vcfr.hasData()){
		    SimpleVCF * toprint=vcfr.getData();
		    //skip deletions in the alt
		    if(toprint->getRef().length() != 1 )
			continue;

		    if(toprint->getRef()[0] != refeBase){
			cerr<<reconstructedReference<<endl;
			cerr<<al.Position<<endl;
			cerr<<lengthMatches<<endl;
			cerr<<numberOfDeletions(&al)<<endl;
			cerr<<positionJump<<endl;
			cerr<<"Problem6 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
			exit(1);
		    }

		    if(  toprint->hasAtLeastOneC() && 
			!toprint->hasAtLeastOneT() ){
			isDeaminated=true; 
		    }
		}

	    }	

	   
	    
	}
		  



	totalReads++;

	if(isDeaminated){
	    deaminatedReads++;
	    writerDeam.SaveAlignment(al);		
	}else{
	    ndeaminatedReads++;
	    writerNoDeam.SaveAlignment(al);		
	}


    
    }//end for each read









    reader.Close();
    writerDeam.Close();
    writerNoDeam.Close();

    cerr<<"Program finished sucessfully, out of "<<totalReads<<" mapped reads (skipped: "<<skipped<<" reads) we flagged "<<deaminatedReads<<" as deaminated and "<<ndeaminatedReads<<" as not deaminated"<<endl;

   
    return 0;
}
int main (int argc, char *argv[]) {

    int  minBaseQuality = 0;

    string usage=string(""+string(argv[0])+"  [in BAM file] [in VCF file] [chr name] [deam out BAM] [not deam out BAM]"+
			"\nThis program divides aligned single end reads into potentially deaminated\n"+
			"\nreads and the puts the rest into another bam file if the deaminated positions are not called as the alternative base in the VCF.\n"+
			"\nThis is like filterDeaminatedVCF but it loads the VCF before then labels the reads instead of doing it on the fly\n"+
			"\nwhich is good if you have many reads in the bam file.\n"+			
			"\nTip: if you do not need one of them, use /dev/null as your output\n"+
			"\narguments:\n"+
			"\t"+"--bq    [base qual]  : Minimum base quality to flag a deaminated site (Default: "+stringify(minBaseQuality)+")\n"+
			"\t"+"--1000g [vcf file]   : VCF file from 1000g to get the putative A and T positions in modern humans (Default: "+vcf1000g+")\n"+
			"\n");

    if(argc == 1 ||
       argc < 4  ||
       (argc == 2 && (string(argv[0]) == "-h" || string(argv[0]) == "--help") )
       ){
	cerr << "Usage "<<usage<<endl;
	return 1;       
    }


    for(int i=1;i<(argc-2);i++){ 

	
        if(string(argv[i]) == "--bq"){
	    minBaseQuality=destringify<int>(argv[i+1]);
            i++;
            continue;
	}

        if(string(argv[i]) == "--1000g"){
	    vcf1000g=string(argv[i+1]);
            i++;
            continue;
	}



    }

    unsigned int maxSizeChromosome=250000000;//larger than chr1 hg19
    bool * hasCnoT;
    bool * hasGnoA;
    bool * thousandGenomesHasA;
    bool * thousandGenomesHasT;

    cerr<<"Trying to allocating memory"<<endl;
    try{
	hasCnoT              = new bool[ maxSizeChromosome ];
	hasGnoA              = new bool[ maxSizeChromosome ];
	thousandGenomesHasA  = new bool[ maxSizeChromosome ];
	thousandGenomesHasT  = new bool[ maxSizeChromosome ];
    }catch(bad_alloc& exc){
	cerr<<"ERROR: allocating memory failed"<<endl;
	return 1;
    }
    cerr<<"Success in allocating memory"<<endl;

    for(unsigned int i = 0;i<maxSizeChromosome;i++){
	hasCnoT[i]=false;
	hasGnoA[i]=false;
	thousandGenomesHasA[i]=false;
	thousandGenomesHasT[i]=false;
    }

    string bamfiletopen = string( argv[ argc-5 ] );
    string vcffiletopen = string( argv[ argc-4 ] );
    string chrname      = string( argv[ argc-3 ] );
    string deambam      = string( argv[ argc-2 ] );
    string nondeambam   = string( argv[ argc-1 ] );

    cerr<<"Reading consensus VCF "<<vcffiletopen<<"  ... "<<endl;

    VCFreader vcfr (vcffiletopen,
 		    // vcffiletopen+".tbi",
 		    // chrname,
 		    // 1,
 		    // maxSizeChromosome,
 		    0);

    
    while(vcfr.hasData()){
	SimpleVCF * toprint=vcfr.getData();

	if(toprint->getRef().length() != 1 )
	    continue;
	

	//if the VCF has a at least one G but no A
	if(  toprint->hasAtLeastOneG() && 
	     !toprint->hasAtLeastOneA() ){
	    hasGnoA[ toprint->getPosition() ] =true;
	}

	if(  toprint->hasAtLeastOneC() && 
	     !toprint->hasAtLeastOneT() ){
	    hasCnoT[ toprint->getPosition() ] =true;    
	}

	   
    }
    cerr<<"done reading VCF"<<endl;







    cerr<<"Reading 1000g VCF :"<<vcf1000g<<"  ..."<<endl;
    string line1000g;
    ifstream myFile1000g;
    
    myFile1000g.open(vcf1000g.c_str(), ios::in);

    if (myFile1000g.is_open()){
	while ( getline (myFile1000g,line1000g)){
	    vector<string> fields=allTokens(line1000g,'\t');
	    //0 chr
	    //1 pos
	    //2 id
	    //3 ref
	    //4 alt

	    //check if same chr
	    if(fields[0] != chrname){
		cerr <<"Error, wrong chromosome in 1000g file for line=  "<<line1000g<<endl;
		return 1;
	    }

	    //skip indels
	    if(fields[3].size() != 1 ||
	       fields[4].size() != 1 )
		continue;

	    char         ref=toupper(fields[3][0]);
	    char         alt=toupper(fields[4][0]);
	    unsigned int pos=destringify<unsigned int>( fields[1] );
	    
	    thousandGenomesHasA[ pos ] = ( (ref=='A') || (alt=='A') );
	    thousandGenomesHasT[ pos ] = ( (ref=='T') || (alt=='T') );
	    
	}
	myFile1000g.close();
    }else{
	cerr <<"Unable to open file "<<vcf1000g<<endl;
	return 1;
    }



    cerr<<"done reading 1000g VCF"<<endl;











    BamReader reader;
    
    if ( !reader.Open(bamfiletopen) ) {
    	cerr << "Could not open input BAM file"<< bamfiletopen << endl;
    	return 1;
    }


    //positioning the bam file
    int refid=reader.GetReferenceID(chrname);
    if(refid < 0){
	cerr << "Cannot retrieve the reference ID for "<< chrname << endl;
	return 1;
    }
    //cout<<"redif "<<refid<<endl;	    

    //setting the BAM reader at that position
    reader.SetRegion(refid,
		     0,
		     refid,
		     -1); 	



    vector<RefData>  testRefData=reader.GetReferenceData();
    const SamHeader header = reader.GetHeader();
    const RefVector references = reader.GetReferenceData();

    BamWriter writerDeam;
    if ( !writerDeam.Open(deambam,      header, references) ) {
	cerr << "Could not open output BAM file" << endl;
	return 1;
    }

    BamWriter writerNoDeam;
    if ( !writerNoDeam.Open(nondeambam, header, references) ) {
	cerr << "Could not open output BAM file" << endl;
	return 1;
    }



    unsigned int totalReads      =0;
    unsigned int deaminatedReads =0;
    unsigned int ndeaminatedReads =0;
    unsigned int skipped      =0;



    //iterating over the alignments for these regions
    BamAlignment al;
    int i;

    while ( reader.GetNextAlignment(al) ) {
	// cerr<<al.Name<<endl;

	//skip unmapped
	if(!al.IsMapped()){
	    skipped++;
	    continue;
	}

	//skip paired end !
	if(al.IsPaired() ){  
	    continue;
	    // cerr<<"Paired end not yet coded"<<endl;
	    // return 1;
	}


	string reconstructedReference = reconstructRef(&al);



	char refeBase;
	char readBase;
	bool isDeaminated;
	if(al.Qualities.size() != reconstructedReference.size()){
	    cerr<<"Quality line is not the same size as the reconstructed reference"<<endl;
	    return 1;
	}

	isDeaminated=false;

	if(al.IsReverseStrand()){

	    //first base next to 3'
	    i = 0 ;
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);

	    if(  readBase  == 'A' && int(al.Qualities[i]-offset) >= minBaseQuality){  //isDeaminated=true; }

		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		if( hasGnoA[al.Position+1] && 
		    !thousandGenomesHasA[al.Position+1] )
		    isDeaminated=true; 


		// transformRef(&refeBase,&readBase);

		// vcfr.repositionIterator(chrname,al.Position+1,al.Position+1);
		// while(vcfr.hasData()){
		//     SimpleVCF * toprint=vcfr.getData();
		//     // cout<<*toprint<<endl;
		//     //skip deletions in the alt
		//     if(toprint->getRef().length() != 1 )
		// 	continue;

		//     if(toprint->getRef()[0] != refeBase){
		// 	cerr<<reconstructedReference<<endl;
		// 	cerr<<al.Position<<endl;			
		// 	cerr<<numberOfDeletions(&al)<<endl;			
		// 	cerr<<"Problem1 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
		// 	exit(1);
		//     }
		    
		//     //if the VCF has a at least one G but no A
		//     if(  toprint->hasAtLeastOneG() && 
		// 	!toprint->hasAtLeastOneA() ){
		// 	isDeaminated=true; 
		//     }
		// }

	    }


	    //second base next to 3'
	    i = 1;
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);

	    //refeBase  == 'G'  &&
	    if( readBase  == 'A' &&  int(al.Qualities[i]-offset) >= minBaseQuality){  //isDeaminated=true; }

		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		if( hasGnoA[al.Position+2] && 
		   !thousandGenomesHasA[al.Position+2] )
		    isDeaminated=true; 


		// transformRef(&refeBase,&readBase);


		// vcfr.repositionIterator(chrname,al.Position+2,al.Position+2);

		// while(vcfr.hasData()){
		//     SimpleVCF * toprint=vcfr.getData();
		//     // cout<<*toprint<<endl;
		//     //skip deletions in the alt
		//     if(toprint->getRef().length() != 1 )
		// 	continue;

		//     if(toprint->getRef()[0] != refeBase){
		// 	cerr<<reconstructedReference<<endl;
		// 	cerr<<al.Position<<endl;
		// 	cerr<<numberOfDeletions(&al)<<endl;
		// 	cerr<<"Problem2 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
		// 	exit(1);
		//     }

		//     //if the VCF has at least one G but no A 
		//     // if(toprint->hasAtLeastOneG() &&
		//     //    toprint->getAlt().find("A") == string::npos){
		//     if(  toprint->hasAtLeastOneG() && 
		// 	!toprint->hasAtLeastOneA() ){
		// 	isDeaminated=true; 
		//     }
		// }
	    }

	    //last  base next to 5'
	    i = (al.QueryBases.length()-1) ;
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);
	    //refeBase  == 'G'  &&
	    if( readBase  == 'A' &&  int(al.Qualities[i]-offset) >= minBaseQuality){  //isDeaminated=true; }

		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		transformRef(&refeBase,&readBase);


		int lengthMatches=countMatchesRecons(reconstructedReference,0);		
		int positionJump = al.Position+lengthMatches+numberOfDeletions(&al);
		if(hasGnoA[positionJump] && 
		   !thousandGenomesHasA[positionJump] )
		    isDeaminated=true; 


		// vcfr.repositionIterator(chrname,positionJump,positionJump);
		// while(vcfr.hasData()){
		//     SimpleVCF * toprint=vcfr.getData();

		//     //skip deletions in the alt
		//     if(toprint->getRef().length() != 1 )
		// 	continue;
		    
		//     if(toprint->getRef()[0] != refeBase){
		// 	cerr<<reconstructedReference<<endl;
		// 	cerr<<al.Position<<endl;
		// 	cerr<<lengthMatches<<endl;
		// 	cerr<<numberOfDeletions(&al)<<endl;
		// 	cerr<<positionJump<<endl;
		// 	cerr<<"Problem3 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
		// 	exit(1);
		//     }


		//     //if the VCF has at least one G but no A
		//     if(  toprint->hasAtLeastOneG() && 
		// 	!toprint->hasAtLeastOneA() ){
		// 	isDeaminated=true; 
		//     }
		// }

	    }

	}else{

		
	    //first base next to 5'
	    i = 0;
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);
	    //refeBase  == 'C' 
	    if( readBase  == 'T' &&  int(al.Qualities[i]-offset) >= minBaseQuality){ 

		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		// transformRef(&refeBase,&readBase);

		if(hasCnoT[al.Position+1]  && 
		   !thousandGenomesHasT[al.Position+1] )
		    isDeaminated=true; 

		// vcfr.repositionIterator(chrname,al.Position+1,al.Position+1);
		// while(vcfr.hasData()){
		//     SimpleVCF * toprint=vcfr.getData();
		//     //cout<<*toprint<<endl;
		//     //skip deletions in the alt
		//     if(toprint->getRef().length() != 1 )
		// 	continue;
		    
		//     if(toprint->getRef()[0] != refeBase){
		// 	cerr<<reconstructedReference<<endl;
		// 	cerr<<al.Position<<endl;
		// 	cerr<<numberOfDeletions(&al)<<endl;			
		// 	cerr<<"Problem4 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
		// 	exit(1);
		//     }

		//     //if the VCF has at least one C but no T
		//     if(  toprint->hasAtLeastOneC() && 
		// 	!toprint->hasAtLeastOneT() ){
		// 	isDeaminated=true; 
		//     }

		// }

		//cout<<al.Position+
		 
	    }

	    //second last base next to 3'
	    i = (al.QueryBases.length()-2);
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);
	    //refeBase  == 'C'  &&
	    if( readBase  == 'T' &&  int(al.Qualities[i]-offset) >= minBaseQuality){  



		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		//transformRef(&refeBase,&readBase);		
		int lengthMatches=countMatchesRecons(reconstructedReference,1);	
		int positionJump = al.Position+lengthMatches+numberOfDeletions(&al);
		if(hasCnoT[positionJump] && 
		   !thousandGenomesHasT[positionJump] )
		    isDeaminated=true; 

		// vcfr.repositionIterator(chrname,positionJump,positionJump);
		// while(vcfr.hasData()){
		//     SimpleVCF * toprint=vcfr.getData();
		//     //skip deletions in the alt
		//     if(toprint->getRef().length() != 1 )
		// 	continue;

		//     if(toprint->getRef()[0] != refeBase){
		// 	cerr<<reconstructedReference<<endl;
		// 	cerr<<al.Position<<endl;
		// 	cerr<<lengthMatches<<endl;
		// 	cerr<<numberOfDeletions(&al)<<endl;
		// 	cerr<<positionJump<<endl;
		// 	cerr<<"Problem5 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
		// 	exit(1);
		//     }

		//     if(  toprint->hasAtLeastOneC() && 
		// 	!toprint->hasAtLeastOneT() ){
		// 	isDeaminated=true; 
		//     }
		// }

		 

	    }

	    //last base next to 3'
	    i = (al.QueryBases.length()-1);
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);
	    //&& refeBase  == 'C' 
	    if( readBase  == 'T'  && int(al.Qualities[i]-offset) >= minBaseQuality){  
		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		transformRef(&refeBase,&readBase);		

		int lengthMatches=countMatchesRecons(reconstructedReference,0);	
		int positionJump = al.Position+lengthMatches+numberOfDeletions(&al);
		if(hasCnoT[positionJump] && 
		   !thousandGenomesHasT[positionJump] )
		    isDeaminated=true; 

		// vcfr.repositionIterator(chrname,positionJump,positionJump);
		// while(vcfr.hasData()){
		//     SimpleVCF * toprint=vcfr.getData();
		//     //skip deletions in the alt
		//     if(toprint->getRef().length() != 1 )
		// 	continue;

		//     if(toprint->getRef()[0] != refeBase){
		// 	cerr<<reconstructedReference<<endl;
		// 	cerr<<al.Position<<endl;
		// 	cerr<<lengthMatches<<endl;
		// 	cerr<<numberOfDeletions(&al)<<endl;
		// 	cerr<<positionJump<<endl;
		// 	cerr<<"Problem6 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
		// 	exit(1);
		//     }

		//     if(  toprint->hasAtLeastOneC() && 
		// 	!toprint->hasAtLeastOneT() ){
		// 	isDeaminated=true; 
		//     }
		// }

	    }	

	   
	    
	}
		  



	totalReads++;

	if(isDeaminated){
	    deaminatedReads++;
	    writerDeam.SaveAlignment(al);		
	}else{
	    ndeaminatedReads++;
	    writerNoDeam.SaveAlignment(al);		
	}


    
    }//end for each read









    reader.Close();
    writerDeam.Close();
    writerNoDeam.Close();
    delete(hasCnoT);
    delete(hasGnoA);

    cerr<<"Program finished sucessfully, out of "<<totalReads<<" mapped reads (skipped: "<<skipped<<" reads) we flagged "<<deaminatedReads<<" as deaminated and "<<ndeaminatedReads<<" as not deaminated"<<endl;

   
    return 0;
}
Beispiel #29
0
int main (int argc, char *argv[]) {

     if( (argc!= 3) ||
    	(argc== 2 && string(argv[1]) == "-h") ||
    	(argc== 2 && string(argv[1]) == "-help") ||
    	(argc== 2 && string(argv[1]) == "--help") ){
	 cerr<<"Usage:splitByRG [in bam] [out prefix]"<<endl<<"this program creates one bam file per RG in the with the outprefix\nFor example splitByRG in.bam out will create\nout.rg1.bam\nout.rg2.bam\n"<<endl;
    	return 1;
    }


     string bamfiletopen = string(argv[1]);
     // if(!strEndsWith(bamfiletopen,".bam")){

     // }
     string bamDirOutPrefix    = string(argv[2]);
     map<string,BamWriter *> rg2BamWriter;
     
     // if(!isDirectory(bamDirOut)){
     // 	 cerr<<"ERROR: the out directory does not exist"<<endl;
     // 	return 1;
     // }

     BamReader reader;

     if ( !reader.Open(bamfiletopen) ) {
    	cerr << "Could not open input BAM files." << endl;
    	return 1;
     }

    SamHeader header = reader.GetHeader();
    const RefVector references = reader.GetReferenceData();
    vector<RefData>  refData=reader.GetReferenceData();
    string pID          = "splitByRG";   
    string pName        = "splitByRG";   
    string pCommandLine = "";
    for(int i=0;i<(argc);i++){
        pCommandLine += (string(argv[i])+" ");
    }
    putProgramInHeader(&header,pID,pName,pCommandLine,returnGitHubVersion(string(argv[0]),".."));


    SamReadGroupDictionary 	srgd=header.ReadGroups;
    for(SamReadGroupConstIterator srgci=srgd.ConstBegin();
	srgci<srgd.ConstEnd();
	srgci++){
	//cout<<*srgci<<endl;
	const SamReadGroup rg = (*srgci);
	//cout<<rg.ID<<endl;
	rg2BamWriter[rg.ID] = new  BamWriter();
	rg2BamWriter[rg.ID]->Open(bamDirOutPrefix+"."+rg.ID+".bam",header,references); 
    }



    BamAlignment al;
    unsigned int total=0;
    while ( reader.GetNextAlignment(al) ) {

	// al.SetIsFailedQC(false);
	// writer.SaveAlignment(al);
	// if(al.IsMapped () ){
	//     if(rg2BamWriter.find(refData[al.RefID].RefName) == rg2BamWriter.end()){ //new
	// 	rg2BamWriter[refData[al.RefID].RefName] = new  BamWriter();
	// 	if ( !rg2BamWriter[refData[al.RefID].RefName]->Open(bamDirOutPrefix+"."+refData[al.RefID].RefName+".bam",header,references) ) {
	// 	    cerr     << "Could not open output BAM file "<< bamDirOutPrefix<<"."<<refData[al.RefID].RefName<<".bam" << endl;
	// 	    return 1;
	// 	}
	
	//     }else{
	// 	rg2BamWriter[refData[al.RefID].RefName]->SaveAlignment(al);
	//     }
	// }else{
	//     unmapped.SaveAlignment(al);
	// }
	if(al.HasTag("RG")){
	    string rgTag;
	    al.GetTag("RG",rgTag);
	    //cout<<rgTag<<endl;
	    if(rg2BamWriter.find(rgTag) == rg2BamWriter.end()){ //new
		cerr<<"Found new RG "<<rgTag<<endl;
		rg2BamWriter[rgTag] = new  BamWriter();
	 	if ( !rg2BamWriter[rgTag]->Open(bamDirOutPrefix+"."+rgTag+".bam",header,references) ) {
	 	    cerr     << "Could not open output BAM file "<< bamDirOutPrefix<<"."<<rgTag<<".bam" << endl;
	 	    return 1;
	 	}
		rg2BamWriter[rgTag]->SaveAlignment(al);	    	   
	    }else{
		rg2BamWriter[rgTag]->SaveAlignment(al);	    	   
	    }
	}else{
	    string rgTag="unknown";	    
	    //cout<<rgTag<<endl;
	    if(rg2BamWriter.find(rgTag) == rg2BamWriter.end()){ //new
		cerr<<"Found new RG "<<rgTag<<endl;
		rg2BamWriter[rgTag] = new  BamWriter();
	 	if ( !rg2BamWriter[rgTag]->Open(bamDirOutPrefix+"."+rgTag+".bam",header,references) ) {
	 	    cerr     << "Could not open output BAM file "<< bamDirOutPrefix<<"."<<rgTag<<".bam" << endl;
	 	    return 1;
	 	}
		rg2BamWriter[rgTag]->SaveAlignment(al);	    	   
	    }else{
		rg2BamWriter[rgTag]->SaveAlignment(al);	    	   
	    }

	    // cerr << "Cannot get RG tag for " << al.Name<<endl;
	    // return 1;
	}

	total++;
    } //while al

    reader.Close();
    // writer.Close();
    
    // unmapped.Close();

    map<string,BamWriter *>::iterator rg2BamWriterIt;
    for (rg2BamWriterIt =rg2BamWriter.begin(); 
	 rg2BamWriterIt!=rg2BamWriter.end(); 
	 rg2BamWriterIt++){
	rg2BamWriterIt->second->Close();
    }
    cerr<<"Wrote succesfully "<<total<<" reads"<<endl;


    return 0;
}
Beispiel #30
0
// ValidateReaders checks that all the readers point to BAM files representing
// alignments against the same set of reference sequences, and that the
// sequences are identically ordered.
//
// Every reader is compared against the first one on sort order, reference
// count, and each reference's name and length. Null reader entries after the
// first are skipped.
//
// Returns true when zero or one reader is open or when every reader matches
// the first. On the first mismatch (or a null first reader) an explanatory
// message is stored through SetErrorString and false is returned; deciding
// what to do about the failure is left to the caller.
bool BamMultiReaderPrivate::ValidateReaders() const
{

    m_errorString.clear();

    // skip if 0 or 1 readers opened
    if (m_readers.empty() || (m_readers.size() == 1)) return true;

    // retrieve first reader
    const MergeItem& firstItem = m_readers.front();
    const BamReader* firstReader = firstItem.Reader;
    if (firstReader == 0) {
        SetErrorString("BamMultiReader::ValidateReaders", "first reader is null");
        return false;
    }

    // retrieve first reader's header data
    const SamHeader& firstReaderHeader = firstReader->GetHeader();
    const std::string& firstReaderSortOrder = firstReaderHeader.SortOrder;

    // retrieve first reader's reference data
    const RefVector& firstReaderRefData = firstReader->GetReferenceData();
    const int firstReaderRefCount = firstReader->GetReferenceCount();
    const int firstReaderRefSize = firstReaderRefData.size();

    // iterate over all readers
    std::vector<MergeItem>::const_iterator readerIter = m_readers.begin();
    std::vector<MergeItem>::const_iterator readerEnd = m_readers.end();
    for (; readerIter != readerEnd; ++readerIter) {
        const MergeItem& item = (*readerIter);
        BamReader* reader = item.Reader;
        if (reader == 0) continue;

        // get current reader's header data
        const SamHeader& currentReaderHeader = reader->GetHeader();
        const std::string& currentReaderSortOrder = currentReaderHeader.SortOrder;

        // check compatible sort order
        if (currentReaderSortOrder != firstReaderSortOrder) {
            const std::string message =
                std::string("mismatched sort order in ") + reader->GetFilename() + ", expected " +
                firstReaderSortOrder + ", but found " + currentReaderSortOrder;
            SetErrorString("BamMultiReader::ValidateReaders", message);
            return false;
        }

        // get current reader's reference data (bound by reference: only read here)
        const RefVector& currentReaderRefData = reader->GetReferenceData();
        const int currentReaderRefCount = reader->GetReferenceCount();
        const int currentReaderRefSize = currentReaderRefData.size();

        // compare reference counts from BamReader ( & container size, in case of BR error)
        if ((currentReaderRefCount != firstReaderRefCount) ||
            (firstReaderRefSize != currentReaderRefSize)) {
            std::stringstream s;
            s << "mismatched reference count in " << reader->GetFilename() << ", expected "
              << firstReaderRefCount << ", but found " << currentReaderRefCount;
            SetErrorString("BamMultiReader::ValidateReaders", s.str());
            return false;
        }

        // init reference data iterators
        RefVector::const_iterator firstRefIter = firstReaderRefData.begin();
        RefVector::const_iterator firstRefEnd = firstReaderRefData.end();
        RefVector::const_iterator currentRefIter = currentReaderRefData.begin();

        // this will be ok; we just checked above that we have identically-sized sets of references
        // here we simply check if they are all, in fact, equal in content
        while (firstRefIter != firstRefEnd) {
            const RefData& firstRef = (*firstRefIter);
            const RefData& currentRef = (*currentRefIter);

            // compare reference name & length
            if ((firstRef.RefName != currentRef.RefName) ||
                (firstRef.RefLength != currentRef.RefLength)) {
                std::stringstream s;
                s << "mismatched references found in " << reader->GetFilename()
                  << ", expected: " << std::endl;

                // print first reader's reference data into the same message
                // stream, in the same "name length" format as the list below
                RefVector::const_iterator refIter = firstReaderRefData.begin();
                RefVector::const_iterator refEnd = firstReaderRefData.end();
                for (; refIter != refEnd; ++refIter) {
                    const RefData& entry = (*refIter);
                    s << entry.RefName << ' ' << entry.RefLength << std::endl;
                }

                s << "but found: " << std::endl;

                // print current reader's reference data
                refIter = currentReaderRefData.begin();
                refEnd = currentReaderRefData.end();
                for (; refIter != refEnd; ++refIter) {
                    const RefData& entry = (*refIter);
                    s << entry.RefName << ' ' << entry.RefLength << std::endl;
                }

                SetErrorString("BamMultiReader::ValidateReaders", s.str());
                return false;
            }

            // update iterators
            ++firstRefIter;
            ++currentRefIter;
        }
    }

    // if we get here, everything checks out
    return true;
}