//{{{ SV_Pair:: SV_Pair(const BamAlignment &bam_a, // if both reads are on the same chrome, then read_l must map before read_r // if the reads are on different strands then read_l must be on the lexo // lesser chrom (using the string.compare() method) SV_Pair:: SV_Pair(const BamAlignment &bam_a, const BamAlignment &bam_b, const RefVector &refs, int _weight, int _ev_id, SV_PairReader *_reader) { reader = _reader; if ( bam_a.MapQuality < bam_b.MapQuality ) min_mapping_quality = bam_a.MapQuality; else min_mapping_quality = bam_b.MapQuality; struct interval tmp_a, tmp_b; tmp_a.start = bam_a.Position; tmp_a.end = bam_a.GetEndPosition(false, false) - 1; tmp_a.chr = refs.at(bam_a.RefID).RefName; if ( bam_a.IsReverseStrand() == true ) tmp_a.strand = '-'; else tmp_a.strand = '+'; tmp_b.start = bam_b.Position; tmp_b.end = bam_b.GetEndPosition(false, false) - 1; tmp_b.chr = refs.at(bam_b.RefID).RefName; if ( bam_b.IsReverseStrand() == true ) tmp_b.strand = '-'; else tmp_b.strand = '+'; //if ( tmp_a.chr.compare(tmp_b.chr) > 0 ) { if ( bam_a.RefID < bam_b.RefID ) { read_l = tmp_a; read_r = tmp_b; //} else if ( tmp_a.chr.compare(tmp_b.chr) < 0 ) { } else if ( bam_a.RefID > bam_b.RefID) { read_l = tmp_b; read_r = tmp_a; } else { // == if (tmp_a.start > tmp_b.start) { read_l = tmp_b; read_r = tmp_a; } else { read_l = tmp_a; read_r = tmp_b; } } weight = _weight; ev_id = _ev_id; }
// Computes coverage of the "B" BED features by the alignments in bamFile.
//
// Each mapped alignment is either treated as a single interval
// (Position .. GetEndPosition(false, false)) or, when _obeySplits is set,
// broken into blocks with GetBamBlocks and counted via countSplitHits.
// Unmapped alignments are ignored. After the whole file is read, the result
// is reported with ReportCounts() (when _countsOnly) or ReportCoverage().
//
// Exits with status 1 if the BAM file cannot be opened, instead of silently
// reporting zero coverage for every feature.
void BedCoverage::CollectCoverageBam(string bamFile) {

    // load the "B" bed file into a map so
    // that we can easily compare "A" to it for overlaps
    _bedB->loadBedCovFileIntoMap();

    // open the BAM file
    BamReader reader;
    if (!reader.Open(bamFile)) {
        cerr << "Failed to open BAM file " << bamFile << endl;
        exit(1);
    }

    // get reference information (needed to map RefID -> chromosome name)
    RefVector refs = reader.GetReferenceData();

    // convert each aligned BAM entry to BED
    // and compute coverage on B
    BamAlignment bam;
    while (reader.GetNextAlignment(bam)) {
        if (bam.IsMapped()) {
            // treat the BAM alignment as a single "block"
            if (_obeySplits == false) {
                // construct a new BED entry from the current BAM alignment.
                BED a;
                a.chrom  = refs.at(bam.RefID).RefName;
                a.start  = bam.Position;
                a.end    = bam.GetEndPosition(false, false);
                a.strand = "+";
                if (bam.IsReverseStrand()) a.strand = "-";

                _bedB->countHits(a, _sameStrand, _diffStrand, _countsOnly);
            }
            // split the BAM alignment into discrete blocks and
            // look for overlaps only within each block.
            else {
                // vec to store the discrete BED "blocks"
                bedVector bedBlocks;
                // since we are counting coverage, we do want to split blocks when a
                // deletion (D) CIGAR op is encountered (hence the true for the last parm)
                GetBamBlocks(bam, refs.at(bam.RefID).RefName, bedBlocks, false, true);
                // use countSplitHits to avoid over-counting each split chunk
                // as distinct read coverage.
                _bedB->countSplitHits(bedBlocks, _sameStrand, _diffStrand, _countsOnly);
            }
        }
    }

    // report the coverage (summary or histogram) for BED B.
    if (_countsOnly == true)
        ReportCounts();
    else
        ReportCoverage();

    // close the BAM file
    reader.Close();
}
//{{{ void process_pair(const BamAlignment &curr, void SV_Pair:: process_pair(const BamAlignment &curr, const RefVector refs, map<string, BamAlignment> &mapped_pairs, UCSCBins<SV_BreakPoint*> &r_bin, int weight, int ev_id, SV_PairReader *reader) { if (mapped_pairs.find(curr.Name) == mapped_pairs.end()) mapped_pairs[curr.Name] = curr; else { SV_Pair *new_pair = new SV_Pair(mapped_pairs[curr.Name], curr, refs, weight, ev_id, reader); //cerr << count_clipped(curr.CigarData) << "\t" << //count_clipped(mapped_pairs[curr.Name].CigarData) << endl; if ( new_pair->is_sane() && new_pair->is_aberrant() && (count_clipped(curr.CigarData) > 0) && (count_clipped(mapped_pairs[curr.Name].CigarData) > 0) ) { SV_BreakPoint *new_bp = new_pair->get_bp(); #ifdef TRACE cerr << "READ\t" << refs.at(mapped_pairs[curr.Name].RefID).RefName << "," << mapped_pairs[curr.Name].Position << "," << (mapped_pairs[curr.Name].GetEndPosition(false, false) - 1) << "\t" << refs.at(curr.RefID).RefName << "," << curr.Position << "," << (curr.GetEndPosition(false, false) - 1) << endl; cerr << "\tPE\t" << *new_bp << endl; #endif new_bp->cluster(r_bin); } else { delete(new_pair); } mapped_pairs.erase(curr.Name); } }
void getBamBlocks(const BamAlignment &bam, const RefVector &refs, vector<BED> &blocks, bool breakOnDeletionOps) { CHRPOS currPosition = bam.Position; CHRPOS blockStart = bam.Position; string chrom = refs.at(bam.RefID).RefName; string name = bam.Name; string strand = "+"; string score = ToString(bam.MapQuality); char prevOp = '\0'; if (bam.IsReverseStrand()) strand = "-"; bool blocksFound = false; vector<CigarOp>::const_iterator cigItr = bam.CigarData.begin(); vector<CigarOp>::const_iterator cigEnd = bam.CigarData.end(); for ( ; cigItr != cigEnd; ++cigItr ) { if (cigItr->Type == 'M') { currPosition += cigItr->Length; // we only want to create a new block if the current M op // was preceded by an N op or a D op (and we are breaking on D ops) if ((prevOp == 'D' && breakOnDeletionOps == true) || (prevOp == 'N')) { blocks.push_back( BED(chrom, blockStart, currPosition, name, score, strand) ); blockStart = currPosition; } } else if (cigItr->Type == 'D') { if (breakOnDeletionOps == false) currPosition += cigItr->Length; else { blocksFound = true; currPosition += cigItr->Length; blockStart = currPosition; } } else if (cigItr->Type == 'N') { blocks.push_back( BED(chrom, blockStart, currPosition, name, score, strand) ); blocksFound = true; currPosition += cigItr->Length; blockStart = currPosition; } else if (cigItr->Type == 'S' || cigItr->Type == 'H' || cigItr->Type == 'P' || cigItr->Type == 'I') { // do nothing } else { cerr << "Input error: invalid CIGAR type (" << cigItr->Type << ") for: " << bam.Name << endl; exit(1); } prevOp = cigItr->Type; } // if there were no splits, we just create a block representing the contiguous alignment. if (blocksFound == false) { blocks.push_back( BED(chrom, bam.Position, currPosition, name, score, strand) ); } }
void getBamBlocks(const BamAlignment &bam, const RefVector &refs, BedVec &blocks, bool breakOnDeletionOps) { CHRPOS currPosition = bam.Position; CHRPOS blockStart = bam.Position; string chrom = refs.at(bam.RefID).RefName; string name = bam.Name; string strand = "+"; float score = bam.MapQuality; if (bam.IsReverseStrand()) strand = "-"; vector<CigarOp>::const_iterator cigItr = bam.CigarData.begin(); vector<CigarOp>::const_iterator cigEnd = bam.CigarData.end(); for ( ; cigItr != cigEnd; ++cigItr ) { if (cigItr->Type == 'M') { currPosition += cigItr->Length; blocks.push_back( Bed(chrom, blockStart, currPosition, name, score, strand) ); blockStart = currPosition; } else if (cigItr->Type == 'D') { if (breakOnDeletionOps == false) currPosition += cigItr->Length; else { currPosition += cigItr->Length; blockStart = currPosition; } } else if (cigItr->Type == 'N') { currPosition += cigItr->Length; blockStart = currPosition; } else if (cigItr->Type == 'S' || cigItr->Type == 'H' || cigItr->Type == 'P' || cigItr->Type == 'I') { // do nothing } else { cerr << "Input error: invalid CIGAR type (" << cigItr->Type << ") for: " << bam.Name << endl; exit(1); } } }
bool RandomTool::RandomToolPrivate::Run(void) { // set to default stdin if no input files provided if ( !m_settings->HasInput && !m_settings->HasInputFilelist ) m_settings->InputFiles.push_back(Options::StandardIn()); // add files in the filelist to the input file list if ( m_settings->HasInputFilelist ) { ifstream filelist(m_settings->InputFilelist.c_str(), ios::in); if ( !filelist.is_open() ) { cerr << "bamtools random ERROR: could not open input BAM file list... Aborting." << endl; return false; } string line; while ( getline(filelist, line) ) m_settings->InputFiles.push_back(line); } // open our reader BamMultiReader reader; if ( !reader.Open(m_settings->InputFiles) ) { cerr << "bamtools random ERROR: could not open input BAM file(s)... Aborting." << endl; return false; } // look up index files for all BAM files reader.LocateIndexes(); // make sure index data is available if ( !reader.HasIndexes() ) { cerr << "bamtools random ERROR: could not load index data for all input BAM file(s)... Aborting." << endl; reader.Close(); return false; } // get BamReader metadata const string headerText = reader.GetHeaderText(); const RefVector references = reader.GetReferenceData(); if ( references.empty() ) { cerr << "bamtools random ERROR: no reference data available... Aborting." << endl; reader.Close(); return false; } // determine compression mode for BamWriter bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() && !m_settings->IsForceCompression ); BamWriter::CompressionMode compressionMode = BamWriter::Compressed; if ( writeUncompressed ) compressionMode = BamWriter::Uncompressed; // open BamWriter BamWriter writer; writer.SetCompressionMode(compressionMode); if ( !writer.Open(m_settings->OutputFilename, headerText, references) ) { cerr << "bamtools random ERROR: could not open " << m_settings->OutputFilename << " for writing... Aborting." 
<< endl; reader.Close(); return false; } // if user specified a REGION constraint, attempt to parse REGION string BamRegion region; if ( m_settings->HasRegion && !Utilities::ParseRegionString(m_settings->Region, reader, region) ) { cerr << "bamtools random ERROR: could not parse REGION: " << m_settings->Region << endl; cerr << "Check that REGION is in valid format (see documentation) and that the coordinates are valid" << endl; reader.Close(); writer.Close(); return false; } // seed our random number generator srand( time(NULL) ); // grab random alignments BamAlignment al; unsigned int i = 0; while ( i < m_settings->AlignmentCount ) { int randomRefId = 0; int randomPosition = 0; // use REGION constraints to select random refId & position if ( m_settings->HasRegion ) { // select a random refId randomRefId = getRandomInt(region.LeftRefID, region.RightRefID); // select a random position based on randomRefId const int lowerBoundPosition = ( (randomRefId == region.LeftRefID) ? region.LeftPosition : 0 ); const int upperBoundPosition = ( (randomRefId == region.RightRefID) ? region.RightPosition : (references.at(randomRefId).RefLength - 1) ); randomPosition = getRandomInt(lowerBoundPosition, upperBoundPosition); } // otherwise select from all possible random refId & position else { // select random refId randomRefId = getRandomInt(0, (int)references.size() - 1); // select random position based on randomRefId const int lowerBoundPosition = 0; const int upperBoundPosition = references.at(randomRefId).RefLength - 1; randomPosition = getRandomInt(lowerBoundPosition, upperBoundPosition); } // if jump & read successful, save first alignment that overlaps random refId & position if ( reader.Jump(randomRefId, randomPosition) ) { while ( reader.GetNextAlignmentCore(al) ) { if ( al.RefID == randomRefId && al.Position >= randomPosition ) { writer.SaveAlignment(al); ++i; break; } } } } // cleanup & exit reader.Close(); writer.Close(); return true; }
//{{{ SV_SplitRead:: SV_SplitRead(vector< BamAlignment > &block, SV_SplitRead:: SV_SplitRead(const BamAlignment &bam_a, const BamAlignment &bam_b, const RefVector &refs, int _weight, int _id, int _sample_id, SV_SplitReadReader *_reader) { reader = _reader; sample_id = _sample_id; if ( bam_a.MapQuality < bam_b.MapQuality ) min_mapping_quality = bam_a.MapQuality; else min_mapping_quality = bam_b.MapQuality; struct cigar_query query_a = calc_query_pos_from_cigar(bam_a.CigarData, bam_a.IsReverseStrand() ); struct cigar_query query_b = calc_query_pos_from_cigar(bam_b.CigarData, bam_b.IsReverseStrand() ); struct interval tmp_a, tmp_b; tmp_a.strand = '+'; if (bam_a.IsReverseStrand()) tmp_a.strand = '-'; tmp_a.chr = refs.at(bam_a.RefID).RefName; tmp_a.start = bam_a.Position; tmp_a.end = bam_a.GetEndPosition(); tmp_b.strand = '+'; if (bam_b.IsReverseStrand()) tmp_b.strand = '-'; tmp_b.chr = refs.at(bam_b.RefID).RefName; tmp_b.start = bam_b.Position; tmp_b.end = bam_b.GetEndPosition(); //if ( ( tmp_a.chr.compare(tmp_b.chr) > 0 ) || //( ( tmp_a.chr.compare(tmp_b.chr) == 0 ) && //( tmp_a.start > tmp_b.start ) ) ) { if ( (bam_a.RefID > bam_b.RefID) || ( (bam_a.RefID == bam_b.RefID) && (tmp_a.start > tmp_b.start ) ) ) { side_r = tmp_a; side_l = tmp_b; query_r = query_a; query_l = query_b; } else { side_l = tmp_a; side_r = tmp_b; query_l = query_a; query_r = query_b; } if (side_l.strand != side_r.strand) type = SV_BreakPoint::INVERSION; else if ( ( ( side_l.strand == '+' ) && ( side_r.strand == '+' ) && ( query_l.qs_pos < query_r.qs_pos ) ) || ( ( side_l.strand == '-' ) && ( side_r.strand == '-' ) && ( query_l.qs_pos > query_r.qs_pos) ) ) type = SV_BreakPoint::DELETION; else if ( ( ( side_l.strand == '+' ) && ( side_r.strand == '+' ) && ( query_l.qs_pos > query_r.qs_pos ) ) || ( ( side_l.strand == '-' ) && ( side_r.strand == '-' ) && ( query_l.qs_pos < query_r.qs_pos) ) ) type = SV_BreakPoint::DUPLICATION; else { cerr << "ERROR IN BAM FILE. 
" << "TYPE not detected (DELETION,DUPLICATION,INVERSION)" << endl; cerr << "\t" << query_l.qs_pos << "," << side_l.strand << "\t" << query_r.qs_pos << "," << side_r.strand << "\t" << tmp_a.chr << "," << tmp_a.start << "," << tmp_a.end << "\t" << tmp_b.chr << "," << tmp_b.start << "," << tmp_b.end << "\t" << endl; throw(1); } weight = _weight; id = _id; }
// Same as ParseRegionString() above, but accepts a BamMultiReader bool ParseRegionString(const string& regionString, const BamReader& reader, BamRegion& region) { // ------------------------------- // parse region string // check first for empty string if ( regionString.empty() ) return false; //cerr << "ParseRegionString Input: " << regionString << endl; // non-empty string, look for a colom size_t foundFirstColon = regionString.find(':'); // store chrom strings, and numeric positions string startChrom; string stopChrom; int startPos; int stopPos; // no colon found // going to use entire contents of requested chromosome // just store entire region string as startChrom name // use BamReader methods to check if its valid for current BAM file if ( foundFirstColon == string::npos ) { startChrom = regionString; startPos = 0; stopChrom = regionString; stopPos = -1; } // colon found, so we at least have some sort of startPos requested else { // store start chrom from beginning to first colon startChrom = regionString.substr(0,foundFirstColon); // look for ".." after the colon size_t foundRangeDots = regionString.find("..", foundFirstColon+1); // no dots found // so we have a startPos but no range // store contents before colon as startChrom, after as startPos if ( foundRangeDots == string::npos ) { startPos = atoi( regionString.substr(foundFirstColon+1).c_str() ); stopChrom = startChrom; stopPos = -1; } // ".." found, so we have some sort of range selected else { // store startPos between first colon and range dots ".." 
startPos = atoi( regionString.substr(foundFirstColon+1, foundRangeDots-foundFirstColon-1).c_str() ); // look for second colon size_t foundSecondColon = regionString.find(':', foundRangeDots+1); // no second colon found // so we have a "standard" chrom:start..stop input format (on single chrom) if ( foundSecondColon == string::npos ) { stopChrom = startChrom; stopPos = atoi( regionString.substr(foundRangeDots+2).c_str() ); } // second colon found // so we have a range requested across 2 chrom's else { stopChrom = regionString.substr(foundRangeDots+2, foundSecondColon-(foundRangeDots+2)); stopPos = atoi( regionString.substr(foundSecondColon+1).c_str() ); } } } // ------------------------------- // validate reference IDs & genomic positions const RefVector references = reader.GetReferenceData(); // if startRefID not found, return false int startRefID = reader.GetReferenceID(startChrom); if ( startRefID == -1 ) return false; // startPos cannot be greater than or equal to reference length const RefData& startReference = references.at(startRefID); if ( startPos >= startReference.RefLength ) return false; // if stopRefID not found, return false int stopRefID = reader.GetReferenceID(stopChrom); if ( stopRefID == -1 ) return false; // stopPosition cannot be larger than reference length const RefData& stopReference = references.at(stopRefID); if ( stopPos > stopReference.RefLength ) return false; // if no stopPosition specified, set to reference end if ( stopPos == -1 ) stopPos = stopReference.RefLength; // ------------------------------- // set up Region struct & return region.LeftRefID = startRefID; region.LeftPosition = startPos; region.RightRefID = stopRefID;; region.RightPosition = stopPos; //cerr << "ParseRegionString " << region.LeftRefID << " " << region.LeftPosition << " " << region.RightPosition << endl; return true; }
// Intersects the alignments of a BAM file ("A") with the BED file "B".
//
// B is loaded into a map. Every mapped alignment is converted to a BED entry
// (plus its blocks) and tested for overlaps:
//   - _obeySplits:                 allHits on the full span, then
//                                  FindBlockedOverlaps on the blocks
//   - BAM output, no splits:       anyHits on the full span
//   - BED output, no splits:       FindOverlaps
// With BAM output the alignment is written when it overlaps (or, with
// _noHit, when it does not). Unmapped alignments are written only with
// _noHit and BAM output.
//
// Exits with status 1 if the BAM file cannot be opened. The BAM writer is
// only opened for BAM output, so nothing is ever written to it otherwise.
void BedIntersect::IntersectBam(string bamFile) {

    // load the "B" bed file into a map so
    // that we can easily compare "A" to it for overlaps
    _bedB = new BedFile(_bedBFile);
    _bedB->loadBedFileIntoMap();

    // create a dummy BED A file for printing purposes if not
    // using BAM output.
    if (_bamOutput == false) {
        _bedA = new BedFile(_bedAFile);
        _bedA->bedType = 12;
    }

    // open the BAM file
    BamReader reader;
    BamWriter writer;
    if (!reader.Open(bamFile)) {
        cerr << "Failed to open BAM file " << bamFile << endl;
        exit(1);
    }

    // get header & reference information
    string bamHeader = reader.GetHeaderText();
    RefVector refs   = reader.GetReferenceData();

    // open a BAM output to stdout if we are writing BAM
    if (_bamOutput == true) {
        // set compression mode
        BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
        if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed;
        writer.SetCompressionMode(compressionMode);
        // open our BAM writer
        writer.Open("stdout", bamHeader, refs);
    }

    vector<BED> hits;
    // reserve some space
    hits.reserve(100);

    BamAlignment bam;
    // get each set of alignments for each pair.
    while (reader.GetNextAlignment(bam)) {

        // save an unaligned read if -v; the writer only exists for BAM output
        if (!bam.IsMapped()) {
            if ((_bamOutput == true) && (_noHit == true))
                writer.SaveAlignment(bam);
            continue;
        }

        // break alignment into discrete blocks,
        bedVector bed_blocks;
        string chrom = refs.at(bam.RefID).RefName;
        GetBamBlocks(bam, chrom, bed_blocks, false, true);

        // create a basic BED entry from the BAM alignment
        BED bed;
        MakeBedFromBam(bam, chrom, bed_blocks, bed);

        bool overlapsFound = false;
        if (_obeySplits == true) {
            // find the hits that overlap with the full span of the blocked BED
            _bedB->allHits(bed.chrom, bed.start, bed.end, bed.strand,
                           hits, _sameStrand, _diffStrand,
                           _overlapFraction, _reciprocal);
            // find the overlaps between the block in A and B
            overlapsFound = FindBlockedOverlaps(bed, bed_blocks, hits, _bamOutput);
        }
        else if (_bamOutput == true) {
            overlapsFound = _bedB->anyHits(bed.chrom, bed.start, bed.end, bed.strand,
                                           _sameStrand, _diffStrand,
                                           _overlapFraction, _reciprocal);
        }
        else {
            FindOverlaps(bed, hits);
        }

        // save the BAM alignment if overlap reqs. were met
        if (_bamOutput == true) {
            if ((overlapsFound == true) && (_noHit == false))
                writer.SaveAlignment(bam);
            else if ((overlapsFound == false) && (_noHit == true))
                writer.SaveAlignment(bam);
        }
        hits.clear();
    }

    // close the relevant BAM files.
    reader.Close();
    if (_bamOutput == true) {
        writer.Close();
    }
}
void TagBam::Tag() { // open the annotations files for processing; OpenAnnoFiles(); // open the BAM file BamReader reader; BamWriter writer; if (!reader.Open(_bamFile)) { cerr << "Failed to open BAM file " << _bamFile << endl; exit(1); } // get header & reference information string bamHeader = reader.GetHeaderText(); RefVector refs = reader.GetReferenceData(); // set compression mode BamWriter::CompressionMode compressionMode = BamWriter::Compressed; // if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed; writer.SetCompressionMode(compressionMode); // open our BAM writer writer.Open("stdout", bamHeader, refs); // rip through the BAM file and test for overlaps with each annotation file. BamAlignment al; vector<BED> hits; while (reader.GetNextAlignment(al)) { if (al.IsMapped() == true) { BED a; a.chrom = refs.at(al.RefID).RefName; a.start = al.Position; a.end = al.GetEndPosition(false, false); a.strand = "+"; if (al.IsReverseStrand()) a.strand = "-"; ostringstream annotations; // annotate the BAM file based on overlaps with the annotation files. for (size_t i = 0; i < _annoFiles.size(); ++i) { // grab the current annotation file. 
BedFile *anno = _annoFiles[i]; if (!_useNames && !_useScores && !_useIntervals) { // add the label for this annotation file to tag if there is overlap if (anno->anyHits(a.chrom, a.start, a.end, a.strand, _sameStrand, _diffStrand, _overlapFraction, false)) { annotations << _annoLabels[i] << ";"; } } // use the score field else if (!_useNames && _useScores && !_useIntervals) { anno->allHits(a.chrom, a.start, a.end, a.strand, hits, _sameStrand, _diffStrand, 0.0, false); for (size_t i = 0; i < hits.size(); ++i) { annotations << hits[i].score; if (i < hits.size() - 1) annotations << ","; } if (hits.size() > 0) annotations << ";"; hits.clear(); } // use the name field from the annotation files to populate tag else if (_useNames && !_useScores && !_useIntervals) { anno->allHits(a.chrom, a.start, a.end, a.strand, hits, _sameStrand, _diffStrand, 0.0, false); for (size_t j = 0; j < hits.size(); ++j) { annotations << hits[j].name; if (j < hits.size() - 1) annotations << ","; } if (hits.size() > 0) annotations << ";"; hits.clear(); } // use the full interval information annotation files to populate tag else if (!_useNames && !_useScores && _useIntervals) { anno->allHits(a.chrom, a.start, a.end, a.strand, hits, _sameStrand, _diffStrand, 0.0, false); for (size_t j = 0; j < hits.size(); ++j) { annotations << _annoLabels[i] << ":" << hits[j].chrom << ":" << hits[j].start << "-" << hits[j].end << "," << hits[j].name << "," << hits[j].score << "," << hits[j].strand; if (j < hits.size() - 1) annotations << ","; } if (hits.size() > 0) annotations << ";"; hits.clear(); } } // were there any overlaps with which to make a tag? if (annotations.str().size() > 0) { al.AddTag(_tag, "Z", annotations.str().substr(0, annotations.str().size() - 1)); // get rid of the last ";" } } writer.SaveAlignment(al); } reader.Close(); writer.Close(); // close the annotations files; CloseAnnoFiles(); }
// this has been copied from bamtools utilities, since it isn't in the API. Original file is bamtools_utilities.cpp. // Like the rest of Bamtools, it is under the BSD license. bool Filter::ParseRegionString(const string& regionString, BamRegion& region) { // ------------------------------- // parse region string // check first for empty string if ( regionString.empty() ) return false; // non-empty string, look for a colom size_t foundFirstColon = regionString.find(':'); // store chrom strings, and numeric positions string chrom; int startPos; int stopPos; // no colon found // going to use entire contents of requested chromosome // just store entire region string as startChrom name // use BamReader methods to check if its valid for current BAM file if ( foundFirstColon == string::npos ) { chrom = regionString; startPos = 0; stopPos = -1; } // colon found, so we at least have some sort of startPos requested else { // store start chrom from beginning to first colon chrom = regionString.substr(0,foundFirstColon); // look for ".." after the colon size_t foundRangeDots = regionString.find("..", foundFirstColon+1); // no dots found // so we have a startPos but no range // store contents before colon as startChrom, after as startPos if ( foundRangeDots == string::npos ) { startPos = atoi( regionString.substr(foundFirstColon+1).c_str() ); stopPos = -1; } // ".." found, so we have some sort of range selected else { // store startPos between first colon and range dots ".." 
startPos = atoi( regionString.substr(foundFirstColon+1, foundRangeDots-foundFirstColon-1).c_str() ); // look for second colon size_t foundSecondColon = regionString.find(':', foundRangeDots+1); // no second colon found // so we have a "standard" chrom:start..stop input format (on single chrom) if ( foundSecondColon == string::npos ) { stopPos = atoi( regionString.substr(foundRangeDots+2).c_str() ); } else { return false; } } } // ------------------------------- // validate reference IDs & genomic positions const RefVector references = getReferences(); int RefID = -1; for(int i = 0; i < references.size(); i++) { if(references[i].RefName == chrom) RefID = i; } // if startRefID not found, return false if ( RefID == -1 ) { cerr << "Can't find chromosome'" << chrom << "'" << endl; return false; } // startPos cannot be greater than or equal to reference length const RefData& startReference = references.at(RefID); if ( startPos >= startReference.RefLength ) { cerr << "Start position (" << startPos << ") after end of the reference sequence (" << startReference.RefLength << ")" << endl; return false; } // stopPosition cannot be larger than reference length const RefData& stopReference = references.at(RefID); if ( stopPos > stopReference.RefLength ) { cerr << "Start position (" << stopPos << ") after end of the reference sequence (" << stopReference.RefLength << ")" << endl; return false; } // if no stopPosition specified, set to reference end if ( stopPos == -1 ) stopPos = stopReference.RefLength; // ------------------------------- // set up Region struct & return region.LeftRefID = RefID; region.LeftPosition = startPos; region.RightRefID = RefID;; region.RightPosition = stopPos; return true; }
bool check(const PropertyFilter& filter, const BamAlignment& al) { bool keepAlignment = true; const PropertyMap& properties = filter.Properties; PropertyMap::const_iterator propertyIter = properties.begin(); PropertyMap::const_iterator propertyEnd = properties.end(); for ( ; propertyIter != propertyEnd; ++propertyIter ) { // check alignment data field depending on propertyName const string& propertyName = (*propertyIter).first; const PropertyFilterValue& valueFilter = (*propertyIter).second; if ( propertyName == ALIGNMENTFLAG_PROPERTY ) keepAlignment &= valueFilter.check(al.AlignmentFlag); else if ( propertyName == CIGAR_PROPERTY ) { stringstream cigarSs; const vector<CigarOp>& cigarData = al.CigarData; if ( !cigarData.empty() ) { vector<CigarOp>::const_iterator cigarBegin = cigarData.begin(); vector<CigarOp>::const_iterator cigarIter = cigarBegin; vector<CigarOp>::const_iterator cigarEnd = cigarData.end(); for ( ; cigarIter != cigarEnd; ++cigarIter ) { const CigarOp& op = (*cigarIter); cigarSs << op.Length << op.Type; } keepAlignment &= valueFilter.check(cigarSs.str()); } } else if ( propertyName == INSERTSIZE_PROPERTY ) keepAlignment &= valueFilter.check(al.InsertSize); else if ( propertyName == ISDUPLICATE_PROPERTY ) keepAlignment &= valueFilter.check(al.IsDuplicate()); else if ( propertyName == ISFAILEDQC_PROPERTY ) keepAlignment &= valueFilter.check(al.IsFailedQC()); else if ( propertyName == ISFIRSTMATE_PROPERTY ) keepAlignment &= valueFilter.check(al.IsFirstMate()); else if ( propertyName == ISMAPPED_PROPERTY ) keepAlignment &= valueFilter.check(al.IsMapped()); else if ( propertyName == ISMATEMAPPED_PROPERTY ) keepAlignment &= valueFilter.check(al.IsMateMapped()); else if ( propertyName == ISMATEREVERSESTRAND_PROPERTY ) keepAlignment &= valueFilter.check(al.IsMateReverseStrand()); else if ( propertyName == ISPAIRED_PROPERTY ) keepAlignment &= valueFilter.check(al.IsPaired()); else if ( propertyName == ISPRIMARYALIGNMENT_PROPERTY ) keepAlignment &= 
valueFilter.check(al.IsPrimaryAlignment()); else if ( propertyName == ISPROPERPAIR_PROPERTY ) keepAlignment &= valueFilter.check(al.IsProperPair()); else if ( propertyName == ISREVERSESTRAND_PROPERTY ) keepAlignment &= valueFilter.check(al.IsReverseStrand()); else if ( propertyName == ISSECONDMATE_PROPERTY ) keepAlignment &= valueFilter.check(al.IsSecondMate()); else if ( propertyName == ISSINGLETON_PROPERTY ) { const bool isSingleton = al.IsPaired() && al.IsMapped() && !al.IsMateMapped(); keepAlignment &= valueFilter.check(isSingleton); } else if ( propertyName == MAPQUALITY_PROPERTY ) keepAlignment &= valueFilter.check(al.MapQuality); else if ( propertyName == MATEPOSITION_PROPERTY ) keepAlignment &= ( al.IsPaired() && al.IsMateMapped() && valueFilter.check(al.MateRefID) ); else if ( propertyName == MATEREFERENCE_PROPERTY ) { if ( !al.IsPaired() || !al.IsMateMapped() ) return false; BAMTOOLS_ASSERT_MESSAGE( (al.MateRefID>=0 && (al.MateRefID<(int)filterToolReferences.size())), "Invalid MateRefID"); const string& refName = filterToolReferences.at(al.MateRefID).RefName; keepAlignment &= valueFilter.check(refName); } else if ( propertyName == NAME_PROPERTY ) keepAlignment &= valueFilter.check(al.Name); else if ( propertyName == POSITION_PROPERTY ) keepAlignment &= valueFilter.check(al.Position); else if ( propertyName == QUERYBASES_PROPERTY ) keepAlignment &= valueFilter.check(al.QueryBases); else if ( propertyName == REFERENCE_PROPERTY ) { BAMTOOLS_ASSERT_MESSAGE( (al.RefID>=0 && (al.RefID<(int)filterToolReferences.size())), "Invalid RefID"); const string& refName = filterToolReferences.at(al.RefID).RefName; keepAlignment &= valueFilter.check(refName); } else if ( propertyName == TAG_PROPERTY ) keepAlignment &= checkAlignmentTag(valueFilter, al); else BAMTOOLS_ASSERT_UNREACHABLE; // if alignment fails at ANY point, just quit and return false if ( !keepAlignment ) return false; } BAMTOOLS_ASSERT_MESSAGE( keepAlignment, "Error in BamAlignmentChecker... 
keepAlignment should be true here"); return keepAlignment; }
// Window-intersects the alignments of a BAM file ("A") with the BED file "B".
//
// B is loaded into a map. Each mapped alignment becomes a BED6-style entry
// (name gets "/1" or "/2" for first/second mates, score is the mapping
// quality, strand from IsReverseStrand). With BAM output the alignment is
// written when FindOneOrMoreWindowOverlaps reports an overlap (or, with
// _noHit, when it does not); otherwise FindWindowOverlaps handles the
// reporting. Unmapped alignments are written only with _noHit and BAM output.
//
// Exits with status 1 if the BAM file cannot be opened. The BAM writer is
// only opened for BAM output, so nothing is ever written to it otherwise.
void BedWindow::WindowIntersectBam(string bamFile) {

    // load the "B" bed file into a map so
    // that we can easily compare "A" to it for overlaps
    _bedB->loadBedFileIntoMap();

    // open the BAM file
    BamReader reader;
    BamWriter writer;
    if (!reader.Open(bamFile)) {
        cerr << "Failed to open BAM file " << bamFile << endl;
        exit(1);
    }

    // get header & reference information
    string bamHeader = reader.GetHeaderText();
    RefVector refs   = reader.GetReferenceData();

    // open a BAM output to stdout if we are writing BAM
    if (_bamOutput == true) {
        // set compression mode
        BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
        if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed;
        writer.SetCompressionMode(compressionMode);
        // open our BAM writer
        writer.Open("stdout", bamHeader, refs);
    }

    vector<BED> hits;                   // vector of potential hits
    // reserve some space
    hits.reserve(100);

    _bedA->bedType = 6;
    BamAlignment bam;
    bool overlapsFound;
    // get each set of alignments for each pair.
    while (reader.GetNextAlignment(bam)) {

        if (bam.IsMapped()) {
            BED a;
            a.chrom = refs.at(bam.RefID).RefName;
            a.start = bam.Position;
            a.end   = bam.GetEndPosition(false, false);

            // build the name field from the BAM alignment.
            a.name = bam.Name;
            if (bam.IsFirstMate()) a.name += "/1";
            if (bam.IsSecondMate()) a.name += "/2";

            a.score  = ToString(bam.MapQuality);
            a.strand = "+";
            if (bam.IsReverseStrand()) a.strand = "-";

            if (_bamOutput == true) {
                overlapsFound = FindOneOrMoreWindowOverlaps(a);
                if (overlapsFound == true) {
                    if (_noHit == false)
                        writer.SaveAlignment(bam);
                }
                else {
                    if (_noHit == true)
                        writer.SaveAlignment(bam);
                }
            }
            else {
                FindWindowOverlaps(a, hits);
                hits.clear();
            }
        }
        // BAM IsMapped() is false; the writer only exists for BAM output
        else if ((_bamOutput == true) && (_noHit == true)) {
            writer.SaveAlignment(bam);
        }
    }

    // close the relevant BAM files.
    reader.Close();
    if (_bamOutput == true) {
        writer.Close();
    }
}
// Accumulate genome coverage from the mapped alignments of a BAM file.
//
// The references of the BAM header become the "genome file" (_genome).
// Alignments are consumed in file order; whenever the chromosome name differs
// from _currChromName, StartNewChrom() is called. How an alignment adds
// coverage depends on the configured mode, checked in this order:
//   _pair_chip_      : first mates of correctly oriented pairs cover the span
//                      between the two mates
//   _haveSize        : the alignment is extended to _fragmentSize
//   neither 5'/3'    : the alignment is cut into blocks (always on "D",
//                      and on "N" as well when _obeySplits is set)
//   _only_5p_end /
//   _only_3p_end     : a single position is covered
//
// bamFile: path of the BAM file; the process exits with status 1 when it
//          cannot be opened.
void BedGenomeCoverage::CoverageBam(string bamFile) {

    ResetChromCoverage();

    // open the BAM file
    BamReader reader;
    if (!reader.Open(bamFile)) {
        cerr << "Failed to open BAM file " << bamFile << endl;
        exit(1);
    }

    // get header & reference information
    string header  = reader.GetHeaderText();
    RefVector refs = reader.GetReferenceData();

    // load the BAM header references into a BEDTools "genome file"
    _genome = new GenomeFile(refs);

    // convert each aligned BAM entry to BED
    // and compute coverage on B
    BamAlignment bam;
    while (reader.GetNextAlignment(bam)) {

        // unaligned reads contribute nothing
        if (!bam.IsMapped())
            continue;

        // strand used for filtering; with _dUTP the second mate of a pair
        // whose mate is mapped is treated as lying on the opposite strand
        bool onReverse = bam.IsReverseStrand();
        if (_dUTP && bam.IsPaired() && bam.IsMateMapped() && bam.IsSecondMate())
            onReverse = !onReverse;

        // skip if we care about strands and the strand isn't what
        // the user wanted
        if (_filterByStrand == true) {
            const bool wantReverse = (_requestedStrand == "-");
            if (wantReverse != onReverse)
                continue;
        }

        // extract the chrom, start and end from the BAM alignment
        string chrom(refs.at(bam.RefID).RefName);
        CHRPOS start = bam.Position;
        CHRPOS end   = bam.GetEndPosition(false, false) - 1;

        // are we on a new chromosome?
        if (chrom != _currChromName)
            StartNewChrom(chrom);

        if (_pair_chip_) {
            // Skip if not a proper pair
            if (bam.IsPaired() && (!bam.IsProperPair() || !bam.IsMateMapped()))
                continue;

            // Skip if wrong coordinates
            // chemically designed: left on positive strand, right on reverse one
            if ( ((bam.Position < bam.MatePosition) && bam.IsReverseStrand()) ||
                 ((bam.MatePosition < bam.Position) && bam.IsMateReverseStrand()) )
                continue;

            // Disabled variant kept from the original source (centres a
            // _fragmentSize window on the middle of the pair):
            //
            // if(_haveSize) {
            //     if (bam.IsFirstMate() && bam.IsReverseStrand()) {
            //         //put fragmentSize in to the middle of pair end_fragment
            //         int mid = bam.MatePosition+abs(bam.InsertSize)/2;
            //         if(mid<_fragmentSize/2)
            //             AddCoverage(0, mid+_fragmentSize/2);
            //         else
            //             AddCoverage(mid-_fragmentSize/2, mid+_fragmentSize/2);
            //     }
            //     else if (bam.IsFirstMate() && bam.IsMateReverseStrand()) {
            //         //put fragmentSize in to the middle of pair end_fragment
            //         int mid = start+abs(bam.InsertSize)/2;
            //         if(mid<_fragmentSize/2)
            //             AddCoverage(0, mid+_fragmentSize/2);
            //         else
            //             AddCoverage(mid-_fragmentSize/2, mid+_fragmentSize/2);
            //     }
            // } else

            // only the first mate of a pair adds coverage
            if (bam.IsFirstMate()) {
                if (bam.IsReverseStrand()) {
                    // prolong to the mate to the left
                    AddCoverage(bam.MatePosition, end);
                }
                else if (bam.IsMateReverseStrand()) {
                    // prolong to the mate to the right
                    AddCoverage(start, start + abs(bam.InsertSize) - 1);
                }
            }
        }
        else if (_haveSize) {
            if (!bam.IsReverseStrand()) {
                AddCoverage(start, start + _fragmentSize - 1);
            }
            else if (end < _fragmentSize) {
                // sometimes fragmentSize is bigger :(
                AddCoverage(0, end);
            }
            else {
                AddCoverage(end + 1 - _fragmentSize, end);
            }
        }
        else if (!_only_5p_end && !_only_3p_end) {
            // add coverage accordingly.
            bedVector bedBlocks;
            // we always want to split blocks when a D CIGAR op is found.
            // if the user invokes -split, we want to also split on N ops,
            // so the "N" argument simply follows _obeySplits.
            const bool splitOnN = _obeySplits ? true : false;
            GetBamBlocks(bam, refs.at(bam.RefID).RefName, bedBlocks, true, splitOnN);
            AddBlockedCoverage(bedBlocks);
        }
        else if (_only_5p_end) {
            // 5' end: start on the forward strand, end on the reverse strand
            CHRPOS pos = end;
            if (!bam.IsReverseStrand())
                pos = start;
            AddCoverage(pos, pos);
        }
        else {
            // only _only_3p_end can be set here
            // 3' end: end on the forward strand, start on the reverse strand
            CHRPOS pos = end;
            if (bam.IsReverseStrand())
                pos = start;
            AddCoverage(pos, pos);
        }
    }

    // close the BAM
    reader.Close();

    // process the results of the last chromosome.
    ReportChromCoverage(_currChromCoverage, _currChromSize,
                        _currChromName, _currChromDepthHist);

    // report all empty chromosomes
    PrintEmptyChromosomes();

    // report the overall coverage if asked.
    PrintFinalCoverage();
}
// Intersect the alignments of a BAM file ("A") with the "B" BED file.
//
// Each mapped alignment is converted to a BED record (chrom, start, end, name
// with a /1 or /2 mate suffix, mapping quality as score, strand). Unmapped
// alignments are ignored.
//
//   * When _bamOutput is set, the alignment is written as BAM to "stdout"
//     if it has at least one overlap and _noHit is false, or if it has no
//     overlap and _noHit is true.
//   * Otherwise the overlaps are handled by FindOverlaps().
//
// When _obeySplits is set the alignment is first cut into discrete blocks by
// getBamBlocks() and overlaps are looked for within each block separately.
//
// bamFile: path of the BAM file to read. The process exits with status 1 when
//          the file cannot be opened.
void BedIntersect::IntersectBam(string bamFile) {

    // load the "B" bed file into a map so
    // that we can easily compare "A" to it for overlaps
    _bedB->loadBedFileIntoMap();

    // open the BAM file; without a readable input there is nothing to do,
    // so fail loudly instead of silently producing no output.
    BamReader reader;
    BamWriter writer;
    if (!reader.Open(bamFile)) {
        std::cerr << "Failed to open BAM file " << bamFile << std::endl;
        exit(1);
    }

    // get header & reference information
    string header  = reader.GetHeaderText();
    RefVector refs = reader.GetReferenceData();

    // open a BAM output to stdout if we are writing BAM
    if (_bamOutput == true) {
        // open our BAM writer
        writer.Open("stdout", header, refs, _isUncompressedBam);
    }

    vector<BED> hits;
    // reserve some space
    hits.reserve(100);

    _bedA->bedType = 6;

    BamAlignment bam;
    // process the alignments one at a time.
    while (reader.GetNextAlignment(bam)) {

        if (!bam.IsMapped())
            continue;

        // construct a BED entry from the current BAM alignment.
        BED a;
        a.chrom = refs.at(bam.RefID).RefName;
        a.start = bam.Position;
        a.end   = bam.GetEndPosition(false);

        // build the name field from the BAM alignment.
        a.name = bam.Name;
        if (bam.IsFirstMate())  a.name += "/1";
        if (bam.IsSecondMate()) a.name += "/2";

        a.score  = ToString(bam.MapQuality);
        a.strand = "+";
        if (bam.IsReverseStrand()) a.strand = "-";

        if (_bamOutput == true) {
            bool overlapsFound = false;

            // treat the BAM alignment as a single "block"
            if (_obeySplits == false) {
                overlapsFound = FindOneOrMoreOverlap(a);
            }
            // split the BAM alignment into discrete blocks and
            // look for overlaps only within each block.
            else {
                bedVector bedBlocks;  // vec to store the discrete BED "blocks" from a
                // we don't want to split on "D" ops, hence the "false"
                getBamBlocks(bam, refs, bedBlocks, false);

                vector<BED>::iterator bedItr = bedBlocks.begin();
                vector<BED>::iterator bedEnd = bedBlocks.end();
                for (; bedItr != bedEnd; ++bedItr) {
                    // test the current block, not the whole alignment:
                    // testing "a" here would make the split meaningless.
                    if (FindOneOrMoreOverlap(*bedItr) == true)
                        overlapsFound = true;
                }
            }

            if (overlapsFound == true) {
                if (_noHit == false)
                    writer.SaveAlignment(bam);
            }
            else {
                if (_noHit == true) {
                    writer.SaveAlignment(bam);
                }
            }
        }
        else {
            // treat the BAM alignment as a single BED "block"
            if (_obeySplits == false) {
                FindOverlaps(a, hits);
                hits.clear();
            }
            // split the BAM alignment into discrete BED blocks and
            // look for overlaps only within each block.
            else {
                bedVector bedBlocks;  // vec to store the discrete BED "blocks" from a
                getBamBlocks(bam, refs, bedBlocks, false);

                vector<BED>::const_iterator bedItr = bedBlocks.begin();
                vector<BED>::const_iterator bedEnd = bedBlocks.end();
                for (; bedItr != bedEnd; ++bedItr) {
                    FindOverlaps(*bedItr, hits);
                    hits.clear();
                }
            }
        }
    }

    // close the relevant BAM files.
    reader.Close();
    if (_bamOutput == true) {
        writer.Close();
    }
}
int main ( int argc, char *argv[] ) { struct parameters *param = 0; param = interface(param, argc, argv); //region file input (the region file should be sorted as the same way as the bam file) ifstream region_f; region_f.open(param->region_f, ios_base::in); // the region file is opened //bam input and generate index if not yet //-------------------------------------------------------------------------------------------------------+ // BAM input (file or filenames?) | //-------------------------------------------------------------------------------------------------------+ char *fof = param->mapping_f; FILE *IN=NULL; char linefof[5000]; int filecount=0; vector <string> fnames; if (strchr(fof,' ')!=NULL) { char *ptr; ptr=strtok(fof," "); while (ptr!=NULL) { fnames.push_back(ptr); filecount++; ptr=strtok(NULL," "); } } else { IN=fopen(fof,"rt"); if (IN!=NULL) { long linecount=0; while (fgets(linefof,5000-1,IN)!=NULL) { linecount++; if (linefof[0]!='#' && linefof[0]!='\n') { char *ptr=strchr(linefof,'\n'); if (ptr!=NULL && ptr[0]=='\n') { ptr[0]='\0'; } FILE *dummy=NULL; dummy=fopen(linefof,"rt"); if (dummy!=NULL) { // seems to be a file of filenames... 
fclose(dummy); fnames.push_back(linefof); filecount++; } else if (filecount==0 || linecount>=1000-1) { // seems to be a single file fnames.push_back(fof); filecount++; break; } } } fclose(IN); } } //file or file name decided and stored in vector "fnames" cerr << "the input mapping files are:" << endl; vector <string>::iterator fit = fnames.begin(); for(; fit != fnames.end(); fit++) { cerr << *fit << endl; } //-------------------------------------------------------------------------------------------------------+ // end of file or filenames | //-------------------------------------------------------------------------------------------------------+ // open the BAM file(s) BamMultiReader reader; reader.Open(fnames); // get header & reference information string header = reader.GetHeaderText(); RefVector refs = reader.GetReferenceData(); if ( ! reader.LocateIndexes() ) // opens any existing index files that match our BAM files reader.CreateIndexes(); // creates index files for BAM files that still lack one // locus bias struct lb empty_profile = {0,0,0,0}; vector <struct lb> locus_b(1000, empty_profile); // output locus bias file string locus_bias_set = param->lbias; ofstream locus_bias; if ( locus_bias_set != "" ) { locus_bias.open(param->lbias); if ( !locus_bias ) { cerr << "can not open locus_bias file.\n"; exit(0); } } //should decide which chromosome string line; string old_chr = "SRP"; string type = param->type; //whether do some position-level pile-up stuff bool posc = false; ofstream posc_f; ofstream chrmap_f; string poscset = param->posc; if ( poscset != "" ) { posc = true; posc_f.open(param->posc); chrmap_f.open(param->chrmap); } bool noChr; if ( param->nochr == 1 ){ noChr = true; } else { noChr = false; } //regions for the input of region file deque <struct region> regions; getline(region_f, line); //get the first line eatline(line,regions,noChr); deque <struct region>::iterator it = regions.begin(); while ( it->chr != old_chr ) { old_chr = it->chr; // set 
the current chr as old chr int chr_id = reader.GetReferenceID(it->chr); if ( chr_id == -1 ) { //reference not found for (; it != regions.end() && it->chr == old_chr; ) { gene_processing(*it,locus_b); // print the old region info it = regions.erase(it); // erase the current region } while ( regions.empty() ) { getline(region_f, line); if ( region_f.eof() ){ cerr << "finished: end of region file, zone 0" << endl; break; } eatline(line, regions,noChr); it = regions.begin(); if (it->chr == old_chr){ gene_processing(*it,locus_b); regions.clear(); continue; } } continue; } int chr_len = refs.at(chr_id).RefLength; if ( !reader.SetRegion(chr_id, 1, chr_id, chr_len) ) // here set region { cerr << "bamtools count ERROR: Jump region failed " << it->chr << endl; reader.Close(); exit(1); } //pile-up pos stats set <string> fragment; map <string, unsigned int> pileup; bool isposPileup = false; unsigned int old_start = 0; unsigned int total_tags = 0; unsigned int total_pos = 0; unsigned int pileup_pos = 0; BamAlignment bam; while (reader.GetNextAlignment(bam)) { if ( bam.IsMapped() == false ) continue; // skip unaligned reads unsigned int unique; bam.GetTag("NH", unique); if (param->unique == 1) { if (unique != 1) { // skipe uniquelly mapped reads continue; } } if (read_length == 0){ read_length = bam.Length; } //cout << bam.Name << endl; string chrom = refs.at(bam.RefID).RefName; string strand = "+"; if (bam.IsReverseStrand()) strand = "-"; unsigned int alignmentStart = bam.Position+1; unsigned int mateStart; if (type == "p") mateStart = bam.MatePosition+1; unsigned int alignmentEnd = bam.GetEndPosition(); unsigned int cigarEnd; vector <int> blockLengths; vector <int> blockStarts; blockStarts.push_back(0); ParseCigar(bam.CigarData, blockStarts, blockLengths, cigarEnd); // position check for unique mapped reads (because is paired-end reads, shoule base on fragment level for paired end reads) if (posc == true && unique == 1) { if (type == "p" && fragment.count(bam.Name) > 0) 
fragment.erase(bam.Name); else { total_tags++; if (type == "p"){ fragment.insert(bam.Name); } string alignSum; if (type == "p") { alignSum = int2str(alignmentStart) + "\t" + int2str(mateStart) + "\t.\t" + strand; } else { alignSum = int2str(alignmentStart) + "\t" + int2str(alignmentEnd) + "\t.\t" + strand; } if ( alignmentStart != old_start ) { isposPileup = false; map <string, unsigned int>::iterator pit = pileup.begin(); for (; pit != pileup.end(); pit++) { posc_f << chrom << "\truping\tpileup\t" << pit->first << "\t.\t" << "Pileup=" << pit->second << endl; //print pileup } pileup.clear(); //clear pileup set pileup.insert( pair <string, unsigned int> (alignSum, 1) ); //insert the new read total_pos++; } else if ( alignmentStart == old_start ) { // same starts if ( pileup.count(alignSum) > 0 ) { // pileup if ( pileup[alignSum] == 1 && isposPileup == false ) { pileup_pos++; isposPileup = true; } pileup[alignSum]++; } else { pileup.insert( pair <string, unsigned int> (alignSum, 1) ); } } //same starts } //new fragment old_start = alignmentStart; } // do pos check float incre = 1.; if (blockStarts.size() > 1) incre = 0.5; // incre half for junction reads incre /= static_cast < float >(unique); // for multi aligned reads deque <struct region>::iterator iter = regions.begin(); if ( iter->start > alignmentEnd ) continue; // skip reads not overlapping with the first region while ( iter->chr == old_chr && iter->start <= alignmentEnd && iter != regions.end() ) { if (iter->end < alignmentStart) { // the region end is beyond the alignmentStart gene_processing(*iter,locus_b); // processing iter = regions.erase(iter); // this region should be removed if ( regions.empty() ) { getline(region_f, line); // get a line of region file if ( ! 
region_f.eof() ) { eatline(line, regions, noChr); // eat a line and put it into the duque iter = regions.begin(); } else { // it's reaching the end of the region file cerr << "finished: end of region file, zone 3" << endl; break; } } continue; } if (iter->end >= alignmentStart && iter->start <= alignmentEnd) { //overlapping, should take action vector <int>::iterator cigit = blockStarts.begin(); for (; cigit != blockStarts.end(); cigit++) { unsigned int current_start = *cigit + alignmentStart; int current_pos = current_start - (iter->start); //cout << iter->chr << "\t" << iter->start << "\t" << iter->end << "\t" << current_start << endl; if ( (iter->tags).count(current_pos) > 0 ) { (iter->tags)[current_pos] += incre; } else (iter->tags).insert( pair<int, float>(current_pos, incre) ); } } // overlapping take action! if ( (iter+1) != regions.end() ) iter++; // if this region is not the last element in the deque else { // the last element getline(region_f, line); // get a line of region file if ( ! 
region_f.eof() ){ eatline(line, regions, noChr); // eat a line and put it into the duque iter = regions.end(); iter--; } else { //it's reaching the end of the region file cerr << "finished: end of region file, zone 4" << endl; break; } } } //while } // read a bam // print chr map if (posc == true) { chrmap_f << old_chr << "\t" << total_tags << "\t" << total_pos << "\t" << pileup_pos << endl; } //somehow to loop back it = regions.begin(); //reset to begin for (; it != regions.end() && it->chr == old_chr; ) { gene_processing(*it,locus_b); // print the old region info it = regions.erase(it); // erase the current region } while ( regions.empty() ) { getline(region_f, line); if ( region_f.eof() ){ cerr << "finished: end of region file, zone 5" << endl; //print locus bias for (unsigned int l = 0; l < 1000; l++){ locus_bias << l << "\t" << locus_b[l].ps << "\t" << locus_b[l].hs << "\t" << locus_b[l].pe << "\t" << locus_b[l].he << endl; } exit(0); } eatline(line, regions, noChr); it = regions.begin(); if (it->chr == old_chr){ gene_processing(*it, locus_b); regions.clear(); continue; } } } // region chr != old chr regions.clear(); reader.Close(); region_f.close(); return 0; } //main