// Sets up the state for the read group that `al` belongs to.
//
// The group name is taken from the alignment's read-group tag, falling back
// to "none" when the tag is absent. Unless the name appears in `blacklist`,
// the two output files <prefix>/<name>_1.fq.gz and <prefix>/<name>_2.fq.gz
// are opened. Finally the alignment itself is passed to witness().
ReadGroup::ReadGroup(BamAlignment &al, int max_isize, int isize_samples,
                     string prefix, list<string> blacklist)
    : max_isize(max_isize),
      isize_samples(isize_samples),
      prefix(prefix),
      blacklisted(false)
{
    // Alignments without a read-group tag are pooled under "none".
    if (!al.GetReadGroup(name)) {
        name = "none";
    }
    nreads = 0;

    // Scan the blacklist; stop at the first entry equal to this group's name.
    list<string>::iterator entry = blacklist.begin();
    while (entry != blacklist.end() && !blacklisted) {
        blacklisted = (*entry == name);
        ++entry;
    }

    // Blacklisted groups get no output files.
    if (!blacklisted) {
        const string base = prefix + "/" + name;
        f1.open((base + "_1.fq.gz").c_str());
        f2.open((base + "_2.fq.gz").c_str());
    }

    witness(al);
}
// Trims the alignment's bases and qualities down to the span covered by one
// of its terminal CIGAR operations.
//
// copcomp() decides between the first and the last CIGAR operation: when
// copcomp(last, first) is true the leading `first.Length` characters are
// kept, otherwise the trailing `last.Length` characters are kept. The kept
// length is capped at al.Length. On a substr failure the details are printed
// and the program exits with status 1.
//
// Precondition: al.CigarData must be non-empty (the first and last elements
// are read unconditionally).
void clipAlignment(BamAlignment &al)
{
    CigarOp cop1 = al.CigarData.front();
    CigarOp cop2 = al.CigarData.back();

    int offset;
    int length;
    if (copcomp(cop2, cop1)) {
        // Keep the head of the read.
        length = min(al.Length, (signed)cop1.Length);
        offset = 0;
    } else {
        // Keep the tail of the read.
        length = min(al.Length, (signed)cop2.Length);
        offset = al.Length - min(al.Length, (signed)cop2.Length);
    }

    try {
        al.Qualities = al.Qualities.substr(offset, length);
        al.QueryBases = al.QueryBases.substr(offset, length);
    } catch (exception &e) {
        cout << "ERROR: substr failed in clipAlignment()" << endl;
        cout << al.Name << " " << (al.IsReverseStrand() ? "(-)" : "(+)");
        cout << " offset: " << offset
             << " length: " << length
             << " taglen: " << al.Length << endl;
        cout << "cop1: " << cop1.Length << cop1.Type << endl;
        cout << "cop2: " << cop2.Length << cop2.Type << endl;
        exit(1);
    }
}
// Processes one match/mismatch stretch of the CIGAR string, recording a SNP
// variant for every position where the read base differs from the reference.
//
// Parameters:
//   alignment     - the read being processed
//   read_variants - receives every SNP found in this stretch (appended)
//   op_length     - number of bases in the CIGAR operation
//   refseq        - reference sequence for the alignment
//   refpos        - offset into refseq where this stretch starts
//   readpos       - offset into alignment.QueryBases where this stretch starts
//
// Each SNP is also inserted into block_variants.
//
// Returns op_length, the number of bases covered by this operation.
// NOTE(review): the original fell off the end of this non-void function
// (undefined behaviour), so the intended return value could not be recovered
// from the code; confirm against the callers.
pos_t VariantProcessor::processMatchOrMismatch(const BamAlignment& alignment, vector<VariantPtr>& read_variants, const uint32_t& op_length, const string& refseq, const pos_t& refpos, const pos_t& readpos) {
  const int endpos = alignment.GetEndPosition();
  (void)endpos;  // only used by the assertion below

  const int n_bases = static_cast<int>(op_length);
  for (int i = 0; i < n_bases; i++) {
    assert(alignment.Position + i < endpos);
    const char query_base = alignment.QueryBases[readpos + i];

    assert(static_cast<size_t>(refpos + i) < refseq.size());
    const char ref_base = refseq[refpos + i];

    if (ref_base != query_base) {
      // SNP
      // The original also read a base quality here, indexed by the reference
      // offset and never used; that read was dropped because it could index
      // past the end of Qualities.
      string ref(1, ref_base), alt(1, query_base);
      VariantPtr snp(new Variant(VariantType::SNP, alignment.RefID, alignment.Position+i, 1, 0, ref, alt));
      block_variants.insert(snp);
      read_variants.push_back(snp);
    }
  }

  return static_cast<pos_t>(op_length);
}
int main (int argc, char *argv[]) { if( (argc== 1) || (argc== 2 && string(argv[1]) == "-h") || (argc== 2 && string(argv[1]) == "-help") || (argc== 2 && string(argv[1]) == "--help") ){ cout<<"Usage:setAsUnpaired [in bam] [outbam]"<<endl<<"this program takes flags all paired sequences as singles"<<endl; return 1; } string bamfiletopen = string(argv[1]); string bamFileOUT = string(argv[2]); BamReader reader; BamWriter writer; if ( !reader.Open(bamfiletopen) ) { cerr << "Could not open input BAM files." << endl; return 1; } const SamHeader header = reader.GetHeader(); const RefVector references = reader.GetReferenceData(); if ( !writer.Open(bamFileOUT,header,references) ) { cerr << "Could not open output BAM file "<<bamFileOUT << endl; return 1; } BamAlignment al; while ( reader.GetNextAlignment(al) ) { if(al.IsMapped()){ cerr << "Cannot yet handle mapped reads " << endl; return 1; } al.SetIsPaired (false); writer.SaveAlignment(al); } //while al reader.Close(); writer.Close(); return 0; }
int main (int argc, char *argv[]) { if( (argc== 1) || (argc== 2 && string(argv[1]) == "-h") || (argc== 2 && string(argv[1]) == "-help") || (argc== 2 && string(argv[1]) == "--help") ){ cout<<"Usage:editDist [in bam]"<<endl<<"this program returns the NM field of all aligned reads"<<endl; return 1; } string bamfiletopen = string(argv[1]); // cout<<bamfiletopen<<endl; BamReader reader; // cout<<"ok"<<endl; if ( !reader.Open(bamfiletopen) ) { cerr << "Could not open input BAM files." << endl; return 1; } BamAlignment al; // cout<<"ok"<<endl; while ( reader.GetNextAlignment(al) ) { // cout<<al.Name<<endl; if(!al.IsMapped()) continue; if(al.HasTag("NM") ){ int editDist; if(al.GetTag("NM",editDist) ){ cout<<editDist<<endl; }else{ cerr<<"Cannot retrieve NM field for "<<al.Name<<endl; return 1; } }else{ cerr<<"Warning: read "<<al.Name<<" is aligned but has no NM field"<<endl; } } //while al reader.Close(); return 0; }
int VariantProcessor::run() { int nmapped = 0; last_aln_pos = 0; bool stop; BamAlignment al; // TODO: mem copying issues? if (!reader.IsOpen()) { std::cerr << "error: BAM file '" << filename << "' is not open." << std::endl; } while (reader.GetNextAlignment(al)) { if (!al.IsMapped()) continue; // TODO add chromosome checking code here assert(al.Position >= last_aln_pos); // ensure is sorted if (al.Position > block_start) { // only check if we can stop if we've moved in the block stop = isBlockEnd(al); } if (stop) { // end of block; process all variants in this block and output // haplotype count statistics processBlockAlignments(); // reset block blockReset((pos_t) al.Position); stop = false; } else { // process read last_aln_pos = processAlignment(al); } // for debug TODO for (set<VariantPtr>::const_iterator it = block_variants.begin(); it != block_variants.end(); ++it) { (*it)->print(); } nmapped++; } return nmapped; }
// returns region state - whether alignment ends before, overlaps, or starts after currently specified region // this *internal* method should ONLY called when (at least) IsLeftBoundSpecified == true BamReaderPrivate::RegionState BamReaderPrivate::IsOverlap(BamAlignment& bAlignment) { // if alignment is on any reference sequence before left bound if ( bAlignment.RefID < Region.LeftRefID ) return BEFORE_REGION; // if alignment starts on left bound reference else if ( bAlignment.RefID == Region.LeftRefID ) { // if alignment starts at or after left boundary if ( bAlignment.Position >= Region.LeftPosition) { // if right boundary is specified AND // left/right boundaries are on same reference AND // alignment starts past right boundary if ( Region.isRightBoundSpecified() && Region.LeftRefID == Region.RightRefID && bAlignment.Position > Region.RightPosition ) return AFTER_REGION; // otherwise, alignment is within region return WITHIN_REGION; } // alignment starts before left boundary else { // check if alignment overlaps left boundary if ( bAlignment.GetEndPosition() >= Region.LeftPosition ) return WITHIN_REGION; else return BEFORE_REGION; } } // alignment starts on a reference after the left bound else { // if region has a right boundary if ( Region.isRightBoundSpecified() ) { // alignment is on reference between boundaries if ( bAlignment.RefID < Region.RightRefID ) return WITHIN_REGION; // alignment is on reference after right boundary else if ( bAlignment.RefID > Region.RightRefID ) return AFTER_REGION; // alignment is on right bound reference else { // check if alignment starts before or at right boundary if ( bAlignment.Position <= Region.RightPosition ) return WITHIN_REGION; else return AFTER_REGION; } } // otherwise, alignment is after left bound reference, but there is no right boundary else return WITHIN_REGION; } }
void getBamBlocks(const BamAlignment &bam, const RefVector &refs, vector<BED> &blocks, bool breakOnDeletionOps) { CHRPOS currPosition = bam.Position; CHRPOS blockStart = bam.Position; string chrom = refs.at(bam.RefID).RefName; string name = bam.Name; string strand = "+"; string score = ToString(bam.MapQuality); char prevOp = '\0'; if (bam.IsReverseStrand()) strand = "-"; bool blocksFound = false; vector<CigarOp>::const_iterator cigItr = bam.CigarData.begin(); vector<CigarOp>::const_iterator cigEnd = bam.CigarData.end(); for ( ; cigItr != cigEnd; ++cigItr ) { if (cigItr->Type == 'M') { currPosition += cigItr->Length; // we only want to create a new block if the current M op // was preceded by an N op or a D op (and we are breaking on D ops) if ((prevOp == 'D' && breakOnDeletionOps == true) || (prevOp == 'N')) { blocks.push_back( BED(chrom, blockStart, currPosition, name, score, strand) ); blockStart = currPosition; } } else if (cigItr->Type == 'D') { if (breakOnDeletionOps == false) currPosition += cigItr->Length; else { blocksFound = true; currPosition += cigItr->Length; blockStart = currPosition; } } else if (cigItr->Type == 'N') { blocks.push_back( BED(chrom, blockStart, currPosition, name, score, strand) ); blocksFound = true; currPosition += cigItr->Length; blockStart = currPosition; } else if (cigItr->Type == 'S' || cigItr->Type == 'H' || cigItr->Type == 'P' || cigItr->Type == 'I') { // do nothing } else { cerr << "Input error: invalid CIGAR type (" << cigItr->Type << ") for: " << bam.Name << endl; exit(1); } prevOp = cigItr->Type; } // if there were no splits, we just create a block representing the contiguous alignment. if (blocksFound == false) { blocks.push_back( BED(chrom, bam.Position, currPosition, name, score, strand) ); } }
int DataStatisticsTool::Execute() { // iterate over reads in BAM file(s) BamAlignment alignObj; while(bamReader.GetNextAlignment(alignObj)) { if (alignObj.IsDuplicate()) continue; if (alignObj.IsFailedQC()) continue; if (!alignObj.IsMapped()) continue; if (!alignObj.IsPrimaryAlignment()) continue; if (alignObj.IsPaired() && !alignObj.IsProperPair()) continue; if (alignObj.IsPaired() && !alignObj.IsMateMapped()) continue; if (!alignObj.HasTag("MD")) continue; // // debug // GenericBamAlignmentTools::printBamAlignmentCigar(alignObj); // GenericBamAlignmentTools::printBamAlignmentMD(alignObj); // shift InDel GenericBamAlignmentTools::leftShiftInDel(alignObj); // // debug // GenericBamAlignmentTools::printBamAlignmentCigar(alignObj); // GenericBamAlignmentTools::printBamAlignmentMD(alignObj); // get the alignment sequences string alignRead; string alignGenome; GenericBamAlignmentTools::getAlignmentSequences(alignObj, alignRead, alignGenome); // update the statistics statistics.update(alignRead, alignGenome); } // print to screen cout << statistics << endl; // statistics.printMatchMismatch(); // close BAM reader bamReader.Close(); // close Fasta genomeFasta.Close(); return 1; }
// get next alignment (with character data fully parsed) bool BamReaderPrivate::GetNextAlignment(BamAlignment& alignment) { // if valid alignment found if ( GetNextAlignmentCore(alignment) ) { // store alignment's "source" filename alignment.Filename = m_filename; // return success/failure of parsing char data if ( alignment.BuildCharData() ) return true; else { const string alError = alignment.GetErrorString(); const string message = string("could not populate alignment data: \n\t") + alError; SetErrorString("BamReader::GetNextAlignment", message); return false; } } // no valid alignment found return false; }
//{{{ void process_pair(const BamAlignment &curr, void SV_Pair:: process_pair(const BamAlignment &curr, const RefVector refs, map<string, BamAlignment> &mapped_pairs, UCSCBins<SV_BreakPoint*> &r_bin, int weight, int ev_id, SV_PairReader *reader) { if (mapped_pairs.find(curr.Name) == mapped_pairs.end()) mapped_pairs[curr.Name] = curr; else { SV_Pair *new_pair = new SV_Pair(mapped_pairs[curr.Name], curr, refs, weight, ev_id, reader); //cerr << count_clipped(curr.CigarData) << "\t" << //count_clipped(mapped_pairs[curr.Name].CigarData) << endl; if ( new_pair->is_sane() && new_pair->is_aberrant() && (count_clipped(curr.CigarData) > 0) && (count_clipped(mapped_pairs[curr.Name].CigarData) > 0) ) { SV_BreakPoint *new_bp = new_pair->get_bp(); #ifdef TRACE cerr << "READ\t" << refs.at(mapped_pairs[curr.Name].RefID).RefName << "," << mapped_pairs[curr.Name].Position << "," << (mapped_pairs[curr.Name].GetEndPosition(false, false) - 1) << "\t" << refs.at(curr.RefID).RefName << "," << curr.Position << "," << (curr.GetEndPosition(false, false) - 1) << endl; cerr << "\tPE\t" << *new_bp << endl; #endif new_bp->cluster(r_bin); } else { delete(new_pair); } mapped_pairs.erase(curr.Name); } }
// Recreates the reference sequence spanned by an alignment from its query
// bases, CIGAR string and MD tag.
//
// Step 1 builds the reference-aligned portion of the read: only the CIGAR
// operations that align query bases to the reference (M, =, X) contribute;
// insertions and soft clips are skipped over in the read; deletions, skipped
// regions, hard clips and padding consume no query bases at all.
// Step 2 walks the MD tokens: matches copy from the aligned read, SNPs and
// deletions copy the reference bases stored in the token.
//
// The original treated every operation other than S and I as consuming (and
// contributing) query bases, which shifted the read position on D/N/H/P.
//
// Returns the reconstructed reference bases. The result of the MD tag lookup
// is not checked; an alignment without an MD tag is tokenized as an empty
// string.
string createReferenceSequence(const BamAlignment& alignment) {
  // Bound by reference: neither container is modified here.
  const vector<CigarOp>& cigar = alignment.CigarData;
  const string& querybases = alignment.QueryBases;

  string md_tag;
  alignment.GetTag("MD", md_tag);

  vector<MDToken> tokens;
  string refseq, alignedseq; // final ref bases; aligned portion of read bases
  const int md_len = TokenizeMD(md_tag, tokens);
  (void)md_len;  // only used by the assertion below

  // Create reference-aligned sequence of read; doesn't contain soft
  // clips or insertions. Then, deletions and reference alleles are
  // added onto this.
  string::size_type pos = 0;
  for (vector<CigarOp>::const_iterator op = cigar.begin(); op != cigar.end(); ++op) {
    switch (op->Type) {
      case 'M':
      case '=':
      case 'X':
        // aligned bases: keep them and advance in the read
        alignedseq.append(querybases.substr(pos, op->Length));
        pos += op->Length;
        break;
      case 'I':
      case 'S':
        // present in the read but not aligned: skip past them
        pos += op->Length;
        break;
      default:
        // D, N, H, P: no query bases are consumed
        break;
    }
  }

  // the size of the aligned sequence MUST equal what is returned from
  // TokenizeMD: the number of aligned bases. Note the real reference
  // sequence is this length + deletions, which we add in below.
  assert(alignedseq.size() == static_cast<string::size_type>(md_len));

  pos = 0;
  for (vector<MDToken>::const_iterator it = tokens.begin(); it != tokens.end(); ++it) {
    if (it->type == MDType::isMatch) {
      refseq.append(alignedseq.substr(pos, it->length));
      pos += it->length;
    } else if (it->type == MDType::isSNP) {
      assert(it->length == it->seq.size());
      refseq.append(it->seq);
      pos += it->length;
    } else if (it->type == MDType::isDel) {
      // does not increment position in alignedseq
      assert(it->length == it->seq.size());
      refseq.append(it->seq);
    } else {
      assert(false);
    }
  }
  return refseq;
}
// Accumulates a per-base depth histogram for one reference sequence.
//
// Walks positions 0..refLen-1. Each alignment starting at the current
// position raises the depth, and its end position is recorded in a circular
// buffer so the depth drops again once that position is reached. After each
// position, hist[depth] is incremented, growing `hist` as needed.
//
// `al` must hold the next unprocessed alignment on entry and is advanced via
// GetNextAlignment(al, reader, refID). An alignment must span fewer than
// maxReadLen (1000) reference bases, otherwise the circular buffer would
// wrap; this is asserted.
void CountDepth(Histogram& hist, BamMultiReader& reader, BamAlignment& al, int32_t refID, int64_t refLen) {
    bool moreReads = (al.RefID == refID);

    const int32_t maxReadLen = 1000;
    vector<int64_t> readEnds(maxReadLen);  // readEnds[p % maxReadLen]: reads ending at p
    int64_t depth = 0;

    for (int64_t pos = 0; pos < refLen; ++pos) {

        // take in every alignment that starts here
        while (moreReads && al.Position == pos) {
            ++depth;
            assert(al.GetEndPosition() - pos < maxReadLen);
            ++readEnds[al.GetEndPosition() % maxReadLen];
            moreReads = GetNextAlignment(al, reader, refID);
        }

        // retire the alignments that end here and free the slot for reuse
        depth -= readEnds[pos % maxReadLen];
        assert(depth >= 0);
        readEnds[pos % maxReadLen] = 0;

        // Grow the histogram so that hist[depth] exists. Doubling alone is
        // not enough when depth is 0 and the histogram is empty: 2 * 0
        // leaves it empty and the increment below would write out of bounds.
        if (static_cast<size_t>(depth) >= hist.size())
            hist.resize(depth > 0 ? 2 * depth : 1);
        ++hist[depth];
    }
}
// get next alignment (with character data fully parsed) bool BamReaderPrivate::GetNextAlignment(BamAlignment& alignment) { // if valid alignment found if ( GetNextAlignmentCore(alignment) ) { // store alignment's "source" filename alignment.Filename = m_filename; // return success/failure of parsing char data return alignment.BuildCharData(); } // no valid alignment found return false; }
bool processReadPair(const BamAlignment& al1, const BamAlignment& al2, const RefVector& refs, const int32_t totalTail, const int32_t critTail, const bool diff_ref) { if ((al1.IsFirstMate() && al2.IsFirstMate()) || (al1.IsSecondMate() && al2.IsSecondMate())) { cerr << "Incompatible mate orders: name1 = " << al1.Name << " is1stmate " << al1.IsFirstMate() << " is2ndmate " << al1.IsSecondMate() << " name2 = " << al2.Name << " is1stmate " << al2.IsFirstMate() << " is2ndmate " << al2.IsSecondMate() << endl; exit(1); } int32_t total_tail = -1; if (! (total_tail = checkLinkPair(al1, al2, refs, totalTail, critTail, diff_ref))) { return false; // reject all but link pairs // continue; } if (critTail && ! checkLinkPairCandidate(al1, refs, critTail) && ! checkLinkPairCandidate(al2, refs, critTail)) { return false; // neither read was a link pair candidate } if (debug_processReadPair) cout << "---------------------------------" << endl; int32_t lpc_tail1 = checkLinkPairCandidate(al1, refs, critTail); int32_t lpc_tail2 = checkLinkPairCandidate(al2, refs, critTail); if (debug_processReadPair) { printAlignmentInfo(al1, refs); if (lpc_tail1) { cout << "LINK PAIR CANDIDATE "; cout << ((lpc_tail1 > 0) ? "--->" : "<---") << " " << lpc_tail1 << endl; } printAlignmentInfo(al2, refs); if (lpc_tail2) { cout << "LINK PAIR CANDIDATE "; cout << ((lpc_tail2 > 0) ? "--->" : "<---") << " " << lpc_tail2 << endl; } cout << "TOTAL TAIL " << (abs(readTail(al1, refs)) + abs(readTail(al2, refs))) << endl; } return true; }
bool shouldRealign(BamAlignment& alignment, string& ref, long int offset, Parameters& params, AlignmentStats& stats) { if (allN(alignment.QueryBases)) { if (params.debug) { cerr << "not realigning because query is all Ns! " << alignment.Name << endl; } return false; } if (!alignment.IsMapped()) { if (params.debug) { cerr << "realigning because read " << alignment.Name << " is not mapped " << endl; } return true; } if (alignment.CigarData.empty()) { cerr << "realigning because alignment " << alignment.Name << " @ " << alignment.Position << " has empty (or corrupted?) CIGAR" << endl; return true; } Cigar cigar(alignment.CigarData); countMismatchesAndGaps(alignment, cigar, ref, offset, stats, params.debug); if (stats.mismatch_qsum >= params.mismatch_qsum_threshold || stats.softclip_qsum >= params.softclip_qsum_threshold || stats.gaps >= params.gap_count_threshold || stats.gapslen >= params.gap_length_threshold) { if (params.debug) { cerr << "realigning because read " << alignment.Name << " meets mismatch (q" << stats.mismatch_qsum << " in " << stats.mismatches << ")" //<< " vs. " << params.mismatch_qsum_threshold << ")," << " softclip (q" << stats.softclip_qsum << " in " << stats.softclips << ")" //<< " vs. " << params.softclip_qsum_threshold << ")," << " gap count (" << stats.gaps << ")" //" vs. " << params.gap_count_threshold << ")," << " or gap length (" << stats.gapslen << ")" //<< " vs. " << params.gap_length_threshold << ") " << " thresholds" << endl; cerr << cigar << endl; } return true; } else { return false; } }
/**
 * Looks up the library name of a record via its read group.
 *
 * The record's RG tag selects a read group in the header; that read group's
 * library is returned. When the record has no RG tag, the header does not
 * contain the read group, or the read group names no library, the constant
 * string "Unknown Library" is returned instead.
 */
string MarkDuplicates::getLibraryName(SamHeader & header, const BamAlignment & rec) {

    static const string RG("RG");
    static const string unknown_library("Unknown Library");

    string read_group;
    rec.GetTag(RG, read_group);

    // No usable read group on the record, or not listed in the header.
    if (read_group.empty() || !header.ReadGroups.Contains(read_group)) {
        return unknown_library;
    }

    const SamReadGroup & rg = header.ReadGroups[read_group];
    if (!rg.HasLibrary()) {
        return unknown_library;
    }
    return rg.Library;
}
//bool SampleManager::IdentifySample(Alignment& ra) const bool SampleManager::IdentifySample(const BamAlignment& alignment, int& sample_index, bool& primary_sample) const { string read_group; if (!alignment.GetTag("RG", read_group)) { cerr << "ERROR: Couldn't find read group id (@RG tag) for BAM Alignment " << alignment.Name << endl; exit(1); } map<string,int>::const_iterator I = read_group_to_sample_idx_.find(read_group); if (I == read_group_to_sample_idx_.end()) return false; sample_index =I->second; primary_sample = (sample_index == primary_sample_); return true; }
// Folds one alignment into the contig's coverage tracks.
//
// Unmapped and low-quality reads are ignored. Every other read adds to
// readCov over [startRead, endRead). A first mate of a proper pair also adds
// to insertCov over the insert, starting at whichever of the read and mate
// positions compares lower. The read is finally added to the track matching
// its status (singCov, mdcCov, woCov or cmCov).
void Contig::updateContig(BamAlignment b, int max_nsert, bool is_mp) {
	const readStatus read_status = computeReadType(b, max_nsert, is_mp);

	// nothing to record for unmapped or low-quality reads
	if (read_status == unmapped || read_status == lowQualty) {
		return;
	}

	const uint32_t readLength    = b.Length;
	const uint32_t startRead     = b.Position;
	const uint32_t endRead       = startRead + readLength; // position where the read ends
	const uint32_t startMateRead = b.MatePosition;

	// every read that got this far counts toward read coverage
	updateCov(startRead, endRead, readCov);

	// insert coverage is recorded once per proper pair, on the first mate
	if (b.IsFirstMate() && read_status == pair_proper) {
		int iSize = abs(b.InsertSize);
		const uint32_t insertStart = (startRead < startMateRead) ? startRead : startMateRead;
		updateCov(insertStart, insertStart + iSize, insertCov);
	}

	switch (read_status) {
	case singleton:
		updateCov(startRead, endRead, singCov);
		break;
	case pair_wrongChrs:
		updateCov(startRead, endRead, mdcCov);
		break;
	case pair_wrongDistance:
		updateCov(startRead, endRead, woCov);
		// NOTE(review): no break here (it is commented out in the original),
		// so control falls through and woCov is updated a second time for
		// wrong-distance pairs. Confirm this is intended.
	case pair_wrongOrientation:
		updateCov(startRead, endRead, woCov);
		break;
	case pair_proper:
		updateCov(startRead, endRead, cmCov);
		break;
	default:
		cout << read_status << " --> This should never be printed\n";
		break;
	}
}
// Can be either unique or multi-mapping reads inline void PROBerReadModel_iCLIP::update(AlignmentGroup& ag) { int size = ag.size(); BamAlignment *ba = NULL; char dir; if (size > 1) { assert(model_type >= 2); double frac = 1.0 / size; for (int i = 0; i < size; ++i) { ba = ag.getAlignment(i); fld->update(ba->getInsertSize(), frac); } return; } assert(ag.getSEQ(seq)); if (model_type & 1) assert(ag.getQUAL(qual)); for (int i = 0; i < size; ++i) { ba = ag.getAlignment(i); dir = ba->getMateDir(); assert(ba->getCIGAR(cigar)); assert(ba->getMD(mdstr)); refseq.setUp(dir, cigar, mdstr, seq); seqmodel->update(1.0, dir, 0, &refseq, &cigar, &seq, ((model_type & 1) ? &qual : NULL)); } if (model_type >= 2) { assert(ag.getSEQ(seq, 2)); if (model_type & 1) assert(ag.getQUAL(qual, 2)); for (int i = 0; i < size; ++i) { ba = ag.getAlignment(i); dir = ba->getMateDir(2); assert(ba->getCIGAR(cigar, 2)); assert(ba->getMD(mdstr, 2)); refseq.setUp(dir, cigar, mdstr, seq); seqmodel->update(1.0, dir, 0, &refseq, &cigar, &seq, ((model_type & 1) ? &qual : NULL)); } } }
// Computes normalized conditional probabilities for every alignment in a
// group and writes them to conprbs[0..ag.size()-1].
//
// Each alignment's probability is the sequence-model probability of mate 1,
// multiplied (when model_type >= 2) by the sequence-model probability of
// mate 2 and the fragment length probability. The values are then divided by
// their sum; if the sum is not positive they are left unscaled (divided by 1).
//
// The getSEQ/getQUAL/getCIGAR/getMD calls fill member buffers and must run
// in every build. The original placed them inside assert(), so they were
// removed entirely when compiled with NDEBUG. They are now called
// unconditionally and only the check on their result is asserted.
inline void PROBerReadModel_iCLIP::calcProbs(AlignmentGroup& ag, double* conprbs) {
	int size = ag.size();
	BamAlignment *ba = NULL;
	char dir;
	bool ok = false;

	// mate 1
	ok = ag.getSEQ(seq);
	assert(ok);
	if (model_type & 1) {
		ok = ag.getQUAL(qual);
		assert(ok);
	}
	for (int i = 0; i < size; ++i) {
		ba = ag.getAlignment(i);
		dir = ba->getMateDir();
		ok = ba->getCIGAR(cigar);
		assert(ok);
		ok = ba->getMD(mdstr);
		assert(ok);
		refseq.setUp(dir, cigar, mdstr, seq);
		conprbs[i] = seqmodel->getProb(dir, 0, &refseq, &cigar, &seq, ((model_type & 1) ? &qual : NULL));
	}

	// mate 2 and fragment length
	if (model_type >= 2) {
		ok = ag.getSEQ(seq, 2);
		assert(ok);
		if (model_type & 1) {
			ok = ag.getQUAL(qual, 2);
			assert(ok);
		}
		for (int i = 0; i < size; ++i) {
			ba = ag.getAlignment(i);
			dir = ba->getMateDir(2);
			ok = ba->getCIGAR(cigar, 2);
			assert(ok);
			ok = ba->getMD(mdstr, 2);
			assert(ok);
			refseq.setUp(dir, cigar, mdstr, seq);
			conprbs[i] *= seqmodel->getProb(dir, 0, &refseq, &cigar, &seq, ((model_type & 1) ? &qual : NULL));
			conprbs[i] *= fld->getProb(ba->getInsertSize()); // fragment length distribution
		}
	}

	(void)ok;  // silences the unused-variable warning when asserts are disabled

	// normalize; a non-positive total leaves the values unscaled
	double sum = 0.0;
	for (int i = 0; i < size; ++i) sum += conprbs[i];
	if (sum <= 0.0) sum = 1.0;
	for (int i = 0; i < size; ++i) conprbs[i] /= sum;
}
//increases the counters mismatches and typesOfMismatches of a given BamAlignment object inline void increaseCounters(BamAlignment & al,string & reconstructedReference,int firstCycleRead,int increment){ char refeBase; char readBase; int cycleToUse=firstCycleRead; // cout<<"name "<<al.Name<<endl; // cout<<"firstCycleRead "<<firstCycleRead<<endl; // cout<<"increment "<<increment<<endl; for(int i=0;i<numberOfCycles;i++,cycleToUse+=increment){ // cout<<"i = "<<i<<" cyc "<<cycleToUse<<endl; refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //match if(refeBase == 'M'){ matches[cycleToUse]++; continue; } if(refeBase == 'S' ||refeBase == 'I'){ //don't care about soft clipped or indels continue; } //mismatch if( isResolvedDNA(refeBase) && isResolvedDNA(readBase) ){ if(al.IsReverseStrand()){ //need to take the complement refeBase=complement(refeBase); readBase=complement(readBase); } if(readBase == refeBase){ cerr<<"Internal error in reconstruction of read "<<al.Name<<", contact developer"<<endl; exit(1);; } mismatches[cycleToUse]++; typesOfMismatches[dimer2index(refeBase,readBase)][cycleToUse]++; continue; } } }
void getBamBlocks(const BamAlignment &bam, const RefVector &refs, BedVec &blocks, bool breakOnDeletionOps) { CHRPOS currPosition = bam.Position; CHRPOS blockStart = bam.Position; string chrom = refs.at(bam.RefID).RefName; string name = bam.Name; string strand = "+"; float score = bam.MapQuality; if (bam.IsReverseStrand()) strand = "-"; vector<CigarOp>::const_iterator cigItr = bam.CigarData.begin(); vector<CigarOp>::const_iterator cigEnd = bam.CigarData.end(); for ( ; cigItr != cigEnd; ++cigItr ) { if (cigItr->Type == 'M') { currPosition += cigItr->Length; blocks.push_back( Bed(chrom, blockStart, currPosition, name, score, strand) ); blockStart = currPosition; } else if (cigItr->Type == 'D') { if (breakOnDeletionOps == false) currPosition += cigItr->Length; else { currPosition += cigItr->Length; blockStart = currPosition; } } else if (cigItr->Type == 'N') { currPosition += cigItr->Length; blockStart = currPosition; } else if (cigItr->Type == 'S' || cigItr->Type == 'H' || cigItr->Type == 'P' || cigItr->Type == 'I') { // do nothing } else { cerr << "Input error: invalid CIGAR type (" << cigItr->Type << ") for: " << bam.Name << endl; exit(1); } } }
/*
 * snip() returns a copy of the alignment whose bases and qualities are cut
 * down to `len` characters starting at `start`. The copy is not a valid
 * BamAlignment; it only carries correct FASTQ data.
 *
 * Negative strand alignments are handled: 'start=0' always corresponds to
 * the 5'-most basepair in the read, so for reverse-strand reads the window
 * is taken from the opposite end of the stored sequence.
 *
 * If the window cannot be extracted, diagnostics are printed to stdout and
 * the program exits with status 1.
 */
BamAlignment snip(BamAlignment &a, int start, int len)
{
    BamAlignment copy(a);

    /* Translate the 5'-relative start into an offset in the stored strings */
    int converted_start = start;
    if (copy.IsReverseStrand()) {
        converted_start = copy.Length - start - len;
    }

    copy.Length = len;
    try {
        copy.QueryBases = copy.QueryBases.substr(converted_start, len);
        copy.Qualities  = copy.Qualities.substr(converted_start, len);
    } catch (exception &e) {
        cout << "ERROR: substr failed in snip(" << a.Name << ", "
             << start << ", " << len << ")" << endl;
        cout << (a.IsReverseStrand() ? "(-)" : "(+)")
             << ", converted_start: " << converted_start << endl;
        cout << a.QueryBases << endl;
        cout << a.Qualities << endl;
        exit(1);
    }

    return copy;
}
// print BamAlignment in FASTA format // N.B. - uses QueryBases NOT AlignedBases void ConvertTool::ConvertToolPrivate::PrintFasta(const BamAlignment& a) { // >BamAlignment.Name // BamAlignment.QueryBases (up to FASTA_LINE_MAX bases per line) // ... // // N.B. - QueryBases are reverse-complemented if aligned to reverse strand // print header m_out << ">" << a.Name << endl; // handle reverse strand alignment - bases string sequence = a.QueryBases; if ( a.IsReverseStrand() ) Utilities::ReverseComplement(sequence); // if sequence fits on single line if ( sequence.length() <= FASTA_LINE_MAX ) m_out << sequence << endl; // else split over multiple lines else { size_t position = 0; size_t seqLength = sequence.length(); // handle reverse strand alignment - bases & qualitiesth(); // write subsequences to each line while ( position < (seqLength - FASTA_LINE_MAX) ) { m_out << sequence.substr(position, FASTA_LINE_MAX) << endl; position += FASTA_LINE_MAX; } // write final subsequence m_out << sequence.substr(position) << endl; } }
// print BamAlignment in SAM format void ConvertTool::ConvertToolPrivate::PrintSam(const BamAlignment& a) { // tab-delimited // <QNAME> <FLAG> <RNAME> <POS> <MAPQ> <CIGAR> <MRNM> <MPOS> <ISIZE> <SEQ> <QUAL> [ <TAG>:<VTYPE>:<VALUE> [...] ] // write name & alignment flag m_out << a.Name << "\t" << a.AlignmentFlag << "\t"; // write reference name if ( (a.RefID >= 0) && (a.RefID < (int)m_references.size()) ) m_out << m_references[a.RefID].RefName << "\t"; else m_out << "*\t"; // write position & map quality m_out << a.Position+1 << "\t" << a.MapQuality << "\t"; // write CIGAR const vector<CigarOp>& cigarData = a.CigarData; if ( cigarData.empty() ) m_out << "*\t"; else { vector<CigarOp>::const_iterator cigarIter = cigarData.begin(); vector<CigarOp>::const_iterator cigarEnd = cigarData.end(); for ( ; cigarIter != cigarEnd; ++cigarIter ) { const CigarOp& op = (*cigarIter); m_out << op.Length << op.Type; } m_out << "\t"; } // write mate reference name, mate position, & insert size if ( a.IsPaired() && (a.MateRefID >= 0) && (a.MateRefID < (int)m_references.size()) ) { if ( a.MateRefID == a.RefID ) m_out << "=\t"; else m_out << m_references[a.MateRefID].RefName << "\t"; m_out << a.MatePosition+1 << "\t" << a.InsertSize << "\t"; } else m_out << "*\t0\t0\t"; // write sequence if ( a.QueryBases.empty() ) m_out << "*\t"; else m_out << a.QueryBases << "\t"; // write qualities if ( a.Qualities.empty() || (a.Qualities.at(0) == (char)0xFF) ) m_out << "*"; else m_out << a.Qualities; // write tag data const char* tagData = a.TagData.c_str(); const size_t tagDataLength = a.TagData.length(); size_t index = 0; while ( index < tagDataLength ) { // write tag name string tagName = a.TagData.substr(index, 2); m_out << "\t" << tagName << ":"; index += 2; // get data type char type = a.TagData.at(index); ++index; switch ( type ) { case (Constants::BAM_TAG_TYPE_ASCII) : m_out << "A:" << tagData[index]; ++index; break; case (Constants::BAM_TAG_TYPE_INT8) : case (Constants::BAM_TAG_TYPE_UINT8) : 
m_out << "i:" << (int)tagData[index]; ++index; break; case (Constants::BAM_TAG_TYPE_INT16) : m_out << "i:" << BamTools::UnpackSignedShort(&tagData[index]); index += sizeof(int16_t); break; case (Constants::BAM_TAG_TYPE_UINT16) : m_out << "i:" << BamTools::UnpackUnsignedShort(&tagData[index]); index += sizeof(uint16_t); break; case (Constants::BAM_TAG_TYPE_INT32) : m_out << "i:" << BamTools::UnpackSignedInt(&tagData[index]); index += sizeof(int32_t); break; case (Constants::BAM_TAG_TYPE_UINT32) : m_out << "i:" << BamTools::UnpackUnsignedInt(&tagData[index]); index += sizeof(uint32_t); break; case (Constants::BAM_TAG_TYPE_FLOAT) : m_out << "f:" << BamTools::UnpackFloat(&tagData[index]); index += sizeof(float); break; case (Constants::BAM_TAG_TYPE_HEX) : case (Constants::BAM_TAG_TYPE_STRING) : m_out << type << ":"; while (tagData[index]) { m_out << tagData[index]; ++index; } ++index; break; } if ( tagData[index] == '\0') break; } m_out << endl; }
// print BamAlignment in JSON format void ConvertTool::ConvertToolPrivate::PrintJson(const BamAlignment& a) { // write name & alignment flag m_out << "{\"name\":\"" << a.Name << "\",\"alignmentFlag\":\"" << a.AlignmentFlag << "\","; // write reference name if ( (a.RefID >= 0) && (a.RefID < (int)m_references.size()) ) m_out << "\"reference\":\"" << m_references[a.RefID].RefName << "\","; // write position & map quality m_out << "\"position\":" << a.Position+1 << ",\"mapQuality\":" << a.MapQuality << ","; // write CIGAR const vector<CigarOp>& cigarData = a.CigarData; if ( !cigarData.empty() ) { m_out << "\"cigar\":["; vector<CigarOp>::const_iterator cigarBegin = cigarData.begin(); vector<CigarOp>::const_iterator cigarIter = cigarBegin; vector<CigarOp>::const_iterator cigarEnd = cigarData.end(); for ( ; cigarIter != cigarEnd; ++cigarIter ) { const CigarOp& op = (*cigarIter); if (cigarIter != cigarBegin) m_out << ","; m_out << "\"" << op.Length << op.Type << "\""; } m_out << "],"; } // write mate reference name, mate position, & insert size if ( a.IsPaired() && (a.MateRefID >= 0) && (a.MateRefID < (int)m_references.size()) ) { m_out << "\"mate\":{" << "\"reference\":\"" << m_references[a.MateRefID].RefName << "\"," << "\"position\":" << a.MatePosition+1 << ",\"insertSize\":" << a.InsertSize << "},"; } // write sequence if ( !a.QueryBases.empty() ) m_out << "\"queryBases\":\"" << a.QueryBases << "\","; // write qualities if ( !a.Qualities.empty() && a.Qualities.at(0) != (char)0xFF ) { string::const_iterator s = a.Qualities.begin(); m_out << "\"qualities\":[" << static_cast<short>(*s) - 33; ++s; for ( ; s != a.Qualities.end(); ++s ) m_out << "," << static_cast<short>(*s) - 33; m_out << "],"; } // write alignment's source BAM file m_out << "\"filename\":" << a.Filename << ","; // write tag data const char* tagData = a.TagData.c_str(); const size_t tagDataLength = a.TagData.length(); size_t index = 0; if ( index < tagDataLength ) { m_out << "\"tags\":{"; while ( index < 
tagDataLength ) { if ( index > 0 ) m_out << ","; // write tag name m_out << "\"" << a.TagData.substr(index, 2) << "\":"; index += 2; // get data type char type = a.TagData.at(index); ++index; switch ( type ) { case (Constants::BAM_TAG_TYPE_ASCII) : m_out << "\"" << tagData[index] << "\""; ++index; break; case (Constants::BAM_TAG_TYPE_INT8) : case (Constants::BAM_TAG_TYPE_UINT8) : m_out << (int)tagData[index]; ++index; break; case (Constants::BAM_TAG_TYPE_INT16) : m_out << BamTools::UnpackSignedShort(&tagData[index]); index += sizeof(int16_t); break; case (Constants::BAM_TAG_TYPE_UINT16) : m_out << BamTools::UnpackUnsignedShort(&tagData[index]); index += sizeof(uint16_t); break; case (Constants::BAM_TAG_TYPE_INT32) : m_out << BamTools::UnpackSignedInt(&tagData[index]); index += sizeof(int32_t); break; case (Constants::BAM_TAG_TYPE_UINT32) : m_out << BamTools::UnpackUnsignedInt(&tagData[index]); index += sizeof(uint32_t); break; case (Constants::BAM_TAG_TYPE_FLOAT) : m_out << BamTools::UnpackFloat(&tagData[index]); index += sizeof(float); break; case (Constants::BAM_TAG_TYPE_HEX) : case (Constants::BAM_TAG_TYPE_STRING) : m_out << "\""; while (tagData[index]) { if (tagData[index] == '\"') m_out << "\\\""; // escape for json else m_out << tagData[index]; ++index; } m_out << "\""; ++index; break; } if ( tagData[index] == '\0') break; } m_out << "}"; } m_out << "}" << endl; }
// Entry point of the "asequantmultirg" command.
//
// For every reference sequence in the BAM file, walks the alignments and
// tallies, per SNP / per read group / per strand, how many reads show the
// reference allele, the alternate allele, or some other base. The result is
// written to stdout as a tab-separated table with one row per SNP and one
// column per read group listed in the BAM header; each cell holds
// "fwd_ref,fwd_alt,fwd_other,rev_ref,rev_alt,rev_other". Progress messages go
// to stderr.
//
// Relies on names declared outside this function (bam_reader, bam_file,
// snps_by_chrom, min_map_qual, min_base_qual, ascii_offset) - presumably set
// up by Init(all_args); confirm against its definition.
int main_asequantmultirg(const vector<string> &all_args) {
    Init(all_args);

    cerr << "* Reading bam file " << endl;
    OpenBam(bam_reader, bam_file);
    bam_reader.OpenIndex(bam_file + ".bai");

    // Collect the read group IDs from the header; they define the output columns
    vector<string> readGroupVector;
    SamHeader header = bam_reader.GetHeader();
    SamReadGroupDictionary headerRG = header.ReadGroups;
    for (SamReadGroupIterator it = headerRG.Begin(); it != headerRG.End(); it ++) {
        readGroupVector.push_back(it -> ID);
    }

    vector<RefData> chroms = bam_reader.GetReferenceData();

    // Header row: fixed columns followed by one column per read group
    cout << "#CHROM" << "\t" << "POS" << "\t" << "REF" << "\t" << "ALT";
    for (vector<string>::iterator it = readGroupVector.begin(); it != readGroupVector.end(); it ++) {
        cout << "\t" << *it;
    }
    cout << endl;

    StlFor(chrom_idx, chroms) {
        string &chrom = chroms[chrom_idx].RefName;
        // Local copy of this chromosome's SNPs; counts are accumulated in the
        // copy and printed from it below
        vector<Snp> snps = snps_by_chrom[chrom];
        int s = 0; // Index into snp array
        BamAlignment bam;
        bam_reader.Jump(chrom_idx);
        string align;
        string qualities;
        cerr << "* On chrom " << chrom << endl;

        while (bam_reader.GetNextAlignment(bam) && bam.RefID == chrom_idx) {
            // Skip unmapped reads and reads below the mapping quality threshold
            if (bam.MapQuality < min_map_qual || !bam.IsMapped()) continue;

            string currentRG;
            Assert(bam.GetReadGroup(currentRG));

            int start = AlignStart(bam);
            int end = AlignEnd(bam);

            // Move the current SNP pointer so that it is ahead of the read's start (since bam alignments are in sorted order)
            while (s < snps.size() && snps[s].pos < start) ++s;

            // Stop everything if we have visited all SNPs on this chrom
            if (s >= snps.size()) break;

            // Find any/all SNPs that are within the bam alignment
            int n = 0; // Number of SNPs overlapped
            while ((s + n) < snps.size() && snps[s + n].pos < end) // Then it overlaps!
                ++n;

            // Now, look at each SNP and see which way it votes
            // (align and qualities are indexed below by offset from 'start';
            //  presumably AlignedString/AlignedQualities produce reference-aligned
            //  strings with '-' for bases absent from the read - confirm)
            AlignedString(bam, align);
            AlignedQualities(bam, qualities);
            Assert(align.size() == qualities.size());

            // Now, tally votes
            for (int i = 0; i < n; ++i) {
                Snp &snp = snps[s + i];
                char base = align[snp.pos - start]; // Base from the read
                int qual = int(qualities[snp.pos - start]) - ascii_offset; // Quality of that base, with ascii_offset removed
                //AssertMsg(qual >= 0 && qual <= 100, ToStr(qual) + "\n" + bam.Name + "\n" + CigarToStr(bam.CigarData) + "\n" + bam.QueryBases + "\n" + bam.Qualities);

                // Ignore gap positions and low-quality bases
                if (base == '-' || qual < min_base_qual) continue;

                // Counts are kept separately per strand, keyed by read group
                map<string, Counts> &RG_counts = bam.IsReverseStrand() ? snp.rev : snp.fwd;
                map<string, Counts>::iterator searchIt = RG_counts.find(currentRG);
                if (searchIt == RG_counts.end()) {
                    // First vote for this read group: create the entry and set
                    // all three counters explicitly
                    if (base == snp.ref) {
                        RG_counts[currentRG].num_ref = 1;
                        RG_counts[currentRG].num_alt = 0;
                        RG_counts[currentRG].num_other = 0;
                    } else if (base == snp.alt) {
                        RG_counts[currentRG].num_ref = 0;
                        RG_counts[currentRG].num_alt = 1;
                        RG_counts[currentRG].num_other = 0;
                    } else {
                        RG_counts[currentRG].num_ref = 0;
                        RG_counts[currentRG].num_alt = 0;
                        RG_counts[currentRG].num_other = 1;
                    }
                } else {
                    // Entry already exists: bump the matching counter
                    if (base == snp.ref) {
                        searchIt -> second.num_ref += 1;
                    } else if (base == snp.alt) {
                        searchIt -> second.num_alt += 1;
                    } else {
                        searchIt -> second.num_other += 1;
                    }
                }
            }
        }

        // Output counts
        // One row per SNP; pos is printed as pos + 1 (presumably 0-based to
        // 1-based - confirm). Read groups without any counts print zeros.
        for (int s = 0; s < snps.size(); ++s) {
            cout << chrom << "\t" << snps[s].pos + 1 << "\t" << snps[s].ref << "\t" << snps[s].alt;
            for (vector<string>::iterator it = readGroupVector.begin(); it != readGroupVector.end(); it ++) {
                // Forward-strand counts (followed by a trailing comma)
                map<string, Counts>::iterator searchIt = snps[s].fwd.find(*it);
                if (searchIt != snps[s].fwd.end()) {
                    cout << "\t" << searchIt -> second.num_ref << "," << searchIt -> second.num_alt << "," << searchIt -> second.num_other << ",";
                } else {
                    cout << "\t" << "0,0,0,";
                }
                // Reverse-strand counts
                searchIt = snps[s].rev.find(*it);
                if (searchIt != snps[s].rev.end()) {
                    cout << searchIt -> second.num_ref << "," << searchIt -> second.num_alt << "," << searchIt -> second.num_other;
                } else {
                    cout << "0,0,0";
                }
            }
            cout << endl;
        }
    }
    // NOTE(review): the visible text of this function stops at the brace above,
    // which closes the StlFor loop; the function's own closing brace (and any
    // return statement) is not part of this chunk - verify against the full file.
int IonstatsTestFragments(int argc, const char *argv[]) { OptArgs opts; opts.ParseCmdLine(argc, argv); string input_bam_filename = opts.GetFirstString('i', "input", ""); string fasta_filename = opts.GetFirstString('r', "ref", ""); string output_json_filename = opts.GetFirstString('o', "output", "ionstats_tf.json"); int histogram_length = opts.GetFirstInt ('h', "histogram-length", 400); if(argc < 2 or input_bam_filename.empty() or fasta_filename.empty()) { IonstatsTestFragmentsHelp(); return 1; } // // Prepare for metric calculation // map<string,string> tf_sequences; PopulateReferenceSequences(tf_sequences, fasta_filename); BamReader input_bam; if (!input_bam.Open(input_bam_filename)) { fprintf(stderr, "[ionstats] ERROR: cannot open %s\n", input_bam_filename.c_str()); return 1; } int num_tfs = input_bam.GetReferenceCount(); SamHeader sam_header = input_bam.GetHeader(); if(!sam_header.HasReadGroups()) { fprintf(stderr, "[ionstats] ERROR: no read groups in %s\n", input_bam_filename.c_str()); return 1; } string flow_order; string key; for (SamReadGroupIterator rg = sam_header.ReadGroups.Begin(); rg != sam_header.ReadGroups.End(); ++rg) { if(rg->HasFlowOrder()) flow_order = rg->FlowOrder; if(rg->HasKeySequence()) key = rg->KeySequence; } // Need these metrics stratified by TF. 
vector<ReadLengthHistogram> called_histogram(num_tfs); vector<ReadLengthHistogram> aligned_histogram(num_tfs); vector<ReadLengthHistogram> AQ10_histogram(num_tfs); vector<ReadLengthHistogram> AQ17_histogram(num_tfs); vector<SimpleHistogram> error_by_position(num_tfs); vector<MetricGeneratorSNR> system_snr(num_tfs); vector<MetricGeneratorHPAccuracy> hp_accuracy(num_tfs); for (int tf = 0; tf < num_tfs; ++tf) { called_histogram[tf].Initialize(histogram_length); aligned_histogram[tf].Initialize(histogram_length); AQ10_histogram[tf].Initialize(histogram_length); AQ17_histogram[tf].Initialize(histogram_length); error_by_position[tf].Initialize(histogram_length); } vector<uint16_t> flow_signal_fz(flow_order.length()); vector<int16_t> flow_signal_zm(flow_order.length()); const RefVector& refs = input_bam.GetReferenceData(); // Missing: // - hp accuracy - tough, copy verbatim from TFMapper? BamAlignment alignment; vector<char> MD_op; vector<int> MD_len; MD_op.reserve(1024); MD_len.reserve(1024); string MD_tag; // // Main loop over mapped reads in the input BAM // while(input_bam.GetNextAlignment(alignment)) { if (!alignment.IsMapped() or !alignment.GetTag("MD",MD_tag)) continue; // The check below eliminates unexpected alignments if (alignment.IsReverseStrand() or alignment.Position > 5) continue; int current_tf = alignment.RefID; // // Step 1. Parse MD tag // MD_op.clear(); MD_len.clear(); for (const char *MD_ptr = MD_tag.c_str(); *MD_ptr;) { int item_length = 0; if (*MD_ptr >= '0' and *MD_ptr <= '9') { // Its a match MD_op.push_back('M'); for (; *MD_ptr and *MD_ptr >= '0' and *MD_ptr <= '9'; ++MD_ptr) item_length = 10*item_length + *MD_ptr - '0'; } else { if (*MD_ptr == '^') { // Its a deletion MD_ptr++; MD_op.push_back('D'); } else // Its a substitution MD_op.push_back('X'); for (; *MD_ptr and *MD_ptr >= 'A' and *MD_ptr <= 'Z'; ++MD_ptr) item_length++; } MD_len.push_back(item_length); } // // Step 2. 
Synchronously scan through Cigar and MD, doing error accounting // int MD_idx = alignment.IsReverseStrand() ? MD_op.size()-1 : 0; int cigar_idx = alignment.IsReverseStrand() ? alignment.CigarData.size()-1 : 0; int increment = alignment.IsReverseStrand() ? -1 : 1; int AQ10_bases = 0; int AQ17_bases = 0; int num_bases = 0; int num_errors = 0; while (cigar_idx < (int)alignment.CigarData.size() and MD_idx < (int) MD_op.size() and cigar_idx >= 0 and MD_idx >= 0) { if (alignment.CigarData[cigar_idx].Length == 0) { // Try advancing cigar cigar_idx += increment; continue; } if (MD_len[MD_idx] == 0) { // Try advancing MD MD_idx += increment; continue; } // Match if (alignment.CigarData[cigar_idx].Type == 'M' and MD_op[MD_idx] == 'M') { int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]); num_bases += advance; alignment.CigarData[cigar_idx].Length -= advance; MD_len[MD_idx] -= advance; // Insertion (read has a base, reference doesn't) } else if (alignment.CigarData[cigar_idx].Type == 'I') { int advance = alignment.CigarData[cigar_idx].Length; for (int cnt = 0; cnt < advance; ++cnt) { error_by_position[current_tf].Add(num_bases); num_bases++; num_errors++; } alignment.CigarData[cigar_idx].Length -= advance; // Deletion (reference has a base, read doesn't) } else if (alignment.CigarData[cigar_idx].Type == 'D' and MD_op[MD_idx] == 'D') { int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]); for (int cnt = 0; cnt < advance; ++cnt) { error_by_position[current_tf].Add(num_bases); num_errors++; } alignment.CigarData[cigar_idx].Length -= advance; MD_len[MD_idx] -= advance; // Substitution } else if (MD_op[MD_idx] == 'X') { int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]); for (int cnt = 0; cnt < advance; ++cnt) { error_by_position[current_tf].Add(num_bases); num_bases++; num_errors++; } alignment.CigarData[cigar_idx].Length -= advance; MD_len[MD_idx] -= advance; } else { printf("ionstats tf: Unexpected 
OP combination: %s Cigar=%c, MD=%c !\n", alignment.Name.c_str(), alignment.CigarData[cigar_idx].Type, MD_op[MD_idx]); break; } if (num_errors*10 <= num_bases) AQ10_bases = num_bases; if (num_errors*50 <= num_bases) AQ17_bases = num_bases; } // // Step 3. Profit // called_histogram[current_tf].Add(alignment.Length); aligned_histogram[current_tf].Add(num_bases); AQ10_histogram[current_tf].Add(AQ10_bases); AQ17_histogram[current_tf].Add(AQ17_bases); if(alignment.GetTag("ZM", flow_signal_zm)) system_snr[current_tf].Add(flow_signal_zm, key.c_str(), flow_order); else if(alignment.GetTag("FZ", flow_signal_fz)) system_snr[current_tf].Add(flow_signal_fz, key.c_str(), flow_order); // HP accuracy - keeping it simple if (!alignment.IsReverseStrand()) { string genome = key + tf_sequences[refs[current_tf].RefName]; string calls = key + alignment.QueryBases; const char *genome_ptr = genome.c_str(); const char *calls_ptr = calls.c_str(); for (int flow = 0; flow < (int)flow_order.length() and *genome_ptr and *calls_ptr; ++flow) { int genome_hp = 0; int calls_hp = 0; while (*genome_ptr == flow_order[flow]) { genome_hp++; genome_ptr++; } while (*calls_ptr == flow_order[flow]) { calls_hp++; calls_ptr++; } hp_accuracy[current_tf].Add(genome_hp, calls_hp); } } } // // Processing complete, generate ionstats_tf.json // Json::Value output_json(Json::objectValue); output_json["meta"]["creation_date"] = get_time_iso_string(time(NULL)); output_json["meta"]["format_name"] = "ionstats_tf"; output_json["meta"]["format_version"] = "1.0"; output_json["results_by_tf"] = Json::objectValue; for (int tf = 0; tf < num_tfs; ++tf) { if (aligned_histogram[tf].num_reads() < 1000) continue; called_histogram[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["full"]); aligned_histogram[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["aligned"]); AQ10_histogram[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["AQ10"]); 
AQ17_histogram[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["AQ17"]); error_by_position[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["error_by_position"]); system_snr[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]); hp_accuracy[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]); output_json["results_by_tf"][refs[tf].RefName]["sequence"] = tf_sequences[refs[tf].RefName]; } input_bam.Close(); ofstream out(output_json_filename.c_str(), ios::out); if (out.good()) { out << output_json.toStyledString(); return 0; } else { fprintf(stderr, "ERROR: unable to write to '%s'\n", output_json_filename.c_str()); return 1; } }
void PileupEngine::PileupEnginePrivate::ParseAlignmentCigar(const BamAlignment& al) { // skip if unmapped if ( !al.IsMapped() ) return; // intialize local variables int genomePosition = al.Position; int positionInAlignment = 0; bool isNewReadSegment = true; bool saveAlignment = true; PileupAlignment pileupAlignment(al); // iterate over CIGAR operations const int numCigarOps = (const int)al.CigarData.size(); for (int i = 0; i < numCigarOps; ++i ) { const CigarOp& op = al.CigarData.at(i); // if op is MATCH if ( op.Type == 'M' ) { // if match op overlaps current position if ( genomePosition + (int)op.Length > CurrentPosition ) { // set pileup data pileupAlignment.IsCurrentDeletion = false; pileupAlignment.IsNextDeletion = false; pileupAlignment.IsNextInsertion = false; pileupAlignment.PositionInAlignment = positionInAlignment + (CurrentPosition - genomePosition); // check for beginning of read segment if ( genomePosition == CurrentPosition && isNewReadSegment ) pileupAlignment.IsSegmentBegin = true; // if we're at the end of a match operation if ( genomePosition + (int)op.Length - 1 == CurrentPosition ) { // if not last operation if ( i < numCigarOps - 1 ) { // check next CIGAR op const CigarOp& nextOp = al.CigarData.at(i+1); // if next CIGAR op is DELETION if ( nextOp.Type == 'D') { pileupAlignment.IsNextDeletion = true; pileupAlignment.DeletionLength = nextOp.Length; } // if next CIGAR op is INSERTION else if ( nextOp.Type == 'I' ) { pileupAlignment.IsNextInsertion = true; pileupAlignment.InsertionLength = nextOp.Length; } // if next CIGAR op is either DELETION or INSERTION if ( nextOp.Type == 'D' || nextOp.Type == 'I' ) { // if there is a CIGAR op after the DEL/INS if ( i < numCigarOps - 2 ) { const CigarOp& nextNextOp = al.CigarData.at(i+2); // if next CIGAR op is clipping or ref_skip if ( nextNextOp.Type == 'S' || nextNextOp.Type == 'N' || nextNextOp.Type == 'H' ) pileupAlignment.IsSegmentEnd = true; } else { pileupAlignment.IsSegmentEnd = true; // if next CIGAR 
op is clipping or ref_skip if ( nextOp.Type == 'S' || nextOp.Type == 'N' || nextOp.Type == 'H' ) pileupAlignment.IsSegmentEnd = true; } } // otherwise else { // if next CIGAR op is clipping or ref_skip if ( nextOp.Type == 'S' || nextOp.Type == 'N' || nextOp.Type == 'H' ) pileupAlignment.IsSegmentEnd = true; } } // else this is last operation else pileupAlignment.IsSegmentEnd = true; } } // increment markers genomePosition += op.Length; positionInAlignment += op.Length; } // if op is DELETION else if ( op.Type == 'D' ) { // if deletion op overlaps current position if ( genomePosition + (int)op.Length > CurrentPosition ) { // set pileup data pileupAlignment.IsCurrentDeletion = true; pileupAlignment.IsNextDeletion = false; pileupAlignment.IsNextInsertion = true; pileupAlignment.PositionInAlignment = positionInAlignment + (CurrentPosition - genomePosition); } // increment marker genomePosition += op.Length; } // if op is REF_SKIP else if ( op.Type == 'N' ) { genomePosition += op.Length; } // if op is INSERTION or SOFT_CLIP else if ( op.Type == 'I' || op.Type == 'S' ) { positionInAlignment += op.Length; } // checl for beginning of new read segment if ( op.Type == 'N' || op.Type == 'S' || op.Type == 'H' ) isNewReadSegment = true; else isNewReadSegment = false; // if we've moved beyond current position if ( genomePosition > CurrentPosition ) { if ( op.Type == 'N' ) saveAlignment = false; // ignore alignment if REF_SKIP break; } } // save pileup position if flag is true if ( saveAlignment ) CurrentPileupData.PileupAlignments.push_back( pileupAlignment ); }