//{{{ SV_Pair:: SV_Pair(const BamAlignment &bam_a, // if both reads are on the same chrome, then read_l must map before read_r // if the reads are on different strands then read_l must be on the lexo // lesser chrom (using the string.compare() method) SV_Pair:: SV_Pair(const BamAlignment &bam_a, const BamAlignment &bam_b, const RefVector &refs, int _weight, int _ev_id, SV_PairReader *_reader) { reader = _reader; if ( bam_a.MapQuality < bam_b.MapQuality ) min_mapping_quality = bam_a.MapQuality; else min_mapping_quality = bam_b.MapQuality; struct interval tmp_a, tmp_b; tmp_a.start = bam_a.Position; tmp_a.end = bam_a.GetEndPosition(false, false) - 1; tmp_a.chr = refs.at(bam_a.RefID).RefName; if ( bam_a.IsReverseStrand() == true ) tmp_a.strand = '-'; else tmp_a.strand = '+'; tmp_b.start = bam_b.Position; tmp_b.end = bam_b.GetEndPosition(false, false) - 1; tmp_b.chr = refs.at(bam_b.RefID).RefName; if ( bam_b.IsReverseStrand() == true ) tmp_b.strand = '-'; else tmp_b.strand = '+'; //if ( tmp_a.chr.compare(tmp_b.chr) > 0 ) { if ( bam_a.RefID < bam_b.RefID ) { read_l = tmp_a; read_r = tmp_b; //} else if ( tmp_a.chr.compare(tmp_b.chr) < 0 ) { } else if ( bam_a.RefID > bam_b.RefID) { read_l = tmp_b; read_r = tmp_a; } else { // == if (tmp_a.start > tmp_b.start) { read_l = tmp_b; read_r = tmp_a; } else { read_l = tmp_a; read_r = tmp_b; } } weight = _weight; ev_id = _ev_id; }
int computeInsertSize(const BamAlignment & firstEnd, const BamAlignment & secondEnd) { if (!firstEnd.IsMapped() || !secondEnd.IsMapped()) { return 0; } if (!firstEnd.RefID == secondEnd.RefID) { return 0; } const int firstEnd5PrimePosition = firstEnd.IsReverseStrand()? firstEnd.Position + firstEnd.Length: firstEnd.Position; const int secondEnd5PrimePosition = secondEnd.IsReverseStrand()? secondEnd.Position + secondEnd.Length: secondEnd.Position; const int adjustment = (secondEnd5PrimePosition >= firstEnd5PrimePosition) ? +1 : -1; return secondEnd5PrimePosition - firstEnd5PrimePosition + adjustment; }
// Convenience overload: unpacks the fields of an alignment (mapped flag,
// strand, position, length of its reference, number of aligned bases) and
// forwards them to readTailS().
int32_t readTail(const BamAlignment& al, const RefVector& refs) {
    const bool isMapped = al.IsMapped();
    const bool isReverse = al.IsReverseStrand();
    return readTailS(isMapped,
                     isReverse,
                     al.Position,
                     refs[al.RefID].RefLength,
                     al.AlignedBases.length());
}
/*
 * Trims QueryBases and Qualities of an alignment down to the stretch that
 * corresponds to either its first or its last CIGAR operation.
 *
 * copcomp(last, first) selects which end is kept: when it returns true the
 * leading first.Length characters are kept, otherwise the trailing
 * last.Length characters are kept.  Lengths are capped at al.Length.
 * al.Length itself and the CIGAR are left untouched.
 *
 * An alignment without any CIGAR operation is left unmodified (the original
 * code indexed the empty CigarData vector, which is undefined behaviour).
 *
 * On a substr failure diagnostic information is printed and the process
 * exits with status 1.
 */
void clipAlignment(BamAlignment &al) {
    // Nothing to base the clipping on (e.g. no CIGAR stored for this read).
    if (al.CigarData.empty()) {
        return;
    }

    int offset, length;
    CigarOp cop1 = al.CigarData.front();
    CigarOp cop2 = al.CigarData.back();

    if (copcomp(cop2, cop1)) {
        // keep the leading part of the read
        offset = 0;
        length = min(al.Length, (signed)cop1.Length);
    } else {
        // keep the trailing part of the read
        offset = al.Length - min(al.Length, (signed)cop2.Length);
        length = min(al.Length, (signed)cop2.Length);
    }

    try {
        al.Qualities = al.Qualities.substr(offset, length);
        al.QueryBases = al.QueryBases.substr(offset, length);
    } catch (const exception &) {
        cout << "ERROR: substr failed in clipAlignment()" << endl;
        cout << al.Name << " " << (al.IsReverseStrand() ? "(-)" : "(+)");
        cout << " offset: " << offset
             << " length: " << length
             << " taglen: " << al.Length << endl;
        cout << "cop1: " << cop1.Length << cop1.Type << endl;
        cout << "cop2: " << cop2.Length << cop2.Type << endl;
        exit(1);
    }
}
void BamToFastq::SingleFastq() { // open the 1st fastq file for writing ofstream fq(_fastq1.c_str(), ios::out); if ( !fq ) { cerr << "Error: The first fastq file (" << _fastq1 << ") could not be opened. Exiting!" << endl; exit (1); } // open the BAM file BamReader reader; reader.Open(_bamFile); BamAlignment bam; while (reader.GetNextAlignment(bam)) { // extract the sequence and qualities for the BAM "query" string seq = bam.QueryBases; string qual = bam.Qualities; if (bam.IsReverseStrand() == true) { reverseComplement(seq); reverseSequence(qual); } fq << "@" << bam.Name << endl; fq << seq << endl; fq << "+" << endl; fq << qual << endl; } }
// print BamAlignment in FASTQ format // N.B. - uses QueryBases NOT AlignedBases void ConvertTool::ConvertToolPrivate::PrintFastq(const BamAlignment& a) { // @BamAlignment.Name // BamAlignment.QueryBases // + // BamAlignment.Qualities // // N.B. - QueryBases are reverse-complemented (& Qualities reversed) if aligned to reverse strand . // Name is appended "/1" or "/2" if paired-end, to reflect which mate this entry is. // handle paired-end alignments string name = a.Name; if ( a.IsPaired() ) name.append( (a.IsFirstMate() ? "/1" : "/2") ); // handle reverse strand alignment - bases & qualities string qualities = a.Qualities; string sequence = a.QueryBases; if ( a.IsReverseStrand() ) { Utilities::Reverse(qualities); Utilities::ReverseComplement(sequence); } // write to output stream m_out << "@" << name << endl << sequence << endl << "+" << endl << qualities << endl; }
void getBamBlocks(const BamAlignment &bam, const RefVector &refs, vector<BED> &blocks, bool breakOnDeletionOps) { CHRPOS currPosition = bam.Position; CHRPOS blockStart = bam.Position; string chrom = refs.at(bam.RefID).RefName; string name = bam.Name; string strand = "+"; string score = ToString(bam.MapQuality); char prevOp = '\0'; if (bam.IsReverseStrand()) strand = "-"; bool blocksFound = false; vector<CigarOp>::const_iterator cigItr = bam.CigarData.begin(); vector<CigarOp>::const_iterator cigEnd = bam.CigarData.end(); for ( ; cigItr != cigEnd; ++cigItr ) { if (cigItr->Type == 'M') { currPosition += cigItr->Length; // we only want to create a new block if the current M op // was preceded by an N op or a D op (and we are breaking on D ops) if ((prevOp == 'D' && breakOnDeletionOps == true) || (prevOp == 'N')) { blocks.push_back( BED(chrom, blockStart, currPosition, name, score, strand) ); blockStart = currPosition; } } else if (cigItr->Type == 'D') { if (breakOnDeletionOps == false) currPosition += cigItr->Length; else { blocksFound = true; currPosition += cigItr->Length; blockStart = currPosition; } } else if (cigItr->Type == 'N') { blocks.push_back( BED(chrom, blockStart, currPosition, name, score, strand) ); blocksFound = true; currPosition += cigItr->Length; blockStart = currPosition; } else if (cigItr->Type == 'S' || cigItr->Type == 'H' || cigItr->Type == 'P' || cigItr->Type == 'I') { // do nothing } else { cerr << "Input error: invalid CIGAR type (" << cigItr->Type << ") for: " << bam.Name << endl; exit(1); } prevOp = cigItr->Type; } // if there were no splits, we just create a block representing the contiguous alignment. if (blocksFound == false) { blocks.push_back( BED(chrom, bam.Position, currPosition, name, score, strand) ); } }
void ConvertTool::ConvertToolPrivate::PrintBed(const BamAlignment& a) { // tab-delimited, 0-based half-open // (e.g. a 50-base read aligned to pos 10 could have BED coordinates (10, 60) instead of BAM coordinates (10, 59) ) // <chromName> <chromStart> <chromEnd> <readName> <score> <strand> m_out << m_references.at(a.RefID).RefName << '\t' << a.Position << '\t' << a.GetEndPosition() << '\t' << a.Name << '\t' << a.MapQuality << '\t' << (a.IsReverseStrand() ? '-' : '+') << std::endl; }
/*
 * Computes coverage of the "B" BED file by the mapped alignments of a BAM
 * file, then reports it (counts when _countsOnly is set, full coverage
 * otherwise).
 *
 * When _obeySplits is false each alignment is counted as one interval;
 * otherwise it is split into blocks via GetBamBlocks and counted with
 * countSplitHits.
 */
void BedCoverage::CollectCoverageBam(string bamFile) {

    // load "B" into a map so alignments can be compared against it
    _bedB->loadBedCovFileIntoMap();

    // open the BAM file and fetch header / reference information
    BamReader reader;
    reader.Open(bamFile);
    string header  = reader.GetHeaderText();
    RefVector refs = reader.GetReferenceData();

    BamAlignment bam;
    while (reader.GetNextAlignment(bam)) {
        // unmapped reads contribute no coverage
        if (!bam.IsMapped())
            continue;

        if (_obeySplits) {
            // Split the alignment into discrete blocks and look for
            // overlaps only within each block.  Since we are counting
            // coverage, we do want to split blocks when a deletion (D)
            // CIGAR op is encountered (hence the true for the last parm).
            bedVector bedBlocks;
            GetBamBlocks(bam, refs.at(bam.RefID).RefName, bedBlocks, false, true);
            // countSplitHits avoids counting each chunk as a distinct read
            _bedB->countSplitHits(bedBlocks, _sameStrand, _diffStrand, _countsOnly);
        }
        else {
            // treat the whole alignment as a single BED entry
            BED a;
            a.chrom  = refs.at(bam.RefID).RefName;
            a.start  = bam.Position;
            a.end    = bam.GetEndPosition(false, false);
            a.strand = "+";
            if (bam.IsReverseStrand())
                a.strand = "-";

            _bedB->countHits(a, _sameStrand, _diffStrand, _countsOnly);
        }
    }

    // report the coverage (summary or histogram) for BED B
    if (_countsOnly == true) {
        ReportCounts();
    } else {
        ReportCoverage();
    }

    // close the BAM file
    reader.Close();
}
// use current input alignment to update BAM file alignment stats void StatsTool::StatsToolPrivate::ProcessAlignment(const BamAlignment& al) { // increment total alignment counter ++numReads; // check the paired-independent flags if ( al.IsDuplicate() ) ++numDuplicates; if ( al.IsFailedQC() ) ++numFailedQC; if ( al.IsMapped() ) ++numMapped; // check forward/reverse strand if ( al.IsReverseStrand() ) ++numReverseStrand; else ++numForwardStrand; // if alignment is paired-end if ( al.IsPaired() ) { // increment PE counter ++numPaired; // increment first mate/second mate counters if ( al.IsFirstMate() ) ++numFirstMate; if ( al.IsSecondMate() ) ++numSecondMate; // if alignment is mapped, check mate status if ( al.IsMapped() ) { // if mate mapped if ( al.IsMateMapped() ) ++numBothMatesMapped; // else singleton else ++numSingletons; } // check for explicit proper pair flag if ( al.IsProperPair() ) ++numProperPair; // store insert size for first mate if ( settings->IsShowingInsertSizeSummary && al.IsFirstMate() && (al.InsertSize != 0) ) { int insertSize = abs(al.InsertSize); insertSizes.push_back( insertSize ); } } }
//increases the counters mismatches and typesOfMismatches of a given BamAlignment object inline void increaseCounters(BamAlignment & al,string & reconstructedReference,int firstCycleRead,int increment){ char refeBase; char readBase; int cycleToUse=firstCycleRead; // cout<<"name "<<al.Name<<endl; // cout<<"firstCycleRead "<<firstCycleRead<<endl; // cout<<"increment "<<increment<<endl; for(int i=0;i<numberOfCycles;i++,cycleToUse+=increment){ // cout<<"i = "<<i<<" cyc "<<cycleToUse<<endl; refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //match if(refeBase == 'M'){ matches[cycleToUse]++; continue; } if(refeBase == 'S' ||refeBase == 'I'){ //don't care about soft clipped or indels continue; } //mismatch if( isResolvedDNA(refeBase) && isResolvedDNA(readBase) ){ if(al.IsReverseStrand()){ //need to take the complement refeBase=complement(refeBase); readBase=complement(readBase); } if(readBase == refeBase){ cerr<<"Internal error in reconstruction of read "<<al.Name<<", contact developer"<<endl; exit(1);; } mismatches[cycleToUse]++; typesOfMismatches[dimer2index(refeBase,readBase)][cycleToUse]++; continue; } } }
void getBamBlocks(const BamAlignment &bam, const RefVector &refs, BedVec &blocks, bool breakOnDeletionOps) { CHRPOS currPosition = bam.Position; CHRPOS blockStart = bam.Position; string chrom = refs.at(bam.RefID).RefName; string name = bam.Name; string strand = "+"; float score = bam.MapQuality; if (bam.IsReverseStrand()) strand = "-"; vector<CigarOp>::const_iterator cigItr = bam.CigarData.begin(); vector<CigarOp>::const_iterator cigEnd = bam.CigarData.end(); for ( ; cigItr != cigEnd; ++cigItr ) { if (cigItr->Type == 'M') { currPosition += cigItr->Length; blocks.push_back( Bed(chrom, blockStart, currPosition, name, score, strand) ); blockStart = currPosition; } else if (cigItr->Type == 'D') { if (breakOnDeletionOps == false) currPosition += cigItr->Length; else { currPosition += cigItr->Length; blockStart = currPosition; } } else if (cigItr->Type == 'N') { currPosition += cigItr->Length; blockStart = currPosition; } else if (cigItr->Type == 'S' || cigItr->Type == 'H' || cigItr->Type == 'P' || cigItr->Type == 'I') { // do nothing } else { cerr << "Input error: invalid CIGAR type (" << cigItr->Type << ") for: " << bam.Name << endl; exit(1); } } }
/*
 * Returns a copy of `a` whose QueryBases / Qualities are cut down to `len`
 * characters and whose Length is set to `len`.  The copy is not a valid
 * BamAlignment; only its FASTQ data is meaningful.
 *
 * `start` is given in read orientation: start=0 always refers to the
 * 5'-most base, so for reverse-strand alignments the offset is mirrored
 * (Length - start - len).
 *
 * On a substr failure diagnostics are printed and the process exits with
 * status 1.
 */
BamAlignment snip(BamAlignment &a, int start, int len) {
    BamAlignment copy(a);

    /* Mirror the offset for reverse strand mappings */
    int converted_start;
    if (copy.IsReverseStrand()) {
        converted_start = copy.Length - start - len;
    } else {
        converted_start = start;
    }

    copy.Length = len;

    try {
        copy.QueryBases = copy.QueryBases.substr(converted_start, len);
        copy.Qualities  = copy.Qualities.substr(converted_start, len);
    } catch (exception &e) {
        cout << "ERROR: substr failed in snip("
             << a.Name << ", "
             << start << ", "
             << len << ")" << endl;
        cout << (a.IsReverseStrand() ? "(-)" : "(+)")
             << ", converted_start: " << converted_start << endl;
        cout << a.QueryBases << endl;
        cout << a.Qualities << endl;
        exit(1);
    }

    return copy;
}
// print BamAlignment in FASTA format // N.B. - uses QueryBases NOT AlignedBases void ConvertTool::ConvertToolPrivate::PrintFasta(const BamAlignment& a) { // >BamAlignment.Name // BamAlignment.QueryBases (up to FASTA_LINE_MAX bases per line) // ... // // N.B. - QueryBases are reverse-complemented if aligned to reverse strand // print header m_out << ">" << a.Name << endl; // handle reverse strand alignment - bases string sequence = a.QueryBases; if ( a.IsReverseStrand() ) Utilities::ReverseComplement(sequence); // if sequence fits on single line if ( sequence.length() <= FASTA_LINE_MAX ) m_out << sequence << endl; // else split over multiple lines else { size_t position = 0; size_t seqLength = sequence.length(); // handle reverse strand alignment - bases & qualitiesth(); // write subsequences to each line while ( position < (seqLength - FASTA_LINE_MAX) ) { m_out << sequence.substr(position, FASTA_LINE_MAX) << endl; position += FASTA_LINE_MAX; } // write final subsequence m_out << sequence.substr(position) << endl; } }
void GenericIndividualSnpCall::PyroHMMsnp(Fasta &fastaObj, BamReader &bamObj, int chrID, int leftPosition, int rightPosition, GenericProbabilisticAlignment &probAligner, list<Allele>& allelesInBlock, VariantCallSetting& snpCallSettings, vector<GenericVariant> &variantResults) { VariantCallSetting settingForPyroHMMsnp = snpCallSettings; // allele pool vector<Allele> allelePool; for (list<Allele>::iterator allelesInBlockIter=allelesInBlock.begin(); allelesInBlockIter!=allelesInBlock.end(); allelesInBlockIter++) { allelePool.push_back(*allelesInBlockIter); } // add 10bp flanking segment at each side int windowLeftPosition = leftPosition - snpCallSettings.m_flankingSize; int windowRightPosition = rightPosition + snpCallSettings.m_flankingSize; // genome string genome; fastaObj.GetSequence(chrID, windowLeftPosition, windowRightPosition, genome); int globalDepth; double globalMapQual; int globalStrandPos; int globalStrandNeg; vector<PyroHMMsnp_Sequence_t> readsInWindow; // rewind BAM reader bamObj.Rewind(); // set BAM region bamObj.SetRegion(chrID, windowLeftPosition, chrID, windowRightPosition); // read alignment BamAlignment al; while (bamObj.GetNextAlignment(al)) { // skip if it is not a good alignment if (!GenericBamAlignmentTools::goodAlignment(al)) { continue; } // skip if it is not valid at length if (!GenericBamAlignmentTools::validReadLength(al, m_minReadLength)) { continue; } // skip if it is not valid at map quality if (!GenericBamAlignmentTools::validMapQuality(al, m_minMapQuality)) { continue; } // skip if it is not valid at alignment identity if (!GenericBamAlignmentTools::validReadIdentity(al, m_maxMismatchFrac)) { continue; } // global info globalDepth += 1; globalMapQual += al.MapQuality*al.MapQuality; if (al.IsReverseStrand()) globalStrandNeg += 1; else globalStrandPos += 1; // get local alignment string t_localRead, t_localGenome; Cigar t_cigar; BamMD t_md; int t_numMismatch, t_numInDel; GenericBamAlignmentTools::getLocalAlignment(al, 
windowLeftPosition, windowRightPosition-windowLeftPosition, t_localRead, t_localGenome, t_cigar, t_md, t_numMismatch, t_numInDel); if (t_localRead.empty() || t_localGenome.empty()) continue; // save into set PyroHMMsnp_Sequence_t t_seq; t_seq.t_ID = GenericBamAlignmentTools::getBamAlignmentID(al); t_seq.t_sequence = t_localRead; t_seq.t_cigar = t_cigar; t_seq.t_md = t_md; t_seq.t_numMismatch = t_numMismatch; t_seq.t_numInDel = t_numInDel; t_seq.t_mapQualScore = al.MapQuality; if (al.Position>windowLeftPosition) t_seq.t_startPositionShift = al.Position-windowLeftPosition; else t_seq.t_startPositionShift = 0; if (al.GetEndPosition()<windowRightPosition) t_seq.t_endPositionShift = windowRightPosition-al.GetEndPosition(); else t_seq.t_endPositionShift = 0; readsInWindow.push_back(t_seq); } int numData = readsInWindow.size(); // construct the consensus sequence graph GenericDagGraph consensusGraph; vector<string> consensusGraphReads; vector<Cigar> consensusGraphReadCigars; vector<int> consensusGraphReadStarts; // set of aligned reads to construct the graph for (int i=0; i<numData; ++i) { consensusGraphReads.push_back(readsInWindow[i].t_sequence); consensusGraphReadCigars.push_back(readsInWindow[i].t_cigar); consensusGraphReadStarts.push_back(readsInWindow[i].t_startPositionShift); } // build up the graph consensusGraph.buildDagGraph(genome, consensusGraphReads, consensusGraphReadCigars, consensusGraphReadStarts); consensusGraph.edgePruning(snpCallSettings.m_graphPruneLevel); // search topK paths, excluding reference vector<string> topRankConsensusGraphPaths; vector<list<Vertex>> topRankConsensusGraphPathVertexs; vector<double> topRankConsensusGraphPathWeights; consensusGraph.topRankPathsExcludeGenome(30, topRankConsensusGraphPaths, topRankConsensusGraphPathVertexs, topRankConsensusGraphPathWeights); // change vertex list to vertex set vector<set<Vertex>> topRankConsensusGraphPathVertexSet; for (int i=0; i<topRankConsensusGraphPathVertexs.size(); i++) { 
list<Vertex>::iterator vertexIter = topRankConsensusGraphPathVertexs[i].begin(); set<Vertex> vertexSet; for (; vertexIter!=topRankConsensusGraphPathVertexs[i].end(); vertexIter++) { vertexSet.insert(*vertexIter); } topRankConsensusGraphPathVertexSet.push_back(vertexSet); } // get variant vertices vector<int> allelePositions; vector<string> alleleChars; for (list<Allele>::iterator alleleIter=allelesInBlock.begin(); alleleIter!=allelesInBlock.end(); alleleIter++) { Allele allele = *alleleIter; allelePositions.push_back(allele.m_chrPosition-windowLeftPosition); alleleChars.push_back(allele.m_allele); } // map allele to graph vertex set<Vertex> variantVertexs; map<int,Vertex> mapAlleleToVertex; map<Vertex,int> mapVertexToAllele; for (int v=0; v<consensusGraph.m_numVertexs; v++) { if (consensusGraph.m_skip[v]) continue; if (!consensusGraph.m_isMismatch[v]) continue; int gp = consensusGraph.m_genomePosition[v] - 1; for (int j=0; j<allelePool.size(); j++) { int ap = allelePositions[j]; if (ap==gp) { if (alleleChars[j]==consensusGraph.m_labels[v]) { variantVertexs.insert(v); mapAlleleToVertex[j] = v; mapVertexToAllele[v] = j; } } } } // set up the haplotypes vector<string> haplotypes; vector<int> haplotypeToPathIndex; vector<set<Vertex>> haplotypeVariantVertexs; haplotypes.push_back(genome); haplotypeToPathIndex.push_back(-1); haplotypeVariantVertexs.push_back(set<Vertex>()); int kk = 0; for (int i=0; i<topRankConsensusGraphPaths.size(); i++) { if (kk>=snpCallSettings.m_topK) continue; bool hasVariantVertex = false; int deltaLength = (topRankConsensusGraphPaths[i].length()-genome.length()); deltaLength = abs(deltaLength); if (deltaLength>5) continue; set<Vertex> pathVertexs = topRankConsensusGraphPathVertexSet[i]; set<Vertex> pathVariantVertexs; for (set<Vertex>::iterator variantIter=variantVertexs.begin(); variantIter!=variantVertexs.end(); variantIter++) { if (pathVertexs.find(*variantIter)!=pathVertexs.end()) { hasVariantVertex = true; 
pathVariantVertexs.insert(*variantIter); } } int totalNumberVariantVertexInPath = 0; for (set<Vertex>::iterator vertexIter=pathVertexs.begin(); vertexIter!=pathVertexs.end(); vertexIter++) { int v = *vertexIter; if (consensusGraph.m_isMismatch[v]) { totalNumberVariantVertexInPath += 1; } } if (hasVariantVertex && totalNumberVariantVertexInPath<=pathVariantVertexs.size()) { haplotypes.push_back(topRankConsensusGraphPaths[i]); haplotypeToPathIndex.push_back(i); haplotypeVariantVertexs.push_back(pathVariantVertexs); kk++; } } int numHaplotypes = haplotypes.size(); // skip if there is no variant haplotype if (numHaplotypes==1) { return; } // compute haplotype data likelihood vector<vector<long double>> haplotypeDataLikelihoods(numHaplotypes); PyroHMMsnpHaplotypeDataLikelihood(probAligner, snpCallSettings.m_band, numHaplotypes, haplotypes, readsInWindow, haplotypeDataLikelihoods); // genotype vector<vector<int>> genotypes; set<set<int>> genotypeDiscovered; for (int i=0; i<numHaplotypes; i++) { vector<int> precedeHaplotypes; PyroHMMsnpGenotypeSet(snpCallSettings.m_ploidy, i, numHaplotypes, precedeHaplotypes, genotypes, genotypeDiscovered); } int numGenotypes = genotypes.size(); // genotype variant vertex vector<set<Vertex>> genotypeVariantVertexs; for (int i=0; i<numGenotypes; i++) { set<Vertex> variantVertexInGenotype; for (int j=0; j<settingForPyroHMMsnp.m_ploidy; j++) { int haplotype = genotypes[i][j]; set<Vertex> variantVertexInHaplotype = haplotypeVariantVertexs[haplotype]; variantVertexInGenotype.insert(variantVertexInHaplotype.begin(), variantVertexInHaplotype.end()); } genotypeVariantVertexs.push_back(variantVertexInGenotype); } // genotype priors vector<long double> genotypePriors(numGenotypes); PyroHMMsnpGenotypePrior(numGenotypes, genotypes, settingForPyroHMMsnp, genotypePriors); // genotype likelihoods vector<long double> genotypeLikelihoods(numGenotypes); PyroHMMsnpGenotypeLikelihood(numGenotypes, genotypes, readsInWindow.size(), haplotypeDataLikelihoods, 
snpCallSettings, genotypeLikelihoods); // genotype posteriors vector<long double> genotypePosteriors(numGenotypes); PyroHMMsnpGenotypePosterior(numGenotypes, genotypePriors, genotypeLikelihoods, genotypePosteriors); // search maximal genotype posterior long double maxGenotypePosterior = 0; int inferGenotype; for (int i=1; i<numGenotypes; i++) { if (maxGenotypePosterior<genotypePosteriors[i]) { maxGenotypePosterior = genotypePosteriors[i]; inferGenotype = i; } } // all variant vertexs in the inferred genotype set<Vertex> inferGenotypeVariantVertexs = genotypeVariantVertexs[inferGenotype]; // count haploid type of variant map<Vertex,vector<int>> inferGenotypeVariantHaploidType; set<Vertex>::iterator inferVariantIter = inferGenotypeVariantVertexs.begin(); for (; inferVariantIter!=inferGenotypeVariantVertexs.end(); inferVariantIter++) { int v = *inferVariantIter; vector<int> variantHaploidType; for (int j=0; j<settingForPyroHMMsnp.m_ploidy; j++) { int haplotype = genotypes[inferGenotype][j]; set<Vertex> variantVertexInHaplotype = haplotypeVariantVertexs[haplotype]; if (variantVertexInHaplotype.find(v)==variantVertexInHaplotype.end()) { variantHaploidType.push_back(0); }else { variantHaploidType.push_back(1); } } inferGenotypeVariantHaploidType[v] = variantHaploidType; } // variant score map<Vertex,long double> inferGenotypeVariantScore; inferVariantIter = inferGenotypeVariantVertexs.begin(); for (; inferVariantIter!=inferGenotypeVariantVertexs.end(); inferVariantIter++) { int v = *inferVariantIter; long double variantScore = 0; for (int i=0; i<numGenotypes; i++) { set<Vertex> variantVertexInGenotype = genotypeVariantVertexs[i]; if (variantVertexInGenotype.find(v)!=variantVertexInGenotype.end()) variantScore += genotypePosteriors[i]; } inferGenotypeVariantScore[v] = variantScore; } // save variant result inferVariantIter = inferGenotypeVariantVertexs.begin(); for (; inferVariantIter!=inferGenotypeVariantVertexs.end(); inferVariantIter++) { GenericVariant result; int v = 
*inferVariantIter; int a = mapVertexToAllele[v]; int variantChrID; int variantChrPos; vector<int> haploidType = inferGenotypeVariantHaploidType[v]; for (int j=0; j<settingForPyroHMMsnp.m_ploidy; j++) { if (haploidType[j]==0) { int g = consensusGraph.m_genomePosition[v]; Allele allele; allele.m_allele = consensusGraph.m_labels[g]; result.m_alleles.push_back(allele); }else { Allele allele = allelePool[a]; result.m_alleles.push_back(allele); variantChrID = allele.m_chrID; variantChrPos = allele.m_chrPosition; } } result.m_chrID = variantChrID; result.m_chrPosition = variantChrPos; result.m_probScoreRef = genotypePosteriors[0]; result.m_probScoreVar = genotypePosteriors[inferGenotype]; result.m_variantType = VARIANT_SNP; long double variantScore = inferGenotypeVariantScore[v]; if (fabs(1-variantScore)<1e-300) result.m_quality = 3000; else if (variantScore<1e-300) result.m_quality = 0; else result.m_quality = -10*log10(1-variantScore); char refBase; fastaObj.GetBase(result.m_chrID, result.m_chrPosition, refBase); result.m_reference = refBase; for (int i=0; i<result.m_alleles.size(); i++) { if (result.m_alleles[i].m_allele==result.m_reference) result.m_haploidType.push_back(0); else result.m_haploidType.push_back(1); } // filter if (result.m_quality>=snpCallSettings.m_variantQualityFilter) variantResults.push_back(result); } }
int main_asequantmultirg(const vector<string> &all_args) { Init(all_args); cerr << "* Reading bam file " << endl; OpenBam(bam_reader, bam_file); bam_reader.OpenIndex(bam_file + ".bai"); vector<string> readGroupVector; SamHeader header = bam_reader.GetHeader(); SamReadGroupDictionary headerRG = header.ReadGroups; for (SamReadGroupIterator it = headerRG.Begin(); it != headerRG.End(); it ++) { readGroupVector.push_back(it -> ID); } vector<RefData> chroms = bam_reader.GetReferenceData(); cout << "#CHROM" << "\t" << "POS" << "\t" << "REF" << "\t" << "ALT"; for (vector<string>::iterator it = readGroupVector.begin(); it != readGroupVector.end(); it ++) { cout << "\t" << *it; } cout << endl; StlFor(chrom_idx, chroms) { string &chrom = chroms[chrom_idx].RefName; vector<Snp> snps = snps_by_chrom[chrom]; int s = 0; // Index into snp array BamAlignment bam; bam_reader.Jump(chrom_idx); string align; string qualities; cerr << "* On chrom " << chrom << endl; while (bam_reader.GetNextAlignment(bam) && bam.RefID == chrom_idx) { if (bam.MapQuality < min_map_qual || !bam.IsMapped()) continue; string currentRG; Assert(bam.GetReadGroup(currentRG)); int start = AlignStart(bam); int end = AlignEnd(bam); // Move the current SNP pointer so that it is ahead of the read's start (since bam alignments are in sorted order) while (s < snps.size() && snps[s].pos < start) ++s; // Stop everything if we have visited all SNPs on this chrom if (s >= snps.size()) break; // Find any/all SNPs that are within the bam alignment int n = 0; // Number of SNPs overlapped while ((s + n) < snps.size() && snps[s + n].pos < end) // Then it overlaps! 
++n; // Now, look at each SNP and see which way it votes AlignedString(bam, align); AlignedQualities(bam, qualities); Assert(align.size() == qualities.size()); // Now, tally votes for (int i = 0; i < n; ++i) { Snp &snp = snps[s + i]; char base = align[snp.pos - start]; // Base from the read int qual = int(qualities[snp.pos - start]) - ascii_offset; // Base from the read //AssertMsg(qual >= 0 && qual <= 100, ToStr(qual) + "\n" + bam.Name + "\n" + CigarToStr(bam.CigarData) + "\n" + bam.QueryBases + "\n" + bam.Qualities); if (base == '-' || qual < min_base_qual) continue; map<string, Counts> &RG_counts = bam.IsReverseStrand() ? snp.rev : snp.fwd; map<string, Counts>::iterator searchIt = RG_counts.find(currentRG); if (searchIt == RG_counts.end()) { if (base == snp.ref) { RG_counts[currentRG].num_ref = 1; RG_counts[currentRG].num_alt = 0; RG_counts[currentRG].num_other = 0; } else if (base == snp.alt) { RG_counts[currentRG].num_ref = 0; RG_counts[currentRG].num_alt = 1; RG_counts[currentRG].num_other = 0; } else { RG_counts[currentRG].num_ref = 0; RG_counts[currentRG].num_alt = 0; RG_counts[currentRG].num_other = 1; } } else { if (base == snp.ref) { searchIt -> second.num_ref += 1; } else if (base == snp.alt) { searchIt -> second.num_alt += 1; } else { searchIt -> second.num_other += 1; } } } } // Output counts for (int s = 0; s < snps.size(); ++s) { cout << chrom << "\t" << snps[s].pos + 1 << "\t" << snps[s].ref << "\t" << snps[s].alt; for (vector<string>::iterator it = readGroupVector.begin(); it != readGroupVector.end(); it ++) { map<string, Counts>::iterator searchIt = snps[s].fwd.find(*it); if (searchIt != snps[s].fwd.end()) { cout << "\t" << searchIt -> second.num_ref << "," << searchIt -> second.num_alt << "," << searchIt -> second.num_other << ","; } else { cout << "\t" << "0,0,0,"; } searchIt = snps[s].rev.find(*it); if (searchIt != snps[s].rev.end()) { cout << searchIt -> second.num_ref << "," << searchIt -> second.num_alt << "," << searchIt -> 
second.num_other; } else { cout << "0,0,0"; } } cout << endl; } }
int IonstatsTestFragments(int argc, const char *argv[]) { OptArgs opts; opts.ParseCmdLine(argc, argv); string input_bam_filename = opts.GetFirstString('i', "input", ""); string fasta_filename = opts.GetFirstString('r', "ref", ""); string output_json_filename = opts.GetFirstString('o', "output", "ionstats_tf.json"); int histogram_length = opts.GetFirstInt ('h', "histogram-length", 400); if(argc < 2 or input_bam_filename.empty() or fasta_filename.empty()) { IonstatsTestFragmentsHelp(); return 1; } // // Prepare for metric calculation // map<string,string> tf_sequences; PopulateReferenceSequences(tf_sequences, fasta_filename); BamReader input_bam; if (!input_bam.Open(input_bam_filename)) { fprintf(stderr, "[ionstats] ERROR: cannot open %s\n", input_bam_filename.c_str()); return 1; } int num_tfs = input_bam.GetReferenceCount(); SamHeader sam_header = input_bam.GetHeader(); if(!sam_header.HasReadGroups()) { fprintf(stderr, "[ionstats] ERROR: no read groups in %s\n", input_bam_filename.c_str()); return 1; } string flow_order; string key; for (SamReadGroupIterator rg = sam_header.ReadGroups.Begin(); rg != sam_header.ReadGroups.End(); ++rg) { if(rg->HasFlowOrder()) flow_order = rg->FlowOrder; if(rg->HasKeySequence()) key = rg->KeySequence; } // Need these metrics stratified by TF. 
vector<ReadLengthHistogram> called_histogram(num_tfs); vector<ReadLengthHistogram> aligned_histogram(num_tfs); vector<ReadLengthHistogram> AQ10_histogram(num_tfs); vector<ReadLengthHistogram> AQ17_histogram(num_tfs); vector<SimpleHistogram> error_by_position(num_tfs); vector<MetricGeneratorSNR> system_snr(num_tfs); vector<MetricGeneratorHPAccuracy> hp_accuracy(num_tfs); for (int tf = 0; tf < num_tfs; ++tf) { called_histogram[tf].Initialize(histogram_length); aligned_histogram[tf].Initialize(histogram_length); AQ10_histogram[tf].Initialize(histogram_length); AQ17_histogram[tf].Initialize(histogram_length); error_by_position[tf].Initialize(histogram_length); } vector<uint16_t> flow_signal_fz(flow_order.length()); vector<int16_t> flow_signal_zm(flow_order.length()); const RefVector& refs = input_bam.GetReferenceData(); // Missing: // - hp accuracy - tough, copy verbatim from TFMapper? BamAlignment alignment; vector<char> MD_op; vector<int> MD_len; MD_op.reserve(1024); MD_len.reserve(1024); string MD_tag; // // Main loop over mapped reads in the input BAM // while(input_bam.GetNextAlignment(alignment)) { if (!alignment.IsMapped() or !alignment.GetTag("MD",MD_tag)) continue; // The check below eliminates unexpected alignments if (alignment.IsReverseStrand() or alignment.Position > 5) continue; int current_tf = alignment.RefID; // // Step 1. Parse MD tag // MD_op.clear(); MD_len.clear(); for (const char *MD_ptr = MD_tag.c_str(); *MD_ptr;) { int item_length = 0; if (*MD_ptr >= '0' and *MD_ptr <= '9') { // Its a match MD_op.push_back('M'); for (; *MD_ptr and *MD_ptr >= '0' and *MD_ptr <= '9'; ++MD_ptr) item_length = 10*item_length + *MD_ptr - '0'; } else { if (*MD_ptr == '^') { // Its a deletion MD_ptr++; MD_op.push_back('D'); } else // Its a substitution MD_op.push_back('X'); for (; *MD_ptr and *MD_ptr >= 'A' and *MD_ptr <= 'Z'; ++MD_ptr) item_length++; } MD_len.push_back(item_length); } // // Step 2. 
Synchronously scan through Cigar and MD, doing error accounting // int MD_idx = alignment.IsReverseStrand() ? MD_op.size()-1 : 0; int cigar_idx = alignment.IsReverseStrand() ? alignment.CigarData.size()-1 : 0; int increment = alignment.IsReverseStrand() ? -1 : 1; int AQ10_bases = 0; int AQ17_bases = 0; int num_bases = 0; int num_errors = 0; while (cigar_idx < (int)alignment.CigarData.size() and MD_idx < (int) MD_op.size() and cigar_idx >= 0 and MD_idx >= 0) { if (alignment.CigarData[cigar_idx].Length == 0) { // Try advancing cigar cigar_idx += increment; continue; } if (MD_len[MD_idx] == 0) { // Try advancing MD MD_idx += increment; continue; } // Match if (alignment.CigarData[cigar_idx].Type == 'M' and MD_op[MD_idx] == 'M') { int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]); num_bases += advance; alignment.CigarData[cigar_idx].Length -= advance; MD_len[MD_idx] -= advance; // Insertion (read has a base, reference doesn't) } else if (alignment.CigarData[cigar_idx].Type == 'I') { int advance = alignment.CigarData[cigar_idx].Length; for (int cnt = 0; cnt < advance; ++cnt) { error_by_position[current_tf].Add(num_bases); num_bases++; num_errors++; } alignment.CigarData[cigar_idx].Length -= advance; // Deletion (reference has a base, read doesn't) } else if (alignment.CigarData[cigar_idx].Type == 'D' and MD_op[MD_idx] == 'D') { int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]); for (int cnt = 0; cnt < advance; ++cnt) { error_by_position[current_tf].Add(num_bases); num_errors++; } alignment.CigarData[cigar_idx].Length -= advance; MD_len[MD_idx] -= advance; // Substitution } else if (MD_op[MD_idx] == 'X') { int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]); for (int cnt = 0; cnt < advance; ++cnt) { error_by_position[current_tf].Add(num_bases); num_bases++; num_errors++; } alignment.CigarData[cigar_idx].Length -= advance; MD_len[MD_idx] -= advance; } else { printf("ionstats tf: Unexpected 
OP combination: %s Cigar=%c, MD=%c !\n", alignment.Name.c_str(), alignment.CigarData[cigar_idx].Type, MD_op[MD_idx]); break; } if (num_errors*10 <= num_bases) AQ10_bases = num_bases; if (num_errors*50 <= num_bases) AQ17_bases = num_bases; } // // Step 3. Profit // called_histogram[current_tf].Add(alignment.Length); aligned_histogram[current_tf].Add(num_bases); AQ10_histogram[current_tf].Add(AQ10_bases); AQ17_histogram[current_tf].Add(AQ17_bases); if(alignment.GetTag("ZM", flow_signal_zm)) system_snr[current_tf].Add(flow_signal_zm, key.c_str(), flow_order); else if(alignment.GetTag("FZ", flow_signal_fz)) system_snr[current_tf].Add(flow_signal_fz, key.c_str(), flow_order); // HP accuracy - keeping it simple if (!alignment.IsReverseStrand()) { string genome = key + tf_sequences[refs[current_tf].RefName]; string calls = key + alignment.QueryBases; const char *genome_ptr = genome.c_str(); const char *calls_ptr = calls.c_str(); for (int flow = 0; flow < (int)flow_order.length() and *genome_ptr and *calls_ptr; ++flow) { int genome_hp = 0; int calls_hp = 0; while (*genome_ptr == flow_order[flow]) { genome_hp++; genome_ptr++; } while (*calls_ptr == flow_order[flow]) { calls_hp++; calls_ptr++; } hp_accuracy[current_tf].Add(genome_hp, calls_hp); } } } // // Processing complete, generate ionstats_tf.json // Json::Value output_json(Json::objectValue); output_json["meta"]["creation_date"] = get_time_iso_string(time(NULL)); output_json["meta"]["format_name"] = "ionstats_tf"; output_json["meta"]["format_version"] = "1.0"; output_json["results_by_tf"] = Json::objectValue; for (int tf = 0; tf < num_tfs; ++tf) { if (aligned_histogram[tf].num_reads() < 1000) continue; called_histogram[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["full"]); aligned_histogram[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["aligned"]); AQ10_histogram[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["AQ10"]); 
AQ17_histogram[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["AQ17"]); error_by_position[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["error_by_position"]); system_snr[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]); hp_accuracy[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]); output_json["results_by_tf"][refs[tf].RefName]["sequence"] = tf_sequences[refs[tf].RefName]; } input_bam.Close(); ofstream out(output_json_filename.c_str(), ios::out); if (out.good()) { out << output_json.toStyledString(); return 0; } else { fprintf(stderr, "ERROR: unable to write to '%s'\n", output_json_filename.c_str()); return 1; } }
//{{{ SV_SplitRead:: SV_SplitRead(vector< BamAlignment > &block, SV_SplitRead:: SV_SplitRead(const BamAlignment &bam_a, const BamAlignment &bam_b, const RefVector &refs, int _weight, int _id, int _sample_id, SV_SplitReadReader *_reader) { reader = _reader; sample_id = _sample_id; if ( bam_a.MapQuality < bam_b.MapQuality ) min_mapping_quality = bam_a.MapQuality; else min_mapping_quality = bam_b.MapQuality; struct cigar_query query_a = calc_query_pos_from_cigar(bam_a.CigarData, bam_a.IsReverseStrand() ); struct cigar_query query_b = calc_query_pos_from_cigar(bam_b.CigarData, bam_b.IsReverseStrand() ); struct interval tmp_a, tmp_b; tmp_a.strand = '+'; if (bam_a.IsReverseStrand()) tmp_a.strand = '-'; tmp_a.chr = refs.at(bam_a.RefID).RefName; tmp_a.start = bam_a.Position; tmp_a.end = bam_a.GetEndPosition(); tmp_b.strand = '+'; if (bam_b.IsReverseStrand()) tmp_b.strand = '-'; tmp_b.chr = refs.at(bam_b.RefID).RefName; tmp_b.start = bam_b.Position; tmp_b.end = bam_b.GetEndPosition(); //if ( ( tmp_a.chr.compare(tmp_b.chr) > 0 ) || //( ( tmp_a.chr.compare(tmp_b.chr) == 0 ) && //( tmp_a.start > tmp_b.start ) ) ) { if ( (bam_a.RefID > bam_b.RefID) || ( (bam_a.RefID == bam_b.RefID) && (tmp_a.start > tmp_b.start ) ) ) { side_r = tmp_a; side_l = tmp_b; query_r = query_a; query_l = query_b; } else { side_l = tmp_a; side_r = tmp_b; query_l = query_a; query_r = query_b; } if (side_l.strand != side_r.strand) type = SV_BreakPoint::INVERSION; else if ( ( ( side_l.strand == '+' ) && ( side_r.strand == '+' ) && ( query_l.qs_pos < query_r.qs_pos ) ) || ( ( side_l.strand == '-' ) && ( side_r.strand == '-' ) && ( query_l.qs_pos > query_r.qs_pos) ) ) type = SV_BreakPoint::DELETION; else if ( ( ( side_l.strand == '+' ) && ( side_r.strand == '+' ) && ( query_l.qs_pos > query_r.qs_pos ) ) || ( ( side_l.strand == '-' ) && ( side_r.strand == '-' ) && ( query_l.qs_pos < query_r.qs_pos) ) ) type = SV_BreakPoint::DUPLICATION; else { cerr << "ERROR IN BAM FILE. 
" << "TYPE not detected (DELETION,DUPLICATION,INVERSION)" << endl; cerr << "\t" << query_l.qs_pos << "," << side_l.strand << "\t" << query_r.qs_pos << "," << side_r.strand << "\t" << tmp_a.chr << "," << tmp_a.start << "," << tmp_a.end << "\t" << tmp_b.chr << "," << tmp_b.start << "," << tmp_b.end << "\t" << endl; throw(1); } weight = _weight; id = _id; }
int main (int argc, char *argv[]) { int minBaseQuality = 0; string usage=string(""+string(argv[0])+" [in BAM file] [in VCF file] [chr name] [deam out BAM] [not deam out BAM]"+ "\nThis program divides aligned single end reads into potentially deaminated\n"+ "\nreads and the puts the rest into another bam file if the deaminated positions are not called as the alternative base in the VCF.\n"+ "\nTip: if you do not need one of them, use /dev/null as your output\n"+ "arguments:\n"+ "\t"+"--bq [base qual] : Minimum base quality to flag a deaminated site (Default: "+stringify(minBaseQuality)+")\n"+ "\n"); if(argc == 1 || argc < 4 || (argc == 2 && (string(argv[0]) == "-h" || string(argv[0]) == "--help") ) ){ cerr << "Usage "<<usage<<endl; return 1; } for(int i=1;i<(argc-2);i++){ if(string(argv[i]) == "--bq"){ minBaseQuality=destringify<int>(argv[i+1]); i++; continue; } } string bamfiletopen = string( argv[ argc-5 ] ); string vcffiletopen = string( argv[ argc-4 ] ); string chrname = string( argv[ argc-3 ] ); string deambam = string( argv[ argc-2 ] ); string nondeambam = string( argv[ argc-1 ] ); //dummy reader, will need to reposition anyway VCFreader vcfr (vcffiletopen, vcffiletopen+".tbi", chrname, 1, 1, 0); BamReader reader; if ( !reader.Open(bamfiletopen) ) { cerr << "Could not open input BAM file"<< bamfiletopen << endl; return 1; } // if ( !reader.LocateIndex() ) { // cerr << "The index for the BAM file cannot be located" << endl; // return 1; // } // if ( !reader.HasIndex() ) { // cerr << "The BAM file has not been indexed." 
<< endl; // return 1; // } //positioning the bam file int refid=reader.GetReferenceID(chrname); if(refid < 0){ cerr << "Cannot retrieve the reference ID for "<< chrname << endl; return 1; } //cout<<"redif "<<refid<<endl; //setting the BAM reader at that position reader.SetRegion(refid, 0, refid, -1); vector<RefData> testRefData=reader.GetReferenceData(); const SamHeader header = reader.GetHeader(); const RefVector references = reader.GetReferenceData(); BamWriter writerDeam; if ( !writerDeam.Open(deambam, header, references) ) { cerr << "Could not open output BAM file" << endl; return 1; } BamWriter writerNoDeam; if ( !writerNoDeam.Open(nondeambam, header, references) ) { cerr << "Could not open output BAM file" << endl; return 1; } unsigned int totalReads =0; unsigned int deaminatedReads =0; unsigned int ndeaminatedReads =0; unsigned int skipped =0; //iterating over the alignments for these regions BamAlignment al; int i; while ( reader.GetNextAlignment(al) ) { // cerr<<al.Name<<endl; //skip unmapped if(!al.IsMapped()){ skipped++; continue; } //skip paired end ! 
if(al.IsPaired() ){ continue; // cerr<<"Paired end not yet coded"<<endl; // return 1; } string reconstructedReference = reconstructRef(&al); char refeBase; char readBase; bool isDeaminated; if(al.Qualities.size() != reconstructedReference.size()){ cerr<<"Quality line is not the same size as the reconstructed reference"<<endl; return 1; } isDeaminated=false; if(al.IsReverseStrand()){ //first base next to 3' i = 0 ; refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); if( readBase == 'A' && int(al.Qualities[i]-offset) >= minBaseQuality){ //isDeaminated=true; } if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } transformRef(&refeBase,&readBase); vcfr.repositionIterator(chrname,al.Position+1,al.Position+1); while(vcfr.hasData()){ SimpleVCF * toprint=vcfr.getData(); // cout<<*toprint<<endl; //skip deletions in the alt if(toprint->getRef().length() != 1 ) continue; if(toprint->getRef()[0] != refeBase){ cerr<<reconstructedReference<<endl; cerr<<al.Position<<endl; cerr<<numberOfDeletions(&al)<<endl; cerr<<"Problem1 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; exit(1); } //if the VCF has a at least one G but no A if( toprint->hasAtLeastOneG() && !toprint->hasAtLeastOneA() ){ isDeaminated=true; } } } //second base next to 3' i = 1; refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //refeBase == 'G' && if( readBase == 'A' && int(al.Qualities[i]-offset) >= minBaseQuality){ //isDeaminated=true; } if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } transformRef(&refeBase,&readBase); vcfr.repositionIterator(chrname,al.Position+2,al.Position+2); while(vcfr.hasData()){ SimpleVCF * toprint=vcfr.getData(); // cout<<*toprint<<endl; //skip deletions in the alt if(toprint->getRef().length() != 1 ) continue; if(toprint->getRef()[0] != refeBase){ cerr<<reconstructedReference<<endl; cerr<<al.Position<<endl; cerr<<numberOfDeletions(&al)<<endl; 
cerr<<"Problem2 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; exit(1); } //if the VCF has at least one G but no A // if(toprint->hasAtLeastOneG() && // toprint->getAlt().find("A") == string::npos){ if( toprint->hasAtLeastOneG() && !toprint->hasAtLeastOneA() ){ isDeaminated=true; } } } //last base next to 5' i = (al.QueryBases.length()-1) ; refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //refeBase == 'G' && if( readBase == 'A' && int(al.Qualities[i]-offset) >= minBaseQuality){ //isDeaminated=true; } if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } transformRef(&refeBase,&readBase); int lengthMatches=countMatchesRecons(reconstructedReference,0); int positionJump = al.Position+lengthMatches+numberOfDeletions(&al); vcfr.repositionIterator(chrname,positionJump,positionJump); while(vcfr.hasData()){ SimpleVCF * toprint=vcfr.getData(); //skip deletions in the alt if(toprint->getRef().length() != 1 ) continue; if(toprint->getRef()[0] != refeBase){ cerr<<reconstructedReference<<endl; cerr<<al.Position<<endl; cerr<<lengthMatches<<endl; cerr<<numberOfDeletions(&al)<<endl; cerr<<positionJump<<endl; cerr<<"Problem3 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; exit(1); } //if the VCF has at least one G but no A if( toprint->hasAtLeastOneG() && !toprint->hasAtLeastOneA() ){ isDeaminated=true; } } } }else{ //first base next to 5' i = 0; refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //refeBase == 'C' if( readBase == 'T' && int(al.Qualities[i]-offset) >= minBaseQuality){ if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } transformRef(&refeBase,&readBase); vcfr.repositionIterator(chrname,al.Position+1,al.Position+1); while(vcfr.hasData()){ SimpleVCF * toprint=vcfr.getData(); //cout<<*toprint<<endl; //skip deletions in the alt if(toprint->getRef().length() != 1 ) continue; 
if(toprint->getRef()[0] != refeBase){ cerr<<reconstructedReference<<endl; cerr<<al.Position<<endl; cerr<<numberOfDeletions(&al)<<endl; cerr<<"Problem4 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; exit(1); } //if the VCF has at least one C but no T if( toprint->hasAtLeastOneC() && !toprint->hasAtLeastOneT() ){ isDeaminated=true; } } //cout<<al.Position+ } //second last base next to 3' i = (al.QueryBases.length()-2); refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //refeBase == 'C' && if( readBase == 'T' && int(al.Qualities[i]-offset) >= minBaseQuality){ if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } transformRef(&refeBase,&readBase); int lengthMatches=countMatchesRecons(reconstructedReference,1); int positionJump = al.Position+lengthMatches+numberOfDeletions(&al); vcfr.repositionIterator(chrname,positionJump,positionJump); while(vcfr.hasData()){ SimpleVCF * toprint=vcfr.getData(); //skip deletions in the alt if(toprint->getRef().length() != 1 ) continue; if(toprint->getRef()[0] != refeBase){ cerr<<reconstructedReference<<endl; cerr<<al.Position<<endl; cerr<<lengthMatches<<endl; cerr<<numberOfDeletions(&al)<<endl; cerr<<positionJump<<endl; cerr<<"Problem5 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; exit(1); } if( toprint->hasAtLeastOneC() && !toprint->hasAtLeastOneT() ){ isDeaminated=true; } } } //last base next to 3' i = (al.QueryBases.length()-1); refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //&& refeBase == 'C' if( readBase == 'T' && int(al.Qualities[i]-offset) >= minBaseQuality){ if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } transformRef(&refeBase,&readBase); int lengthMatches=countMatchesRecons(reconstructedReference,0); int positionJump = al.Position+lengthMatches+numberOfDeletions(&al); 
vcfr.repositionIterator(chrname,positionJump,positionJump); while(vcfr.hasData()){ SimpleVCF * toprint=vcfr.getData(); //skip deletions in the alt if(toprint->getRef().length() != 1 ) continue; if(toprint->getRef()[0] != refeBase){ cerr<<reconstructedReference<<endl; cerr<<al.Position<<endl; cerr<<lengthMatches<<endl; cerr<<numberOfDeletions(&al)<<endl; cerr<<positionJump<<endl; cerr<<"Problem6 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; exit(1); } if( toprint->hasAtLeastOneC() && !toprint->hasAtLeastOneT() ){ isDeaminated=true; } } } } totalReads++; if(isDeaminated){ deaminatedReads++; writerDeam.SaveAlignment(al); }else{ ndeaminatedReads++; writerNoDeam.SaveAlignment(al); } }//end for each read reader.Close(); writerDeam.Close(); writerNoDeam.Close(); cerr<<"Program finished sucessfully, out of "<<totalReads<<" mapped reads (skipped: "<<skipped<<" reads) we flagged "<<deaminatedReads<<" as deaminated and "<<ndeaminatedReads<<" as not deaminated"<<endl; return 0; }
int IonstatsAlignment(int argc, const char *argv[]) { OptArgs opts; opts.ParseCmdLine(argc, argv); string input_bam_filename = opts.GetFirstString('i', "input", ""); string output_json_filename = opts.GetFirstString('o', "output", "ionstats_alignment.json"); int histogram_length = opts.GetFirstInt ('h', "histogram-length", 400); if(argc < 2 or input_bam_filename.empty()) { IonstatsAlignmentHelp(); return 1; } // // Prepare for metric calculation // BamReader input_bam; if (!input_bam.Open(input_bam_filename)) { fprintf(stderr, "[ionstats] ERROR: cannot open %s\n", input_bam_filename.c_str()); return 1; } ReadLengthHistogram called_histogram; ReadLengthHistogram aligned_histogram; ReadLengthHistogram AQ7_histogram; ReadLengthHistogram AQ10_histogram; ReadLengthHistogram AQ17_histogram; ReadLengthHistogram AQ20_histogram; ReadLengthHistogram AQ47_histogram; SimpleHistogram error_by_position; called_histogram.Initialize(histogram_length); aligned_histogram.Initialize(histogram_length); AQ7_histogram.Initialize(histogram_length); AQ10_histogram.Initialize(histogram_length); AQ17_histogram.Initialize(histogram_length); AQ20_histogram.Initialize(histogram_length); AQ47_histogram.Initialize(histogram_length); error_by_position.Initialize(histogram_length); BamAlignment alignment; vector<char> MD_op; vector<int> MD_len; MD_op.reserve(1024); MD_len.reserve(1024); string MD_tag; // // Main loop over mapped reads in the input BAM // while(input_bam.GetNextAlignment(alignment)) { // Record read length called_histogram.Add(alignment.Length); if (!alignment.IsMapped() or !alignment.GetTag("MD",MD_tag)) continue; // // Step 1. 
Parse MD tag // MD_op.clear(); MD_len.clear(); for (const char *MD_ptr = MD_tag.c_str(); *MD_ptr;) { int item_length = 0; if (*MD_ptr >= '0' and *MD_ptr <= '9') { // Its a match MD_op.push_back('M'); for (; *MD_ptr and *MD_ptr >= '0' and *MD_ptr <= '9'; ++MD_ptr) item_length = 10*item_length + *MD_ptr - '0'; } else { if (*MD_ptr == '^') { // Its a deletion MD_ptr++; MD_op.push_back('D'); } else // Its a substitution MD_op.push_back('X'); for (; *MD_ptr and *MD_ptr >= 'A' and *MD_ptr <= 'Z'; ++MD_ptr) item_length++; } MD_len.push_back(item_length); } // // Step 2. Synchronously scan through Cigar and MD, doing error accounting // int MD_idx = alignment.IsReverseStrand() ? MD_op.size()-1 : 0; int cigar_idx = alignment.IsReverseStrand() ? alignment.CigarData.size()-1 : 0; int increment = alignment.IsReverseStrand() ? -1 : 1; int AQ7_bases = 0; int AQ10_bases = 0; int AQ17_bases = 0; int AQ20_bases = 0; int AQ47_bases = 0; int num_bases = 0; int num_errors = 0; while (cigar_idx < (int)alignment.CigarData.size() and MD_idx < (int) MD_op.size() and cigar_idx >= 0 and MD_idx >= 0) { if (alignment.CigarData[cigar_idx].Length == 0) { // Try advancing cigar cigar_idx += increment; continue; } if (MD_len[MD_idx] == 0) { // Try advancing MD MD_idx += increment; continue; } // Match if (alignment.CigarData[cigar_idx].Type == 'M' and MD_op[MD_idx] == 'M') { int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]); num_bases += advance; alignment.CigarData[cigar_idx].Length -= advance; MD_len[MD_idx] -= advance; // Insertion (read has a base, reference doesn't) } else if (alignment.CigarData[cigar_idx].Type == 'I') { int advance = alignment.CigarData[cigar_idx].Length; for (int cnt = 0; cnt < advance; ++cnt) { error_by_position.Add(num_bases); num_bases++; num_errors++; } alignment.CigarData[cigar_idx].Length -= advance; // Deletion (reference has a base, read doesn't) } else if (alignment.CigarData[cigar_idx].Type == 'D' and MD_op[MD_idx] == 'D') { int 
advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]); for (int cnt = 0; cnt < advance; ++cnt) { error_by_position.Add(num_bases); num_errors++; } alignment.CigarData[cigar_idx].Length -= advance; MD_len[MD_idx] -= advance; // Substitution } else if (MD_op[MD_idx] == 'X') { int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]); for (int cnt = 0; cnt < advance; ++cnt) { error_by_position.Add(num_bases); num_bases++; num_errors++; } alignment.CigarData[cigar_idx].Length -= advance; MD_len[MD_idx] -= advance; } else { printf("ionstats alignment: Unexpected OP combination: %s Cigar=%c, MD=%c !\n", alignment.Name.c_str(), alignment.CigarData[cigar_idx].Type, MD_op[MD_idx]); break; } if (num_errors*5 <= num_bases) AQ7_bases = num_bases; if (num_errors*10 <= num_bases) AQ10_bases = num_bases; if (num_errors*50 <= num_bases) AQ17_bases = num_bases; if (num_errors*100 <= num_bases) AQ20_bases = num_bases; if (num_errors == 0) AQ47_bases = num_bases; } // // Step 3. 
Profit // if (num_bases >= 20) aligned_histogram.Add(num_bases); if (AQ7_bases >= 20) AQ7_histogram.Add(AQ7_bases); if (AQ10_bases >= 20) AQ10_histogram.Add(AQ10_bases); if (AQ17_bases >= 20) AQ17_histogram.Add(AQ17_bases); if (AQ20_bases >= 20) AQ20_histogram.Add(AQ20_bases); if (AQ47_bases >= 20) AQ47_histogram.Add(AQ47_bases); } input_bam.Close(); // // Processing complete, generate ionstats_alignment.json // Json::Value output_json(Json::objectValue); output_json["meta"]["creation_date"] = get_time_iso_string(time(NULL)); output_json["meta"]["format_name"] = "ionstats_alignment"; output_json["meta"]["format_version"] = "1.0"; called_histogram.SaveToJson(output_json["full"]); aligned_histogram.SaveToJson(output_json["aligned"]); AQ7_histogram.SaveToJson(output_json["AQ7"]); AQ10_histogram.SaveToJson(output_json["AQ10"]); AQ17_histogram.SaveToJson(output_json["AQ17"]); AQ20_histogram.SaveToJson(output_json["AQ20"]); AQ47_histogram.SaveToJson(output_json["AQ47"]); error_by_position.SaveToJson(output_json["error_by_position"]); ofstream out(output_json_filename.c_str(), ios::out); if (out.good()) { out << output_json.toStyledString(); return 0; } else { fprintf(stderr, "ERROR: unable to write to '%s'\n", output_json_filename.c_str()); return 1; } return 0; }
int main (int argc, char *argv[]) { int minBaseQuality = 0; string usage=string(""+string(argv[0])+" [in BAM file] [in VCF file] [chr name] [deam out BAM] [not deam out BAM]"+ "\nThis program divides aligned single end reads into potentially deaminated\n"+ "\nreads and the puts the rest into another bam file if the deaminated positions are not called as the alternative base in the VCF.\n"+ "\nThis is like filterDeaminatedVCF but it loads the VCF before then labels the reads instead of doing it on the fly\n"+ "\nwhich is good if you have many reads in the bam file.\n"+ "\nTip: if you do not need one of them, use /dev/null as your output\n"+ "\narguments:\n"+ "\t"+"--bq [base qual] : Minimum base quality to flag a deaminated site (Default: "+stringify(minBaseQuality)+")\n"+ "\t"+"--1000g [vcf file] : VCF file from 1000g to get the putative A and T positions in modern humans (Default: "+vcf1000g+")\n"+ "\n"); if(argc == 1 || argc < 4 || (argc == 2 && (string(argv[0]) == "-h" || string(argv[0]) == "--help") ) ){ cerr << "Usage "<<usage<<endl; return 1; } for(int i=1;i<(argc-2);i++){ if(string(argv[i]) == "--bq"){ minBaseQuality=destringify<int>(argv[i+1]); i++; continue; } if(string(argv[i]) == "--1000g"){ vcf1000g=string(argv[i+1]); i++; continue; } } unsigned int maxSizeChromosome=250000000;//larger than chr1 hg19 bool * hasCnoT; bool * hasGnoA; bool * thousandGenomesHasA; bool * thousandGenomesHasT; cerr<<"Trying to allocating memory"<<endl; try{ hasCnoT = new bool[ maxSizeChromosome ]; hasGnoA = new bool[ maxSizeChromosome ]; thousandGenomesHasA = new bool[ maxSizeChromosome ]; thousandGenomesHasT = new bool[ maxSizeChromosome ]; }catch(bad_alloc& exc){ cerr<<"ERROR: allocating memory failed"<<endl; return 1; } cerr<<"Success in allocating memory"<<endl; for(unsigned int i = 0;i<maxSizeChromosome;i++){ hasCnoT[i]=false; hasGnoA[i]=false; thousandGenomesHasA[i]=false; thousandGenomesHasT[i]=false; } string bamfiletopen = string( argv[ argc-5 ] ); string vcffiletopen 
= string( argv[ argc-4 ] ); string chrname = string( argv[ argc-3 ] ); string deambam = string( argv[ argc-2 ] ); string nondeambam = string( argv[ argc-1 ] ); cerr<<"Reading consensus VCF "<<vcffiletopen<<" ... "<<endl; VCFreader vcfr (vcffiletopen, // vcffiletopen+".tbi", // chrname, // 1, // maxSizeChromosome, 0); while(vcfr.hasData()){ SimpleVCF * toprint=vcfr.getData(); if(toprint->getRef().length() != 1 ) continue; //if the VCF has a at least one G but no A if( toprint->hasAtLeastOneG() && !toprint->hasAtLeastOneA() ){ hasGnoA[ toprint->getPosition() ] =true; } if( toprint->hasAtLeastOneC() && !toprint->hasAtLeastOneT() ){ hasCnoT[ toprint->getPosition() ] =true; } } cerr<<"done reading VCF"<<endl; cerr<<"Reading 1000g VCF :"<<vcf1000g<<" ..."<<endl; string line1000g; ifstream myFile1000g; myFile1000g.open(vcf1000g.c_str(), ios::in); if (myFile1000g.is_open()){ while ( getline (myFile1000g,line1000g)){ vector<string> fields=allTokens(line1000g,'\t'); //0 chr //1 pos //2 id //3 ref //4 alt //check if same chr if(fields[0] != chrname){ cerr <<"Error, wrong chromosome in 1000g file for line= "<<line1000g<<endl; return 1; } //skip indels if(fields[3].size() != 1 || fields[4].size() != 1 ) continue; char ref=toupper(fields[3][0]); char alt=toupper(fields[4][0]); unsigned int pos=destringify<unsigned int>( fields[1] ); thousandGenomesHasA[ pos ] = ( (ref=='A') || (alt=='A') ); thousandGenomesHasT[ pos ] = ( (ref=='T') || (alt=='T') ); } myFile1000g.close(); }else{ cerr <<"Unable to open file "<<vcf1000g<<endl; return 1; } cerr<<"done reading 1000g VCF"<<endl; BamReader reader; if ( !reader.Open(bamfiletopen) ) { cerr << "Could not open input BAM file"<< bamfiletopen << endl; return 1; } //positioning the bam file int refid=reader.GetReferenceID(chrname); if(refid < 0){ cerr << "Cannot retrieve the reference ID for "<< chrname << endl; return 1; } //cout<<"redif "<<refid<<endl; //setting the BAM reader at that position reader.SetRegion(refid, 0, refid, -1); 
vector<RefData> testRefData=reader.GetReferenceData(); const SamHeader header = reader.GetHeader(); const RefVector references = reader.GetReferenceData(); BamWriter writerDeam; if ( !writerDeam.Open(deambam, header, references) ) { cerr << "Could not open output BAM file" << endl; return 1; } BamWriter writerNoDeam; if ( !writerNoDeam.Open(nondeambam, header, references) ) { cerr << "Could not open output BAM file" << endl; return 1; } unsigned int totalReads =0; unsigned int deaminatedReads =0; unsigned int ndeaminatedReads =0; unsigned int skipped =0; //iterating over the alignments for these regions BamAlignment al; int i; while ( reader.GetNextAlignment(al) ) { // cerr<<al.Name<<endl; //skip unmapped if(!al.IsMapped()){ skipped++; continue; } //skip paired end ! if(al.IsPaired() ){ continue; // cerr<<"Paired end not yet coded"<<endl; // return 1; } string reconstructedReference = reconstructRef(&al); char refeBase; char readBase; bool isDeaminated; if(al.Qualities.size() != reconstructedReference.size()){ cerr<<"Quality line is not the same size as the reconstructed reference"<<endl; return 1; } isDeaminated=false; if(al.IsReverseStrand()){ //first base next to 3' i = 0 ; refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); if( readBase == 'A' && int(al.Qualities[i]-offset) >= minBaseQuality){ //isDeaminated=true; } if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } if( hasGnoA[al.Position+1] && !thousandGenomesHasA[al.Position+1] ) isDeaminated=true; // transformRef(&refeBase,&readBase); // vcfr.repositionIterator(chrname,al.Position+1,al.Position+1); // while(vcfr.hasData()){ // SimpleVCF * toprint=vcfr.getData(); // // cout<<*toprint<<endl; // //skip deletions in the alt // if(toprint->getRef().length() != 1 ) // continue; // if(toprint->getRef()[0] != refeBase){ // cerr<<reconstructedReference<<endl; // cerr<<al.Position<<endl; // cerr<<numberOfDeletions(&al)<<endl; // cerr<<"Problem1 position "<<*toprint<<" does 
not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; // exit(1); // } // //if the VCF has a at least one G but no A // if( toprint->hasAtLeastOneG() && // !toprint->hasAtLeastOneA() ){ // isDeaminated=true; // } // } } //second base next to 3' i = 1; refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //refeBase == 'G' && if( readBase == 'A' && int(al.Qualities[i]-offset) >= minBaseQuality){ //isDeaminated=true; } if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } if( hasGnoA[al.Position+2] && !thousandGenomesHasA[al.Position+2] ) isDeaminated=true; // transformRef(&refeBase,&readBase); // vcfr.repositionIterator(chrname,al.Position+2,al.Position+2); // while(vcfr.hasData()){ // SimpleVCF * toprint=vcfr.getData(); // // cout<<*toprint<<endl; // //skip deletions in the alt // if(toprint->getRef().length() != 1 ) // continue; // if(toprint->getRef()[0] != refeBase){ // cerr<<reconstructedReference<<endl; // cerr<<al.Position<<endl; // cerr<<numberOfDeletions(&al)<<endl; // cerr<<"Problem2 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; // exit(1); // } // //if the VCF has at least one G but no A // // if(toprint->hasAtLeastOneG() && // // toprint->getAlt().find("A") == string::npos){ // if( toprint->hasAtLeastOneG() && // !toprint->hasAtLeastOneA() ){ // isDeaminated=true; // } // } } //last base next to 5' i = (al.QueryBases.length()-1) ; refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //refeBase == 'G' && if( readBase == 'A' && int(al.Qualities[i]-offset) >= minBaseQuality){ //isDeaminated=true; } if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } transformRef(&refeBase,&readBase); int lengthMatches=countMatchesRecons(reconstructedReference,0); int positionJump = al.Position+lengthMatches+numberOfDeletions(&al); if(hasGnoA[positionJump] && !thousandGenomesHasA[positionJump] ) isDeaminated=true; // 
vcfr.repositionIterator(chrname,positionJump,positionJump); // while(vcfr.hasData()){ // SimpleVCF * toprint=vcfr.getData(); // //skip deletions in the alt // if(toprint->getRef().length() != 1 ) // continue; // if(toprint->getRef()[0] != refeBase){ // cerr<<reconstructedReference<<endl; // cerr<<al.Position<<endl; // cerr<<lengthMatches<<endl; // cerr<<numberOfDeletions(&al)<<endl; // cerr<<positionJump<<endl; // cerr<<"Problem3 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; // exit(1); // } // //if the VCF has at least one G but no A // if( toprint->hasAtLeastOneG() && // !toprint->hasAtLeastOneA() ){ // isDeaminated=true; // } // } } }else{ //first base next to 5' i = 0; refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //refeBase == 'C' if( readBase == 'T' && int(al.Qualities[i]-offset) >= minBaseQuality){ if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } // transformRef(&refeBase,&readBase); if(hasCnoT[al.Position+1] && !thousandGenomesHasT[al.Position+1] ) isDeaminated=true; // vcfr.repositionIterator(chrname,al.Position+1,al.Position+1); // while(vcfr.hasData()){ // SimpleVCF * toprint=vcfr.getData(); // //cout<<*toprint<<endl; // //skip deletions in the alt // if(toprint->getRef().length() != 1 ) // continue; // if(toprint->getRef()[0] != refeBase){ // cerr<<reconstructedReference<<endl; // cerr<<al.Position<<endl; // cerr<<numberOfDeletions(&al)<<endl; // cerr<<"Problem4 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; // exit(1); // } // //if the VCF has at least one C but no T // if( toprint->hasAtLeastOneC() && // !toprint->hasAtLeastOneT() ){ // isDeaminated=true; // } // } //cout<<al.Position+ } //second last base next to 3' i = (al.QueryBases.length()-2); refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //refeBase == 'C' && if( readBase == 'T' && 
int(al.Qualities[i]-offset) >= minBaseQuality){ if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } //transformRef(&refeBase,&readBase); int lengthMatches=countMatchesRecons(reconstructedReference,1); int positionJump = al.Position+lengthMatches+numberOfDeletions(&al); if(hasCnoT[positionJump] && !thousandGenomesHasT[positionJump] ) isDeaminated=true; // vcfr.repositionIterator(chrname,positionJump,positionJump); // while(vcfr.hasData()){ // SimpleVCF * toprint=vcfr.getData(); // //skip deletions in the alt // if(toprint->getRef().length() != 1 ) // continue; // if(toprint->getRef()[0] != refeBase){ // cerr<<reconstructedReference<<endl; // cerr<<al.Position<<endl; // cerr<<lengthMatches<<endl; // cerr<<numberOfDeletions(&al)<<endl; // cerr<<positionJump<<endl; // cerr<<"Problem5 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; // exit(1); // } // if( toprint->hasAtLeastOneC() && // !toprint->hasAtLeastOneT() ){ // isDeaminated=true; // } // } } //last base next to 3' i = (al.QueryBases.length()-1); refeBase=toupper(reconstructedReference[i]); readBase=toupper( al.QueryBases[i]); //&& refeBase == 'C' if( readBase == 'T' && int(al.Qualities[i]-offset) >= minBaseQuality){ if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; } transformRef(&refeBase,&readBase); int lengthMatches=countMatchesRecons(reconstructedReference,0); int positionJump = al.Position+lengthMatches+numberOfDeletions(&al); if(hasCnoT[positionJump] && !thousandGenomesHasT[positionJump] ) isDeaminated=true; // vcfr.repositionIterator(chrname,positionJump,positionJump); // while(vcfr.hasData()){ // SimpleVCF * toprint=vcfr.getData(); // //skip deletions in the alt // if(toprint->getRef().length() != 1 ) // continue; // if(toprint->getRef()[0] != refeBase){ // cerr<<reconstructedReference<<endl; // cerr<<al.Position<<endl; // cerr<<lengthMatches<<endl; // cerr<<numberOfDeletions(&al)<<endl; // cerr<<positionJump<<endl; // 
cerr<<"Problem6 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl; // exit(1); // } // if( toprint->hasAtLeastOneC() && // !toprint->hasAtLeastOneT() ){ // isDeaminated=true; // } // } } } totalReads++; if(isDeaminated){ deaminatedReads++; writerDeam.SaveAlignment(al); }else{ ndeaminatedReads++; writerNoDeam.SaveAlignment(al); } }//end for each read reader.Close(); writerDeam.Close(); writerNoDeam.Close(); delete(hasCnoT); delete(hasGnoA); cerr<<"Program finished sucessfully, out of "<<totalReads<<" mapped reads (skipped: "<<skipped<<" reads) we flagged "<<deaminatedReads<<" as deaminated and "<<ndeaminatedReads<<" as not deaminated"<<endl; return 0; }
void TagBam::Tag() { // open the annotations files for processing; OpenAnnoFiles(); // open the BAM file BamReader reader; BamWriter writer; if (!reader.Open(_bamFile)) { cerr << "Failed to open BAM file " << _bamFile << endl; exit(1); } // get header & reference information string bamHeader = reader.GetHeaderText(); RefVector refs = reader.GetReferenceData(); // set compression mode BamWriter::CompressionMode compressionMode = BamWriter::Compressed; // if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed; writer.SetCompressionMode(compressionMode); // open our BAM writer writer.Open("stdout", bamHeader, refs); // rip through the BAM file and test for overlaps with each annotation file. BamAlignment al; vector<BED> hits; while (reader.GetNextAlignment(al)) { if (al.IsMapped() == true) { BED a; a.chrom = refs.at(al.RefID).RefName; a.start = al.Position; a.end = al.GetEndPosition(false, false); a.strand = "+"; if (al.IsReverseStrand()) a.strand = "-"; ostringstream annotations; // annotate the BAM file based on overlaps with the annotation files. for (size_t i = 0; i < _annoFiles.size(); ++i) { // grab the current annotation file. 
BedFile *anno = _annoFiles[i]; if (!_useNames && !_useScores && !_useIntervals) { // add the label for this annotation file to tag if there is overlap if (anno->anyHits(a.chrom, a.start, a.end, a.strand, _sameStrand, _diffStrand, _overlapFraction, false)) { annotations << _annoLabels[i] << ";"; } } // use the score field else if (!_useNames && _useScores && !_useIntervals) { anno->allHits(a.chrom, a.start, a.end, a.strand, hits, _sameStrand, _diffStrand, 0.0, false); for (size_t i = 0; i < hits.size(); ++i) { annotations << hits[i].score; if (i < hits.size() - 1) annotations << ","; } if (hits.size() > 0) annotations << ";"; hits.clear(); } // use the name field from the annotation files to populate tag else if (_useNames && !_useScores && !_useIntervals) { anno->allHits(a.chrom, a.start, a.end, a.strand, hits, _sameStrand, _diffStrand, 0.0, false); for (size_t j = 0; j < hits.size(); ++j) { annotations << hits[j].name; if (j < hits.size() - 1) annotations << ","; } if (hits.size() > 0) annotations << ";"; hits.clear(); } // use the full interval information annotation files to populate tag else if (!_useNames && !_useScores && _useIntervals) { anno->allHits(a.chrom, a.start, a.end, a.strand, hits, _sameStrand, _diffStrand, 0.0, false); for (size_t j = 0; j < hits.size(); ++j) { annotations << _annoLabels[i] << ":" << hits[j].chrom << ":" << hits[j].start << "-" << hits[j].end << "," << hits[j].name << "," << hits[j].score << "," << hits[j].strand; if (j < hits.size() - 1) annotations << ","; } if (hits.size() > 0) annotations << ";"; hits.clear(); } } // were there any overlaps with which to make a tag? if (annotations.str().size() > 0) { al.AddTag(_tag, "Z", annotations.str().substr(0, annotations.str().size() - 1)); // get rid of the last ";" } } writer.SaveAlignment(al); } reader.Close(); writer.Close(); // close the annotations files; CloseAnnoFiles(); }
int main (int argc, char *argv[]) { string usage=string(""+string(argv[0])+" [in BAM file]"+ "\nThis program reads a BAM file and computes the error rate for each cycle\n"+ // "\nreads and the puts the rest into another bam file.\n"+ // "\nTip: if you do not need one of them, use /dev/null as your output\n"+ // "arguments:\n"+ // "\t"+"--bq [base qual] : Minimum base quality to flag a deaminated site (Default: "+stringify(minBaseQuality)+")\n"+ "\n"); if(argc == 1 || (argc == 2 && (string(argv[0]) == "-h" || string(argv[0]) == "--help") ) ){ cerr << "Usage "<<usage<<endl; return 1; } // for(int i=1;i<(argc-2);i++){ // if(string(argv[i]) == "--bq"){ // minBaseQuality=destringify<int>(argv[i+1]); // i++; // continue; // } // } string bamfiletopen = string( argv[ argc-1 ] ); // string deambam = string( argv[ argc-2 ] ); // string nondeambam = string( argv[ argc-1 ] ); BamReader reader; if ( !reader.Open(bamfiletopen) ) { cerr << "Could not open input BAM file"<< bamfiletopen << endl; return 1; } //iterating over the alignments for these regions BamAlignment al; bool pairedEnd=false; bool firstRead=true; while ( reader.GetNextAlignment(al) ) { if(firstRead){ //reads are either all paired end or single end, I don't allow a mix numberOfCycles=al.QueryBases.size(); // cout<<"numberOfCycles "<<numberOfCycles<<endl; if(al.IsPaired() ){ pairedEnd=true; matches = vector<unsigned int> (2*numberOfCycles,0); mismatches = vector<unsigned int> (2*numberOfCycles,0); typesOfMismatches = vector< vector<unsigned int> >(); for(int i=0;i<12;i++) typesOfMismatches.push_back( vector<unsigned int> (2*numberOfCycles,0) ); }else{ matches = vector<unsigned int> ( numberOfCycles,0); mismatches = vector<unsigned int> ( numberOfCycles,0); typesOfMismatches = vector< vector<unsigned int> >(); for(int i=0;i<12;i++) typesOfMismatches.push_back( vector<unsigned int> ( numberOfCycles,0) ); } firstRead=false; } if( ( pairedEnd && !al.IsPaired()) || ( !pairedEnd && al.IsPaired()) ){ cerr<<"Read 
"<<al.Name<<" is wrong, cannot have a mixture of paired and unpaired read for this program"<<endl; return 1; } //skip unmapped if(!al.IsMapped()) continue; if(numberOfCycles!=int(al.QueryBases.size())){ cerr<<"The length of read "<<al.Name<<" is wrong, should be "<<numberOfCycles<<"bp"<<endl; return 1; } string reconstructedReference = reconstructRef(&al); if(al.Qualities.size() != reconstructedReference.size()){ cerr<<"Quality line is not the same size as the reconstructed reference"<<endl; return 1; } if( pairedEnd ){ if( al.IsFirstMate() ){ //start cycle 0 if( al.IsReverseStrand() ){ increaseCounters(al,reconstructedReference,numberOfCycles-1,-1); //start cycle numberOfCycles-1 }else{ increaseCounters(al,reconstructedReference,0 , 1); //start cycle 0 } }else{ if( al.IsSecondMate() ){ if( al.IsReverseStrand() ){ increaseCounters(al,reconstructedReference,2*numberOfCycles-1,-1); //start cycle 2*numberOfCycles-1 }else{ increaseCounters(al,reconstructedReference,numberOfCycles , 1); //start cycle numberOfCycles } }else{ cerr<<"Reads "<<al.Name<<" must be either first or second mate"<<endl; return 1; } } }else{ //single end if( al.IsReverseStrand() ){ increaseCounters(al,reconstructedReference,numberOfCycles-1,-1); //start cycle numberOfCycles-1 }else{ increaseCounters(al,reconstructedReference,0 , 1); //start cycle 0 } } }//end while each read reader.Close(); cout<<"cycle\tmatches\tmismatches\tmismatches%\tA>C\tA>C%\tA>G\tA>G%\tA>T\tA>T%\tC>A\tC>A%\tC>G\tC>G%\tC>T\tC>T%\tG>A\tG>A%\tG>C\tG>C%\tG>T\tG>T%\tT>A\tT>A%\tT>C\tT>C%\tT>G\tT>G%"<<endl; for(unsigned int i=0;i<matches.size();i++){ cout<<(i+1); if( (matches[i]+mismatches[i]!=0) ) cout<<"\t"<<matches[i]<<"\t"<<mismatches[i]<<"\t"<< 100.0*(double(mismatches[i])/double(matches[i]+mismatches[i])) ; else cout<<"\t"<<matches[i]<<"\t"<<mismatches[i]<<"\tNA"; for(int j=0;j<12;j++){ cout<<"\t"<<typesOfMismatches[j][i]; if( (matches[i]+mismatches[i]!=0) ) 
cout<<"\t"<<100.0*double(typesOfMismatches[j][i])/double(matches[i]+mismatches[i]); else cout<<"\tNA"; } cout<<endl; } return 0; }
int main_asequantmultibam(const vector<string> &all_args) { Init(all_args); for (int bam_files_index = 0; bam_files_index < bam_files.size(); bam_files_index ++) { BamReader bam_reader; bam_file = bam_files[bam_files_index]; cerr << "* Reading bam file " << endl; OpenBam(bam_reader, bam_file); bam_reader.OpenIndex(bam_file + ".bai"); vector<RefData> chroms = bam_reader.GetReferenceData(); StlFor(chrom_idx, chroms) { string &chrom = chroms[chrom_idx].RefName; map<string, vector<Snp> >::iterator snps_ptr = snps_by_chrom.find(chrom); cerr << "* On chrom " << chrom << endl; int s = 0; // Index into snp array BamAlignment bam; bam_reader.Jump(chrom_idx); string align; string qualities; while (bam_reader.GetNextAlignment(bam) && bam.RefID == chrom_idx) { if (bam.MapQuality < min_map_qual) continue; int start = AlignStart(bam); int end = AlignEnd(bam); // Move the current SNP pointer so that it is ahead of the read's start (since bam alignments are in sorted order) while (s < snps_ptr->second.size() && snps_ptr->second[s].pos < start) ++s; // Stop everything if we have visited all SNPs on this chrom if (s >= snps_ptr->second.size()) break; // Find any/all SNPs that are within the bam alignment int n = 0; // Number of SNPs overlapped while ((s + n) < snps_ptr->second.size() && snps_ptr->second[s + n].pos < end) // Then it overlaps! ++n; // Now, look at each SNP and see which way it votes AlignedString(bam, align); AlignedQualities(bam, qualities); Assert(align.size() == qualities.size()); // Now, tally votes for (int i = 0; i < n; ++i) { Snp &snp = snps_ptr->second[s + i]; char base = align[snp.pos - start]; // Base from the read int qual = int(qualities[snp.pos - start]) - ascii_offset; // Base from the read //AssertMsg(qual >= 0 && qual <= 100, ToStr(qual) + "\n" + bam.Name + "\n" + CigarToStr(bam.CigarData) + "\n" + bam.QueryBases + "\n" + bam.Qualities); if (base == '-' || qual < min_base_qual) continue; Counts &counts = bam.IsReverseStrand() ? 
snp.rev[bam_files_index] : snp.fwd[bam_files_index]; if (base == snp.ref) { counts.num_ref += 1; } else if (base == snp.alt) { counts.num_alt += 1; } else { counts.num_other += 1; } } } } }
bool check(const PropertyFilter& filter, const BamAlignment& al) { bool keepAlignment = true; const PropertyMap& properties = filter.Properties; PropertyMap::const_iterator propertyIter = properties.begin(); PropertyMap::const_iterator propertyEnd = properties.end(); for ( ; propertyIter != propertyEnd; ++propertyIter ) { // check alignment data field depending on propertyName const string& propertyName = (*propertyIter).first; const PropertyFilterValue& valueFilter = (*propertyIter).second; if ( propertyName == ALIGNMENTFLAG_PROPERTY ) keepAlignment &= valueFilter.check(al.AlignmentFlag); else if ( propertyName == CIGAR_PROPERTY ) { stringstream cigarSs; const vector<CigarOp>& cigarData = al.CigarData; if ( !cigarData.empty() ) { vector<CigarOp>::const_iterator cigarBegin = cigarData.begin(); vector<CigarOp>::const_iterator cigarIter = cigarBegin; vector<CigarOp>::const_iterator cigarEnd = cigarData.end(); for ( ; cigarIter != cigarEnd; ++cigarIter ) { const CigarOp& op = (*cigarIter); cigarSs << op.Length << op.Type; } keepAlignment &= valueFilter.check(cigarSs.str()); } } else if ( propertyName == INSERTSIZE_PROPERTY ) keepAlignment &= valueFilter.check(al.InsertSize); else if ( propertyName == ISDUPLICATE_PROPERTY ) keepAlignment &= valueFilter.check(al.IsDuplicate()); else if ( propertyName == ISFAILEDQC_PROPERTY ) keepAlignment &= valueFilter.check(al.IsFailedQC()); else if ( propertyName == ISFIRSTMATE_PROPERTY ) keepAlignment &= valueFilter.check(al.IsFirstMate()); else if ( propertyName == ISMAPPED_PROPERTY ) keepAlignment &= valueFilter.check(al.IsMapped()); else if ( propertyName == ISMATEMAPPED_PROPERTY ) keepAlignment &= valueFilter.check(al.IsMateMapped()); else if ( propertyName == ISMATEREVERSESTRAND_PROPERTY ) keepAlignment &= valueFilter.check(al.IsMateReverseStrand()); else if ( propertyName == ISPAIRED_PROPERTY ) keepAlignment &= valueFilter.check(al.IsPaired()); else if ( propertyName == ISPRIMARYALIGNMENT_PROPERTY ) keepAlignment &= 
valueFilter.check(al.IsPrimaryAlignment()); else if ( propertyName == ISPROPERPAIR_PROPERTY ) keepAlignment &= valueFilter.check(al.IsProperPair()); else if ( propertyName == ISREVERSESTRAND_PROPERTY ) keepAlignment &= valueFilter.check(al.IsReverseStrand()); else if ( propertyName == ISSECONDMATE_PROPERTY ) keepAlignment &= valueFilter.check(al.IsSecondMate()); else if ( propertyName == ISSINGLETON_PROPERTY ) { const bool isSingleton = al.IsPaired() && al.IsMapped() && !al.IsMateMapped(); keepAlignment &= valueFilter.check(isSingleton); } else if ( propertyName == MAPQUALITY_PROPERTY ) keepAlignment &= valueFilter.check(al.MapQuality); else if ( propertyName == MATEPOSITION_PROPERTY ) keepAlignment &= ( al.IsPaired() && al.IsMateMapped() && valueFilter.check(al.MateRefID) ); else if ( propertyName == MATEREFERENCE_PROPERTY ) { if ( !al.IsPaired() || !al.IsMateMapped() ) return false; BAMTOOLS_ASSERT_MESSAGE( (al.MateRefID>=0 && (al.MateRefID<(int)filterToolReferences.size())), "Invalid MateRefID"); const string& refName = filterToolReferences.at(al.MateRefID).RefName; keepAlignment &= valueFilter.check(refName); } else if ( propertyName == NAME_PROPERTY ) keepAlignment &= valueFilter.check(al.Name); else if ( propertyName == POSITION_PROPERTY ) keepAlignment &= valueFilter.check(al.Position); else if ( propertyName == QUERYBASES_PROPERTY ) keepAlignment &= valueFilter.check(al.QueryBases); else if ( propertyName == REFERENCE_PROPERTY ) { BAMTOOLS_ASSERT_MESSAGE( (al.RefID>=0 && (al.RefID<(int)filterToolReferences.size())), "Invalid RefID"); const string& refName = filterToolReferences.at(al.RefID).RefName; keepAlignment &= valueFilter.check(refName); } else if ( propertyName == TAG_PROPERTY ) keepAlignment &= checkAlignmentTag(valueFilter, al); else BAMTOOLS_ASSERT_UNREACHABLE; // if alignment fails at ANY point, just quit and return false if ( !keepAlignment ) return false; } BAMTOOLS_ASSERT_MESSAGE( keepAlignment, "Error in BamAlignmentChecker... 
keepAlignment should be true here"); return keepAlignment; }
int main_aseregion(const vector<string> &all_args) { Init(all_args); cerr << "* Reading bam file " << endl; OpenBam(bam_reader, bam_file); bam_reader.OpenIndex(bam_file + ".bai"); vector<string> readGroupVector; //Obtain all the readgroups. SamHeader header = bam_reader.GetHeader(); SamReadGroupDictionary headerRG = header.ReadGroups; for (SamReadGroupIterator it = headerRG.Begin(); it != headerRG.End(); it ++) { readGroupVector.push_back(it -> ID); } cout << "#CHROM" << "\t" << "StartPos" << "\t" << "EndPos"; for (vector<string>::iterator it = readGroupVector.begin(); it != readGroupVector.end(); it ++) { cout << "\t" << *it; } cout << endl; vector<RefData> chroms = bam_reader.GetReferenceData(); StlFor(chrom_idx, chroms) { string &chrom = chroms[chrom_idx].RefName; cerr << "* On chrom " << chrom << endl; map<string, vector<GenomicRegion> >::iterator searchIt = chrom_genomicRegions.find(chrom); BamAlignment startPointer; // This pointer will point to the region immediately before the start of current regions under inspection. bam_reader.Jump(chrom_idx); if (!bam_reader.GetNextAlignment(startPointer)) break; int count = 0; // For each region, walk through all the reads correspoinding to this region and count the reads. for (vector<GenomicRegion>::iterator it = searchIt -> second.begin(); it != searchIt -> second.end(); ++it) { bam_reader.Jump(chrom_idx, startPointer.Position); // Fix the reading pointer. if (!bam_reader.GetNextAlignment(startPointer)) break; int flag = 0; while (true) { int startEnd = startPointer.GetEndPosition(); if (startEnd < it -> start) { if (!bam_reader.GetNextAlignment(startPointer)) { flag = 1; break; } } else { break; } } if (flag == 1) { break; } // Now startPointer assumes its rightful position. BamAlignment nextPointer = startPointer; //This pointer traverse through all reads that align to the current genomic region in bed file and the iteration ends when this pointer pass through the end of the region. 
while (true) { int nextStart = nextPointer.Position; if (nextStart > it -> end) { break; // This iteration is done. } if (nextPointer.MapQuality < min_map_qual) { if (!bam_reader.GetNextAlignment(nextPointer)) { break; } continue; } string currentRG; Assert(nextPointer.GetReadGroup(currentRG)); map<string, int> &RG_counts = nextPointer.IsReverseStrand() ? it -> revCounts : it -> fwdCounts; map<string, int>::iterator searchItForRG = RG_counts.find(currentRG); if (searchItForRG == RG_counts.end()) { RG_counts[currentRG] = 1; } else { ++ RG_counts[currentRG]; } if (!bam_reader.GetNextAlignment(nextPointer)) { break; } } count ++; if (count % 1000 == 0) cerr << "Processed" << "\t" << count << endl; } // Output the counts for (vector<GenomicRegion>::iterator it = searchIt -> second.begin(); it != searchIt -> second.end(); ++it) { cout << chrom << "\t" << it -> start << "\t" << it -> end; for (vector<string>::iterator subIt = readGroupVector.begin(); subIt != readGroupVector.end(); ++subIt) { map<string, int>::iterator searchItForRG = it -> fwdCounts.find(*subIt); if (searchItForRG != it -> fwdCounts.end()) { cout << "\t" << searchItForRG -> second << ","; } else { cout << "\t" << "0,"; } searchItForRG = it -> revCounts.find(*subIt); if (searchItForRG != it -> revCounts.end()) { cout << searchItForRG -> second; } else { cout << "0"; } } cout << endl; } }
virtual void main() { //init QTextStream out(stdout); BamReader reader; NGSHelper::openBAM(reader, getInfile("in")); FastqOutfileStream out1(getOutfile("out1"), false); FastqOutfileStream out2(getOutfile("out2"), false); long long c_unpaired = 0; long long c_paired = 0; int max_cached = 0; //iterate through reads BamAlignment al; QHash<QByteArray, BamAlignment> al_cache; while (reader.GetNextAlignment(al)) { //skip secondary alinments if(!al.IsPrimaryAlignment()) continue; //skip unpaired if(!al.IsPaired()) { ++c_unpaired; continue; } QByteArray name(al.Name.data()); //TODO use QByteArray::fromStdString (when upgraded to Qt5.4) //store cached read when we encounter the mate if (al_cache.contains(name)) { BamAlignment mate = al_cache.take(name); //out << name << " [AL] First: " << al.IsFirstMate() << " Reverse: " << al.IsReverseStrand() << " Seq: " << al.QueryBases.data() << endl; //out << name << " [MA] First: " << mate.IsFirstMate() << " Reverse: " << mate.IsReverseStrand() << " Seq: " << mate.QueryBases.data() << endl; if (al.IsFirstMate()) { write(out1, al, al.IsReverseStrand()); write(out2, mate, mate.IsReverseStrand()); } else { write(out1, mate, mate.IsReverseStrand()); write(out2, al, al.IsReverseStrand()); } ++c_paired; } //cache read for later retrieval else { al_cache.insert(name, al); } max_cached = std::max(max_cached, al_cache.size()); } reader.Close(); out1.close(); out2.close(); //write debug output out << "Pair reads (written) : " << c_paired << endl; out << "Unpaired reads (skipped) : " << c_unpaired << endl; out << "Unmatched paired reads (skipped): " << al_cache.size() << endl; out << endl; out << "Maximum cached reads : " << max_cached << endl; }
int main (int argc, const char *argv[]) { printf ("------------- bamrealignment --------------\n"); OptArgs opts; opts.ParseCmdLine(argc, argv); vector<int> score_vals(4); string input_bam = opts.GetFirstString ('i', "input", ""); string output_bam = opts.GetFirstString ('o', "output", ""); opts.GetOption(score_vals, "4,-6,-5,-2", 's', "scores"); int clipping = opts.GetFirstInt ('c', "clipping", 2); bool anchors = opts.GetFirstBoolean ('a', "anchors", true); int bandwidth = opts.GetFirstInt ('b', "bandwidth", 10); bool verbose = opts.GetFirstBoolean ('v', "verbose", false); bool debug = opts.GetFirstBoolean ('d', "debug", false); int format = opts.GetFirstInt ('f', "format", 1); int num_threads = opts.GetFirstInt ('t', "threads", 8); string log_fname = opts.GetFirstString ('l', "log", ""); if (input_bam.empty() or output_bam.empty()) return PrintHelp(); opts.CheckNoLeftovers(); std::ofstream logf; if (log_fname.size ()) { logf.open (log_fname.c_str ()); if (!logf.is_open ()) { fprintf (stderr, "bamrealignment: Failed to open log file %s\n", log_fname.c_str()); return 1; } } BamReader reader; if (!reader.Open(input_bam)) { fprintf(stderr, "bamrealignment: Failed to open input file %s\n", input_bam.c_str()); return 1; } SamHeader header = reader.GetHeader(); RefVector refs = reader.GetReferenceData(); BamWriter writer; writer.SetNumThreads(num_threads); if (format == 1) writer.SetCompressionMode(BamWriter::Uncompressed); else writer.SetCompressionMode(BamWriter::Compressed); if (!writer.Open(output_bam, header, refs)) { fprintf(stderr, "bamrealignment: Failed to open output file %s\n", output_bam.c_str()); return 1; } // The meat starts here ------------------------------------ if (verbose) cout << "Verbose option is activated, each alignment will print to screen." 
<< endl << " After a read hit RETURN to continue to the next one," << endl << " or press q RETURN to quit the program," << endl << " or press s Return to silence verbose," << endl << " or press c RETURN to continue printing without further prompt." << endl << endl; unsigned int readcounter = 0; unsigned int mapped_readcounter = 0; unsigned int realigned_readcounter = 0; unsigned int modified_alignment_readcounter = 0; unsigned int pos_update_readcounter = 0; unsigned int failed_clip_realigned_readcount = 0; unsigned int already_perfect_readcount = 0; unsigned int bad_md_tag_readcount = 0; unsigned int error_recreate_ref_readcount = 0; unsigned int error_clip_anchor_readcount = 0; unsigned int error_sw_readcount = 0; unsigned int error_unclip_readcount = 0; unsigned int start_position_shift; int orig_position; int new_position; string md_tag, new_md_tag, input = "x"; vector<CigarOp> new_cigar_data; vector<MDelement> new_md_data; bool position_shift = false; time_t start_time = time(NULL); Realigner aligner; aligner.verbose_ = verbose; aligner.debug_ = debug; if (!aligner.SetScores(score_vals)) cout << "bamrealignment: Four scores need to be provided: match, mismatch, gap open, gap extend score!" << endl; aligner.SetAlignmentBandwidth(bandwidth); BamAlignment alignment; while(reader.GetNextAlignment(alignment)){ readcounter ++; position_shift = false; if ( (readcounter % 100000) == 0 ) cout << "Processed " << readcounter << " reads. Elapsed time: " << (time(NULL) - start_time) << endl; if (alignment.IsMapped()) { orig_position = alignment.Position; mapped_readcounter++; aligner.SetClipping(clipping, !alignment.IsReverseStrand()); if (aligner.verbose_) { cout << endl; if (alignment.IsReverseStrand()) cout << "The read is from the reverse strand." << endl; else cout << "The read is from the forward strand." << endl; } if (!alignment.GetTag("MD", md_tag)) { if (aligner.verbose_) cout << "Warning: Skipping read " << alignment.Name << ". It is mapped but missing MD tag." 
<< endl; if (logf.is_open ()) logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "MISSMD" << '\n'; bad_md_tag_readcount++; } else if (aligner.CreateRefFromQueryBases(alignment.QueryBases, alignment.CigarData, md_tag, anchors)) { bool clipfail = false; if (Realigner::CR_ERR_CLIP_ANCHOR == aligner.GetCreateRefError ()) { clipfail = true; failed_clip_realigned_readcount ++; } if (!aligner.computeSWalignment(new_cigar_data, new_md_data, start_position_shift)) { if (aligner.verbose_) cout << "Error in the alignment! Not updating read information." << endl; if (logf.is_open ()) logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "SWERR" << '\n'; error_sw_readcount++; writer.SaveAlignment(alignment); // Write alignment unchanged continue; } if (!aligner.addClippedBasesToTags(new_cigar_data, new_md_data, alignment.QueryBases.size())) { if (aligner.verbose_) cout << "Error when adding clipped anchors back to tags! Not updating read information." 
<< endl; if (logf.is_open ()) logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "UNCLIPERR" << '\n'; writer.SaveAlignment(alignment); // Write alignment unchanged error_unclip_readcount ++; continue; } new_md_tag = aligner.GetMDstring(new_md_data); realigned_readcounter++; // adjust start position of read if (!aligner.LeftAnchorClipped() and start_position_shift != 0) { new_position = aligner.updateReadPosition(alignment.CigarData, (int)start_position_shift, alignment.Position); if (new_position != alignment.Position) { pos_update_readcounter++; position_shift = true; alignment.Position = new_position; } } if (position_shift || alignment.CigarData.size () != new_cigar_data.size () || md_tag != new_md_tag) { if (logf.is_open ()) { logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "MOD"; if (position_shift) logf << "-SHIFT"; if (clipfail) logf << " NOCLIP"; logf << '\n'; } modified_alignment_readcounter++; } else { if (logf.is_open ()) { logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "UNMOD"; if (clipfail) logf << " NOCLIP"; logf << '\n'; } } if (aligner.verbose_){ cout << alignment.Name << endl; cout << "------------------------------------------" << endl; // Wait for input to continue or quit program if (input.size() == 0) input = 'x'; else if (input[0] != 'c' and input[0] != 'C') getline(cin, input); if (input.size()>0){ if (input[0] == 'q' or input[0] == 'Q') return 1; else if (input[0] == 's' or input[0] == 'S') aligner.verbose_ = false; } } // Finally update alignment information alignment.CigarData = new_cigar_data; alignment.EditTag("MD", "Z" , new_md_tag); } // end of CreateRef else if else { switch (aligner.GetCreateRefError ()) { case 
Realigner::CR_ERR_RECREATE_REF: if (logf.is_open ()) logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "RECRERR" << '\n'; error_recreate_ref_readcount++; break; case Realigner::CR_ERR_CLIP_ANCHOR: if (logf.is_open ()) logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "CLIPERR" << '\n'; error_clip_anchor_readcount++; break; default: // On a good run this writes way too many reads to the log file - don't want to create a too large txt file // if (logf.is_open ()) //logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "PERFECT" << '\n'; already_perfect_readcount++; break; } if (aligner.verbose_) { cout << alignment.Name << endl; cout << "------------------------------------------" << endl; // Wait for input to continue or quit program if (input.size() == 0) input = 'x'; else if (input[0] != 'c' and input[0] != 'C') getline(cin, input); if (input.size()>0){ if (input[0] == 'q' or input[0] == 'Q') return 1; else if (input[0] == 's' or input[0] == 'S') aligner.verbose_ = false; } } } // --- Debug output for Rajesh --- if (debug && aligner.invalid_cigar_in_input) { aligner.verbose_ = true; cout << "Invalid cigar string / md tag pair in read " << alignment.Name << endl; // Rerun reference generation to display error aligner.CreateRefFromQueryBases(alignment.QueryBases, alignment.CigarData, md_tag, anchors); aligner.verbose_ = verbose; aligner.invalid_cigar_in_input = false; } // --- --- --- } // end of if isMapped writer.SaveAlignment(alignment); } // end while loop over reads if (aligner.invalid_cigar_in_input) cerr << "WARNING bamrealignment: There were invalid cigar string / md tag pairs in the input bam file." 
<< endl; // ---------------------------------------------------------------- // program end -- output summary information cout << " File: " << input_bam << endl << " Total reads: " << readcounter << endl << " Mapped reads: " << mapped_readcounter << endl; if (bad_md_tag_readcount) cout << " Skipped: bad MD tags: " << bad_md_tag_readcount << endl; if (error_recreate_ref_readcount) cout << " Skipped: unable to recreate ref: " << error_recreate_ref_readcount << endl; if (error_clip_anchor_readcount) cout << " Skipped: error clipping anchor: " << error_clip_anchor_readcount << endl; cout << " Skipped: already perfect: " << already_perfect_readcount << endl << " Total reads realigned: " << mapped_readcounter - already_perfect_readcount - bad_md_tag_readcount - error_recreate_ref_readcount - error_clip_anchor_readcount << endl; if (failed_clip_realigned_readcount) cout << " (including " << failed_clip_realigned_readcount << " that failed to clip)" << endl; if (error_sw_readcount) cout << " Failed to complete SW alignment: " << error_sw_readcount << endl; if (error_unclip_readcount) cout << " Failed to unclip anchor: " << error_unclip_readcount << endl; cout << " Succesfully realigned: " << realigned_readcounter << endl << " Modified alignments: " << modified_alignment_readcounter << endl << " Shifted position: " << pos_update_readcounter << endl; cout << "Processing time: " << (time(NULL)-start_time) << " seconds." << endl; cout << "INFO: The output BAM file may be unsorted." << endl; cout << "------------------------------------------" << endl; return 0; }
// Reports window overlaps between the alignments of a BAM file ("A") and the
// intervals of the B BED file.
//
// With BAM output enabled, each mapped alignment is written to a BAM on stdout
// if it has at least one window overlap (or, with _noHit, if it has none);
// unmapped alignments are written only in _noHit mode. Without BAM output the
// overlaps of every mapped alignment are handed to FindWindowOverlaps.
//
// @param bamFile  path of the BAM file to read; the program exits with
//                 status 1 if it cannot be opened.
void BedWindow::WindowIntersectBam(string bamFile) {

    // load the "B" bed file into a map so
    // that we can easily compare "A" to it for overlaps
    _bedB->loadBedFileIntoMap();

    // open the BAM file; without it there is nothing to process
    BamReader reader;
    BamWriter writer;
    if (!reader.Open(bamFile)) {
        cerr << "Failed to open BAM file " << bamFile << endl;
        exit(1);
    }

    // get header & reference information
    string bamHeader = reader.GetHeaderText();
    RefVector refs   = reader.GetReferenceData();

    // open a BAM output to stdout if we are writing BAM
    if (_bamOutput == true) {
        // set compression mode
        BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
        if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed;
        writer.SetCompressionMode(compressionMode);
        // open our BAM writer
        writer.Open("stdout", bamHeader, refs);
    }

    vector<BED> hits;   // vector of potential hits
    hits.reserve(100);  // reserve some space

    _bedA->bedType = 6;
    BamAlignment bam;
    bool overlapsFound;

    while (reader.GetNextAlignment(bam)) {

        if (bam.IsMapped()) {
            // describe the alignment as a 6-column BED record
            BED a;
            a.chrom = refs.at(bam.RefID).RefName;
            a.start = bam.Position;
            a.end   = bam.GetEndPosition(false, false);

            // build the name field from the BAM alignment.
            a.name = bam.Name;
            if (bam.IsFirstMate()) a.name += "/1";
            if (bam.IsSecondMate()) a.name += "/2";

            a.score  = ToString(bam.MapQuality);
            a.strand = "+";
            if (bam.IsReverseStrand()) a.strand = "-";

            if (_bamOutput == true) {
                // keep the alignment when "has overlap" matches what was asked
                // for: hits by default, non-hits with _noHit
                overlapsFound = FindOneOrMoreWindowOverlaps(a);
                if (overlapsFound == true) {
                    if (_noHit == false)
                        writer.SaveAlignment(bam);
                }
                else {
                    if (_noHit == true)
                        writer.SaveAlignment(bam);
                }
            }
            else {
                FindWindowOverlaps(a, hits);
                hits.clear();
            }
        }
        // BAM IsMapped() is false: an unmapped read can only be a "no hit",
        // and the writer is only open when BAM output was requested
        else if (_bamOutput == true && _noHit == true) {
            writer.SaveAlignment(bam);
        }
    }

    // close the relevant BAM files.
    reader.Close();
    if (_bamOutput == true) {
        writer.Close();
    }
}
int main (int argc, char *argv[]) { bool mapped =false; bool unmapped=false; const string usage=string(string(argv[0])+" [options] input.bam out.bam"+"\n\n"+ "This program takes a BAM file as input and produces\n"+ "another where the putative deaminated bases have\n"+ "have been cut\n"+ "\n"+ "Options:\n"); // "\t"+"-u , --unmapped" +"\n\t\t"+"For an unmapped bam file"+"\n"+ // "\t"+"-m , --mapped" +"\n\t\t"+"For an mapped bam file"+"\n"); if( (argc== 1) || (argc== 2 && string(argv[1]) == "-h") || (argc== 2 && string(argv[1]) == "-help") || (argc== 2 && string(argv[1]) == "--help") ){ cout<<"Usage:"<<endl; cout<<usage<<endl; cout<<""<<endl; return 1; } // for(int i=1;i<(argc-1);i++){ //all but the last arg // if(strcmp(argv[i],"-m") == 0 || strcmp(argv[i],"--mapped") == 0 ){ // mapped=true; // continue; // } // if(strcmp(argv[i],"-u") == 0 || strcmp(argv[i],"--unmapped") == 0 ){ // unmapped=true; // continue; // } // cerr<<"Unknown option "<<argv[i] <<" exiting"<<endl; // return 1; // } if(argc != 3){ cerr<<"Error: Must specify the input and output BAM files"; return 1; } string inbamFile =argv[argc-2]; string outbamFile=argv[argc-1]; // if(!mapped && !unmapped){ // cerr << "Please specify whether you reads are mapped or unmapped" << endl; // return 1; // } // if(mapped && unmapped){ // cerr << "Please specify either mapped or unmapped but not both" << endl; // return 1; // } BamReader reader; if ( !reader.Open(inbamFile) ) { cerr << "Could not open input BAM files." 
<< endl; return 1; } vector<RefData> testRefData=reader.GetReferenceData(); const SamHeader header = reader.GetHeader(); const RefVector references = reader.GetReferenceData(); BamWriter writer; if ( !writer.Open(outbamFile, header, references) ) { cerr << "Could not open output BAM file" << endl; return 1; } BamAlignment al; // BamAlignment al2; // bool al2Null=true; while ( reader.GetNextAlignment(al) ) { if(al.IsPaired() ){ if(al.IsFirstMate() ){ //5' end, need to check first base only if(al.IsReverseStrand()){ // if(!al.IsMapped()){ cerr << "Cannot have reverse complemented unmapped reads: " <<al.Name<< endl; //return 1; } int indexToCheck; //last indexToCheck=al.QueryBases.length()-1; if(toupper(al.QueryBases[indexToCheck]) == 'A'){ //al.Qualities[indexToCheck]=char(offset+baseQualForDeam); al.QueryBases = al.QueryBases.substr(0,indexToCheck); al.Qualities = al.Qualities.substr(0, indexToCheck); } }else{ int indexToCheck; //first base indexToCheck=0; if(toupper(al.QueryBases[indexToCheck]) == 'T'){ //al.Qualities[indexToCheck]=char(offset+baseQualForDeam); al.QueryBases = al.QueryBases.substr(indexToCheck+1); al.Qualities = al.Qualities.substr(indexToCheck+1); } } }else{ //3' end, need to check last two bases only if( al.IsSecondMate() ){ if(al.IsReverseStrand()){ // if(!al.IsMapped()){ cerr << "Cannot have reverse complemented unmapped reads: " <<al.Name<< endl; //return 1; } int indexToCheck; //second to last indexToCheck=al.QueryBases.length()-2; if(toupper(al.QueryBases[indexToCheck]) == 'T'){ //al.Qualities[indexToCheck]=char(offset+baseQualForDeam); al.QueryBases = al.QueryBases.substr(0,indexToCheck); al.Qualities = al.Qualities.substr(0, indexToCheck); }else{ //last indexToCheck=al.QueryBases.length()-1; if(toupper(al.QueryBases[indexToCheck]) == 'T'){ //al.Qualities[indexToCheck]=char(offset+baseQualForDeam); al.QueryBases = al.QueryBases.substr(0,indexToCheck); al.Qualities = al.Qualities.substr(0, indexToCheck); } } }else{ int indexToCheck; //second 
base indexToCheck=1; if(toupper(al.QueryBases[indexToCheck]) == 'A'){ //al.Qualities[indexToCheck]=char(offset+baseQualForDeam); al.QueryBases = al.QueryBases.substr(indexToCheck+1); al.Qualities = al.Qualities.substr(indexToCheck+1); }else{ //first base indexToCheck=0; if(toupper(al.QueryBases[indexToCheck]) == 'A'){ //al.Qualities[indexToCheck]=char(offset+baseQualForDeam); al.QueryBases = al.QueryBases.substr(indexToCheck+1); al.Qualities = al.Qualities.substr(indexToCheck+1); } } } }else{ cerr << "Wrong state" << endl; return 1; } } }//end of paired end else{//we consider single reads to have been sequenced from 5' to 3' if(al.IsReverseStrand()){ //need to consider if(!al.IsMapped()){ cerr << "Cannot have reverse complemented unmapped reads: " <<al.Name<< endl; //return 1; } int indexToCheck; //second base indexToCheck=1; if(toupper(al.QueryBases[indexToCheck]) == 'A'){ // al.Qualities[indexToCheck]=char(offset+baseQualForDeam); // cout<<"51 "<<al.QueryBases<<endl; // cout<<"51 "<<al.Qualities<<endl; al.QueryBases = al.QueryBases.substr(indexToCheck+1); al.Qualities = al.Qualities.substr(indexToCheck+1); // cout<<"52 "<<al.QueryBases<<endl; // cout<<"52 "<<al.Qualities<<endl; }else{ //first base indexToCheck=0; if(toupper(al.QueryBases[indexToCheck]) == 'A'){ // al.Qualities[indexToCheck]=char(offset+baseQualForDeam); // cout<<"61 "<<al.QueryBases<<endl; // cout<<"61 "<<al.Qualities<<endl; al.QueryBases = al.QueryBases.substr(indexToCheck+1); al.Qualities = al.Qualities.substr(indexToCheck+1); // cout<<"62 "<<al.QueryBases<<endl; // cout<<"62 "<<al.Qualities<<endl; } } //last indexToCheck=al.QueryBases.length()-1; if(toupper(al.QueryBases[indexToCheck]) == 'A'){ // al.Qualities[indexToCheck]=char(offset+baseQualForDeam); // cout<<"21 "<<al.QueryBases<<endl; // cout<<"21 "<<al.Qualities<<endl; al.QueryBases = al.QueryBases.substr(0,indexToCheck); al.Qualities = al.Qualities.substr(0, indexToCheck); // cout<<"22 "<<al.QueryBases<<endl; // cout<<"22 
"<<al.Qualities<<endl; } }else{ int indexToCheck; //first base indexToCheck=0; if(toupper(al.QueryBases[indexToCheck]) == 'T'){ // al.Qualities[indexToCheck]=char(offset+baseQualForDeam); // cout<<"11 "<<al.QueryBases<<endl; // cout<<"11 "<<al.Qualities<<endl; al.QueryBases = al.QueryBases.substr(indexToCheck+1); al.Qualities = al.Qualities.substr(indexToCheck+1); // cout<<"12 "<<al.QueryBases<<endl; // cout<<"12 "<<al.Qualities<<endl; } //second to last indexToCheck=al.QueryBases.length()-2; if(toupper(al.QueryBases[indexToCheck]) == 'T'){ // al.Qualities[indexToCheck]=char(offset+baseQualForDeam); // cout<<"31 "<<al.QueryBases<<endl; // cout<<"31 "<<al.Qualities<<endl; al.QueryBases = al.QueryBases.substr(0,indexToCheck); al.Qualities = al.Qualities.substr(0, indexToCheck); // cout<<"32 "<<al.QueryBases<<endl; // cout<<"32 "<<al.Qualities<<endl; }else{ //last indexToCheck=al.QueryBases.length()-1; if(toupper(al.QueryBases[indexToCheck]) == 'T'){ // al.Qualities[indexToCheck]=char(offset+baseQualForDeam); // cout<<"41 "<<al.QueryBases<<endl; // cout<<"41 "<<al.Qualities<<endl; al.QueryBases = al.QueryBases.substr(0,indexToCheck); al.Qualities = al.Qualities.substr(0, indexToCheck); // cout<<"42 "<<al.QueryBases<<endl; // cout<<"42 "<<al.Qualities<<endl; } } } }//end of single end writer.SaveAlignment(al); }// while ( reader.GetNextAlignment(al) ) { reader.Close(); writer.Close(); return 0; }