// Parsing a CIGAR string holding several operations must produce one
// CigarOperation per op, in input order, with matching type char and length.
TEST(CigarStringTest, FromStdString_MultipleOps)
{
    const string multiCigar = "100=2D34I6=6X6=";

    Cigar parsed = Cigar::FromStdString(multiCigar);
    ASSERT_TRUE(parsed.size() == 6);

    // 100=
    {
        CigarOperation current = parsed.at(0);
        EXPECT_TRUE(current.Char() == '=');
        EXPECT_TRUE(current.Length() == 100);
    }
    // 2D
    {
        CigarOperation current = parsed.at(1);
        EXPECT_TRUE(current.Char() == 'D');
        EXPECT_TRUE(current.Length() == 2);
    }
    // 34I
    {
        CigarOperation current = parsed.at(2);
        EXPECT_TRUE(current.Char() == 'I');
        EXPECT_TRUE(current.Length() == 34);
    }
    // 6=
    {
        CigarOperation current = parsed.at(3);
        EXPECT_TRUE(current.Char() == '=');
        EXPECT_TRUE(current.Length() == 6);
    }
    // 6X
    {
        CigarOperation current = parsed.at(4);
        EXPECT_TRUE(current.Char() == 'X');
        EXPECT_TRUE(current.Length() == 6);
    }
    // 6=
    {
        CigarOperation current = parsed.at(5);
        EXPECT_TRUE(current.Char() == '=');
        EXPECT_TRUE(current.Length() == 6);
    }
}
unsigned UngappedAligner::alignUngapped( FragmentMetadata &fragmentMetadata, Cigar &cigarBuffer, const flowcell::ReadMetadata &readMetadata, const matchSelector::FragmentSequencingAdapterClipper &adapterClipper, const reference::ContigList &contigList, const isaac::reference::ContigAnnotations &contigAnnotations) const { const unsigned cigarOffset = cigarBuffer.size(); // Don't reset alignment to preserve the seed-based anchors. // fragmentMetadata.resetAlignment(); ISAAC_ASSERT_MSG(!fragmentMetadata.isAligned(), "alignUngapped is expected to be performend on a clean fragment"); fragmentMetadata.resetClipping(); const reference::Contig &contig = contigList[fragmentMetadata.contigId]; const Read &read = fragmentMetadata.getRead(); const bool reverse = fragmentMetadata.reverse; const std::vector<char> &sequence = read.getStrandSequence(reverse); const reference::Contig &reference = contig; std::vector<char>::const_iterator sequenceBegin = sequence.begin(); std::vector<char>::const_iterator sequenceEnd = sequence.end(); adapterClipper.clip(contig, fragmentMetadata, sequenceBegin, sequenceEnd); clipReadMasking(read, fragmentMetadata, sequenceBegin, sequenceEnd); clipReference(reference.size(), fragmentMetadata, sequenceBegin, sequenceEnd); const unsigned firstMappedBaseOffset = std::distance(sequence.begin(), sequenceBegin); if (firstMappedBaseOffset) { cigarBuffer.addOperation(firstMappedBaseOffset, Cigar::SOFT_CLIP); } const unsigned mappedBases = std::distance(sequenceBegin, sequenceEnd); if (mappedBases) { const Cigar::OpCode opCode = Cigar::ALIGN; cigarBuffer.addOperation(mappedBases, opCode); } const unsigned clipEndBases = std::distance(sequenceEnd, sequence.end()); if (clipEndBases) { cigarBuffer.addOperation(clipEndBases, Cigar::SOFT_CLIP); } const unsigned ret = updateFragmentCigar( readMetadata, contigList, contigAnnotations, fragmentMetadata, fragmentMetadata.reverse, fragmentMetadata.contigId, fragmentMetadata.position, cigarBuffer, cigarOffset); if (!ret) 
{ fragmentMetadata.setUnaligned(); } return ret; }
// A Cigar holding a single sequence-match operation of length 100 must
// serialize to "100=".
TEST(CigarStringTest, ToStdString_SingleOp)
{
    const string expectedText = "100=";

    const CigarOperation match(CigarOperationType::SEQUENCE_MATCH, 100);
    Cigar oneOp;
    oneOp.push_back(match);

    EXPECT_EQ(expectedText, oneOp.ToStdString());
}
// Report the larger of the record's two clip counts, with the side encoded in
// the sign: the begin-clip count is returned as-is when it is at least as
// large as the end-clip count, otherwise the end-clip count is returned negated.
int getMaxClipLen( SamRecord & sam_rec )
{
    Cigar * cigar_info = sam_rec.getCigarInfo();
    const int leading = cigar_info->getNumBeginClips();
    const int trailing = cigar_info->getNumEndClips();

    return (leading >= trailing) ? leading : -trailing;
}
// Parsing "100=" must yield exactly one operation: type '=' with length 100.
TEST(CigarStringTest, FromStdString_SingleOp)
{
    const string inputText = "100=";

    Cigar parsed = Cigar::FromStdString(inputText);
    ASSERT_TRUE(parsed.size() == 1);

    const CigarOperation& onlyOp = parsed.front();
    EXPECT_TRUE(onlyOp.Char() == '=');
    EXPECT_TRUE(onlyOp.Length() == 100);
}
// A Cigar built from six operations must serialize them back-to-back, in
// insertion order, as "<length><type char>" pairs.
TEST(CigarStringTest, ToStdString_MultipleOps)
{
    const string expectedText = "100=2D34I6=6X6=";

    // Operations in the order they are expected to appear in the string.
    const CigarOperation operations[] = {
        CigarOperation(CigarOperationType::SEQUENCE_MATCH,    100),
        CigarOperation(CigarOperationType::DELETION,          2),
        CigarOperation(CigarOperationType::INSERTION,         34),
        CigarOperation(CigarOperationType::SEQUENCE_MATCH,    6),
        CigarOperation(CigarOperationType::SEQUENCE_MISMATCH, 6),
        CigarOperation(CigarOperationType::SEQUENCE_MATCH,    6)
    };
    const size_t operationCount = sizeof(operations) / sizeof(operations[0]);

    Cigar built;
    for (size_t idx = 0; idx < operationCount; ++idx) {
        built.push_back(operations[idx]);
    }

    EXPECT_EQ(expectedText, built.ToStdString());
}
void Sites::addToCurrentCluster( vector<bool> & is_in_coord, SingleSite & new_site, SamRecord & rec ) { if (is_in_coord.size() != NMEI) morphError("[Sites::setNewCluster] is_in_coord size error"); // update breakpoint int old_evi = new_site.evidence; float a1 = (float)1 / float(old_evi+1); int ep = getEstimatedBreakPoint(rec); new_site.breakp = round( a1 * (float)ep + (float)new_site.breakp * (1-a1)); new_site.evidence++; // update position if (rec.get1BasedPosition() < new_site.start) new_site.start = rec.get1BasedPosition(); else if (rec.get1BasedAlignmentEnd() > new_site.end) new_site.end = rec.get1BasedAlignmentEnd(); // update info if (rec.getFlag() & 0x10) { if (new_site.right_clip_only) { Cigar * myCigar = rec.getCigarInfo(); int begin_clip = myCigar->getNumBeginClips(); if ( begin_clip < MIN_CLIP/2) new_site.right_clip_only = 0; } for(int m=0; m<NMEI; m++) { if (is_in_coord[m]) new_site.right[m]++; } } else { if (new_site.left_clip_only) { Cigar * myCigar = rec.getCigarInfo(); int end_clip = myCigar->getNumEndClips(); if (end_clip < MIN_CLIP/2) new_site.left_clip_only = 0; } for( int m=0; m<NMEI; m++) { if (is_in_coord[m]) new_site.left[m]++; } } }
void Sites::setNewCluster( vector<bool> & is_in_coord, SingleSite & new_site, SamRecord & rec ) { if (is_in_coord.size() != NMEI) morphError("[Sites::setNewCluster] is_in_coord size error"); // set info new_site.breakp = getEstimatedBreakPoint(rec); new_site.rcount = 1; new_site.evidence = 1; for(int m=0; m<NMEI; m++) { new_site.left[m] = 0; new_site.right[m] = 0; } new_site.left_clip_only = 1; new_site.right_clip_only = 1; new_site.depth = current_depth; new_site.depth_add = 1; // set position & mtype if ( rec.getFlag() & 0x10 ) { // right anchor new_site.start = rec.get1BasedPosition(); new_site.end = rec.get1BasedAlignmentEnd(); Cigar * myCigar = rec.getCigarInfo(); int begin_clip = myCigar->getNumBeginClips(); if ( begin_clip < MIN_CLIP/2) new_site.right_clip_only = 0; for(int m=0; m<NMEI; m++) { if (is_in_coord[m]) new_site.right[m] = 1; } } else { new_site.start = rec.get1BasedPosition(); new_site.end = rec.get1BasedAlignmentEnd(); Cigar * myCigar = rec.getCigarInfo(); int end_clip = myCigar->getNumEndClips(); if (end_clip < MIN_CLIP/2) new_site.left_clip_only = 0; for(int m=0; m<NMEI; m++) { if (is_in_coord[m]) new_site.left[m] = 1; } } }
// Left-align the indels of an alignment and rewrite its CIGAR accordingly.
//
//   alternateSequence  the aligned (read/alternate) bases; read only here.
//   cigar              sequence of (length, op-string) pairs; replaced by the
//                      rebuilt CIGAR whenever the alignment contains indels.
//   referenceSequence  the reference bases the CIGAR refers to; read only here.
//   debug              only consulted when VERBOSE_DEBUG is defined.
//
// Returns false when the CIGAR has no I/D operations (cigar is left untouched)
// or when the rebuilt CIGAR renders to the same text as the input; true when
// the text differs.  Calls exit(1) if an indel ends up left of its predecessor.
bool leftAlign(string& alternateSequence, Cigar& cigar, string& referenceSequence, bool debug = false) {

    int arsOffset = 0; // pointer to insertion point in aligned reference sequence
    string alignedReferenceSequence = referenceSequence;
    int aabOffset = 0;
    string alignmentAlignedBases = alternateSequence;

    // store information about the indels
    vector<VCFIndelAllele> indels;

    int rp = 0;  // read position, 0-based relative to read
    int sp = 0;  // sequence position

    // soft-clipped bases at the start / end of the read, remembered so the
    // clips can be re-emitted in the rebuilt CIGAR
    string softBegin;
    string softEnd;

    // textual form of the CIGAR before and after, compared at the very end
    stringstream cigar_before, cigar_after;

    // Pass 1: walk the CIGAR, collect every indel and build the padded
    // ("gapped") views of reference and read that are only used for debug output.
    for (vector<pair<int, string> >::const_iterator c = cigar.begin();
         c != cigar.end(); ++c) {
        unsigned int l = c->first;
        char t = c->second.at(0);
        cigar_before << l << t;
        if (t == 'M') { // match or mismatch
            sp += l;
            rp += l;
        } else if (t == 'D') { // deletion
            indels.push_back(VCFIndelAllele(false, l, sp, rp, referenceSequence.substr(sp, l)));
            alignmentAlignedBases.insert(rp + aabOffset, string(l, '-'));
            aabOffset += l;
            sp += l;  // update reference sequence position
        } else if (t == 'I') { // insertion
            indels.push_back(VCFIndelAllele(true, l, sp, rp, alternateSequence.substr(rp, l)));
            alignedReferenceSequence.insert(sp + softBegin.size() + arsOffset, string(l, '-'));
            arsOffset += l;
            rp += l;
        } else if (t == 'S') { // soft clip, clipped sequence present in the read not matching the reference
            // remove these bases from the refseq and read seq, but don't modify the alignment sequence
            if (rp == 0) {
                // nothing of the read consumed yet: this is the leading clip
                alignedReferenceSequence = string(l, '*') + alignedReferenceSequence;
                softBegin = alignmentAlignedBases.substr(0, l);
            } else {
                alignedReferenceSequence = alignedReferenceSequence + string(l, '*');
                softEnd = alignmentAlignedBases.substr(alignmentAlignedBases.size() - l, l);
            }
            rp += l;
        } else if (t == 'H') { // hard clip on the read, clipped sequence is not present in the read
        } else if (t == 'N') { // skipped region in the reference not present in read, aka splice
            sp += l;
        }
    }

    // reference span covered by the CIGAR; used to emit the trailing M run
    int alignedLength = sp;

    VCFLEFTALIGN_DEBUG("| " << cigar_before.str() << endl
                       << "| " << alignedReferenceSequence << endl
                       << "| " << alignmentAlignedBases << endl);

    // if no indels, return the alignment
    if (indels.empty()) { return false; }

    // Pass 2:
    // for each indel, from left to right
    //     while the indel sequence repeated to the left and we're not matched up with the left-previous indel
    //         move the indel left

    vector<VCFIndelAllele>::iterator previous = indels.begin();
    for (vector<VCFIndelAllele>::iterator id = indels.begin(); id != indels.end(); ++id) {

        // left shift by repeats
        //
        // from 1 base to the length of the indel, attempt to shift left
        // if the move would cause no change in alignment optimality (no
        // introduction of mismatches, and by definition no change in gap
        // length), move to the new position.
        // in practice this moves the indel left when we reach the size of
        // the repeat unit.
        //
        // NOTE: these outer variables are the ones assigned by the
        // "exchangeable flanking sequence" step below; the loop that follows
        // declares its own pair with the same names, which shadows them.
        int steppos, readsteppos;
        VCFIndelAllele& indel = *id;
        int i = 1;
        while (i <= indel.length) {

            int steppos = indel.position - i;
            int readsteppos = indel.readPosition - i;

#ifdef VERBOSE_DEBUG
            if (debug) {
                if (steppos >= 0 && readsteppos >= 0) {
                    cerr << referenceSequence.substr(steppos, indel.length) << endl;
                    cerr << alternateSequence.substr(readsteppos, indel.length) << endl;
                    cerr << indel.sequence << endl;
                }
            }
#endif
            // shift by i while the indel sequence is found i bases to the
            // left in both reference and read, and the shift stays right of
            // the previous indel
            while (steppos >= 0 && readsteppos >= 0
                   && indel.sequence == referenceSequence.substr(steppos, indel.length)
                   && indel.sequence == alternateSequence.substr(readsteppos, indel.length)
                   && (id == indels.begin()
                       || (previous->insertion && steppos >= previous->position)
                       || (!previous->insertion && steppos >= previous->position + previous->length))) {
                VCFLEFTALIGN_DEBUG((indel.insertion ? "insertion " : "deletion ") << indel << " shifting " << i << "bp left" << endl);
                indel.position -= i;
                indel.readPosition -= i;
                steppos = indel.position - i;
                readsteppos = indel.readPosition - i;
            }
            // advance i to the next value that divides the indel length
            // (or past the length, which ends the outer loop)
            do {
                ++i;
            } while (i <= indel.length && indel.length % i != 0);
        }

        // left shift indels with exchangeable flanking sequence
        //
        // for example:
        //
        //    GTTACGTT           GTTACGTT
        //    GT-----T   ---->   G-----TT
        //
        //    GTGTGACGTGT           GTGTGACGTGT
        //    GTGTG-----T   ---->   GTG-----TGT
        //
        //    GTGTG-----T           GTG-----TGT
        //    GTGTGACGTGT   ---->   GTGTGACGTGT
        //
        //
        steppos = indel.position - 1;
        readsteppos = indel.readPosition - 1;
        // move one base at a time while the base left of the indel is the same
        // in read and reference and equals the last base of the indel sequence;
        // each step rotates the indel sequence right by one base
        while (steppos >= 0 && readsteppos >= 0
               && alternateSequence.at(readsteppos) == referenceSequence.at(steppos)
               && alternateSequence.at(readsteppos) == indel.sequence.at(indel.sequence.size() - 1)
               && (id == indels.begin()
                   || (previous->insertion && indel.position - 1 >= previous->position)
                   || (!previous->insertion && indel.position - 1 >= previous->position + previous->length))) {
            VCFLEFTALIGN_DEBUG((indel.insertion ? "insertion " : "deletion ") << indel << " exchanging bases " << 1 << "bp left" << endl);
            indel.sequence = indel.sequence.at(indel.sequence.size() - 1) + indel.sequence.substr(0, indel.sequence.size() - 1);
            indel.position -= 1;
            indel.readPosition -= 1;
            steppos = indel.position - 1;
            readsteppos = indel.readPosition - 1;
        }
        // tracks previous indel, so we don't run into it with the next shift
        previous = id;
    }

    // Pass 3:
    // bring together floating indels
    // from left to right
    // check if we could merge with the next indel
    // if so, adjust so that we will merge in the next step
    if (indels.size() > 1) {
        previous = indels.begin();
        for (vector<VCFIndelAllele>::iterator id = (indels.begin() + 1); id != indels.end(); ++id) {
            VCFIndelAllele& indel = *id;
            // parsimony: could we shift right and merge with the previous indel?
            // if so, do it
            int prev_end_ref = previous->insertion ? previous->position : previous->position + previous->length;
            int prev_end_read = !previous->insertion ? previous->readPosition : previous->readPosition + previous->length;
            // NOTE(review): the insertion arm below adds readPosition to
            // itself; "readPosition + length" looks like what was meant --
            // confirm before changing, it alters which indels get merged.
            if (previous->insertion == indel.insertion
                && ((previous->insertion
                     && (previous->position < indel.position
                         && previous->readPosition + previous->readPosition < indel.readPosition))
                    ||
                    (!previous->insertion
                     && (previous->position + previous->length < indel.position)
                     && (previous->readPosition < indel.readPosition)
                    ))) {
                if (previous->homopolymer()) {
                    // bases between the end of the previous indel and this one
                    string seq = referenceSequence.substr(prev_end_ref, indel.position - prev_end_ref);
                    string readseq = alternateSequence.substr(prev_end_read, indel.position - prev_end_ref);
                    VCFLEFTALIGN_DEBUG("seq: " << seq << endl << "readseq: " << readseq << endl);
                    if (previous->sequence.at(0) == seq.at(0)
                        && FBhomopolymer(seq)
                        && FBhomopolymer(readseq)) {
                        VCFLEFTALIGN_DEBUG("moving " << *previous << " right to "
                                           << (indel.insertion ? indel.position : indel.position - previous->length) << endl);
                        previous->position = indel.insertion ? indel.position : indel.position - previous->length;
                    }
                }
                else {
                    // step right one repeat unit at a time while the unit
                    // keeps repeating in the reference
                    int pos = previous->position;
                    while (pos < (int) referenceSequence.length() &&
                           ((previous->insertion && pos + previous->length <= indel.position)
                            ||
                            (!previous->insertion && pos + previous->length < indel.position))
                           && previous->sequence
                           == referenceSequence.substr(pos + previous->length, previous->length)) {
                        pos += previous->length;
                    }
                    // NOTE(review): pos starts at previous->position and the
                    // loop above only ever adds previous->length to it, so
                    // (assuming length > 0) "pos < previous->position" looks
                    // like it can never hold and this merge never fires --
                    // confirm whether ">" was intended.
                    if (pos < previous->position &&
                        ((previous->insertion && pos + previous->length == indel.position)
                         ||
                         (!previous->insertion && pos == indel.position - previous->length))
                       ) {
                        VCFLEFTALIGN_DEBUG("right-merging tandem repeat: moving " << *previous << " right to " << pos << endl);
                        previous->position = pos;
                    }
                }
            }
            previous = id;
        }
    }

    // Pass 4:
    // for each indel
    //     if ( we're matched up to the previous insertion (or deletion)
    //          and it's also an insertion or deletion )
    //         merge the indels
    //
    // and simultaneously reconstruct the cigar

    Cigar newCigar;

    if (!softBegin.empty()) {
        newCigar.push_back(make_pair(softBegin.size(), "S"));
    }

    // indels is non-empty here (checked above), so dereferencing begin() is valid
    vector<VCFIndelAllele>::iterator id = indels.begin();
    VCFIndelAllele last = *id++;
    if (last.position > 0) {
        // matched run in front of the first indel
        newCigar.push_back(make_pair(last.position, "M"));
        newCigar.push_back(make_pair(last.length, (last.insertion ? "I" : "D")));
    } else {
        newCigar.push_back(make_pair(last.length, (last.insertion ? "I" : "D")));
    }
    // reference position right after the last emitted indel (insertions do
    // not advance on the reference)
    int lastend = last.insertion ? last.position : (last.position + last.length);
    VCFLEFTALIGN_DEBUG(last << ",");

    for (; id != indels.end(); ++id) {
        VCFIndelAllele& indel = *id;
        VCFLEFTALIGN_DEBUG(indel << ",");
        if (indel.position < lastend) {
            cerr << "impossibility?: indel realigned left of another indel" << endl
                 << referenceSequence << endl << alternateSequence << endl;
            exit(1);
        } else if (indel.position == lastend && indel.insertion == last.insertion) {
            // adjacent indel of the same kind: grow the previous operation
            pair<int, string>& op = newCigar.back();
            op.first += indel.length;
        } else if (indel.position >= lastend) {  // also catches differential indels, but with the same position
            newCigar.push_back(make_pair(indel.position - lastend, "M"));
            newCigar.push_back(make_pair(indel.length, (indel.insertion ? "I" : "D")));
        }
        last = *id;
        lastend = last.insertion ? last.position : (last.position + last.length);
    }

    // matched run after the last indel
    if (lastend < alignedLength) {
        newCigar.push_back(make_pair(alignedLength - lastend, "M"));
    }

    if (!softEnd.empty()) {
        newCigar.push_back(make_pair(softEnd.size(), "S"));
    }

    VCFLEFTALIGN_DEBUG(endl);

    cigar = newCigar;

    // render the rebuilt CIGAR the same way cigar_before was rendered
    for (vector<pair<int, string> >::const_iterator c = cigar.begin();
         c != cigar.end(); ++c) {
        unsigned int l = c->first;
        char t = c->second.at(0);
        cigar_after << l << t;
    }

    //cerr << cigar_before.str() << " changes to " << cigar_after.str() << endl;
    VCFLEFTALIGN_DEBUG(cigar_after.str() << endl);

    // check if we're realigned
    if (cigar_after.str() == cigar_before.str()) {
        return false;
    } else {
        return true;
    }

}
// A default-constructed Cigar must serialize to the empty string.
TEST(CigarStringTest, ToStdString_Empty)
{
    const string expectedText;

    Cigar noOps;

    EXPECT_EQ(expectedText, noOps.ToStdString());
}
// Parsing an empty string must produce a Cigar with no operations.
TEST(CigarStringTest, FromStdString_Empty)
{
    const string inputText = "";

    Cigar parsed = Cigar::FromStdString(inputText);

    EXPECT_TRUE(parsed.empty());
}
void BAMUtils::padded_alignment() { Cigar cig = bam_record.get_cigar(); Sequence tdna = bam_record.get_seq(); int sdna_pos = 0; int tdna_pos = 0; pad_source.reserve(t_dna.length()); pad_target.reserve(t_dna.length()); pad_match.reserve(t_dna.length()); Sequence::iterator tdna_itr = tdna.get_iterator(); int tot = 0; //find out if the first cigar op could be soft clipped or not is_three_prime_soft_clipped = false; for (Cigar::iterator i = cig.get_iterator(); i.good(); i.next()) { //i.op(); i.len(); if (this->bam_record.mapped_reverse_strand()) { if (tot > ( cig.get_length( ) - 3) ){ if (i.op() == 'S') is_three_prime_soft_clipped = true; else is_three_prime_soft_clipped = false; } } else { if (tot < 2) { if (i.op() == 'S') is_three_prime_soft_clipped = true; else is_three_prime_soft_clipped = false; } } if (i.op() == 'I' ) { pad_source.append(i.len(), '-'); int count = 0; tdna_itr.set_position(tdna_pos); while (tdna_itr.good()) { if (count >= i.len()) { break; } else { pad_target += tdna_itr.get(); tdna_itr.next(); tdna_pos++; count++; } } pad_match.append(i.len(), '+'); } else if(i.op() == 'D' || i.op() == 'N') { pad_source.append( t_dna.substr(sdna_pos, i.len())); sdna_pos += i.len(); pad_target.append(i.len(), '-'); pad_match.append(i.len(), '-'); } else if(i.op() == 'P') { pad_source.append(i.len(), '*'); pad_target.append(i.len(), '*'); pad_match.append(i.len(), ' '); } else if (i.op() == 'S') { if (!truncate_soft_clipped) { pad_source.append(i.len(), '-'); pad_match.append(i.len(), '+'); pad_target.append(i.len(), '+'); } int count = 0; while (tdna_itr.good()) { if (count >= i.len()) { break; } tdna_pos++; tdna_itr.next(); count++; } } else if (i.op() == 'H') { //nothing for clipped bases }else { std::string ps, pt, pm; ps.reserve(i.len()); pm.reserve(i.len()); ps = t_dna.substr(sdna_pos,i.len()); //tdna is really qdna tdna_itr.set_position(tdna_pos); int count = 0; while (tdna_itr.good()) { if (count < i.len()) { pt += tdna_itr.get(); } else { break; } 
tdna_itr.next(); count++; } for (unsigned int z = 0; z < ps.length(); z++) { if (ps[z] == pt[z]) { pad_match += '|'; } else if (ps[z] != 'A' || ps[z] != 'C' || ps[z] != 'G' || ps[z] != 'T') { if (iupac_flag) { std::vector<char> nukes(IUPAC::get_base(ps[z])); bool replaced = false; unsigned int nuke_ptr = 0; for (unsigned int n = 0; n < nukes.size(); n++) { if (nukes[n] == pt[z]) { pad_match += '|'; replaced = true; nuke_ptr = n; break; } //nuke_ptr++; } if (!replaced) { pad_match += ' '; } else if (!keep_iupac) { //std::cerr << "nukes["<<nuke_ptr<<"]: " << nukes[nuke_ptr] << " nukes.size() " << nukes.size() << std::endl; ps[z] = nukes[nuke_ptr]; }//keep_iupac }//iupac_flag else { pad_match += ' '; } }//end else if checking ps[z] agianst nukes else { pad_match += ' '; } }//end for loop pad_source += ps; pad_target += pt; sdna_pos += i.len(); tdna_pos += i.len(); } tot++; } /* std::cerr << "pad_source: " << pad_source << std::endl; std::cerr << "pad_target: " << pad_target << std::endl; std::cerr << "pad_match : " << pad_match << std::endl; */ }
void BAMUtils::dna() { MD md = bam_record.get_md(); Cigar cig = bam_record.get_cigar(); Sequence qseq = bam_record.get_seq(); int position = 0; std::string seq; Sequence::iterator qseq_itr = qseq.get_iterator(); for (Cigar::iterator i = cig.get_iterator(); i.good(); i.next()) { if (i.op() == 'M') { int count = 0; while (qseq_itr.good()) { if (count >= i.len()) { break; } else { seq += qseq_itr.get(); qseq_itr.next(); count++; } } } else if ((i.op() == 'I') || (i.op() == 'S')) { int count = 0; while (qseq_itr.good()) { if (count >= i.len()) { break; } qseq_itr.next(); count++; } //bool is_error = false; if (i.op() == 'S') { soft_clipped_bases += i.len(); //is_error = true; } } position++; } t_dna.reserve(seq.length()); int start = 0; MD::iterator md_itr = md.get_iterator(); std::string num; coord_t md_len = 0; char cur; while (md_itr.good()) { cur = md_itr.get(); if (std::isdigit(cur)) { num+=cur; //md_itr.next(); } else { if (num.length() > 0) { md_len = convert(num); num.clear(); t_dna += seq.substr(start, md_len); start += md_len; } } if (cur == '^') { //get nuc md_itr.next(); char nuc = md_itr.get(); while (std::isalpha(nuc)) { t_dna += nuc; md_itr.next(); nuc = md_itr.get(); } num += nuc; //it's a number now will //lose this value if i don't do it here //cur = nuc; } else if (std::isalpha(cur)) { t_dna += cur; start++; } md_itr.next(); } //clean up residual num if there is any if (num.length() > 0) { md_len = convert(num); num.clear(); t_dna += seq.substr(start, md_len); start += md_len; } }
int Stats::execute(int argc, char **argv) { // Extract command line arguments. String inFile = ""; String indexFile = ""; bool basic = false; bool noeof = false; bool params = false; bool qual = false; bool phred = false; int maxNumReads = -1; bool unmapped = false; String pBaseQC = ""; String cBaseQC = ""; String regionList = ""; int excludeFlags = 0; int requiredFlags = 0; bool withinRegion = false; int minMapQual = 0; String dbsnp = ""; PosList *dbsnpListPtr = NULL; bool baseSum = false; int bufferSize = PileupHelper::DEFAULT_WINDOW_SIZE; ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in", &inFile) LONG_PARAMETER_GROUP("Types of Statistics") LONG_PARAMETER("basic", &basic) LONG_PARAMETER("qual", &qual) LONG_PARAMETER("phred", &phred) LONG_STRINGPARAMETER("pBaseQC", &pBaseQC) LONG_STRINGPARAMETER("cBaseQC", &cBaseQC) LONG_PARAMETER_GROUP("Optional Parameters") LONG_INTPARAMETER("maxNumReads", &maxNumReads) LONG_PARAMETER("unmapped", &unmapped) LONG_STRINGPARAMETER("bamIndex", &indexFile) LONG_STRINGPARAMETER("regionList", ®ionList) LONG_INTPARAMETER("excludeFlags", &excludeFlags) LONG_INTPARAMETER("requiredFlags", &requiredFlags) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("params", ¶ms) LONG_PARAMETER_GROUP("Optional phred/qual Only Parameters") LONG_PARAMETER("withinRegion", &withinRegion) LONG_PARAMETER_GROUP("Optional BaseQC Only Parameters") LONG_PARAMETER("baseSum", &baseSum) LONG_INTPARAMETER("bufferSize", &bufferSize) LONG_INTPARAMETER("minMapQual", &minMapQual) LONG_STRINGPARAMETER("dbsnp", &dbsnp) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); inputParameters.Read(argc-1, &(argv[1])); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. 
BgzfFileType::setRequireEofBlock(false); } // Check to see if the in file was specified, if not, report an error. if(inFile == "") { usage(); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--in is a mandatory argument for stats, " << "but was not specified" << std::endl; return(-1); } // Use the index file if unmapped or regionList is not empty. bool useIndex = (unmapped|| (!regionList.IsEmpty())); // IndexFile is required, so check to see if it has been set. if(useIndex && (indexFile == "")) { // In file was not specified, so set it to the in file // + ".bai" indexFile = inFile + ".bai"; } //////////////////////////////////////// // Setup in case pileup is used. Pileup<PileupElementBaseQCStats> pileup(bufferSize); // Initialize start/end positions. myStartPos = 0; myEndPos = -1; // Open the output qc file if applicable. IFILE baseQCPtr = NULL; if(!pBaseQC.IsEmpty() && !cBaseQC.IsEmpty()) { usage(); inputParameters.Status(); // Cannot specify both types of baseQC. std::cerr << "Cannot specify both --pBaseQC & --cBaseQC." << std::endl; return(-1); } else if(!pBaseQC.IsEmpty()) { baseQCPtr = ifopen(pBaseQC, "w"); PileupElementBaseQCStats::setPercentStats(true); } else if(!cBaseQC.IsEmpty()) { baseQCPtr = ifopen(cBaseQC, "w"); PileupElementBaseQCStats::setPercentStats(false); } if(baseQCPtr != NULL) { PileupElementBaseQCStats::setOutputFile(baseQCPtr); PileupElementBaseQCStats::printHeader(); } if((baseQCPtr != NULL) || baseSum) { PileupElementBaseQCStats::setMapQualFilter(minMapQual); PileupElementBaseQCStats::setBaseSum(baseSum); } if(params) { inputParameters.Status(); } // Open the file for reading. SamFile samIn; if(!samIn.OpenForRead(inFile)) { fprintf(stderr, "%s\n", samIn.GetStatusMessage()); return(samIn.GetStatus()); } samIn.SetReadFlags(requiredFlags, excludeFlags); // Set whether or not basic statistics should be generated. samIn.GenerateStatistics(basic); // Read the sam header. 
SamFileHeader samHeader; if(!samIn.ReadHeader(samHeader)) { fprintf(stderr, "%s\n", samIn.GetStatusMessage()); return(samIn.GetStatus()); } // Open the bam index file for reading if we are // doing unmapped reads (also set the read section). if(useIndex) { samIn.ReadBamIndex(indexFile); if(unmapped) { samIn.SetReadSection(-1); } if(!regionList.IsEmpty()) { myRegionList = ifopen(regionList, "r"); } } ////////////////////////// // Read dbsnp if specified and doing baseQC if(((baseQCPtr != NULL) || baseSum) && (!dbsnp.IsEmpty())) { // Read the dbsnp file. IFILE fdbSnp; fdbSnp = ifopen(dbsnp,"r"); // Determine how many entries. const SamReferenceInfo& refInfo = samHeader.getReferenceInfo(); int maxRefLen = 0; for(int i = 0; i < refInfo.getNumEntries(); i++) { int refLen = refInfo.getReferenceLength(i); if(refLen >= maxRefLen) { maxRefLen = refLen + 1; } } dbsnpListPtr = new PosList(refInfo.getNumEntries(),maxRefLen); if(fdbSnp==NULL) { std::cerr << "Open dbSNP file " << dbsnp.c_str() << " failed!\n"; } else if(dbsnpListPtr == NULL) { std::cerr << "Failed to init the memory allocation for the dbsnpList.\n"; } else { // Read the dbsnp file. StringArray tokens; String buffer; int position = 0; int refID = 0; // Loop til the end of the file. while (!ifeof(fdbSnp)) { // Read the next line. buffer.ReadLine(fdbSnp); // If it does not have at least 2 columns, // continue to the next line. if (buffer.IsEmpty() || buffer[0] == '#') continue; tokens.AddTokens(buffer); if(tokens.Length() < 2) continue; if(!tokens[1].AsInteger(position)) { std::cerr << "Improperly formatted region line, start position " << "(2nd column) is not an integer: " << tokens[1] << "; Skipping to the next line.\n"; continue; } // Look up the reference name. 
refID = samHeader.getReferenceID(tokens[0]); if(refID != SamReferenceInfo::NO_REF_ID) { // Reference id was found, so add it to the dbsnp dbsnpListPtr->addPosition(refID, position); } tokens.Clear(); buffer.Clear(); } } ifclose(fdbSnp); } // Read the sam records. SamRecord samRecord; int numReads = 0; ////////////////////// // Setup in case doing a quality count. // Quality histogram. const int MAX_QUAL = 126; const int START_QUAL = 33; uint64_t qualCount[MAX_QUAL+1]; for(int i = 0; i <= MAX_QUAL; i++) { qualCount[i] = 0; } const int START_PHRED = 0; const int PHRED_DIFF = START_QUAL - START_PHRED; const int MAX_PHRED = MAX_QUAL - PHRED_DIFF; uint64_t phredCount[MAX_PHRED+1]; for(int i = 0; i <= MAX_PHRED; i++) { phredCount[i] = 0; } int refPos = 0; Cigar* cigarPtr = NULL; char cigarChar = '?'; // Exclude clips from the qual/phred counts if unmapped reads are excluded. bool qualExcludeClips = excludeFlags & SamFlag::UNMAPPED; ////////////////////////////////// // When not reading by sections, getNextSection returns true // the first time, then false the next time. while(getNextSection(samIn)) { // Keep reading records from the file until SamFile::ReadRecord // indicates to stop (returns false). while(((maxNumReads < 0) || (numReads < maxNumReads)) && samIn.ReadRecord(samHeader, samRecord)) { // Another record was read, so increment the number of reads. ++numReads; // See if the quality histogram should be genereated. if(qual || phred) { // Get the quality. const char* qual = samRecord.getQuality(); // Check for no quality ('*'). if((qual[0] == '*') && (qual[1] == 0)) { // This record does not have a quality string, so no // quality processing is necessary. } else { int index = 0; cigarPtr = samRecord.getCigarInfo(); cigarChar = '?'; refPos = samRecord.get0BasedPosition(); if(!qualExcludeClips && (cigarPtr != NULL)) { // Offset the reference position by any soft clips // by subtracting the queryIndex of this start position. 
// refPos is now the start position of the clips. refPos -= cigarPtr->getQueryIndex(0); } while(qual[index] != 0) { // Skip this quality if it is clipped and we are skipping clips. if(cigarPtr != NULL) { cigarChar = cigarPtr->getCigarCharOpFromQueryIndex(index); } if(qualExcludeClips && Cigar::isClip(cigarChar)) { // Skip a clipped quality. ++index; // Increment the position. continue; } if(withinRegion && (myEndPos != -1) && (refPos >= myEndPos)) { // We have hit the end of the region, stop processing this // quality string. break; } if(withinRegion && (refPos < myStartPos)) { // This position is not in the target. ++index; // Update the position if this is found in the reference or a clip. if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar)) { ++refPos; } continue; } // Check for valid quality. if((qual[index] < START_QUAL) || (qual[index] > MAX_QUAL)) { if(qual) { std::cerr << "Invalid Quality found: " << qual[index] << ". Must be between " << START_QUAL << " and " << MAX_QUAL << ".\n"; } if(phred) { std::cerr << "Invalid Phred Quality found: " << qual[index] - PHRED_DIFF << ". Must be between " << START_QUAL << " and " << MAX_QUAL << ".\n"; } // Skip an invalid quality. ++index; // Update the position if this is found in the reference or a clip. if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar)) { ++refPos; } continue; } // Increment the count for this quality. ++(qualCount[(int)(qual[index])]); ++(phredCount[(int)(qual[index]) - PHRED_DIFF]); // Update the position if this is found in the reference or a clip. if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar)) { ++refPos; } ++index; } } } // Check the next thing to do for the read. if((baseQCPtr != NULL) || baseSum) { // Pileup the bases for this read. pileup.processAlignmentRegion(samRecord, myStartPos, myEndPos, dbsnpListPtr); } } // Done with a section, move on to the next one. // New section, so flush the pileup. 
pileup.flushPileup(); } // Flush the rest of the pileup. if((baseQCPtr != NULL) || baseSum) { // Pileup the bases. pileup.processAlignmentRegion(samRecord, myStartPos, myEndPos, dbsnpListPtr); PileupElementBaseQCStats::printSummary(); ifclose(baseQCPtr); } std::cerr << "Number of records read = " << samIn.GetCurrentRecordCount() << std::endl; if(basic) { std::cerr << std::endl; samIn.PrintStatistics(); } // Print the quality stats. if(qual) { std::cerr << std::endl; std::cerr << "Quality\tCount\n"; for(int i = START_QUAL; i <= MAX_QUAL; i++) { std::cerr << i << "\t" << qualCount[i] << std::endl; } } // Print the phred quality stats. if(phred) { std::cerr << std::endl; std::cerr << "Phred\tCount\n"; for(int i = START_PHRED; i <= MAX_PHRED; i++) { std::cerr << i << "\t" << phredCount[i] << std::endl; } } SamStatus::Status status = samIn.GetStatus(); if(status == SamStatus::NO_MORE_RECS) { // A status of NO_MORE_RECS means that all reads were successful. status = SamStatus::SUCCESS; } return(status); }
// Soft clip the cigar from the front and/or the back, writing the value
// into the new cigar.
//
// Parameters:
//   oldCigar      - cigar to be clipped; only read through its accessors here.
//   numFrontClips - number of read (query) positions to soft clip at the front.
//   numBackClips  - number of read (query) positions to soft clip at the back.
//   startPos      - in/out: alignment start passed to
//                   oldCigar.getRefPosition(); overwritten only when front
//                   clips were applied, the result is not clip-only, and
//                   getRefPosition() did not return Cigar::INDEX_NA.
//   updatedCigar  - receives the clipped cigar operations.  It is only
//                   appended to here (never cleared), so the caller is
//                   presumably expected to pass it in empty - TODO confirm.
//
// Returns:
//   NONE     - both clip counts are 0; updatedCigar and startPos are untouched.
//   FILTERED - the requested clips cover the whole read (returned before
//              anything is written), or the rebuilt cigar ended up containing
//              only clip operations.
//   CLIPPED  - otherwise.
SamFilter::FilterStatus SamFilter::softClip(Cigar& oldCigar,
                                            int32_t numFrontClips,
                                            int32_t numBackClips,
                                            int32_t& startPos,
                                            CigarRoller& updatedCigar)
{
    int32_t readLength = oldCigar.getExpectedQueryBaseCount();
    // Count of read positions that remain in front of the back clip.
    int32_t endClipPos = readLength - numBackClips;
    FilterStatus status = NONE;

    if((numFrontClips != 0) || (numBackClips != 0))
    {
        // Clipping from front and/or from the back.

        // Check to see if the entire read was clipped.
        int32_t totalClips = numFrontClips + numBackClips;
        if(totalClips >= readLength)
        {
            /////////////////////////////
            // The entire read is clipped, so rather than clipping it,
            // filter it out.
            return(FILTERED);
        }

        // Part of the read was clipped.
        status = CLIPPED;

        // Loop through, creating an updated cigar.
        int origCigarOpIndex = 0;

        // Track how many read positions are covered up to this
        // point by the cigar to determine up to what
        // point in the cigar is affected by this clipping.
        int32_t numPositions = 0;

        // Track if any non-clips are in the new cigar.
        bool onlyClips = true;

        // Last operation examined.  It is deliberately kept across the loops
        // below: the front-clip handling needs the operation on which the
        // first loop stopped in order to re-add its unclipped remainder.
        const Cigar::CigarOperator* op = NULL;

        //////////////////
        // Clip from front
        // Consume whole operations until at least numFrontClips read
        // positions have been covered.
        while((origCigarOpIndex < oldCigar.size()) &&
              (numPositions < numFrontClips))
        {
            op = &(oldCigar.getOperator(origCigarOpIndex));
            switch(op->operation)
            {
                case Cigar::hardClip:
                    // Keep this operation as the new clips do not
                    // affect other clips.
                    updatedCigar += *op;
                    break;
                case Cigar::del:
                case Cigar::skip:
                    // Skip and delete are going to be dropped, and
                    // are not in the read, so the read index doesn't
                    // need to be updated
                    break;
                case Cigar::insert:
                case Cigar::match:
                case Cigar::mismatch:
                case Cigar::softClip:
                    // Update the read index as these types
                    // are found in the read.
                    numPositions += op->count;
                    break;
                case Cigar::none:
                default:
                    // Nothing to do for none.
                    break;
            };
            ++origCigarOpIndex;
        }

        // If bases were clipped from the front, add the clip and
        // any partial cigar operation as necessary.
        if(numFrontClips != 0)
        {
            // Add the softclip to the front of the read.
            updatedCigar.Add(Cigar::softClip, numFrontClips);

            // Add the rest of the last Cigar operation if
            // it is not entirely clipped.
            // numPositions can overshoot numFrontClips because the loop above
            // consumes whole operations; the overshoot is the unclipped tail
            // of the last operation consumed.
            int32_t newCount = numPositions - numFrontClips;
            if(newCount > 0)
            {
                // Before adding it, check to see if the same
                // operation is clipped from the end.
                // numPositions greater than the endClipPos
                // means that it is equal or past that position,
                // so shorten the number of positions.
                if(numPositions > endClipPos)
                {
                    newCount -= (numPositions - endClipPos);
                }
                if(newCount > 0)
                {
                    updatedCigar.Add(op->operation, newCount);
                    if(!Cigar::isClip(op->operation))
                    {
                        onlyClips = false;
                    }
                }
            }
        }

        // Add operations until the point of the end clip is reached.
        // For example...
        //   2M1D3M = MMDMMM  readLength = 5
        // readIndex: 01 234
        //   at cigarOpIndex 0 (2M), numPositions = 2.
        //   at cigarOpIndex 1 (1D), numPositions = 2.
        //   at cigarOpIndex 2 (3M), numPositions = 5.
        // if endClipPos = 2, we still want to consume the 1D, so
        // need to keep looping until numPositions > endClipPos
        while((origCigarOpIndex < oldCigar.size()) &&
              (numPositions <= endClipPos))
        {
            op = &(oldCigar.getOperator(origCigarOpIndex));

            // Update the numPositions count if the operations indicates
            // bases within the read.
            if(!Cigar::foundInQuery(op->operation))
            {
                // This operation is not in the query read sequence,
                // so it is not yet to the endClipPos, just add the
                // operation do not increment the number of positions.
                updatedCigar += *op;
                if(!Cigar::isClip(op->operation))
                {
                    onlyClips = false;
                }
            }
            else
            {
                // This operation appears in the query sequence, so
                // check to see if the clip occurs in this operation.

                // endClipPos is 0 based & numPositions is a count.
                // If endClipPos is 4, then it is the 5th position.
                // If 4 positions are covered so far (numPositions = 4),
                // then we are right at endCLipPos: 4-4 = 0, none of
                // this operation should be kept.
                // If only 3 positions were covered, then we are at offset
                // 3, so offset 3 should be added: 4-3 = 1.
                // The loop condition guarantees numPositions <= endClipPos,
                // so this unsigned difference cannot wrap.
                uint32_t numPosTilClip = endClipPos - numPositions;

                if(numPosTilClip < op->count)
                {
                    // this operation is partially clipped, write the part
                    // that was not clipped if it is not all clipped.
                    if(numPosTilClip != 0)
                    {
                        updatedCigar.Add(op->operation, numPosTilClip);
                        if(!Cigar::isClip(op->operation))
                        {
                            onlyClips = false;
                        }
                    }
                }
                else
                {
                    // This operation is not clipped, so add it
                    updatedCigar += *op;
                    if(!Cigar::isClip(op->operation))
                    {
                        onlyClips = false;
                    }
                }
                // This operation occurs in the query sequence, so
                // increment the number of positions covered.
                numPositions += op->count;
            }

            // Move to the next cigar position.
            ++origCigarOpIndex;
        }

        //////////////////
        // Add the softclip to the back.
        if(numBackClips != 0)
        {
            // Add the softclip to the end
            updatedCigar.Add(Cigar::softClip, numBackClips);
        }

        //////////////////
        // Add any hardclips remaining in the original cigar to the back.
        // Every other remaining operation lies in the back-clipped region
        // and is dropped.
        while(origCigarOpIndex < oldCigar.size())
        {
            op = &(oldCigar.getOperator(origCigarOpIndex));
            if(op->operation == Cigar::hardClip)
            {
                // Keep this operation as the new clips do not
                // affect other clips.
                updatedCigar += *op;
            }
            ++origCigarOpIndex;
        }

        // Check to see if the new cigar is only clips.
        if(onlyClips)
        {
            // Only clips in the new cigar, so mark the read as filtered
            // instead of updating the cigar.
            /////////////////////////////
            // The entire read was clipped.
            // Note: updatedCigar has already been written to at this point.
            status = FILTERED;
        }
        else
        {
            // Part of the read was clipped.
            // Update the starting position if a clip was added to
            // the front.
            if(numFrontClips > 0)
            {
                // Convert from query index to reference position (from the
                // old cigar)
                // Get the position for the last front clipped position by
                // getting the position associated with the clipped base on
                // the reference.  Then add one to get to the first
                // non-clipped position.
                int32_t lastFrontClipPos = numFrontClips - 1;
                int32_t newStartPos = oldCigar.getRefPosition(lastFrontClipPos,
                                                              startPos);
                // When getRefPosition() reports INDEX_NA for the last clipped
                // base, startPos is left exactly as the caller passed it in.
                if(newStartPos != Cigar::INDEX_NA)
                {
                    // Add one to get first non-clipped position.
                    startPos = newStartPos + 1;
                }
            }
        }
    }
    return(status);
}
// Soft Clip from the beginning of the read to the specified reference position. int32_t CigarHelper::softClipBeginByRefPos(SamRecord& record, int32_t refPosition0Based, CigarRoller& newCigar, int32_t &new0BasedPosition) { newCigar.clear(); Cigar* cigar = record.getCigarInfo(); if(cigar == NULL) { // Failed to get the cigar. ErrorHandler::handleError("Soft clipping, but failed to read the cigar"); return(NO_CLIP); } // No cigar or position in the record, so return no clip. if((cigar->size() == 0) || (record.get0BasedPosition() == -1)) { return(NO_CLIP); } // Check to see if the reference position occurs before the record starts, // if it does, do no clipping. if(refPosition0Based < record.get0BasedPosition()) { // Not within this read, so nothing to clip. newCigar.Set(record.getCigar()); return(NO_CLIP); } // The position falls after the read starts, so loop through until the // position or the end of the read is found. int32_t readClipPosition = 0; bool clipWritten = false; new0BasedPosition = record.get0BasedPosition(); for(int i = 0; i < cigar->size(); i++) { const Cigar::CigarOperator* op = &(cigar->getOperator(i)); if(clipWritten) { // Clip point has been found, so just add everything. newCigar += *op; // Go to the next operation. continue; } // The clip point has not yet been found, so check to see if we found // it now. // Not a clip, check to see if the operation is found in the // reference. if(Cigar::foundInReference(*op)) { // match, mismatch, deletion, skip // increment the current reference position to just past this // operation. new0BasedPosition += op->count; // Check to see if this is also in the query, because otherwise // the operation is still being consumed. if(Cigar::foundInQuery(*op)) { // Also in the query, determine if the entire thing should // be clipped or just part of it. uint32_t numKeep = 0; // Check to see if we have hit our clip position. 
if(refPosition0Based < new0BasedPosition) { // The specified clip position is in this cigar operation. numKeep = new0BasedPosition - refPosition0Based - 1; if(numKeep > op->count) { // Keep the entire read. This happens because // we keep reading until the first match/mismatch // after the clip. numKeep = op->count; } } // Add the part of this operation that is being clipped // to the clip count. readClipPosition += (op->count - numKeep); // Only write the clip if we found a match/mismatch // to write. Otherwise we will keep accumulating clips // for the case of insertions. if(numKeep > 0) { new0BasedPosition -= numKeep; newCigar.Add(Cigar::softClip, readClipPosition); // Add the clipped part of this cigar to the clip // position. newCigar.Add(op->operation, numKeep); // Found a match after the clip point, so stop // consuming cigar operations. clipWritten = true; continue; } } } else { // Only add hard clips. The softclips will be added in // when the total number is found. if(op->operation == Cigar::hardClip) { // Check if this is the first operation, if so, just write it. if(i == 0) { newCigar += *op; } // Check if it is the last operation (otherwise skip it). else if(i == (cigar->size() - 1)) { // Check whether or not the clip was ever written, and if // not, write it. if(clipWritten == false) { newCigar.Add(Cigar::softClip, readClipPosition); // Since no match/mismatch was ever found, set // the new ref position to the original one. new0BasedPosition = record.get0BasedPosition(); clipWritten = true; } // Add the hard clip. newCigar += *op; } } // Not yet to the clip position, so do not add this operation. if(Cigar::foundInQuery(*op)) { // Found in the query, so update the read clip position. readClipPosition += op->count; } } } // End loop through cigar. // Check whether or not the clip was ever written, and if // not, write it. 
if(clipWritten == false) { newCigar.Add(Cigar::softClip, readClipPosition); // Since no match/mismatch was ever found, set // the new ref position to the original one. new0BasedPosition = record.get0BasedPosition(); } // Subtract 1 since readClipPosition atually contains the first 0based // position that is not clipped. return(readClipPosition - 1); }
// Soft Clip from the end of the read at the specified reference position. int32_t CigarHelper::softClipEndByRefPos(SamRecord& record, int32_t refPosition0Based, CigarRoller& newCigar) { newCigar.clear(); Cigar* cigar = record.getCigarInfo(); if(cigar == NULL) { // Failed to get the cigar. ErrorHandler::handleError("Soft clipping, but failed to read the cigar"); return(NO_CLIP); } // No cigar or position in the record, so return no clip. if((cigar->size() == 0) || (record.get0BasedPosition() == -1)) { return(NO_CLIP); } // Check to see if the reference position occurs after the record ends, // if so, do no clipping. if(refPosition0Based > record.get0BasedAlignmentEnd()) { // Not within this read, so nothing to clip. newCigar.Set(record.getCigar()); return(NO_CLIP); } // The position falls before the read ends, so loop through until the // position is found. int32_t currentRefPosition = record.get0BasedPosition(); int32_t readClipPosition = 0; for(int i = 0; i < cigar->size(); i++) { const Cigar::CigarOperator* op = &(cigar->getOperator(i)); // If the operation is found in the reference, increase the // reference position. if(Cigar::foundInReference(*op)) { // match, mismatch, deletion, skip // increment the current reference position to just past // this operation. currentRefPosition += op->count; } // Check to see if we have hit our clip position. if(refPosition0Based < currentRefPosition) { // If this read is also in the query (match/mismatch), // write the partial op to the new cigar. int32_t numKeep = 0; if(Cigar::foundInQuery(*op)) { numKeep = op->count - (currentRefPosition - refPosition0Based); if(numKeep > 0) { newCigar.Add(op->operation, numKeep); readClipPosition += numKeep; } } else if(Cigar::isClip(*op)) { // This is a hard clip, so write it. newCigar.Add(op->operation, op->count); } else { // Not found in the query (skip/deletion), // so don't write any of the operation. } // Found the clip point, so break. 
break; } else if(refPosition0Based == currentRefPosition) { newCigar += *op; if(Cigar::foundInQuery(*op)) { readClipPosition += op->count; } } else { // Not yet to the clip position, so add this operation/size to // the new cigar. newCigar += *op; if(Cigar::foundInQuery(*op)) { // Found in the query, so update the read clip position. readClipPosition += op->count; } } } // End loop through cigar. // Before adding the softclip, read from the end of the cigar checking to // see if the operations are in the query, removing operations that are // not (pad/delete/skip) until a hardclip or an operation in the query is // found. We do not want a pad/delete/skip right before a softclip. for(int j = newCigar.size() - 1; j >= 0; j--) { const Cigar::CigarOperator* op = &(newCigar.getOperator(j)); if(!Cigar::foundInQuery(*op) && !Cigar::isClip(*op)) { // pad/delete/skip newCigar.Remove(j); } else if(Cigar::foundInQuery(*op) & Cigar::isClip(*op)) { // Soft clip, so increment the clip position for the return value. // Remove the softclip since the readClipPosition is used to // calculate teh size of the soft clip added. readClipPosition -= op->count; newCigar.Remove(j); } else { // Found a cigar operation that should not be deleted, so stop deleting. break; } } // Determine the number of soft clips. int32_t numSoftClips = record.getReadLength() - readClipPosition; // NOTE that if the previous operation is a softclip, the CigarRoller logic // will merge this with that one. newCigar.Add(Cigar::softClip, numSoftClips); // Check if an ending hard clip needs to be added. if(cigar->size() != 0) { const Cigar::CigarOperator* lastOp = &(cigar->getOperator(cigar->size() - 1)); if(lastOp->operation == Cigar::hardClip) { newCigar += *lastOp; } } return(readClipPosition); }
/**
 * \brief Render a sub-range of a cigar buffer as a string.
 *
 * \param cigarBuffer buffer holding the cigar operations
 * \param offset      index of the first operation to render
 * \param length      number of operations to render
 *
 * \return result of the iterator-pair toString overload applied to
 *         [begin + offset, begin + offset + length)
 *
 * Asserts that the requested range lies inside cigarBuffer.
 */
std::string Cigar::toString(const Cigar &cigarBuffer, unsigned offset, unsigned length)
{
    // Checked without computing offset + length: that sum is evaluated in
    // unsigned arithmetic and could wrap around, letting an out-of-range
    // request slip past the assertion.
    const bool rangeInsideBuffer =
        (offset <= cigarBuffer.size()) && (length <= cigarBuffer.size() - offset);
    ISAAC_ASSERT_MSG(rangeInsideBuffer, "Requested end is outside of cigarBuffer");
    return toString(cigarBuffer.begin() + offset, cigarBuffer.begin() + offset + length);
}
// Add an entry to this pileup element. void PileupElementBaseQual::addEntry(SamRecord& record) { // Call the base class: PileupElement::addEntry(record); if(myRefAllele.empty()) { genomeIndex_t markerIndex = (*myRefSeq).getGenomePosition(getChromosome(), static_cast<uint32_t>(getRefPosition()+1)); myRefAllele = (*myRefSeq)[markerIndex]; } // Increment the index ++myIndex; // if the index has gone beyond the allocated space, double the size. if(myIndex >= myAllocatedSize) { char* tempBuffer = (char*)realloc(myBases, myAllocatedSize * 2); if(tempBuffer == NULL) { std::cerr << "Memory Allocation Failure\n"; // TODO return; } myBases = tempBuffer; int8_t* tempInt8Buffer = (int8_t*)realloc(myMapQualities, myAllocatedSize * 2 * sizeof(int8_t)); if(tempInt8Buffer == NULL) { std::cerr << "Memory Allocation Failure\n"; // TODO return; } myMapQualities = tempInt8Buffer; tempInt8Buffer = (int8_t*)realloc(myQualities, myAllocatedSize * 2 * sizeof(int8_t)); if(tempInt8Buffer == NULL) { std::cerr << "Memory Allocation Failure\n"; // TODO return; } myQualities = tempInt8Buffer; tempBuffer = (char*)realloc(myStrands, myAllocatedSize * 2); if(tempBuffer == NULL) { std::cerr << "Memory Allocation Failure\n"; // TODO return; } myStrands = tempBuffer; tempInt8Buffer = (int8_t*)realloc(myCycles, myAllocatedSize * 2 * sizeof(int8_t)); if(tempInt8Buffer == NULL) { std::cerr << "Memory Allocation Failure\n"; // TODO return; } myCycles = tempInt8Buffer; int16_t* tempInt16Buffer = (int16_t*)realloc(myGLScores, myAllocatedSize * 2 * sizeof(int16_t)); if(tempInt8Buffer == NULL) { std::cerr << "Memory Allocation Failure\n"; // TODO return; } myGLScores = tempInt16Buffer; myAllocatedSize = myAllocatedSize * 2; } Cigar* cigar = record.getCigarInfo(); if(cigar == NULL) { throw std::runtime_error("Failed to retrieve cigar info from the record."); } int32_t readIndex = cigar->getQueryIndex(getRefPosition(), record.get0BasedPosition()); // If the readPosition is N/A, this is a deletion. 
if(readIndex != CigarRoller::INDEX_NA) { char base = record.getSequence(readIndex); int8_t mapQual = record.getMapQuality(); //-33 to obtain the PHRED base quality char qual = record.getQuality(readIndex) - 33; if(qual == UNSET_QUAL) { qual = ' '; } char strand = (record.getFlag() & 0x0010) ? 'R' : 'F'; int cycle = strand == 'F' ? readIndex + 1 : record.getReadLength() - readIndex; myBases[myIndex] = base; myMapQualities[myIndex] = mapQual; myQualities[myIndex] = qual; myStrands[myIndex] = strand; myCycles[myIndex] = cycle; } else if(myAddDelAsBase) { int8_t mapQual = record.getMapQuality(); char strand = (record.getFlag() & 0x0010) ? 'R' : 'F'; myBases[myIndex] = '-'; myMapQualities[myIndex] = mapQual; myQualities[myIndex] = -1; myStrands[myIndex] = strand; myCycles[myIndex] = -1; } else { // Do not add a deletion. // Did not add any entries, so decrement the index counter since the // index was not used. --myIndex; } }
// Feed one record into the recalibration table being built.
//
// Updates the per-flag counters, then rejects the record (returning false and
// counting it in myNumBuildSkipped) when:
//   - any of myIntBuildExcludeFlags is set in its flag,
//   - its mapping quality is 0 or 255,
//   - its position cannot be located in the reference genome,
//   - the quality string to use does not match the read length, or
//   - its cigar cannot be retrieved.
// Otherwise walks the read cycle by cycle and calls
// hasherrormodel.setCell() for every base that passes the per-base filters
// (aligned to the reference, adjacency, dbSNP, ambiguous reference base,
// minimum base quality), and returns true.
//
// Throws std::runtime_error if the reference genome has not been set up
// (processParams() may also throw, see below).
//
// Note: uses function-local static scratch objects, which persist between
// calls.
bool Recab::processReadBuildTable(SamRecord& samRecord)
{
    static BaseData data;
    static std::string chromosomeName;
    static std::string readGroup;
    static std::string aligTypes;

    int seqLen = samRecord.getReadLength();

    // Check if the parameters have been processed.
    if(!myParamsSetup)
    {
        // This throws an exception if the reference cannot be setup.
        processParams();
    }

    uint16_t flag = samRecord.getFlag();

    if(!SamFlag::isMapped(flag))
    {
        // Unmapped: only counted here.  Whether the record is skipped is
        // decided by the exclude-flag check below.
        ++myUnMappedCount;
    }
    else
    {
        // This read is mapped.
        ++myMappedCount;
    }

    if(SamFlag::isSecondary(flag))
    {
        // Secondary read
        ++mySecondaryCount;
    }
    if(SamFlag::isDuplicate(flag))
    {
        ++myDupCount;
    }
    if(SamFlag::isQCFailure(flag))
    {
        ++myQCFailCount;
    }

    // Check if the flag contains an exclude.
    if((flag & myIntBuildExcludeFlags) != 0)
    {
        // Do not use this read for building the recalibration table.
        ++myNumBuildSkipped;
        return(false);
    }

    if(samRecord.getMapQuality() == 0)
    {
        // 0 mapping quality, so skip processing.
        ++myMapQual0Count;
        ++myNumBuildSkipped;
        return(false);
    }
    if(samRecord.getMapQuality() == 255)
    {
        // 255 mapping quality, so skip processing.
        ++myMapQual255Count;
        ++myNumBuildSkipped;
        return(false);
    }

    chromosomeName = samRecord.getReferenceName();
    readGroup = samRecord.getString("RG").c_str();

    // Look for the read group in the map.
    // TODO - extra string constructor??
    RgInsertReturn insertRet =
        myRg2Id.insert(std::pair<std::string, uint16_t>(readGroup, 0));
    if(insertRet.second == true)
    {
        // New element inserted: its id is its index in myId2Rg.
        insertRet.first->second = myId2Rg.size();
        myId2Rg.push_back(readGroup);
    }

    data.rgid = insertRet.first->second;

    // Strand of the read.
    const bool reverse = SamFlag::isReverse(flag);

    if(myReferenceGenome == NULL)
    {
        throw std::runtime_error("Failed to setup Reference File.\n");
    }

    genomeIndex_t mapPos =
        myReferenceGenome->getGenomePosition(chromosomeName.c_str(),
                                             samRecord.get1BasedPosition());

    if(mapPos==INVALID_GENOME_INDEX)
    {
        // The format uses %ld, so the position is converted to long
        // explicitly: a narrower integer passed through varargs would not
        // match the specifier.
        Logger::gLogger->warning("INVALID_GENOME_INDEX (chrom:pos %s:%ld) and record skipped... Reference in BAM is different from the ref used here!", chromosomeName.c_str(), static_cast<long>(samRecord.get1BasedPosition()));

        ++myNumBuildSkipped;
        return false;
    }

    if(!myQField.IsEmpty())
    {
        // Check if there is an old quality.
        const String* oldQPtr =
            samRecord.getStringTag(myQField.c_str());
        if((oldQPtr != NULL) && (oldQPtr->Length() == seqLen))
        {
            // There is an old quality, so use that.
            myQualityStrings.oldq = oldQPtr->c_str();
        }
        else
        {
            // Tag was not found (or has the wrong length), so use the
            // current quality.  Warn on the first occurrence only.
            ++myNumQualTagErrors;
            if(myNumQualTagErrors == 1)
            {
                Logger::gLogger->warning("Recab: %s tag was not found/invalid, so using the quality field in records without the tag", myQField.c_str());
            }
            myQualityStrings.oldq = samRecord.getQuality();
        }
    }
    else
    {
        myQualityStrings.oldq = samRecord.getQuality();
    }

    if(myQualityStrings.oldq.length() != (unsigned int)seqLen)
    {
        Logger::gLogger->warning("Quality is not the correct length, so skipping recalibration on that record.");
        ++myNumBuildSkipped;
        return(false);
    }

    aligTypes = "";
    Cigar* cigarPtr = samRecord.getCigarInfo();

    if(cigarPtr == NULL)
    {
        Logger::gLogger->warning("Failed to get the cigar");
        ++myNumBuildSkipped;
        return(false);
    }

    // This read will be used for building the recab table.
    ++myNumBuildReads;

    ////////////////
    ////// iterate sequence
    ////////////////
    genomeIndex_t refPos = 0;
    int32_t refOffset = 0;
    int32_t prevRefOffset = Cigar::INDEX_NA;
    // Reverse-strand reads are walked from the last stored base to the
    // first, so that data.cycle always counts up from 0.
    int32_t seqPos = 0;
    int seqIncr = 1;
    if(reverse)
    {
        seqPos = seqLen - 1;
        seqIncr = -1;
    }

    // read
    if(!SamFlag::isPaired(flag) || SamFlag::isFirstFragment(flag))
        // Mark as first if it is not paired or if it is the
        // first in the pair.
        data.read = 0;
    else
        data.read = 1;

    // Set unsetbase for curBase.
    // This will be used for the prebase of cycle 0.
    data.curBase = 'K';

    for (data.cycle = 0; data.cycle < seqLen; data.cycle++, seqPos += seqIncr)
    {
        // Store the previous current base in preBase.
        data.preBase = data.curBase;

        // Get the current base before checking if we are going to
        // process this position so it will be set for the next position.
        data.curBase = samRecord.getSequence(seqPos);
        if(reverse)
        {
            // Complement the current base.
            // The prebase is already complemented.
            data.curBase =
                BaseAsciiMap::base2complement[(unsigned int)(data.curBase)];
        }

        // Get the reference offset.
        refOffset = cigarPtr->getRefOffset(seqPos);
        if(refOffset == Cigar::INDEX_NA)
        {
            // Not a match/mismatch, so continue to the next one which will
            // not have a previous match/mismatch.
            // Set previous ref offset to a negative so
            // the next one won't be kept.
            prevRefOffset = -2;
            continue;
        }

        // This one is a match.
        refPos = mapPos + refOffset;

        // Check to see if we should process this position.
        // Do not process if it is cycle 0 and:
        //   1) current base is in dbsnp
        if(data.cycle == 0)
        {
            if(!(myDbsnpFile.IsEmpty()) && myDbSNP[refPos])
            {
                // Save the previous reference offset.
                ++myNumDBSnpSkips;
                prevRefOffset = refOffset;
                continue;
            }
        }
        else
        {
            // Do not process if it is not cycle 0 and:
            //   1) previous reference position not adjacent
            //      (not a match/mismatch)
            //   2) previous base is unset ('K')
            //   3) current base is in dbsnp, or previous base is in dbsnp
            //      and myKeepPrevDbsnp is not set
            if((!myKeepPrevNonAdjacent &&
                (refOffset != (prevRefOffset + seqIncr))) ||
               (data.preBase == 'K'))
            {
                // Save the previous reference offset.
                prevRefOffset = refOffset;
                continue;
            }
            if(!(myDbsnpFile.IsEmpty()) &&
               (myDbSNP[refPos] ||
                (!myKeepPrevDbsnp && myDbSNP[refPos - seqIncr])))
            {
                ++myNumDBSnpSkips;
                // Save the previous reference offset.
                prevRefOffset = refOffset;
                continue;
            }
        }

        // Save the previous reference offset.
        prevRefOffset = refOffset;

        // Set the reference & read bases in the Covariates
        char refBase = (*myReferenceGenome)[refPos];

        if(BaseUtilities::isAmbiguous(refBase))
        {
            // N reference, so skip it when building the table.
            ++myAmbiguous;
            continue;
        }

        if(reverse)
        {
            refBase = BaseAsciiMap::base2complement[(unsigned int)(refBase)];
        }

        // Get quality char
        data.qual =
            BaseUtilities::getPhredBaseQuality(myQualityStrings.oldq[seqPos]);

        // skip bases with quality below the minimum set.
        if(data.qual < myMinBaseQual)
        {
            ++mySubMinQual;
            continue;
        }

        if(BaseUtilities::areEqual(refBase, data.curBase)
           && (BaseAsciiMap::base2int[(unsigned int)(data.curBase)] < 4))
            myBMatchCount++;
        else
            myBMismatchCount++;

        hasherrormodel.setCell(data, refBase);
        myBasecounts++;
    }
    return true;
}
void realign_bam(Parameters& params) { FastaReference reference; reference.open(params.fasta_reference); bool suppress_output = false; int dag_window_size = params.dag_window_size; // open BAM file BamReader reader; if (!reader.Open("stdin")) { cerr << "could not open stdin for reading" << endl; exit(1); } BamWriter writer; if (!params.dry_run && !writer.Open("stdout", reader.GetHeaderText(), reader.GetReferenceData())) { cerr << "could not open stdout for writing" << endl; exit(1); } // store the names of all the reference sequences in the BAM file map<int, string> referenceIDToName; vector<RefData> referenceSequences = reader.GetReferenceData(); int i = 0; for (RefVector::iterator r = referenceSequences.begin(); r != referenceSequences.end(); ++r) { referenceIDToName[i] = r->RefName; ++i; } vcf::VariantCallFile vcffile; if (!params.vcf_file.empty()) { if (!vcffile.open(params.vcf_file)) { cerr << "could not open VCF file " << params.vcf_file << endl; exit(1); } } else { cerr << "realignment requires VCF file" << endl; exit(1); } vcf::Variant var(vcffile); BamAlignment alignment; map<long int, vector<BamAlignment> > alignmentSortQueue; // get alignment // assemble DAG in region around alignment // loop for each alignment in BAM: // update DAG when current alignment gets close to edge of assembled DAG // attempt to realign if read has a certain number of mismatches + gaps or softclips, weighted by basequal // if alignment to DAG has fewer mismatches and gaps than original alignment, use it // flatten read into reference space (for now just output alleles from VCF un-spanned insertions) // write read to queue for streaming re-sorting (some positional change will occur) long int dag_start_position = 0; string currentSeqname; string ref; //vector<Cigar> cigars; // contains the Cigar strings of nodes in the graph //vector<long int> refpositions; // contains the reference start coords of nodes in the graph ReferenceMappings ref_map; gssw_graph* graph = 
gssw_graph_create(0); int8_t* nt_table = gssw_create_nt_table(); int8_t* mat = gssw_create_score_matrix(params.match, params.mism); int total_reads = 0; int total_realigned = 0; int total_improved = 0; bool emptyDAG = false; // if the dag is constructed over empty sequence // such as when realigning reads mapped to all-N sequence if (params.debug) { cerr << "about to start processing alignments" << endl; } while (reader.GetNextAlignment(alignment)) { string& seqname = referenceIDToName[alignment.RefID]; if (params.debug) { cerr << "--------------------------------------------" << endl << "processing alignment " << alignment.Name << " at " << seqname << ":" << alignment.Position << endl; } /* if (!alignment.IsMapped() && graph->size == 0) { if (params.debug) { cerr << "unable to build DAG using unmapped read " << alignment.Name << " @ " << seqname << ":" << alignment.Position << " no previous mapped read found and DAG currently empty" << endl; } alignmentSortQueue[dag_start_position+dag_window_size].push_back(alignment); continue; } */ ++total_reads; BamAlignment originalAlignment = alignment; long unsigned int initialAlignmentPosition = alignment.Position; //if (dag_start_position == 1) { // dag_start_position = max(1, (int)initialAlignmentPosition - dag_window_size/2); //} // should we construct a new DAG? 
do so when 3/4 of the way through the current one // center on current position + 1/2 dag window // TODO check this scheme using some scribbles on paper // alignment.IsMapped() if ((seqname != currentSeqname || ((alignment.Position + (alignment.QueryBases.size()/2) > (3*dag_window_size/4) + dag_start_position))) && alignment.Position < reference.sequenceLength(seqname)) { if (seqname != currentSeqname) { if (params.debug) { cerr << "switched ref seqs" << endl; } dag_start_position = max((long int) 0, (long int) (alignment.GetEndPosition() - dag_window_size/2)); // recenter DAG } else if (!ref_map.empty()) { dag_start_position = dag_start_position + dag_window_size/2; dag_start_position = max(dag_start_position, (long int) (alignment.GetEndPosition() - dag_window_size/2)); } else { dag_start_position = alignment.Position - dag_window_size/2; } dag_start_position = max((long int)0, dag_start_position); // TODO get sequence length and use to bound noted window size (edge case) //cerr << "getting ref " << seqname << " " << max((long int) 0, dag_start_position) << " " << dag_window_size << endl; // get variants for new DAG vector<vcf::Variant> variants; if (!vcffile.setRegion(seqname, dag_start_position + 1, dag_start_position + dag_window_size)) { // this is not necessarily an error; there should be a better way to check for VCF file validity /* cerr << "could not set region on VCF file to " << currentSeqname << ":" << dag_start_position << "-" << dag_start_position + ref.size() << endl; */ //exit(1); } else { // check first variant if (vcffile.getNextVariant(var)) { while (var.position <= dag_start_position + 1) { //cerr << "var position == dag_start_position " << endl; dag_start_position -= 1; vcffile.setRegion(seqname, dag_start_position + 1, dag_start_position + dag_window_size); if (!vcffile.getNextVariant(var)) { break; } } } vcffile.setRegion(seqname, dag_start_position + 1, dag_start_position + dag_window_size); while (vcffile.getNextVariant(var)) { if 
(params.debug) cerr << "getting variant at " << var.sequenceName << ":" << var.position << endl; //cerr << var.position << " + " << var.ref.length() << " <= " << dag_start_position << " + " << dag_window_size << endl; //cerr << var.position << " >= " << dag_start_position << endl; if (var.position + var.ref.length() <= dag_start_position + dag_window_size && var.position >= dag_start_position) { variants.push_back(var); } } } //cerr << "dag_start_position " << dag_start_position << endl; ref = reference.getSubSequence(seqname, max((long int) 0, dag_start_position), dag_window_size); // 0/1 conversion // clear graph and metadata ref_map.clear(); //cigars.clear(); //refpositions.clear(); gssw_graph_destroy(graph); if (params.debug) { cerr << "constructing DAG" << endl; } // and build the DAG graph = gssw_graph_create(0); constructDAGProgressive(graph, ref_map, ref, seqname, variants, dag_start_position, nt_table, mat, params.flat_input_vcf); if (params.debug) { cerr << "graph has " << graph->size << " nodes" << endl; cerr << "DAG generated from input variants over " << seqname << ":" << dag_start_position << "-" << dag_start_position + dag_window_size << endl; } if (params.display_dag) { gssw_graph_print(graph); /* for (Backbone::iterator b = backbone.begin(); b != backbone.end(); ++b) { cout << b->first << " " << b->first->id << " " << b->second.ref_position << " " << b->second.cigar << endl << b->first->seq << endl; } */ } if (graph->size == 1 && allN(ref) || graph->size == 0) { if (params.debug) { cerr << "DAG is empty (1 node, all N). Alignment is irrelevant." 
<< endl; } emptyDAG = true; } else { emptyDAG = false; } } AlignmentStats stats_before; bool was_mapped = alignment.IsMapped(); bool has_realigned = false; if (was_mapped) { if (dag_start_position + dag_window_size < alignment.GetEndPosition()) { ref = reference.getSubSequence(seqname, max((long int) 0, dag_start_position), alignment.GetEndPosition() - dag_start_position); // 0/1 conversion } } if (params.debug) { if (emptyDAG) { cerr << "cannot realign against empty (all-N single node) graph" << endl; } } if (!emptyDAG && shouldRealign(alignment, ref, dag_start_position, params, stats_before)) { ++total_realigned; if (params.debug) { cerr << "realigning: " << alignment.Name << " " << alignment.QueryBases << endl << " aligned @ " << alignment.Position << " to variant graph over " << seqname << ":" << dag_start_position << "-" << dag_start_position + dag_window_size << endl; } //{ try { Cigar flat_cigar; string read = alignment.QueryBases; string qualities = alignment.Qualities; int score; long int position; string strand; gssw_graph_mapping* gm = gswalign(graph, ref_map, read, qualities, params, position, score, flat_cigar, strand, nt_table, mat); // gssw_graph_mapping_destroy(gm); if (params.dry_run) { if (strand == "-" && !alignment.IsMapped()) { read = reverseComplement(read); } cout << read << endl; cout << graph_mapping_to_string(gm) << endl; cout << score << " " << strand << " " << position << " " << flat_cigar << endl; } else { /* if (strand == "-") { read = reverseComplement(trace_report.read); } */ // TODO the qualities are not on the right side of the read if (strand == "-" && alignment.IsMapped()) { // if we're realigning, this is always true unless we swapped strands alignment.SetIsReverseStrand(true); //reverse(alignment.Qualities.begin(), alignment.Qualities.end()); // reverse qualities } //alignment.QueryBases = reverseComplement(trace_report.read); alignment.QueryBases = read; alignment.Qualities = qualities; alignment.Position = position;// + 1;// 
+ 1;//(trace_report.node->position - 1) + trace_report.x; alignment.SetIsMapped(true); if (!alignment.MapQuality) { alignment.MapQuality = 20; // horrible hack... at least approximate with alignment mismatches against graph } // check if somehow we've ended up with an indel at the ends // if so, grab the reference sequence right beyond it and add // a single match to the cigar, allowing variant detection methods // to run on the results without internal modification Cigar& cigar = flat_cigar; //cerr << flat_cigar << " " << flat_cigar.readLen() << " " << flat_cigar.refLen() << endl; int flankSize = params.flatten_flank; if (cigar.front().isIndel() || (cigar.front().isSoftclip() && cigar.at(1).isIndel())) { alignment.Position -= flankSize; string refBase = reference.getSubSequence(seqname, alignment.Position, flankSize); if (cigar.front().isSoftclip()) { alignment.QueryBases.erase(alignment.QueryBases.begin(), alignment.QueryBases.begin()+cigar.front().length); alignment.Qualities.erase(alignment.Qualities.begin(), alignment.Qualities.begin()+cigar.front().length); cigar.erase(cigar.begin()); } alignment.QueryBases.insert(0, refBase); alignment.Qualities.insert(0, string(flankSize, shortInt2QualityChar(30))); Cigar newCigar; newCigar.push_back(CigarElement(flankSize, 'M')); newCigar.append(flat_cigar); flat_cigar = newCigar; } if (cigar.back().isIndel() || (cigar.back().isSoftclip() && cigar.at(cigar.size()-2).isIndel())) { string refBase = reference.getSubSequence(seqname, alignment.Position + flat_cigar.refLen(), flankSize); if (cigar.back().isSoftclip()) { alignment.QueryBases.erase(alignment.QueryBases.end()-cigar.back().length, alignment.QueryBases.end()); alignment.Qualities.erase(alignment.Qualities.end()-cigar.back().length, alignment.Qualities.end()); cigar.pop_back(); } Cigar newCigar; newCigar.push_back(CigarElement(flankSize, 'M')); flat_cigar.append(newCigar); //flat_cigar.append(newCigar); alignment.QueryBases.append(refBase); 
alignment.Qualities.append(string(flankSize, shortInt2QualityChar(30))); } flat_cigar.toCigarData(alignment.CigarData); //cerr << flat_cigar << " " << flat_cigar.readLen() << " " << flat_cigar.refLen() << endl; if (dag_start_position + dag_window_size < alignment.GetEndPosition()) { ref = reference.getSubSequence(seqname, max((long int) 0, dag_start_position), alignment.GetEndPosition() - dag_start_position); // 0/1 conversion } AlignmentStats stats_after; countMismatchesAndGaps(alignment, flat_cigar, ref, dag_start_position, stats_after, params.debug); /* if ((!was_mapped || (stats_before.softclip_qsum >= stats_after.softclip_qsum && stats_before.mismatch_qsum >= stats_after.mismatch_qsum)) && acceptRealignment(alignment, ref, dag_start_position, params, stats_after)) { */ /* if ((!was_mapped || (stats_before.softclip_qsum + stats_before.mismatch_qsum >= stats_after.softclip_qsum + stats_after.mismatch_qsum)) && acceptRealignment(alignment, ref, dag_start_position, params, stats_after)) { */ // we accept the new alignment if... 
if (!was_mapped // it wasn't mapped previously // or if we have removed soft clips or mismatches (per quality) from the alignment //|| ((stats_before.softclip_qsum >= stats_after.softclip_qsum // && stats_before.mismatch_qsum >= stats_after.mismatch_qsum) || ((stats_before.softclip_qsum + stats_before.mismatch_qsum >= stats_after.softclip_qsum + stats_after.mismatch_qsum) // and if we have added gaps, we have added them to remove mismatches or softclips && (stats_before.gaps >= stats_after.gaps // accept any time we reduce gaps while not increasing softclips/mismatches || (stats_before.gaps < stats_after.gaps // and allow gap increases when they improve the alignment && (stats_before.softclip_qsum + stats_before.mismatch_qsum > stats_after.softclip_qsum + stats_after.mismatch_qsum)))) // and the alignment must not have more than the acceptable number of gaps, softclips, or mismatches // as provided in input parameters && acceptRealignment(alignment, ref, dag_start_position, params, stats_after)) { // keep the alignment // TODO require threshold of softclips to keep alignment (or count of gaps, mismatches,...) if (params.debug) { cerr << "realigned " << alignment.Name << " to graph, which it maps to with " << stats_after.mismatch_qsum << "q in mismatches and " << stats_after.softclip_qsum << "q in soft clips" << endl; } ++total_improved; has_realigned = true; } else { // reset to old version of alignment if (params.debug) { cerr << "failed realignment of " << alignment.Name << " to graph, which it maps to with: " << stats_after.mismatch_qsum << "q in mismatches " << "(vs " << stats_before.mismatch_qsum << "q before), and " << stats_after.softclip_qsum << "q in soft clips " << "(vs " << stats_before.softclip_qsum << "q before) " << endl; } has_realigned = false; alignment = originalAlignment; } } //} // try block } catch (...) 
{ cerr << "exception when realigning " << alignment.Name << " at position " << referenceIDToName[alignment.RefID] << ":" << alignment.Position << " " << alignment.QueryBases << endl; // reset to original alignment has_realigned = false; alignment = originalAlignment; } } // ensure correct order if alignments move long int maxOutputPos = initialAlignmentPosition - dag_window_size; // if we switched sequences we need to flush out all the reads from the previous one string lastSeqname = currentSeqname; if (seqname != currentSeqname) { // so the max output position is set past the end of the last chromosome if (!currentSeqname.empty()) { maxOutputPos = reference.sequenceLength(currentSeqname) + dag_window_size; } currentSeqname = seqname; } if (!params.dry_run) { map<long int, vector<BamAlignment> >::iterator p = alignmentSortQueue.begin(); for ( ; p != alignmentSortQueue.end(); ++p) { // except if we are running in unsorted mode, stop when we are at the window size if (!params.unsorted_output && p->first > maxOutputPos) { break; // no more to do } else { for (vector<BamAlignment>::iterator a = p->second.begin(); a != p->second.end(); ++a) { writer.SaveAlignment(*a); } } } if (p != alignmentSortQueue.begin()) { alignmentSortQueue.erase(alignmentSortQueue.begin(), p); } if (!params.only_realigned || has_realigned) { alignmentSortQueue[alignment.Position].push_back(alignment); } } } // end GetNextAlignment loop if (!params.dry_run) { map<long int, vector<BamAlignment> >::iterator p = alignmentSortQueue.begin(); for ( ; p != alignmentSortQueue.end(); ++p) { for (vector<BamAlignment>::iterator a = p->second.begin(); a != p->second.end(); ++a) writer.SaveAlignment(*a); } } gssw_graph_destroy(graph); free(nt_table); free(mat); reader.Close(); writer.Close(); if (params.debug) { cerr << "total reads:\t" << total_reads << endl; cerr << "realigned:\t" << total_realigned << endl; cerr << "improved:\t" << total_improved << endl; } }