/**
 * Fill `aligned_read` from a BAM alignment and decide whether the read
 * passes the configured filters.
 *
 * Fields are copied from `aln` and from its lobSTR tags (XP, RG, XS, XE,
 * XQ, XD, XM, XR, XX, XC). Missing RG/XS/XE/XR/XC tags are reported through
 * PrintMessageDieOnError; missing XQ/XM/XX tags default to 0; a missing XD
 * tag rejects the read.
 *
 * @param aln                  Alignment to parse.
 * @param aligned_read         Output record; it is partially filled even
 *                             when the function returns false.
 * @param ref_ext_nucleotides  Extended reference sequence keyed by
 *                             (chromosome, STR start). Only consulted when
 *                             min_read_end_match or maximal_end_match_window
 *                             is positive.
 * @return true when the read passed every filter (UNFILTERED is counted),
 *         false when it was rejected. Rejections for second mates, XP == 1
 *         and a missing XD tag do not touch filter_counter.
 */
bool ReadContainer::ParseRead(const BamTools::BamAlignment& aln,
                              AlignedRead* aligned_read,
                              map<pair<string,int>, string>& ref_ext_nucleotides) {
  // Fields taken directly from the alignment record.
  aligned_read->ID          = aln.Name;
  aligned_read->nucleotides = aln.QueryBases;
  aligned_read->qualities   = aln.Qualities;
  aligned_read->strand      = aln.IsReverseStrand();
  aligned_read->chrom       = references.at(aln.RefID).RefName;
  aligned_read->read_start  = aln.Position;
  aligned_read->cigar_ops   = aln.CigarData;
  aligned_read->mate        = aln.IsSecondMate() ? 1 : 0;

  // Second mates are never processed.
  if (aligned_read->mate) {
    return false;
  }

  // Don't process if partially spanning (tag written by old lobSTR).
  int partial = 0;
  if (GetIntBamTag(aln, "XP", &partial) && partial == 1) {
    return false;
  }

  // Read group (required).
  if (!GetStringBamTag(aln, "RG", &aligned_read->read_group)) {
    stringstream msg;
    msg << aln.Name << " Could not get read group.";
    PrintMessageDieOnError(msg.str(), ERROR);
  }

  // STR start coordinate (required).
  if (!GetIntBamTag(aln, "XS", &aligned_read->msStart)) {
    stringstream msg;
    msg << aln.Name << " from group " << aligned_read->read_group
        << " Could not get STR start coordinate. Did this bam file come from lobSTR?";
    PrintMessageDieOnError(msg.str(), ERROR);
  }

  // STR end coordinate (required).
  if (!GetIntBamTag(aln, "XE", &aligned_read->msEnd)) {
    stringstream msg;
    msg << aln.Name << " from group " << aligned_read->read_group
        << " Could not get STR end coordinate. Did this bam file come from lobSTR?";
    PrintMessageDieOnError(msg.str(), ERROR);
  }

  // Mapping quality score (optional, defaults to 0).
  if (!GetIntBamTag(aln, "XQ", &aligned_read->mapq)) {
    aligned_read->mapq = 0;
  }

  // Difference from reference; reads without it are rejected.
  if (!GetIntBamTag(aln, "XD", &aligned_read->diffFromRef)) {
    return false;
  }

  // Mate distance (optional, defaults to 0).
  if (!GetIntBamTag(aln, "XM", &aligned_read->matedist)) {
    aligned_read->matedist = 0;
  }

  // STR repeat sequence (required).
  if (!GetStringBamTag(aln, "XR", &aligned_read->repseq)) {
    stringstream msg;
    msg << aln.Name << " from group " << aligned_read->read_group
        << " Could not get repseq.";
    PrintMessageDieOnError(msg.str(), ERROR);
  }

  // Stitched flag (optional, defaults to 0).
  if (!GetIntBamTag(aln, "XX", &aligned_read->stitched)) {
    aligned_read->stitched = 0;
  }

  // Reference copy number (required).
  if (!GetFloatBamTag(aln, "XC", &aligned_read->refCopyNum)) {
    stringstream msg;
    msg << aln.Name << " from group " << aligned_read->read_group
        << " Could not get reference copy number.";
    PrintMessageDieOnError(msg.str(), ERROR);
  }

  // The period is the length of the repeat sequence.
  aligned_read->period = aligned_read->repseq.length();

  if (include_flank) {
    // Recompute the diff from the CIGAR operations instead of using XD.
    CIGAR_LIST cigar_list;
    for (vector<BamTools::CigarOp>::const_iterator it = aligned_read->cigar_ops.begin();
         it != aligned_read->cigar_ops.end(); ++it) {
      CIGAR cig;
      cig.num        = it->Length;
      cig.cigar_type = it->Type;
      cigar_list.cigars.push_back(cig);
    }
    bool added_s     = false;
    bool cigar_had_s = false;
    cigar_list.ResetString();
    GenerateCorrectCigar(&cigar_list, aln.QueryBases, &added_s, &cigar_had_s);
    aligned_read->diffFromRef = GetSTRAllele(cigar_list);
  }

  // ---- Filters -----------------------------------------------------------

  if (unit) {
    // An empty repeat sequence gives period == 0; taking the modulo would be
    // a division by zero, so such a read is counted as a non-unit allele.
    if (aligned_read->period == 0 ||
        aligned_read->diffFromRef % aligned_read->period != 0) {
      filter_counter.increment(FilterCounter::NOT_UNIT);
      return false;
    }
  }

  if (abs(aligned_read->diffFromRef) > max_diff_ref) {
    filter_counter.increment(FilterCounter::DIFF_FROM_REF);
    return false;
  }

  if (aligned_read->mapq > max_mapq) {
    filter_counter.increment(FilterCounter::MAPPING_QUALITY);
    return false;
  }

  if (aligned_read->matedist > max_matedist) {
    filter_counter.increment(FilterCounter::MATE_DIST);
    return false;
  }

  // Check that the allele length is valid.
  if (aligned_read->diffFromRef + (aligned_read->refCopyNum*aligned_read->period) < MIN_ALLELE_SIZE) {
    filter_counter.increment(FilterCounter::ALLELE_SIZE);
    return false;
  }

  // Check that the read extends at least min_border past both STR ends.
  const int max_read_start = aligned_read->msStart - min_border;
  const int min_read_stop  = aligned_read->msEnd + min_border;
  if (aln.Position > max_read_start || aln.GetEndPosition() < min_read_stop) {
    filter_counter.increment(FilterCounter::SPANNING_AMOUNT);
    return false;
  }

  // Check that both ends of the read contain sufficient perfect matches.
  if (min_read_end_match > 0) {
    map<pair<string,int>, string>::iterator loc_iter =
        ref_ext_nucleotides.find(pair<string,int>(aligned_read->chrom, aligned_read->msStart));
    if (loc_iter == ref_ext_nucleotides.end()) {
      PrintMessageDieOnError("No extended reference sequence found for locus", ERROR);
      // Defensive: never dereference the end iterator should the error
      // handler return instead of terminating.
      return false;
    }
    const string& ref_ext_seq = loc_iter->second;
    pair<int,int> num_end_matches =
        AlignmentFilters::GetNumEndMatches(aligned_read, ref_ext_seq, aligned_read->msStart-extend);
    if (num_end_matches.first < min_read_end_match || num_end_matches.second < min_read_end_match) {
      filter_counter.increment(FilterCounter::NUM_END_MATCHES);
      return false;
    }
  }

  // Check that the prefix and suffix of the read match maximally compared
  // to proximal reference locations.
  if (maximal_end_match_window > 0) {
    map<pair<string,int>, string>::iterator loc_iter =
        ref_ext_nucleotides.find(pair<string,int>(aligned_read->chrom, aligned_read->msStart));
    if (loc_iter == ref_ext_nucleotides.end()) {
      PrintMessageDieOnError("No extended reference sequence found for locus", ERROR);
      // Defensive: see the identical guard in the previous filter.
      return false;
    }
    const string& ref_ext_seq = loc_iter->second;
    const bool maximum_end_matches =
        AlignmentFilters::HasLargestEndMatches(aligned_read, ref_ext_seq,
                                               aligned_read->msStart-extend,
                                               maximal_end_match_window,
                                               maximal_end_match_window);
    if (!maximum_end_matches) {
      filter_counter.increment(FilterCounter::NOT_MAXIMAL_END);
      return false;
    }
  }

  // Check that both ends of the aligned read have sufficient bases before
  // the first indel. A value of -1 is exempt from the check.
  if (min_bp_before_indel > 0) {
    pair<int,int> num_bps = AlignmentFilters::GetEndDistToIndel(aligned_read);
    if (num_bps.first != -1 && num_bps.first < min_bp_before_indel) {
      filter_counter.increment(FilterCounter::BP_BEFORE_INDEL);
      return false;
    }
    if (num_bps.second != -1 && num_bps.second < min_bp_before_indel) {
      filter_counter.increment(FilterCounter::BP_BEFORE_INDEL);
      return false;
    }
  }

  filter_counter.increment(FilterCounter::UNFILTERED);
  return true;
}
void ReadContainer::AddReadsFromFile(const ReferenceSTR& ref_str) { if (ref_str.chrom != "NA") { int refid = -1; if (chrom_to_refid.find(ref_str.chrom) != chrom_to_refid.end()) { refid = chrom_to_refid.at(ref_str.chrom); } if (refid == -1) { PrintMessageDieOnError("Could not locate STR reference chromosome in bam file", ERROR); } BamTools::BamRegion bam_region(refid, ref_str.start-extend, refid, ref_str.stop+extend); if (!reader.SetRegion(bam_region)) { PrintMessageDieOnError("Could not set bam region", ERROR); } } BamTools::BamAlignment aln; while (reader.GetNextAlignment(aln)) { AlignedRead aligned_read; // get read ID aligned_read.ID = aln.Name; // get nucleotides aligned_read.nucleotides = aln.QueryBases; // get qualities aligned_read.qualities = aln.Qualities; // get strand aligned_read.strand = aln.IsReverseStrand(); // get chrom aligned_read.chrom = references.at(aln.RefID).RefName; // get read start aligned_read.read_start = aln.Position; // get cigar aligned_read.cigar_ops = aln.CigarData; // get if mate pair if (aln.IsSecondMate()) { aligned_read.mate = 1; } else { aligned_read.mate = 0; } // Only process if it is the primary alignment if (aligned_read.mate) { continue; } // Get all the tag data // don't process if partially spanning (from old lobSTR) int partial = 0; if (GetIntBamTag(aln, "XP", &partial)) { if (partial == 1) continue; } // get read group if (!GetStringBamTag(aln, "RG", &aligned_read.read_group)) { stringstream msg; msg << aln.Name << " Could not get read group."; PrintMessageDieOnError(msg.str(), ERROR); } // get msStart if (!GetIntBamTag(aln, "XS", &aligned_read.msStart)) { stringstream msg; msg << aln.Name << " from group " << aligned_read.read_group << " Could not get STR start coordinate. 
Did this bam file come from lobSTR?"; PrintMessageDieOnError(msg.str(), ERROR); } // get msEnd if (!GetIntBamTag(aln, "XE", &aligned_read.msEnd)) { stringstream msg; msg << aln.Name << " from group " << aligned_read.read_group << " Could not get STR end coordinate. Did this bam file come from lobSTR?"; PrintMessageDieOnError(msg.str(), ERROR); } // get mapq. Try unsigned/signed if (!GetIntBamTag(aln, "XQ", &aligned_read.mapq)) { stringstream msg; aligned_read.mapq = 0; } // get diff if (!GetIntBamTag(aln, "XD", &aligned_read.diffFromRef)) { if (aligned_read.mate == 0) { stringstream msg; msg << aln.Name << " from group " << aligned_read.read_group << " Could not get genotype."; PrintMessageDieOnError(msg.str(), ERROR); } continue; } // get mate dist if (!GetIntBamTag(aln, "XM", &aligned_read.matedist)) { aligned_read.matedist = 0; } // get STR seq if (!GetStringBamTag(aln, "XR", &aligned_read.repseq)) { stringstream msg; msg << aln.Name << " from group " << aligned_read.read_group << " Could not get repseq."; PrintMessageDieOnError(msg.str(), ERROR); } // get if stitched if (!GetIntBamTag(aln, "XX", &aligned_read.stitched)) { aligned_read.stitched = 0; } // get ref copy num if (!GetFloatBamTag(aln, "XC", &aligned_read.refCopyNum)) { stringstream msg; msg << aln.Name << " from group " << aligned_read.read_group << " Could not get reference copy number."; PrintMessageDieOnError(msg.str(), ERROR); } // get period aligned_read.period = aligned_read.repseq.length(); if (include_flank) { // diff is just sum of differences in cigar CIGAR_LIST cigar_list; for (vector<BamTools::CigarOp>::const_iterator it = aligned_read.cigar_ops.begin(); it != aligned_read.cigar_ops.end(); it++) { CIGAR cig; cig.num = (*it).Length; cig.cigar_type = (*it).Type; cigar_list.cigars.push_back(cig); } bool added_s; bool cigar_had_s; cigar_list.ResetString(); GenerateCorrectCigar(&cigar_list, aln.QueryBases, &added_s, &cigar_had_s); aligned_read.diffFromRef = GetSTRAllele(cigar_list); } // apply 
filters if (unit) { if (aligned_read.diffFromRef % aligned_read.period != 0) continue; } if (abs(aligned_read.diffFromRef) > max_diff_ref) { continue; } if (aligned_read.mapq > max_mapq) { continue; } if (aligned_read.matedist > max_matedist) { continue; } // Add to map pair<string, int> coord (aligned_read.chrom, aligned_read.msStart); if (aligned_str_map_.find(coord) != aligned_str_map_.end()) { aligned_str_map_.at(coord).push_back(aligned_read); } else { list<AlignedRead> aligned_read_list; aligned_read_list.push_back(aligned_read); aligned_str_map_.insert(pair< pair<string, int>, list<AlignedRead> > (coord, aligned_read_list)); } } }