bool ReadContainer::ParseRead(const BamTools::BamAlignment& aln,
			      AlignedRead* aligned_read, 
			      map<pair<string,int>, string>& ref_ext_nucleotides) {
  // get read ID
  aligned_read->ID = aln.Name;
  // get nucleotides
  aligned_read->nucleotides = aln.QueryBases;
  // get qualities
  aligned_read->qualities = aln.Qualities;
  // get strand
  aligned_read->strand = aln.IsReverseStrand();
  // get chrom
  aligned_read->chrom = references.at(aln.RefID).RefName;
  // get read start
  aligned_read->read_start = aln.Position;
  // get cigar
  aligned_read->cigar_ops = aln.CigarData;
  // get if mate pair
  if (aln.IsSecondMate()) {
    aligned_read->mate = 1;
  } else {
    aligned_read->mate = 0;
  }
  // Only process if it is the primary alignment
  if (aligned_read->mate) {
    return false;
  }
  // Get all the tag data
  // don't process if partially spanning (from old lobSTR)
  int partial = 0;
  if (GetIntBamTag(aln, "XP", &partial)) {
    if (partial == 1) return false;
  }
  // get read group
  if (!GetStringBamTag(aln, "RG", &aligned_read->read_group)) {
    stringstream msg;
    msg << aln.Name << " Could not get read group.";
    PrintMessageDieOnError(msg.str(), ERROR);
  }
  // get msStart
  if (!GetIntBamTag(aln, "XS", &aligned_read->msStart)) {
    stringstream msg;
    msg << aln.Name << " from group " << aligned_read->read_group << " Could not get STR start coordinate. Did this bam file come from lobSTR?";
    PrintMessageDieOnError(msg.str(), ERROR);
  }
  // get msEnd
  if (!GetIntBamTag(aln, "XE", &aligned_read->msEnd)) {
    stringstream msg;
    msg << aln.Name << " from group " << aligned_read->read_group << " Could not get STR end coordinate. Did this bam file come from lobSTR?";
    PrintMessageDieOnError(msg.str(), ERROR);
  }
  // get mapq. Try unsigned/signed
  if (!GetIntBamTag(aln, "XQ", &aligned_read->mapq)) {
    stringstream msg;
    aligned_read->mapq = 0;
  }
  // get diff
  if (!GetIntBamTag(aln, "XD", &aligned_read->diffFromRef)) {
    return false;
  }
  // get mate dist
  if (!GetIntBamTag(aln, "XM", &aligned_read->matedist)) {
    aligned_read->matedist = 0;
  }
  // get STR seq
  if (!GetStringBamTag(aln, "XR", &aligned_read->repseq)) {
    stringstream msg;
    msg << aln.Name << " from group " << aligned_read->read_group << " Could not get repseq.";
    PrintMessageDieOnError(msg.str(), ERROR);
  }
  // get if stitched
  if (!GetIntBamTag(aln, "XX", &aligned_read->stitched)) {
    aligned_read->stitched = 0;
  }
  // get ref copy num
  if (!GetFloatBamTag(aln, "XC", &aligned_read->refCopyNum)) {
    stringstream msg;
    msg << aln.Name << " from group " << aligned_read->read_group << " Could not get reference copy number.";
    PrintMessageDieOnError(msg.str(), ERROR);
  }
  // get period
  aligned_read->period = aligned_read->repseq.length();
  if (include_flank) {  // diff is just sum of differences in cigar
    CIGAR_LIST cigar_list;
    for (vector<BamTools::CigarOp>::const_iterator
	   it = aligned_read->cigar_ops.begin();
	 it != aligned_read->cigar_ops.end(); it++) {
      CIGAR cig;
      cig.num = (*it).Length;
      cig.cigar_type = (*it).Type;
      cigar_list.cigars.push_back(cig);
    }
    bool added_s;
    bool cigar_had_s;
    cigar_list.ResetString();
    GenerateCorrectCigar(&cigar_list, aln.QueryBases,
			 &added_s, &cigar_had_s);
    aligned_read->diffFromRef = GetSTRAllele(cigar_list);
  }
  // apply filters
  if (unit) {
    if (aligned_read->diffFromRef % aligned_read->period != 0){ 
      filter_counter.increment(FilterCounter::NOT_UNIT);
      return false;
    }
  }
  if (abs(aligned_read->diffFromRef) > max_diff_ref) {
    filter_counter.increment(FilterCounter::DIFF_FROM_REF);
    return false;
  }
  if (aligned_read->mapq > max_mapq) {
    filter_counter.increment(FilterCounter::MAPPING_QUALITY);
    return false;
  }
  if (aligned_read->matedist > max_matedist) {
    filter_counter.increment(FilterCounter::MATE_DIST);
    return false;
  }
  // Check if the allele length is valid
  if (aligned_read->diffFromRef + (aligned_read->refCopyNum*aligned_read->period) < MIN_ALLELE_SIZE) {
    filter_counter.increment(FilterCounter::ALLELE_SIZE);
    return false;
  }

  // check that read sufficiently spans STR
  int max_read_start = aligned_read->msStart - min_border;
  int min_read_stop  = aligned_read->msEnd   + min_border;
  if (aln.Position > max_read_start || aln.GetEndPosition() < min_read_stop){
    filter_counter.increment(FilterCounter::SPANNING_AMOUNT);
    return false; 
  }
  
  // check that both ends of the read contain sufficient perfect matches
  if (min_read_end_match > 0){
    map<pair<string,int>, string>::iterator loc_iter = ref_ext_nucleotides.find(pair<string,int>(aligned_read->chrom, aligned_read->msStart));
    if (loc_iter == ref_ext_nucleotides.end())
      PrintMessageDieOnError("No extended reference sequence found for locus", ERROR);
    string ref_ext_seq = loc_iter->second;
    pair<int,int> num_end_matches = AlignmentFilters::GetNumEndMatches(aligned_read, ref_ext_seq, aligned_read->msStart-extend);
    if (num_end_matches.first < min_read_end_match || num_end_matches.second < min_read_end_match){
      filter_counter.increment(FilterCounter::NUM_END_MATCHES);
      return false;
    }
  }

  // check that the prefix and suffix of the read match maximally compared to proximal reference locations
  if (maximal_end_match_window > 0){
    map<pair<string,int>, string>::iterator loc_iter = ref_ext_nucleotides.find(pair<string,int>(aligned_read->chrom, aligned_read->msStart));
    if (loc_iter == ref_ext_nucleotides.end())
      PrintMessageDieOnError("No extended reference sequence found for locus", ERROR);
    string ref_ext_seq = loc_iter->second;
    bool maximum_end_matches = AlignmentFilters::HasLargestEndMatches(aligned_read, ref_ext_seq, aligned_read->msStart-extend, maximal_end_match_window, maximal_end_match_window);
    if (!maximum_end_matches){
      filter_counter.increment(FilterCounter::NOT_MAXIMAL_END);
      return false;
    }
  }

  // check that both ends of the aligned read have sufficient bases before the first indel
  if (min_bp_before_indel > 0){
    pair<int, int> num_bps = AlignmentFilters::GetEndDistToIndel(aligned_read);
    if (num_bps.first != -1 && num_bps.first < min_bp_before_indel){
      filter_counter.increment(FilterCounter::BP_BEFORE_INDEL);
      return false;
    }
    if (num_bps.second != -1 && num_bps.second < min_bp_before_indel){
      filter_counter.increment(FilterCounter::BP_BEFORE_INDEL);
      return false;
    }
  }
  filter_counter.increment(FilterCounter::UNFILTERED);
  return true;
}
Exemple #2
0
void ReadContainer::AddReadsFromFile(const ReferenceSTR& ref_str) {
  if (ref_str.chrom != "NA") {
    int refid = -1;
    if (chrom_to_refid.find(ref_str.chrom) !=
	chrom_to_refid.end()) {
      refid = chrom_to_refid.at(ref_str.chrom);
    }
    if (refid == -1) {
      PrintMessageDieOnError("Could not locate STR reference chromosome in bam file", ERROR);
    }
    BamTools::BamRegion bam_region(refid, ref_str.start-extend, refid, ref_str.stop+extend);
    if (!reader.SetRegion(bam_region)) {
      PrintMessageDieOnError("Could not set bam region", ERROR);
    }
  }
  BamTools::BamAlignment aln;
  while (reader.GetNextAlignment(aln)) {
    AlignedRead aligned_read;
    // get read ID
    aligned_read.ID = aln.Name;
    // get nucleotides
    aligned_read.nucleotides = aln.QueryBases;
    // get qualities
    aligned_read.qualities = aln.Qualities;
    // get strand
    aligned_read.strand = aln.IsReverseStrand();
    // get chrom
    aligned_read.chrom = references.at(aln.RefID).RefName;
    // get read start
    aligned_read.read_start = aln.Position;
    // get cigar
    aligned_read.cigar_ops = aln.CigarData;
    // get if mate pair
    if (aln.IsSecondMate()) {
      aligned_read.mate = 1;
    } else {
      aligned_read.mate = 0;
    }
    // Only process if it is the primary alignment
    if (aligned_read.mate) {
      continue;
    }
    // Get all the tag data
    // don't process if partially spanning (from old lobSTR)
    int partial = 0;
    if (GetIntBamTag(aln, "XP", &partial)) {
      if (partial == 1) continue;
    }
    // get read group
    if (!GetStringBamTag(aln, "RG", &aligned_read.read_group)) {
      stringstream msg;
      msg << aln.Name << " Could not get read group.";
      PrintMessageDieOnError(msg.str(), ERROR);
    }
    // get msStart
    if (!GetIntBamTag(aln, "XS", &aligned_read.msStart)) {
      stringstream msg;
      msg << aln.Name << " from group " << aligned_read.read_group << " Could not get STR start coordinate. Did this bam file come from lobSTR?";
      PrintMessageDieOnError(msg.str(), ERROR);
    }
    // get msEnd
    if (!GetIntBamTag(aln, "XE", &aligned_read.msEnd)) {
      stringstream msg;
      msg << aln.Name << " from group " << aligned_read.read_group << " Could not get STR end coordinate. Did this bam file come from lobSTR?";
      PrintMessageDieOnError(msg.str(), ERROR);
    }
    // get mapq. Try unsigned/signed
    if (!GetIntBamTag(aln, "XQ", &aligned_read.mapq)) {
      stringstream msg;
      aligned_read.mapq = 0;
    }
    // get diff
    if (!GetIntBamTag(aln, "XD", &aligned_read.diffFromRef)) {
      if (aligned_read.mate == 0) {
	stringstream msg;
	msg << aln.Name << " from group " << aligned_read.read_group << " Could not get genotype.";
	PrintMessageDieOnError(msg.str(), ERROR);
      }
      continue;
    }
    // get mate dist
    if (!GetIntBamTag(aln, "XM", &aligned_read.matedist)) {
      aligned_read.matedist = 0;
    }
    // get STR seq
    if (!GetStringBamTag(aln, "XR", &aligned_read.repseq)) {
      stringstream msg;
      msg << aln.Name << " from group " << aligned_read.read_group << " Could not get repseq.";
      PrintMessageDieOnError(msg.str(), ERROR);
    }
    // get if stitched
    if (!GetIntBamTag(aln, "XX", &aligned_read.stitched)) {
      aligned_read.stitched = 0;
    }
    // get ref copy num
    if (!GetFloatBamTag(aln, "XC", &aligned_read.refCopyNum)) {
      stringstream msg;
      msg << aln.Name << " from group " << aligned_read.read_group << " Could not get reference copy number.";
      PrintMessageDieOnError(msg.str(), ERROR);
    }
    // get period
    aligned_read.period = aligned_read.repseq.length();
    if (include_flank) {  // diff is just sum of differences in cigar
      CIGAR_LIST cigar_list;
      for (vector<BamTools::CigarOp>::const_iterator
	     it = aligned_read.cigar_ops.begin();
	   it != aligned_read.cigar_ops.end(); it++) {
	CIGAR cig;
	cig.num = (*it).Length;
	cig.cigar_type = (*it).Type;
	cigar_list.cigars.push_back(cig);
      }
      bool added_s;
      bool cigar_had_s;
      cigar_list.ResetString();
      GenerateCorrectCigar(&cigar_list, aln.QueryBases,
			   &added_s, &cigar_had_s);
      aligned_read.diffFromRef = GetSTRAllele(cigar_list);
    }
    // apply filters
    if (unit) {
      if (aligned_read.diffFromRef % aligned_read.period  != 0) continue;
    }
    if (abs(aligned_read.diffFromRef) > max_diff_ref) {
      continue;
    }
    if (aligned_read.mapq > max_mapq) {
      continue;
    }
    if (aligned_read.matedist > max_matedist) {
      continue;
    }
    // Add to map
    pair<string, int> coord
      (aligned_read.chrom, aligned_read.msStart);
    if (aligned_str_map_.find(coord) != aligned_str_map_.end()) {
      aligned_str_map_.at(coord).push_back(aligned_read);
    } else {
      list<AlignedRead> aligned_read_list;
      aligned_read_list.push_back(aligned_read);
      aligned_str_map_.insert(pair< pair<string, int>, list<AlignedRead> >
			      (coord, aligned_read_list));
    }
  }
}