Exemple #1
0
/**
 * Count a read against the per-strand totals of its contig, and against the
 * on-target totals when the read falls on a target region.
 *
 * @param aread   Alignment to count.
 * @param endPos  End position of the read; when 0 the end is taken from
 *                aread.GetEndPosition() instead.
 */
void RegionCoverage::TrackReadsOnRegion( const BamTools::BamAlignment &aread, uint32_t endPos )
{
	// track total and on-target reads
	uint32_t readEnd = endPos ? endPos : aread.GetEndPosition();
	// start is passed as Position + 1; bit 0 of the result is treated as the on-target flag
	uint32_t covType = ReadOnRegion( aread.RefID, aread.Position + 1, readEnd );
	// NOTE(review): m_rcovContigIdx is presumably positioned by ReadOnRegion() above - confirm
	TargetContig *contig = m_contigList[m_rcovContigIdx];
	// reverse-strand reads go to the rev* counters, forward-strand reads to the fwd* counters
	if( aread.IsReverseStrand() ) {
		++contig->revReads;
		if( covType & 1 ) ++contig->revTrgReads;
	} else {
		++contig->fwdReads;
		if( covType & 1 ) ++contig->fwdTrgReads;
	}
}
/**
 * Fill an AlignedRead from a BAM record and decide whether the read is kept.
 *
 * The basic alignment fields are copied first, then the tags XP, RG, XS, XE,
 * XQ, XD, XM, XR, XX and XC are read, and finally the configured filters are
 * applied in order. Each filter that rejects the read increments its own
 * FilterCounter bucket; a read that passes everything increments UNFILTERED.
 *
 * @param aln                  BAM alignment to parse.
 * @param aligned_read         Output record. Its fields are written as parsing
 *                             progresses, so it may be partially filled when
 *                             the function returns false.
 * @param ref_ext_nucleotides  Reference sequences keyed by (chrom, msStart);
 *                             consulted only by the two end-match filters.
 * @return true if the read passed every check, false if it was skipped or
 *         filtered out.
 *
 * Missing RG, XS, XE, XR or XC tags, or a missing reference entry, are
 * reported through PrintMessageDieOnError(..., ERROR).
 * NOTE(review): the code after those calls assumes they do not return - confirm.
 */
bool ReadContainer::ParseRead(const BamTools::BamAlignment& aln,
                              AlignedRead* aligned_read,
                              map<pair<string,int>, string>& ref_ext_nucleotides) {
  // get read ID
  aligned_read->ID = aln.Name;
  // get nucleotides
  aligned_read->nucleotides = aln.QueryBases;
  // get qualities
  aligned_read->qualities = aln.Qualities;
  // get strand
  aligned_read->strand = aln.IsReverseStrand();
  // get chrom
  aligned_read->chrom = references.at(aln.RefID).RefName;
  // get read start
  aligned_read->read_start = aln.Position;
  // get cigar
  aligned_read->cigar_ops = aln.CigarData;
  // get if mate pair
  aligned_read->mate = aln.IsSecondMate() ? 1 : 0;
  // Second mates are not processed
  if (aligned_read->mate) {
    return false;
  }
  // Get all the tag data
  // don't process if partially spanning (from old lobSTR)
  int partial = 0;
  if (GetIntBamTag(aln, "XP", &partial)) {
    if (partial == 1) return false;
  }
  // get read group
  if (!GetStringBamTag(aln, "RG", &aligned_read->read_group)) {
    stringstream msg;
    msg << aln.Name << " Could not get read group.";
    PrintMessageDieOnError(msg.str(), ERROR);
  }
  // get msStart
  if (!GetIntBamTag(aln, "XS", &aligned_read->msStart)) {
    stringstream msg;
    msg << aln.Name << " from group " << aligned_read->read_group << " Could not get STR start coordinate. Did this bam file come from lobSTR?";
    PrintMessageDieOnError(msg.str(), ERROR);
  }
  // get msEnd
  if (!GetIntBamTag(aln, "XE", &aligned_read->msEnd)) {
    stringstream msg;
    msg << aln.Name << " from group " << aligned_read->read_group << " Could not get STR end coordinate. Did this bam file come from lobSTR?";
    PrintMessageDieOnError(msg.str(), ERROR);
  }
  // get mapq; fall back to 0 when the XQ tag is absent
  if (!GetIntBamTag(aln, "XQ", &aligned_read->mapq)) {
    aligned_read->mapq = 0;
  }
  // get diff; a read without the XD tag is skipped without touching the filter counters
  if (!GetIntBamTag(aln, "XD", &aligned_read->diffFromRef)) {
    return false;
  }
  // get mate dist
  if (!GetIntBamTag(aln, "XM", &aligned_read->matedist)) {
    aligned_read->matedist = 0;
  }
  // get STR seq
  if (!GetStringBamTag(aln, "XR", &aligned_read->repseq)) {
    stringstream msg;
    msg << aln.Name << " from group " << aligned_read->read_group << " Could not get repseq.";
    PrintMessageDieOnError(msg.str(), ERROR);
  }
  // get if stitched
  if (!GetIntBamTag(aln, "XX", &aligned_read->stitched)) {
    aligned_read->stitched = 0;
  }
  // get ref copy num
  if (!GetFloatBamTag(aln, "XC", &aligned_read->refCopyNum)) {
    stringstream msg;
    msg << aln.Name << " from group " << aligned_read->read_group << " Could not get reference copy number.";
    PrintMessageDieOnError(msg.str(), ERROR);
  }
  // get period
  aligned_read->period = aligned_read->repseq.length();
  if (include_flank) {  // diff is just sum of differences in cigar
    // rebuild the cigar in the CIGAR_LIST form expected by the helpers below
    CIGAR_LIST cigar_list;
    for (vector<BamTools::CigarOp>::const_iterator
           it = aligned_read->cigar_ops.begin();
         it != aligned_read->cigar_ops.end(); ++it) {
      CIGAR cig;
      cig.num = it->Length;
      cig.cigar_type = it->Type;
      cigar_list.cigars.push_back(cig);
    }
    // out-flags of GenerateCorrectCigar; not used afterwards, but given a
    // defined value before their addresses are handed over
    bool added_s = false;
    bool cigar_had_s = false;
    cigar_list.ResetString();
    GenerateCorrectCigar(&cigar_list, aln.QueryBases,
                         &added_s, &cigar_had_s);
    // overrides the XD-derived value read above
    aligned_read->diffFromRef = GetSTRAllele(cigar_list);
  }
  // apply filters
  if (unit) {
    // NOTE(review): period is repseq.length(); an empty XR value would make
    // this modulo divide by zero - confirm XR can never be empty
    if (aligned_read->diffFromRef % aligned_read->period != 0){
      filter_counter.increment(FilterCounter::NOT_UNIT);
      return false;
    }
  }
  if (abs(aligned_read->diffFromRef) > max_diff_ref) {
    filter_counter.increment(FilterCounter::DIFF_FROM_REF);
    return false;
  }
  // reads whose XQ value exceeds max_mapq are rejected
  if (aligned_read->mapq > max_mapq) {
    filter_counter.increment(FilterCounter::MAPPING_QUALITY);
    return false;
  }
  if (aligned_read->matedist > max_matedist) {
    filter_counter.increment(FilterCounter::MATE_DIST);
    return false;
  }
  // Check if the allele length is valid
  if (aligned_read->diffFromRef + (aligned_read->refCopyNum*aligned_read->period) < MIN_ALLELE_SIZE) {
    filter_counter.increment(FilterCounter::ALLELE_SIZE);
    return false;
  }

  // check that read sufficiently spans STR: it must start at least min_border
  // before msStart and end at least min_border after msEnd
  int max_read_start = aligned_read->msStart - min_border;
  int min_read_stop  = aligned_read->msEnd   + min_border;
  if (aln.Position > max_read_start || aln.GetEndPosition() < min_read_stop){
    filter_counter.increment(FilterCounter::SPANNING_AMOUNT);
    return false;
  }

  // check that both ends of the read contain sufficient perfect matches
  if (min_read_end_match > 0){
    map<pair<string,int>, string>::iterator loc_iter = ref_ext_nucleotides.find(pair<string,int>(aligned_read->chrom, aligned_read->msStart));
    if (loc_iter == ref_ext_nucleotides.end())
      PrintMessageDieOnError("No extended reference sequence found for locus", ERROR);
    string ref_ext_seq = loc_iter->second;
    pair<int,int> num_end_matches = AlignmentFilters::GetNumEndMatches(aligned_read, ref_ext_seq, aligned_read->msStart-extend);
    if (num_end_matches.first < min_read_end_match || num_end_matches.second < min_read_end_match){
      filter_counter.increment(FilterCounter::NUM_END_MATCHES);
      return false;
    }
  }

  // check that the prefix and suffix of the read match maximally compared to proximal reference locations
  if (maximal_end_match_window > 0){
    map<pair<string,int>, string>::iterator loc_iter = ref_ext_nucleotides.find(pair<string,int>(aligned_read->chrom, aligned_read->msStart));
    if (loc_iter == ref_ext_nucleotides.end())
      PrintMessageDieOnError("No extended reference sequence found for locus", ERROR);
    string ref_ext_seq = loc_iter->second;
    bool maximum_end_matches = AlignmentFilters::HasLargestEndMatches(aligned_read, ref_ext_seq, aligned_read->msStart-extend, maximal_end_match_window, maximal_end_match_window);
    if (!maximum_end_matches){
      filter_counter.increment(FilterCounter::NOT_MAXIMAL_END);
      return false;
    }
  }

  // check that both ends of the aligned read have sufficient bases before the first indel
  if (min_bp_before_indel > 0){
    pair<int, int> num_bps = AlignmentFilters::GetEndDistToIndel(aligned_read);
    // a value of -1 is exempt from the check on that side
    if (num_bps.first != -1 && num_bps.first < min_bp_before_indel){
      filter_counter.increment(FilterCounter::BP_BEFORE_INDEL);
      return false;
    }
    if (num_bps.second != -1 && num_bps.second < min_bp_before_indel){
      filter_counter.increment(FilterCounter::BP_BEFORE_INDEL);
      return false;
    }
  }
  filter_counter.increment(FilterCounter::UNFILTERED);
  return true;
}
Exemple #3
0
/**
 * Count a read against its contig's per-strand totals and, when it is on
 * target, assign it to a single best target region and update that region's
 * statistics.
 *
 * Every region in the list starting at m_rcovRegion that overlaps the read has
 * its 'overlaps' counter incremented. Among those, candidates are ranked by
 * 5' end distance (only when m_ampliconReads is set) and then by overlap
 * length; ties are collected in m_regionStack and one is picked using a
 * counter that advances on every call.
 *
 * @param aread   Alignment to count.
 * @param endPos  End position of the read; when 0 the end is taken from
 *                aread.GetEndPosition() instead.
 */
void AmpliconRegionStatistics::TrackReadsOnRegion( const BamTools::BamAlignment &aread, uint32_t endPos )
{
	// pseudo-random number generator 'seed' for resolving equivalent read assignments
	// (function-local static: one counter shared by every call, wraps as uint16_t)
	static uint16_t clockSeed = 0;
	// check/set first region read overlaps
	uint32_t readSrt = aread.Position + 1;
	uint32_t readEnd = endPos ? endPos : aread.GetEndPosition();
	uint32_t covType = ReadOnRegion( aread.RefID, readSrt, readEnd );
	// maintain base method of tracking total reads
	TargetContig *contig = m_contigList[m_rcovContigIdx];
	bool isRev = aread.IsReverseStrand();
	if( isRev ) {
		++contig->revReads;
	} else {
		++contig->fwdReads;
	}
	// Tracking of reads on target
	if( covType & 1 ) {
		// iterate over all regions overlapping read...
		int32_t bestEndDist = -m_maxUpstreamPrimerStart;
		int32_t bestOverlap = 0;
		uint32_t numBestRegions = 0;
		bool haveBestEnd = false;
		for( TargetRegion *cur = m_rcovRegion; cur; cur = cur->next ) {
			// stop at the first region starting beyond the read end
			if( readEnd < cur->trgSrt ) break;
			// skip regions that end before the read starts; this must test the
			// region being visited, not the head of the list
			if( readSrt > cur->trgEnd ) continue;
			// save stats for all overlapped reads
			++(GetStats(cur)->overlaps);
			// find most likely AmpliSeq primed region of those overlapped
			// NOTE: can still be wrong for regions starting very close together, given 5' digestion uncertainty,
			// coupled with read length and digestion uncertainty at 3'
			int32_t dSrt = readSrt - cur->trgSrt;
			int32_t dEnd = cur->trgEnd - readEnd;
			int32_t endDist5p = isRev ? dEnd : dSrt;
			// for non-amplicon reads, ends are ignored and only maximum overlap is employed to distinguish target region
			if( m_ampliconReads ) {
				// always select region that is closest start before 5p primer
				if( endDist5p < 0 && endDist5p > bestEndDist ) {
					haveBestEnd = true;
					bestEndDist = endDist5p;
					bestOverlap = 0; // force record best below
				} else if( haveBestEnd && endDist5p != bestEndDist ) {
					// region is not closer primed or same distance from false priming site
					continue;
				}
			}
			// save region based on max overlap for equivalent regions
			if( dSrt < 0 ) dSrt = 0;
			if( dEnd < 0 ) dEnd = 0;
			int32_t overlap = cur->trgEnd - cur->trgSrt - dSrt - dEnd; // +1
			if( overlap >= bestOverlap ) {
				// if overlaps also match then default to region starting most 3'
				// - cannot do better w/o knowing exact priming location, or possibly using ZA tag value
				if( overlap == bestOverlap ) {
					// stack multiple equivalent solutions
					if( numBestRegions >= m_regionStackSize ) {
						// safety code - only triggered if many targets overlapping read
						m_regionStackSize <<= 1;	// *2
						m_regionStack = (TargetRegion **)realloc(
							m_regionStack, m_regionStackSize * sizeof(TargetRegion *) );
					}
				} else {
					// save new best solution - these values are the same for all equivalent solutions
					bestOverlap = overlap;
					numBestRegions = 0;
				}
				m_regionStack[numBestRegions++] = cur;
			}
		}
		// pseudo-randomly choose best region of equivalent best regions
		// NOTE(review): relies on at least one region having been stacked whenever
		// covType & 1 is set; numBestRegions == 0 would divide by zero - confirm
		TargetRegion *bestRegion = m_regionStack[ clockSeed % numBestRegions ];
		bool e2e_or_cov;
		if( m_sigFacCoverage ) {
			// coverage mode: fraction of the target covered by the read
			int32_t trgLen = bestRegion->trgEnd - bestRegion->trgSrt + 1;
			e2e_or_cov = (double(bestOverlap+1)/trgLen >= m_sigFacCoverage);
		} else {
			// end-to-end mode: both read ends within m_maxE2eEndDist of the target ends
			int32_t dSrt = readSrt - bestRegion->trgSrt;
			int32_t dEnd = bestRegion->trgEnd - readEnd;
			if( dSrt < 0 ) dSrt = 0;
			if( dEnd < 0 ) dEnd = 0;
			e2e_or_cov = ((dSrt > dEnd ? dSrt : dEnd) <= m_maxE2eEndDist);
		}
		StatsData *stats = GetStats(bestRegion);
		if( isRev ) {
			++contig->revTrgReads;
			++stats->revReads;
			if( e2e_or_cov ) ++stats->rev_e2e;
		} else {
			++contig->fwdTrgReads;
			++stats->fwdReads;
			if( e2e_or_cov ) ++stats->fwd_e2e;
		}
	}
	++clockSeed;
}
Exemple #4
0
// Decide whether a read pair is a short-insert pair.
// When both reads map to the same vertex the recorded insert size is compared
// against opt::maxDistance; otherwise the graph is searched for a walk from
// the first read's vertex to the second's whose fragment is shorter than
// opt::maxDistance. Returns true if such evidence is found.
bool filterByGraph(StringGraph* pGraph, 
                   const BamTools::RefVector& referenceVector, 
                   BamTools::BamAlignment& record1, 
                   BamTools::BamAlignment& record2)
{
    // The reference names of the two alignments are used as vertex IDs
    std::string nameX = referenceVector[record1.RefID].RefName;
    std::string nameY = referenceVector[record2.RefID].RefName;

    Vertex* pX = pGraph->getVertex(nameX);
    Vertex* pY = pGraph->getVertex(nameY);

    // Both vertices must exist in the graph
    assert(pX != NULL && pY != NULL);

#ifdef DEBUG_CONNECT
    std::cout << "Finding path from " << nameX << " to " << nameY << "\n";
#endif

    // Same vertex: no search needed, the insert size decides
    if(pX == pY)
        return abs(record1.InsertSize) < opt::maxDistance;

    // Walk out of X / into Y in the sense direction unless the read is
    // aligned to the reverse strand, in which case the direction is flipped
    EdgeDir dirOutOfX = ED_SENSE;
    if(record1.IsReverseStrand())
        dirOutOfX = !dirOutOfX;

    EdgeDir dirIntoY = ED_SENSE;
    if(record2.IsReverseStrand())
        dirIntoY = !dirIntoY;

    int fromX = record1.Position;
    if(dirOutOfX != ED_SENSE)
        fromX = record1.GetEndPosition();

    int toY = record2.Position;
    if(dirIntoY != ED_SENSE)
        toY = record2.GetEndPosition();

    // Part of the fragment is already accounted for by contig X itself;
    // only the remainder has to be searched for in the graph
    int coveredX = dirOutOfX == ED_SENSE ? pX->getSeqLen() - fromX : fromX;
    int maxWalkDistance = opt::maxDistance - coveredX;

    SGWalkVector walks;
    SGSearch::findWalks(pX, pY, dirOutOfX, maxWalkDistance, 10000, true, walks);

    // The first walk yielding a short enough fragment settles the answer
    for(size_t idx = 0; idx < walks.size(); ++idx)
    {
        std::string fragment = walks[idx].getFragmentString(pX, pY, fromX, toY,
                                                            dirOutOfX, dirIntoY);
        if((int)fragment.size() < opt::maxDistance)
            return true;
    }

    return false;
}