示例#1
0
// Assigns a read to its best-fitting amplicon target and soft-clips everything
// that falls outside the target's insert (the target minus trim_left/trim_right).
//
// rai                   Read to process. align_start, align_end and old_cigar are
//                       always recorded from the untrimmed alignment. When trimming
//                       is enabled, alignment.Position and alignment.CigarData are
//                       rewritten and a "ZL" tag is added; rai->filtered is set to
//                       true when no target overlaps the read.
// unmerged_target_hint  Index into `unmerged` from which the target search starts.
//                       Any value is accepted; it is clamped into range before use.
//
// CIGAR handling follows the SAM definition of each operation:
//   M, =, X  consume read and reference bases,
//   D, N     consume reference bases only,
//   I, S     consume read bases only,
//   H, P     consume neither.
// Hard clips are not carried over into the rewritten CIGAR.
void TargetsManager::TrimAmpliseqPrimers(Alignment *rai, int unmerged_target_hint) const
{
  // set these before any trimming
  rai->align_start = rai->alignment.Position;
  rai->align_end = rai->alignment.GetEndPosition(false, true);
  rai->old_cigar = rai->alignment.CigarData;

  if (not trim_ampliseq_primers)
    return;

  // Step 1: Find the first potential target region

  int target_idx = unmerged_target_hint;

  // The hint is used as an index right away, so bring a stale, negative or
  // past-the-end value back into range first.
  if (target_idx >= (int)unmerged.size())
    target_idx = (int)unmerged.size() - 1;
  if (target_idx < 0)
    target_idx = 0;

  // Walk left while the read starts before the end of the hinted target ...
  while (target_idx and (rai->alignment.RefID < unmerged[target_idx].chr or
          (rai->alignment.RefID == unmerged[target_idx].chr and rai->alignment.Position < unmerged[target_idx].end)))
    --target_idx;

  // ... then walk right past every target that ends at or before the read start.
  while (target_idx < (int)unmerged.size() and (rai->alignment.RefID > unmerged[target_idx].chr or
          (rai->alignment.RefID == unmerged[target_idx].chr and rai->alignment.Position >= unmerged[target_idx].end)))
    ++target_idx;


  // Step 2: Iterate over potential target regions, evaluate fit, pick the best fit

  int best_target_idx = -1;
  int best_fit_penalty = 100;
  int best_overlap = 0;

  // NOTE(review): rai->end is not assigned in this function; it is presumably the
  // read's alignment end set by the caller - confirm.
  while (target_idx < (int)unmerged.size() and rai->alignment.RefID == unmerged[target_idx].chr and rai->end >= unmerged[target_idx].begin) {

    int read_start = rai->alignment.Position;
    int read_end = rai->end;
    int read_prefix_size = unmerged[target_idx].begin - read_start;   // > 0: read starts left of the target
    int read_postfix_size = read_end - unmerged[target_idx].end;      // > 0: read ends right of the target
    int overlap = min(unmerged[target_idx].end, read_end) - max(unmerged[target_idx].begin, read_start);
    int fit_penalty = 100;

    // The penalty is driven by the read's 5' end: the prefix for forward reads,
    // the postfix for reverse reads. Starting inside the target costs three
    // times as much per base as overhanging it; both terms are capped at 50.
    if (not rai->alignment.IsReverseStrand()) {
      if (read_prefix_size > 0)
        fit_penalty = min(read_prefix_size,50) + max(0,50-overlap);
      else
        fit_penalty = min(-3*read_prefix_size,50) + max(0,50-overlap);
    } else {
      if (read_postfix_size > 0)
        fit_penalty = min(read_postfix_size,50) + max(0,50-overlap);
      else
        fit_penalty = min(-3*read_postfix_size,50) + max(0,50-overlap);
    }
    // Bonus for reads that span the target on both sides
    if (read_prefix_size > 0 and read_postfix_size > 0)
      fit_penalty -= 10;

    // Lower penalty wins; ties are broken by the larger overlap
    if ((best_fit_penalty > fit_penalty and overlap > 0) or (best_fit_penalty == fit_penalty and overlap > best_overlap)) {
      best_fit_penalty = fit_penalty;
      best_target_idx = target_idx;
      best_overlap = overlap;
    }

    ++target_idx;
  }

  if (best_target_idx == -1) {
    rai->filtered = true;
    return;
  }


  // Step 3: Do the actual primer trimming.
  //
  // For now, only adjust Position and Cigar.
  // Later, also adjust MD tag.
  // Even later, ensure the reads stay sorted, so no extra sorting is required outside of tvc

  vector<CigarOp>& old_cigar = rai->alignment.CigarData;
  vector<CigarOp> new_cigar;
  new_cigar.reserve(old_cigar.size() + 2);
  vector<CigarOp>::iterator old_op = old_cigar.begin();
  int ref_pos = rai->alignment.Position;

  // Insert boundaries [begin, end), clamped so they can neither leave the target nor cross

  int begin = unmerged[best_target_idx].begin + unmerged[best_target_idx].trim_left;
  if (begin > unmerged[best_target_idx].end)
    begin = unmerged[best_target_idx].end;

  int end = unmerged[best_target_idx].end - unmerged[best_target_idx].trim_right;
  if (end <= begin)
    end = begin;


  // 3A: Cigar ops left of the target
  //
  // Read bases in front of `begin` are collected into a single leading soft clip.
  // Ops that are only partially consumed are shortened in place; the remainder
  // is picked up by 3B.

  while (old_op != old_cigar.end() and ref_pos <= begin) {
    const char type = old_op->Type;

    if (type == 'H') {
      ++old_op;
      continue;
    }

    if (type == 'S' or type == 'I') {
      if (new_cigar.empty())
        new_cigar.push_back(CigarOp('S'));
      new_cigar.back().Length += old_op->Length;
      ++old_op;
      continue;
    }

    unsigned int gap = begin - ref_pos;
    if (gap == 0)
      break;

    if (type == 'M' or type == '=' or type == 'X') {
      if (new_cigar.empty())
        new_cigar.push_back(CigarOp('S'));
      if (old_op->Length > gap) {
        new_cigar.back().Length += gap;
        old_op->Length -= gap;
        ref_pos += gap;
        break;
      } else {
        new_cigar.back().Length += old_op->Length;
        ref_pos += old_op->Length;
        ++old_op;
        continue;
      }
    }

    // Reference-only ops advance the position but carry no read bases,
    // so they must not contribute to the soft clip length.
    if (type == 'D' or type == 'N') {
      if (old_op->Length > gap) {
        old_op->Length -= gap;
        ref_pos += gap;
        break;
      } else {
        ref_pos += old_op->Length;
        ++old_op;
        continue;
      }
    }

    // Anything else (e.g. padding) consumes neither read nor reference:
    // drop it and move on so the loop always makes progress.
    ++old_op;
  }


  // 3B: Cigar ops in the middle of the target

  rai->alignment.Position = ref_pos;

  while (old_op != old_cigar.end() and ref_pos < end) {
    const char type = old_op->Type;

    if (type == 'H') {
      ++old_op;
      continue;
    }

    const bool consumes_reference = (type == 'M' or type == '=' or type == 'X' or type == 'D' or type == 'N');

    // Ops that do not move along the reference are kept verbatim
    if (not consumes_reference) {
      new_cigar.push_back(*old_op);
      ++old_op;
      continue;
    }

    // Keep as much of the op as fits in front of `end`; the remainder (if any)
    // is left in old_op for 3C.
    unsigned int gap = end - ref_pos;
    new_cigar.push_back(CigarOp(type));
    if (old_op->Length > gap) {
      new_cigar.back().Length = gap;
      old_op->Length -= gap;
      ref_pos += gap;
      break;
    }
    new_cigar.back().Length = old_op->Length;
    ref_pos += old_op->Length;
    ++old_op;
  }


  // 3C: Cigar ops to the right of the target
  //
  // Every remaining read base goes into one trailing soft clip. Ops without
  // read bases are skipped so that the clip length matches the read.

  for (; old_op != old_cigar.end(); ++old_op) {
    const char type = old_op->Type;
    if (type == 'H' or type == 'D' or type == 'N' or type == 'P')
      continue;

    if (new_cigar.empty() or new_cigar.back().Type != 'S')
      new_cigar.push_back(CigarOp('S'));
    new_cigar.back().Length += old_op->Length;
  }

  rai->alignment.CigarData.swap(new_cigar);


  // Debugging info: <target name>:<fit penalty>:<overlap>

  stringstream ZL;
  ZL << unmerged[best_target_idx].name << ":" <<  best_fit_penalty << ":" << best_overlap;

  rai->alignment.AddTag("ZL", "Z", ZL.str());


}
示例#2
0
// Attempts to left-realign all the indels represented by the alignment cigar.
//
// This is done by shifting all indels as far left as they can go without
// mismatch, then merging neighboring indels of the same class.  leftAlign
// updates the alignment cigar with changes, and returns true if realignment
// changed the alignment cigar.
//
// To left-align, we move multi-base indels left by their own length as long as
// the preceding bases match the inserted or deleted sequence.  After this
// step, we handle multi-base homopolymer indels by shifting them one base to
// the left until they mismatch the reference.
//
// To merge neighboring indels, we iterate through the set of left-stabilized
// indels.  For each indel we add a new cigar element to the new cigar.  If a
// deletion follows a deletion, or an insertion occurs at the same place as
// another insertion, we merge the events by extending the previous cigar
// element.
//
// In practice, we must call this function until the alignment is stabilized.
//
// Parameters:
//   alignment          read whose CigarData is rewritten in place; QueryBases is
//                      only read.
//   referenceSequence  reference bases, indexed from the alignment start
//                      (position 0 corresponds to the first aligned base).
//
// Returns false without touching the alignment when the cigar has no indels.
// Otherwise the cigar is rebuilt from soft clips, matches and indels only and
// the return value tells whether its textual form changed.
//
// Limitations visible in the rebuild: hard clips are not re-emitted and skipped
// regions (N) come back as part of the surrounding match.
//
// Terminates the process with exit(1) if an indel ends up left of its
// predecessor.
//
bool leftAlign(BamAlignment& alignment, string& referenceSequence) {

	// store information about the indels
	vector<FBIndelAllele> indels;

	int rp = 0;  // read position, 0-based relative to read
	int sp = 0;  // sequence position, 0-based relative to referenceSequence

	// Only the lengths of the clipped ends are needed to rebuild the cigar.
	unsigned int softBeginLength = 0;
	unsigned int softEndLength = 0;

	stringstream cigar_before, cigar_after;
	for (vector<CigarOp>::const_iterator c = alignment.CigarData.begin();
		 c != alignment.CigarData.end(); ++c) {
		const unsigned int l = c->Length;
		const char t = c->Type;
		cigar_before << l << t;
		if (t == 'M') { // match or mismatch
			sp += l;
			rp += l;
		} else if (t == 'D') { // deletion
			indels.push_back(FBIndelAllele(false, l, sp, rp, referenceSequence.substr(sp, l)));
			sp += l;  // update reference sequence position
		} else if (t == 'I') { // insertion
			indels.push_back(FBIndelAllele(true, l, sp, rp, alignment.QueryBases.substr(rp, l)));
			rp += l;
		} else if (t == 'S') { // soft clip, clipped sequence present in the read not matching the reference
			// a clip seen before any read base is the leading one, any other is trailing
			if (rp == 0) {
				softBeginLength = l;
			} else {
				softEndLength = l;
			}
			rp += l;
		} else if (t == 'N') { // skipped region in the reference not present in read, aka splice
			sp += l;
		}
		// hard clips ('H') are not present in the read and move neither position
	}


	const int alignedLength = sp;

	// if no indels, return the alignment
	if (indels.empty()) { return false; }

	// for each indel, from left to right
	//     while the indel sequence repeated to the left and we're not matched up with the left-previous indel
	//         move the indel left

	vector<FBIndelAllele>::iterator previous = indels.begin();
	for (vector<FBIndelAllele>::iterator id = indels.begin(); id != indels.end(); ++id) {

		// left shift by repeats
		//
		// from 1 base to the length of the indel, attempt to shift left
		// if the move would cause no change in alignment optimality (no
		// introduction of mismatches, and by definition no change in gap
		// length), move to the new position.
		// in practice this moves the indel left when we reach the size of
		// the repeat unit.
		//
		FBIndelAllele& indel = *id;
		int i = 1;
		while (i <= indel.length) {

			int steppos = indel.position - i;
			int readsteppos = indel.readPosition - i;

			while (steppos >= 0 && readsteppos >= 0
				   && indel.sequence == referenceSequence.substr(steppos, indel.length)
				   && indel.sequence == alignment.QueryBases.substr(readsteppos, indel.length)
				   && (id == indels.begin()
					   || (previous->insertion && steppos >= previous->position)
					   || (!previous->insertion && steppos >= previous->position + previous->length))) {
				indel.position -= i;
				indel.readPosition -= i;
				steppos = indel.position - i;
				readsteppos = indel.readPosition - i;
			}
			// only step sizes that divide the indel length can be repeat units
			do {
				++i;
			} while (i <= indel.length && indel.length % i != 0);
		}

		// left shift indels with exchangeable flanking sequence
		//
		// for example:
		//
		//    GTTACGTT           GTTACGTT
		//    GT-----T   ---->   G-----TT
		//
		// GTGTGACGTGT           GTGTGACGTGT
		// GTGTG-----T   ---->   GTG-----TGT
		//
		// GTGTG-----T           GTG-----TGT
		// GTGTGACGTGT   ---->   GTGTGACGTGT
		//
		//
		int steppos = indel.position - 1;
		int readsteppos = indel.readPosition - 1;
		while (steppos >= 0 && readsteppos >= 0
			   && alignment.QueryBases.at(readsteppos) == referenceSequence.at(steppos)
			   && alignment.QueryBases.at(readsteppos) == indel.sequence.at(indel.sequence.size() - 1)
			   && (id == indels.begin()
				   || (previous->insertion && indel.position - 1 >= previous->position)
				   || (!previous->insertion && indel.position - 1 >= previous->position + previous->length))) {
			// rotate the indel sequence right by one base to match its new position
			indel.sequence = indel.sequence.at(indel.sequence.size() - 1) + indel.sequence.substr(0, indel.sequence.size() - 1);
			indel.position -= 1;
			indel.readPosition -= 1;
			steppos = indel.position - 1;
			readsteppos = indel.readPosition - 1;
		}
		// tracks previous indel, so we don't run into it with the next shift
		previous = id;
	}

	// bring together floating indels
	// from left to right
	// check if we could merge with the next indel
	// if so, adjust so that we will merge in the next step
	if (indels.size() > 1) {
		previous = indels.begin();
		for (vector<FBIndelAllele>::iterator id = (indels.begin() + 1); id != indels.end(); ++id) {
			FBIndelAllele& indel = *id;
			// parsimony: could we shift right and merge with the previous indel?
			// if so, do it
			//
			// first base after the previous indel, on the reference and on the read:
			// an insertion occupies read bases only, a deletion reference bases only
			const int prev_end_ref = previous->insertion ? previous->position : previous->position + previous->length;
			const int prev_end_read = !previous->insertion ? previous->readPosition : previous->readPosition + previous->length;

			// same indel class, with at least one aligned base between the two on
			// both the reference and the read
			if (previous->insertion == indel.insertion
				&& prev_end_ref < indel.position
				&& prev_end_read < indel.readPosition) {
				if (previous->homopolymer()) {
					// the bases separating the two indels, as seen by reference and read
					string seq = referenceSequence.substr(prev_end_ref, indel.position - prev_end_ref);
					string readseq = alignment.QueryBases.substr(prev_end_read, indel.position - prev_end_ref);
					if (previous->sequence.at(0) == seq.at(0)
						&& FBhomopolymer(seq)
						&& FBhomopolymer(readseq)) {
						// slide the previous indel right until it abuts this one
						previous->position = indel.insertion ? indel.position : indel.position - previous->length;
					}
				}
				else {
					int pos = previous->position;
					while (pos < (int) referenceSequence.length() &&
						   ((previous->insertion && pos + previous->length <= indel.position)
							||
							(!previous->insertion && pos + previous->length < indel.position))
						   && previous->sequence
						   == referenceSequence.substr(pos + previous->length, previous->length)) {
						pos += previous->length;
					}
					// NOTE(review): pos starts at previous->position and only grows, so
					// this test can never succeed and non-homopolymer indels are never
					// shifted right. Left as is on purpose: the insertion arithmetic
					// above looks unverified, so enabling the branch needs its own review.
					if (pos < previous->position &&
						((previous->insertion && pos + previous->length == indel.position)
						 ||
						 (!previous->insertion && pos == indel.position - previous->length))
						) {
						previous->position = pos;
					}
				}
			}
			previous = id;
		}
	}

	// for each indel
	//     if ( we're matched up to the previous insertion (or deletion)
	//          and it's also an insertion or deletion )
	//         merge the indels
	//
	// and simultaneously reconstruct the cigar

	vector<CigarOp> newCigar;

	if (softBeginLength > 0) {
		newCigar.push_back(CigarOp('S', softBeginLength));
	}

	vector<FBIndelAllele>::iterator id = indels.begin();
	FBIndelAllele last = *id++;
	if (last.position > 0) {
		newCigar.push_back(CigarOp('M', last.position));
	}
	newCigar.push_back(CigarOp((last.insertion ? 'I' : 'D'), last.length));
	// reference position right after the most recently emitted indel
	int lastend = last.insertion ? last.position : (last.position + last.length);

	for (; id != indels.end(); ++id) {
		FBIndelAllele& indel = *id;
		if (indel.position < lastend) {
			cerr << "impossibility?: indel realigned left of another indel" << endl << alignment.Name
				 << " " << alignment.Position << endl << alignment.QueryBases << endl;
			exit(1);
		} else if (indel.position == lastend && indel.insertion == last.insertion) {
			// abutting indel of the same class: extend the op emitted last
			CigarOp& op = newCigar.back();
			op.Length += indel.length;
		} else {
			// abutting indels of different class get no match between them;
			// a zero-length M must not be written
			if (indel.position > lastend) {
				newCigar.push_back(CigarOp('M', indel.position - lastend));
			}
			newCigar.push_back(CigarOp((indel.insertion ? 'I' : 'D'), indel.length));
		}
		last = *id;
		lastend = last.insertion ? last.position : (last.position + last.length);
	}

	if (lastend < alignedLength) {
		newCigar.push_back(CigarOp('M', alignedLength - lastend));
	}

	if (softEndLength > 0) {
		newCigar.push_back(CigarOp('S', softEndLength));
	}

	alignment.CigarData = newCigar;

	for (vector<CigarOp>::const_iterator c = alignment.CigarData.begin();
		 c != alignment.CigarData.end(); ++c) {
		cigar_after << c->Length << c->Type;
	}

	// check if we're realigned
	return cigar_after.str() != cigar_before.str();

}