Пример #1
// Checks the reference area around variantPos for a multi-nucleotide repeat (MNR) and its span.
// Logic: When shifting a window of the same period as the MNR, the base entering the window has to be equal to the base leaving the window.
// example with period 2: XYZACACA|CA|CACAIJK
//
// Parameters:
//   seq_context - local reference context; position0, my_hp_start_pos, right_hp_start and
//                 right_hp_length are read here
//   rep_period  - repeat period to test; also the size of the comparison window
//   ref_reader  - reference accessor, queried through chr_size() and base()
//   chr_idx     - chromosome index handed to ref_reader
// Returns false when variantPos + rep_period reaches the chromosome size. Both a `return (true)`
// and a `return (false)` are visible at the end of the listing.
// Side effects: assigns start_window and end_window (not declared locally; presumably members).
// NOTE(review): this listing looks truncated - the bodies of both while loops and all closing
// braces are not visible - so the exact control flow must be confirmed against the full source.
bool AlleleIdentity::IdentifyMultiNucRepeatSection(const LocalReferenceContext &seq_context, unsigned int rep_period,
    const ReferenceReader &ref_reader, int chr_idx) {

  //cout << "Hello from IdentifyMultiNucRepeatSection with period " << rep_period << "!"<< endl;
  // Position the scan starts from: seq_context.position0 offset by left_anchor.
  unsigned int variantPos = seq_context.position0 + left_anchor;
  // Bail out if one full window starting at variantPos would not fit before the chromosome end.
  if (variantPos + rep_period >= (unsigned long)ref_reader.chr_size(chr_idx))
    return (false);

  // Fill the window with the rep_period reference bases starting at variantPos.
  CircluarBuffer<char> window(rep_period);
  for (unsigned int idx = 0; idx < rep_period; idx++)
    window.assign(idx, ref_reader.base(chr_idx,variantPos+idx));

  // Investigate (inclusive) start position of MNR region
  start_window = variantPos - 1; // 1 anchor base
  // NOTE(review): the loop body (presumably moving start_window and shifting the window) is not visible here.
  while (start_window > 0 and window.first() == ref_reader.base(chr_idx,start_window)) {

  // Investigate (exclusive) end position of MNR region
  end_window = variantPos + rep_period;
  if (end_window >= ref_reader.chr_size(chr_idx))
    return false;
  // Re-load the window with the bases at variantPos before scanning to the right.
  for (unsigned int idx = 0; idx < rep_period; idx++)
    window.assign(idx, ref_reader.base(chr_idx,variantPos+idx));
  // NOTE(review): the loop body (presumably moving end_window and shifting the window) is not visible here.
  while (end_window < ref_reader.chr_size(chr_idx) and window.last() == ref_reader.base(chr_idx,end_window)) {

  //cout << "Found repeat stretch of length: " << (end_window - start_window) << endl;
  // Require that a stretch of at least 3*rep_period has to be found to count as a MNR
  if ((end_window - start_window) >= (3*(int)rep_period)) {

    // Correct start and end of the window if they are not fully outside variant allele
    if (start_window >= seq_context.position0)
        start_window = seq_context.my_hp_start_pos[0] - 1;
    if (end_window <= seq_context.right_hp_start) {
      // NOTE(review): as listed, the second assignment always overrides the insertion case;
      // an `else` line is probably missing between the two assignments - confirm.
      if (status.isInsertion)
        end_window = seq_context.right_hp_start + seq_context.right_hp_length + 1;
        end_window = seq_context.right_hp_start + 1;
    // Clamp the window to the valid range [0, chromosome size].
    if (start_window < 0)
      start_window = 0;
    if (end_window > ref_reader.chr_size(chr_idx))
      end_window = ref_reader.chr_size(chr_idx);
    return (true);
    // NOTE(review): presumably the fall-through for "stretch too short"; the separating braces/else are not visible.
    return (false);
Пример #2
// Identify some special motifs.
// Scans the reference near `position` for a run of `base`: first to the left, starting at
// position-2, then to the right, starting at position+1. A hit is recorded in status.isDyslexic.
//
// Parameters:
//   base       - base that the reference bases are compared against
//   position   - reference position the scan is centered on
//   ref_reader - reference accessor, queried through base() and chr_size()
//   chr_idx    - chromosome index handed to ref_reader
// Returns status.isDyslexic (reset to false on entry).
// NOTE(review): the listing looks truncated - the updates of my_hp_length, hp_distance and
// test_position as well as the closing braces are not visible; confirm against the full source.
bool AlleleIdentity::IdentifyDyslexicMotive(char base, int position,
    const ReferenceReader &ref_reader, int chr_idx) {

  status.isDyslexic = false;
  long  test_position = position-2;

  // hp_distance is bounded by max_hp_distance in both scan loops below.
  unsigned int max_hp_distance = 4;
  unsigned int hp_distance = 0;
  unsigned int my_hp_length = 0;

  // Test left vicinity of insertion
  while (!status.isDyslexic and test_position>0 and hp_distance < max_hp_distance) {
    // A base that differs from its left neighbor resets the run length.
    if (ref_reader.base(chr_idx,test_position) != ref_reader.base(chr_idx,test_position-1)) {
      my_hp_length = 0;
    else if (ref_reader.base(chr_idx,test_position) == base) {
      if(my_hp_length >= 2) {  // trigger when a 3mer or more is found
    	  status.isDyslexic = true;
  // No need to look to the right if the left side already triggered.
  if (status.isDyslexic) return (true);

  // test right vicinity of insertion
  hp_distance = 0;
  my_hp_length = 0;
  test_position = position+1;

  while (!status.isDyslexic and test_position<ref_reader.chr_size(chr_idx) and hp_distance < max_hp_distance) {
    // Same run-length test as on the left side.
    if (ref_reader.base(chr_idx,test_position) != ref_reader.base(chr_idx,test_position-1)) {
      my_hp_length = 0;
    else if (ref_reader.base(chr_idx,test_position) == base) {
      if(my_hp_length >= 2) {  // trigger when a 3mer or more is found
    	  status.isDyslexic = true;
  return status.isDyslexic;
Пример #3
// Builds the read hypotheses for one read by walking its pretty alignment and splicing the
// alleles of the candidate into the read sequence.
// Hypothesis layout: index 0 is the read as called, index 1 the reference hypothesis,
// indices >= 2 the variant hypotheses.
//
// Values written through reference parameters:
//   splice_start_flow       - reset to -1, later passed to GetSpliceFlows()
//   splice_end_flow         - reset to -1, later assigned the result of GetSpliceFlows()
//   my_hypotheses           - entry 0 is taken from the read bases; if splicing fails every other
//                             entry becomes a copy of entry 0
//   same_as_null_hypothesis - resized to my_hypotheses.size() and set to false
//   changed_alignment       - reset to false; handed to SpliceDoRealignement() when
//                             my_ensemble.doRealignment is set
// Returns did_splicing.
// NOTE(review): the listing looks truncated - the opening brace of the function, several `else`
// lines, loop bodies and closing braces are not visible; confirm control flow against the full source.
bool SpliceVariantHypotheses(const Alignment &current_read, const EnsembleEval &my_ensemble,
                        const LocalReferenceContext &local_context, PersistingThreadObjects &thread_objects,
                        int &splice_start_flow, int &splice_end_flow, vector<string> &my_hypotheses,
                        vector<bool> & same_as_null_hypothesis, bool & changed_alignment, const InputStructures &global_context,
                        const ReferenceReader &ref_reader, int chr_idx)

  // Hypotheses: 1) Null; read as called 2) Reference Hypothesis 3-?) Variant Hypotheses
  same_as_null_hypothesis.assign(my_hypotheses.size(), false);

  // Set up variables to log the flows we splice into
  splice_start_flow = -1;
  splice_end_flow = -1;
  int splice_start_idx = -1;
  vector<int> splice_end_idx;
  splice_end_idx.assign(my_hypotheses.size(), -1);

  // 1) Null hypothesis is read as called
  // NOTE(review): as listed, the second assignment always overrides the first; an `else` line is
  // probably missing between them - confirm.
  if (global_context.resolve_clipped_bases) {
    unsigned int null_hyp_length = current_read.read_bases.length() - current_read.left_sc - current_read.right_sc;
    my_hypotheses[0] = current_read.read_bases.substr(current_read.start_sc, null_hyp_length);
    my_hypotheses[0] = current_read.read_bases;

  // Initialize hypotheses variables for splicing
  for (unsigned int i_hyp = 1; i_hyp < my_hypotheses.size(); i_hyp++) {
    my_hypotheses[i_hyp].reserve(current_read.alignment.QueryBases.length() + 20 + local_context.reference_allele.length());
    // Add soft clipped bases on the left side of alignment if desired
    if (!global_context.resolve_clipped_bases)
      my_hypotheses[i_hyp] += current_read.alignment.QueryBases.substr(0, current_read.left_sc);

  // Running indices into the read and the reference while walking the pretty alignment.
  int read_idx = current_read.left_sc;
  int ref_idx  = current_read.alignment.Position;
  int read_idx_max = current_read.alignment.QueryBases.length() - current_read.right_sc;
  bool did_splicing = false;
  bool just_did_splicing = false;
  string pretty_alignment;
  changed_alignment = false;

  // do realignment of a small region around variant if desired
  if (my_ensemble.doRealignment) {
    pretty_alignment = SpliceDoRealignement(thread_objects, current_read, local_context.position0,
                                            changed_alignment, global_context.DEBUG, ref_reader, chr_idx);
    if (pretty_alignment.empty() and global_context.DEBUG > 0)
      cerr << "Realignment returned an empty string in read " << current_read.alignment.Name << endl;

  // Fall back to the original alignment when no realigned string is available.
  if (pretty_alignment.empty()) {
    pretty_alignment = current_read.pretty_aln;
    changed_alignment = false;

  // Now fill in 2) and 3)

  for (unsigned int pretty_idx = 0; pretty_idx < pretty_alignment.length(); pretty_idx++) {

    // Position flags relative to the multi-allele window and to the span of the reference allele.
    bool outside_of_window = ref_idx < my_ensemble.multiallele_window_start or ref_idx >= my_ensemble.multiallele_window_end;
    bool outside_ref_allele = (long)ref_idx < local_context.position0 or ref_idx >= (int)(local_context.position0 + local_context.reference_allele.length());

    // Basic sanity checks
    // NOTE(review): only `did_splicing = false` is visible in this branch; a loop exit may be missing.
    if (read_idx >= read_idx_max
        or  ref_idx > ref_reader.chr_size(chr_idx)
        or (ref_idx == ref_reader.chr_size(chr_idx) and pretty_alignment[pretty_idx] != '+')) {
      did_splicing = false;

    // --- Splice ---
    if (ref_idx == local_context.position0 and !did_splicing and !outside_of_window) {
      // Add insertions before variant window
      // NOTE(review): the bodies of the while loop and of the for loop below are not visible.
      while (pretty_idx < pretty_alignment.length() and pretty_alignment[pretty_idx] == '+') {
    	for (unsigned int i_hyp = 1; i_hyp < my_hypotheses.size(); i_hyp++)
      did_splicing = SpliceAddVariantAlleles(current_read, pretty_alignment, my_ensemble,
    		                    local_context, my_hypotheses, pretty_idx, global_context.DEBUG);
      just_did_splicing = did_splicing;
    } // --- ---

    // Have reference bases inside of window but outside of span of reference allele
    // NOTE(review): the statement executed by this for loop is not visible.
    if (outside_ref_allele and !outside_of_window and pretty_alignment[pretty_idx] != '+') {
      for (unsigned int i_hyp = 1; i_hyp < my_hypotheses.size(); i_hyp++)

    // Have read bases as called outside of variant window
    // NOTE(review): the statement executed by this for loop is not visible.
    if (outside_of_window and pretty_alignment[pretty_idx] != '-') {
      for (unsigned int i_hyp = 1; i_hyp < my_hypotheses.size(); i_hyp++)

      // --- Information to log flows. Indices are w.r.t. aligned portion of the read
      if (!did_splicing) { // Log index of the last base left of window which is the same for all hypotheses.
        splice_start_idx = read_idx - current_read.left_sc;
      else if (just_did_splicing) { // Log length of hypothesis after splicing
    	splice_end_idx[0] = read_idx  - current_read.left_sc;
    	// Left soft-clipped bases are part of the hypotheses only when clipped bases are not resolved.
    	int clipped_bases = 0;
    	if (!global_context.resolve_clipped_bases)
    	  clipped_bases = current_read.left_sc;
        for (unsigned int i_hyp=1; i_hyp<my_hypotheses.size(); i_hyp++)
          splice_end_idx[i_hyp] = my_hypotheses[i_hyp].length()-1 - clipped_bases; // Hyp length depends on whether there is resolving!
        just_did_splicing = false;
      // --- ---

    // Advance ref_idx / read_idx according to the current alignment symbol.
    IncrementAlignmentIndices(pretty_alignment[pretty_idx], ref_idx, read_idx);

  } // end of for loop over extended pretty alignment

  // Check whether the whole reference allele fit
  // It seems that with primer trimming in TVC, many reads throw this warning
  if (ref_idx < (int)(local_context.position0 + local_context.reference_allele.length())) {
    did_splicing = false;
    if (global_context.DEBUG>0)
      cout << "Warning in Splicing: Reference allele "<< local_context.reference_allele << " did not fit into read " << current_read.alignment.Name << endl;

  if (did_splicing) {
    // --- Add soft clipped bases to the right of the alignment and reverse complement ---
    for (unsigned int i_hyp = 1; i_hyp<my_hypotheses.size(); i_hyp++) {
      if (!global_context.resolve_clipped_bases)
        my_hypotheses[i_hyp] += current_read.alignment.QueryBases.substr(current_read.alignment.QueryBases.length()-current_read.right_sc, current_read.right_sc);

      // NOTE(review): the reverse-complement statement guarded by this condition is not visible.
      if (current_read.is_reverse_strand)

    // Get the main flows before and after splicing
    splice_end_flow = GetSpliceFlows(current_read, global_context, my_hypotheses, same_as_null_hypothesis,
                                     splice_start_idx, splice_end_idx, splice_start_flow);
    // Reject the splice when the flow interval is empty or the start flow was never set.
    if (splice_start_flow < 0 or splice_end_flow <= splice_start_flow) {
      did_splicing = false;
      cout << "Warning in Splicing: Splice flows are not valid in read " << current_read.alignment.Name
           << ". splice start flow: "<< splice_start_flow << " splice end flow " << splice_end_flow << endl;

  // Check for non-ACGT bases in hypotheses
  // NOTE(review): as listed, valid_bases is cleared when the base IS one of A/C/G/T, which
  // contradicts the error message below, and iBase is never advanced; an `iBase++` / `else`
  // pair is probably missing between the condition and the assignment - confirm.
  bool valid_bases = true;
  for (unsigned int i_hyp=0; i_hyp<my_hypotheses.size(); i_hyp++) {
	unsigned int iBase = 0;
	while (iBase<my_hypotheses[i_hyp].length() and valid_bases){
      if (my_hypotheses[i_hyp].at(iBase) == 'A' or my_hypotheses[i_hyp].at(iBase) == 'C' or
          my_hypotheses[i_hyp].at(iBase) == 'G' or my_hypotheses[i_hyp].at(iBase) == 'T')
        valid_bases = false;
  if (not valid_bases){
    cerr << "Non-Fatal ERROR in Splicing for " << local_context.contigName << ":" << local_context.position0+1
         << ": Read Hypotheses for " << current_read.alignment.Name << " contain non-ACGT characters." << endl;
    did_splicing = false;

  // --- Fail safe for hypotheses and verbose
  if (!did_splicing) {
	// Make every hypothesis identical to the read as called.
	for (unsigned int i_hyp=1; i_hyp<my_hypotheses.size(); i_hyp++)
      my_hypotheses[i_hyp] = my_hypotheses[0];
    if (global_context.DEBUG > 1) {
      cout << "Failed to splice " << local_context.reference_allele << "->";
      for (unsigned int i_alt = 0; i_alt<my_ensemble.allele_identity_vector.size(); i_alt++) {
    	cout << my_ensemble.allele_identity_vector[i_alt].altAllele;
        if (i_alt < my_ensemble.allele_identity_vector.size()-1)
          cout << ",";
      cout << " into read " << current_read.alignment.Name << endl;
  else if (global_context.DEBUG > 1) {
	// Verbose report of a successful splice: alleles, strand, all hypotheses and the splice flows.
	cout << "Spliced " << local_context.reference_allele << "->";
    for (unsigned int i_alt = 0; i_alt<my_ensemble.allele_identity_vector.size(); i_alt++) {
      cout << my_ensemble.allele_identity_vector[i_alt].altAllele;
      if (i_alt < my_ensemble.allele_identity_vector.size()-1)
        cout << ",";
    cout << " into ";
    if (current_read.is_reverse_strand) cout << "reverse ";
    else cout << "forward ";
    cout <<	"strand read read " << current_read.alignment.Name << endl;
    cout << "- Read as called: " << my_hypotheses[0] << endl;
    cout << "- Reference Hyp.: " << my_hypotheses[1] << endl;
    for (unsigned int i_hyp = 2; i_hyp<my_hypotheses.size(); i_hyp++)
      cout << "- Variant Hyp. " << (i_hyp-1) << ": " << my_hypotheses[i_hyp] << endl;
    cout << "- Splice start flow: " << splice_start_flow << " Splice end flow: " << splice_end_flow << endl;

  return did_splicing;
Пример #4
// Realigns a short stretch of the read around variant_position and returns the read's pretty
// alignment string with the realigned stretch fused back in.
//
// Parameters:
//   thread_objects    - provides the realigner that is configured and run here
//   current_read      - read whose pretty_aln, QueryBases and soft-clip lengths are used
//   variant_position  - reference position the realignment window is built around
//   changed_alignment - set to true when the realigned stretch differs from the old one, false
//                       when it is identical; not assigned on the early-return paths visible here
//   DEBUG             - verbosity; values > 1 enable the printouts
//   ref_reader        - reference accessor, queried through chr_size(), base() and substr()
//   chr_idx           - chromosome index handed to ref_reader
// Returns an empty string when the variant position lies outside the usable alignment, when the
// window has zero size, or when computeSWalignment() reports failure.
// NOTE(review): the listing looks truncated - closing braces, `else` lines and the updates of
// pretty_idx / pretty_left / pretty_right are not visible; confirm against the full source.
string SpliceDoRealignement (PersistingThreadObjects &thread_objects, const Alignment &current_read, long variant_position,
		                     bool &changed_alignment, int DEBUG, const ReferenceReader &ref_reader, int chr_idx) {

  // We do not allow any clipping since we align a short substring
  thread_objects.realigner.SetClipping(0, true);
  string new_alignment;

  // --- Get index positions at snp variant position
  int read_idx = current_read.left_sc;
  int ref_idx  = current_read.alignment.Position;
  unsigned int pretty_idx = 0;

  // Walk the alignment until the reference index reaches the variant position.
  // NOTE(review): no increment of pretty_idx is visible inside this loop.
  while (pretty_idx < current_read.pretty_aln.length() and ref_idx < variant_position) {
    IncrementAlignmentIndices(current_read.pretty_aln[pretty_idx], ref_idx, read_idx);
  if (DEBUG > 1)
    cout << "Computed variant position as (red, ref, pretty) " << read_idx << " " << ref_idx << " " << pretty_idx << endl;

  // Give up (empty result) if any index ran past its sequence.
  if (pretty_idx >= current_read.pretty_aln.length()
       or ref_idx  >= ref_reader.chr_size(chr_idx)
       or read_idx >= (int)current_read.alignment.QueryBases.length() - current_read.right_sc)
    return new_alignment;

  // --- Get small sequence context for very local realignment ------------------------
  // Keep looking while fewer than min_bases reference bases lie between the variant and the cut.
  int min_bases = 5;

  // Looking at alignment to the left of variant position to find right place to cut sequence
  int read_left = read_idx;
  int ref_left  = ref_idx;
  unsigned int pretty_left = pretty_idx;
  bool continue_looking = pretty_idx > 0;

  while (continue_looking) {
	DecrementAlignmentIndices(current_read.pretty_aln[pretty_left], ref_left, read_left);

	// Stopping criterion
	// NOTE(review): the branch structure below is incomplete as listed (missing `}` / `else`).
	if (pretty_left < 1) {
      continue_looking = false;
	if (ref_idx - ref_left < min_bases)
      continue_looking = true;
	else {
	  // make sure to start with a matching base and don't split large HPs
	  if (current_read.pretty_aln[pretty_left] != '|'
          or (ref_reader.base(chr_idx,ref_left+1) == ref_reader.base(chr_idx,ref_left)))
	    continue_looking = true;
	    continue_looking = false;
  if (DEBUG > 1)
    cout << "Computed left realignment window as (red, ref, pretty) " << read_left << " " << ref_left << " " << pretty_left << endl;

  // Looking at alignment to the right to find right place to cut sequence
  int read_right = read_idx;
  int ref_right  = ref_idx;
  unsigned int pretty_right = pretty_idx;
  continue_looking = pretty_idx < current_read.pretty_aln.length()-1;

  while (continue_looking) {
  	IncrementAlignmentIndices(current_read.pretty_aln[pretty_right], ref_right, read_right);
  	// Stopping criterion (half open interval)
  	// NOTE(review): the branch structure below is incomplete as listed (missing `}` / `else`).
  	if (pretty_right >= current_read.pretty_aln.length()
        or ref_right >= ref_reader.chr_size(chr_idx)) {
      continue_looking = false;
  	if (ref_right - ref_idx < min_bases)
        continue_looking = true;
  	else {
  	  // make sure to stop with a matching base and don't split large HPs
  	  if (current_read.pretty_aln[pretty_right-1] != '|'
          or (ref_reader.base(chr_idx,ref_right-1) == ref_reader.base(chr_idx,ref_right)))
  	    continue_looking = true;
  	    continue_looking = false;
  if (DEBUG > 1)
    cout << "Computed right realignment window as (red, ref, pretty) " << read_right << " " << ref_right << " " << pretty_right << endl;
  // Put in some sanity checks for alignment boundaries found...

  // --- Realign -------------------------
  // Output slots filled by computeSWalignment(); they are not used further in the visible code.
  unsigned int start_position_shift;
  vector<CigarOp>    new_cigar_data;
  vector<MDelement>  new_md_data;

  // printouts
  if (DEBUG > 1) {
    thread_objects.realigner.verbose_ = true;
    cout << "Realigned " << current_read.alignment.Name << " from " << endl;
  // Nothing to realign if neither the read span nor the reference span is positive.
  if (read_left >= read_right and ref_left >= ref_right) {
    if (DEBUG > 1)
      cout << "ERROR: realignment window has zero size! " << endl;
    return new_alignment;

  // Hand the read substring, the reference substring and the current alignment to the realigner.
  string old_alignment = current_read.pretty_aln.substr(pretty_left, pretty_right-pretty_left);
  thread_objects.realigner.SetSequences(current_read.alignment.QueryBases.substr(read_left, read_right-read_left),
                         ref_reader.substr(chr_idx, ref_left, ref_right-ref_left), old_alignment, true);

  if (!thread_objects.realigner.computeSWalignment(new_cigar_data, new_md_data, start_position_shift)) {
    if (DEBUG > 1)
      cout << "ERROR: realignment failed! " << endl;
    return new_alignment;

  // --- Fuse realigned partial sequence back into pretty_aln string
  new_alignment = current_read.pretty_aln;
  if (old_alignment == thread_objects.realigner.pretty_aln()) {
    changed_alignment = false;
  else {
    new_alignment.replace(pretty_left, (pretty_right-pretty_left), thread_objects.realigner.pretty_aln());
    changed_alignment = true;
  return new_alignment;
Пример #5
// Prepares all alternative alleles of the current variant candidate: detects the local sequence
// context, classifies every allele and computes its window, merges the windows into one
// multi-allele window and decides whether realignment is switched on.
//
// Parameters:
//   parameters     - my_controls.filter_variant supplies indel_as_hpindel and the
//                    do_snp_realignment / do_mnp_realignment switches
//   global_context - supplies DEBUG and ErrorMotifs
//   ref_reader     - reference accessor passed on to the per-allele calls
//   chr_idx        - chromosome index passed on to the per-allele calls
// Side effects: updates seq_context, every entry of allele_identity_vector,
// multiallele_window_start, multiallele_window_end and doRealignment.
// NOTE(review): the listing looks truncated - the opening brace of the function and the closing
// braces of most blocks are not visible; confirm against the full source.
void EnsembleEval::SetupAllAlleles(const ExtendParameters &parameters,
                                                 const InputStructures  &global_context,
                                                 const ReferenceReader &ref_reader,
                                                 int chr_idx)
  seq_context.DetectContext(*variant, global_context.DEBUG, ref_reader, chr_idx);

  // NOTE(review): the loops below index variant->alt with a bound taken from
  // allele_identity_vector.size(); this assumes both have the same length - confirm where
  // allele_identity_vector is sized. The uint8_t index also assumes fewer than 256 alleles.
  if (global_context.DEBUG > 0 and variant->alt.size()>0) {
    cout << "Investigating variant candidate " << seq_context.reference_allele
         << " -> " << variant->alt[0];
    for (uint8_t i_allele = 1; i_allele < allele_identity_vector.size(); i_allele++)
      cout << ',' << variant->alt[i_allele];
    cout << endl;

  //now calculate the allele type (SNP/Indel/MNV/HPIndel etc.) and window for hypothesis calculation for each alt allele.
  for (uint8_t i_allele = 0; i_allele < allele_identity_vector.size(); i_allele++) {

    // TODO: Hotspot should be an allele property but we only set all or none to Hotspots, depending on the vcf record
    allele_identity_vector[i_allele].status.isHotSpot = variant->isHotSpot;
    allele_identity_vector[i_allele].DEBUG = global_context.DEBUG;

    allele_identity_vector[i_allele].indelActAsHPIndel = parameters.my_controls.filter_variant.indel_as_hpindel;

    allele_identity_vector[i_allele].getVariantType(variant->alt[i_allele], seq_context,
        global_context.ErrorMotifs,  parameters.my_controls.filter_variant, ref_reader, chr_idx);
    allele_identity_vector[i_allele].CalculateWindowForVariant(seq_context, global_context.DEBUG, ref_reader, chr_idx);

  // -1 marks the merged window as "not set yet".
  multiallele_window_start = -1;
  multiallele_window_end   = -1;

  // Mark Ensemble for realignment if any of the possible variants should be realigned
  // TODO: Should we exclude already filtered alleles?
  for (uint8_t i_allele = 0; i_allele < allele_identity_vector.size(); i_allele++) {
    //if (!allele_identity_vector[i_allele].status.isNoCallVariant) {
    // Merged window = smallest start and largest end over all alleles.
    if (allele_identity_vector[i_allele].start_window < multiallele_window_start or multiallele_window_start == -1)
      multiallele_window_start = allele_identity_vector[i_allele].start_window;
    if (allele_identity_vector[i_allele].end_window > multiallele_window_end or multiallele_window_end == -1)
      multiallele_window_end = allele_identity_vector[i_allele].end_window;

    // Realignment is only considered for alleles acting as SNP or MNP, and only if the matching switch is on.
    if (allele_identity_vector[i_allele].ActAsSNP() && parameters.my_controls.filter_variant.do_snp_realignment) {
      doRealignment = doRealignment or allele_identity_vector[i_allele].status.doRealignment;
    if (allele_identity_vector[i_allele].ActAsMNP() && parameters.my_controls.filter_variant.do_mnp_realignment) {
      doRealignment = doRealignment or allele_identity_vector[i_allele].status.doRealignment;
  // Hack: pass allele windows back down the object
  for (uint8_t i_allele = 0; i_allele < allele_identity_vector.size(); i_allele++) {
    allele_identity_vector[i_allele].start_window = multiallele_window_start;
    allele_identity_vector[i_allele].end_window = multiallele_window_end;

  // Debug output: realignment decision and the reference bases covered by the merged window.
  if (global_context.DEBUG > 0) {
	cout << "Realignment for this candidate is turned " << (doRealignment ? "on" : "off") << endl;
    cout << "Final window for multi-allele: " << ": (" << multiallele_window_start << ") ";
    for (int p_idx = multiallele_window_start; p_idx < multiallele_window_end; p_idx++)
      cout << ref_reader.base(chr_idx,p_idx);
    cout << " (" << multiallele_window_end << ") " << endl;
Пример #6
// Computes the reference window [start_window, end_window) used for this allele.
// Order of the cases visible below: invalid candidate (zero-length window at position0), indel
// inside a multi-nucleotide repeat (window set by IdentifyMultiNucRepeatSection), other indels
// (window derived from the homopolymer context), and finally SNPs/MNVs (span of the reference allele).
//
// Parameters:
//   seq_context - local reference context of the candidate
//   DEBUG       - verbosity; values > 0 print the window and its reference bases
//   ref_reader  - reference accessor, queried through chr_size() and base()
//   chr_idx     - chromosome index handed to ref_reader
// Side effects: assigns start_window and end_window.
// NOTE(review): the listing looks truncated - closing braces, `else` lines and probably an early
// return after the zero-length window are not visible; confirm against the full source.
void AlleleIdentity::CalculateWindowForVariant(const LocalReferenceContext &seq_context, int DEBUG,
    const ReferenceReader &ref_reader, int chr_idx) {

  // If we have an invalid vcf candidate, set a length zero window and exit
  if (!seq_context.context_detected or status.isProblematicAllele) {
    start_window = seq_context.position0;
    end_window = seq_context.position0;

  // Check for MNRs first, for InDelLengths 2,3,4,5
  // NOTE(review): the condition admits inDelLength values below 5, while the comment above lists
  // 2..5; the tested repeat periods are 2..5. Confirm which bound is intended.
  if (status.isIndel and !status.isHPIndel and inDelLength < 5)
    for (int rep_period = 2; rep_period < 6; rep_period++)
      if (IdentifyMultiNucRepeatSection(seq_context, rep_period, ref_reader, chr_idx)) {
        if (DEBUG > 0) {
          cout << "MNR found in allele " << seq_context.reference_allele << " -> " << altAllele << endl;
          cout << "Window for allele " << altAllele << ": (" << start_window << ") ";
          for (int p_idx = start_window; p_idx < end_window; p_idx++)
            cout << ref_reader.base(chr_idx,p_idx);
          cout << " (" << end_window << ") " << endl;
        return; // Found a matching period and computed window

  // not an MNR. Moving on along to InDels.
  if (status.isIndel) {
	// Default variant window
    end_window = seq_context.right_hp_start +1; // Anchor base to the right of allele
    start_window = seq_context.position0;

    // Adjustments if necessary
    // Deletion: move the start one base left of the first homopolymer when the base at
    // left_anchor has the same homopolymer start as the first base.
    if (status.isDeletion)
      if (seq_context.my_hp_start_pos[left_anchor] == seq_context.my_hp_start_pos[0])
        start_window = seq_context.my_hp_start_pos[0] - 1;

    if (status.isInsertion) {
      if (left_anchor == 0) {
        start_window = seq_context.my_hp_start_pos[0] - 1;
      else if (altAllele[left_anchor] == altAllele[left_anchor - 1] and
          seq_context.position0 > (seq_context.my_hp_start_pos[left_anchor - 1] - 1)) {
        start_window = seq_context.my_hp_start_pos[left_anchor - 1] - 1;
      // Extend the end over the right homopolymer when the alt allele ends in that same base.
      if (altAllele[altAllele.length() - 1] == seq_context.ref_right_hp_base) {
        end_window += seq_context.right_hp_length;

    // Safety
    if (start_window < 0)
      start_window = 0;
    if (end_window > ref_reader.chr_size(chr_idx))
      end_window = ref_reader.chr_size(chr_idx);
  else {
    // SNPs and MNVs are 1->1 base replacements
    start_window = seq_context.position0;
    end_window = seq_context.position0 + seq_context.reference_allele.length();
  } // */

  // Debug output: window coordinates and the reference bases they cover.
  if (DEBUG > 0) {
    cout << "Window for allele " << altAllele << ": (" << start_window << ") ";
    for (int p_idx = start_window; p_idx < end_window; p_idx++)
      cout << ref_reader.base(chr_idx,p_idx);
    cout << " (" << end_window << ") " << endl;