// Checks the reference area around variantPos for a multi-nucleotide repeat and it's span // Logic: When shifting a window of the same period as the MNR, the base entering the window has to be equal to the base leaving the window. // example with period 2: XYZACACA|CA|CACAIJK bool AlleleIdentity::IdentifyMultiNucRepeatSection(const LocalReferenceContext &seq_context, unsigned int rep_period, const ReferenceReader &ref_reader, int chr_idx) { //cout << "Hello from IdentifyMultiNucRepeatSection with period " << rep_period << "!"<< endl; unsigned int variantPos = seq_context.position0 + left_anchor; if (variantPos + rep_period >= (unsigned long)ref_reader.chr_size(chr_idx)) return (false); CircluarBuffer<char> window(rep_period); for (unsigned int idx = 0; idx < rep_period; idx++) window.assign(idx, ref_reader.base(chr_idx,variantPos+idx)); // Investigate (inclusive) start position of MNR region start_window = variantPos - 1; // 1 anchor base window.shiftLeft(1); while (start_window > 0 and window.first() == ref_reader.base(chr_idx,start_window)) { start_window--; window.shiftLeft(1); } // Investigate (exclusive) end position of MNR region end_window = variantPos + rep_period; if (end_window >= ref_reader.chr_size(chr_idx)) return false; for (unsigned int idx = 0; idx < rep_period; idx++) window.assign(idx, ref_reader.base(chr_idx,variantPos+idx)); window.shiftRight(1); while (end_window < ref_reader.chr_size(chr_idx) and window.last() == ref_reader.base(chr_idx,end_window)) { end_window++; window.shiftRight(1); } //cout << "Found repeat stretch of length: " << (end_window - start_window) << endl; // Require that a stretch of at least 3*rep_period has to be found to count as a MNR if ((end_window - start_window) >= (3*(int)rep_period)) { // Correct start and end of the window if they are not fully outside variant allele if (start_window >= seq_context.position0) start_window = seq_context.my_hp_start_pos[0] - 1; if (end_window <= seq_context.right_hp_start) { if 
(status.isInsertion) end_window = seq_context.right_hp_start + seq_context.right_hp_length + 1; else end_window = seq_context.right_hp_start + 1; } if (start_window < 0) start_window = 0; if (end_window > ref_reader.chr_size(chr_idx)) end_window = ref_reader.chr_size(chr_idx); return (true); } else return (false); }
// Identify some special motives bool AlleleIdentity::IdentifyDyslexicMotive(char base, int position, const ReferenceReader &ref_reader, int chr_idx) { status.isDyslexic = false; long test_position = position-2; unsigned int max_hp_distance = 4; unsigned int hp_distance = 0; unsigned int my_hp_length = 0; // Test left vicinity of insertion while (!status.isDyslexic and test_position>0 and hp_distance < max_hp_distance) { if (ref_reader.base(chr_idx,test_position) != ref_reader.base(chr_idx,test_position-1)) { hp_distance++; my_hp_length = 0; } else if (ref_reader.base(chr_idx,test_position) == base) { my_hp_length++; if(my_hp_length >= 2) { // trigger when a 3mer or more is found status.isDyslexic = true; } } test_position--; } if (status.isDyslexic) return (true); // test right vicinity of insertion hp_distance = 0; my_hp_length = 0; test_position = position+1; while (!status.isDyslexic and test_position<ref_reader.chr_size(chr_idx) and hp_distance < max_hp_distance) { if (ref_reader.base(chr_idx,test_position) != ref_reader.base(chr_idx,test_position-1)) { hp_distance++; my_hp_length = 0; } else if (ref_reader.base(chr_idx,test_position) == base) { my_hp_length++; if(my_hp_length >= 2) { // trigger when a 3mer or more is found status.isDyslexic = true; } } test_position++; } return status.isDyslexic; }
bool SpliceVariantHypotheses(const Alignment ¤t_read, const EnsembleEval &my_ensemble, const LocalReferenceContext &local_context, PersistingThreadObjects &thread_objects, int &splice_start_flow, int &splice_end_flow, vector<string> &my_hypotheses, vector<bool> & same_as_null_hypothesis, bool & changed_alignment, const InputStructures &global_context, const ReferenceReader &ref_reader, int chr_idx) { // Hypotheses: 1) Null; read as called 2) Reference Hypothesis 3-?) Variant Hypotheses my_hypotheses.resize(my_ensemble.allele_identity_vector.size()+2); same_as_null_hypothesis.assign(my_hypotheses.size(), false); // Set up variables to log the flows we splice into splice_start_flow = -1; splice_end_flow = -1; int splice_start_idx = -1; vector<int> splice_end_idx; splice_end_idx.assign(my_hypotheses.size(), -1); // 1) Null hypothesis is read as called if (global_context.resolve_clipped_bases) { unsigned int null_hyp_length = current_read.read_bases.length() - current_read.left_sc - current_read.right_sc; my_hypotheses[0] = current_read.read_bases.substr(current_read.start_sc, null_hyp_length); } else my_hypotheses[0] = current_read.read_bases; // Initialize hypotheses variables for splicing for (unsigned int i_hyp = 1; i_hyp < my_hypotheses.size(); i_hyp++) { my_hypotheses[i_hyp].clear(); my_hypotheses[i_hyp].reserve(current_read.alignment.QueryBases.length() + 20 + local_context.reference_allele.length()); // Add soft clipped bases on the left side of alignment if desired if (!global_context.resolve_clipped_bases) my_hypotheses[i_hyp] += current_read.alignment.QueryBases.substr(0, current_read.left_sc); } int read_idx = current_read.left_sc; int ref_idx = current_read.alignment.Position; int read_idx_max = current_read.alignment.QueryBases.length() - current_read.right_sc; bool did_splicing = false; bool just_did_splicing = false; string pretty_alignment; changed_alignment = false; // do realignment of a small region around variant if desired if 
(my_ensemble.doRealignment) { pretty_alignment = SpliceDoRealignement(thread_objects, current_read, local_context.position0, changed_alignment, global_context.DEBUG, ref_reader, chr_idx); if (pretty_alignment.empty() and global_context.DEBUG > 0) cerr << "Realignment returned an empty string in read " << current_read.alignment.Name << endl; } if (pretty_alignment.empty()) { pretty_alignment = current_read.pretty_aln; changed_alignment = false; } // Now fill in 2) and 3) for (unsigned int pretty_idx = 0; pretty_idx < pretty_alignment.length(); pretty_idx++) { bool outside_of_window = ref_idx < my_ensemble.multiallele_window_start or ref_idx >= my_ensemble.multiallele_window_end; bool outside_ref_allele = (long)ref_idx < local_context.position0 or ref_idx >= (int)(local_context.position0 + local_context.reference_allele.length()); // Basic sanity checks if (read_idx >= read_idx_max or ref_idx > ref_reader.chr_size(chr_idx) or (ref_idx == ref_reader.chr_size(chr_idx) and pretty_alignment[pretty_idx] != '+')) { did_splicing = false; break; } // --- Splice --- if (ref_idx == local_context.position0 and !did_splicing and !outside_of_window) { // Add insertions before variant window while (pretty_idx < pretty_alignment.length() and pretty_alignment[pretty_idx] == '+') { for (unsigned int i_hyp = 1; i_hyp < my_hypotheses.size(); i_hyp++) my_hypotheses[i_hyp].push_back(current_read.alignment.QueryBases[read_idx]); read_idx++; pretty_idx++; } did_splicing = SpliceAddVariantAlleles(current_read, pretty_alignment, my_ensemble, local_context, my_hypotheses, pretty_idx, global_context.DEBUG); just_did_splicing = did_splicing; } // --- --- // Have reference bases inside of window but outside of span of reference allele if (outside_ref_allele and !outside_of_window and pretty_alignment[pretty_idx] != '+') { for (unsigned int i_hyp = 1; i_hyp < my_hypotheses.size(); i_hyp++) my_hypotheses[i_hyp].push_back(ref_reader.base(chr_idx,ref_idx)); } // Have read bases as called outside of 
variant window if (outside_of_window and pretty_alignment[pretty_idx] != '-') { for (unsigned int i_hyp = 1; i_hyp < my_hypotheses.size(); i_hyp++) my_hypotheses[i_hyp].push_back(current_read.alignment.QueryBases[read_idx]); // --- Information to log flows. Indices are w.r.t. aligned portion of the read if (!did_splicing) { // Log index of the last base left of window which is the same for all hypotheses. splice_start_idx = read_idx - current_read.left_sc; } else if (just_did_splicing) { // Log length of hypothesis after splicing splice_end_idx[0] = read_idx - current_read.left_sc; int clipped_bases = 0; if (!global_context.resolve_clipped_bases) clipped_bases = current_read.left_sc; for (unsigned int i_hyp=1; i_hyp<my_hypotheses.size(); i_hyp++) splice_end_idx[i_hyp] = my_hypotheses[i_hyp].length()-1 - clipped_bases; // Hyp length depends on whether there is resolving! just_did_splicing = false; } // --- --- } IncrementAlignmentIndices(pretty_alignment[pretty_idx], ref_idx, read_idx); } // end of for loop over extended pretty alignment // Check whether the whole reference allele fit // It seems that with primer trimming ion TVC, many a read throw this warning if (ref_idx < (int)(local_context.position0 + local_context.reference_allele.length())) { did_splicing = false; if (global_context.DEBUG>0) cout << "Warning in Splicing: Reference allele "<< local_context.reference_allele << " did not fit into read " << current_read.alignment.Name << endl; } if (did_splicing) { // --- Add soft clipped bases to the right of the alignment and reverse complement --- for (unsigned int i_hyp = 1; i_hyp<my_hypotheses.size(); i_hyp++) { if (!global_context.resolve_clipped_bases) my_hypotheses[i_hyp] += current_read.alignment.QueryBases.substr(current_read.alignment.QueryBases.length()-current_read.right_sc, current_read.right_sc); if (current_read.is_reverse_strand) RevComplementInPlace(my_hypotheses[i_hyp]); } // Get the main flows before and after splicing splice_end_flow = 
GetSpliceFlows(current_read, global_context, my_hypotheses, same_as_null_hypothesis, splice_start_idx, splice_end_idx, splice_start_flow); if (splice_start_flow < 0 or splice_end_flow <= splice_start_flow) { did_splicing = false; cout << "Warning in Splicing: Splice flows are not valid in read " << current_read.alignment.Name << ". splice start flow: "<< splice_start_flow << " splice end flow " << splice_end_flow << endl; } } // Check for non-ACGT bases in hypotheses bool valid_bases = true; for (unsigned int i_hyp=0; i_hyp<my_hypotheses.size(); i_hyp++) { unsigned int iBase = 0; while (iBase<my_hypotheses[i_hyp].length() and valid_bases){ if (my_hypotheses[i_hyp].at(iBase) == 'A' or my_hypotheses[i_hyp].at(iBase) == 'C' or my_hypotheses[i_hyp].at(iBase) == 'G' or my_hypotheses[i_hyp].at(iBase) == 'T') iBase++; else valid_bases = false; } } if (not valid_bases){ cerr << "Non-Fatal ERROR in Splicing for " << local_context.contigName << ":" << local_context.position0+1 << ": Read Hypotheses for " << current_read.alignment.Name << " contain non-ACGT characters." 
<< endl; did_splicing = false; } // --- Fail safe for hypotheses and verbose if (!did_splicing) { for (unsigned int i_hyp=1; i_hyp<my_hypotheses.size(); i_hyp++) my_hypotheses[i_hyp] = my_hypotheses[0]; if (global_context.DEBUG > 1) { cout << "Failed to splice " << local_context.reference_allele << "->"; for (unsigned int i_alt = 0; i_alt<my_ensemble.allele_identity_vector.size(); i_alt++) { cout << my_ensemble.allele_identity_vector[i_alt].altAllele; if (i_alt < my_ensemble.allele_identity_vector.size()-1) cout << ","; } cout << " into read " << current_read.alignment.Name << endl; } } else if (global_context.DEBUG > 1) { cout << "Spliced " << local_context.reference_allele << "->"; for (unsigned int i_alt = 0; i_alt<my_ensemble.allele_identity_vector.size(); i_alt++) { cout << my_ensemble.allele_identity_vector[i_alt].altAllele; if (i_alt < my_ensemble.allele_identity_vector.size()-1) cout << ","; } cout << " into "; if (current_read.is_reverse_strand) cout << "reverse "; else cout << "forward "; cout << "strand read read " << current_read.alignment.Name << endl; cout << "- Read as called: " << my_hypotheses[0] << endl; cout << "- Reference Hyp.: " << my_hypotheses[1] << endl; for (unsigned int i_hyp = 2; i_hyp<my_hypotheses.size(); i_hyp++) cout << "- Variant Hyp. " << (i_hyp-1) << ": " << my_hypotheses[i_hyp] << endl; cout << "- Splice start flow: " << splice_start_flow << " Splice end flow: " << splice_end_flow << endl; } return did_splicing; };
// Realigns a small stretch of the read around the variant position and fuses the result back
// into the read's pretty alignment string.
//
// thread_objects    - provides the realigner that is configured and run here
// current_read      - read whose pretty_aln is locally realigned
// variant_position  - reference position of the variant
// changed_alignment - set to true only if the realigned stretch differs from the original one;
//                     left untouched on the early-exit paths
// DEBUG             - verbosity; values > 1 print the computed windows and failures
// ref_reader        - reference access (base lookup, substring, chromosome size)
// chr_idx           - index of the chromosome in ref_reader
//
// Returns the full pretty alignment string with the realigned stretch spliced in, or an empty
// string if the variant position is not covered by the read, the window is empty, or the
// realignment fails.
string SpliceDoRealignement (PersistingThreadObjects &thread_objects, const Alignment &current_read, long variant_position,
                             bool &changed_alignment, int DEBUG, const ReferenceReader &ref_reader, int chr_idx)
{
  // We do not allow any clipping since we align a short substring
  thread_objects.realigner.SetClipping(0, true);
  string new_alignment;

  // --- Get index positions at snp variant position
  int read_idx = current_read.left_sc;
  int ref_idx = current_read.alignment.Position;
  unsigned int pretty_idx = 0;

  while (pretty_idx < current_read.pretty_aln.length() and ref_idx < variant_position) {
    IncrementAlignmentIndices(current_read.pretty_aln[pretty_idx], ref_idx, read_idx);
    pretty_idx++;
  }
  if (DEBUG > 1)
    cout << "Computed variant position as (red, ref, pretty) " << read_idx << " " << ref_idx << " " << pretty_idx << endl;

  // Variant position lies outside of the aligned part of the read or beyond the chromosome
  if (pretty_idx >= current_read.pretty_aln.length()
      or ref_idx >= ref_reader.chr_size(chr_idx)
      or read_idx >= (int)current_read.alignment.QueryBases.length() - current_read.right_sc)
    return new_alignment;

  // --- Get small sequence context for very local realignment ------------------------
  int min_bases = 5;

  // Looking at alignment to the left of variant position to find right place to cut sequence
  int read_left = read_idx;
  int ref_left = ref_idx;
  unsigned int pretty_left = pretty_idx;
  bool continue_looking = pretty_idx > 0;

  while (continue_looking) {
    pretty_left--;
    DecrementAlignmentIndices(current_read.pretty_aln[pretty_left], ref_left, read_left);

    // Stopping criterion
    if (pretty_left < 1) {
      continue_looking = false;
      break;
    }

    if (ref_idx - ref_left < min_bases)
      continue_looking = true;
    else {
      // make sure to start with a matching base and don't split large HPs
      continue_looking = (current_read.pretty_aln[pretty_left] != '|'
                          or (ref_reader.base(chr_idx,ref_left+1) == ref_reader.base(chr_idx,ref_left)));
    }
  }
  if (DEBUG > 1)
    cout << "Computed left realignment window as (red, ref, pretty) " << read_left << " " << ref_left << " " << pretty_left << endl;

  // Looking at alignment to the right to find right place to cut sequence
  int read_right = read_idx;
  int ref_right = ref_idx;
  unsigned int pretty_right = pretty_idx;
  continue_looking = pretty_idx < current_read.pretty_aln.length()-1;

  while (continue_looking) {
    IncrementAlignmentIndices(current_read.pretty_aln[pretty_right], ref_right, read_right);
    pretty_right++;

    // Stopping criterion (half open interval)
    if (pretty_right >= current_read.pretty_aln.length()
        or ref_right >= ref_reader.chr_size(chr_idx)) {
      continue_looking = false;
      break;
    }

    if (ref_right - ref_idx < min_bases)
      continue_looking = true;
    else {
      // make sure to stop with a matching base and don't split large HPs
      continue_looking = (current_read.pretty_aln[pretty_right-1] != '|'
                          or (ref_reader.base(chr_idx,ref_right-1) == ref_reader.base(chr_idx,ref_right)));
    }
  }
  if (DEBUG > 1)
    cout << "Computed right realignment window as (red, ref, pretty) " << read_right << " " << ref_right << " " << pretty_right << endl;
  // Put in some sanity checks for alignment boundaries found...

  // --- Realign -------------------------
  unsigned int start_position_shift = 0;
  vector<CigarOp> new_cigar_data;
  vector<MDelement> new_md_data;

  // printouts
  if (DEBUG > 1) {
    thread_objects.realigner.verbose_ = true;
    cout << "Realigned " << current_read.alignment.Name << " from " << endl;
  }
  if (read_left >= read_right and ref_left >= ref_right) {
    if (DEBUG > 1)
      cout << "ERROR: realignment window has zero size! " << endl;
    return new_alignment;
  }

  string old_alignment = current_read.pretty_aln.substr(pretty_left, pretty_right-pretty_left);
  thread_objects.realigner.SetSequences(current_read.alignment.QueryBases.substr(read_left, read_right-read_left),
                                        ref_reader.substr(chr_idx, ref_left, ref_right-ref_left), old_alignment, true);

  if (!thread_objects.realigner.computeSWalignment(new_cigar_data, new_md_data, start_position_shift)) {
    if (DEBUG > 1)
      cout << "ERROR: realignment failed! " << endl;
    return new_alignment;
  }

  // --- Fuse realigned partial sequence back into pretty_aln string
  // (assignment to the local declared at the top; it must not be declared a second time)
  new_alignment = current_read.pretty_aln;
  if (old_alignment == thread_objects.realigner.pretty_aln()) {
    changed_alignment = false;
  }
  else {
    new_alignment.replace(pretty_left, (pretty_right-pretty_left), thread_objects.realigner.pretty_aln());
    changed_alignment = true;
  }
  return new_alignment;
}
void EnsembleEval::SetupAllAlleles(const ExtendParameters ¶meters, const InputStructures &global_context, const ReferenceReader &ref_reader, int chr_idx) { seq_context.DetectContext(*variant, global_context.DEBUG, ref_reader, chr_idx); allele_identity_vector.resize(variant->alt.size()); if (global_context.DEBUG > 0 and variant->alt.size()>0) { cout << "Investigating variant candidate " << seq_context.reference_allele << " -> " << variant->alt[0]; for (uint8_t i_allele = 1; i_allele < allele_identity_vector.size(); i_allele++) cout << ',' << variant->alt[i_allele]; cout << endl; } //now calculate the allele type (SNP/Indel/MNV/HPIndel etc.) and window for hypothesis calculation for each alt allele. for (uint8_t i_allele = 0; i_allele < allele_identity_vector.size(); i_allele++) { // TODO: Hotspot should be an allele property but we only set all or none to Hotspots, depending on the vcf record allele_identity_vector[i_allele].status.isHotSpot = variant->isHotSpot; allele_identity_vector[i_allele].filterReasons.clear(); allele_identity_vector[i_allele].DEBUG = global_context.DEBUG; allele_identity_vector[i_allele].indelActAsHPIndel = parameters.my_controls.filter_variant.indel_as_hpindel; allele_identity_vector[i_allele].getVariantType(variant->alt[i_allele], seq_context, global_context.ErrorMotifs, parameters.my_controls.filter_variant, ref_reader, chr_idx); allele_identity_vector[i_allele].CalculateWindowForVariant(seq_context, global_context.DEBUG, ref_reader, chr_idx); } //GetMultiAlleleVariantWindow(); multiallele_window_start = -1; multiallele_window_end = -1; // Mark Ensemble for realignment if any of the possible variants should be realigned // TODO: Should we exclude already filtered alleles? 
for (uint8_t i_allele = 0; i_allele < allele_identity_vector.size(); i_allele++) { //if (!allele_identity_vector[i_allele].status.isNoCallVariant) { if (allele_identity_vector[i_allele].start_window < multiallele_window_start or multiallele_window_start == -1) multiallele_window_start = allele_identity_vector[i_allele].start_window; if (allele_identity_vector[i_allele].end_window > multiallele_window_end or multiallele_window_end == -1) multiallele_window_end = allele_identity_vector[i_allele].end_window; if (allele_identity_vector[i_allele].ActAsSNP() && parameters.my_controls.filter_variant.do_snp_realignment) { doRealignment = doRealignment or allele_identity_vector[i_allele].status.doRealignment; } if (allele_identity_vector[i_allele].ActAsMNP() && parameters.my_controls.filter_variant.do_mnp_realignment) { doRealignment = doRealignment or allele_identity_vector[i_allele].status.doRealignment; } } // Hack: pass allele windows back down the object for (uint8_t i_allele = 0; i_allele < allele_identity_vector.size(); i_allele++) { allele_identity_vector[i_allele].start_window = multiallele_window_start; allele_identity_vector[i_allele].end_window = multiallele_window_end; } if (global_context.DEBUG > 0) { cout << "Realignment for this candidate is turned " << (doRealignment ? "on" : "off") << endl; cout << "Final window for multi-allele: " << ": (" << multiallele_window_start << ") "; for (int p_idx = multiallele_window_start; p_idx < multiallele_window_end; p_idx++) cout << ref_reader.base(chr_idx,p_idx); cout << " (" << multiallele_window_end << ") " << endl; } }
void AlleleIdentity::CalculateWindowForVariant(const LocalReferenceContext &seq_context, int DEBUG, const ReferenceReader &ref_reader, int chr_idx) { // If we have an invalid vcf candidate, set a length zero window and exit if (!seq_context.context_detected or status.isProblematicAllele) { start_window = seq_context.position0; end_window = seq_context.position0; return; } // Check for MNRs first, for InDelLengths 2,3,4,5 if (status.isIndel and !status.isHPIndel and inDelLength < 5) for (int rep_period = 2; rep_period < 6; rep_period++) if (IdentifyMultiNucRepeatSection(seq_context, rep_period, ref_reader, chr_idx)) { if (DEBUG > 0) { cout << "MNR found in allele " << seq_context.reference_allele << " -> " << altAllele << endl; cout << "Window for allele " << altAllele << ": (" << start_window << ") "; for (int p_idx = start_window; p_idx < end_window; p_idx++) cout << ref_reader.base(chr_idx,p_idx); cout << " (" << end_window << ") " << endl; } return; // Found a matching period and computed window } // not an MNR. Moving on along to InDels. 
if (status.isIndel) { // Default variant window end_window = seq_context.right_hp_start +1; // Anchor base to the right of allele start_window = seq_context.position0; // Adjustments if necessary if (status.isDeletion) if (seq_context.my_hp_start_pos[left_anchor] == seq_context.my_hp_start_pos[0]) start_window = seq_context.my_hp_start_pos[0] - 1; if (status.isInsertion) { if (left_anchor == 0) { start_window = seq_context.my_hp_start_pos[0] - 1; } else if (altAllele[left_anchor] == altAllele[left_anchor - 1] and seq_context.position0 > (seq_context.my_hp_start_pos[left_anchor - 1] - 1)) { start_window = seq_context.my_hp_start_pos[left_anchor - 1] - 1; } if (altAllele[altAllele.length() - 1] == seq_context.ref_right_hp_base) { end_window += seq_context.right_hp_length; } } // Safety if (start_window < 0) start_window = 0; if (end_window > ref_reader.chr_size(chr_idx)) end_window = ref_reader.chr_size(chr_idx); } else { // SNPs and MNVs are 1->1 base replacements start_window = seq_context.position0; end_window = seq_context.position0 + seq_context.reference_allele.length(); } // */ if (DEBUG > 0) { cout << "Window for allele " << altAllele << ": (" << start_window << ") "; for (int p_idx = start_window; p_idx < end_window; p_idx++) cout << ref_reader.base(chr_idx,p_idx); cout << " (" << end_window << ") " << endl; } }