// Checks the reference area around variantPos for a multi-nucleotide repeat and it's span // Logic: When shifting a window of the same period as the MNR, the base entering the window has to be equal to the base leaving the window. // example with period 2: XYZACACA|CA|CACAIJK bool AlleleIdentity::IdentifyMultiNucRepeatSection(const LocalReferenceContext &seq_context, unsigned int rep_period, const ReferenceReader &ref_reader, int chr_idx) { //cout << "Hello from IdentifyMultiNucRepeatSection with period " << rep_period << "!"<< endl; unsigned int variantPos = seq_context.position0 + left_anchor; if (variantPos + rep_period >= (unsigned long)ref_reader.chr_size(chr_idx)) return (false); CircluarBuffer<char> window(rep_period); for (unsigned int idx = 0; idx < rep_period; idx++) window.assign(idx, ref_reader.base(chr_idx,variantPos+idx)); // Investigate (inclusive) start position of MNR region start_window = variantPos - 1; // 1 anchor base window.shiftLeft(1); while (start_window > 0 and window.first() == ref_reader.base(chr_idx,start_window)) { start_window--; window.shiftLeft(1); } // Investigate (exclusive) end position of MNR region end_window = variantPos + rep_period; if (end_window >= ref_reader.chr_size(chr_idx)) return false; for (unsigned int idx = 0; idx < rep_period; idx++) window.assign(idx, ref_reader.base(chr_idx,variantPos+idx)); window.shiftRight(1); while (end_window < ref_reader.chr_size(chr_idx) and window.last() == ref_reader.base(chr_idx,end_window)) { end_window++; window.shiftRight(1); } //cout << "Found repeat stretch of length: " << (end_window - start_window) << endl; // Require that a stretch of at least 3*rep_period has to be found to count as a MNR if ((end_window - start_window) >= (3*(int)rep_period)) { // Correct start and end of the window if they are not fully outside variant allele if (start_window >= seq_context.position0) start_window = seq_context.my_hp_start_pos[0] - 1; if (end_window <= seq_context.right_hp_start) { if 
(status.isInsertion) end_window = seq_context.right_hp_start + seq_context.right_hp_length + 1; else end_window = seq_context.right_hp_start + 1; } if (start_window < 0) start_window = 0; if (end_window > ref_reader.chr_size(chr_idx)) end_window = ref_reader.chr_size(chr_idx); return (true); } else return (false); }
// Entry point for variant classification bool AlleleIdentity::getVariantType( const string _altAllele, const LocalReferenceContext &reference_context, const TIonMotifSet & ErrorMotifs, const ClassifyFilters &filter_variant, const ReferenceReader &ref_reader, int chr_idx) { altAllele = _altAllele; bool is_ok = reference_context.context_detected; if ((reference_context.position0 + (long)altAllele.length()) > ref_reader.chr_size(chr_idx)) { is_ok = false; } // We should now be guaranteed a valid variant position in here if (is_ok) { is_ok = CharacterizeVariantStatus(reference_context, ref_reader, chr_idx); PredictSequenceMotifSSE(reference_context, ErrorMotifs, ref_reader, chr_idx); } is_ok = is_ok and CheckValidAltAllele(reference_context); if (!is_ok) { status.isProblematicAllele = true; filterReasons.push_back("BADCANDIDATE"); } return(is_ok); }
// Identify some special motives bool AlleleIdentity::IdentifyDyslexicMotive(char base, int position, const ReferenceReader &ref_reader, int chr_idx) { status.isDyslexic = false; long test_position = position-2; unsigned int max_hp_distance = 4; unsigned int hp_distance = 0; unsigned int my_hp_length = 0; // Test left vicinity of insertion while (!status.isDyslexic and test_position>0 and hp_distance < max_hp_distance) { if (ref_reader.base(chr_idx,test_position) != ref_reader.base(chr_idx,test_position-1)) { hp_distance++; my_hp_length = 0; } else if (ref_reader.base(chr_idx,test_position) == base) { my_hp_length++; if(my_hp_length >= 2) { // trigger when a 3mer or more is found status.isDyslexic = true; } } test_position--; } if (status.isDyslexic) return (true); // test right vicinity of insertion hp_distance = 0; my_hp_length = 0; test_position = position+1; while (!status.isDyslexic and test_position<ref_reader.chr_size(chr_idx) and hp_distance < max_hp_distance) { if (ref_reader.base(chr_idx,test_position) != ref_reader.base(chr_idx,test_position-1)) { hp_distance++; my_hp_length = 0; } else if (ref_reader.base(chr_idx,test_position) == base) { my_hp_length++; if(my_hp_length >= 2) { // trigger when a 3mer or more is found status.isDyslexic = true; } } test_position++; } return status.isDyslexic; }
void AlleleIdentity::PredictSequenceMotifSSE(const LocalReferenceContext &reference_context, const TIonMotifSet & ErrorMotifs, const ReferenceReader &ref_reader, int chr_idx) { //cout << "Hello from PredictSequenceMotifSSE" << endl; sse_prob_positive_strand = 0; sse_prob_negative_strand = 0; //long vcf_position = reference_context.position0+1; long var_position = reference_context.position0 + left_anchor; // This points to the first deleted base string seqContext; // status.isHPIndel && status.isDeletion implies reference_context.my_hp_length.at(left_anchor) > 1 if (status.isHPIndel && status.isDeletion) { // cout << start_pos << "\t" << variant_context.refBaseAtCandidatePosition << variant_context.ref_hp_length << "\t" << variant_context.refBaseLeft << variant_context.left_hp_length << "\t" << variant_context.refBaseRight << variant_context.right_hp_length << "\t"; unsigned context_left = var_position >= 10 ? 10 : var_position; //if (var_position + reference_context.my_hp_length.at(left_anchor) + 10 < ref_reader.chr_size(chr_idx)) seqContext = ref_reader.substr(chr_idx, var_position - context_left, context_left + (unsigned int)reference_context.my_hp_length[left_anchor] + 10); // else // seqContext = ref_reader.substr(chr_idx, var_position - context_left); if (seqContext.length() > 0 && context_left < seqContext.length()) { sse_prob_positive_strand = ErrorMotifs.get_sse_probability(seqContext, context_left); //cout << seqContext << "\t" << context_left << "\t" << sse_prob_positive_strand << "\t"; context_left = seqContext.length() - context_left - 1; string reverse_seqContext; ReverseComplement(seqContext, reverse_seqContext); sse_prob_negative_strand = ErrorMotifs.get_sse_probability(reverse_seqContext, context_left); // cout << reverse_seqContext << "\t" << context_left << "\t" << sse_prob_negative_strand << "\t"; } } }
// open BAM input file void BAMWalkerEngine::InitializeBAMs(const ReferenceReader& ref_reader, const vector<string>& bam_filenames) { if (not bam_reader_.SetExplicitMergeOrder(BamMultiReader::MergeByCoordinate)) { cerr << "ERROR: Could not set merge order to BamMultiReader::MergeByCoordinate" << endl; exit(1); } if (not bam_reader_.Open(bam_filenames)) { cerr << "ERROR: Could not open input BAM file(s) : " << bam_reader_.GetErrorString() << endl; exit(1); } if (not bam_reader_.LocateIndexes()) { cerr << "ERROR: Could not open BAM index file(s) : " << bam_reader_.GetErrorString() << endl; exit(1); } // BAM multi reader combines the read group information of the different BAMs but does not merge comment sections bam_header_ = bam_reader_.GetHeader(); if (!bam_header_.HasReadGroups()) { cerr << "ERROR: there is no read group in BAM files specified" << endl; exit(1); } // Manually merge comment sections of BAM files if we have more than one BAM file if (bam_filenames.size() > 1) { unsigned int num_duplicates = 0; unsigned int num_merged = 0; for (unsigned int bam_idx = 0; bam_idx < bam_filenames.size(); bam_idx++) { BamReader reader; if (not reader.Open(bam_filenames.at(bam_idx))) { cerr << "TVC ERROR: Failed to open input BAM file " << reader.GetErrorString() << endl; exit(1); } SamHeader header = reader.GetHeader(); for (unsigned int i_co = 0; i_co < header.Comments.size(); i_co++) { // Step 1: Check if this comment is already part of the merged header unsigned int m_co = 0; while (m_co < bam_header_.Comments.size() and bam_header_.Comments.at(m_co) != header.Comments.at(i_co)) m_co++; if (m_co < bam_header_.Comments.size()){ num_duplicates++; continue; } // Add comment line to merged header if it is a new one num_merged++; bam_header_.Comments.push_back(header.Comments.at(i_co)); } } // Verbose what we did cout << "Merged " << num_merged << " unique comment lines into combined BAM header. Encountered " << num_duplicates << " duplicate comments." 
<< endl; } // // Reference sequences in the bam file must match that in the fasta file // vector<RefData> referenceSequences = bam_reader_.GetReferenceData(); if ((int)referenceSequences.size() != ref_reader.chr_count()) { cerr << "ERROR: Reference in BAM file does not match fasta file" << endl << " BAM has " << referenceSequences.size() << " chromosomes while fasta has " << ref_reader.chr_count() << endl; exit(1); } for (int chr_idx = 0; chr_idx < ref_reader.chr_count(); ++chr_idx) { if (referenceSequences[chr_idx].RefName != ref_reader.chr_str(chr_idx)) { cerr << "ERROR: Reference in BAM file does not match fasta file" << endl << " Chromosome #" << (chr_idx+1) << "in BAM is " << referenceSequences[chr_idx].RefName << " while fasta has " << ref_reader.chr_str(chr_idx) << endl; exit(1); } if (referenceSequences[chr_idx].RefLength != ref_reader.chr_size(chr_idx)) { cerr << "ERROR: Reference in BAM file does not match fasta file" << endl << " Chromosome " << referenceSequences[chr_idx].RefName << "in BAM has length " << referenceSequences[chr_idx].RefLength << " while fasta has " << ref_reader.chr_size(chr_idx) << endl; exit(1); } } // // Retrieve BaseCaller and TMAP version strings from BAM header // set<string> basecaller_versions; set<string> tmap_versions; for (SamProgramIterator I = bam_header_.Programs.Begin(); I != bam_header_.Programs.End(); ++I) { if (I->ID.substr(0,2) == "bc") basecaller_versions.insert(I->Version); if (I->ID.substr(0,4) == "tmap") tmap_versions.insert(I->Version); } basecaller_version_ = ""; for (set<string>::const_iterator I = basecaller_versions.begin(); I != basecaller_versions.end(); ++I) { if (not basecaller_version_.empty()) basecaller_version_ += ", "; basecaller_version_ += *I; } tmap_version_ = ""; for (set<string>::const_iterator I = tmap_versions.begin(); I != tmap_versions.end(); ++I) { if (not tmap_version_.empty()) tmap_version_ += ", "; tmap_version_ += *I; } }
void TargetsManager::Initialize(const ReferenceReader& ref_reader, const string& _targets, bool _trim_ampliseq_primers /*const ExtendParameters& parameters*/) { // // Step 1. Retrieve raw target definitions // list<UnmergedTarget> raw_targets; if (not _targets.empty()) { LoadRawTargets(ref_reader, _targets, raw_targets); } else { for (int chr = 0; chr < ref_reader.chr_count(); ++chr) { raw_targets.push_back(UnmergedTarget()); UnmergedTarget& target = raw_targets.back(); target.begin = 0; target.end = ref_reader.chr_size(chr); target.chr = chr; } } // // Step 2. Sort raw targets and transfer to the vector // int num_unmerged = raw_targets.size(); vector<UnmergedTarget*> raw_sort; raw_sort.reserve(num_unmerged); for (list<UnmergedTarget>::iterator I = raw_targets.begin(); I != raw_targets.end(); ++I) raw_sort.push_back(&(*I)); sort(raw_sort.begin(), raw_sort.end(), CompareTargets); unmerged.reserve(num_unmerged); bool already_sorted = true; list<UnmergedTarget>::iterator I = raw_targets.begin(); for (int idx = 0; idx < num_unmerged; ++idx, ++I) { if (raw_sort[idx] != &(*I) and already_sorted) { already_sorted = false; cerr << "TargetsManager: BED not sorted at position " << idx; cerr << " replaced " << I->name << ":" << I->chr << ":" << I->begin << "-" << I->end; cerr << " with " << raw_sort[idx]->name << ":" << raw_sort[idx]->chr << ":" << raw_sort[idx]->begin << "-" << raw_sort[idx]->end << endl; } unmerged.push_back(*raw_sort[idx]); } // // Step 3. 
Merge targets and link merged/unmerged entries // merged.reserve(num_unmerged); bool already_merged = true; for (int idx = 0; idx < num_unmerged; ++idx) { if (idx and merged.back().chr == unmerged[idx].chr and merged.back().end >= unmerged[idx].begin) { merged.back().end = max(merged.back().end, unmerged[idx].end); already_merged = false; } else { merged.push_back(MergedTarget()); merged.back().chr = unmerged[idx].chr; merged.back().begin = unmerged[idx].begin; merged.back().end = unmerged[idx].end; merged.back().first_unmerged = idx; } unmerged[idx].merged = merged.size(); } if (_targets.empty()) { cout << "TargetsManager: No targets file specified, processing entire reference" << endl; } else { cout << "TargetsManager: Loaded targets file " << _targets << endl; cout << "TargetsManager: " << num_unmerged << " target(s)"; if (not already_merged) cout << " (" << merged.size() << " after merging)"; cout << endl; if (not already_sorted) cout << "TargetsManager: Targets required sorting" << endl; trim_ampliseq_primers = _trim_ampliseq_primers; if (trim_ampliseq_primers) cout << "TargetsManager: Trimming of AmpliSeq primers is enabled" << endl; } }
void TargetsManager::LoadRawTargets(const ReferenceReader& ref_reader, const string& bed_filename, list<UnmergedTarget>& raw_targets) { FILE *bed_file = fopen(bed_filename.c_str(), "r"); if (not bed_file) { cerr << "ERROR: Unable to open target file " << bed_filename << " : " << strerror(errno) << endl; exit(1); } char line[4096]; char chr_name[4096]; int begin; int end; char region_name[4096]; int line_number = 0; while (fgets(line, 4096, bed_file)) { ++line_number; if (strncmp(line,"track",5) == 0) { // Parse track line if needed continue; } int num_fields = sscanf(line, "%s\t%d\t%d\t%s", chr_name, &begin, &end, region_name); if (num_fields == 0) continue; if (num_fields < 3) { cerr << "ERROR: Failed to parse target file line " << line_number << endl; exit(1); } raw_targets.push_back(UnmergedTarget()); UnmergedTarget& target = raw_targets.back(); target.begin = begin; target.end = end; target.chr = ref_reader.chr_idx(chr_name); if (num_fields > 3 and strcmp(region_name,".") != 0) target.name = region_name; if (target.chr < 0) { cerr << "ERROR: Target region " << target.name << " (" << chr_name << ":" << begin << "-" << end << ")" << " has unrecognized chromosome name" << endl; exit(1); } if (begin < 0 || end > ref_reader.chr_size(target.chr)) { cerr << "ERROR: Target region " << target.name << " (" << chr_name << ":" << begin << "-" << end << ")" << " is outside of reference sequence bounds (" << chr_name << ":0-" << ref_reader.chr_size(target.chr) << ")" << endl; exit(1); } if (end < begin) { cerr << "ERROR: Target region " << target.name << " (" << chr_name << ":" << begin << "-" << end << ")" << " has inverted coordinates" << endl; exit(1); } AddExtraTrim(target, line, num_fields); } fclose(bed_file); if (raw_targets.empty()) { cerr << "ERROR: No targets loaded from " << bed_filename << " after parsing " << line_number << " lines" << endl; exit(1); } }
bool SpliceVariantHypotheses(const Alignment ¤t_read, const EnsembleEval &my_ensemble, const LocalReferenceContext &local_context, PersistingThreadObjects &thread_objects, int &splice_start_flow, int &splice_end_flow, vector<string> &my_hypotheses, vector<bool> & same_as_null_hypothesis, bool & changed_alignment, const InputStructures &global_context, const ReferenceReader &ref_reader, int chr_idx) { // Hypotheses: 1) Null; read as called 2) Reference Hypothesis 3-?) Variant Hypotheses my_hypotheses.resize(my_ensemble.allele_identity_vector.size()+2); same_as_null_hypothesis.assign(my_hypotheses.size(), false); // Set up variables to log the flows we splice into splice_start_flow = -1; splice_end_flow = -1; int splice_start_idx = -1; vector<int> splice_end_idx; splice_end_idx.assign(my_hypotheses.size(), -1); // 1) Null hypothesis is read as called if (global_context.resolve_clipped_bases) { unsigned int null_hyp_length = current_read.read_bases.length() - current_read.left_sc - current_read.right_sc; my_hypotheses[0] = current_read.read_bases.substr(current_read.start_sc, null_hyp_length); } else my_hypotheses[0] = current_read.read_bases; // Initialize hypotheses variables for splicing for (unsigned int i_hyp = 1; i_hyp < my_hypotheses.size(); i_hyp++) { my_hypotheses[i_hyp].clear(); my_hypotheses[i_hyp].reserve(current_read.alignment.QueryBases.length() + 20 + local_context.reference_allele.length()); // Add soft clipped bases on the left side of alignment if desired if (!global_context.resolve_clipped_bases) my_hypotheses[i_hyp] += current_read.alignment.QueryBases.substr(0, current_read.left_sc); } int read_idx = current_read.left_sc; int ref_idx = current_read.alignment.Position; int read_idx_max = current_read.alignment.QueryBases.length() - current_read.right_sc; bool did_splicing = false; bool just_did_splicing = false; string pretty_alignment; changed_alignment = false; // do realignment of a small region around variant if desired if 
(my_ensemble.doRealignment) { pretty_alignment = SpliceDoRealignement(thread_objects, current_read, local_context.position0, changed_alignment, global_context.DEBUG, ref_reader, chr_idx); if (pretty_alignment.empty() and global_context.DEBUG > 0) cerr << "Realignment returned an empty string in read " << current_read.alignment.Name << endl; } if (pretty_alignment.empty()) { pretty_alignment = current_read.pretty_aln; changed_alignment = false; } // Now fill in 2) and 3) for (unsigned int pretty_idx = 0; pretty_idx < pretty_alignment.length(); pretty_idx++) { bool outside_of_window = ref_idx < my_ensemble.multiallele_window_start or ref_idx >= my_ensemble.multiallele_window_end; bool outside_ref_allele = (long)ref_idx < local_context.position0 or ref_idx >= (int)(local_context.position0 + local_context.reference_allele.length()); // Basic sanity checks if (read_idx >= read_idx_max or ref_idx > ref_reader.chr_size(chr_idx) or (ref_idx == ref_reader.chr_size(chr_idx) and pretty_alignment[pretty_idx] != '+')) { did_splicing = false; break; } // --- Splice --- if (ref_idx == local_context.position0 and !did_splicing and !outside_of_window) { // Add insertions before variant window while (pretty_idx < pretty_alignment.length() and pretty_alignment[pretty_idx] == '+') { for (unsigned int i_hyp = 1; i_hyp < my_hypotheses.size(); i_hyp++) my_hypotheses[i_hyp].push_back(current_read.alignment.QueryBases[read_idx]); read_idx++; pretty_idx++; } did_splicing = SpliceAddVariantAlleles(current_read, pretty_alignment, my_ensemble, local_context, my_hypotheses, pretty_idx, global_context.DEBUG); just_did_splicing = did_splicing; } // --- --- // Have reference bases inside of window but outside of span of reference allele if (outside_ref_allele and !outside_of_window and pretty_alignment[pretty_idx] != '+') { for (unsigned int i_hyp = 1; i_hyp < my_hypotheses.size(); i_hyp++) my_hypotheses[i_hyp].push_back(ref_reader.base(chr_idx,ref_idx)); } // Have read bases as called outside of 
variant window if (outside_of_window and pretty_alignment[pretty_idx] != '-') { for (unsigned int i_hyp = 1; i_hyp < my_hypotheses.size(); i_hyp++) my_hypotheses[i_hyp].push_back(current_read.alignment.QueryBases[read_idx]); // --- Information to log flows. Indices are w.r.t. aligned portion of the read if (!did_splicing) { // Log index of the last base left of window which is the same for all hypotheses. splice_start_idx = read_idx - current_read.left_sc; } else if (just_did_splicing) { // Log length of hypothesis after splicing splice_end_idx[0] = read_idx - current_read.left_sc; int clipped_bases = 0; if (!global_context.resolve_clipped_bases) clipped_bases = current_read.left_sc; for (unsigned int i_hyp=1; i_hyp<my_hypotheses.size(); i_hyp++) splice_end_idx[i_hyp] = my_hypotheses[i_hyp].length()-1 - clipped_bases; // Hyp length depends on whether there is resolving! just_did_splicing = false; } // --- --- } IncrementAlignmentIndices(pretty_alignment[pretty_idx], ref_idx, read_idx); } // end of for loop over extended pretty alignment // Check whether the whole reference allele fit // It seems that with primer trimming ion TVC, many a read throw this warning if (ref_idx < (int)(local_context.position0 + local_context.reference_allele.length())) { did_splicing = false; if (global_context.DEBUG>0) cout << "Warning in Splicing: Reference allele "<< local_context.reference_allele << " did not fit into read " << current_read.alignment.Name << endl; } if (did_splicing) { // --- Add soft clipped bases to the right of the alignment and reverse complement --- for (unsigned int i_hyp = 1; i_hyp<my_hypotheses.size(); i_hyp++) { if (!global_context.resolve_clipped_bases) my_hypotheses[i_hyp] += current_read.alignment.QueryBases.substr(current_read.alignment.QueryBases.length()-current_read.right_sc, current_read.right_sc); if (current_read.is_reverse_strand) RevComplementInPlace(my_hypotheses[i_hyp]); } // Get the main flows before and after splicing splice_end_flow = 
GetSpliceFlows(current_read, global_context, my_hypotheses, same_as_null_hypothesis, splice_start_idx, splice_end_idx, splice_start_flow); if (splice_start_flow < 0 or splice_end_flow <= splice_start_flow) { did_splicing = false; cout << "Warning in Splicing: Splice flows are not valid in read " << current_read.alignment.Name << ". splice start flow: "<< splice_start_flow << " splice end flow " << splice_end_flow << endl; } } // Check for non-ACGT bases in hypotheses bool valid_bases = true; for (unsigned int i_hyp=0; i_hyp<my_hypotheses.size(); i_hyp++) { unsigned int iBase = 0; while (iBase<my_hypotheses[i_hyp].length() and valid_bases){ if (my_hypotheses[i_hyp].at(iBase) == 'A' or my_hypotheses[i_hyp].at(iBase) == 'C' or my_hypotheses[i_hyp].at(iBase) == 'G' or my_hypotheses[i_hyp].at(iBase) == 'T') iBase++; else valid_bases = false; } } if (not valid_bases){ cerr << "Non-Fatal ERROR in Splicing for " << local_context.contigName << ":" << local_context.position0+1 << ": Read Hypotheses for " << current_read.alignment.Name << " contain non-ACGT characters." 
<< endl; did_splicing = false; } // --- Fail safe for hypotheses and verbose if (!did_splicing) { for (unsigned int i_hyp=1; i_hyp<my_hypotheses.size(); i_hyp++) my_hypotheses[i_hyp] = my_hypotheses[0]; if (global_context.DEBUG > 1) { cout << "Failed to splice " << local_context.reference_allele << "->"; for (unsigned int i_alt = 0; i_alt<my_ensemble.allele_identity_vector.size(); i_alt++) { cout << my_ensemble.allele_identity_vector[i_alt].altAllele; if (i_alt < my_ensemble.allele_identity_vector.size()-1) cout << ","; } cout << " into read " << current_read.alignment.Name << endl; } } else if (global_context.DEBUG > 1) { cout << "Spliced " << local_context.reference_allele << "->"; for (unsigned int i_alt = 0; i_alt<my_ensemble.allele_identity_vector.size(); i_alt++) { cout << my_ensemble.allele_identity_vector[i_alt].altAllele; if (i_alt < my_ensemble.allele_identity_vector.size()-1) cout << ","; } cout << " into "; if (current_read.is_reverse_strand) cout << "reverse "; else cout << "forward "; cout << "strand read read " << current_read.alignment.Name << endl; cout << "- Read as called: " << my_hypotheses[0] << endl; cout << "- Reference Hyp.: " << my_hypotheses[1] << endl; for (unsigned int i_hyp = 2; i_hyp<my_hypotheses.size(); i_hyp++) cout << "- Variant Hyp. " << (i_hyp-1) << ": " << my_hypotheses[i_hyp] << endl; cout << "- Splice start flow: " << splice_start_flow << " Splice end flow: " << splice_end_flow << endl; } return did_splicing; };
// Realigns a small region of the read around the variant position and returns the
// complete pretty alignment string with that region replaced by the new alignment.
//   thread_objects     provides the realigner
//   current_read       read whose alignment (pretty_aln) is realigned locally
//   variant_position   reference position of the variant
//   changed_alignment  [out] set only if the realignment ran to completion: true if the
//                      realigned region differs from the original, false otherwise
//   DEBUG              verbosity; values > 1 print diagnostics to cout
//   ref_reader, chr_idx  reference access for the chromosome the read maps to
// Returns an empty string if the variant position is not covered by the aligned part of
// the read, the realignment window has zero size or the realignment failed.
string SpliceDoRealignement (PersistingThreadObjects &thread_objects, const Alignment &current_read, long variant_position,
                             bool &changed_alignment, int DEBUG, const ReferenceReader &ref_reader, int chr_idx) {

  // We do not allow any clipping since we align a short substring
  thread_objects.realigner.SetClipping(0, true);
  string new_alignment;

  // --- Get index positions at snp variant position
  int read_idx = current_read.left_sc;
  int ref_idx = current_read.alignment.Position;
  unsigned int pretty_idx = 0;

  while (pretty_idx < current_read.pretty_aln.length() and ref_idx < variant_position) {
    IncrementAlignmentIndices(current_read.pretty_aln[pretty_idx], ref_idx, read_idx);
    pretty_idx++;
  }
  if (DEBUG > 1)
    cout << "Computed variant position as (red, ref, pretty) " << read_idx << " " << ref_idx << " " << pretty_idx << endl;

  // Nothing to realign if we ran off the alignment, the reference or the aligned read bases
  if (pretty_idx >= current_read.pretty_aln.length()
      or ref_idx >= ref_reader.chr_size(chr_idx)
      or read_idx >= (int)current_read.alignment.QueryBases.length() - current_read.right_sc)
    return new_alignment;

  // --- Get small sequence context for very local realignment ------------------------
  int min_bases = 5;

  // Looking at alignment to the left of variant position to find right place to cut sequence
  int read_left = read_idx;
  int ref_left = ref_idx;
  unsigned int pretty_left = pretty_idx;
  bool continue_looking = pretty_idx > 0;

  while (continue_looking) {
    pretty_left--;
    DecrementAlignmentIndices(current_read.pretty_aln[pretty_left], ref_left, read_left);

    // Stopping criterion
    if (pretty_left < 1) {
      continue_looking = false;
      break;
    }

    if (ref_idx - ref_left < min_bases)
      continue_looking = true;
    else {
      // make sure to start with a matching base and don't split large HPs
      if (current_read.pretty_aln[pretty_left] != '|'
          or (ref_reader.base(chr_idx,ref_left+1) == ref_reader.base(chr_idx,ref_left)))
        continue_looking = true;
      else
        continue_looking = false;
    }
  }
  if (DEBUG > 1)
    cout << "Computed left realignment window as (red, ref, pretty) " << read_left << " " << ref_left << " " << pretty_left << endl;

  // Looking at alignment to the right to find right place to cut sequence
  int read_right = read_idx;
  int ref_right = ref_idx;
  unsigned int pretty_right = pretty_idx;
  continue_looking = pretty_idx < current_read.pretty_aln.length()-1;

  while (continue_looking) {
    IncrementAlignmentIndices(current_read.pretty_aln[pretty_right], ref_right, read_right);
    pretty_right++;

    // Stopping criterion (half open interval)
    if (pretty_right >= current_read.pretty_aln.length()
        or ref_right >= ref_reader.chr_size(chr_idx)) {
      continue_looking = false;
      break;
    }

    if (ref_right - ref_idx < min_bases)
      continue_looking = true;
    else {
      // make sure to stop with a matching base and don't split large HPs
      if (current_read.pretty_aln[pretty_right-1] != '|'
          or (ref_reader.base(chr_idx,ref_right-1) == ref_reader.base(chr_idx,ref_right)))
        continue_looking = true;
      else
        continue_looking = false;
    }
  }
  if (DEBUG > 1)
    cout << "Computed right realignment window as (red, ref, pretty) " << read_right << " " << ref_right << " " << pretty_right << endl;
  // Put in some sanity checks for alignment boundaries found...

  // --- Realign -------------------------
  // Output of computeSWalignment, not used afterwards; initialized so it never holds garbage
  unsigned int start_position_shift = 0;
  vector<CigarOp> new_cigar_data;
  vector<MDelement> new_md_data;

  // printouts
  if (DEBUG > 1) {
    thread_objects.realigner.verbose_ = true;
    cout << "Realigned " << current_read.alignment.Name << " from " << endl;
  }
  if (read_left >= read_right and ref_left >= ref_right) {
    if (DEBUG > 1)
      cout << "ERROR: realignment window has zero size! " << endl;
    return new_alignment;
  }

  string old_alignment = current_read.pretty_aln.substr(pretty_left, pretty_right-pretty_left);
  thread_objects.realigner.SetSequences(current_read.alignment.QueryBases.substr(read_left, read_right-read_left),
                                        ref_reader.substr(chr_idx, ref_left, ref_right-ref_left),
                                        old_alignment, true);

  if (!thread_objects.realigner.computeSWalignment(new_cigar_data, new_md_data, start_position_shift)) {
    if (DEBUG > 1)
      cout << "ERROR: realignment failed! " << endl;
    return new_alignment;
  }

  // --- Fuse realigned partial sequence back into pretty_aln string
  new_alignment = current_read.pretty_aln;
  if (old_alignment == thread_objects.realigner.pretty_aln()) {
    changed_alignment = false;
  }
  else {
    new_alignment.replace(pretty_left, (pretty_right-pretty_left), thread_objects.realigner.pretty_aln());
    changed_alignment = true;
  }
  return new_alignment;
}
void EnsembleEval::SetupAllAlleles(const ExtendParameters ¶meters, const InputStructures &global_context, const ReferenceReader &ref_reader, int chr_idx) { seq_context.DetectContext(*variant, global_context.DEBUG, ref_reader, chr_idx); allele_identity_vector.resize(variant->alt.size()); if (global_context.DEBUG > 0 and variant->alt.size()>0) { cout << "Investigating variant candidate " << seq_context.reference_allele << " -> " << variant->alt[0]; for (uint8_t i_allele = 1; i_allele < allele_identity_vector.size(); i_allele++) cout << ',' << variant->alt[i_allele]; cout << endl; } //now calculate the allele type (SNP/Indel/MNV/HPIndel etc.) and window for hypothesis calculation for each alt allele. for (uint8_t i_allele = 0; i_allele < allele_identity_vector.size(); i_allele++) { // TODO: Hotspot should be an allele property but we only set all or none to Hotspots, depending on the vcf record allele_identity_vector[i_allele].status.isHotSpot = variant->isHotSpot; allele_identity_vector[i_allele].filterReasons.clear(); allele_identity_vector[i_allele].DEBUG = global_context.DEBUG; allele_identity_vector[i_allele].indelActAsHPIndel = parameters.my_controls.filter_variant.indel_as_hpindel; allele_identity_vector[i_allele].getVariantType(variant->alt[i_allele], seq_context, global_context.ErrorMotifs, parameters.my_controls.filter_variant, ref_reader, chr_idx); allele_identity_vector[i_allele].CalculateWindowForVariant(seq_context, global_context.DEBUG, ref_reader, chr_idx); } //GetMultiAlleleVariantWindow(); multiallele_window_start = -1; multiallele_window_end = -1; // Mark Ensemble for realignment if any of the possible variants should be realigned // TODO: Should we exclude already filtered alleles? 
for (uint8_t i_allele = 0; i_allele < allele_identity_vector.size(); i_allele++) { //if (!allele_identity_vector[i_allele].status.isNoCallVariant) { if (allele_identity_vector[i_allele].start_window < multiallele_window_start or multiallele_window_start == -1) multiallele_window_start = allele_identity_vector[i_allele].start_window; if (allele_identity_vector[i_allele].end_window > multiallele_window_end or multiallele_window_end == -1) multiallele_window_end = allele_identity_vector[i_allele].end_window; if (allele_identity_vector[i_allele].ActAsSNP() && parameters.my_controls.filter_variant.do_snp_realignment) { doRealignment = doRealignment or allele_identity_vector[i_allele].status.doRealignment; } if (allele_identity_vector[i_allele].ActAsMNP() && parameters.my_controls.filter_variant.do_mnp_realignment) { doRealignment = doRealignment or allele_identity_vector[i_allele].status.doRealignment; } } // Hack: pass allele windows back down the object for (uint8_t i_allele = 0; i_allele < allele_identity_vector.size(); i_allele++) { allele_identity_vector[i_allele].start_window = multiallele_window_start; allele_identity_vector[i_allele].end_window = multiallele_window_end; } if (global_context.DEBUG > 0) { cout << "Realignment for this candidate is turned " << (doRealignment ? "on" : "off") << endl; cout << "Final window for multi-allele: " << ": (" << multiallele_window_start << ") "; for (int p_idx = multiallele_window_start; p_idx < multiallele_window_end; p_idx++) cout << ref_reader.base(chr_idx,p_idx); cout << " (" << multiallele_window_end << ") " << endl; } }
void AlleleIdentity::CalculateWindowForVariant(const LocalReferenceContext &seq_context, int DEBUG, const ReferenceReader &ref_reader, int chr_idx) { // If we have an invalid vcf candidate, set a length zero window and exit if (!seq_context.context_detected or status.isProblematicAllele) { start_window = seq_context.position0; end_window = seq_context.position0; return; } // Check for MNRs first, for InDelLengths 2,3,4,5 if (status.isIndel and !status.isHPIndel and inDelLength < 5) for (int rep_period = 2; rep_period < 6; rep_period++) if (IdentifyMultiNucRepeatSection(seq_context, rep_period, ref_reader, chr_idx)) { if (DEBUG > 0) { cout << "MNR found in allele " << seq_context.reference_allele << " -> " << altAllele << endl; cout << "Window for allele " << altAllele << ": (" << start_window << ") "; for (int p_idx = start_window; p_idx < end_window; p_idx++) cout << ref_reader.base(chr_idx,p_idx); cout << " (" << end_window << ") " << endl; } return; // Found a matching period and computed window } // not an MNR. Moving on along to InDels. 
if (status.isIndel) { // Default variant window end_window = seq_context.right_hp_start +1; // Anchor base to the right of allele start_window = seq_context.position0; // Adjustments if necessary if (status.isDeletion) if (seq_context.my_hp_start_pos[left_anchor] == seq_context.my_hp_start_pos[0]) start_window = seq_context.my_hp_start_pos[0] - 1; if (status.isInsertion) { if (left_anchor == 0) { start_window = seq_context.my_hp_start_pos[0] - 1; } else if (altAllele[left_anchor] == altAllele[left_anchor - 1] and seq_context.position0 > (seq_context.my_hp_start_pos[left_anchor - 1] - 1)) { start_window = seq_context.my_hp_start_pos[left_anchor - 1] - 1; } if (altAllele[altAllele.length() - 1] == seq_context.ref_right_hp_base) { end_window += seq_context.right_hp_length; } } // Safety if (start_window < 0) start_window = 0; if (end_window > ref_reader.chr_size(chr_idx)) end_window = ref_reader.chr_size(chr_idx); } else { // SNPs and MNVs are 1->1 base replacements start_window = seq_context.position0; end_window = seq_context.position0 + seq_context.reference_allele.length(); } // */ if (DEBUG > 0) { cout << "Window for allele " << altAllele << ": (" << start_window << ") "; for (int p_idx = start_window; p_idx < end_window; p_idx++) cout << ref_reader.base(chr_idx,p_idx); cout << " (" << end_window << ") " << endl; } }
// open BAM input file void BAMWalkerEngine::InitializeBAMs(const ReferenceReader& ref_reader, const vector<string>& bam_filenames) { if (not bam_reader_.SetExplicitMergeOrder(BamMultiReader::MergeByCoordinate)) { cerr << "ERROR: Could not set merge order to BamMultiReader::MergeByCoordinate" << endl; exit(1); } if (not bam_reader_.Open(bam_filenames)) { cerr << "ERROR: Could not open input BAM file(s) : " << bam_reader_.GetErrorString(); exit(1); } if (not bam_reader_.LocateIndexes()) { cerr << "ERROR: Could not open BAM index file(s) : " << bam_reader_.GetErrorString(); exit(1); } bam_header_ = bam_reader_.GetHeader(); if (!bam_header_.HasReadGroups()) { cerr << "ERROR: there is no read group in BAM files specified" << endl; exit(1); } // // Reference sequences in the bam file must match that in the fasta file // vector<RefData> referenceSequences = bam_reader_.GetReferenceData(); if ((int)referenceSequences.size() != ref_reader.chr_count()) { cerr << "ERROR: Reference in BAM file does not match fasta file" << endl << " BAM has " << referenceSequences.size() << " chromosomes while fasta has " << ref_reader.chr_count() << endl; exit(1); } for (int chr_idx = 0; chr_idx < ref_reader.chr_count(); ++chr_idx) { if (referenceSequences[chr_idx].RefName != ref_reader.chr_str(chr_idx)) { cerr << "ERROR: Reference in BAM file does not match fasta file" << endl << " Chromosome #" << (chr_idx+1) << "in BAM is " << referenceSequences[chr_idx].RefName << " while fasta has " << ref_reader.chr_str(chr_idx) << endl; exit(1); } if (referenceSequences[chr_idx].RefLength != ref_reader.chr_size(chr_idx)) { cerr << "ERROR: Reference in BAM file does not match fasta file" << endl << " Chromosome " << referenceSequences[chr_idx].RefName << "in BAM has length " << referenceSequences[chr_idx].RefLength << " while fasta has " << ref_reader.chr_size(chr_idx) << endl; exit(1); } } // // Retrieve BaseCaller and TMAP version strings from BAM header // set<string> basecaller_versions; 
set<string> tmap_versions; for (SamProgramIterator I = bam_header_.Programs.Begin(); I != bam_header_.Programs.End(); ++I) { if (I->ID.substr(0,2) == "bc") basecaller_versions.insert(I->Version); if (I->ID.substr(0,4) == "tmap") tmap_versions.insert(I->Version); } basecaller_version_ = ""; for (set<string>::const_iterator I = basecaller_versions.begin(); I != basecaller_versions.end(); ++I) { if (not basecaller_version_.empty()) basecaller_version_ += ", "; basecaller_version_ += *I; } tmap_version_ = ""; for (set<string>::const_iterator I = tmap_versions.begin(); I != tmap_versions.end(); ++I) { if (not tmap_version_.empty()) tmap_version_ += ", "; tmap_version_ += *I; } }