// Fills flowIndex with, for every base of the read, the index of the flow in
// flowOrder that matches that base. The search starts at start_flow and only
// ever moves forward, so consecutive identical bases share one flow.
//
// The read bases are taken from alignment.QueryBases and reverse complemented
// when is_forward_strand is false.
//
// Bases that cannot be placed before the flow order is exhausted keep the
// sentinel value flowOrder.length(), and a warning is written to cerr.
//
// flowOrder : nucleotide flow order; it is only read here.
void ExtendedReadInfo::CreateFlowIndex(string &flowOrder) {

  string read_bases = alignment.QueryBases;
  if (!is_forward_strand)
    RevComplementInPlace(read_bases);

  // Every entry starts out as "does not fit" (== flowOrder.length())
  flowIndex.assign(read_bases.length(), flowOrder.length());

  unsigned int flow = start_flow;
  unsigned int base_idx = 0;

  while (base_idx < read_bases.length() and flow < flowOrder.length()) {

    // Advance to the next flow that incorporates this base
    while (flow < flowOrder.length() and flowOrder[flow] != read_bases[base_idx])
      flow++;

    // Flow order exhausted: this base (and all following ones) keep the sentinel.
    // Not counting it as placed makes the warning below fire even if it was the
    // very last base of the read.
    if (flow >= flowOrder.length())
      break;

    flowIndex[base_idx] = flow;
    base_idx++;
  }

  if (base_idx != read_bases.length())
    cerr << "WARNING in ExtendedReadInfo::CreateFlowIndex: There are more bases in the read than fit into the flow order." << endl;
}
bool SpliceVariantHypotheses(const Alignment ¤t_read, const EnsembleEval &my_ensemble, const LocalReferenceContext &local_context, PersistingThreadObjects &thread_objects, int &splice_start_flow, int &splice_end_flow, vector<string> &my_hypotheses, vector<bool> & same_as_null_hypothesis, bool & changed_alignment, const InputStructures &global_context, const ReferenceReader &ref_reader, int chr_idx) { // Hypotheses: 1) Null; read as called 2) Reference Hypothesis 3-?) Variant Hypotheses my_hypotheses.resize(my_ensemble.allele_identity_vector.size()+2); same_as_null_hypothesis.assign(my_hypotheses.size(), false); // Set up variables to log the flows we splice into splice_start_flow = -1; splice_end_flow = -1; int splice_start_idx = -1; vector<int> splice_end_idx; splice_end_idx.assign(my_hypotheses.size(), -1); // 1) Null hypothesis is read as called if (global_context.resolve_clipped_bases) { unsigned int null_hyp_length = current_read.read_bases.length() - current_read.left_sc - current_read.right_sc; my_hypotheses[0] = current_read.read_bases.substr(current_read.start_sc, null_hyp_length); } else my_hypotheses[0] = current_read.read_bases; // Initialize hypotheses variables for splicing for (unsigned int i_hyp = 1; i_hyp < my_hypotheses.size(); i_hyp++) { my_hypotheses[i_hyp].clear(); my_hypotheses[i_hyp].reserve(current_read.alignment.QueryBases.length() + 20 + local_context.reference_allele.length()); // Add soft clipped bases on the left side of alignment if desired if (!global_context.resolve_clipped_bases) my_hypotheses[i_hyp] += current_read.alignment.QueryBases.substr(0, current_read.left_sc); } int read_idx = current_read.left_sc; int ref_idx = current_read.alignment.Position; int read_idx_max = current_read.alignment.QueryBases.length() - current_read.right_sc; bool did_splicing = false; bool just_did_splicing = false; string pretty_alignment; changed_alignment = false; // do realignment of a small region around variant if desired if 
(my_ensemble.doRealignment) { pretty_alignment = SpliceDoRealignement(thread_objects, current_read, local_context.position0, changed_alignment, global_context.DEBUG, ref_reader, chr_idx); if (pretty_alignment.empty() and global_context.DEBUG > 0) cerr << "Realignment returned an empty string in read " << current_read.alignment.Name << endl; } if (pretty_alignment.empty()) { pretty_alignment = current_read.pretty_aln; changed_alignment = false; } // Now fill in 2) and 3) for (unsigned int pretty_idx = 0; pretty_idx < pretty_alignment.length(); pretty_idx++) { bool outside_of_window = ref_idx < my_ensemble.multiallele_window_start or ref_idx >= my_ensemble.multiallele_window_end; bool outside_ref_allele = (long)ref_idx < local_context.position0 or ref_idx >= (int)(local_context.position0 + local_context.reference_allele.length()); // Basic sanity checks if (read_idx >= read_idx_max or ref_idx > ref_reader.chr_size(chr_idx) or (ref_idx == ref_reader.chr_size(chr_idx) and pretty_alignment[pretty_idx] != '+')) { did_splicing = false; break; } // --- Splice --- if (ref_idx == local_context.position0 and !did_splicing and !outside_of_window) { // Add insertions before variant window while (pretty_idx < pretty_alignment.length() and pretty_alignment[pretty_idx] == '+') { for (unsigned int i_hyp = 1; i_hyp < my_hypotheses.size(); i_hyp++) my_hypotheses[i_hyp].push_back(current_read.alignment.QueryBases[read_idx]); read_idx++; pretty_idx++; } did_splicing = SpliceAddVariantAlleles(current_read, pretty_alignment, my_ensemble, local_context, my_hypotheses, pretty_idx, global_context.DEBUG); just_did_splicing = did_splicing; } // --- --- // Have reference bases inside of window but outside of span of reference allele if (outside_ref_allele and !outside_of_window and pretty_alignment[pretty_idx] != '+') { for (unsigned int i_hyp = 1; i_hyp < my_hypotheses.size(); i_hyp++) my_hypotheses[i_hyp].push_back(ref_reader.base(chr_idx,ref_idx)); } // Have read bases as called outside of 
variant window if (outside_of_window and pretty_alignment[pretty_idx] != '-') { for (unsigned int i_hyp = 1; i_hyp < my_hypotheses.size(); i_hyp++) my_hypotheses[i_hyp].push_back(current_read.alignment.QueryBases[read_idx]); // --- Information to log flows. Indices are w.r.t. aligned portion of the read if (!did_splicing) { // Log index of the last base left of window which is the same for all hypotheses. splice_start_idx = read_idx - current_read.left_sc; } else if (just_did_splicing) { // Log length of hypothesis after splicing splice_end_idx[0] = read_idx - current_read.left_sc; int clipped_bases = 0; if (!global_context.resolve_clipped_bases) clipped_bases = current_read.left_sc; for (unsigned int i_hyp=1; i_hyp<my_hypotheses.size(); i_hyp++) splice_end_idx[i_hyp] = my_hypotheses[i_hyp].length()-1 - clipped_bases; // Hyp length depends on whether there is resolving! just_did_splicing = false; } // --- --- } IncrementAlignmentIndices(pretty_alignment[pretty_idx], ref_idx, read_idx); } // end of for loop over extended pretty alignment // Check whether the whole reference allele fit // It seems that with primer trimming ion TVC, many a read throw this warning if (ref_idx < (int)(local_context.position0 + local_context.reference_allele.length())) { did_splicing = false; if (global_context.DEBUG>0) cout << "Warning in Splicing: Reference allele "<< local_context.reference_allele << " did not fit into read " << current_read.alignment.Name << endl; } if (did_splicing) { // --- Add soft clipped bases to the right of the alignment and reverse complement --- for (unsigned int i_hyp = 1; i_hyp<my_hypotheses.size(); i_hyp++) { if (!global_context.resolve_clipped_bases) my_hypotheses[i_hyp] += current_read.alignment.QueryBases.substr(current_read.alignment.QueryBases.length()-current_read.right_sc, current_read.right_sc); if (current_read.is_reverse_strand) RevComplementInPlace(my_hypotheses[i_hyp]); } // Get the main flows before and after splicing splice_end_flow = 
GetSpliceFlows(current_read, global_context, my_hypotheses, same_as_null_hypothesis, splice_start_idx, splice_end_idx, splice_start_flow); if (splice_start_flow < 0 or splice_end_flow <= splice_start_flow) { did_splicing = false; cout << "Warning in Splicing: Splice flows are not valid in read " << current_read.alignment.Name << ". splice start flow: "<< splice_start_flow << " splice end flow " << splice_end_flow << endl; } } // Check for non-ACGT bases in hypotheses bool valid_bases = true; for (unsigned int i_hyp=0; i_hyp<my_hypotheses.size(); i_hyp++) { unsigned int iBase = 0; while (iBase<my_hypotheses[i_hyp].length() and valid_bases){ if (my_hypotheses[i_hyp].at(iBase) == 'A' or my_hypotheses[i_hyp].at(iBase) == 'C' or my_hypotheses[i_hyp].at(iBase) == 'G' or my_hypotheses[i_hyp].at(iBase) == 'T') iBase++; else valid_bases = false; } } if (not valid_bases){ cerr << "Non-Fatal ERROR in Splicing for " << local_context.contigName << ":" << local_context.position0+1 << ": Read Hypotheses for " << current_read.alignment.Name << " contain non-ACGT characters." 
<< endl; did_splicing = false; } // --- Fail safe for hypotheses and verbose if (!did_splicing) { for (unsigned int i_hyp=1; i_hyp<my_hypotheses.size(); i_hyp++) my_hypotheses[i_hyp] = my_hypotheses[0]; if (global_context.DEBUG > 1) { cout << "Failed to splice " << local_context.reference_allele << "->"; for (unsigned int i_alt = 0; i_alt<my_ensemble.allele_identity_vector.size(); i_alt++) { cout << my_ensemble.allele_identity_vector[i_alt].altAllele; if (i_alt < my_ensemble.allele_identity_vector.size()-1) cout << ","; } cout << " into read " << current_read.alignment.Name << endl; } } else if (global_context.DEBUG > 1) { cout << "Spliced " << local_context.reference_allele << "->"; for (unsigned int i_alt = 0; i_alt<my_ensemble.allele_identity_vector.size(); i_alt++) { cout << my_ensemble.allele_identity_vector[i_alt].altAllele; if (i_alt < my_ensemble.allele_identity_vector.size()-1) cout << ","; } cout << " into "; if (current_read.is_reverse_strand) cout << "reverse "; else cout << "forward "; cout << "strand read read " << current_read.alignment.Name << endl; cout << "- Read as called: " << my_hypotheses[0] << endl; cout << "- Reference Hyp.: " << my_hypotheses[1] << endl; for (unsigned int i_hyp = 2; i_hyp<my_hypotheses.size(); i_hyp++) cout << "- Variant Hyp. " << (i_hyp-1) << ": " << my_hypotheses[i_hyp] << endl; cout << "- Splice start flow: " << splice_start_flow << " Splice end flow: " << splice_end_flow << endl; } return did_splicing; };
// Function to fill in predicted signal values void BaseHypothesisEvaluator(BamTools::BamAlignment &alignment, const string &flow_order_str, const string &alt_base_hyp, float &delta_score, float &fit_score, int heavy_verbose) { // --- Step 1: Initialize Objects and retrieve relevant tags delta_score = 1e5; fit_score = 1e5; vector<string> Hypotheses(2); vector<float> measurements, phase_params; int start_flow, num_flows, prefix_flow=0; if (not GetBamTags(alignment, flow_order_str.length(), measurements, phase_params, start_flow)) return; num_flows = measurements.size(); ion::FlowOrder flow_order(flow_order_str, num_flows); BasecallerRead master_read; master_read.SetData(measurements, flow_order.num_flows()); TreephaserLite treephaser(flow_order); treephaser.SetModelParameters(phase_params[0], phase_params[1]); // --- Step 2: Solve beginning of the read // Look at mapped vs. unmapped reads in BAM Hypotheses[0] = alignment.QueryBases; Hypotheses[1] = alt_base_hyp; // Safety: reverse complement reverse strand reads in mapped bam if (alignment.IsMapped() and alignment.IsReverseStrand()) { RevComplementInPlace(Hypotheses[0]); RevComplementInPlace(Hypotheses[1]); } prefix_flow = GetMasterReadPrefix(treephaser, flow_order, start_flow, Hypotheses[0], master_read); unsigned int prefix_size = master_read.sequence.size(); // --- Step 3: creating predictions for the individual hypotheses vector<BasecallerRead> hypothesesReads(Hypotheses.size()); vector<float> squared_distances(Hypotheses.size(), 0.0); int max_last_flow = 0; for (unsigned int i_hyp=0; i_hyp<hypothesesReads.size(); ++i_hyp) { hypothesesReads[i_hyp] = master_read; // --- add hypothesis sequence to clipped prefix unsigned int i_base = 0; int i_flow = prefix_flow; while (i_base<Hypotheses[i_hyp].length() and i_base<(2*(unsigned int)flow_order.num_flows()-prefix_size)) { while (i_flow < flow_order.num_flows() and flow_order.nuc_at(i_flow) != Hypotheses[i_hyp][i_base]) i_flow++; if (i_flow < flow_order.num_flows() and 
i_flow > max_last_flow) max_last_flow = i_flow; if (i_flow >= flow_order.num_flows()) break; // Add base to sequence only if it fits into flow order hypothesesReads[i_hyp].sequence.push_back(Hypotheses[i_hyp][i_base]); i_base++; } i_flow = min(i_flow, flow_order.num_flows()-1); // Solver simulates beginning of the read and then fills in the remaining clipped bases for which we have flow information treephaser.Solve(hypothesesReads[i_hyp], num_flows, i_flow); } // Compute L2-distance of measurements and predictions for (unsigned int i_hyp=0; i_hyp<hypothesesReads.size(); ++i_hyp) { for (int iFlow=0; iFlow<=max_last_flow; iFlow++) squared_distances[i_hyp] += (measurements.at(iFlow) - hypothesesReads[i_hyp].prediction.at(iFlow)) * (measurements.at(iFlow) - hypothesesReads[i_hyp].prediction.at(iFlow)); } // Delta: L2-distance of alternative base Hypothesis - L2-distance of bases as called delta_score = squared_distances.at(1) - squared_distances.at(0); fit_score = min(squared_distances.at(1), squared_distances.at(0)); // --- verbose --- if (heavy_verbose > 1 or (delta_score < 0 and heavy_verbose > 0)) { cout << "Processed read " << alignment.Name << endl; cout << "Delta Fit: " << delta_score << " Overall Fit: " << fit_score << endl; PredictionGenerationVerbose(Hypotheses, hypothesesReads, phase_params, flow_order, start_flow, prefix_size); } }
// Unpacks everything that is needed from a freshly loaded read into *rai.
//
// In order: strand, well coordinates and run id (from the read name), flow
// order of the run, measurements (ZM tag), phasing parameters (ZP tag), read
// bases, alignment details, start flow (ZF tag) and flow index, read group
// (RG tag) and the hard clipped read prefix (key, ZT and ZE tags).
//
// Recoverable problems (no run id, unknown run, empty read, zero start flow)
// print a warning, set rai->filtered and return. A missing or malformed
// ZM / ZP / ZF tag or a start flow outside of the valid range print an error
// and terminate the program through exit(1).
//
//   rai            : read to unpack; nothing is done if rai->filtered is set
//   global_context : per run flow orders and flow counts, keys by read group
//                    and the resolve_clipped_bases setting
void UnpackOnLoad(Alignment *rai, const InputStructures &global_context)
{
  // No need to waste time if the read is filtered
  if (rai->filtered)
    return;

  const string &read_name = rai->alignment.Name;
  rai->is_reverse_strand = rai->alignment.IsReverseStrand();

  // --- Parse read name: well coordinates and run id

  rai->runid.clear();
  if (not read_name.empty()) {
    rai->well_rowcol.resize(2);
    ion_readname_to_rowcol(read_name.c_str(), &rai->well_rowcol[0], &rai->well_rowcol[1]);
    // The run id is everything in front of the first colon
    rai->runid = read_name.substr(0, read_name.find(":"));
  }

  if (rai->runid.empty()) {
    cerr << "WARNING: Unable to determine run id of read " << read_name << endl;
    rai->filtered = true;
    return;
  }

  // --- Flow order that belongs to the run of this read

  std::map<string,int>::const_iterator run_entry = global_context.flow_order_index_by_run_id.find(rai->runid);
  if (run_entry == global_context.flow_order_index_by_run_id.end()) {
    cerr << "WARNING: No matching flow oder found for read " << read_name << endl;
    rai->filtered = true;
    return;
  }
  rai->flow_order_index = run_entry->second;
  const ion::FlowOrder & flow_order = global_context.flow_order_vector.at(rai->flow_order_index);

  // --- Retrieve measurements from ZM tag

  vector<int16_t> quantized_measurements;
  if (not rai->alignment.GetTag("ZM", quantized_measurements)) {
    cerr << "ERROR: Normalized measurements ZM:tag is not present in read " << read_name << endl;
    exit(1);
  }
  if ((int)quantized_measurements.size() > global_context.num_flows_by_run_id.at(rai->runid)) {
    cerr << "ERROR: Normalized measurements ZM:tag length " << quantized_measurements.size()
         << " exceeds flow order length " << global_context.num_flows_by_run_id.at(rai->runid)
         << " in read " << read_name << endl;
    exit(1);
  }

  // Flows beyond the end of the tag stay at zero; stored values are scaled by 1/256
  rai->measurements.assign(global_context.num_flows_by_run_id.at(rai->runid), 0.0);
  for (size_t i_flow = 0; i_flow < quantized_measurements.size(); ++i_flow)
    rai->measurements[i_flow] = static_cast<float>(quantized_measurements[i_flow]) / 256;
  rai->measurements_length = quantized_measurements.size();

  // --- Retrieve phasing parameters from ZP tag

  if (not rai->alignment.GetTag("ZP", rai->phase_params)) {
    cerr << "ERROR: Phasing Parameters ZP:tag is not present in read " << read_name << endl;
    exit(1);
  }
  if (rai->phase_params.size() != 3) {
    cerr << "ERROR: Phasing Parameters ZP:tag does not have 3 phase parameters in read " << read_name << endl;
    exit(1);
  }

  bool phase_params_in_range = true;
  for (int i_param = 0; i_param < 3; ++i_param) {
    if (rai->phase_params[i_param] < 0 or rai->phase_params[i_param] > 1)
      phase_params_in_range = false;
  }
  if (not phase_params_in_range) {
    cerr << "ERROR: Phasing Parameters ZP:tag outside of [0,1] range in read " << read_name << endl;
    exit(1);
  }
  rai->phase_params[2] = 0.0f;   // ad-hoc corrector: zero droop

  // --- Populate read_bases (bases without rev-comp on reverse-mapped reads)

  rai->read_bases = rai->alignment.QueryBases;
  if (rai->is_reverse_strand)
    RevComplementInPlace(rai->read_bases);

  if (rai->read_bases.empty()) {
    cerr << "WARNING: Ignoring length zero read " << read_name << endl;
    rai->filtered = true;
    return;
  }

  // --- Unpack alignment

  rai->pretty_aln.reserve(global_context.num_flows_by_run_id.at(rai->runid));
  UnpackAlignmentInfo(rai);

  // Soft clip at the start of read_bases: right clip for reverse strand reads
  rai->start_sc = rai->is_reverse_strand ? rai->right_sc : rai->left_sc;

  // --- Start flow and flow index

  rai->start_flow = 0;
  if (not rai->alignment.GetTag("ZF", rai->start_flow)) {
    // Second attempt: read the tag as a single byte
    uint8_t start_flow_byte = 0;
    if (not rai->alignment.GetTag("ZF", start_flow_byte)) {
      cerr << "ERROR: Start Flow ZF:tag not found in read " << read_name << endl;
      exit(1);
    }
    rai->start_flow = (int)start_flow_byte;
  }

  if (rai->start_flow == 0) {
    cerr << "WARNING: Start Flow ZF:tag has zero value in read " << read_name << endl;
    rai->filtered = true;
    return;
  }

  CreateFlowIndex(rai, flow_order);

  // Increment start flow to first aligned base
  if (global_context.resolve_clipped_bases)
    rai->start_flow = rai->flow_index[rai->start_sc];

  // Check validity of input arguments
  if (rai->start_flow < 0 or rai->start_flow >= global_context.num_flows_by_run_id.at(rai->runid)) {
    cerr << "ERROR: Start flow outside of [0,num_flows) range in read " << read_name << endl;
    cerr << "Start flow: " << rai->start_flow
         << " Number of flows: " << global_context.flow_order_vector.at(rai->flow_order_index).num_flows();
    exit(1);
  }

  // --- Retrieve read group name & generate prefix flow

  if (not rai->alignment.GetTag("RG",rai->read_group)) {
    cerr << "WARNING: No read group found in read " << read_name << endl;
    // No big problem, we'll just have to solve the prefix like it's 2013!
    rai->read_group.clear();
  }

  // Get read prefix - hard clipped start of the read: [KS][ZT][ZE]
  rai->prefix_flow = -1;
  std::map<string,string>::const_iterator group_key = global_context.key_by_read_group.find(rai->read_group);
  if (group_key != global_context.key_by_read_group.end()) {
    rai->prefix_bases = group_key->second;

    string temp_zt, temp_ze;
    if (rai->alignment.GetTag("ZT", temp_zt))
      rai->prefix_bases += temp_zt;
    if (rai->alignment.GetTag("ZE", temp_ze))
      rai->prefix_bases += temp_ze;

    if (not rai->prefix_bases.empty())
      GetPrefixFlow(rai, rai->prefix_bases, flow_order);
  }

  // Check consistency of prefix_flow and start_flow - maybe we don't have all
  // info about hard clipped bases
  if (rai->prefix_flow >= 0) {
    // First flow at or after prefix_flow that matches the first read base
    int expected_start_flow = rai->prefix_flow;
    for (; expected_start_flow < flow_order.num_flows(); ++expected_start_flow) {
      if (flow_order.nuc_at(expected_start_flow) == rai->read_bases.at(0))
        break;
    }

    if (expected_start_flow != rai->start_flow) {
      rai->prefix_flow = -1;
      rai->prefix_bases.clear();
    }
  }
}