// update the training data on the current segment void train_segment(HMMRealignmentInput& window, uint32_t segment_id) { // Get the segments assert(segment_id + 2 < window.anchored_columns.size()); HMMAnchoredColumn& start_column = window.anchored_columns[segment_id]; HMMAnchoredColumn& middle_column = window.anchored_columns[segment_id + 1]; HMMAnchoredColumn& end_column = window.anchored_columns[segment_id + 2]; std::string s_m_base = start_column.base_sequence; std::string m_e_base = middle_column.base_sequence; // Set up the the input data for the HMM std::vector<HMMInputData> input = get_input_for_columns(window, start_column, end_column); // no training can be performed if there are no reads for this segment if(input.empty()) { return; } // assume models for all the reads have the same k const uint32_t k = input[0].read->pore_model[input[0].strand].k; std::string segment_sequence = join_sequences_at_kmer(s_m_base, m_e_base, k); for(uint32_t ri = 0; ri < input.size(); ++ri) { std::vector<HMMAlignmentState> decodes = profile_hmm_align(segment_sequence, input[ri]); update_training_with_segment(segment_sequence, input[ri]); } }
void run_splice_segment(HMMRealignmentInput& window, uint32_t segment_id, const uint32_t k) { // The structure of the data looks like this: // -------------------------------------------------------- // S M E // where is the start column, M is the middle column and E // is the end column. We want to call a new consensus from S // to E. We do this by generating the base sequence from S to E // and then applying all of the alternatives indicated by the // start and middle column. We score these alternatives using // the read strands spanning from S to E. After a new consensus // has been selected, we re-calculate the alignments of events to // the middle anchor. // Get the segments assert(segment_id + 2 < window.anchored_columns.size()); HMMAnchoredColumn& start_column = window.anchored_columns[segment_id]; HMMAnchoredColumn& middle_column = window.anchored_columns[segment_id + 1]; HMMAnchoredColumn& end_column = window.anchored_columns[segment_id + 2]; std::string s_m_base = start_column.base_sequence; std::string m_e_base = middle_column.base_sequence; // The collection of alternative sequences std::vector<std::string> alts; for(uint32_t ai = 0; ai < start_column.alt_sequences.size(); ++ai) { alts.push_back(start_column.alt_sequences[ai]); } // set up the input data for the HMM std::vector<HMMInputData> data = get_input_for_columns(window, start_column, end_column); if(opt::verbose > 0) { fprintf(stderr, "correcting segment %u with %zu reads\n", segment_id, data.size()); } // The current consensus sequence std::string original = join_sequences_at_kmer(s_m_base, m_e_base, k); std::string base = original; // filter out poor quality reads filter_outlier_data(data, base); // Only attempt correction if there are any reads here if(!data.empty()) { std::string bs_result = run_block_substitution(base, data, alts); std::string mut_result = run_mutation(bs_result, data); base = mut_result; } if(opt::verbose > 0) { fprintf(stderr, "ORIGINAL[%d] %s\n", segment_id, 
original.c_str()); fprintf(stderr, "RESULT[%d] %s\n", segment_id, base.c_str()); } // Update the sequences for the start and middle segments // by cutting the new consensus in the middle // We maintain the k-mer match invariant by requiring the // sequences to overlap by k-bp assert(base.length() >= k); uint32_t midpoint_kmer = (base.length() - k + 1) / 2; std::string s_m_fixed = base.substr(0, midpoint_kmer + k); std::string m_e_fixed = base.substr(midpoint_kmer); assert(s_m_fixed.substr(s_m_fixed.size() - k) == m_e_fixed.substr(0, k)); start_column.base_sequence = s_m_fixed; middle_column.base_sequence = m_e_fixed; // Update the event indices in the first column to match for(uint32_t ri = 0; ri < data.size(); ++ri) { // Realign to the consensus sequence std::vector<HMMAlignmentState> decodes = profile_hmm_align(base, data[ri]); // Get the closest event aligned to the target kmer int32_t min_k_dist = base.length(); uint32_t event_idx = 0; for(uint32_t di = 0; di < decodes.size(); ++di) { int32_t dist = abs(decodes[di].kmer_idx - midpoint_kmer); if(dist <= min_k_dist) { min_k_dist = dist; event_idx = decodes[di].event_idx; } } middle_column.anchors[data[ri].anchor_index].event_idx = event_idx; } }
// Debug helper: align `data` against `sequence` with profile_hmm_align() and
// hand the resulting alignment to print_alignment(), together with the
// caller-supplied labels (name, seq_id, read_id).
void debug_sequence(const std::string& name,
                    uint32_t seq_id,
                    uint32_t read_id,
                    const HMMInputSequence& sequence,
                    const HMMInputData& data)
{
    std::vector<HMMAlignmentState> decoded_states = profile_hmm_align(sequence, data);

    print_alignment(name, seq_id, read_id, sequence, data, decoded_states);
}
// Align `data` to `sequence` and add the alignment to the training data of
// the read's parameter set for this strand
// (data.read->parameters[data.strand]).
void update_training_with_segment(const HMMInputSequence& sequence,
                                  const HMMInputData& data)
{
    // Step 1: compute the event-to-sequence alignment
    std::vector<HMMAlignmentState> segment_alignment = profile_hmm_align(sequence, data);

    // Step 2: accumulate training data from that alignment
    data.read->parameters[data.strand].add_training_from_alignment(sequence,
                                                                   data,
                                                                   segment_alignment);
}
// Align `data` to `sequence`; currently a thin wrapper that forwards to
// profile_hmm_align().
//
// NOTE(review): an alternative decoder call, khmm_posterior_decode(sequence, state),
// was left disabled here; it refers to a `state` variable that this function
// does not declare, so it appears to be stale.
std::vector<AlignmentState> hmm_align(const std::string& sequence,
                                      const HMMInputData& data)
{
    std::vector<AlignmentState> aligned_states = profile_hmm_align(sequence, data);
    return aligned_states;
}