// Update the training data on the current segment.
//
// The segment is defined by three consecutive anchored columns starting at
// segment_id (start, middle, end). The base sequences of the start and middle
// columns are joined at their shared k-mer, and every read returned by
// get_input_for_columns for the start..end span is passed to
// update_training_with_segment together with that joined sequence.
//
// window     - realignment input holding the anchored columns
// segment_id - index of the start column; segment_id + 2 must be a valid column
void train_segment(HMMRealignmentInput& window, uint32_t segment_id)
{
    // Get the segments
    assert(segment_id + 2 < window.anchored_columns.size());
    HMMAnchoredColumn& start_column = window.anchored_columns[segment_id];
    HMMAnchoredColumn& middle_column = window.anchored_columns[segment_id + 1];
    HMMAnchoredColumn& end_column = window.anchored_columns[segment_id + 2];

    std::string s_m_base = start_column.base_sequence;
    std::string m_e_base = middle_column.base_sequence;

    // Set up the input data for the HMM
    std::vector<HMMInputData> input = get_input_for_columns(window, start_column, end_column);

    // no training can be performed if there are no reads for this segment
    if(input.empty()) {
        return;
    }

    // assume models for all the reads have the same k
    const uint32_t k = input[0].read->pore_model[input[0].strand].k;

    std::string segment_sequence = join_sequences_at_kmer(s_m_base, m_e_base, k);

    for(uint32_t ri = 0; ri < input.size(); ++ri) {
        // update_training_with_segment runs profile_hmm_align on the same
        // arguments itself, so no separate alignment is computed here.
        update_training_with_segment(segment_sequence, input[ri]);
    }
}
// Call a new consensus for the segment starting at segment_id and re-anchor
// the middle column on it.
//
// window     - realignment input holding the anchored columns; the base
//              sequences of the start and middle columns and the event
//              indices of the middle column's anchors are updated in place
// segment_id - index of the start column; segment_id + 2 must be a valid column
// k          - k-mer length used to join and to re-split the sequences
void run_splice_segment(HMMRealignmentInput& window, uint32_t segment_id, const uint32_t k)
{
    // The structure of the data looks like this:

    // --------------------------------------------------------
    // S                       M                              E
    // where S is the start column, M is the middle column and E
    // is the end column. We want to call a new consensus from S
    // to E. We do this by generating the base sequence from S to E
    // and then applying all of the alternatives indicated by the
    // start column. We score these alternatives using
    // the read strands spanning from S to E. After a new consensus
    // has been selected, we re-calculate the alignments of events to
    // the middle anchor.

    // Get the segments
    assert(segment_id + 2 < window.anchored_columns.size());
    HMMAnchoredColumn& start_column = window.anchored_columns[segment_id];
    HMMAnchoredColumn& middle_column = window.anchored_columns[segment_id + 1];
    HMMAnchoredColumn& end_column = window.anchored_columns[segment_id + 2];

    std::string s_m_base = start_column.base_sequence;
    std::string m_e_base = middle_column.base_sequence;

    // The collection of alternative sequences
    std::vector<std::string> alts;

    for(uint32_t ai = 0; ai < start_column.alt_sequences.size(); ++ai) {
        alts.push_back(start_column.alt_sequences[ai]);
    }

    // set up the input data for the HMM
    std::vector<HMMInputData> data = get_input_for_columns(window, start_column, end_column);

    if(opt::verbose > 0) {
        fprintf(stderr, "correcting segment %u with %zu reads\n", segment_id, data.size());
    }

    // The current consensus sequence
    std::string original = join_sequences_at_kmer(s_m_base, m_e_base, k);
    std::string base = original;

    // filter out poor quality reads
    filter_outlier_data(data, base);

    // Only attempt correction if there are any reads here
    if(!data.empty()) {

        std::string bs_result = run_block_substitution(base, data, alts);
        std::string mut_result = run_mutation(bs_result, data);
        base = mut_result;
    }

    if(opt::verbose > 0) {
        // segment_id is unsigned, so it is printed with %u
        fprintf(stderr, "ORIGINAL[%u] %s\n", segment_id, original.c_str());
        fprintf(stderr, "RESULT[%u]   %s\n", segment_id, base.c_str());
    }

    // Update the sequences for the start and middle segments
    // by cutting the new consensus in the middle
    // We maintain the k-mer match invariant by requiring the
    // sequences to overlap by k-bp
    assert(base.length() >= k);
    uint32_t midpoint_kmer = (base.length() - k + 1) / 2;

    std::string s_m_fixed = base.substr(0, midpoint_kmer + k);
    std::string m_e_fixed = base.substr(midpoint_kmer);

    assert(s_m_fixed.substr(s_m_fixed.size() - k) == m_e_fixed.substr(0, k));

    start_column.base_sequence = s_m_fixed;
    middle_column.base_sequence = m_e_fixed;

    // Update the event indices in the middle column to match the new consensus
    for(uint32_t ri = 0; ri < data.size(); ++ri) {

        // Realign to the consensus sequence
        std::vector<HMMAlignmentState> decodes = profile_hmm_align(base, data[ri]);

        // Get the closest event aligned to the target kmer.
        // NOTE(review): if the alignment is empty, event_idx stays 0 and is
        // still written to the anchor below - confirm this is intended.
        int32_t min_k_dist = static_cast<int32_t>(base.length());
        uint32_t event_idx = 0;
        for(uint32_t di = 0; di < decodes.size(); ++di) {
            // Compute the distance in signed 64-bit arithmetic so that a
            // k-mer index before the midpoint cannot wrap around as an
            // unsigned difference.
            const int64_t delta = static_cast<int64_t>(decodes[di].kmer_idx) -
                                  static_cast<int64_t>(midpoint_kmer);
            const int32_t dist = static_cast<int32_t>(delta < 0 ? -delta : delta);

            // "<=" means that on ties the last equally-close state wins
            if(dist <= min_k_dist) {
                min_k_dist = dist;
                event_idx = decodes[di].event_idx;
            }
        }

        middle_column.anchors[data[ri].anchor_index].event_idx = event_idx;
    }
}
// Align `sequence` against the events in `data` with profile_hmm_align and
// hand the result to print_alignment, labelled with name, seq_id and read_id.
void debug_sequence(const std::string& name, uint32_t seq_id, uint32_t read_id, const HMMInputSequence& sequence, const HMMInputData& data)
{
    // Compute the alignment first, then print it
    std::vector<HMMAlignmentState> decoded_states(profile_hmm_align(sequence, data));

    print_alignment(name,
                    seq_id,
                    read_id,
                    sequence,
                    data,
                    decoded_states);
}
// Align `sequence` to the events in `data` and feed the resulting alignment
// into the training state stored for the read's strand
// (data.read->parameters[data.strand]).
void update_training_with_segment(const HMMInputSequence& sequence, const HMMInputData& data)
{
    // Alignment of the read's events against the segment sequence
    std::vector<HMMAlignmentState> decoded_states(profile_hmm_align(sequence, data));

    // Accumulate training observations for this strand from the alignment
    data.read->parameters[data.strand].add_training_from_alignment(sequence,
                                                                   data,
                                                                   decoded_states);
}
// Example #5
// 0
// Align the events in `data` to `sequence`.
// Forwards to profile_hmm_align and returns its result unchanged.
std::vector<AlignmentState> hmm_align(const std::string& sequence, const HMMInputData& data)
{
    // Disabled alternative kept for reference:
    //    return khmm_posterior_decode(sequence, state);
    std::vector<AlignmentState> aligned_states = profile_hmm_align(sequence, data);
    return aligned_states;
}