// Test CpG sites in this read for methylation void calculate_methylation_for_read(const ModelMap& model_map, const Fast5Map& name_map, const faidx_t* fai, const bam_hdr_t* hdr, const bam1_t* record, size_t read_idx, const OutputHandles& handles) { // Load a squiggle read for the mapped read std::string read_name = bam_get_qname(record); std::string fast5_path = name_map.get_path(read_name); SquiggleRead sr(read_name, fast5_path); // An output map from reference positions to scored CpG sites std::map<int, ScoredSite> site_score_map; for(size_t strand_idx = 0; strand_idx < NUM_STRANDS; ++strand_idx) { std::vector<double> site_scores; std::vector<int> site_starts; std::vector<int> site_ends; std::vector<int> site_count; // replace the baked-in pore model with the methylation model // (including unmethylated kmers) for this strand std::string curr_model = sr.pore_model[strand_idx].name; std::string methyl_model = curr_model + ".ecoli_er2925.pcr_MSssI.timp.021216.alphabet_cpg.model"; auto model_iter = model_map.find(methyl_model); if(model_iter != model_map.end()) { sr.pore_model[strand_idx].update_states( model_iter->second ); } else { fprintf(stderr, "Error, methylated model %s not found\n", methyl_model.c_str()); exit(EXIT_FAILURE); } size_t k = sr.pore_model[strand_idx].k; // Align in event space using the new model EventAlignmentParameters params; params.sr = &sr; params.fai = fai; params.hdr = hdr; params.record = record; params.strand_idx = strand_idx; params.read_idx = read_idx; params.alphabet = mtest_alphabet; std::vector<EventAlignment> alignment_output = align_read_to_ref(params); if(alignment_output.empty()) continue; std::string contig = alignment_output.front().ref_name.c_str(); // Convert the EventAlignment to a map between reference positions and events std::vector<AlignedPair> event_aligned_pairs; for(size_t i = 0; i < alignment_output.size(); ++i) { AlignedPair ap = { alignment_output[i].ref_position, alignment_output[i].event_idx }; event_aligned_pairs.push_back(ap); } int ref_start_pos = event_aligned_pairs.front().ref_pos; int ref_end_pos = event_aligned_pairs.back().ref_pos; // Extract the reference sequence for this region int fetched_len = 0; assert(ref_end_pos >= ref_start_pos); std::string ref_seq = get_reference_region_ts(params.fai, contig.c_str(), ref_start_pos, ref_end_pos, &fetched_len); // Remove non-ACGT bases from this reference segment ref_seq = gDNAAlphabet.disambiguate(ref_seq); // Scan the sequence for CpGs std::vector<int> cpg_sites; assert(ref_seq.size() != 0); for(size_t i = 0; i < ref_seq.size() - 1; ++i) { if(ref_seq[i] == 'C' && ref_seq[i+1] == 'G') { cpg_sites.push_back(i); } } // Batch the CpGs together into groups that are separated by some minimum distance int min_separation = 10; size_t curr_idx = 0; while(curr_idx < cpg_sites.size()) { // Find the endpoint of this group of sites size_t end_idx = curr_idx + 1; while(end_idx < cpg_sites.size()) { if(cpg_sites[end_idx] - cpg_sites[end_idx - 1] > min_separation) break; end_idx += 1; } // the coordinates on the reference substring for this group of sites int sub_start_pos = cpg_sites[curr_idx] - min_separation; int sub_end_pos = cpg_sites[end_idx - 1] + min_separation; if(sub_start_pos > min_separation && cpg_sites[end_idx - 1] - cpg_sites[curr_idx] < 200) { std::string subseq = ref_seq.substr(sub_start_pos, sub_end_pos - sub_start_pos + 1); std::string rc_subseq = mtest_alphabet->reverse_complement(subseq); // using the reference-to-event map, look up the event indices for this segment AlignedPairRefLBComp lb_comp; AlignedPairConstIter start_iter = std::lower_bound(event_aligned_pairs.begin(), event_aligned_pairs.end(), sub_start_pos + ref_start_pos, lb_comp); AlignedPairConstIter stop_iter = std::lower_bound(event_aligned_pairs.begin(), event_aligned_pairs.end(), sub_end_pos + ref_start_pos, lb_comp); // Only process this region if the the read is aligned within the boundaries // and the span between the start/end is not unusually short if(start_iter != event_aligned_pairs.end() && stop_iter != event_aligned_pairs.end() && abs(start_iter->read_pos - stop_iter->read_pos) > 10) { uint32_t hmm_flags = HAF_ALLOW_PRE_CLIP | HAF_ALLOW_POST_CLIP; // Set up event data HMMInputData data; data.read = &sr; data.anchor_index = -1; // unused data.strand = strand_idx; data.rc = alignment_output.front().rc; data.event_start_idx = start_iter->read_pos; data.event_stop_idx = stop_iter->read_pos; data.event_stride = data.event_start_idx <= data.event_stop_idx ? 1 : -1; // Calculate the likelihood of the unmethylated sequence HMMInputSequence unmethylated(subseq, rc_subseq, mtest_alphabet); double unmethylated_score = profile_hmm_score(unmethylated, data, hmm_flags); // Methylate all CpGs in the sequence and score again std::string mcpg_subseq = mtest_alphabet->methylate(subseq); std::string rc_mcpg_subseq = mtest_alphabet->reverse_complement(mcpg_subseq); // Calculate the likelihood of the methylated sequence HMMInputSequence methylated(mcpg_subseq, rc_mcpg_subseq, mtest_alphabet); double methylated_score = profile_hmm_score(methylated, data, hmm_flags); // Aggregate score int start_position = cpg_sites[curr_idx] + ref_start_pos; auto iter = site_score_map.find(start_position); if(iter == site_score_map.end()) { // insert new score into the map ScoredSite ss; ss.chromosome = contig; ss.start_position = start_position; ss.end_position = cpg_sites[end_idx - 1] + ref_start_pos; ss.n_cpg = end_idx - curr_idx; // extract the CpG site(s) with a k-mers worth of surrounding context size_t site_output_start = cpg_sites[curr_idx] - k + 1; size_t site_output_end = cpg_sites[end_idx - 1] + k; ss.sequence = ref_seq.substr(site_output_start, site_output_end - site_output_start); // insert into the map iter = site_score_map.insert(std::make_pair(start_position, ss)).first; } // set strand-specific score // upon output below the strand scores will be summed iter->second.ll_unmethylated[strand_idx] = unmethylated_score; iter->second.ll_methylated[strand_idx] = methylated_score; } } curr_idx = end_idx; } } // for strands #pragma omp critical(methyltest_write) { // these variables are sums over all sites within a read double ll_ratio_sum_strand[2] = { 0.0f, 0.0f }; double ll_ratio_sum_both = 0; size_t num_positive = 0; // write all sites for this read for(auto iter = site_score_map.begin(); iter != site_score_map.end(); ++iter) { const ScoredSite& ss = iter->second; double sum_ll_m = ss.ll_methylated[0] + ss.ll_methylated[1]; double sum_ll_u = ss.ll_unmethylated[0] + ss.ll_unmethylated[1]; double diff = sum_ll_m - sum_ll_u; num_positive += diff > 0; fprintf(handles.site_writer, "%s\t%d\t%d\t", ss.chromosome.c_str(), ss.start_position, ss.end_position); fprintf(handles.site_writer, "ReadIdx=%zu;", read_idx); fprintf(handles.site_writer, "LogLikMeth=%.2lf;LogLikUnmeth=%.2lf;LogLikRatio=%.2lf;", sum_ll_m, sum_ll_u, diff); fprintf(handles.site_writer, "LogLikMethByStrand=%.2lf,%.2lf;", ss.ll_methylated[0], ss.ll_methylated[1]); fprintf(handles.site_writer, "LogLikUnmethByStrand=%.2lf,%.2lf;", ss.ll_unmethylated[0], ss.ll_unmethylated[1]); fprintf(handles.site_writer, "NumCpGs=%d;Sequence=%s\n", ss.n_cpg, ss.sequence.c_str()); ll_ratio_sum_strand[0] += ss.ll_methylated[0] - ss.ll_unmethylated[0]; ll_ratio_sum_strand[1] += ss.ll_methylated[1] - ss.ll_unmethylated[1]; ll_ratio_sum_both += diff; } std::string complement_model = sr.pore_model[C_IDX].name; fprintf(handles.read_writer, "%s\t%.2lf\t%zu\t%s\tNumPositive=%zu\n", fast5_path.c_str(), ll_ratio_sum_both, site_score_map.size(), complement_model.c_str(), num_positive); for(size_t si = 0; si < NUM_STRANDS; ++si) { std::string model = sr.pore_model[si].name; fprintf(handles.strand_writer, "%s\t%.2lf\t%zu\t%s\n", fast5_path.c_str(), ll_ratio_sum_strand[si], site_score_map.size(), model.c_str()); } } }
// Update the training data with aligned events from a read void add_aligned_events(const Fast5Map& name_map, const faidx_t* fai, const bam_hdr_t* hdr, const bam1_t* record, size_t read_idx, int region_start, int region_end, size_t round, ModelTrainingMap& training) { // Load a squiggle read for the mapped read std::string read_name = bam_get_qname(record); std::string fast5_path = name_map.get_path(read_name); // load read SquiggleRead sr(read_name, fast5_path); // replace the models that are built into the read with the current trained model sr.replace_models(opt::trained_model_type); for(size_t strand_idx = 0; strand_idx < NUM_STRANDS; ++strand_idx) { // skip if 1D reads and this is the wrong strand if(!sr.has_events_for_strand(strand_idx)) { continue; } // set k uint32_t k = sr.pore_model[strand_idx].k; // Align to the new model EventAlignmentParameters params; params.sr = &sr; params.fai = fai; params.hdr = hdr; params.record = record; params.strand_idx = strand_idx; params.alphabet = mtrain_alphabet; params.read_idx = read_idx; params.region_start = region_start; params.region_end = region_end; std::vector<EventAlignment> alignment_output = align_read_to_ref(params); if (alignment_output.size() == 0) return; // Update pore model based on alignment std::string curr_model = sr.pore_model[strand_idx].metadata.get_short_name(); double orig_score = -INFINITY; if (opt::output_scores) { orig_score = model_score(sr, strand_idx, fai, alignment_output, 500, NULL); #pragma omp critical(print) std::cout << round << " " << curr_model << " " << read_idx << " " << strand_idx << " Original " << orig_score << std::endl; } if ( opt::calibrate ) { double resid = 0.; recalibrate_model(sr, strand_idx, alignment_output, mtrain_alphabet, resid, true); if (opt::output_scores) { double rescaled_score = model_score(sr, strand_idx, fai, alignment_output, 500, NULL); #pragma omp critical(print) { std::cout << round << " " << curr_model << " " << read_idx << " " << strand_idx << " Rescaled " << rescaled_score << std::endl; std::cout << round << " " << curr_model << " " << read_idx << " " << strand_idx << " Delta " << rescaled_score-orig_score << std::endl; } } } // Get the training data for this model auto& emission_map = training[curr_model]; for(size_t i = 0; i < alignment_output.size(); ++i) { const EventAlignment& ea = alignment_output[i]; std::string model_kmer = ea.model_kmer; // Grab the previous/next model kmer from the alignment_output table. // If the read is from the same strand as the reference // the next kmer comes from the next alignment_output (and vice-versa) // other the indices are swapped int next_stride = ea.rc ? -1 : 1; std::string prev_kmer = ""; std::string next_kmer = ""; if(i > 0 && i < alignment_output.size() - 1) { // check that the event indices are correct for the next expected position assert(alignment_output[i + next_stride].event_idx - ea.event_idx == 1); assert(alignment_output[i - next_stride].event_idx - ea.event_idx == -1); // only set the previous/next when there was exactly one base of movement along the referenc if( std::abs(alignment_output[i + next_stride].ref_position - ea.ref_position) == 1) { next_kmer = alignment_output[i + next_stride].model_kmer; } if( std::abs(alignment_output[i - next_stride].ref_position - ea.ref_position) == 1) { prev_kmer = alignment_output[i - next_stride].model_kmer; } } // Get the rank of the kmer that we aligned to (on the sequencing strand, = model_kmer) uint32_t rank = mtrain_alphabet->kmer_rank(model_kmer.c_str(), k); assert(rank < emission_map.size()); auto& kmer_summary = emission_map[rank]; // We only use this event for training if its not at the end of the alignment // (to avoid bad alignments around the read edges) and if its not too short (to // avoid bad measurements from effecting the levels too much) bool use_for_training = i > opt::min_distance_from_alignment_end && i + opt::min_distance_from_alignment_end < alignment_output.size() && alignment_output[i].hmm_state == 'M' && sr.get_duration( alignment_output[i].event_idx, strand_idx) >= opt::min_event_duration && sr.get_fully_scaled_level(alignment_output[i].event_idx, strand_idx) >= 1.0; if(use_for_training) { StateTrainingData std(sr, ea, rank, prev_kmer, next_kmer); #pragma omp critical(kmer) kmer_summary.events.push_back(std); } if(ea.hmm_state == 'M') { #pragma omp atomic kmer_summary.num_matches += 1; } else if(ea.hmm_state == 'E') { #pragma omp atomic kmer_summary.num_stays += 1; } } } // for strands }