void SNPBamProcessor::process_reads(std::vector< std::vector<BamTools::BamAlignment> >& paired_strs_by_rg, std::vector< std::vector<BamTools::BamAlignment> >& mate_pairs_by_rg, std::vector< std::vector<BamTools::BamAlignment> >& unpaired_strs_by_rg, std::vector<std::string>& rg_names, Region& region, std::string& ref_allele, std::string& chrom_seq, std::ostream& out){ // Only use specialized function for 10X genomics BAMs if flag has been set if(bams_from_10x_){ process_10x_reads(paired_strs_by_rg, mate_pairs_by_rg, unpaired_strs_by_rg, rg_names, region, ref_allele, chrom_seq, out); return; } locus_snp_phase_info_time_ = clock(); assert(paired_strs_by_rg.size() == mate_pairs_by_rg.size() && paired_strs_by_rg.size() == unpaired_strs_by_rg.size()); if (paired_strs_by_rg.size() == 0 && unpaired_strs_by_rg.size() == 0) return; std::vector< std::vector<BamTools::BamAlignment> > alignments(paired_strs_by_rg.size()); std::vector< std::vector<double> > log_p1s, log_p2s; bool got_snp_info = false; if (have_snp_vcf_){ std::vector<SNPTree*> snp_trees; std::map<std::string, unsigned int> sample_indices; if(create_snp_trees(region.chrom(), (region.start() > MAX_MATE_DIST ? region.start()-MAX_MATE_DIST : 1), region.stop()+MAX_MATE_DIST, phased_snp_vcf_, sample_indices, snp_trees, logger())){ got_snp_info = true; std::set<std::string> bad_samples, good_samples; for (unsigned int i = 0; i < paired_strs_by_rg.size(); ++i){ if (sample_indices.find(rg_names[i]) != sample_indices.end()){ good_samples.insert(rg_names[i]); std::vector<double> log_p1, log_p2; SNPTree* snp_tree = snp_trees[sample_indices[rg_names[i]]]; calc_het_snp_factors(paired_strs_by_rg[i], mate_pairs_by_rg[i], base_quality_, snp_tree, log_p1, log_p2, match_count_, mismatch_count_); calc_het_snp_factors(unpaired_strs_by_rg[i], base_quality_, snp_tree, log_p1, log_p2, match_count_, mismatch_count_); log_p1s.push_back(log_p1); log_p2s.push_back(log_p2); } else { std::vector<double> log_p1, log_p2; for (unsigned int j = 0; j < paired_strs_by_rg[i].size()+unpaired_strs_by_rg[i].size(); ++j){ log_p1.push_back(0); log_p2.push_back(0); // Assign equal phasing LLs as no SNP info is available } log_p1s.push_back(log_p1); log_p2s.push_back(log_p2); bad_samples.insert(rg_names[i]); } // Copy alignments alignments[i].insert(alignments[i].end(), paired_strs_by_rg[i].begin(), paired_strs_by_rg[i].end()); alignments[i].insert(alignments[i].end(), unpaired_strs_by_rg[i].begin(), unpaired_strs_by_rg[i].end()); } logger() << "Found VCF info for " << good_samples.size() << " out of " << good_samples.size()+bad_samples.size() << " samples with STR reads" << std::endl; } else logger() << "Warning: Failed to construct SNP trees for " << region.chrom() << ":" << region.start() << "-" << region.stop() << std::endl; destroy_snp_trees(snp_trees); } if (!got_snp_info){ for (unsigned int i = 0; i < paired_strs_by_rg.size(); i++){ // Copy alignments alignments[i].insert(alignments[i].end(), paired_strs_by_rg[i].begin(), paired_strs_by_rg[i].end()); alignments[i].insert(alignments[i].end(), unpaired_strs_by_rg[i].begin(), unpaired_strs_by_rg[i].end()); // Assign equal phasing LLs as no SNP info is available log_p1s.push_back(std::vector<double>(paired_strs_by_rg[i].size()+unpaired_strs_by_rg[i].size(), 0.0)); log_p2s.push_back(std::vector<double>(paired_strs_by_rg[i].size()+unpaired_strs_by_rg[i].size(), 0.0)); } } int phased_samples = 0, phased_reads = 0, total_reads = 0; for (unsigned int i = 0; i < alignments.size(); i++){ bool sample_phased = false; for (unsigned int j = 0; j < alignments[i].size(); j++){ sample_phased |= (log_p1s[i][j] != log_p2s[i][j]); phased_reads += (log_p1s[i][j] != log_p2s[i][j]); } total_reads += alignments[i].size(); phased_samples += sample_phased; } logger() << "Phased SNPs add info for " << phased_reads << " out of " << total_reads << " reads" << " and " << phased_samples << " out of " << rg_names.size() << " samples" << std::endl; locus_snp_phase_info_time_ = (clock() - locus_snp_phase_info_time_)/CLOCKS_PER_SEC; total_snp_phase_info_time_ += locus_snp_phase_info_time_; // Run any additional analyses using phasing probabilities analyze_reads_and_phasing(alignments, log_p1s, log_p2s, rg_names, region, ref_allele, chrom_seq, 0); }