/* Comparator over two genome positions.
 *
 * Walks the cared offsets F2CAREDPOSITION[F2SEEDKEYWIGTH .. F2CAREDPOSITION_SIZE)
 * and compares the bases found at those offsets from p1 and from p2.
 * Returns true when p1 orders before p2:
 *   - if an offset reaches past the end of p2's chromosome, the answer is false;
 *   - otherwise, if it reaches past the end of p1's chromosome, the answer is true;
 *   - otherwise the first differing base decides.
 * Positions that agree on every inspected offset compare as equivalent (false).
 */
bool operator()(const uint32_t& p1, const uint32_t& p2) {
  /* Number of bases between each position and the start of the next
   * chromosome in genome.start_index. */
  uint32_t lhs_chrom = getChromID(genome.start_index, p1);
  uint32_t rhs_chrom = getChromID(genome.start_index, p2);
  uint32_t lhs_room = genome.start_index[lhs_chrom + 1] - p1;
  uint32_t rhs_room = genome.start_index[rhs_chrom + 1] - p2;

  const char* lhs_seq = &(genome.sequence[p1]);
  const char* rhs_seq = &(genome.sequence[p2]);

  uint32_t k = F2SEEDKEYWIGTH;
  while (k < F2CAREDPOSITION_SIZE) {
    /* The right-hand check must come first to keep the ordering strict-weak:
     * when both sides run out of sequence the result has to be false. */
    if (F2CAREDPOSITION[k] >= rhs_room) {
      return false;
    }
    if (F2CAREDPOSITION[k] >= lhs_room) {
      return true;
    }

    const char lhs_base = lhs_seq[F2CAREDPOSITION[k]];
    const char rhs_base = rhs_seq[F2CAREDPOSITION[k]];
    if (lhs_base != rhs_base) {
      return lhs_base < rhs_base;
    }
    ++k;
  }

  return false;
}
void PairEndMapping(const string& org_read, const Genome& genome, const HashTable& hash_table, const char& strand, const bool& AG_WILDCARD, const uint32_t& max_mismatches, TopCandidates& top_match) { uint32_t read_len = org_read.size(); if (read_len < MINIMALREADLEN) { return; } /* return the maximal seed length for a particular read length */ uint32_t seed_pattern_repeats = (read_len - SEEPATTERNLEN + 1) / SEEPATTERNLEN; uint32_t seed_len = seed_pattern_repeats * SEEPATTERNCAREDWEIGHT; string read; if (AG_WILDCARD) { G2A(org_read, read_len, read); } else { C2T(org_read, read_len, read); } uint32_t cur_max_mismatches = max_mismatches; for (uint32_t seed_i = 0; seed_i < SEEPATTERNLEN; ++seed_i) { /* all exact matches are covered by the first seed */ if (!top_match.Empty() && top_match.Full() && top_match.Top().mismatch == 0 && seed_i) break; #if defined SEEDPATTERN3 || SEEDPATTERN5 /* all matches with 1 mismatch are covered by the first two seeds */ if (!top_match.Empty() && top_match.Full() && top_match.Top().mismatch == 1 && seed_i >= 2) break; #endif #ifdef SEEDPATTERN7 /* all matches with 1 mismatch are covered by the first two seeds */ if (!top_match.Empty() && top_match.Full() && top_match.Top().mismatch == 1 && seed_i >= 4) break; #endif string read_seed = read.substr(seed_i); uint32_t hash_value = getHashValue(read_seed.c_str()); pair<uint32_t, uint32_t> region; region.first = hash_table.counter[hash_value]; region.second = hash_table.counter[hash_value + 1]; if (region.first == region.second) continue; IndexRegion(read_seed, genome, hash_table, seed_len, region); if (region.second - region.first + 1 > 5000) { continue; } for (uint32_t j = region.first; j <= region.second; ++j) { uint32_t genome_pos = hash_table.index[j]; uint32_t chr_id = getChromID(genome.start_index, genome_pos); if (genome_pos - genome.start_index[chr_id] < seed_i) continue; genome_pos = genome_pos - seed_i; if (genome_pos + read_len >= genome.start_index[chr_id + 1]) continue; /* check 
the position */ uint32_t num_of_mismatch = 0; uint32_t num_of_nocared = seed_pattern_repeats * SEEPATTERNNOCAREDWEIGHT + seed_i; for (uint32_t p = 0; p < num_of_nocared && num_of_mismatch <= cur_max_mismatches; ++p) { if (genome.sequence[genome_pos + F2NOCAREDPOSITION[seed_i][p]] != read[F2NOCAREDPOSITION[seed_i][p]]) { num_of_mismatch++; } } for (uint32_t p = seed_pattern_repeats * SEEPATTERNLEN + seed_i; p < read_len && num_of_mismatch <= cur_max_mismatches; ++p) { if (genome.sequence[genome_pos + p] != read[p]) { num_of_mismatch++; } } if (num_of_mismatch > max_mismatches) { continue; } top_match.Push(CandidatePosition(genome_pos, strand, num_of_mismatch)); if (top_match.Full()) { cur_max_mismatches = top_match.Top().mismatch; } } } }
/* merge the mapping results from paired reads */ void MergePairedEndResults( const Genome& genome, const string& read_name, const string& read_seq1, const string& read_score1, const string& read_seq2, const string& read_score2, const vector<vector<CandidatePosition> >& ranked_results, const vector<int>& ranked_results_size, const int& frag_range, const uint32_t& max_mismatches, const bool& SAM, StatPairedReads& stat_paired_reads, FILE * fout) { #ifdef DEBUG for (int i = ranked_results_size[0] - 1; i >= 0; --i) { const CandidatePosition& r1 = ranked_results[0][i]; uint32_t chr_id1 = getChromID(genome.start_index, r1.genome_pos); uint32_t start_pos = r1.genome_pos - genome.start_index[chr_id1]; if ('-' == r1.strand) { start_pos = genome.length[chr_id1] - start_pos - read_seq1.size(); } uint32_t end_pos = start_pos + read_seq1.size(); fprintf(stderr, "%u %s %u %u %c %u\n", r1.genome_pos, genome.name[chr_id1].c_str(), start_pos, end_pos, r1.strand, r1.mismatch); } for (int j = ranked_results_size[1] - 1; j >= 0; --j) { const CandidatePosition& r2 = ranked_results[1][j]; uint32_t chr_id2 = getChromID(genome.start_index, r2.genome_pos); uint32_t start_pos = r2.genome_pos - genome.start_index[chr_id2]; if ('-' == r2.strand) { start_pos = genome.length[chr_id2] - start_pos - read_seq2.size(); } uint32_t end_pos = start_pos + read_seq2.size(); fprintf(stderr, "%u %s %u %u %c %u\n", r2.genome_pos, genome.name[chr_id2].c_str(), start_pos, end_pos, r2.strand, r2.mismatch); } #endif uint32_t read_len1 = read_seq1.size(); uint32_t read_len2 = read_seq2.size(); pair<int, int> best_pair(-1, -1); uint32_t min_num_of_mismatch = max_mismatches; uint64_t best_pos = 0; uint32_t best_times = 0; for (int i = ranked_results_size[0] - 1; i >= 0; --i) { for (int j = ranked_results_size[1] - 1; j >= 0; --j) { const CandidatePosition& r1 = ranked_results[0][i]; const CandidatePosition& r2 = ranked_results[1][j]; if (r1.strand == r2.strand) continue; uint32_t num_of_mismatch = r1.mismatch + 
r2.mismatch; if (num_of_mismatch > min_num_of_mismatch) break; uint32_t chr_id1 = getChromID(genome.start_index, r1.genome_pos); uint32_t chr_id2 = getChromID(genome.start_index, r2.genome_pos); if (chr_id1 != chr_id2) continue; int frag_size = GetFragmentLength(r1, r2, frag_range, read_len1, read_len2, genome, chr_id1, chr_id2); if (frag_size <= 0 || frag_size > frag_range) continue; uint64_t cur_pos = r1.genome_pos; cur_pos <<= 32; cur_pos += r2.genome_pos; if (num_of_mismatch < min_num_of_mismatch) { best_pair = make_pair(i, j); best_times = 1; min_num_of_mismatch = num_of_mismatch; best_pos = cur_pos; } else if (num_of_mismatch == min_num_of_mismatch && cur_pos != best_pos) { best_pair = make_pair(i, j); best_times++; } } } BestMatch best_match_1(0, 0, '+', max_mismatches); BestMatch best_match_2(0, 0, '+', max_mismatches); bool is_paired_mapped = false; int len = 0; if (best_times == 1) { stat_paired_reads.unique_mapped_pairs++; len = OutputBestPairedResults(ranked_results[0][best_pair.first], ranked_results[1][best_pair.second], frag_range, read_len1, read_len2, genome, read_name, read_seq1, read_score1, read_seq2, read_score2, SAM, fout); if (SAM) { // SAM is_paired_mapped = true; const CandidatePosition& r1 = ranked_results[0][best_pair.first]; const CandidatePosition& r2 = ranked_results[1][best_pair.second]; best_match_1 = BestMatch(r1.genome_pos, 1, r1.strand, r1.mismatch); best_match_2 = BestMatch(r2.genome_pos, 1, r2.strand, r2.mismatch); } } else { if (best_times >= 2) { stat_paired_reads.ambiguous_mapped_pairs++; } else { stat_paired_reads.unmapped_pairs++; } GetBestMatch4Single(ranked_results[0], ranked_results_size[0], genome, read_len1, read_name, read_seq1, read_score1, max_mismatches, best_match_1); GetBestMatch4Single(ranked_results[1], ranked_results_size[1], genome, read_len2, read_name, read_seq2, read_score2, max_mismatches, best_match_2); StatInfoUpdate(best_match_1.times, stat_paired_reads.stat_single_reads_1); 
StatInfoUpdate(best_match_2.times, stat_paired_reads.stat_single_reads_2); if (!SAM) { OutputSingleResults(best_match_1, read_name, read_seq1, read_score1, genome, false, stat_paired_reads.stat_single_reads_1, fout); OutputSingleResults(best_match_2, read_name, read_seq2, read_score2, genome, true, stat_paired_reads.stat_single_reads_2, fout); } } if (SAM) { // Output SAM int flag_1 = GetSAMFLAG(true, is_paired_mapped, best_match_1.times == 0, best_match_2.times == 0, best_match_1.strand == '-', best_match_2.strand == '-', true, false, best_match_1.times >= 2); int flag_2 = GetSAMFLAG(true, is_paired_mapped, best_match_2.times == 0, best_match_1.times == 0, best_match_2.strand == '-', best_match_1.strand == '-', false, true, best_match_2.times >= 2); OutputPairedSAM(best_match_1, best_match_2, genome, read_name, read_seq1, read_score1, read_seq2, read_score2, len, flag_1, flag_2, stat_paired_reads, fout); } }
void OutputPairedSAM(const BestMatch& best_match_1, const BestMatch& best_match_2, const Genome& genome, const string& read_name, const string& read_seq1, const string& read_score1, const string& read_seq2, const string& read_score2, const int& len, const int& flag_1, const int& flag_2, StatPairedReads& stat_paired_reads, FILE * fout) { uint32_t chr_id_1 = getChromID(genome.start_index, best_match_1.genome_pos); uint32_t chr_id_2 = getChromID(genome.start_index, best_match_2.genome_pos); uint32_t s1 = 0, s2 = 0, e1 = 0, e2 = 0; ForwardChromPosition(best_match_1.genome_pos, best_match_1.strand, chr_id_1, read_seq1.size(), genome, s1, e1); ForwardChromPosition(best_match_2.genome_pos, best_match_2.strand, chr_id_2, read_seq2.size(), genome, s2, e2); uint32_t mismatch1 = best_match_1.mismatch; uint32_t mismatch2 = best_match_2.mismatch; if (best_match_1.times == 0) { s1 = 0; mismatch1 = 0; } else { s1 += 1; } if (best_match_2.times == 0) { s2 = 0; mismatch2 = 0; } else { s2 += 1; } int len1 = best_match_1.strand == '+' ? len : -len; int len2 = best_match_2.strand == '+' ? 
len : -len; string rnext1 = "=", rnext2 = "="; if (flag_1 & 0x2) { rnext1 = "="; rnext2 = "="; } else { if (best_match_1.times == 0) { rnext1 = "*"; } else { rnext1 = genome.name[chr_id_1].c_str(); } if (best_match_2.times == 0) { rnext2 = "*"; } else { rnext2 = genome.name[chr_id_2].c_str(); } } string read_seq1_tmp = read_seq1; string read_seq2_tmp = read_seq2; string read_score1_tmp = read_score1; string read_score2_tmp = read_score2; if (best_match_1.strand == '-') { read_seq1_tmp = ReverseComplimentString(read_seq1_tmp); read_score1_tmp = ReverseString(read_score1_tmp); } if (best_match_2.strand == '-') { read_seq2_tmp = ReverseComplimentString(read_seq2_tmp); read_score2_tmp = ReverseString(read_score2_tmp); } uint32_t read_len1 = read_seq1.size(); uint32_t read_len2 = read_seq2.size(); if (best_match_1.times == 0 && stat_paired_reads.stat_single_reads_1.unmapped) { fprintf(fout, "%s\t%d\t*\t%u\t255\t*\t%s\t%u\t%d\t%s\t%s\tNM:i:%u\n", read_name.c_str(), flag_1, s1, rnext2.c_str(), s2, len1, read_seq1_tmp.c_str(), read_score1_tmp.c_str(), mismatch1); } else if (best_match_1.times == 1) { fprintf(fout, "%s\t%d\t%s\t%u\t255\t%uM\t%s\t%u\t%d\t%s\t%s\tNM:i:%u\n", read_name.c_str(), flag_1, genome.name[chr_id_1].c_str(), s1, read_len1, rnext2.c_str(), s2, len1, read_seq1_tmp.c_str(), read_score1_tmp.c_str(), mismatch1); } else if (best_match_1.times >= 2 && stat_paired_reads.stat_single_reads_1.ambiguous) { fprintf(fout, "%s\t%d\t%s\t%u\t255\t%uM\t%s\t%u\t%d\t%s\t%s\tNM:i:%u\n", read_name.c_str(), flag_1, genome.name[chr_id_1].c_str(), s1, read_len1, rnext2.c_str(), s2, len1, read_seq1_tmp.c_str(), read_score1_tmp.c_str(), mismatch1); } if (best_match_2.times == 0 && stat_paired_reads.stat_single_reads_2.unmapped) { fprintf(fout, "%s\t%d\t*\t%u\t255\t*\t%s\t%u\t%d\t%s\t%s\tNM:i:%u\n", read_name.c_str(), flag_2, s2, rnext1.c_str(), s1, len2, read_seq2_tmp.c_str(), read_score2_tmp.c_str(), mismatch2); } else if (best_match_2.times == 1) { fprintf(fout, 
"%s\t%d\t%s\t%u\t255\t%uM\t%s\t%u\t%d\t%s\t%s\tNM:i:%u\n", read_name.c_str(), flag_2, genome.name[chr_id_2].c_str(), s2, read_len2, rnext1.c_str(), s1, len2, read_seq2_tmp.c_str(), read_score2_tmp.c_str(), mismatch2); } else if (best_match_2.times >= 2 && stat_paired_reads.stat_single_reads_2.ambiguous) { fprintf(fout, "%s\t%d\t%s\t%u\t255\t%uM\t%s\t%u\t%d\t%s\t%s\tNM:i:%u\n", read_name.c_str(), flag_2, genome.name[chr_id_2].c_str(), s2, read_len2, rnext1.c_str(), s1, len2, read_seq2_tmp.c_str(), read_score2_tmp.c_str(), mismatch2); } }
int OutputBestPairedResults(const CandidatePosition& r1, const CandidatePosition& r2, const int& frag_range, const uint32_t& read_len1, const uint32_t& read_len2, const Genome& genome, const string& read_name, const string& read_seq1, const string& read_score1, const string& read_seq2, const string& read_score2, const bool& SAM, FILE * fout) { string read_seq2_rev = ReverseComplimentString(read_seq2); string read_scr2_rev = ReverseString(read_score2); uint32_t chr_id1 = getChromID(genome.start_index, r1.genome_pos); uint32_t chr_id2 = getChromID(genome.start_index, r2.genome_pos); uint32_t s1 = 0, s2 = 0, e1 = 0, e2 = 0; ForwardChromPosition(r1.genome_pos, r1.strand, chr_id1, read_len1, genome, s1, e1); ForwardChromPosition(r2.genome_pos, r2.strand, chr_id2, read_len2, genome, s2, e2); uint32_t overlap_s = MAX(s1, s2); uint32_t overlap_e = MIN(e1, e2); uint32_t one_l = r1.strand == '+' ? s1 : MAX(overlap_e, s1); uint32_t one_r = r1.strand == '+' ? MIN(overlap_s, e1) : e1; uint32_t two_l = r1.strand == '+' ? MAX(overlap_e, s2) : s2; uint32_t two_r = r1.strand == '+' ? e2 : MIN(overlap_s, e2); int len = r1.strand == '+' ? 
(two_r - one_l) : (one_r - two_l); if (SAM) { return len; } string seq(len, 'N'); string scr(len, 'B'); if (len > 0 && len <= frag_range) { // lim_one: offset in merged sequence where overlap starts uint32_t lim_one = one_r - one_l; copy(read_seq1.begin(), read_seq1.begin() + lim_one, seq.begin()); copy(read_score1.begin(), read_score1.begin() + lim_one, scr.begin()); uint32_t lim_two = two_r - two_l; copy(read_seq2_rev.end() - lim_two, read_seq2_rev.end(), seq.end() - lim_two); copy(read_scr2_rev.end() - lim_two, read_scr2_rev.end(), scr.end() - lim_two); // deal with overlapping part if (overlap_s < overlap_e) { uint32_t one_bads = count(read_seq1.begin(), read_seq1.end(), 'N'); int info_one = read_len1 - (one_bads + r1.mismatch); uint32_t two_bads = count(read_seq2_rev.begin(), read_seq2_rev.end(), 'N'); int info_two = read_len2 - (two_bads + r2.mismatch); // use the mate with the most info to fill in the overlap if (info_one >= info_two) { uint32_t a = r1.strand == '+' ? (overlap_s - s1) : (e1 - overlap_e); uint32_t b = r1.strand == '+' ? (overlap_e - s1) : (e1 - overlap_s); copy(read_seq1.begin() + a, read_seq1.begin() + b, seq.begin() + lim_one); copy(read_score1.begin() + a, read_score1.begin() + b, scr.begin() + lim_one); } else { uint32_t a = r1.strand == '+' ? (overlap_s - s2) : (e2 - overlap_e); uint32_t b = r1.strand == '+' ? (overlap_e - s2) : (e2 - overlap_s); copy(read_seq2_rev.begin() + a, read_seq2_rev.begin() + b, seq.begin() + lim_one); copy(read_scr2_rev.begin() + a, read_scr2_rev.begin() + b, scr.begin() + lim_one); } } } uint32_t start_pos = r1.strand == '+' ? s1 : s2; fprintf(fout, "%s\t%u\t%u\tFRAG:%s\t%u\t%c\t%s\t%s\n", genome.name[chr_id1].c_str(), start_pos, start_pos + len, read_name.c_str(), r1.mismatch + r2.mismatch, r1.strand, seq.c_str(), scr.c_str()); return 0; }