Exemplo n.º 1
0
  /* Comparison functor for two genome positions.
   *
   * Walks the cared offsets F2CAREDPOSITION[F2SEEDKEYWIGTH ..
   * F2CAREDPOSITION_SIZE) and compares the bases found at those offsets
   * from p1 and from p2. Returns true when p1 orders before p2 and false
   * otherwise (including when all compared bases are equal). */
  bool operator()(const uint32_t& p1, const uint32_t& p2) {
    const uint32_t chrom_a = getChromID(genome.start_index, p1);
    const uint32_t chrom_b = getChromID(genome.start_index, p2);

    /* distance from each position to the start index of the next entry */
    const uint32_t remain_a = genome.start_index[chrom_a + 1] - p1;
    const uint32_t remain_b = genome.start_index[chrom_b + 1] - p2;

    const char* bases_a = &(genome.sequence[p1]);
    const char* bases_b = &(genome.sequence[p2]);

    uint32_t idx = F2SEEDKEYWIGTH;
    while (idx < F2CAREDPOSITION_SIZE) {
      const uint32_t offset = F2CAREDPOSITION[idx];

      /* Strict Weak Ordering: the p2 bound is tested first, so when both
       * positions run out at this offset the result is false */
      if (offset >= remain_b)
        return false;
      if (offset >= remain_a)
        return true;

      const char base_a = bases_a[offset];
      const char base_b = bases_b[offset];
      if (base_a != base_b)
        return base_a < base_b;

      ++idx;
    }
    return false;
  }
Exemplo n.º 2
0
/* Maps one mate of a paired-end read with the seed hash table and collects
 * the candidate positions in top_match.
 *
 * org_read        read sequence as supplied by the caller
 * genome          genome sequence plus start_index[] (used here to find the
 *                 chromosome of a position and its bounds)
 * hash_table      counter[hash] and counter[hash + 1] give the initial
 *                 range inside index[]; IndexRegion() then adjusts it
 * strand          strand label stored with every candidate
 * AG_WILDCARD     true: the read is converted with G2A, false: with C2T
 * max_mismatches  candidates with more mismatches are never pushed
 * top_match       in/out collection of candidates
 *
 * Reads shorter than MINIMALREADLEN are ignored.
 */
void PairEndMapping(const string& org_read, const Genome& genome,
                    const HashTable& hash_table, const char& strand,
                    const bool& AG_WILDCARD, const uint32_t& max_mismatches,
                    TopCandidates& top_match) {
  uint32_t read_len = org_read.size();
  if (read_len < MINIMALREADLEN) {
    return;
  }

  /* return the maximal seed length for a particular read length */
  uint32_t seed_pattern_repeats = (read_len - SEEPATTERNLEN + 1)
      / SEEPATTERNLEN;
  uint32_t seed_len = seed_pattern_repeats * SEEPATTERNCAREDWEIGHT;

  string read;
  if (AG_WILDCARD) {
    G2A(org_read, read_len, read);
  } else {
    C2T(org_read, read_len, read);
  }

  /* tightened to top_match.Top().mismatch once top_match is full, so the
   * mismatch counting below can stop early */
  uint32_t cur_max_mismatches = max_mismatches;
  for (uint32_t seed_i = 0; seed_i < SEEPATTERNLEN; ++seed_i) {
    /* all exact matches are covered by the first seed */
    if (!top_match.Empty() && top_match.Full() && top_match.Top().mismatch == 0
        && seed_i)
      break;

    /* Both macros need their own `defined`: a bare SEEDPATTERN5 is expanded
     * by the preprocessor, which breaks the expression when the macro is
     * defined without a value. */
#if defined SEEDPATTERN3 || defined SEEDPATTERN5
    /* all matches with 1 mismatch are covered by the first two seeds */
    if (!top_match.Empty() && top_match.Full() && top_match.Top().mismatch == 1
        && seed_i >= 2)
      break;
#endif

#ifdef SEEDPATTERN7
    /* for this pattern the 1-mismatch cut-off applies after four seeds
     * (seed_i 0..3) have been tried */
    if (!top_match.Empty() && top_match.Full() && top_match.Top().mismatch == 1
        && seed_i >= 4)
      break;
#endif

    /* the seed is taken from the read shifted by seed_i bases */
    string read_seed = read.substr(seed_i);
    uint32_t hash_value = getHashValue(read_seed.c_str());
    pair<uint32_t, uint32_t> region;
    region.first = hash_table.counter[hash_value];
    region.second = hash_table.counter[hash_value + 1];

    /* empty bucket: no genome position shares this hash value */
    if (region.first == region.second)
      continue;

    IndexRegion(read_seed, genome, hash_table, seed_len, region);
    /* region is treated as inclusive from here on; seeds with more than
     * 5000 hits are skipped */
    if (region.second - region.first + 1 > 5000) {
      continue;
    }
    for (uint32_t j = region.first; j <= region.second; ++j) {
      uint32_t genome_pos = hash_table.index[j];
      uint32_t chr_id = getChromID(genome.start_index, genome_pos);
      /* shifting back by seed_i must not cross the chromosome start */
      if (genome_pos - genome.start_index[chr_id] < seed_i)
        continue;
      genome_pos = genome_pos - seed_i;
      /* the whole read has to fit before the next chromosome starts */
      if (genome_pos + read_len >= genome.start_index[chr_id + 1])
        continue;

      /* check the position */
      uint32_t num_of_mismatch = 0;
      uint32_t num_of_nocared = seed_pattern_repeats * SEEPATTERNNOCAREDWEIGHT
          + seed_i;
      /* positions listed in F2NOCAREDPOSITION[seed_i] */
      for (uint32_t p = 0;
          p < num_of_nocared && num_of_mismatch <= cur_max_mismatches; ++p) {
        if (genome.sequence[genome_pos + F2NOCAREDPOSITION[seed_i][p]]
            != read[F2NOCAREDPOSITION[seed_i][p]]) {
          num_of_mismatch++;
        }
      }
      /* tail of the read behind the last complete seed pattern */
      for (uint32_t p = seed_pattern_repeats * SEEPATTERNLEN + seed_i;
          p < read_len && num_of_mismatch <= cur_max_mismatches; ++p) {
        if (genome.sequence[genome_pos + p] != read[p]) {
          num_of_mismatch++;
        }
      }

      /* NOTE(review): counting stops at cur_max_mismatches + 1, but this
       * filter uses max_mismatches, so a partially counted candidate can
       * still reach Push() - confirm that Push() rejects it when full. */
      if (num_of_mismatch > max_mismatches) {
        continue;
      }
      top_match.Push(CandidatePosition(genome_pos, strand, num_of_mismatch));
      if (top_match.Full()) {
        cur_max_mismatches = top_match.Top().mismatch;
      }
    }
  }
}
Exemplo n.º 3
0
/* merge the mapping results from paired reads
 *
 * Combines the first ranked_results_size[0] candidates of ranked_results[0]
 * with the first ranked_results_size[1] candidates of ranked_results[1].
 * A combination is accepted when the mates lie on opposite strands, on the
 * same chromosome, and GetFragmentLength() returns a value in
 * (0, frag_range]. The accepted combination with the fewest total
 * mismatches (at most max_mismatches) is the best pair.
 *
 *  - exactly one best pair: counted as unique and written through
 *    OutputBestPairedResults();
 *  - otherwise: counted as ambiguous (two or more) or unmapped (none), and
 *    each mate is handled on its own through GetBestMatch4Single().
 *
 * With SAM set, both mates are finally written by OutputPairedSAM();
 * without it the single-mate fallback is written by OutputSingleResults().
 */
void MergePairedEndResults(
    const Genome& genome, const string& read_name, const string& read_seq1,
    const string& read_score1, const string& read_seq2,
    const string& read_score2,
    const vector<vector<CandidatePosition> >& ranked_results,
    const vector<int>& ranked_results_size, const int& frag_range,
    const uint32_t& max_mismatches, const bool& SAM,
    StatPairedReads& stat_paired_reads, FILE * fout) {
#ifdef DEBUG
  /* dump every candidate of both mates to stderr */
  for (int i = ranked_results_size[0] - 1; i >= 0; --i) {
    const CandidatePosition& r1 = ranked_results[0][i];
    uint32_t chr_id1 = getChromID(genome.start_index, r1.genome_pos);
    uint32_t start_pos = r1.genome_pos - genome.start_index[chr_id1];
    if ('-' == r1.strand) {
      start_pos = genome.length[chr_id1] - start_pos - read_seq1.size();
    }
    uint32_t end_pos = start_pos + read_seq1.size();
    fprintf(stderr, "%u %s %u %u %c %u\n", r1.genome_pos,
            genome.name[chr_id1].c_str(), start_pos, end_pos, r1.strand,
            r1.mismatch);
  }
  for (int j = ranked_results_size[1] - 1; j >= 0; --j) {
    const CandidatePosition& r2 = ranked_results[1][j];
    uint32_t chr_id2 = getChromID(genome.start_index, r2.genome_pos);
    uint32_t start_pos = r2.genome_pos - genome.start_index[chr_id2];
    if ('-' == r2.strand) {
      start_pos = genome.length[chr_id2] - start_pos - read_seq2.size();
    }
    uint32_t end_pos = start_pos + read_seq2.size();
    fprintf(stderr, "%u %s %u %u %c %u\n", r2.genome_pos,
            genome.name[chr_id2].c_str(), start_pos, end_pos, r2.strand,
            r2.mismatch);
  }
#endif
  uint32_t read_len1 = read_seq1.size();
  uint32_t read_len2 = read_seq2.size();
  pair<int, int> best_pair(-1, -1);
  uint32_t min_num_of_mismatch = max_mismatches;
  /* both genome positions of the current best pair packed into 64 bits;
   * used to recognise a repeated occurrence of the same pair */
  uint64_t best_pos = 0;
  /* number of distinct pairs seen at min_num_of_mismatch */
  uint32_t best_times = 0;
  for (int i = ranked_results_size[0] - 1; i >= 0; --i) {
    for (int j = ranked_results_size[1] - 1; j >= 0; --j) {
      const CandidatePosition& r1 = ranked_results[0][i];
      const CandidatePosition& r2 = ranked_results[1][j];
      if (r1.strand == r2.strand)
        continue;

      uint32_t num_of_mismatch = r1.mismatch + r2.mismatch;
      /* NOTE(review): the break assumes that walking j downwards visits the
       * second mate's candidates in non-decreasing mismatch order - confirm
       * against the code that fills ranked_results. */
      if (num_of_mismatch > min_num_of_mismatch)
        break;

      uint32_t chr_id1 = getChromID(genome.start_index, r1.genome_pos);
      uint32_t chr_id2 = getChromID(genome.start_index, r2.genome_pos);
      if (chr_id1 != chr_id2)
        continue;

      int frag_size = GetFragmentLength(r1, r2, frag_range, read_len1,
                                        read_len2, genome, chr_id1, chr_id2);
      if (frag_size <= 0 || frag_size > frag_range)
        continue;

      /* r1 position in the high 32 bits, r2 position in the low 32 bits */
      uint64_t cur_pos = r1.genome_pos;
      cur_pos <<= 32;
      cur_pos += r2.genome_pos;
      if (num_of_mismatch < min_num_of_mismatch) {
        best_pair = make_pair(i, j);
        best_times = 1;
        min_num_of_mismatch = num_of_mismatch;
        best_pos = cur_pos;
      } else if (num_of_mismatch == min_num_of_mismatch
          && cur_pos != best_pos) {
        best_pair = make_pair(i, j);
        /* keep best_pos in sync with best_pair; otherwise a pair accepted
         * here first (total mismatches == max_mismatches) is compared
         * against the initial 0 and a repeat of it counts as a second hit */
        best_pos = cur_pos;
        best_times++;
      }
    }
  }

  /* placeholders with times == 0, i.e. "not mapped" */
  BestMatch best_match_1(0, 0, '+', max_mismatches);
  BestMatch best_match_2(0, 0, '+', max_mismatches);
  bool is_paired_mapped = false;
  int len = 0;
  if (best_times == 1) {
    stat_paired_reads.unique_mapped_pairs++;
    len = OutputBestPairedResults(ranked_results[0][best_pair.first],
                                  ranked_results[1][best_pair.second],
                                  frag_range, read_len1, read_len2, genome,
                                  read_name, read_seq1, read_score1, read_seq2,
                                  read_score2, SAM, fout);
    if (SAM) {  // SAM
      is_paired_mapped = true;
      const CandidatePosition& r1 = ranked_results[0][best_pair.first];
      const CandidatePosition& r2 = ranked_results[1][best_pair.second];
      best_match_1 = BestMatch(r1.genome_pos, 1, r1.strand, r1.mismatch);
      best_match_2 = BestMatch(r2.genome_pos, 1, r2.strand, r2.mismatch);
    }
  } else {
    if (best_times >= 2) {
      stat_paired_reads.ambiguous_mapped_pairs++;
    } else {
      stat_paired_reads.unmapped_pairs++;
    }
    /* no unique pair: fall back to the best hit of each mate on its own */
    GetBestMatch4Single(ranked_results[0], ranked_results_size[0], genome,
                        read_len1, read_name, read_seq1, read_score1,
                        max_mismatches, best_match_1);
    GetBestMatch4Single(ranked_results[1], ranked_results_size[1], genome,
                        read_len2, read_name, read_seq2, read_score2,
                        max_mismatches, best_match_2);
    StatInfoUpdate(best_match_1.times, stat_paired_reads.stat_single_reads_1);
    StatInfoUpdate(best_match_2.times, stat_paired_reads.stat_single_reads_2);
    if (!SAM) {
      OutputSingleResults(best_match_1, read_name, read_seq1, read_score1,
                          genome, false, stat_paired_reads.stat_single_reads_1,
                          fout);
      OutputSingleResults(best_match_2, read_name, read_seq2, read_score2,
                          genome, true, stat_paired_reads.stat_single_reads_2,
                          fout);
    }
  }
  if (SAM) {  // Output SAM
    /* each flag is built from the mate's own state followed by its
     * partner's, hence the swapped arguments in the second call */
    int flag_1 = GetSAMFLAG(true, is_paired_mapped, best_match_1.times == 0,
                            best_match_2.times == 0, best_match_1.strand == '-',
                            best_match_2.strand == '-', true, false,
                            best_match_1.times >= 2);
    int flag_2 = GetSAMFLAG(true, is_paired_mapped, best_match_2.times == 0,
                            best_match_1.times == 0, best_match_2.strand == '-',
                            best_match_1.strand == '-', false, true,
                            best_match_2.times >= 2);
    OutputPairedSAM(best_match_1, best_match_2, genome, read_name, read_seq1,
                    read_score1, read_seq2, read_score2, len, flag_1, flag_2,
                    stat_paired_reads, fout);
  }
}
Exemplo n.º 4
0
void OutputPairedSAM(const BestMatch& best_match_1,
                     const BestMatch& best_match_2, const Genome& genome,
                     const string& read_name, const string& read_seq1,
                     const string& read_score1, const string& read_seq2,
                     const string& read_score2, const int& len,
                     const int& flag_1, const int& flag_2,
                     StatPairedReads& stat_paired_reads, FILE * fout) {
  uint32_t chr_id_1 = getChromID(genome.start_index, best_match_1.genome_pos);
  uint32_t chr_id_2 = getChromID(genome.start_index, best_match_2.genome_pos);
  uint32_t s1 = 0, s2 = 0, e1 = 0, e2 = 0;
  ForwardChromPosition(best_match_1.genome_pos, best_match_1.strand, chr_id_1,
                       read_seq1.size(), genome, s1, e1);
  ForwardChromPosition(best_match_2.genome_pos, best_match_2.strand, chr_id_2,
                       read_seq2.size(), genome, s2, e2);

  uint32_t mismatch1 = best_match_1.mismatch;
  uint32_t mismatch2 = best_match_2.mismatch;
  if (best_match_1.times == 0) {
    s1 = 0;
    mismatch1 = 0;
  } else {
    s1 += 1;
  }
  if (best_match_2.times == 0) {
    s2 = 0;
    mismatch2 = 0;
  } else {
    s2 += 1;
  }

  int len1 = best_match_1.strand == '+' ? len : -len;
  int len2 = best_match_2.strand == '+' ? len : -len;

  string rnext1 = "=", rnext2 = "=";
  if (flag_1 & 0x2) {
    rnext1 = "=";
    rnext2 = "=";
  } else {
    if (best_match_1.times == 0) {
      rnext1 = "*";
    } else {
      rnext1 = genome.name[chr_id_1].c_str();
    }

    if (best_match_2.times == 0) {
      rnext2 = "*";
    } else {
      rnext2 = genome.name[chr_id_2].c_str();
    }
  }

  string read_seq1_tmp = read_seq1;
  string read_seq2_tmp = read_seq2;
  string read_score1_tmp = read_score1;
  string read_score2_tmp = read_score2;
  if (best_match_1.strand == '-') {
    read_seq1_tmp = ReverseComplimentString(read_seq1_tmp);
    read_score1_tmp = ReverseString(read_score1_tmp);
  }
  if (best_match_2.strand == '-') {
    read_seq2_tmp = ReverseComplimentString(read_seq2_tmp);
    read_score2_tmp = ReverseString(read_score2_tmp);
  }

  uint32_t read_len1 = read_seq1.size();
  uint32_t read_len2 = read_seq2.size();

  if (best_match_1.times == 0
      && stat_paired_reads.stat_single_reads_1.unmapped) {
    fprintf(fout, "%s\t%d\t*\t%u\t255\t*\t%s\t%u\t%d\t%s\t%s\tNM:i:%u\n",
            read_name.c_str(), flag_1, s1, rnext2.c_str(), s2, len1,
            read_seq1_tmp.c_str(), read_score1_tmp.c_str(), mismatch1);
  } else if (best_match_1.times == 1) {
    fprintf(fout, "%s\t%d\t%s\t%u\t255\t%uM\t%s\t%u\t%d\t%s\t%s\tNM:i:%u\n",
            read_name.c_str(), flag_1, genome.name[chr_id_1].c_str(), s1,
            read_len1, rnext2.c_str(), s2, len1, read_seq1_tmp.c_str(),
            read_score1_tmp.c_str(), mismatch1);
  } else if (best_match_1.times >= 2
      && stat_paired_reads.stat_single_reads_1.ambiguous) {
    fprintf(fout, "%s\t%d\t%s\t%u\t255\t%uM\t%s\t%u\t%d\t%s\t%s\tNM:i:%u\n",
            read_name.c_str(), flag_1, genome.name[chr_id_1].c_str(), s1,
            read_len1, rnext2.c_str(), s2, len1, read_seq1_tmp.c_str(),
            read_score1_tmp.c_str(), mismatch1);
  }

  if (best_match_2.times == 0
      && stat_paired_reads.stat_single_reads_2.unmapped) {
    fprintf(fout, "%s\t%d\t*\t%u\t255\t*\t%s\t%u\t%d\t%s\t%s\tNM:i:%u\n",
            read_name.c_str(), flag_2, s2, rnext1.c_str(), s1, len2,
            read_seq2_tmp.c_str(), read_score2_tmp.c_str(), mismatch2);
  } else if (best_match_2.times == 1) {
    fprintf(fout, "%s\t%d\t%s\t%u\t255\t%uM\t%s\t%u\t%d\t%s\t%s\tNM:i:%u\n",
            read_name.c_str(), flag_2, genome.name[chr_id_2].c_str(), s2,
            read_len2, rnext1.c_str(), s1, len2, read_seq2_tmp.c_str(),
            read_score2_tmp.c_str(), mismatch2);
  } else if (best_match_2.times >= 2
      && stat_paired_reads.stat_single_reads_2.ambiguous) {
    fprintf(fout, "%s\t%d\t%s\t%u\t255\t%uM\t%s\t%u\t%d\t%s\t%s\tNM:i:%u\n",
            read_name.c_str(), flag_2, genome.name[chr_id_2].c_str(), s2,
            read_len2, rnext1.c_str(), s1, len2, read_seq2_tmp.c_str(),
            read_score2_tmp.c_str(), mismatch2);
  }
}
Exemplo n.º 5
0
int OutputBestPairedResults(const CandidatePosition& r1,
                            const CandidatePosition& r2, const int& frag_range,
                            const uint32_t& read_len1,
                            const uint32_t& read_len2, const Genome& genome,
                            const string& read_name, const string& read_seq1,
                            const string& read_score1, const string& read_seq2,
                            const string& read_score2, const bool& SAM,
                            FILE * fout) {
  string read_seq2_rev = ReverseComplimentString(read_seq2);
  string read_scr2_rev = ReverseString(read_score2);

  uint32_t chr_id1 = getChromID(genome.start_index, r1.genome_pos);
  uint32_t chr_id2 = getChromID(genome.start_index, r2.genome_pos);

  uint32_t s1 = 0, s2 = 0, e1 = 0, e2 = 0;
  ForwardChromPosition(r1.genome_pos, r1.strand, chr_id1, read_len1, genome, s1,
                       e1);
  ForwardChromPosition(r2.genome_pos, r2.strand, chr_id2, read_len2, genome, s2,
                       e2);

  uint32_t overlap_s = MAX(s1, s2);
  uint32_t overlap_e = MIN(e1, e2);

  uint32_t one_l = r1.strand == '+' ? s1 : MAX(overlap_e, s1);
  uint32_t one_r = r1.strand == '+' ? MIN(overlap_s, e1) : e1;

  uint32_t two_l = r1.strand == '+' ? MAX(overlap_e, s2) : s2;
  uint32_t two_r = r1.strand == '+' ? e2 : MIN(overlap_s, e2);

  int len = r1.strand == '+' ? (two_r - one_l) : (one_r - two_l);
  if (SAM) {
    return len;
  }

  string seq(len, 'N');
  string scr(len, 'B');
  if (len > 0 && len <= frag_range) {
    // lim_one: offset in merged sequence where overlap starts
    uint32_t lim_one = one_r - one_l;
    copy(read_seq1.begin(), read_seq1.begin() + lim_one, seq.begin());
    copy(read_score1.begin(), read_score1.begin() + lim_one, scr.begin());

    uint32_t lim_two = two_r - two_l;
    copy(read_seq2_rev.end() - lim_two, read_seq2_rev.end(),
         seq.end() - lim_two);
    copy(read_scr2_rev.end() - lim_two, read_scr2_rev.end(),
         scr.end() - lim_two);

    // deal with overlapping part
    if (overlap_s < overlap_e) {
      uint32_t one_bads = count(read_seq1.begin(), read_seq1.end(), 'N');
      int info_one = read_len1 - (one_bads + r1.mismatch);

      uint32_t two_bads = count(read_seq2_rev.begin(), read_seq2_rev.end(),
                                'N');
      int info_two = read_len2 - (two_bads + r2.mismatch);

      // use the mate with the most info to fill in the overlap
      if (info_one >= info_two) {
        uint32_t a = r1.strand == '+' ? (overlap_s - s1) : (e1 - overlap_e);
        uint32_t b = r1.strand == '+' ? (overlap_e - s1) : (e1 - overlap_s);
        copy(read_seq1.begin() + a, read_seq1.begin() + b,
             seq.begin() + lim_one);
        copy(read_score1.begin() + a, read_score1.begin() + b,
             scr.begin() + lim_one);
      } else {
        uint32_t a = r1.strand == '+' ? (overlap_s - s2) : (e2 - overlap_e);
        uint32_t b = r1.strand == '+' ? (overlap_e - s2) : (e2 - overlap_s);
        copy(read_seq2_rev.begin() + a, read_seq2_rev.begin() + b,
             seq.begin() + lim_one);
        copy(read_scr2_rev.begin() + a, read_scr2_rev.begin() + b,
             scr.begin() + lim_one);
      }
    }
  }

  uint32_t start_pos = r1.strand == '+' ? s1 : s2;
  fprintf(fout, "%s\t%u\t%u\tFRAG:%s\t%u\t%c\t%s\t%s\n",
          genome.name[chr_id1].c_str(), start_pos, start_pos + len,
          read_name.c_str(), r1.mismatch + r2.mismatch, r1.strand, seq.c_str(),
          scr.c_str());

  return 0;
}