Пример #1
0
/*
 * Expands map1 hits, given as suffix-array (SA) intervals, into individual
 * hits carrying real reference coordinates.
 *
 * sams       hits in SA-interval form; destroyed before returning
 * occs       one SA interval [k, l] per entry of sams; freed before returning
 * bases      the read; only its length (bases->l) is consulted
 * seed2_len  compared against bases->l when opt->seed2_length is positive
 * refseq     reference used to translate packed positions
 * bwt, sa, hash  index structures handed to tmap_sa_pac_pos_hash
 * opt        options; max_best_cals, seed2_length and algo_stage are read
 *
 * Returns a new hit list.  Its size is bounded by opt->max_best_cals when
 * that is smaller than the combined size of all SA intervals, and it is
 * shrunk to the number of hits actually stored.
 */
static tmap_map_sams_t *
tmap_map1_sam_to_real(tmap_map_sams_t *sams, tmap_map1_aux_occ_t *occs, tmap_string_t *bases, int32_t seed2_len,
                       tmap_refseq_t *refseq, tmap_bwt_t *bwt, tmap_sa_t *sa, tmap_bwt_match_hash_t *hash, tmap_map_opt_t *opt) 
{
  tmap_map_sams_t *real_sams = NULL;
  uint32_t src_idx, n_saved;
  tmap_bwt_int_t sa_idx, max_hits, total_sa;

  // combined size of all SA intervals: the most hits we could ever emit
  total_sa = 0;
  for(src_idx=0;src_idx<sams->n;src_idx++) {
      total_sa += occs[src_idx].l - occs[src_idx].k + 1;
  }

  // cap the number of hits that will be returned
  max_hits = total_sa;
  if(opt->max_best_cals < max_hits) {
      max_hits = opt->max_best_cals;
  }

  // allocate room for the capped number of hits
  real_sams = tmap_map_sams_init(sams);
  tmap_map_sams_realloc(real_sams, max_hits);

  // convert, one SA position at a time
  n_saved = 0;
  for(src_idx=0;src_idx<sams->n && n_saved<max_hits;src_idx++) {
      tmap_map_sam_t *src = &sams->sams[src_idx];

      for(sa_idx=occs[src_idx].k;sa_idx<=occs[src_idx].l;sa_idx++) {
          tmap_map_sam_t *dst = &real_sams->sams[n_saved];
          uint32_t seqid = 0, pos = 0, aln_ref;
          tmap_bwt_int_t pacpos;
          uint8_t strand;

          tmap_map_sam_init(dst);

          strand = src->strand;
          aln_ref = src->aux.map1_aux->aln_ref;

          // packed position of the hit, shifted back by the aligned
          // reference length and clamped at zero
          pacpos = bwt->seq_len - tmap_sa_pac_pos_hash(sa, bwt, sa_idx, hash);
          if(pacpos < aln_ref) {
              pacpos = 0;
          }
          else {
              pacpos -= aln_ref;
          }
          pacpos++; // one-based from here on

          // NB: this treats the symptom rather than the cause.  At the end
          // of the reference on the reverse strand the hit can run past the
          // boundary, so trim the aligned length to fit.
          if(pacpos <= refseq->len && refseq->len < pacpos + aln_ref - 1) {
              aln_ref = refseq->len - pacpos + 1;
          }
          else if(refseq->len * 2 < pacpos + aln_ref - 1) {
              aln_ref = (refseq->len * 2) - pacpos + 1;
          }

          // positions that cannot be translated are dropped
          if(!(0 < tmap_refseq_pac2real(refseq, pacpos, aln_ref, &seqid, &pos, &strand))) {
              continue;
          }

          // record the translated hit
          dst->algo_id = TMAP_MAP_ALGO_MAP1;
          dst->algo_stage = opt->algo_stage;
          dst->seqid = seqid;
          dst->strand = strand;
          dst->pos = pos-1; // stored zero-based
          dst->target_len = aln_ref;
          dst->score_subo = INT32_MIN;

          if(0 < opt->seed2_length && seed2_len < bases->l) {
              // a secondary seed was used: widen the target to cover the
              // rest of the read, adjusting the position on the reverse strand
              if(0 != strand) { // reverse
                  if(dst->pos < (bases->l - seed2_len)) {
                      // would start before the chromosome: pin to its start
                      dst->target_len += dst->pos;
                      dst->pos = 0;
                  }
                  else {
                      // shift the start back to the end of the read
                      dst->target_len += (bases->l - seed2_len);
                      dst->pos -= (bases->l - seed2_len);
                  }
              }
              else { // forward
                  dst->target_len += (bases->l - seed2_len);
                  // dst->pos is zero-based; do not run off the contig end
                  if(refseq->annos[dst->seqid].len < dst->pos + dst->target_len) {
                      dst->target_len = refseq->annos[dst->seqid].len - dst->pos;
                  }
              }
          }
          else if(dst->target_len < bases->l) {
              // the full read was used: the target spans at least the read
              dst->target_len = bases->l;
          }

          // auxiliary data copied from the source hit
          tmap_map_sam_malloc_aux(dst);
          dst->aux.map1_aux->n_mm = src->aux.map1_aux->n_mm;
          dst->aux.map1_aux->n_gapo = src->aux.map1_aux->n_gapo;
          dst->aux.map1_aux->n_gape = src->aux.map1_aux->n_gape;
          dst->aux.map1_aux->aln_ref = 0;
          dst->aux.map1_aux->num_all_sa = total_sa;
          n_saved++;

          // stop once the cap is reached; the outer loop condition then
          // ends the remaining iterations
          if(max_hits <= n_saved) {
              break;
          }
      }
  }

  // the SA-interval hits are no longer needed
  tmap_map_sams_destroy(sams);

  // shrink to the number of hits actually stored
  tmap_map_sams_realloc(real_sams, n_saved);

  // NB: occs must not be used after this point
  free(occs);
  occs = NULL;

  return real_sams;
}
Пример #2
0
/*
 * Converts map2 alignment hits into a tmap_map_sams_t list with
 * contig-relative coordinates.
 *
 * refseq   reference used to translate packed positions
 * opt      options; only algo_stage is read
 * aln      hits to convert; may be NULL.  Entries can be modified: k and
 *          tlen when a hit is moved onto a contig, G2 for repetitive hits
 * seq_len  read length, used for the strand-dependent position adjustment
 *          and as the minimum target length
 *
 * Returns NULL when aln is NULL; otherwise a new list sized to the number
 * of hits kept (duplicates and untranslatable hits are skipped).
 */
static tmap_map_sams_t *
tmap_map2_aux_store_hits(tmap_refseq_t *refseq, tmap_map_opt_t *opt, 
                         tmap_map2_aln_t *aln, int32_t seq_len)
{
  int32_t i, j;
  tmap_map_sams_t *sams = NULL;

  if(NULL == aln) return NULL;

  sams = tmap_map_sams_init(NULL);
  tmap_map_sams_realloc(sams, aln->n);

  // i walks the input hits, j counts the hits written to the output
  for(i=j=0;i<aln->n;i++) {
      tmap_map2_hit_t *p = aln->hits + i;
      uint32_t seqid = 0, pos = 0;
      // Start from a defined value: strand is inspected below when the first
      // tmap_refseq_pac2real call fails, and nothing visible here guarantees
      // the callee stored a value on that path.
      uint8_t strand = 0;
      int32_t beg;
      tmap_map_sam_t *sam = &sams->sams[j];
      tmap_map_sam_init(sam);

      // skip over duplicate hits, or sub-optimal hits to the same location
      if(0 < i) {
          tmap_map2_hit_t *q = aln->hits + i - 1;
          if(q->flag == p->flag && q->k == p->k && q->G <= p->G && q->tlen <= p->tlen) continue;
      }

      // adjust for contig boundaries: if the whole hit cannot be translated,
      // retry with a single base and, on success, move the hit onto that contig
      if(tmap_refseq_pac2real(refseq, p->k, p->tlen, &seqid, &pos, &strand) <= 0) {
          if(1 == strand) { // reverse: probe the last base of the hit
              if(tmap_refseq_pac2real(refseq, p->k + p->tlen - 1, 1, &seqid, &pos, &strand) <= 0) {
                  continue;
              }
              else {
                  // move to the contig and position
                  p->k = refseq->annos[seqid].offset+1;
                  p->tlen = (p->tlen < refseq->annos[seqid].len) ? p->tlen : refseq->annos[seqid].len;
              }
          }
          else { // forward: probe the first base of the hit
              if(tmap_refseq_pac2real(refseq, p->k, 1, &seqid, &pos, &strand) <= 0) {
                  continue;
              }
              else {
                  // move to the contig and position
                  p->k = refseq->annos[seqid].offset+1;
                  p->tlen = (p->tlen < refseq->annos[seqid].len) ? p->tlen : refseq->annos[seqid].len;
              }
          }
      }

      // adjust based on where the hit was in the read
      beg = (1 == strand) ? (seq_len - p->end) : p->beg;
      pos = (pos <= beg) ? 1 : (pos - beg); // adjust pos, never below one

      if((p->flag & 0x1)) {
          p->G2 = p->G; // Note: the flag indicates a repetitive match, so we need to update the sub-optimal score
      }

      sam->strand = strand;
      sam->seqid = seqid;
      sam->pos = pos-1; // make it zero-based
      sam->algo_id = TMAP_MAP_ALGO_MAP2;
      sam->algo_stage = opt->algo_stage;
      sam->score = p->G;
      sam->score_subo = p->G2;
      // the target length is the larger of the hit's tlen and the read length
      sam->target_len = (seq_len < p->tlen) ? p->tlen : seq_len;

      // auxiliary data
      tmap_map_sam_malloc_aux(sam);
      sam->aux.map2_aux->XE = p->n_seeds;
      sam->aux.map2_aux->XF = p->flag >> 16;
      if(p->l) {
          sam->aux.map2_aux->XI = p->l - p->k + 1;
      }
      else {
          sam->aux.map2_aux->XI = 0;
      }
      sam->aux.map2_aux->flag = (p->flag & 0x1) ? 1 : 0;
      j++;
  }
  // shrink the list when some hits were skipped
  if(j != aln->n) {
      tmap_map_sams_realloc(sams, j);
  }

  return sams;
}
Пример #3
0
void 
tmap_bwt_check_core2(tmap_bwt_t *bwt, int32_t length, int32_t print_msg, int32_t print_sa, int32_t warn)
{
  uint8_t *seqs[2] = {NULL,NULL};
  char *str = NULL;
  int32_t i, asymmetric, k, l;
  uint64_t hash_j;
  int64_t sum, j;
  tmap_bwt_match_occ_t sa;
  tmap_bwt_int_t n[2];

  for(i=1;i<=length;i++) {
      seqs[0] = tmap_calloc(i, sizeof(uint8_t), "seqs[0]");
      seqs[1] = tmap_calloc(i, sizeof(uint8_t), "seqs[1]");
      str = tmap_calloc(i+1, sizeof(char), "str");
      for(j=0;j<i;j++) {
          seqs[1][j] = 3;
      }

      asymmetric = 0;
      j = 0;
      hash_j = sum = 0;
      while(1) {
          if(i == j) {
              for(k=0;k<i;k++) {
                  seqs[1][k] = 3 - seqs[0][i-k-1];
              }
              for(k=0;k<2;k++) {
                  //n[k] = tmap_bwt_match_exact(bwt, i, seqs[k], &sa);
                  n[k] = tmap_bwt_match_exact_reverse(bwt, i, seqs[k], &sa);
                  if(0 == k) {
                      if(0 < n[k] && TMAP_BWT_INT_MAX != sa.k && sa.k <= sa.l) {
                          sum += n[k];
                      }
                      if(1 == print_msg && 1 == print_sa) {
                          for(l=0;l<i;l++) {
                              str[l] = "ACGTN"[seqs[k][l]];
                          }
                          if(0 < n[k] && TMAP_BWT_INT_MAX != sa.k && sa.k <= sa.l) {
                              tmap_progress_print2("%s\t%llu\t%llu\t%llu", str, sa.k, sa.l, n[k]);
#ifdef TMAP_BWT_CHECK_DEBUG 
                              while(sa.k <= sa.l) {
                                  uint32_t seqid, pos;
                                  uint8_t strand;
                                  tmap_bwt_int_t pacpos = tmap_sa_pac_pos(tmap_bwt_index->sa, bwt, sa.k);
                                  if(0 < tmap_refseq_pac2real(tmap_bwt_index->refseq, pacpos, 1, &seqid, &pos, &strand)) {
                                      tmap_progress_print2("%s\t%llu\t%llu\t%llu\t%c%u:%u", str, sa.k, sa.l, n[k],
                                                           "+-"[strand], seqid, pos);
                                  }
                                  else {
                                      tmap_progress_print2("%s\t%llu\t%llu\t%llu\t%c%u:%u", str, sa.k, sa.l, n[k],
                                                           '?', 0, 0);
                                  }
                                  sa.k++;
                              }
#endif
                          }
                          else {
                              tmap_progress_print2("%s\tNA\tNA\tNA", str);
                          }
                      }
                  }
              }
              if(0 == asymmetric && n[0] != n[1]) {
                  asymmetric = 1;
                  //fprintf(stderr, "n[0]=%u n[1]=%u\n", n[0], n[1]);
                  tmap_error("Asymmetry found", Warn, OutOfRange);
              }

              j--;
              while(0 <= j && 3 == seqs[0][j]) {
                  seqs[0][j] = 0;
                  hash_j >>= 2;
                  j--;
              }
              if(j < 0) break;
              seqs[0][j]++;
              hash_j++;
              j++;
          }
          else {
              hash_j <<= 2;
              j++;
          }
      }

      free(seqs[0]);
      free(seqs[1]);
      free(str);

      j = (sum == (bwt->seq_len - i + 1)) ? 0 : 1; // j==1 on fail
      if(1 == print_msg) {
          if(0 == j) tmap_progress_print2("%d-mer validation passed", i);
          else tmap_progress_print2("%d-mer validation failed: observed (%llu) != expected (%llu)\n", i, sum, bwt->seq_len - i + 1);
      }
      if(0 == warn && 1 == j) {
          tmap_error("inconsistency found in the BWT", Exit, OutOfRange);
      }
  }