예제 #1
0
파일: tmap_map1.c 프로젝트: Brainiarc7/TS
/*
 * Per-read entry point for the map1 algorithm.
 *
 * The read length is taken from seqs[0].  When that length falls outside the
 * configured [min_seq_len, max_seq_len] window, or is shorter than
 * seed_length, the function returns the result of tmap_map_sams_init(NULL)
 * without mapping.  Each of the three limits is only enforced when it is
 * positive.  Otherwise the read is handed to tmap_map1_thread_map_core().
 *
 * The rand argument is not used by this function.
 */
tmap_map_sams_t*
tmap_map1_thread_map(void **data, tmap_seq_t **seqs, tmap_index_t *index, tmap_bwt_match_hash_t *hash, tmap_rand_t *rand, tmap_map_opt_t *opt)
{
  int32_t read_len = tmap_seq_get_bases_length(seqs[0]);

  // shorter than the minimum read length
  if(0 < opt->min_seq_len && read_len < opt->min_seq_len) {
      return tmap_map_sams_init(NULL);
  }

  // longer than the maximum read length
  if(0 < opt->max_seq_len && opt->max_seq_len < read_len) {
      return tmap_map_sams_init(NULL);
  }

  // fewer bases than the seed needs
  if(0 < opt->seed_length && read_len < opt->seed_length) {
      return tmap_map_sams_init(NULL);
  }

  // run the core algorithm
  return tmap_map1_thread_map_core(data, seqs, read_len, index, hash, opt);
}
예제 #2
0
파일: tmap_map1.c 프로젝트: Brainiarc7/TS
/*
 * Core of the per-read map1 step.
 *
 * Re-checks the read-length limits, chooses the secondary seed length
 * (seed2_len) and the matching edit budgets, computes the width bounds on the
 * reversed read (seqs[2]) and finally searches with the read in seqs[1].
 *
 * data     points at the per-thread tmap_map1_thread_data_t; its width buffer
 *          is grown here when seed2_len exceeds the current width_length
 * seqs     seqs[1] and seqs[2] are the entries read by this function
 * seq_len  read length in bases
 * opt      caller's options; a local copy receives the per-read edit budgets
 *
 * Returns the hits from tmap_map1_aux_core(), or the result of
 * tmap_map_sams_init(NULL) when the read is filtered out by length.
 */
static tmap_map_sams_t*
tmap_map1_thread_map_core(void **data, tmap_seq_t *seqs[4], int32_t seq_len,
                          tmap_index_t *index, tmap_bwt_match_hash_t *hash, tmap_map_opt_t *opt)
{
  tmap_map1_thread_data_t *thread_data = (tmap_map1_thread_data_t*)(*data);
  tmap_map_opt_t opt_local;
  tmap_string_t *rev_bases = NULL;
  int32_t seed2_len;

  // read length outside the accepted window
  if(0 < opt->min_seq_len && seq_len < opt->min_seq_len) {
      return tmap_map_sams_init(NULL);
  }
  if(0 < opt->max_seq_len && opt->max_seq_len < seq_len) {
      return tmap_map_sams_init(NULL);
  }

  // read shorter than the seed
  if(0 < opt->seed_length && seq_len < opt->seed_length) {
      return tmap_map_sams_init(NULL);
  }

  // start from the caller's options; only the edit budgets are overridden
  opt_local = (*opt);

  if(0 <= opt->seed2_length && opt->seed2_length <= seq_len) {
      // the secondary seed fits: use the budgets cached in the thread data
      seed2_len = opt->seed2_length;
      opt_local.max_mm = thread_data->max_mm;
      opt_local.max_gapo = thread_data->max_gapo;
      opt_local.max_gape = thread_data->max_gape;
  }
  else {
      // no usable secondary seed: treat the whole read as the seed and derive
      // any fractional budget from its length (adding 0.99 rounds up);
      // non-negative budgets are already present in the copy
      seed2_len = seq_len;
      if(opt->max_mm < 0) {
          opt_local.max_mm = (int)(0.99 + opt->max_mm_frac * seed2_len);
      }
      if(opt->max_gapo < 0) {
          opt_local.max_gapo = (int)(0.99 + opt->max_gapo_frac * seed2_len);
      }
      if(opt->max_gape < 0) {
          opt_local.max_gape = (int)(0.99 + opt->max_gape_frac * seed2_len);
      }
  }

  // bases of the reversed read
  rev_bases = tmap_seq_get_bases(seqs[2]);

  // grow (and clear) the primary width buffer when it is too small
  if(thread_data->width_length < seed2_len) {
      thread_data->width_length = seed2_len;
      thread_data->width = tmap_realloc(thread_data->width, (1+thread_data->width_length) * sizeof(tmap_bwt_match_width_t), "d->width");
      memset(thread_data->width, 0, (1+thread_data->width_length) * sizeof(tmap_bwt_match_width_t));
  }

  // primary width over the last seed2_len bases of the reversed read
  tmap_bwt_match_cal_width_reverse(index->bwt, seed2_len, rev_bases->s + (seq_len - seed2_len), thread_data->width);

  // seed width over the last seed_length bases of the reversed read
  if(0 < opt->seed_length) {
      tmap_bwt_match_cal_width_reverse(index->bwt, opt->seed_length, rev_bases->s + (seq_len - opt->seed_length), thread_data->seed_width);
  }

  // search using seqs[1]; the seed width is only passed when a seed is in use
  return tmap_map1_aux_core(seqs[1], index, hash, thread_data->width,
                            (0 < opt_local.seed_length) ? thread_data->seed_width : NULL,
                            &opt_local, thread_data->stack, seed2_len);
}
예제 #3
0
/*
 * Per-read entry point for the mapvsw algorithm.
 *
 * Reads whose length (taken from seqs[0]) lies outside the configured
 * [min_seq_len, max_seq_len] window produce the result of
 * tmap_map_sams_init(NULL); a limit is only enforced when it is positive.
 * All other reads are forwarded, as the pair seqs[0]/seqs[1], to
 * tmap_map_vsw_thread_map_core().
 *
 * The stat, rand and hash arguments are not used by this function.
 */
tmap_map_sams_t*
tmap_map_vsw_thread_map(void **data, tmap_seq_t **seqs, tmap_index_t *index, tmap_map_stats_t *stat, tmap_rand_t *rand, tmap_bwt_match_hash_t *hash[2], tmap_map_opt_t *opt)
{
  tmap_seq_t *pair[2]={NULL, NULL};
  int32_t read_len = tmap_seq_get_bases_length(seqs[0]);

  // too short
  if(0 < opt->min_seq_len && read_len < opt->min_seq_len) {
      return tmap_map_sams_init(NULL);
  }

  // too long
  if(0 < opt->max_seq_len && opt->max_seq_len < read_len) {
      return tmap_map_sams_init(NULL);
  }

  // only the pointers are copied; the sequences themselves are shared
  pair[0] = seqs[0];
  pair[1] = seqs[1];

  return tmap_map_vsw_thread_map_core(data, pair, read_len, index, opt);
}
예제 #4
0
/*
 * Core of the mapvsw step: emits one candidate hit per contig and strand.
 *
 * After re-checking the read-length limits, the function allocates
 * 2 * num_annos records.  Record i has strand (i & 1) and contig (i >> 1),
 * starts at position 0 and has a target length equal to that contig's
 * length.  The data and seqs arguments are not read here.
 *
 * Returns the populated hit set, or the result of tmap_map_sams_init(NULL)
 * when the read length is out of range.
 */
tmap_map_sams_t*
tmap_map_vsw_thread_map_core(void **data, tmap_seq_t *seqs[2], int32_t seq_len,
                             tmap_index_t *index, tmap_map_opt_t *opt)
{
  tmap_refseq_t *refseq = index->refseq;
  tmap_map_sams_t *hits = NULL;
  int32_t slot = 0;

  // read length outside the accepted window
  if(0 < opt->min_seq_len && seq_len < opt->min_seq_len) {
      return tmap_map_sams_init(NULL);
  }
  if(0 < opt->max_seq_len && opt->max_seq_len < seq_len) {
      return tmap_map_sams_init(NULL);
  }

  // two records per contig: one for each strand
  hits = tmap_map_sams_init(NULL);
  tmap_map_sams_realloc(hits, refseq->num_annos<<1);

  while(slot<refseq->num_annos<<1) {
      tmap_map_sam_t *hit = &hits->sams[slot];

      hit->algo_id = TMAP_MAP_ALGO_MAPVSW;
      hit->algo_stage = opt->algo_stage;

      // low bit selects the strand, remaining bits select the contig
      hit->strand = slot & 1;
      hit->seqid = slot >> 1;

      // the candidate spans the whole contig
      hit->pos = 0;
      hit->target_len = refseq->annos[hit->seqid].len;
      hit->score_subo = INT32_MIN;

      // algorithm-specific auxiliary data
      tmap_map_sam_malloc_aux(hit, TMAP_MAP_ALGO_MAPVSW);

      slot++;
  }

  return hits;
}
예제 #5
0
파일: tmap_map3_aux.c 프로젝트: a1aks/TMAP
// TODO: memory pools?
tmap_map_sams_t *
tmap_map3_aux_core(tmap_seq_t *seq, 
                   uint8_t *flow_order,
                   int32_t flow_order_len,
                   tmap_refseq_t *refseq,
                   tmap_bwt_t *bwt,
                   tmap_sa_t *sa,
                   tmap_bwt_match_hash_t *hash,
                   tmap_map_opt_t *opt)
{
  int32_t i, j, n, seed_length, hp_diff = 0;
  int32_t seq_len;
  tmap_string_t *bases;
  uint8_t *query;
  uint8_t *flow=NULL;
  tmap_map3_aux_seed_t *seeds;
  int32_t m_seeds, n_seeds;
  tmap_map_sams_t *sams = NULL;

  if(0 < opt->hp_diff) {
      // set up the flow order to be used
      if(NULL == flow_order) {
          hp_diff = 0;
      }
      else {
          flow = tmap_malloc(sizeof(uint8_t)*flow_order_len, "flow[0]");
          for(i=0;i<flow_order_len;i++) {
              flow[i] = flow_order[i]; // forward
          }
      }
  }

  // init
  sams = tmap_map_sams_init(NULL);

  // update the seed length based on the read length
  seed_length = opt->seed_length;
  if(0 == opt->seed_length_set) {
      i = tmap_seq_get_bases_length(seq);
      while(0 < i) {
          seed_length++;
          i >>= 1; // divide by two
      }
  }
예제 #6
0
/*
 * Core of the map2 algorithm for a single read.
 *
 * Outline of what the code below does:
 *   1. copies the options and lets tmap_map2_aux_core_update_opt() adjust the
 *      copy for the read length;
 *   2. enlarges pool->aln_mem when the read is longer than any seen so far;
 *   3. counts ambiguous bases and returns the result of
 *      tmap_map_sams_init(NULL) when even a full-length match could not reach
 *      opt.score_thr;
 *   4. temporarily overwrites ambiguous bases with random values in all four
 *      base strings;
 *   5. aligns with seqs[0]/seqs[1] and, if some hit has fewer than
 *      opt.seeds_rev seeds, also with seqs[2]/seqs[3], flipping and merging
 *      the second set of hits into the first;
 *   6. bounds tlen, converts hits to SAM records, and restores the
 *      ambiguous bases.
 *
 * Parameters:
 *   _opt    caller's options (copied, not modified through this pointer here)
 *   seqs    four views of the read; the base strings of all four are
 *           modified in place while aligning and restored before returning
 *   refseq, bwt, sa, hash   forwarded to tmap_map2_aux_aln()
 *   rand    source of random replacements for ambiguous bases
 *   pool    working memory; max_l and aln_mem may be updated
 *
 * Returns the hit set built by tmap_map2_aux_store_hits(), or the result of
 * tmap_map_sams_init(NULL) on the early exit described in step 3.
 */
tmap_map_sams_t *
tmap_map2_aux_core(tmap_map_opt_t *_opt,
                   tmap_seq_t *seqs[4],
                   tmap_refseq_t *refseq,
                   tmap_bwt_t *bwt,
                   tmap_sa_t *sa,
                   tmap_bwt_match_hash_t *hash,
                   tmap_rand_t *rand,
                   tmap_map2_global_mempool_t *pool)
{
  tmap_map_opt_t opt;
  tmap_seq_t *orig_seq = NULL;
  tmap_string_t *seq[2]={NULL, NULL};
  tmap_string_t *rseq[2]={NULL, NULL};
  tmap_map_sams_t *sams = NULL;
  tmap_map2_aln_t *b[2]={NULL,NULL};
  tmap_string_t *bases = NULL;
  int32_t i, k, l, num_n;

  // work on a private copy of the options
  opt = (*_opt);

  // sequence length
  bases = tmap_seq_get_bases(seqs[0]);
  l = bases->l;

  // update the local opt
  tmap_map2_aux_core_update_opt(&opt, _opt, l);

  // grow the pooled alignment memory when this read is the longest seen so far
  if(pool->max_l < l) { // then enlarge working space for tmap_sw_extend_core()
      int32_t tmp;
      // the two branches differ only in avoiding a division by zero
      if(0 == opt.pen_gape) {
          tmp = ((l + 1) / 2 * opt.score_match + opt.pen_gape) + l;
      }
      else {
          tmp = ((l + 1) / 2 * opt.score_match + opt.pen_gape) / opt.pen_gape + l;
      }
      pool->max_l = l;
      pool->aln_mem = tmap_realloc(pool->aln_mem, sizeof(uint8_t) * (tmp + 2) * 24, "pool->aln_mem");
  }

  // get the number of Ns
  // NOTE(review): this loop maps each base through tmap_nt_char_to_int, while
  // the randomize/revert loops below test the raw stored value against 4 --
  // confirm both views agree for the encoding used by seqs[0].
  for(i=num_n=0;i<l;i++) {
      uint8_t c = (uint8_t)tmap_nt_char_to_int[(int)bases->s[i]];
      if(c >= 4) num_n++; // FIXME: ambiguous bases are not properly handled
  }

  // will we always be lower than the score threshold
  if((l*opt.score_match) + (num_n*opt.pen_mm) < opt.score_thr) {
      return tmap_map_sams_init(NULL);
  }
  
  // save sequences
  // seq[] is used for the first alignment pass, rseq[] for the second
  seq[0] = tmap_seq_get_bases(seqs[0]); 
  seq[1] = tmap_seq_get_bases(seqs[1]);
  rseq[0] = tmap_seq_get_bases(seqs[2]); 
  rseq[1] = tmap_seq_get_bases(seqs[3]);

  // handle ambiguous bases
  if(0 < num_n) {
      // save original to de-randomize later
      orig_seq = tmap_seq_clone(seqs[0]);

      // randomize
      // each ambiguous position i gets one random value c; c or 3 - c is then
      // written at i or l-1-i in each of the four strings
      for(i=0;i<l;i++) {
          uint8_t c = (uint8_t)bases->s[i];
          if(c >= 4) {
              c = (int)(tmap_rand_get(rand) * 4); // FIXME: ambiguous bases are not properly handled
              seq[0]->s[i] = c; // original
              seq[1]->s[l-1-i] = 3 - c; // reverse complement
              rseq[0]->s[l-1-i] = 3 - c; // reverse complement
              rseq[1]->s[i] = c; // original
              //rseq[0]->s[l-1-i] = c; // reverse 
              //rseq[1]->s[i] = 3 - c; // complement
          }
      }
  }

  // alignment
  b[0] = tmap_map2_aux_aln(&opt, refseq, bwt, sa, hash, seq, 0, pool);
  // find the first hit with fewer than opt.seeds_rev seeds, if any
  for(k = 0; k < b[0]->n; ++k) {
      if(b[0]->hits[k].n_seeds < opt.seeds_rev) break;
  } 
  // only run the second pass when such a hit exists
  if(k < b[0]->n) {
      b[1] = tmap_map2_aux_aln(&opt, refseq, bwt, sa, hash, rseq, 1, pool);
      for(i = 0; i < b[1]->n; ++i) {
          tmap_map2_hit_t *p = b[1]->hits + i;
          int x = p->beg;
          p->flag ^= 0x10, p->is_rev ^= 1; // flip the strand
          // mirror the query interval [beg, end) within the read
          p->beg = l - p->end;
          p->end = l - x;
          // when p->l is zero, mirror the target start within 2 * refseq->len,
          // clamping at zero
          if(p->l == 0) {
              if(refseq->len * 2 < (p->k + p->tlen)) p->k = 0;
              else p->k = 2 * refseq->len - (p->k + p->tlen);
          }
      }
      // NOTE(review): b[1] is not destroyed in this function; presumably
      // tmap_map2_aux_merge_hits() folds it into b[0] and releases it -- confirm.
      tmap_map2_aux_merge_hits(b, l, 0);
  } else b[1] = 0;
  
  // set the flag to forward/reverse
  tmap_map2_aux_flag_fr(b);
  
  // tlen may be overestimated due to not counting insertions properly, bound it!
  for(i = 0; i < b[0]->n; ++i) {
      // hit runs past 2 * refseq->len: truncate there
      if(refseq->len * 2 <= b[0]->hits[i].k + b[0]->hits[i].tlen) {
          b[0]->hits[i].tlen = (refseq->len * 2) - b[0]->hits[i].k;
      }
      // hit starts below refseq->len and crosses it: truncate at refseq->len
      else if(b[0]->hits[i].k < refseq->len && refseq->len <= b[0]->hits[i].k + b[0]->hits[i].tlen) {
          b[0]->hits[i].tlen = refseq->len - b[0]->hits[i].k;
      }
  }
  
  // make one-based for pac2real
  for(i = 0; i < b[0]->n; ++i) {
      b[0]->hits[i].k++;
  }
  
  // store in SAM 
  sams = tmap_map2_aux_store_hits(refseq, &opt, b[0], l);

  // free
  tmap_map2_aln_destroy(b[0]);

  // revert ambiguous bases 
  if(0 < num_n) {
      // de-randomize
      // the clone still holds the original values, so copy them back into all
      // four strings at the same positions that were randomized above
      bases = tmap_seq_get_bases(orig_seq);
      for(i=0;i<l;i++) {
          uint8_t c = (uint8_t)bases->s[i];
          if(c >= 4) { 
              // NB: always keep them at "4"
              seq[0]->s[i] = c; // original
              seq[1]->s[l-1-i] = c; // reverse complement
              rseq[0]->s[l-1-i] = c; // reverse complement
              rseq[1]->s[i] = c; // original
              //rseq[0]->s[l-1-i] = c; // reverse 
              //rseq[1]->s[i] = 3 - c; // complement
          }
      }
      tmap_seq_destroy(orig_seq);
  }

  return sams;
}
예제 #7
0
/*
 * Converts map2 alignment hits into SAM records.
 *
 * aln      hits to convert; NULL yields a NULL return.  The k, tlen and G2
 *          fields of individual hits may be rewritten while converting.
 * seq_len  read length, used to place the hit within the read and as a lower
 *          bound on the stored target length.
 *
 * A hit is dropped when it repeats the previous hit (same flag and k, with
 * the previous hit's G and tlen no larger than this hit's), or when it cannot
 * be placed on any contig.  The returned set is shrunk to the number of hits
 * actually kept.
 */
static tmap_map_sams_t *
tmap_map2_aux_store_hits(tmap_refseq_t *refseq, tmap_map_opt_t *opt, 
                         tmap_map2_aln_t *aln, int32_t seq_len)
{
  int32_t hit_i, n_kept;
  tmap_map_sams_t *out = NULL;

  if(NULL == aln) {
      return NULL;
  }

  // start with room for every hit; trimmed at the end
  out = tmap_map_sams_init(NULL);
  tmap_map_sams_realloc(out, aln->n);

  n_kept = 0;
  for(hit_i=0;hit_i<aln->n;hit_i++) {
      tmap_map2_hit_t *hit = aln->hits + hit_i;
      tmap_map_sam_t *rec = &out->sams[n_kept];
      uint32_t seqid = 0, pos = 0;
      uint8_t strand;
      int32_t beg;

      // the next free slot is re-initialised even when this hit gets skipped
      tmap_map_sam_init(rec);

      // duplicate of, or sub-optimal relative to, the preceding hit
      if(0 < hit_i) {
          tmap_map2_hit_t *prev = aln->hits + hit_i - 1;
          if(prev->flag == hit->flag && prev->k == hit->k
             && prev->G <= hit->G && prev->tlen <= hit->tlen) {
              continue;
          }
      }

      // the full span does not map onto one contig: anchor on a single base
      if(tmap_refseq_pac2real(refseq, hit->k, hit->tlen, &seqid, &pos, &strand) <= 0) {
          int32_t anchored;

          if(1 == strand) { // reverse: anchor on the last base of the span
              anchored = (0 < tmap_refseq_pac2real(refseq, hit->k + hit->tlen - 1, 1, &seqid, &pos, &strand));
          }
          else { // forward: anchor on the first base of the span
              anchored = (0 < tmap_refseq_pac2real(refseq, hit->k, 1, &seqid, &pos, &strand));
          }
          if(!anchored) {
              continue;
          }

          // snap to the start of that contig and cap the length at its size
          hit->k = refseq->annos[seqid].offset+1;
          hit->tlen = (hit->tlen < refseq->annos[seqid].len) ? hit->tlen : refseq->annos[seqid].len;
      }

      // shift the position back by where the hit began within the read
      beg = (1 == strand) ? (seq_len - hit->end) : hit->beg;
      pos = (pos <= beg) ? 1 : (pos - beg);

      // a repetitive match (flag bit 0) gets its sub-optimal score raised to
      // the optimal one
      if((hit->flag & 0x1)) {
          hit->G2 = hit->G;
      }

      rec->algo_id = TMAP_MAP_ALGO_MAP2;
      rec->algo_stage = opt->algo_stage;
      rec->strand = strand;
      rec->seqid = seqid;
      rec->pos = pos-1; // stored zero-based
      rec->score = hit->G;
      rec->score_subo = hit->G2;
      rec->target_len = (seq_len < hit->tlen) ? hit->tlen : seq_len;

      // algorithm-specific auxiliary data
      tmap_map_sam_malloc_aux(rec);
      rec->aux.map2_aux->XE = hit->n_seeds;
      rec->aux.map2_aux->XF = hit->flag >> 16;
      if(0 == hit->l) {
          rec->aux.map2_aux->XI = 0;
      }
      else {
          rec->aux.map2_aux->XI = hit->l - hit->k + 1;
      }
      rec->aux.map2_aux->flag = (hit->flag & 0x1) ? 1 : 0;

      n_kept++;
  }

  // give back the slots of skipped hits
  if(n_kept != aln->n) {
      tmap_map_sams_realloc(out, n_kept);
  }

  return out;
}
예제 #8
0
/*
 * Core search of the map1 algorithm for one read.
 *
 * The search is driven by a stack of partial alignments.  Each entry carries
 * a read offset, an SA interval (match_sa), the number of mismatches, gap
 * opens and gap extensions used so far, a score, and the current state
 * (STATE_M, STATE_I or STATE_D).  Entries are popped one at a time; a popped
 * entry is either recorded as a hit or expanded into successor entries
 * (insertion, deletion, mismatch/match) that are pushed back.
 *
 * Parameters:
 *   seq         read to align; its bases come from tmap_seq_get_bases()
 *   index       supplies refseq, bwt and sa
 *   hash        forwarded to the BWT/SA helper calls
 *   width       bounds for the last seed2_len bases of the read; the .bid and
 *               .w fields are consulted to prune entries
 *   seed_width  same kind of bounds for the seed; NULL disables the seed bound
 *   opt         options (edit budgets, penalties, max_entries, ...)
 *   stack       search stack; reset before use
 *   seed2_len   number of trailing read bases covered by the search
 *
 * Returns the result of tmap_map1_sam_to_real() applied to the recorded
 * hits, or the result of tmap_map_sams_init(NULL) when the searched region
 * holds more ambiguous bases than the edit budget allows.
 */
tmap_map_sams_t *
tmap_map1_aux_core(tmap_seq_t *seq, tmap_index_t *index,
                   tmap_bwt_match_hash_t *hash,
                   tmap_bwt_match_width_t *width, tmap_bwt_match_width_t *seed_width, tmap_map_opt_t *opt,
                   tmap_map1_aux_stack_t *stack, int32_t seed2_len)
{
  int32_t max_mm = opt->max_mm, max_gapo = opt->max_gapo, max_gape = opt->max_gape, seed_max_diff = opt->seed_max_diff;
  int32_t best_score, next_best_score;
  int32_t best_cnt = 0;
  int32_t i, j, num_n = 0;
  int32_t max_edit_score;
  tmap_bwt_match_occ_t match_sa_start;
  tmap_string_t *bases=NULL;
  tmap_map_sams_t *sams = NULL;
  int32_t max_diff, best_diff;
  tmap_bwt_int_t k, l;
  tmap_refseq_t *refseq = index->refseq;
  tmap_bwt_t *bwt = index->bwt;
  tmap_sa_t *sa = index->sa;
  // occs[i] holds the SA interval [k, l] belonging to sams->sams[i]
  tmap_map1_aux_occ_t *occs = NULL;


  // slack added to best_score when deciding whether to stop the search
  max_edit_score = opt->pen_mm;
  //if(max_edit_score < opt->pen_gapo + opt->pen_gape) max_edit_score = opt->pen_gapo + opt->pen_gape;
  //if(max_edit_score < opt->pen_gape) max_edit_score = opt->pen_gape;

  bases = tmap_seq_get_bases(seq);
  /*
  fputc('\n', stderr);
  for(i=0;i<bases->l;i++) {
      fputc("ACGTN"[(int)bases->s[i]], stderr);
  }
  fputc('\n', stderr);
  */
  
  // the maximum # of differences
  // looked up by read length; reads longer than the table use its last entry
  if(bases->l <= TMAP_MAP_OPT_MAX_DIFF_READ_LENGTH) {
      best_diff = max_diff = opt->max_diff_table[bases->l];
  }
  else {
      best_diff = max_diff = opt->max_diff_table[TMAP_MAP_OPT_MAX_DIFF_READ_LENGTH];
  }
  
  // bound differences by the maximum # of differences
  if(max_diff < max_mm) max_mm = max_diff;
  if(max_diff < max_gapo) max_gapo = max_diff;
  //if(max_diff < max_gape) max_gape = max_diff;
  
  // start both scores at the score of an alignment one past every budget
  best_score = next_best_score = aln_score(max_mm+1, max_gapo+1, max_gape+1, opt);

  // check whether there are too many N
  // only the last seed2_len bases are inspected; values above 3 count as N
  for(j=bases->l-seed2_len,num_n=0;j<bases->l;j++) {
      if(3 < bases->s[j]) {
          num_n++;
      }
  }
  if(max_mm < num_n || max_diff < num_n) {
      return tmap_map_sams_init(NULL);
  }

  // initialize
  sams = tmap_map_sams_init(NULL);
  occs = NULL;

  // initial SA interval: [0, bwt->seq_len]
  match_sa_start.offset = 0;
  match_sa_start.hi = 0;
  match_sa_start.k = 0;
  match_sa_start.l = bwt->seq_len;

  stack = tmap_map1_aux_stack_reset(stack, max_mm, max_gapo, max_gape, opt); // reset stack
  // seed the search with an entry at offset bases->l and no edits
  tmap_map1_aux_stack_push(stack, bases->l, &match_sa_start, 0, 0, 0, STATE_M, 0, NULL, opt);

  // run until the stack empties or grows to opt->max_entries
  while(0 < tmap_map1_aux_stack_size(stack) && tmap_map1_aux_stack_size(stack) < opt->max_entries) {
      tmap_map1_aux_stack_entry_t *e = NULL;
      int32_t len=-1; 
      int32_t n_seed_mm=0, offset, width_cur_i;
      const uint8_t *str=NULL;
      int32_t sam_found, m;
      tmap_bwt_match_width_t *width_cur = NULL;
      const tmap_bwt_match_width_t *seed_width_cur = NULL;
      tmap_bwt_match_occ_t match_sa_cur, match_sa_next[4];
      
      // get the best entry
      e = tmap_map1_aux_stack_pop(stack); 

      // bound with best score
      if(best_score + max_edit_score < e->score) {
          break; // no need to continue
      }

      // some more information
      match_sa_cur = e->match_sa; 

      // check if we have too many edits
      // m is the number of edits this entry may still spend
      m = max_diff - (e->n_mm + e->n_gapo + e->n_gape);
      if(m < 0) {
          continue; // too many edits
      }

      // get the rest of the information
      offset = e->offset; // zero-based
      str = (uint8_t*)bases->s;
      len = bases->l;
      width_cur = width;
      // position of this entry inside the seed2 window (index into width)
      width_cur_i = seed2_len - (len - offset);

      if(NULL != seed_width) {
          seed_width_cur = seed_width;
          n_seed_mm = seed_max_diff - (e->n_mm + e->n_gapo + e->n_gape); // consider only mismatches in the seed
      }
      else {
          seed_width_cur = NULL;
      }
      // the bound for the remaining bases exceeds the remaining edit budget
      if(0 < width_cur_i && m < width_cur[width_cur_i-1].bid) { // too many edits
          continue;
      }

      // check whether a sam is found
      // sam_found: 0 = none, 1 = reached the start of the seed2 window,
      //            2 = budgets exhausted and the rest matched exactly
      sam_found = 0;
      if(len - seed2_len == offset) {
          sam_found = 1;
      }
      else if(max_mm == e->n_mm // no mismatches from any state
              && ((e->state == STATE_M && max_gapo == e->n_gapo) // in STATE_M but no more gap opens
                  || (e->state != STATE_M && max_gape == e->n_gape))) { // in STATE_I/STATE_D but no more extensions
          if(0 < tmap_bwt_match_hash_exact_alt_reverse(bwt, offset, str, &match_sa_cur, hash)) { // the alignment must match exactly to sam
              sam_found = 2;
          }
          else {
              continue; // no sam, skip
          }
      }

      if(0 < sam_found) { // alignment found
          // check for duplicates
          // M* below is the new interval (match_sa_cur), S* a stored one (occs[i]).
          // On overlap the stored interval is widened to cover the new one and
          // k receives the number of SA positions that were newly covered.
          if(0 < sams->n) {
              for(i=0;i<sams->n;i++) {
                  // check contained
                  if(match_sa_cur.k <= occs[i].k
                     && occs[i].k <= match_sa_cur.l) { // MK <= SK <= ML
                      if(occs[i].l <= match_sa_cur.l) { // MK <= SK <= SL <= ML
                          // Want (SK - MK) + (ML - SL)
                          k = occs[i].k - match_sa_cur.k; // (SK - MK)
                          k += match_sa_cur.l - occs[i].l; // (ML - SL)
                          occs[i].l = match_sa_cur.l; // Make SL = ML
                      }
                      else { // MK <= SK <= ML <= SL
                          k = occs[i].k - match_sa_cur.k; // (SK - MK)
                      }
                      occs[i].k = match_sa_cur.k; // Make SK = MK
                      break;
                  }
                  else if(match_sa_cur.k <= occs[i].l
                          && occs[i].l <= match_sa_cur.l) { // MK <= SL <= ML
                      if(match_sa_cur.k <= occs[i].k) { // MK <= SK <= SL <= ML
                          // Want (SK - MK) + (ML - SL)
                          k = occs[i].k - match_sa_cur.k; // (SK - MK)
                          k += match_sa_cur.l - occs[i].l; // (ML - SL)
                          occs[i].k = match_sa_cur.k; // Make SK = MK
                      }
                      else { // SK <= MK <= SL <= ML
                          k = match_sa_cur.l - occs[i].l; // (ML - SL)
                      }
                      occs[i].l = match_sa_cur.l; // Make SL = ML
                      break;
                  }
              }
              // the loop ended early, so an overlapping stored hit was found
              // (and k was assigned on that path)
              if(i < sams->n) {
                  // shadow
                  if(0 < k) {
                      //tmap_map1_aux_stack_shadow(k, bwt->seq_len, e->last_diff_offset, width_cur);
                      width_cur_i = seed2_len - (len - e->last_diff_offset);
                      tmap_map1_aux_stack_shadow(k, seed2_len, width_cur_i, width_cur);
                  }
                  sam_found = 0;
                  continue;
              }
          }

          int32_t score = aln_score(e->n_mm, e->n_gapo, e->n_gape, opt);
          int32_t do_add = 1;
          // the first hit defines the best score and the best edit count
          if(sams->n == 0) {
              best_score = score;
              best_cnt = 0;
              best_diff = e->n_mm + e->n_gapo + e->n_gape;
          }
          if(score == best_score) {
              // count every SA position of a best-scoring hit
              best_cnt += match_sa_cur.l - match_sa_cur.k + 1;
          }
          else {
              // a non-best hit: tighten the edit limit to one more than the best
              if(best_diff + 1 <= max_diff) {
                  max_diff = best_diff + 1;
              }
              if(score < next_best_score) {
                  next_best_score = score;
              }
              else if(next_best_score < score) {
                  // no need to examine further
                  break;
              }
          }
          if(do_add) { // append
              uint32_t op, op_len, cigar_i;
              tmap_map_sam_t *sam = NULL;
              tmap_map1_aux_stack_entry_t *cur = NULL;
  
              // grow sams and occs together so that they stay index-aligned
              tmap_map_sams_realloc(sams, sams->n+1);
              occs = tmap_realloc(occs, sizeof(tmap_map1_aux_occ_t) * sams->n, "occs");

              sam = &sams->sams[sams->n-1];

              sam->algo_id = TMAP_MAP_ALGO_MAP1;
              sam->algo_stage = 0;
              sam->score = e->score;

              // aux data
              tmap_map_sam_malloc_aux(sam);
              k = occs[sams->n-1].k = match_sa_cur.k;
              l = occs[sams->n-1].l= match_sa_cur.l;
              sam->aux.map1_aux->n_mm = e->n_mm;
              sam->aux.map1_aux->n_gapo = e->n_gapo;
              sam->aux.map1_aux->n_gape = e->n_gape;

              // aux data: reference length
              // walk back through the entry pool via prev_i and add up the
              // lengths of the STATE_M and STATE_D runs
              cur = e;
              i = e->i;
              sam->aux.map1_aux->aln_ref = 0;
              cigar_i = 0;
              if(2 == sam_found) { // we used 'tmap_bwt_match_exact_alt_reverse' 
                  // the exactly matched remainder counts as one STATE_M run
                  op = STATE_M;
                  op_len = offset;
              }
              else {
                  op = -1;
                  op_len = 0;
              }
              while(0 <= i) {
                  cur = stack->entry_pool[i];
                  if(len == cur->offset) break; // reached the initial entry
                  if(op != cur->state) {
                      // the run ended: flush it if it consumed reference bases
                      if(STATE_M == op || STATE_D == op) {
                          sam->aux.map1_aux->aln_ref += op_len;
                      }
                      op = cur->state;
                      op_len = 1;
                  }
                  else {
                      op_len++;
                  }
                  //fprintf(stderr, "cur->state=%c op_len=%d cur->prev_i=%d k=%u l=%u\n", "MIDS"[cur->state], op_len, cur->prev_i, cur->match_sa.k, cur->match_sa.l);
                  i = cur->prev_i;
              }
              // flush the final run
              if(STATE_M == op || STATE_D == op) {
                  sam->aux.map1_aux->aln_ref += op_len;
              }

              /*
              fprintf(stderr, "shadow 2 k=%u l=%u len=%d offset=%d last_diff_offset=%d\n",
                      k, l, len, offset, e->last_diff_offset);
              fprintf(stderr, "e->n_mm=%d e->n_gapo=%d e->n_gape=%d\n",
                      e->n_mm, e->n_gapo, e->n_gape);
              */
              //tmap_map1_aux_stack_shadow(l - k + 1, bwt->seq_len, e->last_diff_offset, width_cur);
              width_cur_i = seed2_len - (len - e->last_diff_offset);
              tmap_map1_aux_stack_shadow(l - k + 1, seed2_len, width_cur_i, width_cur);
              if(opt->max_best_cals < best_cnt) {
                  // ignore if too many "best" have been found
                  occs[sams->n-1].l -= (best_cnt - opt->max_best_cals); // only save the maximum
                  break;
              }
          }
      }
      else {
          // no hit yet: extend this entry by one step
          int32_t allow_diff = 1, allow_mm = (e->n_mm < max_mm) ? 1 : 0;

          // decrement the offset
          offset--;

          // use a bound for mismatches
          if(0 < offset) {
              int32_t seed_width_cur_i = offset - (len - opt->seed_length);
              width_cur_i = seed2_len - (len - offset);
              if(0 < width_cur_i) {
                  // spending an edit here would leave too few for the rest
                  if(m-1 < width_cur[width_cur_i-1].bid) { 
                      allow_diff = 0;
                  }
                  else if(width_cur[width_cur_i-1].bid == m-1
                          && width_cur[width_cur_i].bid == m-1
                          && width_cur[width_cur_i-1].w == width_cur[width_cur_i].w) {
                      allow_mm = 0;
                  }
              }
              // same pruning again, using the seed bounds and the seed budget
              if(0 < seed_width_cur_i) {
                  if(NULL != seed_width_cur && 0 < seed_width_cur_i) {
                      if(n_seed_mm-1 < seed_width_cur[seed_width_cur_i-1].bid) {
                          allow_diff = 0;
                      }
                      else if(seed_width_cur[seed_width_cur_i-1].bid == n_seed_mm-1
                              && seed_width_cur[seed_width_cur_i].bid == n_seed_mm-1
                              && seed_width_cur[seed_width_cur_i-1].w == seed_width_cur[seed_width_cur_i].w) {
                          allow_mm = 0;
                      }
                  }
              }
          }

          // retrieve the next SA interval
          // one interval per base value, stored in match_sa_next[0..3]
          tmap_bwt_match_hash_2occ4(bwt, &e->match_sa, match_sa_next, hash); 

          // insertions/deletions
          // gaps are kept at least indel_ends_bound (plus the gaps already
          // used) away from both ends of the read
          if(allow_diff 
             && opt->indel_ends_bound + e->n_gapo + e->n_gape <= offset
             && opt->indel_ends_bound + e->n_gapo + e->n_gape <= len - offset) { // check to add gaps
              if(STATE_M == e->state) { // gap open
                  if(e->n_gapo < max_gapo) { // gap open is allowed
                      // insertion
                      tmap_map1_aux_stack_push(stack, offset, &match_sa_cur, e->n_mm, e->n_gapo + 1, e->n_gape, STATE_I, 1, e, opt);

                      // deletion
                      for(j = 0; j != 4; ++j) {
                          if(match_sa_next[j].k <= match_sa_next[j].l) {
                              //   remember that a gap deletion does not consume a
                              //   read base, so use 'offset+1'
                              tmap_map1_aux_stack_push(stack, offset+1, &match_sa_next[j], e->n_mm, e->n_gapo + 1, e->n_gape, STATE_D, 1, e, opt);
                          }
                      }
                  }
              }
              else if(STATE_I == e->state) { // extension of an insertion
                  if(e->n_gape < max_gape) { // gap extension is allowed
                      tmap_map1_aux_stack_push(stack, offset, &match_sa_cur, e->n_mm, e->n_gapo, e->n_gape + 1, STATE_I, 1, e, opt);
                  }
              }
              else if(STATE_D == e->state) { // extension of a deletion
                  if(e->n_gape < max_gape) {
                      if(e->n_gape + e->n_gapo < max_diff || e->match_sa.l - e->match_sa.k + 1 < opt->max_cals_del) { // gap extension is allowed
                          for(j = 0; j != 4; ++j) {
                              if(match_sa_next[j].k <= match_sa_next[j].l) {
                                  //   remember that a gap deletion does not consume a
                                  //   read base, so use 'offset+1'
                                  tmap_map1_aux_stack_push(stack, offset+1, &match_sa_next[j], e->n_mm, e->n_gapo, e->n_gape + 1, STATE_D, 1, e, opt);
                              }
                          }
                      }
                  }
              }
          }

          // mismatches
          if(1 == allow_mm && 1 == allow_diff) { // mismatches allowed
              // j == 0 tries the read base itself; j > 0 tries the other three.
              // A read value above 3 is a mismatch against every base.
              for(j=0;j<4;j++) {
                  int32_t c = (str[offset] + j) & 3;
                  int32_t is_mm = (0 < j || 3 < str[offset]);
                  if(match_sa_next[c].k <= match_sa_next[c].l) {
                      tmap_map1_aux_stack_push(stack, offset, &match_sa_next[c], e->n_mm + is_mm, e->n_gapo, e->n_gape, STATE_M, is_mm, e, opt);
                  }
              }
          } 
          else if(str[offset] < 4) { // try exact match only
              int32_t c = str[offset] & 3;
              if(match_sa_next[c].k <= match_sa_next[c].l) {
                  tmap_map1_aux_stack_push(stack, offset, &match_sa_next[c], e->n_mm, e->n_gapo, e->n_gape, STATE_M, 0, e, opt);
              }
          }
      }
  }

  // expand the SA intervals into reference coordinates; occs is freed there
  return tmap_map1_sam_to_real(sams, occs, bases, seed2_len, refseq, bwt, sa, hash, opt);
}
예제 #9
0
/*
 * Expands SA-interval hits into hits with reference coordinates.
 *
 * sams/occs  parallel arrays: occs[i] is the SA interval [k, l] of
 *            sams->sams[i].  Both are released here (sams is destroyed and
 *            occs is freed), so the caller must not use either afterwards.
 * bases      read bases; only the length is used
 * seed2_len  number of read bases that were searched; when a secondary seed
 *            was used and it is shorter than the read, position and target
 *            length are extended to cover the remaining bases
 *
 * At most opt->max_best_cals hits are produced.  SA positions that
 * tmap_refseq_pac2real() rejects are skipped.  Returns a new hit set sized to
 * the number of hits actually stored.
 */
static tmap_map_sams_t *
tmap_map1_sam_to_real(tmap_map_sams_t *sams, tmap_map1_aux_occ_t *occs, tmap_string_t *bases, int32_t seed2_len,
                       tmap_refseq_t *refseq, tmap_bwt_t *bwt, tmap_sa_t *sa, tmap_bwt_match_hash_t *hash, tmap_map_opt_t *opt) 
{
  tmap_map_sams_t *real = NULL;
  uint32_t src_i, n_out, ref_span;
  tmap_bwt_int_t sa_i, limit, total_sa;

  // total number of SA positions over all intervals
  total_sa = 0;
  for(src_i=0;src_i<sams->n;src_i++) {
      total_sa += occs[src_i].l - occs[src_i].k + 1;
  }

  // never return more than max_best_cals hits
  limit = total_sa;
  if(opt->max_best_cals < limit) {
      limit = opt->max_best_cals;
  }

  real = tmap_map_sams_init(sams);
  tmap_map_sams_realloc(real, limit);

  n_out = 0;
  for(src_i=0;src_i<sams->n && n_out<limit;src_i++) {
      tmap_map_sam_t *src = &sams->sams[src_i];

      // one candidate hit per position in this SA interval
      for(sa_i=occs[src_i].k;sa_i<=occs[src_i].l;sa_i++) {
          tmap_map_sam_t *dst = &real->sams[n_out];
          uint32_t pos = 0, seqid = 0;
          tmap_bwt_int_t pacpos = 0;
          uint8_t strand;

          tmap_map_sam_init(dst);

          strand = src->strand;
          ref_span = src->aux.map1_aux->aln_ref;

          // start of the alignment in the packed reference, clamped at zero
          pacpos = bwt->seq_len - tmap_sa_pac_pos_hash(sa, bwt, sa_i, hash);
          if(pacpos < ref_span) {
              pacpos = 0;
          }
          else {
              pacpos = pacpos - ref_span;
          }
          pacpos++; // make one-based

          // NB: addressing the symptom, not the problem
          // Trim the span when it would cross the end of the forward part of
          // the packed reference, or run past the end of the reverse part.
          if(pacpos <= refseq->len && refseq->len < pacpos + ref_span - 1) {
              ref_span = refseq->len - pacpos + 1;
          }
          else if(refseq->len * 2 < pacpos + ref_span - 1) {
              ref_span = (refseq->len * 2) - pacpos + 1;
          }

          // positions that do not resolve to a contig are skipped
          if(tmap_refseq_pac2real(refseq, pacpos, ref_span, &seqid, &pos, &strand) <= 0) {
              continue;
          }

          dst->algo_id = TMAP_MAP_ALGO_MAP1;
          dst->algo_stage = opt->algo_stage;
          dst->strand = strand;
          dst->seqid = seqid;
          dst->pos = pos-1; // adjust to zero-based
          dst->target_len = ref_span;
          dst->score_subo = INT32_MIN;

          if(0 < opt->seed2_length && seed2_len < bases->l) {
              // a secondary seed was used: widen the target to the whole read
              if(0 != strand) { // reverse
                  if(dst->pos < (bases->l - seed2_len)) { // would start before the contig
                      dst->target_len += dst->pos;
                      dst->pos = 0;
                  }
                  else { // shift the start back by the unsearched bases
                      dst->target_len += (bases->l - seed2_len);
                      dst->pos -= (bases->l - seed2_len);
                  }
              }
              else { // forward
                  dst->target_len += (bases->l - seed2_len);
                  // NB: dst->pos is zero-based; do not run past the contig end
                  if(refseq->annos[dst->seqid].len < dst->pos + dst->target_len) {
                      dst->target_len = refseq->annos[dst->seqid].len - dst->pos;
                  }
              }
          }
          else if(dst->target_len < bases->l) {
              // the full read was searched: the target is at least read-sized
              dst->target_len = bases->l;
          }

          // auxiliary data: copy the edit counts, reset aln_ref
          tmap_map_sam_malloc_aux(dst);
          dst->aux.map1_aux->n_mm = src->aux.map1_aux->n_mm;
          dst->aux.map1_aux->n_gapo = src->aux.map1_aux->n_gapo;
          dst->aux.map1_aux->n_gape = src->aux.map1_aux->n_gape;
          dst->aux.map1_aux->aln_ref = 0;
          dst->aux.map1_aux->num_all_sa = total_sa;
          n_out++;

          // stop as soon as the limit is reached
          if(limit <= n_out) {
              break;
          }
      }
  }

  // the input hits are no longer needed
  tmap_map_sams_destroy(sams);

  // shrink to the number of hits actually stored
  tmap_map_sams_realloc(real, n_out);

  // NB: occs must not be used by the caller after this point
  free(occs);

  return real;
}