static tmap_map_sams_t * tmap_map1_sam_to_real(tmap_map_sams_t *sams, tmap_map1_aux_occ_t *occs, tmap_string_t *bases, int32_t seed2_len, tmap_refseq_t *refseq, tmap_bwt_t *bwt, tmap_sa_t *sa, tmap_bwt_match_hash_t *hash, tmap_map_opt_t *opt) { tmap_map_sams_t *sams_tmp = NULL; tmap_map_sam_t *sam_cur = NULL; uint32_t i, j, aln_ref; tmap_bwt_int_t k, n, num_all_sa; // max # of entries for(i=n=0;i<sams->n;i++) { n += occs[i].l - occs[i].k + 1; } num_all_sa = n; // bound the # of returned hits if(opt->max_best_cals < n) { n = opt->max_best_cals; } // alloc sams_tmp = tmap_map_sams_init(sams); tmap_map_sams_realloc(sams_tmp, n); // copy over for(i=j=0;i<sams->n && j<n;i++) { tmap_map_sam_t *sam; sam = &sams->sams[i]; // go through SA interval for(k=occs[i].k;k<=occs[i].l;k++) { uint32_t pos = 0, seqid = 0; tmap_bwt_int_t pacpos = 0; uint8_t strand; sam_cur = &sams_tmp->sams[j]; tmap_map_sam_init(sam_cur); strand = sams->sams[i].strand; aln_ref = sam->aux.map1_aux->aln_ref; pacpos = bwt->seq_len - tmap_sa_pac_pos_hash(sa, bwt, k, hash); pacpos = (pacpos < aln_ref) ? 
0 : (pacpos - aln_ref); pacpos++; // make one-based // NB: addressing the symptom, not the problem // This happens when we are at the end of the reference on the reverse // strand if(pacpos <= refseq->len && refseq->len < pacpos + aln_ref - 1) { aln_ref = refseq->len - pacpos + 1; } else if(refseq->len * 2 < pacpos + aln_ref - 1) { aln_ref = (refseq->len * 2) - pacpos + 1; } // save the hit if(0 < tmap_refseq_pac2real(refseq, pacpos, aln_ref, &seqid, &pos, &strand)) { // copy over previous parameters sam_cur->algo_id = TMAP_MAP_ALGO_MAP1; sam_cur->algo_stage = opt->algo_stage; sam_cur->strand = strand; sam_cur->seqid = seqid; sam_cur->pos = pos-1; // adjust to zero-based sam_cur->target_len = aln_ref; sam_cur->score_subo = INT32_MIN; if(0 < opt->seed2_length && seed2_len < bases->l) { // adjust if we used a secondary seed // adjust both the target length and position if(0 == strand) { // forward sam_cur->target_len += (bases->l - seed2_len); // NB: sam_cur->pos is zero based if(refseq->annos[sam_cur->seqid].len < sam_cur->pos + sam_cur->target_len) { sam_cur->target_len = refseq->annos[sam_cur->seqid].len - sam_cur->pos; } } else { // reverse if(sam_cur->pos < (bases->l - seed2_len)) { // before the start of the chromosome sam_cur->target_len += sam_cur->pos; sam_cur->pos = 0; } else { // move to the end of the read sam_cur->target_len += (bases->l - seed2_len); sam_cur->pos -= (bases->l - seed2_len); } } } else if(sam_cur->target_len < bases->l) { // do not adjust, we used the full read sam_cur->target_len = bases->l; } // aux tmap_map_sam_malloc_aux(sam_cur); sam_cur->aux.map1_aux->n_mm = sam->aux.map1_aux->n_mm; sam_cur->aux.map1_aux->n_gapo = sam->aux.map1_aux->n_gapo; sam_cur->aux.map1_aux->n_gape = sam->aux.map1_aux->n_gape; sam_cur->aux.map1_aux->aln_ref = 0; sam_cur->aux.map1_aux->num_all_sa = num_all_sa; j++; // only save the top n hits if(n <= j) { // equivalent to opt->max_bset_cals <= j break; } } } } // destroy tmap_map_sams_destroy(sams); // realloc 
tmap_map_sams_realloc(sams_tmp, j); //NB: do not use occs later free(occs); occs = NULL; return sams_tmp; }
/*
 * Builds a hit list from the map2 alignment hits in `aln`.
 *
 * refseq   reference used to translate packed positions and to look up
 *          contig offsets/lengths (refseq->annos).
 * opt      supplies algo_stage.
 * aln      source hits; entries may be modified in place (k, tlen and G2).
 * seq_len  read length, used to locate the hit within the read and as the
 *          minimum target length.
 *
 * Returns NULL when `aln` is NULL; otherwise a list created with
 * tmap_map_sams_init(NULL) holding one entry per hit that was kept.
 */
static tmap_map_sams_t *
tmap_map2_aux_store_hits(tmap_refseq_t *refseq, tmap_map_opt_t *opt,
                         tmap_map2_aln_t *aln, int32_t seq_len)
{
  tmap_map_sams_t *out = NULL;
  int32_t hit_idx, n_kept;

  if(NULL == aln) {
      return NULL;
  }

  out = tmap_map_sams_init(NULL);
  tmap_map_sams_realloc(out, aln->n);

  n_kept = 0;
  for(hit_idx = 0; hit_idx < aln->n; hit_idx++) {
      tmap_map2_hit_t *hit = &aln->hits[hit_idx];
      tmap_map_sam_t *dst = &out->sams[n_kept];
      uint32_t seqid = 0, pos = 0;
      uint8_t strand;
      int32_t read_off;

      // the slot is re-initialized for every candidate and only consumed
      // (n_kept advanced) when the hit is kept
      tmap_map_sam_init(dst);

      // Skip a hit that repeats the previous one: same flag and start, with
      // the previous score and target length both <= the current ones.
      // NOTE(review): the original comment calls these "duplicate or
      // sub-optimal" hits -- confirm the comparison direction is intended.
      if(0 < hit_idx) {
          tmap_map2_hit_t *prev = &aln->hits[hit_idx - 1];
          if(prev->flag == hit->flag && prev->k == hit->k
             && prev->G <= hit->G && prev->tlen <= hit->tlen) {
              continue;
          }
      }

      // The full-span conversion failed: retry with a single base to find
      // the contig, then pull the hit inside that contig.
      // NOTE(review): strand is read here after a failed call; this relies
      // on tmap_refseq_pac2real writing *strand even on failure -- confirm.
      if(tmap_refseq_pac2real(refseq, hit->k, hit->tlen, &seqid, &pos, &strand) <= 0) {
          if(1 == strand) { // reverse: probe the last base of the span
              if(tmap_refseq_pac2real(refseq, hit->k + hit->tlen - 1, 1, &seqid, &pos, &strand) <= 0) {
                  continue;
              }
          }
          else { // forward: probe the first base of the span
              if(tmap_refseq_pac2real(refseq, hit->k, 1, &seqid, &pos, &strand) <= 0) {
                  continue;
              }
          }
          // move to the start of the contig and cap the length at its size
          hit->k = refseq->annos[seqid].offset+1;
          hit->tlen = (hit->tlen < refseq->annos[seqid].len) ? hit->tlen : refseq->annos[seqid].len;
      }

      // offset of the hit within the read, measured from the strand's side
      if(1 == strand) {
          read_off = seq_len - hit->end;
      }
      else {
          read_off = hit->beg;
      }

      // shift the position back by that offset, never going below 1
      if(pos <= read_off) {
          pos = 1;
      }
      else {
          pos -= read_off;
      }

      // bit 0 of the flag marks a repetitive match: the sub-optimal score
      // is then set to the best score
      if((hit->flag & 0x1)) {
          hit->G2 = hit->G;
      }

      dst->strand = strand;
      dst->seqid = seqid;
      dst->pos = pos-1; // zero-based
      dst->algo_id = TMAP_MAP_ALGO_MAP2;
      dst->algo_stage = opt->algo_stage;
      dst->score = hit->G;
      dst->score_subo = hit->G2;
      // target length is the larger of the read length and the hit length
      if(seq_len < hit->tlen) {
          dst->target_len = hit->tlen;
      }
      else {
          dst->target_len = seq_len;
      }

      // auxiliary map2 data
      tmap_map_sam_malloc_aux(dst);
      dst->aux.map2_aux->XE = hit->n_seeds;
      dst->aux.map2_aux->XF = hit->flag >> 16;
      if(hit->l) {
          dst->aux.map2_aux->XI = hit->l - hit->k + 1;
      }
      else {
          dst->aux.map2_aux->XI = 0;
      }
      dst->aux.map2_aux->flag = (hit->flag & 0x1) ? 1 : 0;

      n_kept++;
  }

  // shrink if some hits were skipped
  if(n_kept != aln->n) {
      tmap_map_sams_realloc(out, n_kept);
  }

  return out;
}
void tmap_bwt_check_core2(tmap_bwt_t *bwt, int32_t length, int32_t print_msg, int32_t print_sa, int32_t warn) { uint8_t *seqs[2] = {NULL,NULL}; char *str = NULL; int32_t i, asymmetric, k, l; uint64_t hash_j; int64_t sum, j; tmap_bwt_match_occ_t sa; tmap_bwt_int_t n[2]; for(i=1;i<=length;i++) { seqs[0] = tmap_calloc(i, sizeof(uint8_t), "seqs[0]"); seqs[1] = tmap_calloc(i, sizeof(uint8_t), "seqs[1]"); str = tmap_calloc(i+1, sizeof(char), "str"); for(j=0;j<i;j++) { seqs[1][j] = 3; } asymmetric = 0; j = 0; hash_j = sum = 0; while(1) { if(i == j) { for(k=0;k<i;k++) { seqs[1][k] = 3 - seqs[0][i-k-1]; } for(k=0;k<2;k++) { //n[k] = tmap_bwt_match_exact(bwt, i, seqs[k], &sa); n[k] = tmap_bwt_match_exact_reverse(bwt, i, seqs[k], &sa); if(0 == k) { if(0 < n[k] && TMAP_BWT_INT_MAX != sa.k && sa.k <= sa.l) { sum += n[k]; } if(1 == print_msg && 1 == print_sa) { for(l=0;l<i;l++) { str[l] = "ACGTN"[seqs[k][l]]; } if(0 < n[k] && TMAP_BWT_INT_MAX != sa.k && sa.k <= sa.l) { tmap_progress_print2("%s\t%llu\t%llu\t%llu", str, sa.k, sa.l, n[k]); #ifdef TMAP_BWT_CHECK_DEBUG while(sa.k <= sa.l) { uint32_t seqid, pos; uint8_t strand; tmap_bwt_int_t pacpos = tmap_sa_pac_pos(tmap_bwt_index->sa, bwt, sa.k); if(0 < tmap_refseq_pac2real(tmap_bwt_index->refseq, pacpos, 1, &seqid, &pos, &strand)) { tmap_progress_print2("%s\t%llu\t%llu\t%llu\t%c%u:%u", str, sa.k, sa.l, n[k], "+-"[strand], seqid, pos); } else { tmap_progress_print2("%s\t%llu\t%llu\t%llu\t%c%u:%u", str, sa.k, sa.l, n[k], '?', 0, 0); } sa.k++; } #endif } else { tmap_progress_print2("%s\tNA\tNA\tNA", str); } } } } if(0 == asymmetric && n[0] != n[1]) { asymmetric = 1; //fprintf(stderr, "n[0]=%u n[1]=%u\n", n[0], n[1]); tmap_error("Asymmetry found", Warn, OutOfRange); } j--; while(0 <= j && 3 == seqs[0][j]) { seqs[0][j] = 0; hash_j >>= 2; j--; } if(j < 0) break; seqs[0][j]++; hash_j++; j++; } else { hash_j <<= 2; j++; } } free(seqs[0]); free(seqs[1]); free(str); j = (sum == (bwt->seq_len - i + 1)) ? 
0 : 1; // j==1 on fail if(1 == print_msg) { if(0 == j) tmap_progress_print2("%d-mer validation passed", i); else tmap_progress_print2("%d-mer validation failed: observed (%llu) != expected (%llu)\n", i, sum, bwt->seq_len - i + 1); } if(0 == warn && 1 == j) { tmap_error("inconsistency found in the BWT", Exit, OutOfRange); } }