/**
 * Read the next alignment from a SAM/BAM stream into `sam`.
 *
 * The record previously held in sam->b (if any) is destroyed and replaced by
 * a freshly initialised one before anything is read. After a successful read
 * the name, seq and qual strings of `sam` are refreshed from the record:
 * bases are decoded through bam_nt16_rev_table, qualities are converted with
 * QUAL2CHAR, and tmap_sam_reverse_compliment() is applied when the record
 * carries BAM_FREVERSE.
 *
 * @param samio  input handle; a positive samio->bam_end_vfo makes the reader
 *               stop (with a message on stderr) once bam_tell() has reached
 *               that offset
 * @param sam    record to fill
 * @return 1 when a record was read, -1 when the end offset was reached or
 *         samread() returned a non-positive value
 */
int32_t tmap_sam_io_read(tmap_sam_io_t *samio, tmap_sam_t *sam)
{
    char *qname;
    int32_t n, k;

    // always start from a pristine BAM record
    if(sam->b != NULL) {
        bam_destroy1(sam->b);
    }
    sam->b = bam_init1();

    // optional end bam virtual file offset: stop once it has been reached
    if(samio->bam_end_vfo > 0) {
        BGZF *bgzf_fp = samio->fp->x.bam;
        if(bam_tell(bgzf_fp) >= samio->bam_end_vfo) {
            fprintf(stderr, "stopping at bam virtual file offset %lu\n", samio->bam_end_vfo);
            return -1;
        }
    }

    // nothing (more) to read
    if(samread(samio->fp, sam->b) <= 0) {
        return -1;
    }

    // name
    qname = bam1_qname(sam->b);
    n = strlen(qname);
    tmap_sam_io_update_string(&sam->name, qname, n);
    sam->name->s[n] = '\0';

    // seq and qual: qual is seeded with the raw BAM qualities and converted
    // in place, seq is decoded base by base
    n = sam->b->core.l_qseq;
    tmap_sam_io_update_string(&sam->seq, NULL, n);
    tmap_sam_io_update_string(&sam->qual, (char*)bam1_qual(sam->b), n);
    k = 0;
    while(k < n) {
        sam->seq->s[k] = bam_nt16_rev_table[bam1_seqi(bam1_seq(sam->b), k)];
        sam->qual->s[k] = QUAL2CHAR(sam->qual->s[k]);
        k++;
    }
    sam->qual->s[n] = '\0';
    sam->seq->s[n] = '\0';

    // reverse compliment if necessary
    if(sam->b->core.flag & BAM_FREVERSE) {
        tmap_sam_reverse_compliment(sam);
    }

    return 1;
}
/* callback for bam_fetch() */ static int fetch_func(const bam1_t *b) { const bam1_core_t *c = &b->core; int i; char* read_name=(char*) bam1_qname(b); printf("%s\t",read_name); char* read_seq=(char*)malloc(c->l_qseq+1); char* s=(char*) bam1_seq(b); for(i=0;i<c->l_qseq;i++) read_seq[i]=bam_nt16_rev_table[bam1_seqi(s,i)]; read_seq[i]=0; printf("%s\t",read_seq); char* read_qual=(char*)malloc(c->l_qseq+1); char* t=(char*) bam1_qual(b); for(i=0;i<c->l_qseq;i++) read_qual[i]=t[i]+33; read_qual[i]=0; printf("%s\n",read_qual); free(read_seq); free(read_qual); return 0; }
/**
 * Decode the nucleotides of a BAM record into a caller-supplied buffer.
 *
 * At most max_l characters are written. The 4-bit codes 1, 2, 4 and 8 decode
 * to 'A', 'C', 'G' and 'T'; every other code decodes to 'N'.
 *
 * A terminating NUL is written only when max_l is larger than the number of
 * decoded characters; when the read fills (or exceeds) the buffer, seq is
 * left WITHOUT a terminator.
 *
 * @param bam1   record whose sequence is decoded
 * @param seq    output buffer of at least max_l bytes
 * @param max_l  capacity of seq in bytes
 * @return NO_ERROR
 */
ERROR_CODE new_sequence_from_bam_ref(bam1_t *bam1, char *seq, uint32_t max_l)
{
    char *packed = (char *)bam1_seq(bam1);
    int n_bases = bam1->core.l_qseq;
    int pos = 0;

    // never decode more bases than the buffer can take
    if (n_bases > max_l) {
        n_bases = max_l;
    }

    // nucleotide content
    while (pos < n_bases) {
        const int code = bam1_seqi(packed, pos);
        char nt;

        if (code == 1) {
            nt = 'A';
        } else if (code == 2) {
            nt = 'C';
        } else if (code == 4) {
            nt = 'G';
        } else if (code == 8) {
            nt = 'T';
        } else {
            nt = 'N';   // code 15 and anything ambiguous
        }
        seq[pos] = nt;
        pos++;
    }

    // terminate only if there is room left
    if (max_l > n_bases) {
        seq[pos] = '\0';
    }

    return NO_ERROR;
}
/*
 * Expand the aligned part of a read into one 4-bit base code per alignment
 * column and store it in s (s->l receives the number of columns).
 *
 * Match operations contribute the read's bases, deletions contribute zeros,
 * soft clips are skipped. Any other CIGAR operation trips the assertion.
 *
 * Two defects of the previous version are fixed:
 *  - the query index was advanced once per match operation instead of once
 *    per base, so a whole match run repeated a single base;
 *  - the buffer was sized to l_qseq, but deletions add columns that are not
 *    part of the query sequence, so the writes could run past the buffer.
 */
static void unpad_seq(bam1_t *b, kstring_t *s)
{
    int k, j, i;
    size_t n_columns = 0;
    uint32_t *cigar = bam1_cigar(b);
    uint8_t *seq = bam1_seq(b);

    /* number of output columns: every match and every deleted base */
    for (k = 0; k < b->core.n_cigar; ++k) {
        int op = bam_cigar_op(cigar[k]);
        if (op == BAM_CMATCH || op == BAM_CDEL)
            n_columns += bam_cigar_oplen(cigar[k]);
    }
    ks_resize(s, n_columns);

    for (k = 0, s->l = 0, j = 0; k < b->core.n_cigar; ++k) {
        int op, ol;
        op = bam_cigar_op(cigar[k]);
        ol = bam_cigar_oplen(cigar[k]);
        assert(op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CSOFT_CLIP);
        if (op == BAM_CMATCH) {
            /* one query base per column */
            for (i = 0; i < ol; ++i, ++j)
                s->s[s->l++] = bam1_seqi(seq, j);
        } else if (op == BAM_CSOFT_CLIP) {
            j += ol;
        } else {
            /* deletion: pad with zeros, the query index does not move */
            for (i = 0; i < ol; ++i)
                s->s[s->l++] = 0;
        }
    }
}
/*
 * Decode the query sequence of a SAM alignment into a caller-owned,
 * growable buffer.
 *
 * sam_alignment  alignment whose bases are converted with bambase2gtbase()
 *                using the alignment's alphabet
 * seq_buffer     in/out: buffer pointer; (re)allocated with gt_realloc()
 *                when it is NULL or too small
 * bufsize        in/out: number of sequence characters the buffer can hold,
 *                not counting the terminator (the allocation is always
 *                query_len + 1 bytes while *bufsize is set to query_len)
 *
 * The result is terminated with a '\0' element.
 *
 * A NULL buffer is now allocated even when the read is empty; previously a
 * zero-length read combined with *bufsize == 0 skipped the allocation and
 * then wrote the terminator through the NULL pointer.
 */
void gt_sam_alignment_sequence_external_buffer(GtSamAlignment *sam_alignment,
                                               GtUchar **seq_buffer,
                                               unsigned long *bufsize)
{
  unsigned long query_len, idx;
  uint8_t *bam_seq;

  query_len = (unsigned long) sam_alignment->s_alignment->core.l_qseq;
  if ((*seq_buffer) == NULL || (*bufsize) < query_len) {
    (*seq_buffer) = gt_realloc(*seq_buffer,
                               sizeof (**seq_buffer) * (query_len + 1UL));
    (*bufsize) = query_len;
  }
  gt_assert((*seq_buffer) != NULL);

  bam_seq = bam1_seq(sam_alignment->s_alignment);
  for (idx = 0UL; idx < query_len; idx++) {
    (*seq_buffer)[idx] = bambase2gtbase((uint8_t) bam1_seqi(bam_seq, idx),
                                        sam_alignment->alphabet);
  }
  (*seq_buffer)[query_len] = (GtUchar)'\0';
}
/**
 * Get a newly allocated string containing the bam1 sequence nucleotides.
 *
 * The 4-bit codes 1, 2, 4 and 8 decode to 'A', 'C', 'G' and 'T'; every other
 * code decodes to 'N'.
 *
 * The buffer now has room for, and is closed with, a terminating NUL (the
 * previous version returned exactly l_qseq unterminated characters, which is
 * not a usable C string). The first l_qseq characters are unchanged.
 *
 * @param bam1  record whose sequence is decoded
 * @return malloc()ed, NUL-terminated sequence that the caller must free(),
 *         or NULL if the allocation fails or l_qseq is negative
 */
char *new_sequence_from_bam(bam1_t *bam1)
{
    char *seq;
    char *bam_seq = (char *)bam1_seq(bam1);
    int seq_len = bam1->core.l_qseq;

    if (seq_len < 0) {
        return NULL;
    }

    seq = (char *) malloc((size_t)seq_len + 1);
    if (seq == NULL) {
        return NULL;
    }

    // nucleotide content
    for (int i = 0; i < seq_len; i++) {
        switch (bam1_seqi(bam_seq, i)) {
        case 1:
            seq[i] = 'A';
            break;
        case 2:
            seq[i] = 'C';
            break;
        case 4:
            seq[i] = 'G';
            break;
        case 8:
            seq[i] = 'T';
            break;
        default:    // 15 ('N') and every ambiguity code
            seq[i] = 'N';
            break;
        }
    }
    seq[seq_len] = '\0';

    return seq;
}
// Mostly stolen from bwa_read_bam. void bam1_to_seq(bam1_t *raw, bwa_seq_t *p, int is_comp, int trim_qual) { // long n_trimmed = 0; uint8_t *s, *q; int i, l = raw->core.l_qseq; p->tid = -1; // no assigned to a thread p->qual = 0; p->full_len = p->clip_len = p->len = l; // n_tot += p->full_len; s = bam1_seq(raw); q = bam1_qual(raw); p->seq = (ubyte_t*)calloc(p->len + 1, 1); p->qual = (ubyte_t*)calloc(p->len + 1, 1); for (i = 0; i != p->full_len; ++i) { p->seq[i] = bam_nt16_nt4_table[(int)bam1_seqi(s, i)]; p->qual[i] = q[i] + 33 < 126? q[i] + 33 : 126; } if (bam1_strand(raw)) { // then reverse seq_reverse(p->len, p->seq, 1); seq_reverse(p->len, p->qual, 0); } if (trim_qual >= 1) /* n_trimmed += */ bwa_trim_read(trim_qual, p); p->rseq = (ubyte_t*)calloc(p->full_len, 1); memcpy(p->rseq, p->seq, p->len); seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped() seq_reverse(p->len, p->rseq, is_comp); p->max_entries = 0 ; // We don't set a name, it's contained in the original record // anyway. // p->name = strdup((const char*)bam1_qname(raw)); // No place to put the tally right now. // if (n_seqs && trim_qual >= 1) // fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot); }
/*
 * Accumulate, for every sample, the quality sum and the count of each of the
 * four bases seen in a pileup column.
 *
 * n    n[j] is the number of pileup entries of sample j
 * plp  plp[j] is the pileup array of sample j
 * ma   receives the totals in ma->qsum and ma->bcnt (4 ints per sample, both
 *      cleared first); ma->n is the number of samples
 *
 * An entry is skipped when it is a deletion, when the read is flagged
 * BAM_FUNMAP, when its quality (base quality capped by the mapping quality)
 * is below MC_MIN_QUAL, or when the base does not map to 0..3.
 *
 * Returns the number of entries that were counted.
 */
static int sum_err(int *n, const bam_pileup1_t **plp, mc_aux_t *ma)
{
    int sample, k, n_used = 0;

    memset(ma->qsum, 0, sizeof(int) * 4 * ma->n);
    memset(ma->bcnt, 0, sizeof(int) * 4 * ma->n);

    for (sample = 0; sample < ma->n; ++sample) {
        int *qual_sum = &ma->qsum[sample * 4];
        int *base_cnt = &ma->bcnt[sample * 4];

        for (k = 0; k < n[sample]; ++k) {
            const bam_pileup1_t *entry = &plp[sample][k];
            int q, base;

            if (!entry->is_del && !(entry->b->core.flag & BAM_FUNMAP)) {
                /* effective quality = min(base quality, mapping quality) */
                q = bam1_qual(entry->b)[entry->qpos];
                if (entry->b->core.qual < q) {
                    q = entry->b->core.qual;
                }
                if (!(q < MC_MIN_QUAL)) {
                    base = bam_nt16_nt4_table[(int)bam1_seqi(bam1_seq(entry->b), entry->qpos)];
                    if (!(base > 3)) { /* anything above 3 is N */
                        qual_sum[base] += q;
                        base_cnt[base] += 1;
                        n_used += 1;
                    }
                }
            }
        }
    }
    return n_used;
}
char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of) { uint8_t *s = bam1_seq(b), *t = bam1_qual(b); int i; const bam1_core_t *c = &b->core; kstring_t str; str.l = str.m = 0; str.s = 0; kputsn(bam1_qname(b), c->l_qname-1, &str); kputc('\t', &str); if (of == BAM_OFDEC) { kputw(c->flag, &str); kputc('\t', &str); } else if (of == BAM_OFHEX) ksprintf(&str, "0x%x\t", c->flag); else { // BAM_OFSTR for (i = 0; i < 16; ++i) if ((c->flag & 1<<i) && bam_flag2char_table[i]) kputc(bam_flag2char_table[i], &str); kputc('\t', &str); } if (c->tid < 0) kputsn("*\t", 2, &str); else { if (header) kputs(header->target_name[c->tid] , &str); else kputw(c->tid, &str); kputc('\t', &str); } kputw(c->pos + 1, &str); kputc('\t', &str); kputw(c->qual, &str); kputc('\t', &str); if (c->n_cigar == 0) kputc('*', &str); else { for (i = 0; i < c->n_cigar; ++i) { kputw(bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, &str); kputc("MIDNSHP"[bam1_cigar(b)[i]&BAM_CIGAR_MASK], &str); } } kputc('\t', &str); if (c->mtid < 0) kputsn("*\t", 2, &str); else if (c->mtid == c->tid) kputsn("=\t", 2, &str); else { if (header) kputs(header->target_name[c->mtid], &str); else kputw(c->mtid, &str); kputc('\t', &str); } kputw(c->mpos + 1, &str); kputc('\t', &str); kputw(c->isize, &str); kputc('\t', &str); if (c->l_qseq) { for (i = 0; i < c->l_qseq; ++i) kputc(bam_nt16_rev_table[bam1_seqi(s, i)], &str); kputc('\t', &str); if (t[0] == 0xff) kputc('*', &str); else for (i = 0; i < c->l_qseq; ++i) kputc(t[i] + 33, &str); } else kputsn("*\t*", 3, &str); s = bam1_aux(b); while (s < b->data + b->data_len) { uint8_t type, key[2]; key[0] = s[0]; key[1] = s[1]; s += 2; type = *s; ++s; kputc('\t', &str); kputsn((char*)key, 2, &str); kputc(':', &str); if (type == 'A') { kputsn("A:", 2, &str); kputc(*s, &str); ++s; } else if (type == 'C') { kputsn("i:", 2, &str); kputw(*s, &str); ++s; } else if (type == 'c') { kputsn("i:", 2, &str); kputw(*(int8_t*)s, &str); ++s; } else if (type == 'S') { kputsn("i:", 2, &str); 
kputw(*(uint16_t*)s, &str); s += 2; } else if (type == 's') { kputsn("i:", 2, &str); kputw(*(int16_t*)s, &str); s += 2; } else if (type == 'I') { kputsn("i:", 2, &str); kputuw(*(uint32_t*)s, &str); s += 4; } else if (type == 'i') { kputsn("i:", 2, &str); kputw(*(int32_t*)s, &str); s += 4; } else if (type == 'f') { ksprintf(&str, "f:%g", *(float*)s); s += 4; } else if (type == 'd') { ksprintf(&str, "d:%lg", *(double*)s); s += 8; } else if (type == 'Z' || type == 'H') { kputc(type, &str); kputc(':', &str); while (*s) kputc(*s++, &str); ++s; } } return str.s; }
/*
 * Plain-function entry point for bam1_seqi(): returns the result of
 * bam1_seqi(s, i) narrowed to uint8_t.
 *
 * s  packed sequence as handed to bam1_seqi()
 * i  index of the base to extract
 */
uint8_t bam1_seqi_(uint8_t *s, int i)
{
    const uint8_t code = bam1_seqi(s, i);

    return code;
}
// TODO soft-clipping bam1_t *sw_align(graph_t *g, bam1_t *b, node_t *n, sw_heap_t *heap, char *rg_id, int32_t offset, cov_cutoffs_t *cutoffs, uint8_t correct_bases, uint8_t use_qualities, int32_t max_total_coverage, int32_t max_heap_size) { char *colors = NULL; char *color_qualities = NULL; char base, qual; uint8_t space = SRMA_SPACE_NT; uint8_t strand; int32_t i, j, aln_start; int32_t num_start_nodes_added=0; int32_t sw_node_i=-1, sw_node_best_i=-1, sw_node_cur_i=-1, sw_node_next_i=-1; int32_t soft_clip_start_l = 0, soft_clip_end_l = 0; strand = bam1_strand(b); // soft-clipping if(1 == strand) { //reverse // going from 3'->5' soft_clip_start_l = sw_align_get_soft_clip(b, 1); soft_clip_end_l = sw_align_get_soft_clip(b, 0); } else { // going from 5'->3' soft_clip_start_l = sw_align_get_soft_clip(b, 0); soft_clip_end_l = sw_align_get_soft_clip(b, 1); } // FOR NOW if(0 < soft_clip_start_l || 0 < soft_clip_end_l) { return b; } // Check color space colors = sw_align_get_cs(b); if(NULL == colors) { space = SRMA_SPACE_NT; } else { space = SRMA_SPACE_CS; color_qualities = sw_align_get_cq(b); // Some aligners include a quality value for the adapter. A quality value // IMHO should not be given for an unobserved (assumed) peice of data. Trim // the first quality in this case if(strlen(colors) == strlen(color_qualities)) { // ignore leading quality color_qualities++; } if(0 < soft_clip_start_l || 0 < soft_clip_end_l) { srma_error(__func__, "Soft clipping not supported for color space", Exit, OutOfRange); } } // remove mate info b->core.flag &= ~(BAM_FPROPER_PAIR | BAM_FMREVERSE | BAM_FMUNMAP); b->core.mtid = -1; b->core.mpos = -1; b->core.isize = 0; // re-type heap heap->type = (1 == strand) ? 
SRMA_SW_HEAP_MAX : SRMA_SW_HEAP_MIN; // bound with original alignment sw_node_best_i = sw_align_bound(g, b, n, heap, strand, colors, color_qualities, space, cutoffs, use_qualities, max_total_coverage, max_heap_size); if(0 <= sw_node_best_i) { /* sw_heap_reset(heap); // reset the heap, keep old nodes fprintf(stderr, "BOUNDED score=%d coverage_sum=%hu\n", heap->nodes[sw_node_best_i].score, heap->nodes[sw_node_best_i].coverage_sum); // DEBUG */ } else { //fprintf(stderr, "NOT BOUNDED\n"); // DEBUG // nodes do not need to be preserved sw_heap_clear(heap); } //return b; // HERE DEBUG HERE BUG // add start nodes if(strand) { if(SRMA_SPACE_CS == space) { base = nt2int_table[(int)colors[1]]; qual = color_qualities[0]; } else { base = nt4bit_to_int[bam1_seqi(bam1_seq(b), b->core.l_qseq-1)]; qual = bam1_qual(b)[b->core.l_qseq-1] + 33; } aln_start = bam_calend(&b->core, bam1_cigar(b)); for(i=aln_start+offset;aln_start-offset<=i;i--) { int32_t pos = graph_get_node_list_index_at_or_before(g, i); node_list_t *list = graph_get_node_list(g, pos); if(1 != pos && NULL != list) { for(j=0;j<list->length;j++) { node_t *node = list->nodes[j]; int32_t pass = pass_filters1(g, node, cutoffs, max_total_coverage); if(0 == pass) { sw_node_i = sw_heap_get_node_i(heap); sw_node_init(&heap->nodes[sw_node_i], NULL, node, node->coverage, base, qual, use_qualities, space); sw_heap_add_i(heap, sw_node_i); } else if(pass < 0) { sw_heap_clear(heap); // clear heap return b; } if(node->position < i) { i = node->position; } num_start_nodes_added++; } } } } else { if(SRMA_SPACE_CS == space) { base = nt2int_table[(int)colors[1]]; qual = color_qualities[0]; } else { base = nt4bit_to_int[bam1_seqi(bam1_seq(b), 0)]; qual = bam1_qual(b)[0] + 33; } aln_start = b->core.pos; for(i=aln_start-offset;i<=aln_start+offset;i++) { int32_t pos = graph_get_node_list_index_at_or_after(g, i); node_list_t *list = graph_get_node_list(g, pos); if(0 != pos && NULL != list) { for(j=0;j<list->length;j++) { node_t *node = 
list->nodes[j]; int32_t pass = pass_filters1(g, node, cutoffs, max_total_coverage); if(0 == pass) { sw_node_i = sw_heap_get_node_i(heap); sw_node_init(&heap->nodes[sw_node_i], NULL, node, node->coverage, base, qual, use_qualities, space); sw_heap_add_i(heap, sw_node_i); } else if(pass < 0) { sw_heap_clear(heap); // clear heap return b; } if(node->position < i) { i = node->position; } num_start_nodes_added++; } } } } if(0 == num_start_nodes_added) { srma_error(__func__, "Did not add any start nodes", Exit, OutOfRange); } sw_node_cur_i = sw_heap_poll_i(heap); while(0 <= sw_node_cur_i) { if(max_heap_size < heap->queue_end - heap->queue_start + 1) { // too many to consider sw_heap_clear(heap); // clear heap return b; } sw_node_next_i = sw_heap_peek_i(heap); assert(0 <= sw_node_cur_i); // DEBUG while(NODE_INSERTION != __node_type(heap->nodes[sw_node_cur_i].node) && 0 <= sw_node_next_i && 0 == sw_node_compare(&heap->nodes[sw_node_cur_i], &heap->nodes[sw_node_next_i], heap->type)) { if(heap->nodes[sw_node_cur_i].score < heap->nodes[sw_node_next_i].score || (heap->nodes[sw_node_cur_i].score == heap->nodes[sw_node_next_i].score && heap->nodes[sw_node_cur_i].coverage_sum < heap->nodes[sw_node_next_i].coverage_sum)) { sw_node_cur_i = sw_heap_poll_i(heap); } else { // ignore the next node sw_heap_poll_i(heap); } sw_node_next_i = sw_heap_peek_i(heap); } sw_node_next_i = -1; if(heap->nodes[sw_node_cur_i].read_offset == b->core.l_qseq-1) { // found, keep best if(sw_node_best_i < 0 || heap->nodes[sw_node_best_i].score < heap->nodes[sw_node_cur_i].score || (heap->nodes[sw_node_best_i].score == heap->nodes[sw_node_cur_i].score && heap->nodes[sw_node_best_i].coverage_sum < heap->nodes[sw_node_cur_i].coverage_sum)) { //fprintf(stderr, "FOUND BEST\n"); // DEBUG sw_node_best_i = sw_node_cur_i; } } else if(0 <= sw_node_best_i && heap->nodes[sw_node_cur_i].score < heap->nodes[sw_node_best_i].score) { // ignore, under the assumption that scores can only // become more negative. 
} else { edge_list_t *list = NULL; if(1 == strand) { // reverse list = heap->nodes[sw_node_cur_i].node->prev; } else { list = heap->nodes[sw_node_cur_i].node->next; } { // get the base and quality if(SRMA_SPACE_CS == space) { base = nt2int_table[(int)colors[1 + (heap->nodes[sw_node_cur_i].read_offset+1)]]; qual = color_qualities[heap->nodes[sw_node_cur_i].read_offset+1]; } else { if(strand) { base = nt4bit_to_int[bam1_seqi(bam1_seq(b), b->core.l_qseq-1-heap->nodes[sw_node_cur_i].read_offset-1)]; qual = bam1_qual(b)[b->core.l_qseq-1-heap->nodes[sw_node_cur_i].read_offset-1] + 33; } else { base = nt4bit_to_int[bam1_seqi(bam1_seq(b), (heap->nodes[sw_node_cur_i].read_offset+1))]; qual = bam1_qual(b)[(heap->nodes[sw_node_cur_i].read_offset+1)] + 33; } } } /* node_t *node = heap->nodes[sw_node_cur_i].node; fprintf(stderr, "NODE %d:%d offset=%d coverage=%d base=%d\n", node->contig, node->position, node->offset, node->coverage, node->base); fprintf(stderr, "SW_NODE read_offset=%d score=%d coverage_sum=%d start_position=%d space=%d\n", heap->nodes[sw_node_cur_i].read_offset, heap->nodes[sw_node_cur_i].score, heap->nodes[sw_node_cur_i].coverage_sum, heap->nodes[sw_node_cur_i].start_position, space); */ for(i=0;i<list->length;i++) { node_t *node_cur = list->nodes[i]; uint16_t coverage_cur = list->coverages[i]; int32_t pass = pass_filters(g, node_cur, coverage_cur, cutoffs, max_total_coverage); if(0 == pass) { // add to the heap sw_node_i = sw_heap_get_node_i(heap); // DEBUG assert(0 <= sw_node_cur_i); assert(0 <= heap->nodes[sw_node_cur_i].read_offset); sw_node_init(&heap->nodes[sw_node_i], &heap->nodes[sw_node_cur_i], node_cur, coverage_cur, base, qual, use_qualities, space); sw_heap_add_i(heap, sw_node_i); } else if(pass < 0) { sw_heap_clear(heap); // clear heap return b; } } } // get the next node sw_node_cur_i = sw_heap_poll_i(heap); } /* fprintf(stderr, "sw_node_best_i=%d\n", sw_node_best_i); // DEBUG if(0 <= sw_node_best_i) { fprintf(stderr, "END score=%d 
coverage_sum=%hu\n", heap->nodes[sw_node_best_i].score, heap->nodes[sw_node_best_i].coverage_sum); // DEBUG } */ // update SAM/BAM b = sw_align_update_bam(b, rg_id, heap, sw_node_best_i, space, colors, color_qualities, strand, correct_bases); sw_heap_clear(heap); // clear heap return b; }
static int32_t sw_align_bound(graph_t *g, bam1_t *b, node_t *n, sw_heap_t *heap, uint8_t strand, const char *colors, const char *color_qualities, uint8_t space, cov_cutoffs_t *cutoffs, uint8_t use_qualities, int32_t max_total_coverage, int32_t max_heap_size) { int32_t sw_node_i=-1, sw_node_best_i=-1, sw_node_cur_i=-1, sw_node_next_i=-1; int32_t i; char base, qual; if(0 != pass_filters1(g, n, cutoffs, max_total_coverage)) { return -1; } { // add the start node to the heap // Get first base if(SRMA_SPACE_CS == space) { base = nt2int_table[(int)colors[1]]; qual = color_qualities[0]; } else { if(strand) { base = nt4bit_to_int[bam1_seqi(bam1_seq(b), b->core.l_qseq-1)]; qual = bam1_qual(b)[b->core.l_qseq-1] + 33; } else { base = nt4bit_to_int[bam1_seqi(bam1_seq(b), 0)]; qual = bam1_qual(b)[0] + 33; } } sw_node_i = sw_heap_get_node_i(heap); sw_node_init(&heap->nodes[sw_node_i], NULL, n, n->coverage, base, qual, use_qualities, space); sw_heap_add_i(heap, sw_node_i); } sw_node_cur_i = sw_heap_poll_i(heap); assert(0 <= sw_node_cur_i); // DEBUG while(0 <= sw_node_cur_i) { //fprintf(stderr, "sw_node_cur_i=%d\n", sw_node_cur_i); // DEBUG if(max_heap_size <- heap->queue_end - heap->queue_start + 1) { // too many to consider //fprintf(stderr, "NOT BOUNDED 2\n"); // DEBUG return -1; } sw_node_next_i = sw_heap_peek_i(heap); assert(0 <= sw_node_cur_i); // DEBUG while(NODE_INSERTION != __node_type(heap->nodes[sw_node_cur_i].node) && 0 <= sw_node_next_i && 0 == sw_node_compare(&heap->nodes[sw_node_cur_i], &heap->nodes[sw_node_next_i], heap->type)) { if(heap->nodes[sw_node_cur_i].score < heap->nodes[sw_node_next_i].score || (heap->nodes[sw_node_cur_i].score == heap->nodes[sw_node_next_i].score && heap->nodes[sw_node_cur_i].coverage_sum < heap->nodes[sw_node_next_i].coverage_sum)) { sw_node_cur_i = sw_heap_poll_i(heap); } else { // ignore the next node sw_heap_poll_i(heap); } sw_node_next_i = sw_heap_peek_i(heap); } sw_node_next_i = -1; // DEBUG /* fprintf(stderr, "read_offset=%d 
l_qseq-1=%d\n", heap->nodes[sw_node_cur_i].read_offset, b->core.l_qseq-1); */ if(heap->nodes[sw_node_cur_i].read_offset == b->core.l_qseq-1) { // found, keep best if(sw_node_best_i < 0 || heap->nodes[sw_node_best_i].score < heap->nodes[sw_node_cur_i].score || (heap->nodes[sw_node_best_i].score == heap->nodes[sw_node_cur_i].score && heap->nodes[sw_node_best_i].coverage_sum < heap->nodes[sw_node_cur_i].coverage_sum)) { sw_node_best_i = sw_node_cur_i; } } else { edge_list_t *list = NULL; if(1 == strand) { // reverse list = heap->nodes[sw_node_cur_i].node->prev; } else { list = heap->nodes[sw_node_cur_i].node->next; } { // get the aligned base and quality // do not use color space data for bounding if(strand) { base = nt4bit_to_int[bam1_seqi(bam1_seq(b), b->core.l_qseq-1-heap->nodes[sw_node_cur_i].read_offset-1)]; qual = bam1_qual(b)[b->core.l_qseq-1-heap->nodes[sw_node_cur_i].read_offset-1] + 33; } else { base = nt4bit_to_int[bam1_seqi(bam1_seq(b), (heap->nodes[sw_node_cur_i].read_offset+1))]; qual = bam1_qual(b)[(heap->nodes[sw_node_cur_i].read_offset+1)] + 33; } } //fprintf(stderr, "list->length=%d\n", list->length); // DEBUG for(i=0;i<list->length;i++) { node_t *node_cur= list->nodes[i]; // DEBUG /* fprintf(stderr, "%d:%d __node_base(node_cur)=%d base=%d __node_type(node_cur)=%d coverages_cur=%d\n", node_cur->contig, node_cur->position, __node_base(node_cur), base, __node_type(node_cur), list->coverages[i]); */ // base should match unless filters don't pass if(__node_base(node_cur) == base) { uint16_t coverage_cur = list->coverages[i]; int32_t pass = pass_filters(g, node_cur, coverage_cur, cutoffs, max_total_coverage); //fprintf(stderr, "pass=%d\n", pass); // DEBUG if(0 == pass) { if(SRMA_SPACE_CS == space) { // use color space data base = nt2int_table[(int)colors[1 + (heap->nodes[sw_node_cur_i].read_offset+1)]]; qual = color_qualities[heap->nodes[sw_node_cur_i].read_offset+1]; } // add to the heap sw_node_i = sw_heap_get_node_i(heap); // DEBUG assert(0 <= 
sw_node_cur_i); assert(0 <= heap->nodes[sw_node_cur_i].read_offset); sw_node_init(&heap->nodes[sw_node_i], &heap->nodes[sw_node_cur_i], node_cur, coverage_cur, base, qual, use_qualities, space); sw_heap_add_i(heap, sw_node_i); } else if(pass < 0) { //fprintf(stderr, "NOT BOUNDED 3\n"); // DEBUG return -1; } } } } // get the next node sw_node_cur_i = sw_heap_poll_i(heap); } //fprintf(stderr, "BOUNDED %d\n", sw_node_best_i); // DEBUG return sw_node_best_i; }
// TODO soft clipping bam1_t *sw_align_update_bam(bam1_t *bam_old, char *rg_id, sw_heap_t *heap, int32_t sw_node_best_i, uint8_t space, char *colors, char *color_qualities, uint8_t strand, uint8_t correct_bases) { bam1_t *bam_new=NULL; int32_t sw_node_cur_i=-1, sw_node_prev_i=-1; int32_t i; int32_t cigar_cur_op, cigar_prev_op; int32_t cigar_cur_length, cigar_prev_length; uint32_t read_index; char *color_errors = NULL; if(sw_node_best_i < 0) { // none found, do not modify alignment return bam_old; } bam_new = srma_calloc(1, sizeof(bam1_t), __func__, "bam_new"); if(1 == strand) { read_index = 0; } else { read_index = bam_old->core.l_qseq-1; } { // query name bam_new->core.l_qname = bam_old->core.l_qname; bam_new->data_len += bam_new->core.l_qname; sw_align_bam_alloc_data(bam_new, bam_new->data_len); memcpy(bam1_qname(bam_new), bam1_qname(bam_old), bam_old->core.l_qname); } { // flag bam_new->core.flag = bam_old->core.flag; } { // tid, pos, qual bam_new->core.tid = heap->nodes[sw_node_best_i].node->contig-1; // it is one-based, we want zero-based if(1 == strand) { // reverse strand bam_new->core.pos = heap->nodes[sw_node_best_i].node->position-1; } else { bam_new->core.pos = heap->nodes[sw_node_best_i].start_position-1; // zero-based } bam_new->core.qual = bam_old->core.qual; // should we change the mapping quality? 
bam_new->core.mtid = -1; bam_new->core.mpos = -1; bam_new->core.isize = 0; } { // cigar length bam_new->core.n_cigar = 0; cigar_cur_op = cigar_prev_op = -1; sw_node_cur_i = sw_node_best_i; while(0 <= sw_node_cur_i) { if(0 <= sw_node_prev_i && BAM_CDEL == cigar_prev_op && 1 < fabs(heap->nodes[sw_node_cur_i].node->position - heap->nodes[sw_node_prev_i].node->position)) { cigar_cur_op = BAM_CDEL; } else { switch(__node_type(heap->nodes[sw_node_cur_i].node)) { case NODE_MATCH: case NODE_MISMATCH: cigar_cur_op = BAM_CMATCH; break; case NODE_INSERTION: cigar_cur_op = BAM_CINS; break; default: srma_error(__func__, "unknown node type", Exit, OutOfRange); } } if(cigar_prev_op != cigar_cur_op) { // update the previous cigar operator cigar_prev_op = cigar_cur_op; bam_new->core.n_cigar++; } // Update if(BAM_CDEL != cigar_cur_op) { sw_node_prev_i = sw_node_cur_i; sw_node_cur_i = heap->nodes[sw_node_cur_i].prev_i; } } } { // cigar and seq uint32_t *cigar_ptr=NULL; uint8_t *seq_ptr=NULL; uint32_t cigar_i = 0; // cigar bam_new->data_len += bam_new->core.n_cigar*sizeof(uint32_t); sw_align_bam_alloc_data(bam_new, bam_new->data_len); cigar_ptr = bam1_cigar(bam_new); // seq bam_new->core.l_qseq = bam_old->core.l_qseq; bam_new->data_len += (bam_new->core.l_qseq + 1)/2; sw_align_bam_alloc_data(bam_new, bam_new->data_len); seq_ptr = bam1_seq(bam_new); // fill in cigar and seq cigar_i = (1 == strand) ? 
bam_new->core.n_cigar-1 : 0; cigar_cur_op = cigar_prev_op = -1; cigar_cur_length = cigar_prev_length = -1; sw_node_cur_i = sw_node_best_i; while(0 <= sw_node_cur_i) { if(0 <= sw_node_prev_i && BAM_CDEL == cigar_prev_op && 1 < fabs(heap->nodes[sw_node_cur_i].node->position - heap->nodes[sw_node_prev_i].node->position)) { cigar_cur_op = BAM_CDEL; } else { switch(__node_type(heap->nodes[sw_node_cur_i].node)) { case NODE_MATCH: case NODE_MISMATCH: cigar_cur_op = BAM_CMATCH; break; case NODE_INSERTION: cigar_cur_op = BAM_CINS; break; default: srma_error(__func__, "unknown node type", Exit, OutOfRange); } // pack sequence if(1 == strand && 0 == read_index%2) { seq_ptr[read_index/2] = 0; } else if(0 == strand && 1 == read_index%2) { seq_ptr[read_index/2] = 0; } // DEBUG /* fprintf(stderr, "read_index=%d base=%d\n", read_index, __node_base(heap->nodes[sw_node_cur_i].node)); */ seq_ptr[read_index/2] |= int_to_nt4bit[__node_base(heap->nodes[sw_node_cur_i].node)] << 4*(1-(read_index%2)); if(1 == strand) { read_index++; } else { read_index--; } } if(cigar_prev_op != cigar_cur_op) { // add the previous cigar operator if(-1 != cigar_prev_op) { bam1_cigar(bam_new)[cigar_i] = (cigar_prev_length << BAM_CIGAR_SHIFT) | cigar_prev_op; if(1 == strand) { // reverse strand cigar_i--; } else { cigar_i++; } } // update the previous cigar operator cigar_prev_op = cigar_cur_op; if(cigar_cur_op == BAM_CDEL) { // deletion length cigar_prev_length = (int)fabs(heap->nodes[sw_node_cur_i].node->position - heap->nodes[sw_node_cur_i].node->position) - 1; } else { cigar_prev_length = 1; } } else { cigar_prev_length++; } // Update if(BAM_CDEL != cigar_cur_op) { sw_node_prev_i = sw_node_cur_i; sw_node_cur_i = heap->nodes[sw_node_cur_i].prev_i; } } if(0 < cigar_prev_length) { if(-1 == cigar_prev_op || BAM_CDEL == cigar_prev_op) { srma_error(__func__, "Alignment ended with a null cigar or a deletion", Exit, OutOfRange); } bam1_cigar(bam_new)[cigar_i] = (cigar_prev_length << BAM_CIGAR_SHIFT) | 
cigar_prev_op; // DEBUG if(1 == strand) { // reverse strand assert(cigar_i == 0); } else { assert(cigar_i == bam_new->core.n_cigar-1); } } } { // qualities uint8_t *qual_ptr = NULL; char qual, q1, q2; uint8_t prev_base = 0, next_base; bam_new->data_len += bam_new->core.l_qseq; sw_align_bam_alloc_data(bam_new, bam_new->data_len); qual_ptr = bam1_qual(bam_new); if(space == SRMA_SPACE_CS) { color_errors = srma_malloc(sizeof(char)*(1 + bam_new->core.l_qseq), __func__, "color_errors"); prev_base = nt2int_table[(int)colors[0]]; for(i=0;i<bam_new->core.l_qseq;i++) { if(0 == strand) { next_base = nt4bit_to_int[bam1_seqi(bam1_seq(bam_new), i)]; } else { next_base = nt4bit_to_int[bam1_seqi(bam1_seq(bam_new), bam_new->core.l_qseq-i-1)]; if(next_base < 4) next_base = 3 - next_base; } if((prev_base ^ next_base) == nt2int_table[(int)colors[i+1]]) { color_errors[i] = '-'; } else { color_errors[i] = colors[i+1]; } prev_base = next_base; } color_errors[i]='\0'; // Get new base qualities based on color qualities for(i=0;i<bam_new->core.l_qseq;i++) { // use MAQ 0.7.1 conversion if(i == bam_new->core.l_qseq-1) { qual = srma_char2qual(color_qualities[i]); } else { int m1, m2; if(0 == strand) { // forward m1 = ('-' == color_errors[i]) ? 1 : 0; m2 = ('-' == color_errors[i+1]) ? 1 : 0; q1 = color_qualities[i]; q2 = color_qualities[i+1]; } else { m1 = ('-' == color_errors[bam_new->core.l_qseq-i-1]) ? 1 : 0; m2 = ('-' == color_errors[bam_new->core.l_qseq-i-2]) ? 
1 : 0; q1 = color_qualities[bam_new->core.l_qseq-i-1]; q2 = color_qualities[bam_new->core.l_qseq-i-2]; } if(1 == m1 && 1 == m2) { qual = srma_char2qual(q1) + srma_char2qual(q2) + 10; } else if(1 == m1) { qual = srma_char2qual(q1) - srma_char2qual(q2); } else if(1 == m2) { qual = srma_char2qual(q2) - srma_char2qual(q1); } else { qual = 1; } } if(0 == strand) { bam1_qual(bam_new)[i] = __bound_qual(qual); } else { bam1_qual(bam_new)[bam_new->core.l_qseq-i-1] = __bound_qual(qual); } } } else if(1 == correct_bases) { // Get new base qualities for(i=0;i<bam_new->core.l_qseq;i++) { if(bam1_seqi(bam1_seq(bam_new), i) == bam1_seqi(bam1_seq(bam_old), i)) { bam1_qual(bam_new)[i] = bam1_qual(bam_old)[i]; } else { qual = srma_char2qual(bam1_qual(bam_old)[i]) - 33; bam1_qual(bam_new)[i] = srma_qual2char(__bound_qual(qual - SRMA_CORRECT_BASE_QUALITY_PENALTY)); } } } else { // Copy old quality memcpy(bam1_qual(bam_new), bam1_qual(bam_old), bam_new->core.l_qseq); } } // TODO soft-clipping { // Add in any auxiliary data as necessary uint8_t *s; int32_t i = 0; bam_new->l_aux = 0; while(NULL != sw_align_save_tags[i]) { __copy_old(sw_align_save_tags[i]); i++; } // TODO // PG // TODO: is AS correct bam_aux_append(bam_new, "AS", 'i', sizeof(uint32_t), (uint8_t*)&heap->nodes[sw_node_best_i].score); if(1 == correct_bases) { int32_t l = bam_old->core.l_qseq; char *str; str = srma_malloc(sizeof(char)*(l+1), __func__, "seq"); for(i=0;i<l;i++) { str[i] = bam_nt16_rev_table[bam1_seqi(bam1_seq(bam_old), i)]; } str[i] = '\0'; bam_aux_append(bam_new, "XO", 'Z', l+1, (uint8_t*)str); for(i=0;i<l;i++) { str[i] = bam1_qual(bam_old)[i] + 33; } str[i] = '\0'; bam_aux_append(bam_new, "XQ", 'Z', l+1, (uint8_t*)str); free(str); } bam_aux_append(bam_new, "XC", 'i', sizeof(uint32_t), (uint8_t*)&heap->nodes[sw_node_best_i].coverage_sum); if(space == SRMA_SPACE_CS) { bam_aux_append(bam_new, "XE", 'Z', bam_new->core.l_qseq+1, (uint8_t*)color_errors); } } // destroy the old bam structure bam_destroy1(bam_old); 
if(space == SRMA_SPACE_CS) { free(color_errors); } return bam_new; }
/* Count matches (OP_MATCH), mismatches (OP_MISMATCH), insertions * (OP_INS) and deletions (OP_DEL) for an aligned read. Written to * (preallocated, size 4) counts at indices given above. Will ignore * all mis-/match bases if their bq is below min_bq. * * Returns the total number of operations counted (excl. clipped bases * or those with bq<min_bq) or -1 on error. Consecutive indels are * counted as one operation, using INDEL_QUAL_DEFAULT, which is * suboptimal. 0 is a valid return value, e.g. if all bases are below * the quality threshold. * * If quals is not NULL it will be used as a two dim array (has to be * preallocated) with OPs as first dim (len NUM_OP_CATS) and the * qualities of the bases as second dim. NOTE/FIXME: this uses bq for * mis/matches and INDEL_QUAL_DEFAULT for now in case of indels. The * number of elements corresponds to the count entry and can be at max * readlen. * * If target is non-NULL will ignore preloaded variant positions via * var_in_ign_list * * WARNING code duplication with calc_read_alnerrprof but merging the * two functions was too complicated (and the latter is unused anyway) */ int count_cigar_ops(int *counts, int **quals, const bam1_t *b, const char *ref, int min_bq, char *target) { #if 0 #define TRACE 1 #endif int num_ops = 0; /* modelled after bam.c:bam_calend(), bam_format1_core() and * pysam's aligned_pairs (./pysam/csamtools.pyx) */ uint32_t *cigar = bam1_cigar(b); const bam1_core_t *c = &b->core; uint32_t tpos = c->pos; /* pos on genome */ uint32_t qpos = 0; /* pos on read/query */ uint32_t k, i; #if 0 int32_t qlen = (int32_t) bam_cigar2qlen(c, cigar); /* read length */ #else int qlen = b->core.l_qseq; /* read length */ #endif if (! ref) { return -1; } if (! 
counts) { return -1; } memset(counts, 0, NUM_OP_CATS*sizeof(int)); /* loop over cigar to get aligned bases * * read: bam_format1_core(NULL, b, BAM_OFDEC); */ for (k=0; k < c->n_cigar; ++k) { /* n_cigar: number of cigar operations */ int op = cigar[k] & BAM_CIGAR_MASK; /* the cigar operation */ uint32_t l = cigar[k] >> BAM_CIGAR_SHIFT; /* following conditionals could be collapsed to much shorter * code, but we keep them roughly as they were in pysam's * aligned_pairs to make later comparison and handling of * indels easier */ if (op == BAM_CMATCH || op == BAM_CDIFF) { for (i=tpos; i<tpos+l; i++) { int actual_op; assert(qpos < qlen); char ref_nt = ref[i]; char read_nt = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), qpos)]; int bq = bam1_qual(b)[qpos]; if (ref_nt != read_nt || op == BAM_CDIFF) { actual_op = OP_MISMATCH; } else { actual_op = OP_MATCH; } /* ignoring base if below min_bq, independent of type */ if (bq<min_bq) { #ifdef TRACE fprintf(stderr, "TRACE(%s): [M]MATCH ignoring base because of bq=%d at %d (qpos %d)\n", bam1_qname(b), bq, i, qpos); #endif qpos += 1; continue; } /* for mismatches only */ if (target && actual_op == OP_MISMATCH) { var_t fake_var; memset(&fake_var, 0, sizeof(var_t)); fake_var.chrom = target; fake_var.pos = i; /* FIXME evil, evil hack. only works as long as var_in_ign_list only uses chrom and pos */ if (var_in_ign_list(&fake_var)) { #ifdef TRACE fprintf(stderr, "TRACE(%s): MM: ignoring because in ign list at %d (qpos %d)\n", bam1_qname(b), i, qpos); #endif qpos += 1; continue; } } #ifdef TRACE fprintf(stderr, "TRACE(%s): adding [M]MATCH qpos,tpos,ref,read,bq = %d,%d,%c,%c,%d\n", bam1_qname(b), qpos, tpos, ref_nt, read_nt, bq); #endif counts[actual_op] += 1; if (quals) { quals[actual_op][counts[actual_op]-1] = bq; } qpos += 1; } tpos += l; } else if (op == BAM_CINS || op == BAM_CDEL) { if (target) { /* vcf: * indel at tpos 1 means, that qpos 2 is an insertion (e.g. A to AT) * del at tpos 1 means, that qpos 2 is missing (e.g. 
AT to A) */ var_t fake_var; fake_var.chrom = target; fake_var.pos = tpos; if (op==BAM_CINS) { fake_var.pos -= 1; } /* FIXME see above: only works as long as var_in_ign_list only uses chrom and pos */ if (var_in_ign_list(&fake_var)) { if (op == BAM_CINS) { qpos += l; } #ifdef TRACE fprintf(stderr, "TRACE(%s): %c: ignoring because in ign list at tpos %d (qpos %d)\n", bam1_qname(b), op == BAM_CINS? 'I':'D', tpos, qpos); #endif continue; } } #ifdef TRACE fprintf(stderr, "TRACE(%s): adding %c qpos,tpos = %d,%d\n", bam1_qname(b), op==BAM_CINS?'I':'D', qpos, tpos); #endif if (op == BAM_CINS) { counts[OP_INS] += 1; /* counts indel as 1 operation only */ if (quals) { quals[OP_INS][counts[OP_INS]-1] = INDEL_QUAL_DEFAULT; /* FIXME use iq */ } qpos += l;/* forward query pos by length of operation */ } else if (op == BAM_CDEL) { counts[OP_DEL] += 1; /* counts indel as 1 operation only */ if (quals) { quals[OP_DEL][counts[OP_DEL]-1] = INDEL_QUAL_DEFAULT; /* FIXME use dq */ } tpos += l; /* forward genome pos by length of operation */ } else { LOG_FATAL("%s\n", "INTERNAL ERROR: should never get here"); exit(1); } } else if (op == BAM_CREF_SKIP) { tpos += l; } else if (op == BAM_CSOFT_CLIP) { #if 0 printf("SOFT CLIP qpos = %d\n", qpos); #endif qpos += l; } else if (op != BAM_CHARD_CLIP) { LOG_WARN("Untested op %d in cigar %s\n", op, cigar_str_from_bam(b)); /* don't think we need to do anything here */ } } /* for k */ assert(qpos == bam_calend(&b->core, bam1_cigar(b))); /* FIXME correct assert? what if hard clipped? */ if (qpos != qlen) { LOG_WARN("got qpos=%d and qlen=%d for cigar %s l_qseq %d in read %s\n", qpos, qlen, cigar_str_from_bam(b), b->core.l_qseq, bam1_qname(b)); } assert(qpos == qlen); num_ops = 0; for (i=0; i<NUM_OP_CATS; i++) { num_ops += counts[i]; #ifdef TRACE int j; for (j=0; j<counts[i]; j++) { fprintf(stderr, "TRACE(%s) op %s #%d: %d\n", bam1_qname(b), op_cat_str[i], j, quals[i][j]); } #endif } return num_ops; }
static int mpileup(mplp_conf_t *conf, int n, char **fn) { extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list); extern void bcf_call_del_rghash(void *rghash); mplp_aux_t **data; int i, tid, pos, *n_plp, tid0 = -1, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid = -1, max_depth, max_indel_depth; const bam_pileup1_t **plp; bam_mplp_t iter; bam_header_t *h = 0; char *ref; void *rghash = 0; bcf_callaux_t *bca = 0; bcf_callret1_t *bcr = 0; bcf_call_t bc; bcf_t *bp = 0; bcf_hdr_t *bh = 0; bam_sample_t *sm = 0; kstring_t buf; mplp_pileup_t gplp; memset(&gplp, 0, sizeof(mplp_pileup_t)); memset(&buf, 0, sizeof(kstring_t)); memset(&bc, 0, sizeof(bcf_call_t)); data = calloc(n, sizeof(void*)); plp = calloc(n, sizeof(void*)); n_plp = calloc(n, sizeof(int*)); sm = bam_smpl_init(); // read the header and initialize data for (i = 0; i < n; ++i) { bam_header_t *h_tmp; data[i] = calloc(1, sizeof(mplp_aux_t)); data[i]->fp = strcmp(fn[i], "-") == 0? bam_dopen(fileno(stdin), "r") : bam_open(fn[i], "r"); data[i]->conf = conf; h_tmp = bam_header_read(data[i]->fp); data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 
0 : h_tmp->text); rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list); if (conf->reg) { int beg, end; bam_index_t *idx; idx = bam_index_load(fn[i]); if (idx == 0) { fprintf(stderr, "[%s] fail to load index for %d-th input.\n", __func__, i+1); exit(1); } if (bam_parse_region(h_tmp, conf->reg, &tid, &beg, &end) < 0) { fprintf(stderr, "[%s] malformatted region or wrong seqname for %d-th input.\n", __func__, i+1); exit(1); } if (i == 0) tid0 = tid, beg0 = beg, end0 = end; data[i]->iter = bam_iter_query(idx, tid, beg, end); bam_index_destroy(idx); } if (i == 0) h = h_tmp; else { // FIXME: to check consistency bam_header_destroy(h_tmp); } } gplp.n = sm->n; gplp.n_plp = calloc(sm->n, sizeof(int)); gplp.m_plp = calloc(sm->n, sizeof(int)); gplp.plp = calloc(sm->n, sizeof(void*)); fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n); // write the VCF header if (conf->flag & MPLP_GLF) { kstring_t s; bh = calloc(1, sizeof(bcf_hdr_t)); s.l = s.m = 0; s.s = 0; bp = bcf_open("-", (conf->flag&MPLP_NO_COMP)? 
"wu" : "w"); for (i = 0; i < h->n_targets; ++i) { kputs(h->target_name[i], &s); kputc('\0', &s); } bh->l_nm = s.l; bh->name = malloc(s.l); memcpy(bh->name, s.s, s.l); s.l = 0; for (i = 0; i < sm->n; ++i) { kputs(sm->smpl[i], &s); kputc('\0', &s); } bh->l_smpl = s.l; bh->sname = malloc(s.l); memcpy(bh->sname, s.s, s.l); bh->txt = malloc(strlen(BAM_VERSION) + 64); bh->l_txt = 1 + sprintf(bh->txt, "##samtoolsVersion=%s\n", BAM_VERSION); free(s.s); bcf_hdr_sync(bh); bcf_hdr_write(bp, bh); bca = bcf_call_init(-1., conf->min_baseQ); bcr = calloc(sm->n, sizeof(bcf_callret1_t)); bca->rghash = rghash; bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ; bca->min_frac = conf->min_frac; bca->min_support = conf->min_support; } if (tid0 >= 0 && conf->fai) { // region is set ref = faidx_fetch_seq(conf->fai, h->target_name[tid0], 0, 0x7fffffff, &ref_len); ref_tid = tid0; for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid0; } else ref_tid = -1, ref = 0; iter = bam_mplp_init(n, mplp_func, (void**)data); max_depth = conf->max_depth; if (max_depth * sm->n > 1<<20) fprintf(stderr, "(%s) Max depth is above 1M. 
Potential memory hog!\n", __func__); if (max_depth * sm->n < 8000) { max_depth = 8000 / sm->n; fprintf(stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth); } max_indel_depth = conf->max_indel_depth * sm->n; bam_mplp_set_maxcnt(iter, max_depth); int storeSize = 100; int delStore[2][100] = {{0},{0}}; typedef char * mstring; while (bam_mplp_auto(iter, &tid, &pos, n_plp, plp) > 0) { if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue; if (tid != ref_tid) { free(ref); ref = 0; if (conf->fai) ref = faidx_fetch_seq(conf->fai, h->target_name[tid], 0, 0x7fffffff, &ref_len); for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid; ref_tid = tid; } if (conf->flag & MPLP_GLF) { int total_depth, _ref0, ref16; bcf1_t *b = calloc(1, sizeof(bcf1_t)); for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i]; group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG); _ref0 = (ref && pos < ref_len)? ref[pos] : 'N'; ref16 = bam_nt16_table[_ref0]; for (i = 0; i < gplp.n; ++i) bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i); bcf_call_combine(gplp.n, bcr, ref16, &bc); bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0, (conf->flag&MPLP_FMT_SP), 0, 0); bcf_write(bp, bh, b); bcf_destroy(b); // call indels if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) { for (i = 0; i < gplp.n; ++i) bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i); if (bcf_call_combine(gplp.n, bcr, -1, &bc) >= 0) { b = calloc(1, sizeof(bcf1_t)); bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0, (conf->flag&MPLP_FMT_SP), bca, ref); bcf_write(bp, bh, b); bcf_destroy(b); } } } else { printf("%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? 
ref[pos] : 'N'); for (i = 0; i < n; ++i) { int j; printf("\t%d\t", n_plp[i]); if (n_plp[i] == 0) { printf("*\t*"); // FIXME: printf() is very slow... if (conf->flag & MPLP_PRINT_POS) printf("\t*"); } else { //MDW start //for each position in the pileup column int charLen = 16; int countChars[ charLen ][2]; int countiChars[ charLen ][2]; int countGap[2]={0,0}; //double qvTotal=0; int numStruck=0; int numGood=0; int tti; int ttj; mstring insAllele[100]; int insAlleleCnt[100]; int sf=0; int flag=0; //typedef char * string; char insStr0[10000]; int iCnt0=0; char insStr1[10000]; int iCnt1=0; char delStr0[10000]; int dCnt0=0; char delStr1[10000]; int dCnt1=0; float qposP[10000]; int qposCnt=0; //initialize with zeros for(tti=0;tti<charLen;tti++){ countChars[tti][0]=0; countChars[tti][1]=0; } // define repeat length here; look back up to 10 prior positions // start one position away. int replC=0; // for(tti=1;tti<=15;tti++){ // check for greater than zero if(toupper(ref[pos-1])==toupper(ref[pos-tti])){ replC++; }else{ // breaks the chain at first non identical to current position not strict homopolymer break; } } int reprC=0; // for(tti=1;tti<=15;tti++){ // check for greater than zero if(toupper(ref[pos+1])==toupper(ref[pos+tti])){ reprC++; }else{ // breaks the chain at first non identical to current position not strict homopolymer break; } } int repT = replC; if(replC < reprC){ repT=reprC; } for (j = 0; j < n_plp[i]; ++j){ const bam_pileup1_t *p = plp[i] + j; /* SAME LOGIC AS pileup_seq() */ if(p->is_refskip){ // never count intron gaps in numStruck continue; } if(p->is_del){ // skip deletion gap, after first position which is the first aligned char continue; } if( p->b->core.qual < conf->min_mqToCount || // mapping quality conf->maxrepC < (repT) || // max homopolymer run, this will not (!p->is_del && bam1_qual(p->b)[p->qpos] < conf->min_baseQ) || // base quality for matches p->alignedQPosBeg <= (conf->trimEnd ) || p->alignedQPosEnd <= (conf->trimEnd ) || // trimEnd is 
1-based p->zf == 1 || // fusion tag p->ih > conf->maxIH || // max hit index (p->nmd > conf->maxNM) || // max mismatch (conf->flagFilter == 1 && !(p->b->core.flag&BAM_FPROPER_PAIR)) || // optionally keep only proper pairs (conf->flagFilter == 2 && p->b->core.flag&BAM_FSECONDARY) || // optionally strike secondary (conf->flagFilter == 3 && p->b->core.flag&BAM_FDUP) || // optionally strike dup (conf->flagFilter == 4 && (p->b->core.flag&BAM_FDUP || p->b->core.flag&BAM_FSECONDARY)) || // optionally strike secondary or dup (conf->flagFilter == 5 && (p->b->core.flag&BAM_FDUP || p->b->core.flag&BAM_FSECONDARY || p->b->core.flag&BAM_FQCFAIL || !(p->b->core.flag&BAM_FPROPER_PAIR) )) // optionally strike secondary, dup and QCfail ){ numStruck++; continue; } //printf("repT=%d: %d %c %c %c %c \n",repT,p->indel,ref[pos],ref[pos-1],ref[pos-2],ref[pos-3]); if(!p->is_del && p->indel==0){ countChars[ bam1_seqi(bam1_seq(p->b), p->qpos) ][ bam1_strand(p->b) ] ++; numGood++; }else if(p->is_refskip){ countGap[ bam1_strand(p->b) ]++; } if(p->indel<0){ numGood++; if(bam1_strand(p->b) ==0){ for(tti=1;tti<= -p->indel; tti++) { // current spot, starting at 0 in store, because indel<0 refers to next position delStr0[dCnt0] = ref[pos+tti]; dCnt0++; } delStr0[dCnt0] = ','; dCnt0++; }else{ for(tti=1;tti<= -p->indel; tti++) { // current spot, starting at 0 in store, because indel<0 refers to next position delStr1[dCnt1] = ref[pos+tti]; dCnt1++; } delStr1[dCnt1] = ','; dCnt1++; } }else if(p->indel>0){ numGood++; if(bam1_strand(p->b) ==0){ for(tti=1;tti<= p->indel; tti++) { // current spot, starting at 0 in store, because indel<0 refers to next position insStr0[iCnt0] = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + tti)]; iCnt0++; } insStr0[iCnt0] = ','; iCnt0++; }else{ for(tti=1;tti<= p->indel; tti++) { // current spot, starting at 0 in store, because indel<0 refers to next position insStr1[iCnt1] = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + tti)]; iCnt1++; } insStr1[iCnt1] = 
','; iCnt1++; } } //calculate position of variant within aligned read - no soft clips if( toupper(ref[pos]) != toupper(bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]) || p->indel>0 || p->indel<0 ){ //distance to end; calculate distance to end of aligned read. removes soft clips. int distToEnd = (p->alignedQPosBeg < p->alignedQPosEnd) ? p->alignedQPosBeg : p->alignedQPosEnd; qposP[qposCnt] = distToEnd; qposCnt++; // printf("id=%s, pos=%d",bam1_qname(p->b),distToEnd); } } // //print A,C,G,T, by +/- printf("\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d", countChars[1][0],countChars[1][1], countChars[2][0],countChars[2][1], countChars[4][0],countChars[4][1], countChars[8][0],countChars[8][1], countChars[7][0],countChars[7][1]); putchar('\t'); for(tti=0;tti<dCnt0;tti++){ putchar(delStr0[tti]); } putchar('\t'); for(tti=0;tti<dCnt1;tti++){ putchar(delStr1[tti]); } putchar('\t'); for(tti=0;tti<iCnt0;tti++){ putchar(insStr0[tti]); } putchar('\t'); for(tti=0;tti<iCnt1;tti++){ putchar(insStr1[tti]); } printf("\t%d\t%d",numGood,numStruck); // get non-ref qpos variation float medqpos = -1; float medAbsDev = -1; if(qposCnt>0){ medqpos = median(qposCnt,qposP); float absDev[qposCnt]; for(tti=0;tti<qposCnt;tti++){ absDev[tti] = abs(medqpos - qposP[tti]); } medAbsDev = median(qposCnt-1,absDev); } printf("\t%f",medAbsDev); ///END MDW } } putchar('\n'); } } bcf_close(bp); bam_smpl_destroy(sm); free(buf.s); for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]); free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp); bcf_call_del_rghash(rghash); bcf_hdr_destroy(bh); bcf_call_destroy(bca); free(bc.PL); free(bcr); bam_mplp_destroy(iter); bam_header_destroy(h); for (i = 0; i < n; ++i) { bam_close(data[i]->fp); if (data[i]->iter) bam_iter_destroy(data[i]->iter); free(data[i]); } free(data); free(plp); free(ref); free(n_plp); return 0; }
/* Counts probability of non-match count along the read after * subtracting error prob at that position (using the original * orientation). used_pos is an array of ints indicating whether * position was used or not (trimmed, clipped etc). alnerrprof and * used_pos must be of at least length b->core.l_qseq. Note: will add * to alnerrprof and used_pos, i.e. arrays should be initialized to 0 if * you don't want aggregate values. * * WARNING code duplication with count_cigar_ops but merging the two * functions is messy. */ void calc_read_alnerrprof(double *alnerrprof, unsigned long int *used_pos, const bam1_t *b, const char *ref) { /* modelled after bam.c:bam_calend(), bam_format1_core() and * pysam's aligned_pairs (./pysam/csamtools.pyx) */ uint32_t *cigar = bam1_cigar(b); uint32_t k, i; const bam1_core_t *c = &b->core; #if 0 int32_t qlen = (int32_t) bam_cigar2qlen(c, cigar); /* read length */ #else int qlen = b->core.l_qseq; /* read length */ #endif uint32_t pos = c->pos; /* pos on genome */ uint32_t qpos = 0; /* pos on read/query */ uint32_t qpos_org = bam1_strand(b) ? 
qlen-qpos-1 : qpos;/* original qpos before mapping as possible reverse */ /* loop over cigar to get aligned bases * * read: bam_format1_core(NULL, b, BAM_OFDEC); */ for (k=0; k < c->n_cigar; ++k) { /* n_cigar: number of cigar operations */ int op = cigar[k] & BAM_CIGAR_MASK; /* the cigar operation */ uint32_t l = cigar[k] >> BAM_CIGAR_SHIFT; /* following conditionals could be collapsed to much shorter * code, but we keep them as they were in pysam's * aligned_pairs to make later handling of indels easier */ if (op == BAM_CMATCH || op == BAM_CDIFF) { for (i=pos; i<pos+l; i++) { assert(qpos < qlen); /* case agnostic */ char ref_nt = ref[i]; char read_nt = bam_nt16_rev_table[bam1_seqi(bam1_seq(b), qpos)]; int bq = bam1_qual(b)[qpos]; #if 0 printf("[M]MATCH qpos,i,ref,read = %d,%d,%c,%c\n", qpos, i, ref_nt, read_nt); #endif if (ref_nt != 'N') { if (ref_nt != read_nt || op == BAM_CDIFF) { alnerrprof[qpos_org] += (1.0 - PHREDQUAL_TO_PROB(bq)); } /* otherwise leave at 0.0 but count anyway */ used_pos[qpos_org] += 1; } qpos += 1; qpos_org = bam1_strand(b) ? qlen-qpos-1 : qpos; } pos += l; } else if (op == BAM_CINS) { for (i=pos; i<pos+l; i++) { assert(qpos < qlen); alnerrprof[qpos] += (1.0 - PHREDQUAL_TO_PROB(INDEL_QUAL_DEFAULT)); used_pos[qpos] += 1; #if 0 printf("INS qpos,i = %d,None\n", qpos); #endif qpos += 1; qpos_org = bam1_strand(b) ? qlen-qpos-1 : qpos; } } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) { for (i=pos; i<pos+l; i++) { #if 0 printf("DEL qpos,i = None,%d\n", i); #endif if (op == BAM_CDEL) { alnerrprof[qpos] += (1.0 - PHREDQUAL_TO_PROB(INDEL_QUAL_DEFAULT)); used_pos[qpos] += 1; } } pos += l; /* deletion: don't increase qpos */ } else if (op == BAM_CSOFT_CLIP) { #if 0 printf("SOFT CLIP qpos = %d\n", qpos); #endif qpos += l; qpos_org = bam1_strand(b) ? 
qlen-qpos-1 : qpos; } else if (op != BAM_CHARD_CLIP) { LOG_WARN("Unknown op %d in cigar %s\n", op, cigar_str_from_bam(b)); } } /* for k */ assert(pos == bam_calend(&b->core, bam1_cigar(b))); /* FIXME correct assert? what if hard clipped? */ if (qpos != qlen) { LOG_FIXME("got qpos=%d and qlen=%d for cigar %s l_qseq %d\n", qpos, qlen, cigar_str_from_bam(b), b->core.l_qseq); } assert(qpos == qlen); /* FIXME correct assert? What if hard clipped? */ #if 0 fprintf(stderr, "%s:", __FUNCTION__); for (i=0; i< b->core.l_qseq; i++) { fprintf(stderr, " %g/%d", alnerrprof[i], used_pos[i]); } fprintf(stderr, "\n"); #endif }
/**
 * Build a SamRead from a BAM record.
 *
 * Copies the record (including its data buffer), converts the mapping quality
 * and every base quality from Phred scale to a probability of being correct
 * (clamped to [1e-16, 1 - 1e-16]), decodes the read sequence, and, for mapped
 * reads, walks the CIGAR to fill `cigars`, the left-/right-most positions and
 * the indel/clip summary flags before calling parseVariants().
 *
 * @param b          source BAM record; not modified, deep-copied into `bam`
 * @param _bamHeader header stored in `bamHeader` (pointer is kept, not copied)
 * @param _fai       FASTA index stored in `fai` (pointer is kept, not copied)
 * @throws std::string("Phred error.") if a converted quality is outside
 *         [0, 1] or not finite
 */
SamRead::SamRead(const bam1_t *b, bam_header_t *_bamHeader, const faidx_t *_fai)
{
    fai = _fai;
    const bam1_core_t *c = &b->core;
    const uint32_t readLen = c->l_qseq;

    // mapping quality: Phred -> probability that the mapping is correct
    double mapPhred = (double) c->qual;
    mapQual = (1.0 - pow(10.0, -mapPhred / 10.0));
    if (mapQual < 0.0 || mapQual > 1.0 || std::isnan(mapQual) || std::isinf(mapQual)) {
        throw std::string("Phred error.");
    } else if (mapQual < 1e-16) {
        mapQual = 1e-16;
    } else if (mapQual > 1 - 1e-16) {
        mapQual = 1 - 1e-16;
    }

    pos = c->pos; // zero-based
    seq_name = (std::string)reinterpret_cast<char *>(bam1_qname(b));

    seq.reserve(readLen);
    qual.reserve(readLen);
    for (size_t x = 0; x < readLen; x++) {
        seq += (bam_nt16_rev_table[ bam1_seqi(bam1_seq(b), x) ]);

        // base quality: Phred -> probability that the base call is correct
        double basePhred = (double)(((uint8_t *) bam1_qual(b))[x]);
        double q = (1.0 - pow(10.0, -basePhred / 10.0));
        if (q < 0.0 || q > 1.0 || std::isnan(q) || std::isinf(q)) {
            throw std::string("Phred error.");
        }
        if (q < 1e-16) {
            q = 1e-16;
        }
        if (q > 1.0 - 1e-16) {
            q = 1.0 - 1e-16;
        }
        qual.push_back(q);
    }

    // deep copy of the record: struct by value, data buffer byte by byte
    bam = new bam1_t;
    *bam = *b;
    bam->data = new uint8_t[b->m_data];
    bam->m_data = b->m_data;
    for (int m = 0; m < b->m_data; m++) {
        bam->data[m] = b->data[m];
    }

    onReverseStrand = (bam->core.flag & BAM_FREVERSE) != 0;
    matePos = bam->core.mpos;
    mateLen = -1;
    this->bamHeader = _bamHeader;

    uint32_t *rawCigar = bam1_cigar(b);
    leftMostPos = pos;
    rightMostPos = getEndPos();

    // Give the CIGAR summary a defined value for unmapped reads as well; for
    // mapped reads these are the same starting values as before.
    hasIndel = false;
    hasHardClip = false;
    hasSoftClip = false;
    hasOtherCigarFlag = false;
    softClipSize = 0;

    if (!isUnmapped()) {
        rightMostPos = pos;
        bool isLeftMost = true; // true until the first non-hard-clip operation

        for (int k = 0; k < (int) c->n_cigar; ++k) {
            const int op = rawCigar[k] & BAM_CIGAR_MASK;
            const int32_t opLen = rawCigar[k] >> BAM_CIGAR_SHIFT;

            if (op == BAM_CMATCH) {
                cigars.push_back(CIGAR(CIGAR::MATCH, opLen));
                rightMostPos += opLen;
                isLeftMost = false;
            } else if (op == BAM_CINS) {
                cigars.push_back(CIGAR(CIGAR::INS, opLen));
                hasIndel = true;
                isLeftMost = false;
            } else if (op == BAM_CDEL) {
                cigars.push_back(CIGAR(CIGAR::DEL, opLen));
                rightMostPos += opLen;
                hasIndel = true;
                isLeftMost = false;
            } else if (op == BAM_CSOFT_CLIP) {
                rightMostPos += opLen;
                if (softClipSize < (int) opLen) {
                    softClipSize = (int) opLen; // keep the longest clip
                }
                hasSoftClip = true;
                cigars.push_back(CIGAR(CIGAR::SOFTCLIP, opLen));
                if (isLeftMost) {
                    // a leading soft clip extends the read to the left
                    leftMostPos -= opLen;
                    isLeftMost = false;
                }
            } else if (op == BAM_CHARD_CLIP) {
                hasHardClip = true;
            } else {
                hasOtherCigarFlag = true;
            }
        }
        parseVariants();
    }
}
/**
 * Collect per-alignment statistics from one BAM record.
 *
 * @param bam1 alignment to inspect
 * @param opts options; when opts->region_table is set, only alignments for
 *             which find_region() succeeds on [pos, pos + l_qseq) of their
 *             chromosome are reported
 * @return a new bam_stats_t from bam_stats_new(): with mapped = 0 and nothing
 *         else filled in for unmapped reads, fully filled in for mapped
 *         reads; NULL when a region table is given and the read is not in it
 */
bam_stats_t *bam1_stats(bam1_t *bam1, bam_stats_options_t *opts) {
  bam_stats_t *bam_stats = NULL;
  uint32_t bam_flag = (uint32_t) bam1->core.flag;

  if (bam_flag & BAM_FUNMAP) {
    // not mapped, then return
    bam_stats = bam_stats_new();
    bam_stats->mapped = 0;
    return bam_stats;
  }

  if (opts->region_table) {
    region_t region;
    region.chromosome = opts->sequence_labels[bam1->core.tid];
    region.start_position = bam1->core.pos;
    region.end_position = region.start_position + bam1->core.l_qseq;
    region.strand = NULL;
    region.type = NULL;

    if (!find_region(&region, opts->region_table)) {
      return NULL;
    }
  }
  bam_stats = bam_stats_new();

  // mapped !!
  bam_stats->mapped = 1;
  bam_stats->strand = (int) ((bam_flag & BAM_FREVERSE) > 0);

  // number of errors
  bam_stats->num_errors = bam_aux2i(bam_aux_get(bam1, "NM"));

  // cigar handling: number of indels and length
  uint32_t *cigar = bam1_cigar(bam1);
  int num_cigar_ops = (int) bam1->core.n_cigar;
  for (int j = 0; j < num_cigar_ops; j++) {
    uint32_t cigar_int = cigar[j];
    switch (cigar_int & BAM_CIGAR_MASK) {
    case BAM_CINS:  //I: insertion to the reference
    case BAM_CDEL:  //D: deletion from the reference
      bam_stats->num_indels++;
      bam_stats->indels_length += (cigar_int >> BAM_CIGAR_SHIFT);
      break;
    default:
      break;
    }
  }

  // quality
  bam_stats->quality = bam1->core.qual;

  // unique alignment
  if (!(bam_flag & BAM_FSECONDARY)) {
    bam_stats->unique_alignment = 1;
  }

  // handling pairs
  bam_stats->single_end = 1;
  if (bam_flag & BAM_FPAIRED) {
    bam_stats->single_end = 0;
    // unmapped reads returned above, so only the mapped_pair_* counters
    // can apply here
    if (bam_flag & BAM_FREAD1) {
      bam_stats->mapped_pair_1 = 1;
    } else {
      bam_stats->mapped_pair_2 = 1;
    }

    if (!(bam_flag & BAM_FMUNMAP) && (bam_flag & BAM_FPROPER_PAIR)) {
      bam_stats->isize = abs(bam1->core.isize);
    }
  }

  // mapping length
  uint8_t *bam_seq = bam1_seq(bam1);
  int seq_len = bam1->core.l_qseq;
  bam_stats->seq_length = seq_len;

  // nucleotide content (4-bit base codes; other ambiguity codes are ignored)
  for (int i = 0; i < seq_len; i++) {
    switch (bam1_seqi(bam_seq, i)) {
    case 1:
      bam_stats->num_As++;
      break;
    case 2:
      bam_stats->num_Cs++;
      break;
    case 4:
      bam_stats->num_Gs++;
      break;
    case 8:
      bam_stats->num_Ts++;
      break;
    case 15:
      bam_stats->num_Ns++;
      break;
    default:
      break;
    }
  }
  bam_stats->num_GCs = bam_stats->num_Gs + bam_stats->num_Cs;

  return bam_stats;
}
void printReadBuffered(aRead &rd,aHead *hd,kstring_t &str) { str.l = 0; if(bam_validate1(hd,rd)==0){ fprintf(stderr,"problems validateing\n"); exit(0); } kputsn((char *)rd.vDat,rd.l_qname-1,&str);kputc('\t', &str); kputw((int)rd.flag_nc>>16, &str); kputc('\t', &str); if(rd.refID==-1)//unmatched read kputc('*', &str); else kputs(hd->name[rd.refID] , &str); kputc('\t', &str); kputw(rd.pos+1, &str); kputc('\t', &str); kputw(rd.mapQ, &str);kputc('\t', &str); int nCigs = rd.nCig; if(nCigs==0) kputc('*', &str);// if no cigars else{ for (int i = 0; i < nCigs; ++i) {//print cigars uint32_t *cigs =getCig(&rd); kputw(cigs[i]>>BAM_CIGAR_SHIFT, &str); kputc("MIDNSHP"[cigs[i]&BAM_CIGAR_MASK], &str); } } kputc('\t', &str); if(rd.next_refID==-1) kputc('*', &str);// if no cigars else if(rd.refID==rd.next_refID) kputc('=', &str); else kputs(hd->name[rd.next_refID] , &str); kputc('\t', &str); kputw(rd.next_pos+1, &str); kputc('\t', &str); kputw(rd.tlen, &str); kputc('\t', &str); //start seq char *seq = (char *)getSeq(&rd); for(int i=0;i<rd.l_seq;i++) kputc(bam_nt16_rev_table2[bam1_seqi(seq, i)], &str); kputc('\t', &str); char *quals =(char *)getQuals(&rd); for(int i=0;i<rd.l_seq;i++) kputc(quals[i]+33, &str); //below is taken directly from samtools,(not to steal, to preserve ordering etc, all credits go where credit is due) //from aux start to the last memadrs in chunk printAuxBuffered(getAuxStart(&rd),rd.vDat+rd.block_size,str); kputc('\n', &str); }
/*
 * Read up to n_needed reads from the BAM stream in bs.
 *
 * Only records selected by bs->which are kept (bit 1: first in pair, bit 2:
 * second in pair, bit 4: neither flag set). Bases are converted to 2-bit
 * codes, qualities to Phred+33 capped at 126, and reverse-strand records are
 * turned back into their original orientation. If trim_qual >= 1 the reads
 * are quality-trimmed with bwa_trim_read(). is_comp is passed on to
 * seq_reverse() when building p->rseq.
 *
 * Stores the number of reads in *n and returns the array (to be released with
 * bwa_free_read_seq()), or 0 when no read was obtained. A read error other
 * than end-of-file is fatal.
 */
static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_comp, int trim_qual)
{
	bwa_seq_t *seqs, *p;
	int n_seqs, l, i;
	long n_trimmed = 0, n_tot = 0;
	bam1_t *b;
	int res;

	if (n_needed <= 0) { // nothing requested; avoid writing into an empty array
		*n = 0;
		return 0;
	}
	b = bam_init1();
	n_seqs = 0;
	seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t));
#ifdef USE_HTSLIB
	while ((res = sam_read1(bs->fp, bs->h, b)) >= 0) {
#else
	while ((res = bam_read1(bs->fp, b)) >= 0) {
#endif
		uint8_t *s, *q;
		int go = 0;
		if ((bs->which & 1) && (b->core.flag & BAM_FREAD1)) go = 1;
		if ((bs->which & 2) && (b->core.flag & BAM_FREAD2)) go = 1;
		if ((bs->which & 4) && !(b->core.flag& BAM_FREAD1) && !(b->core.flag& BAM_FREAD2))go = 1;
		if (go == 0) continue;
		l = b->core.l_qseq;
		p = &seqs[n_seqs++];
		p->tid = -1; // no assigned to a thread
		p->qual = 0;
		p->full_len = p->clip_len = p->len = l;
		n_tot += p->full_len;
#ifdef USE_HTSLIB
		s = bam_get_seq(b); q = bam_get_qual(b);
#else
		s = bam1_seq(b); q = bam1_qual(b);
#endif
		p->seq = (ubyte_t*)calloc(p->len + 1, 1);
		p->qual = (ubyte_t*)calloc(p->len + 1, 1);
		for (i = 0; i != p->full_len; ++i) {
#ifdef USE_HTSLIB
			p->seq[i] = bam_nt16_nt4_table[(int)bam_seqi(s, i)];
#else
			p->seq[i] = bam_nt16_nt4_table[(int)bam1_seqi(s, i)];
#endif
			p->qual[i] = q[i] + 33 < 126? q[i] + 33 : 126;
		}
#ifdef USE_HTSLIB
		if (bam_is_rev(b)) { // then reverse
#else
		if (bam1_strand(b)) { // then reverse
#endif
			seq_reverse(p->len, p->seq, 1);
			seq_reverse(p->len, p->qual, 0);
		}
		if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p);
		p->rseq = (ubyte_t*)calloc(p->full_len, 1);
		memcpy(p->rseq, p->seq, p->len);
		seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
		seq_reverse(p->len, p->rseq, is_comp);
#ifdef USE_HTSLIB
		p->name = strdup((const char*)bam_get_qname(b));
#else
		p->name = strdup((const char*)bam1_qname(b));
#endif
		if (n_seqs == n_needed) break;
	}
	if (res < 0 && res != -1) err_fatal_simple("Error reading bam file");
	*n = n_seqs;
	if (n_seqs && trim_qual >= 1 && n_tot > 0)
		fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot);
	bam_destroy1(b);
	if (n_seqs == 0) {
		free(seqs);
		return 0;
	}
	return seqs;
}

#define BARCODE_LOW_QUAL 13

/*
 * Read up to n_needed reads from bs (FASTA/FASTQ via kseq, or BAM when
 * bs->is_bam is set).
 *
 * mode carries the BWA_MODE_* bits plus the barcode length in its top 8 bits.
 * A barcode of that length is cut from the start of every read into p->bc
 * (lower-cased where the base quality is below BARCODE_LOW_QUAL); reads not
 * longer than the barcode are skipped. With BWA_MODE_CFY, reads whose comment
 * has 'Y' right after the first ':' are skipped. With BWA_MODE_IL13,
 * qualities are shifted down by 31. A trailing "/1" or "/2" is removed from
 * read names.
 *
 * Stores the number of reads in *n and returns the array (to be released with
 * bwa_free_read_seq()), or 0 when no read was obtained or the barcode length
 * exceeds BWA_MAX_BCLEN.
 */
bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, int trim_qual)
{
	bwa_seq_t *seqs, *p;
	kseq_t *seq = bs->ks;
	int n_seqs, l, i, is_comp = mode&BWA_MODE_COMPREAD, is_64 = mode&BWA_MODE_IL13, l_bc = mode>>24;
	long n_trimmed = 0, n_tot = 0;

	if (l_bc > BWA_MAX_BCLEN) {
		fprintf(stderr, "[%s] the maximum barcode length is %d.\n", __func__, BWA_MAX_BCLEN);
		*n = 0;
		return 0;
	}
	if (bs->is_bam) return bwa_read_bam(bs, n_needed, n, is_comp, trim_qual); // l_bc has no effect for BAM input
	if (n_needed <= 0) { // nothing requested; avoid writing into an empty array
		*n = 0;
		return 0;
	}
	n_seqs = 0;
	seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t));
	while ((l = kseq_read(seq)) >= 0) {
		if ((mode & BWA_MODE_CFY) && (seq->comment.l != 0)) {
			// skip reads that are marked to be filtered by Casava
			char *s = strchr(seq->comment.s, ':');
			if (s && s[1] == 'Y') {
				continue;
			}
		}
		if (is_64 && seq->qual.l)
			for (i = 0; i < seq->qual.l; ++i) seq->qual.s[i] -= 31;
		if (seq->seq.l <= l_bc) continue; // sequence length equals or smaller than the barcode length
		p = &seqs[n_seqs++];
		if (l_bc) { // then trim barcode
			for (i = 0; i < l_bc; ++i)
				p->bc[i] = (seq->qual.l && seq->qual.s[i]-33 < BARCODE_LOW_QUAL)
					? tolower((unsigned char)seq->seq.s[i])
					: toupper((unsigned char)seq->seq.s[i]);
			p->bc[i] = 0;
			for (; i < seq->seq.l; ++i)
				seq->seq.s[i - l_bc] = seq->seq.s[i];
			seq->seq.l -= l_bc; seq->seq.s[seq->seq.l] = 0;
			if (seq->qual.l) {
				for (i = l_bc; i < seq->qual.l; ++i)
					seq->qual.s[i - l_bc] = seq->qual.s[i];
				seq->qual.l -= l_bc; seq->qual.s[seq->qual.l] = 0;
			}
			l = seq->seq.l;
		} else p->bc[0] = 0;
		p->tid = -1; // no assigned to a thread
		p->qual = 0;
		p->full_len = p->clip_len = p->len = l;
		n_tot += p->full_len;
		p->seq = (ubyte_t*)calloc(p->full_len, 1);
		for (i = 0; i != p->full_len; ++i) // index as unsigned: bytes >= 0x80 must not go negative
			p->seq[i] = nst_nt4_table[(unsigned char)seq->seq.s[i]];
		if (seq->qual.l) { // copy quality
			p->qual = (ubyte_t*)strdup((char*)seq->qual.s);
			if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p);
		}
		p->rseq = (ubyte_t*)calloc(p->full_len, 1);
		memcpy(p->rseq, p->seq, p->len);
		seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
		seq_reverse(p->len, p->rseq, is_comp);
		p->name = strdup((const char*)seq->name.s);
		{ // trim /[12]$
			int t = strlen(p->name);
			if (t > 2 && p->name[t-2] == '/' && (p->name[t-1] == '1' || p->name[t-1] == '2')) p->name[t-2] = '\0';
		}
		if (n_seqs == n_needed) break;
	}
	*n = n_seqs;
	if (n_seqs && trim_qual >= 1 && n_tot > 0)
		fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot);
	if (n_seqs == 0) {
		free(seqs);
		return 0;
	}
	return seqs;
}

/* Release an array of n_seqs reads returned by bwa_read_seq(), including
 * every per-read buffer (name, sequences, qualities, alignments, MD, cigars). */
void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs)
{
	int i, j;
	for (i = 0; i != n_seqs; ++i) {
		bwa_seq_t *p = seqs + i;
		for (j = 0; j < p->n_multi; ++j)
			free(p->multi[j].cigar);
		free(p->name);
		free(p->seq); free(p->rseq); free(p->qual); free(p->aln); free(p->md); free(p->multi);
		free(p->cigar);
	}
	free(seqs);
}
/*
 * Read up to n_needed reads from the BAM stream in bs.
 *
 * Only records selected by bs->which are kept (bit 1: first in pair, bit 2:
 * second in pair, bit 4: neither flag set). Bases are converted to 2-bit
 * codes, qualities to Phred+33 capped at 126, and reverse-strand records are
 * turned back into their original orientation. If trim_qual >= 1 the reads
 * are quality-trimmed with bwa_trim_read(). is_comp is passed on to
 * seq_reverse() when building p->rseq.
 *
 * Stores the number of reads in *n and returns the array, or 0 when no read
 * was obtained. A bam_read1() result below -1 (anything other than a clean
 * end of file) is reported on stderr; reads obtained so far are still
 * returned.
 */
static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_comp, int trim_qual)
{
	bwa_seq_t *seqs, *p;
	int n_seqs = 0, l, i, res;
	long n_trimmed = 0, n_tot = 0;
	bam1_t *b;

	if (n_needed <= 0) { // nothing requested; avoid writing into an empty array
		*n = 0;
		return 0;
	}
	b = bam_init1();
	seqs = (bwa_seq_t*) calloc(n_needed, sizeof(bwa_seq_t));
	while ((res = bam_read1(bs->fp, b)) >= 0) {
		uint8_t *s, *q;
		const int is_read1 = (b->core.flag & BAM_FREAD1) != 0;
		const int is_read2 = (b->core.flag & BAM_FREAD2) != 0;
		const int wanted = ((bs->which & 1) && is_read1)
			|| ((bs->which & 2) && is_read2)
			|| ((bs->which & 4) && !is_read1 && !is_read2);
		if (!wanted) continue;

		l = b->core.l_qseq;
		p = &seqs[n_seqs++];
		p->tid = -1; // no assigned to a thread
		p->qual = 0;
		p->full_len = p->clip_len = p->len = l;
		n_tot += p->full_len;
		s = bam1_seq(b);
		q = bam1_qual(b);
		p->seq = (ubyte_t*) calloc(p->len + 1, 1);
		p->qual = (ubyte_t*) calloc(p->len + 1, 1);
		for (i = 0; i != p->full_len; ++i) {
			p->seq[i] = bam_nt16_nt4_table[(int) bam1_seqi(s, i)];
			p->qual[i] = q[i] + 33 < 126 ? q[i] + 33 : 126;
		}
		if (bam1_strand(b)) { // then reverse
			seq_reverse(p->len, p->seq, 1);
			seq_reverse(p->len, p->qual, 0);
		}
		if (trim_qual >= 1)
			n_trimmed += bwa_trim_read(trim_qual, p);
		p->rseq = (ubyte_t*) calloc(p->full_len, 1);
		memcpy(p->rseq, p->seq, p->len);
		seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
		seq_reverse(p->len, p->rseq, is_comp);
		p->name = strdup((const char*) bam1_qname(b));
		if (n_seqs == n_needed) break;
	}
	if (res < -1) // -1 is a clean end of file
		fprintf(stderr, "[bwa_read_seq] error while reading the BAM input; it may be truncated.\n");
	*n = n_seqs;
	if (n_seqs && trim_qual >= 1 && n_tot > 0)
		fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed / n_tot);
	bam_destroy1(b);
	if (n_seqs == 0) {
		free(seqs);
		return 0;
	}
	return seqs;
}
// from bam_md.c in SAMtools // modified not fill in the NM tag, and not to start the reference a c->pos static void tmap_sam_md1_core(bam1_t *b, char *ref) { uint8_t *seq = bam1_seq(b); uint32_t *cigar = bam1_cigar(b); bam1_core_t *c = &b->core; int i, x, y, u = 0; kstring_t *str; uint8_t *old_md, *old_nm; int32_t old_nm_i=-1, nm=0; str = (kstring_t*)calloc(1, sizeof(kstring_t)); for (i = y = x = 0; i < c->n_cigar; ++i) { int j, l = cigar[i]>>4, op = cigar[i]&0xf; if (op == BAM_CMATCH) { for (j = 0; j < l; ++j) { int z = y + j; int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]]; if (ref[x+j] == 0) break; // out of boundary if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match ++u; } else { ksprintf(str, "%d", u); kputc(ref[x+j], str); u = 0; nm++; } } if (j < l) break; x += l; y += l; } else if (op == BAM_CDEL) { ksprintf(str, "%d", u); kputc('^', str); for (j = 0; j < l; ++j) { if (ref[x+j] == 0) break; kputc(ref[x+j], str); } u = 0; if (j < l) break; x += l; nm += l; } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) { y += l; if (op == BAM_CINS) nm += l; } else if (op == BAM_CREF_SKIP) { x += l; } } ksprintf(str, "%d", u); // update MD old_md = bam_aux_get(b, "MD"); if(NULL == old_md) { bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); } else { int is_diff = 0; if(strlen((char*)old_md+1) == str->l) { for(i = 0; i < str->l; ++i) { if(toupper(old_md[i+1]) != toupper(str->s[i])) { break; } } if(i < str->l) { is_diff = 1; } } else { is_diff = 1; } if(1 == is_diff) { bam_aux_del(b, old_md); bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); } } // update NM old_nm = bam_aux_get(b, "NM"); if(NULL != old_nm) { old_nm_i = bam_aux2i(old_nm); if(old_nm_i != nm) { bam_aux_del(b, old_nm); bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); } } free(str->s); free(str); }