static inline void pileup_seq(FILE *fp, const bam_pileup1_t *p, int pos, int ref_len, const char *ref) { int j; if (p->is_head) { putc('^', fp); putc(p->b->core.qual > 93? 126 : p->b->core.qual + 33, fp); } if (!p->is_del) { int c = seq_nt16_str[bam_seqi(bam_get_seq(p->b), p->qpos)]; if (ref) { int rb = pos < ref_len? ref[pos] : 'N'; if (c == '=' || seq_nt16_table[c] == seq_nt16_table[rb]) c = bam_is_rev(p->b)? ',' : '.'; else c = bam_is_rev(p->b)? tolower(c) : toupper(c); } else { if (c == '=') c = bam_is_rev(p->b)? ',' : '.'; else c = bam_is_rev(p->b)? tolower(c) : toupper(c); } putc(c, fp); } else putc(p->is_refskip? (bam_is_rev(p->b)? '<' : '>') : '*', fp); if (p->indel > 0) { putc('+', fp); printw(p->indel, fp); for (j = 1; j <= p->indel; ++j) { int c = seq_nt16_str[bam_seqi(bam_get_seq(p->b), p->qpos + j)]; putc(bam_is_rev(p->b)? tolower(c) : toupper(c), fp); } } else if (p->indel < 0) { printw(p->indel, fp); for (j = 1; j <= -p->indel; ++j) { int c = (ref && (int)pos+j < ref_len)? ref[pos+j] : 'N'; putc(bam_is_rev(p->b)? tolower(c) : toupper(c), fp); } } if (p->is_tail) putc('$', fp); }
/**
 * Gets the read sequence from a bam record.
 *
 * The 4-bit packed bases of `s` are decoded to the characters
 * "=ACMGRSVTWYHKDBN" and written to `seq`, replacing its previous contents
 * (seq->l is reset to 0 first; the buffer is reused).
 *
 * The loop counter is int32_t: a 16-bit counter wraps at 65536 and would
 * never reach the length of a longer read.
 */
void bam_get_seq_string(bam1_t *s, kstring_t *seq) {
    seq->l = 0;
    uint8_t *sq = bam_get_seq(s);
    const int32_t n_bases = bam_get_l_qseq(s);
    for (int32_t i = 0; i < n_bases; ++i) {
        kputc("=ACMGRSVTWYHKDBN"[bam_seqi(sq, i)], seq);
    }
}
/*
 * Debug helper: print the contents of a BAM record to stdout.
 *
 * Prints every bam1_core_t field, a hex dump of the raw data buffer, the
 * query name, the decoded sequence, the raw quality bytes and the aux tags.
 *
 * Notes:
 *   - bam_seqi() already yields the 4-bit base code, so it indexes
 *     seq_nt16_str directly. (seq_nt16_table maps ASCII characters to codes;
 *     routing the code through it decoded every base as 'N'.)
 *   - pos/mpos/isize are printed through long long so the format matches
 *     whether the fields are 32-bit or 64-bit in the htslib in use.
 *   - Quality bytes are printed as-is (no +33 offset), so they may be
 *     unprintable.
 *   - The aux walker only knows how to skip over 'Z' values; records holding
 *     other tag types will be printed misaligned.
 */
void dump_read(bam1_t* b)
{
    printf("->core.tid:(%d)\n", b->core.tid);
    printf("->core.pos:(%lld)\n", (long long)b->core.pos);
    printf("->core.bin:(%d)\n", b->core.bin);
    printf("->core.qual:(%d)\n", b->core.qual);
    printf("->core.l_qname:(%d)\n", b->core.l_qname);
    printf("->core.flag:(%d)\n", b->core.flag);
    printf("->core.n_cigar:(%d)\n", b->core.n_cigar);
    printf("->core.l_qseq:(%d)\n", b->core.l_qseq);
    printf("->core.mtid:(%d)\n", b->core.mtid);
    printf("->core.mpos:(%lld)\n", (long long)b->core.mpos);
    printf("->core.isize:(%lld)\n", (long long)b->core.isize);

    if (b->data) {
        printf("->data:");
        int i;
        for (i = 0; i < b->l_data; ++i) {
            printf("%x ", b->data[i]);
        }
        printf("\n");
    }

    if (b->core.l_qname) {
        printf("qname: %s\n", bam_get_qname(b));
    }

    if (b->core.l_qseq) {
        const uint8_t *packed = bam_get_seq(b);
        const uint8_t *quals = bam_get_qual(b);
        int i;

        printf("qseq:");
        for (i = 0; i < b->core.l_qseq; ++i) {
            printf("%c", seq_nt16_str[bam_seqi(packed, i)]);
        }
        printf("\n");

        printf("qual:");
        for (i = 0; i < b->core.l_qseq; ++i) {
            printf("%c", quals[i]);
        }
        printf("\n");
    }

    if (bam_get_l_aux(b)) {
        int i = 0;
        uint8_t* aux = bam_get_aux(b);

        while (i < bam_get_l_aux(b)) {
            /* two-character tag name followed by the one-character type */
            printf("%.2s:%c:", (const char *)(aux+i), *(aux+i+2));
            i += 2;
            switch (*(aux+i)) {
            case 'Z':
                /* print the NUL-terminated string value */
                while (*(aux+1+i) != '\0') {
                    putc(*(aux+1+i), stdout);
                    ++i;
                }
                break;
            }
            putc('\n', stdout);
            /* step over the last value byte and its terminator */
            ++i; ++i;
        }
    }
    printf("\n");
}
// Convert a BAM record into an Alignment carrying its name, sequence,
// quality string, secondary flag and (when known) sample / read group.
//
// rg_sample maps read-group id -> sample name. When it is non-empty and the
// record has an RG tag, the tag value is looked up in it; a non-empty result
// sets both the sample name and the read group on the alignment.
//
// A record without an RG tag is handled explicitly: bam_aux_get() returns
// NULL in that case, and building a string from (NULL + 1) is undefined.
Alignment bam_to_alignment(const bam1_t *b, map<string, string>& rg_sample) {

    Alignment alignment;

    // get the sequence and qual
    int32_t lqseq = b->core.l_qseq;
    string sequence;
    sequence.resize(lqseq);

    uint8_t* qualptr = bam_get_qual(b);
    string quality;
    quality.assign((char*)qualptr, lqseq);

    // process the sequence into chars
    uint8_t* seqptr = bam_get_seq(b);
    for (int i = 0; i < lqseq; ++i) {
        sequence[i] = "=ACMGRSVTWYHKDBN"[bam_seqi(seqptr, i)];
    }

    // get the read group and sample name
    uint8_t *rgptr = bam_aux_get(b, "RG");
    string rg;
    if (rgptr) {
        // skip the one-byte type code in front of the tag value
        rg = (char*) (rgptr+1);
    }
    string sname;
    if (rgptr && !rg_sample.empty()) {
        sname = rg_sample[rg];
    }

    // Now name the read after the scaffold
    string read_name = bam_get_qname(b);

    // Decide if we are a first read (/1) or second (last) read (/2)
    if(b->core.flag & BAM_FREAD1) {
        read_name += "/1";
    }
    if(b->core.flag & BAM_FREAD2) {
        read_name += "/2";
    }

    // If we are marked as both first and last we get /1/2, and if we are marked
    // as neither the scaffold name comes through unchanged as the read name.
    // TODO: produce correct names for intermediate reads on >2 read scaffolds.

    // add features to the alignment
    alignment.set_name(read_name);
    alignment.set_sequence(sequence);
    alignment.set_quality(quality);

    // TODO: htslib doesn't wrap this flag for some reason.
    alignment.set_is_secondary(b->core.flag & BAM_FSECONDARY);
    if (sname.size()) {
        alignment.set_sample_name(sname);
        alignment.set_read_group(rg);
    }

    return alignment;
}
/*
 * Return the read sequence of `entry` as a newly allocated, NUL-terminated
 * string in the orientation of the original read: records flagged
 * BAM_FREVERSE are reverse-complemented (via comp_seq_nt16_str), others are
 * decoded as stored.
 *
 * The caller owns the returned buffer and must free() it.
 * Returns NULL if the buffer cannot be allocated.
 */
char *get_sequence(const bam1_t *entry)
{
    int i;
    int length = entry->core.l_qseq;
    unsigned char *bam_seq = bam_get_seq(entry);
    char *seq = malloc((size_t)length + 1);

    if (seq == NULL) {
        return NULL;
    }

    /* Two separate loops keep the strand test out of the per-base loop. */
    if (entry->core.flag & BAM_FREVERSE) {
        for (i = 0; i < length; i++) {
            seq[length - i - 1] = comp_seq_nt16_str[bam_seqi(bam_seq, i)];
        }
    } else {
        for (i = 0; i < length; ++i) {
            seq[i] = seq_nt16_str[bam_seqi(bam_seq, i)];
        }
    }
    seq[length] = 0;

    return seq;
}
const char* get_sequence(const bam1_t *b) { if(b == NULL) die("get_sequence: parameter error\n"); const uint8_t *seq = bam_get_seq(b); size_t len = b->core.l_qseq; char* sequence; sequence = malloc(len*sizeof(char)); uint8_t offset = (b->core.flag & BAM_FREVERSE) ? 16 : 0; size_t i; for (i=0; i<len; i++) { switch(bam_seqi(seq, i) + offset) { case 1: sequence[i] = 'A'; break; case 2: sequence[i] = 'C'; break; case 4: sequence[i] = 'G'; break; case 8: sequence[i] = 'T'; break; case 15: sequence[i] = 'N'; break; //Complements (original index + 16) case 17: sequence[i] = 'T'; break; case 18: sequence[i] = 'G'; break; case 20: sequence[i] = 'C'; break; case 24: sequence[i] = 'A'; break; case 31: sequence[i] = 'N'; break; default: sequence[i] = 'N'; break; } } if (offset) sequence = strrev(sequence); return sequence; }
/*
 * Expand the sequence of `b` into `s` as one 4-bit base code per reference
 * (padded) position covered by the alignment.
 *
 * Walking the CIGAR:
 *   M/=/X  copy the base codes from SEQ;
 *   S      skip the clipped bases in SEQ;
 *   H      nothing to do;
 *   D      emit 0 for every deleted position;
 *   N      treated like D, with a warning printed once per call;
 *   other  print an error and fail.
 *
 * Returns 0 on success, -1 if the output buffer cannot be grown or an
 * unexpected CIGAR operation is met, and 1 if the number of codes written
 * differs from the reference length computed from the CIGAR. Callers should
 * treat any non-zero value as failure.
 */
static int unpad_seq(bam1_t *b, kstring_t *s)
{
    int k, j, i;
    int length;
    int cigar_n_warning = 0; /* Make this a global and limit to one CIGAR N warning? */
    uint32_t *cigar = bam_get_cigar(b);
    uint8_t *seq = bam_get_seq(b);

    // b->core.l_qseq gives length of the SEQ entry (including soft clips, S)
    // We need the padded length after alignment from the CIGAR (excluding
    // soft clips S, but including pads from CIGAR D operations)
    length = bam_cigar2rlen(b->core.n_cigar, cigar);
    if (ks_resize(s, length) < 0) return -1;  // could not grow s->s

    for (k = 0, s->l = 0, j = 0; k < b->core.n_cigar; ++k) {
        int op, ol;
        op = bam_cigar_op(cigar[k]);
        ol = bam_cigar_oplen(cigar[k]);
        if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
            for (i = 0; i < ol; ++i, ++j) s->s[s->l++] = bam_seqi(seq, j);
        } else if (op == BAM_CSOFT_CLIP) {
            j += ol;
        } else if (op == BAM_CHARD_CLIP) {
            /* do nothing */
        } else if (op == BAM_CDEL) {
            for (i = 0; i < ol; ++i) s->s[s->l++] = 0;
        } else if (op == BAM_CREF_SKIP) {
            /* Treat CIGAR N as D (not ideal, but better than ignoring it) */
            for (i = 0; i < ol; ++i) s->s[s->l++] = 0;
            if (0 == cigar_n_warning) {
                cigar_n_warning = -1;
                fprintf(stderr, "[depad] WARNING: CIGAR op N treated as op D in read %s\n", bam_get_qname(b));
            }
        } else {
            fprintf(stderr, "[depad] ERROR: Didn't expect CIGAR op %c in read %s\n", BAM_CIGAR_STR[op], bam_get_qname(b));
            return -1;
        }
    }
    return (size_t)length != s->l;
}
Mapping(const bam_hdr_t * hdr_p, bam1_t * rec_p) : _rec_p(rec_p) { _query_name = bam_get_qname(rec_p); _flag = rec_p->core.flag; for (int i = 0; i < rec_p->core.l_qseq; ++i) { _seq += seq_nt16_str[bam_seqi(bam_get_seq(rec_p), i)]; } if (is_mapped()) { _chr_name = hdr_p->target_name[rec_p->core.tid]; _rf_start = rec_p->core.pos; _cigar = Cigar(bam_get_cigar(rec_p), rec_p->core.n_cigar); _rf_len = _cigar.rf_len(); } if (is_paired() and mp_is_mapped()) { _mp_chr_name = hdr_p->target_name[rec_p->core.mtid]; _mp_rf_start = rec_p->core.mpos; } }
// Base code at position `pos` (0 <= pos < len).
// With return_current set the stored base at `pos` is translated through
// `codes`; otherwise the base at the mirrored position (len - pos - 1) is
// translated through `rcodes`.
int baseCodeAt(int pos) const {
    assert(pos >= 0 && pos < len);

    if (return_current) {
        return codes[bam_seqi(seq, pos)];
    }
    const int mirrored = len - pos - 1;
    return rcodes[bam_seqi(seq, mirrored)];
}
// Base character at position `pos` (0 <= pos < len).
// With return_current set the stored base at `pos` is translated through
// `decode`; otherwise the base at the mirrored position (len - pos - 1) is
// translated through `decode_r`.
char baseAt(int pos) const {
    assert(pos >= 0 && pos < len);

    if (return_current) {
        return decode[bam_seqi(seq, pos)];
    }
    const int mirrored = len - pos - 1;
    return decode_r[bam_seqi(seq, mirrored)];
}
/*
 * Recompute the MD string and NM count of record `b` against the reference
 * `ref` (ref_len characters, indexed by 0-based reference position), then
 * apply the edits selected by the bits in `flag`:
 *
 *   USE_EQUAL  overwrite read bases that match the reference with '=' (code 0)
 *   UPDATE_NM  add the NM tag, or replace it (with a message) when it differs
 *   UPDATE_MD  add the MD tag, or replace it (with a message) when it differs
 *   DROP_TAG   drop all aux tags except RG
 *   BIN_QUAL   reduce base qualities >= 3 to q/10*10 + 7
 *
 * When max_nm > 0 and the computed nm >= max_nm, every base that matches the
 * reference is changed to N (code 15) and its quality set to 0.
 *
 * A base counts as a match when its code equals the reference code and is
 * not N, or when it is already '='. Scanning stops at the first position
 * past the end of the reference (or at a NUL in it).
 *
 * Reference characters are read as unsigned char before indexing
 * seq_nt16_table or being passed to toupper(), so bytes >= 0x80 cannot
 * produce a negative index. The MD string is built in a stack kstring_t, so
 * there is no separate allocation that could fail before first use.
 */
void bam_fillmd1_core(bam1_t *b, char *ref, int ref_len, int flag, int max_nm)
{
    uint8_t *seq = bam_get_seq(b);
    uint32_t *cigar = bam_get_cigar(b);
    bam1_core_t *c = &b->core;
    int i, x, y, u = 0;
    kstring_t md = { 0, 0, NULL };
    int32_t old_nm_i = -1, nm = 0;

    /* x: reference position, y: query position, u: current run of matches */
    for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
        int j, l = cigar[i]>>4, op = cigar[i]&0xf;
        if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
            for (j = 0; j < l; ++j) {
                int c1, c2, z = y + j;
                if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds
                c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(unsigned char)ref[x+j]];
                if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
                    if (flag&USE_EQUAL) seq[z/2] &= (z&1)? 0xf0 : 0x0f;
                    ++u;
                } else {
                    kputw(u, &md); kputc(ref[x+j], &md);
                    u = 0; ++nm;
                }
            }
            if (j < l) break;
            x += l; y += l;
        } else if (op == BAM_CDEL) {
            kputw(u, &md); kputc('^', &md);
            for (j = 0; j < l; ++j) {
                if (x+j >= ref_len || ref[x+j] == '\0') break;
                kputc(ref[x+j], &md);
            }
            u = 0;
            x += j; nm += j;
            if (j < l) break;
        } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) {
            y += l;
            if (op == BAM_CINS) nm += l;
        } else if (op == BAM_CREF_SKIP) {
            x += l;
        }
    }
    kputw(u, &md);

    // apply max_nm
    if (max_nm > 0 && nm >= max_nm) {
        for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
            int j, l = cigar[i]>>4, op = cigar[i]&0xf;
            if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
                for (j = 0; j < l; ++j) {
                    int c1, c2, z = y + j;
                    if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds
                    c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(unsigned char)ref[x+j]];
                    if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
                        seq[z/2] |= (z&1)? 0x0f : 0xf0;
                        bam_get_qual(b)[z] = 0;
                    }
                }
                if (j < l) break;
                x += l; y += l;
            } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l;
            else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
        }
    }

    // update NM
    if ((flag & UPDATE_NM) && !(c->flag & BAM_FUNMAP)) {
        uint8_t *old_nm = bam_aux_get(b, "NM");
        if (old_nm) old_nm_i = bam_aux2i(old_nm);
        if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
        else if (nm != old_nm_i) {
            fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam_get_qname(b), old_nm_i, nm);
            bam_aux_del(b, old_nm);
            bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
        }
    }

    // update MD (skipped if the MD buffer could not be allocated at all)
    if ((flag & UPDATE_MD) && !(c->flag & BAM_FUNMAP) && md.s != NULL) {
        uint8_t *old_md = bam_aux_get(b, "MD");
        if (!old_md) bam_aux_append(b, "MD", 'Z', md.l + 1, (uint8_t*)md.s);
        else {
            int is_diff = 0;
            if (strlen((char*)old_md+1) == md.l) {
                for (i = 0; i < md.l; ++i)
                    if (toupper(old_md[i+1]) != toupper((unsigned char)md.s[i]))
                        break;
                if (i < md.l) is_diff = 1;
            } else is_diff = 1;
            if (is_diff) {
                fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam_get_qname(b), old_md+1, md.s);
                bam_aux_del(b, old_md);
                bam_aux_append(b, "MD", 'Z', md.l + 1, (uint8_t*)md.s);
            }
        }
    }

    // drop all tags but RG
    if (flag&DROP_TAG) {
        uint8_t *q = bam_aux_get(b, "RG");
        bam_aux_drop_other(b, q);
    }

    // reduce the resolution of base quality
    if (flag&BIN_QUAL) {
        uint8_t *qual = bam_get_qual(b);
        for (i = 0; i < b->core.l_qseq; ++i)
            if (qual[i] >= 3) qual[i] = qual[i]/10*10 + 7;
    }

    free(md.s);
}
// Transform a bam1_t record into a string with the FASTQ representation of it // @returns false for error, true for success static bool bam1_to_fq(const bam1_t *b, kstring_t *linebuf, const bam2fq_state_t *state) { int i; int32_t qlen = b->core.l_qseq; assert(qlen >= 0); uint8_t *seq; uint8_t *qual = bam_get_qual(b); const uint8_t *oq = NULL; if (state->use_oq) { oq = bam_aux_get(b, "OQ"); if (oq) oq++; // skip tag type } bool has_qual = (qual[0] != 0xff || (state->use_oq && oq)); // test if there is quality linebuf->l = 0; // Write read name readpart readpart = which_readpart(b); kputc(state->filetype == FASTA? '>' : '@', linebuf); kputs(bam_get_qname(b), linebuf); // Add the /1 /2 if requested if (state->has12) { if (readpart == READ_1) kputs("/1", linebuf); else if (readpart == READ_2) kputs("/2", linebuf); } if (state->copy_tags) { for (i = 0; copied_tags[i]; ++i) { uint8_t *s; if ((s = bam_aux_get(b, copied_tags[i])) != 0) { kputc('\t', linebuf); kputsn(copied_tags[i], 2, linebuf); kputsn(":Z:", 3, linebuf); kputs(bam_aux2Z(s), linebuf); } } } kputc('\n', linebuf); seq = bam_get_seq(b); if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented for (i = qlen-1; i > -1; --i) { char c = seq_nt16_str[seq_comp_table[bam_seqi(seq,i)]]; kputc(c, linebuf); } } else { for (i = 0; i < qlen; ++i) { char c = seq_nt16_str[bam_seqi(seq,i)]; kputc(c, linebuf); } } kputc('\n', linebuf); if (state->filetype == FASTQ) { // Write quality kputs("+\n", linebuf); if (has_qual) { if (state->use_oq && oq) { if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented for (i = qlen-1; i > -1; --i) { kputc(oq[i], linebuf); } } else { kputs((char*)oq, linebuf); } } else { if (b->core.flag & BAM_FREVERSE) { // read is reverse complemented for (i = qlen-1; i > -1; --i) { kputc(33 + qual[i], linebuf); } } else { for (i = 0; i < qlen; ++i) { kputc(33 + qual[i], linebuf); } } } } else { for (i = 0; i < qlen; ++i) { kputc(33 + state->def_qual, linebuf); } } kputc('\n', 
linebuf); } return true; }
/*
 * trim_ns: strip the leading and trailing N bases from record `b`.
 *
 * `data` is cast to opts_t*. The function copies the aux block aside, counts
 * the N codes at the start (n_start) and end (n_end) of the sequence, shortens
 * the first/last CIGAR operation by those counts (dropping an operation whose
 * length becomes 0, otherwise rewriting it as a soft clip), rebuilds
 * qname + CIGAR + packed sequence + qualities in op->data, swaps that buffer
 * with b->data, updates l_qseq / l_data and restores the aux block.
 * Integer tags "NE" (n_end) and "NS" (n_start) are appended when non-zero, and
 * the "PV" and "FA" uint32 array tags, if present, are replaced by the
 * final_len entries starting at offset n_end (reverse strand) or n_start.
 *
 * Return value: a bit-OR of op->skip_all_ns (when the leading N run stops at
 * index l_qseq - 1) and 1 (when the trimmed length is below
 * op->min_trimmed_len); 0 otherwise.
 *
 * NOTE(review): neither N scan is bounded by l_qseq / 0, so a read made only
 * of Ns appears to walk past the sequence - confirm callers exclude that case.
 * NOTE(review): the "all bases are N" test compares against l_qseq - 1; check
 * whether l_qseq was intended.
 * NOTE(review): the CIGAR edits presumably assume the N runs lie inside the
 * first/last CIGAR operation - verify against the producer of these records.
 */
static int trim_ns(bam1_t *b, void *data) { int ret = 0; opts_t *op((opts_t *)data); std::vector<uint8_t> aux(bam_get_aux(b), bam_get_aux(b) + bam_get_l_aux(b)); int tmp; uint8_t *const seq(bam_get_seq(b)); uint32_t *const cigar(bam_get_cigar(b)); //op->n_cigar = b->core.n_cigar; op->resize(b->l_data); // Make sure it's big enough to hold everything. memcpy(op->data, b->data, b->core.l_qname); // Get #Ns at the beginning for(tmp = 0; bam_seqi(seq, tmp) == dlib::htseq::HTS_N; ++tmp); const int n_start(tmp); if(tmp == b->core.l_qseq - 1) // all bases are N -- garbage read ret |= op->skip_all_ns; // Get #Ns at the end for(tmp = b->core.l_qseq - 1; bam_seqi(seq, tmp) == dlib::htseq::HTS_N; --tmp); const int n_end(b->core.l_qseq - 1 - tmp); // Get new length for read int final_len(b->core.l_qseq - n_end - n_start); if(final_len < 0) final_len = 0; if(final_len < op->min_trimmed_len) // Too short. ret |= 1; // Copy in qual and all of aux. if(n_end) { if((tmp = bam_cigar_oplen(cigar[b->core.n_cigar - 1]) - n_end) == 0) { LOG_DEBUG("Entire cigar operation is the softclip. Decrease the number of new cigar operations.\n"); --b->core.n_cigar; } else { LOG_DEBUG("Updating second cigar operation in-place.\n"); cigar[b->core.n_cigar - 1] = bam_cigar_gen(tmp, BAM_CSOFT_CLIP); } } // Get new n_cigar. if((tmp = bam_cigar_oplen(*cigar) - n_start) == 0) { memcpy(op->data + b->core.l_qname, cigar + 1, (--b->core.n_cigar) << 2); // << 2 for 4 bit per cigar op } else { if(n_start) *cigar = bam_cigar_gen(tmp, BAM_CSOFT_CLIP); memcpy(op->data + b->core.l_qname, cigar, b->core.n_cigar << 2); } uint8_t *opseq(op->data + b->core.l_qname + (b->core.n_cigar << 2)); // Pointer to the seq region of new data field. 
for(tmp = 0; tmp < final_len >> 1; ++tmp) opseq[tmp] = (bam_seqi(seq, ((tmp << 1) + n_start)) << 4) | (bam_seqi(seq, (tmp << 1) + n_start + 1)); if(final_len & 1) opseq[tmp] = (bam_seqi(seq, ((tmp << 1) + n_start)) << 4); tmp = bam_get_l_aux(b); memcpy(opseq + ((final_len + 1) >> 1), bam_get_qual(b) + n_start, final_len + tmp); // Switch data strings std::swap(op->data, b->data); b->core.l_qseq = final_len; memcpy(bam_get_aux(b), aux.data(), aux.size()); b->l_data = (bam_get_aux(b) - b->data) + aux.size(); if(n_end) bam_aux_append(b, "NE", 'i', sizeof(int), (uint8_t *)&n_end); if(n_start) bam_aux_append(b, "NS", 'i', sizeof(int), (uint8_t *)&n_start); const uint32_t *pvar((uint32_t *)dlib::array_tag(b, "PV")); tmp = b->core.flag & BAM_FREVERSE ? n_end: n_start; if(pvar) { std::vector<uint32_t>pvals(pvar + tmp, pvar + final_len + tmp); bam_aux_del(b, (uint8_t *)(pvar) - 6); dlib::bam_aux_array_append(b, "PV", 'I', sizeof(uint32_t), final_len, (uint8_t *)pvals.data()); } const uint32_t *fvar((uint32_t *)dlib::array_tag(b, "FA")); if(fvar) { std::vector<uint32_t>fvals(fvar + tmp, fvar + final_len + tmp); bam_aux_del(b, (uint8_t *)(fvar) - 6); dlib::bam_aux_array_append(b, "FA", 'I', sizeof(uint32_t), final_len, (uint8_t *)fvals.data()); } return ret; }
/*
 * Read-loading helpers. Three definitions follow: bwa_read_bam(),
 * bwa_read_seq() and bwa_free_read_seq().
 *
 * bwa_read_bam(bs, n_needed, n, is_comp, trim_qual)
 *   Reads up to n_needed records from the BAM handle in `bs` (sam_read1()
 *   when USE_HTSLIB is defined, bam_read1() otherwise). A record is kept when
 *   bs->which selects it: bit 1 = flagged READ1, bit 2 = flagged READ2,
 *   bit 4 = flagged neither. For each kept record the bases are converted
 *   through bam_nt16_nt4_table, qualities are stored as q + 33 capped at 126,
 *   and both are reversed for reverse-strand records. If trim_qual >= 1 the
 *   read is passed to bwa_trim_read(). rseq receives a copy of seq; seq is
 *   then reversed and rseq reversed with the is_comp argument. The number of
 *   reads is stored in *n. Returns the array, or 0 (after freeing it) when no
 *   read was kept. A read result below -1 is fatal (err_fatal_simple).
 *
 * bwa_read_seq(bs, n_needed, n, mode, trim_qual)
 *   The barcode length is mode >> 24; a value above BWA_MAX_BCLEN is reported
 *   and 0 is returned. BAM input is delegated to bwa_read_bam(). Otherwise
 *   records come from kseq_read(): with BWA_MODE_CFY, a record whose comment
 *   has 'Y' right after its first ':' is skipped; with BWA_MODE_IL13, 31 is
 *   subtracted from every quality character; records no longer than the
 *   barcode are skipped. The barcode is moved into p->bc (lower-cased where
 *   quality - 33 < BARCODE_LOW_QUAL) and removed from sequence and quality.
 *   Bases are converted through nst_nt4_table, and a trailing "/1" or "/2" is
 *   cut from the name. Returns the array, or 0 when no read was kept.
 *
 * bwa_free_read_seq(n_seqs, seqs)
 *   Frees the per-read buffers (multi[].cigar, name, seq, rseq, qual, aln, md,
 *   multi, cigar) of the first n_seqs entries, then the array itself.
 *
 * NOTE(review): the calloc() results are used unchecked, and the "bases are
 * trimmed" percentage divides by n_tot, which stays 0 when every kept read
 * has length 0 - confirm whether either can occur in practice.
 */
static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_comp, int trim_qual) { bwa_seq_t *seqs, *p; int n_seqs, l, i; long n_trimmed = 0, n_tot = 0; bam1_t *b; int res; b = bam_init1(); n_seqs = 0; seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t)); #ifdef USE_HTSLIB while ((res = sam_read1(bs->fp, bs->h, b)) >= 0) { #else while ((res = bam_read1(bs->fp, b)) >= 0) { #endif uint8_t *s, *q; int go = 0; if ((bs->which & 1) && (b->core.flag & BAM_FREAD1)) go = 1; if ((bs->which & 2) && (b->core.flag & BAM_FREAD2)) go = 1; if ((bs->which & 4) && !(b->core.flag& BAM_FREAD1) && !(b->core.flag& BAM_FREAD2))go = 1; if (go == 0) continue; l = b->core.l_qseq; p = &seqs[n_seqs++]; p->tid = -1; // no assigned to a thread p->qual = 0; p->full_len = p->clip_len = p->len = l; n_tot += p->full_len; #ifdef USE_HTSLIB s = bam_get_seq(b); q = bam_get_qual(b); #else s = bam1_seq(b); q = bam1_qual(b); #endif p->seq = (ubyte_t*)calloc(p->len + 1, 1); p->qual = (ubyte_t*)calloc(p->len + 1, 1); for (i = 0; i != p->full_len; ++i) { #ifdef USE_HTSLIB p->seq[i] = bam_nt16_nt4_table[(int)bam_seqi(s, i)]; #else p->seq[i] = bam_nt16_nt4_table[(int)bam1_seqi(s, i)]; #endif p->qual[i] = q[i] + 33 < 126? 
q[i] + 33 : 126; } #ifdef USE_HTSLIB if (bam_is_rev(b)) { // then reverse #else if (bam1_strand(b)) { // then reverse #endif seq_reverse(p->len, p->seq, 1); seq_reverse(p->len, p->qual, 0); } if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p); p->rseq = (ubyte_t*)calloc(p->full_len, 1); memcpy(p->rseq, p->seq, p->len); seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped() seq_reverse(p->len, p->rseq, is_comp); #ifdef USE_HTSLIB p->name = strdup((const char*)bam_get_qname(b)); #else p->name = strdup((const char*)bam1_qname(b)); #endif if (n_seqs == n_needed) break; } if (res < 0 && res != -1) err_fatal_simple("Error reading bam file"); *n = n_seqs; if (n_seqs && trim_qual >= 1) fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot); if (n_seqs == 0) { free(seqs); bam_destroy1(b); return 0; } bam_destroy1(b); return seqs; } #define BARCODE_LOW_QUAL 13 bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, int trim_qual) { bwa_seq_t *seqs, *p; kseq_t *seq = bs->ks; int n_seqs, l, i, is_comp = mode&BWA_MODE_COMPREAD, is_64 = mode&BWA_MODE_IL13, l_bc = mode>>24; long n_trimmed = 0, n_tot = 0; if (l_bc > BWA_MAX_BCLEN) { fprintf(stderr, "[%s] the maximum barcode length is %d.\n", __func__, BWA_MAX_BCLEN); return 0; } if (bs->is_bam) return bwa_read_bam(bs, n_needed, n, is_comp, trim_qual); // l_bc has no effect for BAM input n_seqs = 0; seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t)); while ((l = kseq_read(seq)) >= 0) { if ((mode & BWA_MODE_CFY) && (seq->comment.l != 0)) { // skip reads that are marked to be filtered by Casava char *s = index(seq->comment.s, ':'); if (s && *(++s) == 'Y') { continue; } } if (is_64 && seq->qual.l) for (i = 0; i < seq->qual.l; ++i) seq->qual.s[i] -= 31; if (seq->seq.l <= l_bc) continue; // sequence length equals or smaller than the barcode length p = &seqs[n_seqs++]; if (l_bc) { // then trim barcode for (i = 0; i < l_bc; ++i) p->bc[i] 
= (seq->qual.l && seq->qual.s[i]-33 < BARCODE_LOW_QUAL)? tolower(seq->seq.s[i]) : toupper(seq->seq.s[i]); p->bc[i] = 0; for (; i < seq->seq.l; ++i) seq->seq.s[i - l_bc] = seq->seq.s[i]; seq->seq.l -= l_bc; seq->seq.s[seq->seq.l] = 0; if (seq->qual.l) { for (i = l_bc; i < seq->qual.l; ++i) seq->qual.s[i - l_bc] = seq->qual.s[i]; seq->qual.l -= l_bc; seq->qual.s[seq->qual.l] = 0; } l = seq->seq.l; } else p->bc[0] = 0; p->tid = -1; // no assigned to a thread p->qual = 0; p->full_len = p->clip_len = p->len = l; n_tot += p->full_len; p->seq = (ubyte_t*)calloc(p->full_len, 1); for (i = 0; i != p->full_len; ++i) p->seq[i] = nst_nt4_table[(int)seq->seq.s[i]]; if (seq->qual.l) { // copy quality p->qual = (ubyte_t*)strdup((char*)seq->qual.s); if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p); } p->rseq = (ubyte_t*)calloc(p->full_len, 1); memcpy(p->rseq, p->seq, p->len); seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped() seq_reverse(p->len, p->rseq, is_comp); p->name = strdup((const char*)seq->name.s); { // trim /[12]$ int t = strlen(p->name); if (t > 2 && p->name[t-2] == '/' && (p->name[t-1] == '1' || p->name[t-1] == '2')) p->name[t-2] = '\0'; } if (n_seqs == n_needed) break; } *n = n_seqs; if (n_seqs && trim_qual >= 1) fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot); if (n_seqs == 0) { free(seqs); return 0; } return seqs; } void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs) { int i, j; for (i = 0; i != n_seqs; ++i) { bwa_seq_t *p = seqs + i; for (j = 0; j < p->n_multi; ++j) if (p->multi[j].cigar) free(p->multi[j].cigar); free(p->name); free(p->seq); free(p->rseq); free(p->qual); free(p->aln); free(p->md); free(p->multi); free(p->cigar); } free(seqs); }
int main_bam2fq(int argc, char *argv[]) { BGZF *fp, *fpse = 0; bam1_t *b; uint8_t *buf; int max_buf, c, has12 = 0; kstring_t str; int64_t n_singletons = 0, n_reads = 0; char last[512], *fnse = 0; while ((c = getopt(argc, argv, "as:")) > 0) if (c == 'a') has12 = 1; else if (c == 's') fnse = optarg; if (argc == optind) { fprintf(stderr, "\nUsage: bam2fq [-a] [-s outSE] <in.bam>\n\n"); fprintf(stderr, "Options: -a append /1 and /2 to the read name\n"); fprintf(stderr, " -s FILE write singleton reads to FILE [assume single-end]\n"); fprintf(stderr, "\n"); return 1; } fp = strcmp(argv[optind], "-")? bgzf_open(argv[optind], "r") : bgzf_dopen(fileno(stdin), "r"); assert(fp); bam_hdr_destroy(bam_hdr_read(fp)); buf = 0; max_buf = 0; str.l = str.m = 0; str.s = 0; last[0] = 0; if (fnse) fpse = bgzf_open(fnse, "w1"); b = bam_init1(); while (bam_read1(fp, b) >= 0) { int i, qlen = b->core.l_qseq, is_print = 0; uint8_t *qual, *seq; if (b->flag&BAM_FSECONDARY) continue; // skip secondary alignments ++n_reads; if (fpse) { if (str.l && strcmp(last, bam_get_qname(b))) { bgzf_write(fpse, str.s, str.l); str.l = 0; ++n_singletons; } if (str.l) is_print = 1; strcpy(last, bam_get_qname(b)); } else is_print = 1; qual = bam_get_qual(b); kputc(qual[0] == 0xff? 
'>' : '@', &str); kputsn(bam_get_qname(b), b->core.l_qname - 1, &str); if (has12) { kputc('/', &str); kputw(b->core.flag>>6&3, &str); } kputc('\n', &str); if (max_buf < qlen + 1) { max_buf = qlen + 1; kroundup32(max_buf); buf = (uint8_t*)realloc(buf, max_buf); } buf[qlen] = 0; seq = bam_get_seq(b); for (i = 0; i < qlen; ++i) buf[i] = bam_seqi(seq, i); // copy the sequence if (bam_is_rev(b)) { // reverse complement for (i = 0; i < qlen>>1; ++i) { int8_t t = seq_comp_table[buf[qlen - 1 - i]]; buf[qlen - 1 - i] = seq_comp_table[buf[i]]; buf[i] = t; } if (qlen&1) buf[i] = seq_comp_table[buf[i]]; } for (i = 0; i < qlen; ++i) buf[i] = seq_nt16_str[buf[i]]; kputsn((char*)buf, qlen, &str); kputc('\n', &str); if (qual[0] != 0xff) { kputsn("+\n", 2, &str); for (i = 0; i < qlen; ++i) buf[i] = 33 + qual[i]; if (bam_is_rev(b)) { // reverse for (i = 0; i < qlen>>1; ++i) { uint8_t t = buf[qlen - 1 - i]; buf[qlen - 1 - i] = buf[i]; buf[i] = t; } } } kputsn((char*)buf, qlen, &str); kputc('\n', &str); if (is_print) { fwrite(str.s, 1, str.l, stdout); str.l = 0; } } if (fpse) { if (str.l) { bgzf_write(fpse, str.s, str.l); ++n_singletons; } fprintf(stderr, "[M::%s] discarded %lld singletons\n", __func__, (long long)n_singletons); bgzf_close(fpse); } fprintf(stderr, "[M::%s] processed %lld reads\n", __func__, (long long)n_reads); free(buf); free(str.s); bam_destroy1(b); bgzf_close(fp); return 0; }
BM_mappedRead * extractReads(char * bamFile, char ** contigs, int numContigs, uint16_t * groups, char * prettyName, int headersOnly, int minMapQual, int maxMisMatches, int ignoreSuppAlignments, int ignoreSecondaryAlignments) { //----- // code uses the pattern outlined in samtools view (sam_view.c) // thanks lh3! // int i = 0; int result = -1; int hh = 0; int supp_check = 0x0; // include supp mappings if (ignoreSuppAlignments) { supp_check |= BAM_FSUPPLEMENTARY; } if (ignoreSecondaryAlignments) { supp_check |= BAM_FSECONDARY; } // we need to let the users know if their pairings // will be corrupted int p_corrupt = 0; // helper variables samFile *in = 0; bam_hdr_t *header = NULL; bam1_t *b = bam_init1(); BM_mappedRead * root = 0; BM_mappedRead * prev = 0; // open file handlers if ((in = sam_open(bamFile, "r")) == 0) { fprintf(stderr, "ERROR: Failed to open \"%s\" for reading.\n", bamFile); } else { // retrieve the header if ((header = sam_hdr_read(in)) == 0) { fprintf(stderr, "ERROR: Failed to read the header from \"%s\".\n", bamFile); } else { // check the index is intact hts_idx_t *idx = sam_index_load(in, bamFile); // load index if (idx == 0) { // index is unavailable fprintf(stderr, "ERROR: Random retrieval only works "\ "for indexed files.\n"); } else { cfuhash_table_t *pair_buffer = \ cfuhash_new_with_initial_size(1000000); cfuhash_set_flag(pair_buffer, CFUHASH_FROZEN_UNTIL_GROWS); for (hh = 0; hh < numContigs; ++hh) { // parse a region in the format like `chr2:100-200' hts_itr_t *iter = sam_itr_querys(idx, header, contigs[hh]); if (iter == NULL) { // reference name is not found fprintf(stderr, "WARNING: Could not find contig: "\ "[%s] in BAM: [%s].\n", contigs[hh], bamFile); } // fetch alignments int line = 0; while ((result = sam_itr_next(in, iter, b)) >= 0) { bam1_core_t core = b->core; line += 1; // only high quality?, primary? 
mappings if ( core.qual < minMapQual) continue; if ((core.flag & supp_check) != 0) continue; if(bam_aux2i(bam_aux_get(b, "NM")) > maxMisMatches) { continue; } char * seqId = bam_get_qname(b); char * seq = 0; char * qual = 0; int qual_len = 0; int seq_len = 0; // get sequence and quality if(0 == headersOnly) { // no point allocating unused space seq = calloc(core.l_qseq+1, sizeof(char)); qual = calloc(core.l_qseq+1, sizeof(char)); uint8_t *s = bam_get_seq(b); if (core.flag&BAM_FREVERSE) { // reverse the read int r = 0; for (i = core.l_qseq-1; i >=0 ; --i) { seq[r]="=TGKCYSBAWRDMHVN"[bam_seqi(s, i)]; ++r; } } else { for (i = 0; i < core.l_qseq; ++i) { seq[i]="=ACMGRSVTWYHKDBN"[bam_seqi(s, i)]; } } seq_len = core.l_qseq; s = bam_get_qual(b); if (s[0] != 0xff) { qual_len = core.l_qseq; for (i = 0; i < core.l_qseq; ++i) { qual[i] = (char)(s[i] + 33); } } else if (qual != 0) { free(qual); qual = 0; } } // work out pairing information uint8_t rpi = RPI_ERROR; if (core.flag&BAM_FPAIRED) { if(core.flag&BAM_FMUNMAP) { if (core.flag&BAM_FREAD1) { rpi = RPI_SNGL_FIR; } else if (core.flag&BAM_FREAD2) { rpi = RPI_SNGL_SEC; } } else { if (core.flag&BAM_FREAD1) { rpi = RPI_FIR; } else if (core.flag&BAM_FREAD2) { rpi = RPI_SEC; } } } else { rpi = RPI_SNGL; } // make the funky Id #define MAX_SEQ_ID_LEN 80 char * seq_id = calloc(MAX_SEQ_ID_LEN, sizeof(char)); // allocate the string to the buffer but check to // ensure we're not cutting anything off int id_len = snprintf(seq_id, MAX_SEQ_ID_LEN, "b_%s;c_%s;r_%s", prettyName, contigs[hh], seqId); if(id_len >= MAX_SEQ_ID_LEN) { seq_id = calloc(id_len+1, sizeof(char)); snprintf(seq_id, id_len+1, // don't forget the NULL! 
"b_%s;c_%s;r_%s", prettyName, contigs[hh], seqId); } // make the mapped read struct prev = makeMappedRead(seq_id, seq, qual, id_len, seq_len, qual_len, rpi, groups[hh], prev); if (0 == root) { root = prev; } if(rpi == RPI_SNGL || \ rpi == RPI_SNGL_FIR || \ rpi == RPI_SNGL_SEC) { // we can just add away // indicate singleton reads by pointing the // partner pointer to itself prev->partnerRead = prev; } else { // RPI_FIR or RPI_SEC // work out pairing information using the hash // we append a 1 or 2 to the end so that // we don't accidentally pair 1's with 1's etc. char * stripped_result; if(rpi == RPI_FIR) { stripped_result = \ pairStripper(seqId, core.l_qname-1, '2'); } else { stripped_result = \ pairStripper(seqId, core.l_qname-1, '1'); } char * stripped = seqId; if(stripped_result) stripped = stripped_result; //fprintf(stdout, "SEARCH %s\n", stripped); // now stripped always holds a stripped value // see if it is in the hash already BM_mappedRead * stored_MR = \ cfuhash_get(pair_buffer, stripped); if (0 != stored_MR) { // exists in the hash -> Add the pair info if(rpi == RPI_FIR) { prev->partnerRead = stored_MR; } else { stored_MR->partnerRead = prev; } // delete the entry from the hash cfuhash_delete(pair_buffer, stripped); } else { // we should put it in the hash // make sure to change it into something // we will find next time if(rpi == RPI_FIR) stripped[strlen(stripped)-1] = '1'; else stripped[strlen(stripped)-1] = '2'; // check to make sure we're not overwriting // anything important. cfuhash overwrites // duplicate entries, so we need to grab // it and put it to "SNGL_XXX" before we // lose the pointer BM_mappedRead * OWMMR = \ cfuhash_put(pair_buffer, stripped, prev); if(OWMMR) { if(OWMMR->rpi == RPI_FIR) OWMMR->rpi = RPI_SNGL_FIR; else OWMMR->rpi = RPI_SNGL_SEC; OWMMR->partnerRead = OWMMR; printPairCorruptionWarning(p_corrupt); p_corrupt = 1; } } if(stripped_result != 0) { // free this! 
free(stripped_result); stripped_result = 0; } } } hts_itr_destroy(iter); if (result < -1) { fprintf(stderr, "ERROR: retrieval of reads from "\ "contig: \"%s\" failed due to "\ "truncated file or corrupt BAM index "\ "file\n", header->target_name[hh]); break; } } // any entries left in the hash are pairs whose mates did // not meet quality standards size_t key_size = 0; char * key; BM_mappedRead * LOMMR; size_t pr_size = 1; if(cfuhash_each_data(pair_buffer, (void**)&key, &key_size, (void**)&LOMMR, &pr_size)) { do { // get the mapped read // update it's pairing so we know it's really single if (LOMMR->rpi == RPI_FIR) LOMMR->rpi = RPI_SNGL_FIR; else if (LOMMR->rpi == RPI_SEC) LOMMR->rpi = RPI_SNGL_SEC; // indicate singleton reads by pointing the // partner pointer to itself LOMMR->partnerRead = LOMMR; } while(cfuhash_next_data(pair_buffer, (void**)&key, &key_size, (void**)&LOMMR, &pr_size)); } cfuhash_clear(pair_buffer); cfuhash_destroy(pair_buffer); } hts_idx_destroy(idx); // destroy the BAM index } } // always do this if (in) sam_close(in); bam_destroy1(b); if ( header ) bam_hdr_destroy(header); return root; }
loci_stats *bam_access_get_position_base_counts(char *chr, int posn){ char *region = NULL; hts_itr_t *iter = NULL; bam1_t* b = NULL; bam_plp_t buf; loci_stats *stats = malloc(sizeof(loci_stats *)); check_mem(stats); stats->base_counts = malloc(sizeof(int) * 4); check_mem(stats->base_counts); stats->base_counts[0] = 0; stats->base_counts[1] = 0; stats->base_counts[2] = 0; stats->base_counts[3] = 0; fholder->stats = stats; region = malloc((sizeof(char *) * (strlen(chr)+1))+sizeof(":")+sizeof("-")+(sizeof(char)*((no_of_digits(posn)*2)+1))); sprintf(region,"%s:%d-%d",chr,posn,posn); fholder->beg = posn; fholder->end = posn; // initialize pileup buf = bam_plp_init(pileup_func, (void *)fholder); bam_plp_set_maxcnt(buf,maxitercnt); /* sam_fetch(fholder->in, fholder->idx, ref, fholder->beg, fholder->end, buf, fetch_algo_func); */ //Replace fetch with iterator for htslib compatibility. b = bam_init1(); iter = sam_itr_querys(fholder->idx, fholder->head, region); int result; int count = 0; while ((result = sam_itr_next(fholder->in, iter, b)) >= 0) { if(b->core.qual < min_map_qual || (b->core.flag & BAM_FUNMAP) || !(b->core.flag & BAM_FPROPER_PAIR) || (b->core.flag & BAM_FMUNMAP)//Proper pair and mate unmapped || (b->core.flag & BAM_FDUP)//1024 is PCR/optical duplicate || (b->core.flag & BAM_FSECONDARY) || (b->core.flag & BAM_FQCFAIL)//Secondary alignment, quality fail || (b->core.flag & BAM_FSUPPLEMENTARY) ) continue; count++; bam_plp_push(buf, b); } sam_itr_destroy(iter); bam_plp_push(buf, 0); int tid, pos, n_plp = -1; const bam_pileup1_t *pil; while ( (pil=bam_plp_next(buf, &tid, &pos, &n_plp)) > 0) { if((pos+1) != posn) continue; int i=0; for(i=0;i<n_plp;i++){ const bam_pileup1_t *p = pil + i; int qual = bam_get_qual(p->b)[p->qpos]; uint8_t c = bam_seqi(bam_get_seq(p->b), p->qpos); if(!(p->is_del) && qual >= min_base_qual && p->b->core.qual >= min_map_qual){ //&& (c == 1 /*A*/|| c == 2 /*C*/|| c == 4 /*G*/|| c == 8 /*T*/)){ //Now we add a new read pos struct to the list 
since the read is valid. //char cbase = toupper(bam_nt16_rev_table[c]); switch(c){ case 1: fholder->stats->base_counts[0]++; break; case 2: fholder->stats->base_counts[1]++; break; case 4: fholder->stats->base_counts[2]++; break; case 8: fholder->stats->base_counts[3]++; break; default: break; }; // End of args switch statement */ } } } //End of iteration through pileup //bam_plp_push(buf, 0); // finalize pileup bam_plp_destroy(buf); free(region); return fholder->stats; error: //if(region) free(region); if(fholder->stats){ if(fholder->stats->base_counts) free(fholder->stats->base_counts); free(fholder->stats); } if(iter) sam_itr_destroy(iter); if(b) bam_destroy1(b); if(region) free(region); return NULL; }