/*
 * Sort predicate for alignment records: non-zero when `a` orders strictly
 * before `b`.
 *
 * Each record gets a 64-bit coordinate key: the reference id shifted into the
 * upper 32 bits, ORed with the position.  In coordinate mode the keys alone
 * decide.  When g_is_by_qname is set, strnum_cmp() on the query names decides
 * first and the coordinate key only breaks ties between equal names.
 */
static inline int bam1_lt(const bam1_p a, const bam1_p b)
{
    const uint64_t key_a = (uint64_t)a->core.tid << 32 | a->core.pos;
    const uint64_t key_b = (uint64_t)b->core.tid << 32 | b->core.pos;
    int name_order;

    if (!g_is_by_qname)
        return key_a < key_b;

    name_order = strnum_cmp(bam1_qname(a), bam1_qname(b));
    if (name_order < 0)
        return 1;
    if (name_order > 0)
        return 0;
    return key_a < key_b;
}
// currently, this function ONLY works if each read has one hit void bam_mating_core(bamFile in, bamFile out) { bam_header_t *header; bam1_t *b[2]; int curr, has_prev, pre_end = 0, cur_end; kstring_t str; str.l = str.m = 0; str.s = 0; header = bam_header_read(in); bam_header_write(out, header); b[0] = bam_init1(); b[1] = bam_init1(); curr = 0; has_prev = 0; while (bam_read1(in, b[curr]) >= 0) { bam1_t *cur = b[curr], *pre = b[1-curr]; if (cur->core.tid < 0) continue; cur_end = bam_calend(&cur->core, bam1_cigar(cur)); if (cur_end > (int)header->target_len[cur->core.tid]) cur->core.flag |= BAM_FUNMAP; if (cur->core.flag & BAM_FSECONDARY) continue; // skip secondary alignments if (has_prev) { if (strcmp(bam1_qname(cur), bam1_qname(pre)) == 0) { // identical pair name cur->core.mtid = pre->core.tid; cur->core.mpos = pre->core.pos; pre->core.mtid = cur->core.tid; pre->core.mpos = cur->core.pos; if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP)) && !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) // set TLEN/ISIZE { uint32_t cur5, pre5; cur5 = (cur->core.flag&BAM_FREVERSE)? cur_end : cur->core.pos; pre5 = (pre->core.flag&BAM_FREVERSE)? 
pre_end : pre->core.pos; cur->core.isize = pre5 - cur5; pre->core.isize = cur5 - pre5; } else cur->core.isize = pre->core.isize = 0; if (pre->core.flag&BAM_FREVERSE) cur->core.flag |= BAM_FMREVERSE; else cur->core.flag &= ~BAM_FMREVERSE; if (cur->core.flag&BAM_FREVERSE) pre->core.flag |= BAM_FMREVERSE; else pre->core.flag &= ~BAM_FMREVERSE; if (cur->core.flag & BAM_FUNMAP) { pre->core.flag |= BAM_FMUNMAP; pre->core.flag &= ~BAM_FPROPER_PAIR; } if (pre->core.flag & BAM_FUNMAP) { cur->core.flag |= BAM_FMUNMAP; cur->core.flag &= ~BAM_FPROPER_PAIR; } bam_template_cigar(pre, cur, &str); bam_write1(out, pre); bam_write1(out, cur); has_prev = 0; } else { // unpaired or singleton pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0; if (pre->core.flag & BAM_FPAIRED) { pre->core.flag |= BAM_FMUNMAP; pre->core.flag &= ~BAM_FMREVERSE & ~BAM_FPROPER_PAIR; } bam_write1(out, pre); } } else has_prev = 1; curr = 1 - curr; pre_end = cur_end; } if (has_prev) bam_write1(out, b[1-curr]); bam_header_destroy(header); bam_destroy1(b[0]); bam_destroy1(b[1]); free(str.s); }
/*
 * Sort predicate: non-zero when `a` orders strictly before `b`.
 *
 * Name mode (g_is_by_qname set): strnum_cmp() on the query names decides;
 * equal names are ordered by the flag bits selected by mask 0xc0.
 * Coordinate mode: records are ordered by a packed key built from the
 * reference id (upper 32 bits), position + 1 shifted left by one, and the
 * strand bit.
 */
static inline int bam1_lt(const bam1_p a, const bam1_p b)
{
    if (!g_is_by_qname) {
        const uint64_t key_a = (uint64_t)a->core.tid << 32 | (a->core.pos + 1) << 1 | bam1_strand(a);
        const uint64_t key_b = (uint64_t)b->core.tid << 32 | (b->core.pos + 1) << 1 | bam1_strand(b);
        return key_a < key_b;
    }

    const int name_order = strnum_cmp(bam1_qname(a), bam1_qname(b));
    if (name_order != 0)
        return name_order < 0;
    return (a->core.flag & 0xc0) < (b->core.flag & 0xc0);
}
/*
 * Heap ordering predicate for merge entries.
 *
 * Coordinate mode defers to __pos_cmp().  In name mode an entry without a
 * record (a.b == 0) compares true against anything, an entry compared against
 * a record-less `b` compares false; otherwise the result is true when `a`'s
 * name sorts after `b`'s (strnum_cmp > 0), with the flag bits under mask 0xc0
 * breaking ties the same way.
 */
static inline int heap_lt(const heap1_t a, const heap1_t b)
{
    int name_order;

    if (!g_is_by_qname)
        return __pos_cmp(a, b);

    if (a.b == 0)
        return 1;
    if (b.b == 0)
        return 0;

    name_order = strnum_cmp(bam1_qname(a.b), bam1_qname(b.b));
    if (name_order > 0)
        return 1;
    if (name_order < 0)
        return 0;
    return (a.b->core.flag & 0xc0) > (b.b->core.flag & 0xc0);
}
/*
 * Sort predicate: non-zero when `a` orders strictly before `b`.
 *
 * Name mode (g_is_by_qname set):
 *   1. different names: strnum_cmp() decides;
 *   2. same name, both records paired: the READ1 record precedes a record
 *      that is not READ1;
 *   3. otherwise: fall back to the coordinate key.
 * Coordinate mode: order by (tid << 32 | pos + 1).
 *
 * The READ1 tie-break used to return "true" whenever `a` was READ1, even if
 * `b` was READ1 as well, so lt(a, b) and lt(b, a) could both hold.  A sort
 * comparator must be asymmetric; two READ1 records of the same name now
 * compare as equivalent.  The paired test also no longer depends on
 * BAM_FPAIRED having the numeric value 1.
 */
static inline int bam1_lt(const bam1_p a, const bam1_p b)
{
    if (g_is_by_qname) {
        int t = strnum_cmp(bam1_qname(a), bam1_qname(b));
        if (t != 0)
            return t < 0;
        if ((a->core.flag & BAM_FPAIRED) && (b->core.flag & BAM_FPAIRED))
            return (a->core.flag & BAM_FREAD1) != 0 && (b->core.flag & BAM_FREAD1) == 0;
    }
    return ((uint64_t)a->core.tid << 32 | (a->core.pos + 1))
         < ((uint64_t)b->core.tid << 32 | (b->core.pos + 1));
}
static inline int heap_lt(const heap1_t a, const heap1_t b) { if (g_is_by_qname) { int t; if (a.b == 0 || b.b == 0) return a.b == 0? 1 : 0; t = strnum_cmp(bam1_qname(a.b), bam1_qname(b.b)); if (t != 0) return (0 > t); if(1 == (a.b->core.flag&BAM_FPAIRED) && 1 == (b.b->core.flag&BAM_FPAIRED)) { if(a.b->core.flag&BAM_FREAD1) return (0 < 1); else return (1 < 0); } else return __pos_cmp(a, b); } else return __pos_cmp(a, b); }
/*
 * Basic sanity check of one alignment record.
 *
 * Returns 1 when the record passes every check below, 0 otherwise:
 *   - tid and mtid are not below -1;
 *   - when a header is supplied, tid and mtid are below header->n_targets;
 *   - the data block is at least l_qname bytes long;
 *   - the first NUL inside the query name is exactly its last byte.
 */
int bam_validate1(const bam_header_t *header, const bam1_t *b)
{
    char *first_nul;
    int ok = 1;

    if (b->core.tid < -1 || b->core.mtid < -1) {
        ok = 0;
    } else if (header && (b->core.tid >= header->n_targets || b->core.mtid >= header->n_targets)) {
        ok = 0;
    } else if (b->data_len < b->core.l_qname) {
        ok = 0;
    } else {
        first_nul = memchr(bam1_qname(b), '\0', b->core.l_qname);
        if (first_nul != &bam1_qname(b)[b->core.l_qname-1])
            ok = 0;
    }

    // FIXME: Other fields could also be checked, especially the auxiliary data
    return ok;
}
hash_table* hash_ids(const char* fn) { fprintf(stderr, "hashing ... \n"); hash_table* T = create_hash_table(); samfile_t* f = samopen(fn, "rb", NULL); if (f == NULL) { fprintf(stderr, "can't open bam file %s\n", fn); exit(1); } bam1_t* b = bam_init1(); uint32_t n = 0; while (samread(f, b) >= 0) { if (++n % 1000000 == 0) { fprintf(stderr, "\t%d reads\n", n); } inc_hash_table(T, bam1_qname(b), b->core.l_qname); } bam_destroy1(b); samclose(f); fprintf(stderr, "done.\n"); return T; }
static int oneBam(const bam1_t *bam, void *data, bam_hdr_t *header)
/* Per-record callback for a .bam scan.  Unmapped records and records whose
 * name differs from btd->itemName are ignored.  A matching record is printed
 * on its own when there is no pair hash or the record is unpaired (and only if
 * it starts at btd->itemStart); otherwise the first mate seen is cloned into
 * the pair hash and the second one triggers the paired display.
 * Always returns 0. */
{
    const bam1_core_t *core = &bam->core;
    struct bamTrackData *btd;
    bam1_t *firstBam;

    if (core->flag & BAM_FUNMAP)
        return 0;

    btd = (struct bamTrackData *)data;
    if (!sameString(bam1_qname(bam), btd->itemName))
        return 0;

    if (btd->pairHash == NULL || (core->flag & BAM_FPAIRED) == 0)
        {
        /* single-read display */
        if (core->pos == btd->itemStart)
            {
            printf("<B>Read name:</B> %s<BR>\n", btd->itemName);
            singleBamDetails(bam);
            }
        return 0;
        }

    firstBam = (bam1_t *)hashFindVal(btd->pairHash, btd->itemName);
    if (firstBam != NULL)
        {
        /* second mate: show both and forget the stored one */
        bamPairDetails(firstBam, bam);
        hashRemove(btd->pairHash, btd->itemName);
        }
    else
        {
        /* first mate: remember a copy until its partner shows up */
        hashAdd(btd->pairHash, btd->itemName, bamClone(bam));
        }
    return 0;
}
static int fetch_disc_read_callback(const bam1_t* alignment, void* data) { // MEI_data* mei_data = static_cast<MEI_data*>(data); std::pair<MEI_data*, UserDefinedSettings*>* env = static_cast<std::pair<MEI_data*, UserDefinedSettings*>*>(data); MEI_data* mei_data = env->first; UserDefinedSettings* userSettings = env->second; if (!(alignment->core.flag & BAM_FUNMAP || alignment->core.flag & BAM_FMUNMAP) && // Both ends are mapped. !is_concordant(alignment, mei_data->current_insert_size) && // Ends map discordantly. // Extra check for (very) large mapping distance. This is done beside the check for read // discordance to speed up computation by ignoring signals from small structural variants. (alignment->core.tid != alignment->core.mtid || abs(alignment->core.pos - alignment->core.mpos) > userSettings->MIN_DD_MAP_DISTANCE)) { // Save alignment as simple_read object. std::string read_name = enrich_read_name(bam1_qname(alignment), alignment->core.flag & BAM_FREAD1); char strand = bam1_strand(alignment)? Minus : Plus; char mate_strand = bam1_mstrand(alignment)? Minus : Plus; std::string read_group; get_read_group(alignment, read_group); std::string sample_name; get_sample_name(read_group, mei_data->sample_names, sample_name); simple_read* read = new simple_read(read_name, alignment->core.tid, alignment->core.pos, strand, sample_name, get_sequence(bam1_seq(alignment), alignment->core.l_qseq), alignment->core.mtid, alignment->core.mpos, mate_strand); mei_data->discordant_reads.push_back(read); } return 0; }
/*
 * Dump one alignment record to stdout for debugging: name, sequence, base
 * qualities (each shifted by `base_quality`), CIGAR, raw auxiliary bytes, the
 * length fields, every core field and finally each flag bit as 0/1.
 */
void bam_print(bam1_t* bam_p, int base_quality)
{
    const bam1_core_t* core = &bam_p->core;
    char* qual_bytes;
    char* aux_bytes;
    int idx;

    printf("\n------------------------------------------------------------------->\n");
    printf("bam_p->data (qname): %s\n", bam1_qname(bam_p));
    printf("bam_p->data (seq):  %s\n", convert_to_sequence_string(bam1_seq(bam_p), core->l_qseq));

    /* quality */
    printf("bam_p->data (qual): ");
    qual_bytes = (char*) bam1_qual(bam_p);
    idx = 0;
    while (idx < core->l_qseq) {
        printf("%c", (qual_bytes[idx] + base_quality));
        idx++;
    }
    printf("\n");

    printf("bam_p->data (cigar): %s\n", convert_to_cigar_string(bam1_cigar(bam_p), core->n_cigar));

    /* aux (optional) data, printed byte by byte */
    printf("bam_p->data (aux): ");
    aux_bytes = (char*) bam1_aux(bam_p);
    idx = 0;
    while (idx < bam_p->l_aux) {
        printf("%c", aux_bytes[idx]);
        idx++;
    }
    printf("\n");

    /* lengths */
    printf("bam_p->l_aux: %i\n", bam_p->l_aux);
    printf("bam_p->data_len: %i\n", bam_p->data_len);
    printf("bam_p->m_data: %i\n", bam_p->m_data);

    /* core */
    printf("bam_p->core.tid: %i\n", core->tid);
    printf("bam_p->core.pos: %i\n", core->pos);
    printf("bam_p->core.bin: %u\n", core->bin);
    printf("bam_p->core.qual: %u\n", core->qual);
    printf("bam_p->core.l_qname: %u\n", core->l_qname);
    printf("bam_p->core.flag (16 bits): %u\n", core->flag);
    printf("bam_p->core.n_cigar: %u\n", core->n_cigar);
    printf("bam_p->core.l_qseq: %i\n", core->l_qseq);
    printf("bam_p->core.mtid: %i\n", core->mtid);
    printf("bam_p->core.mpos: %i\n", core->mpos);
    printf("bam_p->core.isize: %i\n", core->isize);

    /* individual flag bits */
    printf("\nbam1_t.core flags\n");
    printf("-----------------------\n");
    printf("flag (is_paired_end): %i\n", (core->flag & BAM_FPAIRED) != 0);
    printf("flag (is_paired_end_mapped): %i\n", (core->flag & BAM_FPROPER_PAIR) != 0);
    printf("flag (is_seq_unmapped): %i\n", (core->flag & BAM_FUNMAP) != 0);
    printf("flag (is_mate_unmapped): %i\n", (core->flag & BAM_FMUNMAP) != 0);
    printf("flag (seq_strand): %i\n", (core->flag & BAM_FREVERSE) != 0);
    printf("flag (mate_strand): %i\n", (core->flag & BAM_FMREVERSE) != 0);
    printf("flag (pair_num_1): %i\n", (core->flag & BAM_FREAD1) != 0);
    printf("flag (pair_num_2): %i\n", (core->flag & BAM_FREAD2) != 0);
    printf("flag (primary_alignment): %i\n", (core->flag & BAM_FSECONDARY) != 0);
    printf("flag (fails_quality_check): %i\n", (core->flag & BAM_FQCFAIL) != 0);
    printf("flag (pc_optical_duplicate): %i\n", (core->flag & BAM_FDUP) != 0);
}
int check_qname(char *last_qname, int bufsize, bam1_t *bam, int max) { if (0 != strcmp(last_qname, bam1_qname(bam))) { /* stop reading */ if (max) { return -1; /* get next qname, continue reading */ } else { if (bam->core.l_qname > bufsize) { Free(last_qname); bufsize = bam->core.l_qname; last_qname = Calloc(bufsize, char); } strcpy(last_qname, bam1_qname(bam)); return 1; } /* same qname, continue reading */ } else {
/*
 * Build a heap-allocated alignment_t from a BAM record.
 *
 * Copies the numeric core fields, the query name, the decoded sequence, the
 * base qualities (converted with `base_quality`), the CIGAR text and the raw
 * auxiliary bytes, then expands the flag word into individual fields.
 *
 * Returns the new structure, or NULL if the structure itself cannot be
 * allocated.
 *
 * The CIGAR buffer used to be sized as max(MIN_ALLOCATED_SIZE_FOR_CIGAR_STRING,
 * 4 * n_cigar) and then filled with strcpy(); an operation with a length of
 * four or more digits needs more than four characters, so long CIGARs could
 * overflow it.  The buffer is now at least strlen(text) + 1 bytes.
 *
 * NOTE(review): ownership of the strings returned by
 * convert_to_sequence_string() / convert_to_cigar_string() is not visible
 * here; if they are heap-allocated they are leaked, as before.
 * NOTE(review): primary_alignment is set from BAM_FSECONDARY without
 * inversion, unlike is_seq_mapped / is_mate_mapped; left unchanged because
 * other code may rely on it.
 */
alignment_t* alignment_new_by_bam(bam1_t* bam_p, int base_quality)
{
    //memory allocation for the structure
    alignment_t* alignment_p = (alignment_t*) calloc(1, sizeof(alignment_t));
    if (alignment_p == NULL) {
        return NULL;
    }

    //numeric data
    alignment_p->num_cigar_operations = (int) bam_p->core.n_cigar;
    alignment_p->chromosome = bam_p->core.tid;
    alignment_p->position = bam_p->core.pos;
    alignment_p->mate_chromosome = bam_p->core.mtid;
    alignment_p->mate_position = bam_p->core.mpos;
    alignment_p->map_quality = bam_p->core.qual;
    alignment_p->template_length = bam_p->core.isize;

    //memory allocation for inner fields according to indicated sizes
    alignment_p->query_name = (char*) calloc(bam_p->core.l_qname, sizeof(char));
    alignment_p->sequence = (char*) calloc(bam_p->core.l_qseq + 1, sizeof(char));
    alignment_p->quality = (char*) calloc(bam_p->core.l_qseq + 1, sizeof(char));   //same length as sequence
    alignment_p->optional_fields = (uint8_t*) calloc(bam_p->l_aux, sizeof(uint8_t));
    alignment_p->optional_fields_length = bam_p->l_aux;

    //copy the data between structures
    strcpy(alignment_p->query_name, bam1_qname(bam_p));
    strcpy(alignment_p->sequence, convert_to_sequence_string(bam1_seq(bam_p), bam_p->core.l_qseq));
    convert_to_quality_string_length(alignment_p->quality, bam1_qual(bam_p), bam_p->core.l_qseq, base_quality);

    //cigar: size the buffer from the actual text so strcpy cannot overflow
    {
        const char* cigar_text = convert_to_cigar_string(bam1_cigar(bam_p), alignment_p->num_cigar_operations);
        size_t cigar_capacity = (size_t) max(MIN_ALLOCATED_SIZE_FOR_CIGAR_STRING, alignment_p->num_cigar_operations << 2);
        size_t cigar_needed = strlen(cigar_text) + 1;

        if (cigar_capacity < cigar_needed) {
            cigar_capacity = cigar_needed;
        }
        alignment_p->cigar = (char*) calloc(cigar_capacity, sizeof(char));
        strcpy(alignment_p->cigar, cigar_text);
    }

    memcpy(alignment_p->optional_fields, bam1_aux(bam_p), bam_p->l_aux);

    //flags
    uint32_t flag = (uint32_t) bam_p->core.flag;
    alignment_p->is_paired_end = (flag & BAM_FPAIRED) ? 1 : 0;
    alignment_p->is_paired_end_mapped = (flag & BAM_FPROPER_PAIR) ? 1 : 0;
    alignment_p->is_seq_mapped = (flag & BAM_FUNMAP) ? 0 : 1;   //in bam structure is negative flag!!!
    alignment_p->is_mate_mapped = (flag & BAM_FMUNMAP) ? 0 : 1; //in bam structure is negative flag!!!
    alignment_p->seq_strand = (flag & BAM_FREVERSE) ? 1 : 0;
    alignment_p->mate_strand = (flag & BAM_FMREVERSE) ? 1 : 0;

    if (flag & BAM_FREAD1) {
        alignment_p->pair_num = 1;
    } else if (flag & BAM_FREAD2) {
        alignment_p->pair_num = 2;
    } else {
        alignment_p->pair_num = 0;
    }

    alignment_p->primary_alignment = (flag & BAM_FSECONDARY) ? 1 : 0;
    alignment_p->fails_quality_check = (flag & BAM_FQCFAIL) ? 1 : 0;
    alignment_p->pc_optical_duplicate = (flag & BAM_FDUP) ? 1 : 0;

    return alignment_p;
}
/*
 * Read up to `n_needed` reads from the BAM stream in `bs` into a freshly
 * allocated bwa_seq_t array.
 *
 * bs->which selects the records to keep: bit 1 = READ1, bit 2 = READ2,
 * bit 4 = records flagged neither READ1 nor READ2.  For each kept record the
 * bases are translated through bam_nt16_nt4_table, qualities are shifted by 33
 * and capped at 126, reverse-strand records are reversed, and optional quality
 * trimming (trim_qual >= 1) is applied before seq / rseq are derived.
 *
 * The number of reads stored is written to *n.  Returns the array, or 0 when
 * no read was stored.  A read error other than -1 is fatal.
 */
static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_comp, int trim_qual)
{
    bwa_seq_t *seqs, *p;
    int n_seqs = 0, l, i, res;
    long n_trimmed = 0, n_tot = 0;
    bam1_t *b = bam_init1();

    seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t));

    while ((res = bam_read1(bs->fp, b)) >= 0) {
        uint8_t *s, *q;
        const int is_read1 = (b->core.flag & BAM_FREAD1) != 0;
        const int is_read2 = (b->core.flag & BAM_FREAD2) != 0;
        const int wanted = ((bs->which & 1) && is_read1)
                        || ((bs->which & 2) && is_read2)
                        || ((bs->which & 4) && !is_read1 && !is_read2);

        if (!wanted)
            continue;

        l = b->core.l_qseq;
        p = &seqs[n_seqs++];
        p->tid = -1; // no assigned to a thread
        p->qual = 0;
        p->full_len = p->clip_len = p->len = l;
        n_tot += p->full_len;

        s = bam1_seq(b);
        q = bam1_qual(b);
        p->seq = (ubyte_t*)calloc(p->len + 1, 1);
        p->qual = (ubyte_t*)calloc(p->len + 1, 1);
        for (i = 0; i != p->full_len; ++i) {
            const int shifted = q[i] + 33;
            p->seq[i] = bam_nt16_nt4_table[(int)bam1_seqi(s, i)];
            p->qual[i] = shifted < 126 ? shifted : 126;
        }

        if (bam1_strand(b)) { // then reverse
            seq_reverse(p->len, p->seq, 1);
            seq_reverse(p->len, p->qual, 0);
        }
        if (trim_qual >= 1)
            n_trimmed += bwa_trim_read(trim_qual, p);

        p->rseq = (ubyte_t*)calloc(p->full_len, 1);
        memcpy(p->rseq, p->seq, p->len);
        seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped()
        seq_reverse(p->len, p->rseq, is_comp);
        p->name = strdup((const char*)bam1_qname(b));

        if (n_seqs == n_needed)
            break;
    }

    if (res < 0 && res != -1)
        err_fatal_simple("Error reading bam file");

    *n = n_seqs;
    if (n_seqs && trim_qual >= 1)
        fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot);

    bam_destroy1(b);
    if (n_seqs == 0) {
        free(seqs);
        return 0;
    }
    return seqs;
}
char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of) { uint8_t *s = bam1_seq(b), *t = bam1_qual(b); int i; const bam1_core_t *c = &b->core; kstring_t str; str.l = str.m = 0; str.s = 0; ksprintf(&str, "%s\t", bam1_qname(b)); if (of == BAM_OFDEC) ksprintf(&str, "%d\t", c->flag); else if (of == BAM_OFHEX) ksprintf(&str, "0x%x\t", c->flag); else { // BAM_OFSTR for (i = 0; i < 16; ++i) if ((c->flag & 1<<i) && bam_flag2char_table[i]) kputc(bam_flag2char_table[i], &str); kputc('\t', &str); } if (c->tid < 0) kputs("*\t", &str); else ksprintf(&str, "%s\t", header->target_name[c->tid]); ksprintf(&str, "%d\t%d\t", c->pos + 1, c->qual); if (c->n_cigar == 0) kputc('*', &str); else { for (i = 0; i < c->n_cigar; ++i) ksprintf(&str, "%d%c", bam1_cigar(b)[i]>>BAM_CIGAR_SHIFT, "MIDNSHP"[bam1_cigar(b)[i]&BAM_CIGAR_MASK]); } kputc('\t', &str); if (c->mtid < 0) kputs("*\t", &str); else if (c->mtid == c->tid) kputs("=\t", &str); else ksprintf(&str, "%s\t", header->target_name[c->mtid]); ksprintf(&str, "%d\t%d\t", c->mpos + 1, c->isize); if (c->l_qseq) { for (i = 0; i < c->l_qseq; ++i) kputc(bam_nt16_rev_table[bam1_seqi(s, i)], &str); kputc('\t', &str); if (t[0] == 0xff) kputc('*', &str); else for (i = 0; i < c->l_qseq; ++i) kputc(t[i] + 33, &str); } else ksprintf(&str, "*\t*"); s = bam1_aux(b); while (s < b->data + b->data_len) { uint8_t type, key[2]; key[0] = s[0]; key[1] = s[1]; s += 2; type = *s; ++s; ksprintf(&str, "\t%c%c:", key[0], key[1]); if (type == 'A') { ksprintf(&str, "A:%c", *s); ++s; } else if (type == 'C') { ksprintf(&str, "i:%u", *s); ++s; } else if (type == 'c') { ksprintf(&str, "i:%d", *(int8_t*)s); ++s; } else if (type == 'S') { ksprintf(&str, "i:%u", *(uint16_t*)s); s += 2; } else if (type == 's') { ksprintf(&str, "i:%d", *(int16_t*)s); s += 2; } else if (type == 'I') { ksprintf(&str, "i:%u", *(uint32_t*)s); s += 4; } else if (type == 'i') { ksprintf(&str, "i:%d", *(int32_t*)s); s += 4; } else if (type == 'f') { ksprintf(&str, "f:%g", 
*(float*)s); s += 4; } else if (type == 'd') { ksprintf(&str, "d:%lg", *(double*)s); s += 8; } else if (type == 'Z' || type == 'H') { ksprintf(&str, "%c:", type); while (*s) kputc(*s++, &str); ++s; } } return str.s; }
/*
 * Convert a padded alignment stream into an unpadded one: copy the header
 * from `in` to `out`, rewrite every record's CIGAR and position, and write
 * the record.  Always returns 0.
 *
 * A record at position 0 whose name equals the name of its target is treated
 * as the padded reference: its unpadded form is kept in `r` and `posmap`
 * is rebuilt to map each padded column to an unpadded position.  Every other
 * record is compared column by column against `r` to derive a new CIGAR
 * (match / insertion / deletion / pad), keeping leading and trailing soft
 * clips, and its position is translated through `posmap`.
 *
 * Fixes relative to the previous version:
 *   - the scratch CIGAR array `cigar2` grown by write_cigar() was never
 *     released; it is freed with the other buffers now
 *     (NOTE(review): assumes write_cigar() grows it with the malloc family);
 *   - the soft-clip checks read cigar[0] and cigar[n_cigar-1] even when the
 *     record has no CIGAR operations; they are skipped in that case.
 */
int bam_pad2unpad(bamFile in, bamFile out)
{
    bam_header_t *h;
    bam1_t *b;
    kstring_t r, q;
    uint32_t *cigar2 = 0;
    int n2 = 0, m2 = 0, *posmap = 0;

    h = bam_header_read(in);
    bam_header_write(out, h);
    b = bam_init1();
    r.l = r.m = q.l = q.m = 0; r.s = q.s = 0;
    while (bam_read1(in, b) >= 0) {
        uint32_t *cigar = bam1_cigar(b);
        n2 = 0;
        if (b->core.pos == 0 && b->core.tid >= 0 && strcmp(bam1_qname(b), h->target_name[b->core.tid]) == 0) {
            /* padded reference record */
            int i, k;
            unpad_seq(b, &r);
            write_cigar(cigar2, n2, m2, bam_cigar_gen(b->core.l_qseq, BAM_CMATCH));
            replace_cigar(b, n2, cigar2);
            posmap = realloc(posmap, r.m * sizeof(int));
            for (i = k = 0; i < r.l; ++i) {
                posmap[i] = k; // note that a read should NOT start at a padding
                if (r.s[i]) ++k;
            }
        } else {
            /* ordinary read: rebuild its CIGAR against the reference columns */
            int i, k, op;
            const int has_cigar = b->core.n_cigar > 0;
            unpad_seq(b, &q);
            if (has_cigar && bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP)
                write_cigar(cigar2, n2, m2, cigar[0]);
            /* classify every column by what read and reference hold there */
            for (i = 0, k = b->core.pos; i < q.l; ++i, ++k)
                q.s[i] = q.s[i]? (r.s[k]? BAM_CMATCH : BAM_CINS) : (r.s[k]? BAM_CDEL : BAM_CPAD);
            /* run-length encode the per-column operations */
            for (i = k = 1, op = q.s[0]; i < q.l; ++i) {
                if (op != q.s[i]) {
                    write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op));
                    op = q.s[i]; k = 1;
                } else ++k;
            }
            write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op));
            if (has_cigar && bam_cigar_op(cigar[b->core.n_cigar-1]) == BAM_CSOFT_CLIP)
                write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-1]);
            /* merge match-pad-match triples into one match, zeroing the rest */
            for (i = 2; i < n2; ++i)
                if (bam_cigar_op(cigar2[i]) == BAM_CMATCH && bam_cigar_op(cigar2[i-1]) == BAM_CPAD && bam_cigar_op(cigar2[i-2]) == BAM_CMATCH)
                    cigar2[i] += cigar2[i-2], cigar2[i-2] = cigar2[i-1] = 0;
            /* squeeze out the zeroed entries */
            for (i = k = 0; i < n2; ++i)
                if (cigar2[i]) cigar2[k++] = cigar2[i];
            n2 = k;
            replace_cigar(b, n2, cigar2);
            b->core.pos = posmap[b->core.pos];
        }
        bam_write1(out, b);
    }
    free(r.s); free(q.s); free(posmap); free(cigar2);
    bam_destroy1(b);
    bam_header_destroy(h);
    return 0;
}
/*
 * next_fragment() on a SAM file of single-end reads: the first two calls must
 * each yield one read ("seq.1", then "seq.2") and the third call must yield
 * none.
 */
void test_next_fragment_single()
{
    char *sam_filename = "testdata/test_next_fragment_single.sam";
    char *expected_names[] = { "seq.1", "seq.2" };
    samfile_t *samfile = samopen(sam_filename, "r", NULL);
    bam1_t *reads[] = { bam_init1(), bam_init1() };
    int call;
    int len;

    for (call = 0; call < 2; call++) {
        len = next_fragment(reads, samfile, 2);
        assert_equals(1, len, "Num reads");
        assert_str_equals(expected_names[call], bam1_qname(reads[0]), "qname");
    }

    /* input exhausted */
    len = next_fragment(reads, samfile, 2);
    assert_equals(0, len, "Num reads");

    samclose(samfile);
}
hash_table* hash_ids(const char* fn) { fprintf(stderr, "hashing ... \n"); hash_table* T = create_hash_table(); samfile_t* f = samopen(fn, "rb", NULL); if (f == NULL) { fprintf(stderr, "can't open bam file %s\n", fn); exit(1); } bam1_t* b = bam_init1(); uint32_t n = 0; char* qname = NULL; size_t qname_size = 0; while (samread(f, b) >= 0) { if (++n % 1000000 == 0) { fprintf(stderr, "\t%d reads\n", n); } if (qname_size < b->core.l_qname + 3) { qname_size = b->core.l_qname + 3; qname = realloc(qname, qname_size); } memcpy(qname, bam1_qname(b), b->core.l_qname); if (b->core.flag & BAM_FREAD2) { qname[b->core.l_qname] = '/'; qname[b->core.l_qname + 1] = '2'; qname[b->core.l_qname + 2] = '\0'; } else { qname[b->core.l_qname] = '/'; qname[b->core.l_qname + 1] = '1'; qname[b->core.l_qname + 2] = '\0'; } inc_hash_table(T, qname, b->core.l_qname + 2); } free(qname); bam_destroy1(b); samclose(f); fprintf(stderr, "done.\n"); return T; }
/**
 * @brief Debugging
 *
 * Prints the running read count, reference name, position, read length and
 * computed end, the revcomp / skip metrics, mapping quality, flag, number of
 * CIGAR operations and the read name to stdout.
 *
 * Fixes relative to the previous version:
 *  - "mapq" printed *bam1_qual(), i.e. the first per-base quality value; it
 *    now prints the record's mapping quality (core.qual);
 *  - a record without a reference (tid < 0) no longer indexes target_name
 *    with a negative value; "*" is printed instead.
 */
void print_readinfo(seq_block_t *bresults,bam1_t *current_read,read_metrics_t *rm,samfile_t *bam_file){
    const int tid = current_read->core.tid;

    printf("\nREADING %d\n",bresults->total_reads);
    printf("Chrom %s\n",tid >= 0 ? bam_file->header->target_name[tid] : "*");
    printf("Pos %d\n",current_read->core.pos);
    printf("Len %d -> END: %d\n",rm->read_length,rm->read_length+current_read->core.pos);
    printf("REVCOMP: %d\n",rm->revcomp);
    printf("SKIP: %d\n",rm->skip);
    printf("mapq: %d\n",current_read->core.qual);
    printf("flag %d\n",current_read->core.flag);
    printf("n_cigar_op %d\n",current_read->core.n_cigar);
    printf("NAME %s\n",bam1_qname(current_read));
}
static void singleBamDetails(const bam1_t *bam) /* Print out the properties of this alignment. */ { const bam1_core_t *core = &bam->core; char *itemName = bam1_qname(bam); int tLength = bamGetTargetLength(bam); int tStart = core->pos, tEnd = tStart+tLength; boolean isRc = useStrand && bamIsRc(bam); printPosOnChrom(seqName, tStart, tEnd, NULL, FALSE, itemName); if (!skipQualityScore) printf("<B>Alignment Quality: </B>%d<BR>\n", core->qual); printf("<B>CIGAR string: </B><tt>%s</tt> (", bamGetCigar(bam)); bamShowCigarEnglish(bam); printf(")<BR>\n"); printf("<B>Tags:</B>"); bamShowTags(bam); puts("<BR>"); printf("<B>Flags: </B><tt>0x%02x:</tt><BR>\n ", core->flag); bamShowFlagsEnglish(bam); puts("<BR>"); if (bamIsRc(bam)) printf("<em>Note: although the read was mapped to the reverse strand of the genome, " "the sequence and CIGAR in BAM are relative to the forward strand.</em><BR>\n"); puts("<BR>"); struct dnaSeq *genoSeq = hChromSeq(database, seqName, tStart, tEnd); char *qSeq = bamGetQuerySequence(bam, FALSE); if (isNotEmpty(qSeq) && !sameString(qSeq, "*")) { char *qSeq = NULL; struct ffAli *ffa = bamToFfAli(bam, genoSeq, tStart, useStrand, &qSeq); printf("<B>Alignment of %s to %s:%d-%d%s:</B><BR>\n", itemName, seqName, tStart+1, tEnd, (isRc ? " (reverse complemented)" : "")); ffShowSideBySide(stdout, ffa, qSeq, 0, genoSeq->dna, tStart, tLength, 0, tLength, 8, isRc, FALSE); } if (!skipQualityScore && core->l_qseq > 0) { printf("<B>Sequence quality scores:</B><BR>\n<TT><TABLE><TR>\n"); UBYTE *quals = bamGetQueryQuals(bam, useStrand); int i; for (i = 0; i < core->l_qseq; i++) { if (i > 0 && (i % 24) == 0) printf("</TR>\n<TR>"); printf("<TD>%c<BR>%d</TD>", qSeq[i], quals[i]); } printf("</TR></TABLE></TT>\n"); } }
// Derive read_id from the query name of the wrapped record `b`.
//
// Nothing happens without a record.  In raw_merge mode read_id is simply 0.
// Otherwise the name is parsed with atol(); if the result is below 1 and a
// header was supplied, the record is formatted as SAM text and reported
// through err_die().
void CBamLine::b_init(bam_header_t* header) {
    if (!b) {
        return;
    }

    char *name = bam1_qname(b);
    if (raw_merge) {
        read_id = 0;
        return;
    }

    read_id = (uint64_t)atol(name);
    if (read_id >= 1 || !header) {
        return;
    }

    char* samline = bam_format1(header, b);
    err_die("Error: invalid read Id (must be numeric) for BAM record:\n%s\n", samline);
}
int bamAddOneSamAlignment(const bam1_t *bam, void *data, bam_hdr_t *header)
/* bam_fetch() calls this on each bam alignment retrieved.  Translate the bam
 * record into a samAlignment allocated from helper->lm and push it onto the
 * head of helper->samList.  Positions are converted to 1-based; qualities are
 * shifted by 33, or replaced by "*" when isAllSameChar() reports that every
 * quality byte equals -1.  Always returns 0. */
{
    struct bamToSamHelper *helper = (struct bamToSamHelper *)data;
    struct lm *lm = helper->lm;
    struct dyString *dy = helper->dy;
    const bam1_core_t *core = &bam->core;
    struct samAlignment *sam;
    char *bamQual;

    lmAllocVar(lm, sam);

    /* name, flag, reference, position, mapping quality */
    sam->qName = lmCloneString(lm, bam1_qname(bam));
    sam->flag = core->flag;
    sam->rName = (helper->chrom != NULL)
                     ? helper->chrom
                     : lmCloneString(lm, header->target_name[core->tid]);
    sam->pos = core->pos + 1;
    sam->mapQ = core->qual;

    /* CIGAR text */
    dyStringClear(dy);
    bamUnpackCigar(bam, dy);
    sam->cigar = lmCloneStringZ(lm, dy->string, dy->stringSize);

    /* mate reference: "*" if none, "=" if same as the read's */
    if (core->mtid < 0)
        sam->rNext = "*";
    else if (core->tid == core->mtid)
        sam->rNext = "=";
    else
        sam->rNext = lmCloneString(lm, header->target_name[core->mtid]);
    sam->pNext = core->mpos + 1;
    sam->tLen = core->isize;

    /* sequence and qualities */
    sam->seq = lmAlloc(lm, core->l_qseq + 1);
    bamUnpackQuerySequence(bam, FALSE, sam->seq);
    bamQual = (char *)bam1_qual(bam);
    if (!isAllSameChar(bamQual, core->l_qseq, -1))
        {
        sam->qual = lmCloneStringZ(lm, bamQual, core->l_qseq);
        addToChars(sam->qual, core->l_qseq, 33);
        }
    else
        sam->qual = "*";

    /* optional tags */
    dyStringClear(dy);
    bamUnpackAux(bam, dy);
    sam->tagTypeVals = lmCloneStringZ(lm, dy->string, dy->stringSize);

    slAddHead(&helper->samList, sam);
    return 0;
}
/*
 * Read the next record from `samio` into `sam`.
 *
 * Any record previously held in sam->b is destroyed and replaced by a fresh
 * one.  If an end offset is configured (bam_end_vfo > 0) and the current
 * virtual file offset has reached it, reading stops.  On success the name,
 * the decoded bases and the converted qualities are stored in sam->name,
 * sam->seq and sam->qual, and reverse-strand records are reverse complemented.
 *
 * Returns 1 when a record was read, -1 otherwise.
 */
int32_t tmap_sam_io_read(tmap_sam_io_t *samio, tmap_sam_t *sam)
{
  char *name_str;
  int32_t idx, name_len, seq_len;

  if(NULL != sam->b) {
      bam_destroy1(sam->b);
  }
  sam->b = bam_init1();

  // check if we're past optional end bam virtual file offset
  if(samio->bam_end_vfo > 0) {
      BGZF* bgzf_fp = samio->fp->x.bam;
      if(bam_tell(bgzf_fp) >= samio->bam_end_vfo) {
          fprintf(stderr, "stopping at bam virtual file offset %lu\n", samio->bam_end_vfo);
          return -1;
      }
  }

  if(samread(samio->fp, sam->b) <= 0) {
      return -1;
  }

  // name
  name_str = bam1_qname(sam->b);
  name_len = strlen(name_str);
  tmap_sam_io_update_string(&sam->name, name_str, name_len);
  sam->name->s[name_len] = '\0';

  // seq and qual
  seq_len = sam->b->core.l_qseq;
  tmap_sam_io_update_string(&sam->seq, NULL, seq_len);
  tmap_sam_io_update_string(&sam->qual, (char*)bam1_qual(sam->b), seq_len);
  for(idx = 0; idx < seq_len; idx++) {
      sam->seq->s[idx] = bam_nt16_rev_table[bam1_seqi(bam1_seq(sam->b), idx)];
      sam->qual->s[idx] = QUAL2CHAR(sam->qual->s[idx]);
  }
  sam->seq->s[seq_len] = sam->qual->s[seq_len] = '\0';

  // reverse compliment if necessary
  if((sam->b->core.flag & BAM_FREVERSE)) {
      tmap_sam_reverse_compliment(sam);
  }
  return 1;
}
/* callback for bam_fetch() */
/*
 * Print one record to stdout as "name<TAB>sequence<TAB>qualities\n".  Bases
 * are decoded through bam_nt16_rev_table and each quality byte is shifted by
 * 33.
 *
 * Returns 0 on success.  Both scratch buffers are allocated before anything
 * is printed; if either allocation fails a message goes to stderr, nothing is
 * written to stdout and -1 is returned (the allocations used to be used
 * unchecked).
 */
static int fetch_func(const bam1_t *b)
{
    const bam1_core_t *c = &b->core;
    int i;
    char *read_name = (char*) bam1_qname(b);
    char *s = (char*) bam1_seq(b);
    char *t = (char*) bam1_qual(b);
    char *read_seq = (char*) malloc(c->l_qseq + 1);
    char *read_qual = (char*) malloc(c->l_qseq + 1);

    if (read_seq == NULL || read_qual == NULL) {
        fprintf(stderr, "fetch_func: out of memory\n");
        free(read_seq);
        free(read_qual);
        return -1;
    }

    printf("%s\t", read_name);

    for (i = 0; i < c->l_qseq; i++)
        read_seq[i] = bam_nt16_rev_table[bam1_seqi(s, i)];
    read_seq[i] = 0;
    printf("%s\t", read_seq);

    for (i = 0; i < c->l_qseq; i++)
        read_qual[i] = t[i] + 33;
    read_qual[i] = 0;
    printf("%s\n", read_qual);

    free(read_seq);
    free(read_qual);
    return 0;
}
static void bamPairDetails(const bam1_t *leftBam, const bam1_t *rightBam) /* Print out details for paired-end reads. */ { if (leftBam && rightBam) { const bam1_core_t *leftCore = &leftBam->core, *rightCore = &rightBam->core; int leftLength = bamGetTargetLength(leftBam), rightLength = bamGetTargetLength(rightBam); int start = min(leftCore->pos, rightCore->pos); int end = max(leftCore->pos+leftLength, rightCore->pos+rightLength); char *itemName = bam1_qname(leftBam); printf("<B>Paired read name:</B> %s<BR>\n", itemName); printPosOnChrom(seqName, start, end, NULL, FALSE, itemName); puts("<P>"); } showOverlap(leftBam, rightBam); printf("<TABLE><TR><TD valign=top><H4>Left end read</H4>\n"); singleBamDetails(leftBam); printf("</TD><TD valign=top><H4>Right end read</H4>\n"); singleBamDetails(rightBam); printf("</TD></TR></TABLE>\n"); }
/*
 * Copy to stdout every record of the BAM file `fn` whose query name (with
 * length l_qname) yields exactly 1 from get_hash_table(T, ...).
 *
 * The input header text is written to stdout first, then the selected
 * records are written through a samfile opened on "-" in mode "w".  Progress
 * is reported on stderr every 1,000,000 records.  The process exits with
 * status 1 if either file cannot be opened.
 */
void filter_by_id(const char* fn, hash_table* T)
{
    fprintf(stderr, "filtering ... \n");

    samfile_t* fin = samopen(fn, "rb", NULL);
    if (fin == NULL) {
        fprintf(stderr, "can't open bam file %s\n", fn);
        exit(1);
    }

    samfile_t* fout = samopen("-", "w", (void*)fin->header);
    if (fout == NULL) {
        fprintf(stderr, "can't open stdout, for some reason.\n");
        exit(1);
    }

    fputs(fin->header->text, stdout);

    bam1_t* b = bam_init1();
    uint32_t n = 0;

    while (samread(fin, b) >= 0) {
        if (++n % 1000000 == 0) {
            /* n is unsigned: print it with an unsigned conversion */
            fprintf(stderr, "\t%u reads\n", (unsigned)n);
        }
        if (get_hash_table(T, bam1_qname(b), b->core.l_qname) == 1) {
            samwrite(fout, b);
        }
    }

    bam_destroy1(b);
    samclose(fout);
    samclose(fin);

    fprintf(stderr, "done.\n");
}
static int fetch_func(const bam1_t *b, void *data) { fetch_data_t *d = (fetch_data_t*) data; char *name = bam1_qname(b); //check if name is requested here if(!d->requestedTranscripts->empty()) { //we're doing transcript filtering if(d->requestedTranscripts->find(name) == d->requestedTranscripts->end()) { //transcript wasn't requested return 0; } } fprintf(stderr,"%s\n",name); //TODO Desparately need some error checking on flag retrieval char *status = bam_aux2Z(bam_aux_get(b,"YT")); int length = bam_aux2i(bam_aux_get(b,"HI")); YTranscript* transcript; YTranscriptSubStructure structure; structure.position = b->core.pos + 1; structure.length = b->core.l_qseq; structure.ordinal = bam_aux2i(bam_aux_get(b,"HI")); if(d->transcriptNames.find(name,&transcript)) { //then we've already found some part of this transcript transcript->orderedStructures.push_back(structure); } else { transcript = new YTranscript; char *tidName = d->in->header->target_name[b->core.tid]; char *refName = new char[ strlen(tidName) + 1]; strcpy(refName, tidName); char *transcriptName = new char[ strlen(name) + 1]; strcpy(transcriptName, name); char *flagGeneName = bam_aux2Z(bam_aux_get(b,"YG")); char *geneName = new char[ strlen(flagGeneName) + 1]; strcpy(geneName, flagGeneName); char *statusName = new char[ strlen(status + 1) ]; strcpy(statusName, status); transcript->gene = geneName; transcript->name = transcriptName; transcript->refName = refName; transcript->status = statusName; transcript->orderedStructures.push_back(structure); transcript->strand = b->core.flag & BAM_FREVERSE ? -1 : 1; transcript->totalNumberOfStructures = bam_aux2i(bam_aux_get(b, "IH")); transcript->length = length; d->transcriptNames.insert(name,transcript); d->transcripts->push_back(transcript); } return 0; }
// Query name of the wrapped record as a std::string.
// Asserts that a record is attached.
std::string getName() const
{
    assert(m_dataPtr);
    const char* qname = bam1_qname(m_dataPtr);
    return std::string(qname);
}
/* Read one pair from a bam file.  Returns 1 if we got a singleton, 2 if
 * we got a pair, 0 if we reached EOF, -1 if something outside our
 * control went wrong, -2 if we got something unexpected (missing mate,
 * fragment with unexpected PE flags).
 *
 * `pair` is zeroed first.  A record without BAM_FPAIRED is returned as a
 * singleton.  For a paired record the next record is read as its mate:
 *   - same name, READ1/READ2 flags consistent: returned as a pair (swapped
 *     into READ1, READ2 order if necessary);
 *   - same name, inconsistent flags: with allow_broken the flags are forced
 *     to READ1 / READ2, otherwise -2;
 *   - different name: the first record is discarded; with allow_broken the
 *     second record becomes the new first one and reading continues,
 *     otherwise -2.
 * Hitting EOF while waiting for a mate yields 0 with allow_broken, else -2.
 *
 * Fixes relative to the previous version: on the name-mismatch path
 * bwa_seq[0].aln was freed, and freed a second time when allow_broken was 0;
 * with allow_broken set, the stale n_aln / aln pair could also be freed again
 * on a later EOF.  The alignment array is now freed once and the fields are
 * reset. */
static int read_bam_pair_core(bwa_seqio_t *bs, bam_pair_t *pair, int allow_broken)
{
    static int num_wrong_pair = 128 ;

    memset(pair, 0, sizeof(bam_pair_t)) ;
    if (bam_read1(bs->fp, &pair->bam_rec[0]) < 0) return 0 ;

    while(1) {
        uint32_t flag1, flag2 ;

        if (!(pair->bam_rec[0].core.flag & BAM_FPAIRED)) {
            // singleton read
            pair->kind = singleton ;
            return 1 ;
        }

        // paired read, get another
        if (bam_read1(bs->fp, &pair->bam_rec[1]) < 0) {
            fprintf( stderr, "[read_bam_pair] got a paired read and hit EOF.\n" ) ;
            free(pair->bam_rec[0].data);
            if(pair->bwa_seq[0].n_aln) free(pair->bwa_seq[0].aln);
            pair->bwa_seq[0].n_aln = 0 ;
            pair->bwa_seq[0].aln = 0 ;
            return allow_broken ? 0 : -2 ;
        }

        flag1 = pair->bam_rec[0].core.flag & (BAM_FPAIRED|BAM_FREAD1|BAM_FREAD2);
        flag2 = pair->bam_rec[1].core.flag & (BAM_FPAIRED|BAM_FREAD1|BAM_FREAD2);

        if (!strcmp(bam1_qname(&pair->bam_rec[0]), bam1_qname(&pair->bam_rec[1]))) {
            // actual mates
            if( flag1 == (BAM_FPAIRED|BAM_FREAD1) && flag2 == (BAM_FPAIRED|BAM_FREAD2) ) {
                // correct order
                pair->kind = proper_pair ;
                return 2 ;
            }
            if (flag2 == (BAM_FPAIRED|BAM_FREAD1) && flag1 == (BAM_FPAIRED|BAM_FREAD2) ) {
                // reverse order
                memswap(&pair->bam_rec[0], &pair->bam_rec[1], sizeof(bam1_t));
                pair->kind = proper_pair ;
                return 2 ;
            }

            fprintf( stderr, "[read_bam_pair] got a pair, but the flags are wrong (%s).\n", bam1_qname(&pair->bam_rec[0]) ) ;
            if( !allow_broken ) return -2 ;

            // force the records into a READ1 / READ2 pair
            pair->bam_rec[0].core.flag &= ~BAM_FREAD2;
            pair->bam_rec[0].core.flag |= BAM_FPAIRED|BAM_FREAD1;
            pair->bam_rec[1].core.flag &= ~BAM_FREAD1;
            pair->bam_rec[1].core.flag |= BAM_FPAIRED|BAM_FREAD2;
            pair->kind = proper_pair ;
            return 2 ;
        }

        // This is arguably wrong, we discard a lone mate.  But what else could we do?  Buffering it
        // somewhere to way is too hard for the time being, returning it as a single means we need to buffer the
        // next one.  Not very appealing.  So only two options remain: discard it or bail out.
        if( num_wrong_pair ) {
            fprintf( stderr, "[read_bam_pair] got two reads, but the names don't match (%s,%s).\n",
                     bam1_qname(&pair->bam_rec[0]), bam1_qname(&pair->bam_rec[1]) ) ;
            --num_wrong_pair ;
            if( !num_wrong_pair )
                fprintf( stderr, "[read_bam_pair] too many mismatched names, not reporting anymore.\n" ) ;
        }

        try_get_sai( bs->sai, flag1 & BAM_FREAD1 ? 1 : 2, &pair->bwa_seq[0].n_aln, &pair->bwa_seq[0].aln ) ;
        free(pair->bam_rec[0].data);
        if(pair->bwa_seq[0].n_aln) free(pair->bwa_seq[0].aln);
        // freed exactly once; reset so no later path frees it again
        pair->bwa_seq[0].n_aln = 0 ;
        pair->bwa_seq[0].aln = 0 ;

        if( !allow_broken ) {
            free(pair->bam_rec[1].data);
            return -2 ;
        }

        // the second record becomes the candidate first mate
        memmove(&pair->bam_rec[0], &pair->bam_rec[1], sizeof(bam1_t));
        memset(&pair->bam_rec[1], 0, sizeof(bam1_t));
    }
}
void mapper( char *ref, int length, int start_base_pos, const char *bam ) { anal_t input; gzFile pRef; kseq_t * seq = NULL; char chr[8] = { 0, }; int ret; bam_plbuf_t *buf; bam1_t *b; /* fprintf( stderr, "ref: %s\n", ref ); fprintf( stderr, "length: %d\n", length ); fprintf( stderr, "start_base_pos: %d\n", start_base_pos ); fprintf( stderr, "bam: %s\n", bam ); */ input.beg = 0; input.end = 0x7fffffff; input.in = samopen(bam, "rb", 0); if (input.in == 0) { fprintf(stderr, "Fail to open BAM file %s\n", bam); return; } pRef = gzopen( ref, "r" ); fprintf( stderr, "ref : %s\n", ref ); fprintf( stderr, "pRef: %p\n", pRef ); if( pRef == NULL ) { fprintf( stderr, "ref : %s\n", ref ); fprintf( stderr, "pRef: %p\n", pRef ); return; } seq = kseq_init( pRef ); b = bam_init1(); // alloc memory size of bam1_t //fprintf( stderr, "%\pn", b ); buf = bam_plbuf_init(pileup_func, &input); // alloc memory bam_plbuf_set_mask(buf, -1); while ((ret = samread( input.in, b)) >= 0) { bam_plbuf_push(b, buf); //fprintf( stderr, "%x\n", b->core.flag ); if( b->core.flag & 0x0004 ) // unmapped { // do nothing /* qname1 = strtok(bam1_qname(b), ":\t\n "); qname2 = strtok(NULL, ":\t\n "); qname3 = atoi(qname2); fprintf( stderr, "%s:%10d:%s:%d\t%c:%d:%d:%d\n", qname1, qname3, "*", b->core.pos, '*', b->core.flag, b->core.qual, ret ); */ fprintf( stdout, "%s:%s:%d\t%c:0x%x:%d:%d\n", bam1_qname(b), "*", b->core.pos+1, '*', b->core.flag, b->core.qual, ret ); /* fprintf( stderr, "%s:%s:%d\t%c:0x%x:%d:%d\n", bam1_qname(b), "*", b->core.pos, '*', b->core.flag, b->core.qual, ret ); */ } else { // to find a base in the reference genome, seq. 
if( ( seq != NULL ) && ( strcmp( input.in->header->target_name[b->core.tid], chr ) == 0 ) ) { // already found that // fprintf( stderr, "found : %s\n", chr ); }else { if( find_chr(input.in->header->target_name[b->core.tid], seq, chr) < 0 ) { fprintf( stderr, "ERROR : cannot find chromosome %s\n", \ input.in->header->target_name[b->core.tid] ); }else { fprintf( stderr, "FOUND CHR : %s\n", chr ); } } // remove not aligned to the chromosome fprintf( stdout, "%s:%s:%d\t%c:%d:%d:%d\n", bam1_qname(b), input.in->header->target_name[b->core.tid], b->core.pos+1, seq->seq.s[b->core.pos], b->core.flag, b->core.qual, ret ); /* fprintf( stderr, "%s:%s:%d\t%c:%d:%d:%d\n", bam1_qname(b), input.in->header->target_name[b->core.tid], b->core.pos, seq->seq.s[b->core.pos], b->core.flag, b->core.qual, ret ); */ } } // for the last bases... // printf("pos:%d(%c), flag:%d qual: %d(ret %d)\n", // b->core.pos+1, seq->seq.s[b->core.pos], b->core.flag, b->core.qual, ret ); bam_plbuf_push(0, buf); bam_plbuf_destroy(buf); // release memory bam_destroy1(b); // release memory size of bam1_t samclose(input.in); kseq_destroy( seq ); gzclose( pRef ); return; }