// Returns 0 on success, -1 on failure. static int sync_mq_mc(bam1_t* src, bam1_t* dest) { if ( (src->core.flag & BAM_FUNMAP) == 0 ) { // If mapped // Copy Mate Mapping Quality uint32_t mq = src->core.qual; uint8_t* data; if ((data = bam_aux_get(dest,"MQ")) != NULL) { bam_aux_del(dest, data); } bam_aux_append(dest, "MQ", 'i', sizeof(uint32_t), (uint8_t*)&mq); } // Copy mate cigar if either read is mapped if ( (src->core.flag & BAM_FUNMAP) == 0 || (dest->core.flag & BAM_FUNMAP) == 0 ) { uint8_t* data_mc; if ((data_mc = bam_aux_get(dest,"MC")) != NULL) { bam_aux_del(dest, data_mc); } // Convert cigar to string kstring_t mc = { 0, 0, NULL }; if (bam_format_cigar(src, &mc) < 0) return -1; bam_aux_append(dest, "MC", 'Z', ks_len(&mc)+1, (uint8_t*)ks_str(&mc)); free(mc.s); } return 0; }
/*
 * Build the template cigar ("ct" tag) for two records that are assumed to
 * belong to the same template, and attach it to whichever record has the
 * smaller coordinate.
 *
 * The text is assembled in str (its previous content is discarded).  Nothing
 * is written to either record when the two are on different references, lack
 * a reference or a position, or when either one is flagged unmapped.
 */
static void bam_template_cigar(bam1_t *b1, bam1_t *b2, kstring_t *str)
{
    bam1_t *left = b1, *right = b2;
    uint32_t *ops;
    uint8_t *stale;
    int k, left_end;

    str->l = 0;

    // coordinateless or not on the same chr; skip
    if (b1->core.tid != b2->core.tid || b1->core.tid < 0)
        return;
    if (b1->core.pos < 0 || b2->core.pos < 0)
        return;
    if ((b1->core.flag & BAM_FUNMAP) || (b2->core.flag & BAM_FUNMAP))
        return;

    // "left" is always the record with the smaller coordinate
    if (b1->core.pos > b2->core.pos) {
        left = b2;
        right = b1;
    }

    // first segment: index, strand, cigar
    kputc((left->core.flag & BAM_FREAD1) ? '1' : '2', str);
    kputc((left->core.flag & BAM_FREVERSE) ? 'R' : 'F', str);
    ops = bam_get_cigar(left);
    k = 0;
    while (k < left->core.n_cigar) {
        kputw(bam_cigar_oplen(ops[k]), str);
        kputc(bam_cigar_opchr(ops[k]), str);
        ++k;
    }

    // distance from the end of the first segment to the start of the second
    left_end = bam_endpos(left);
    kputw(right->core.pos - left_end, str);
    kputc('T', str);

    // second segment: index, strand, cigar
    kputc((right->core.flag & BAM_FREAD1) ? '1' : '2', str);
    kputc((right->core.flag & BAM_FREVERSE) ? 'R' : 'F', str);
    ops = bam_get_cigar(right);
    k = 0;
    while (k < right->core.n_cigar) {
        kputw(bam_cigar_oplen(ops[k]), str);
        kputc(bam_cigar_opchr(ops[k]), str);
        ++k;
    }

    // drop any previous tag from both records, then tag the leftmost one
    stale = bam_aux_get(left, "ct");
    if (stale != NULL)
        bam_aux_del(left, stale);
    stale = bam_aux_get(right, "ct");
    if (stale != NULL)
        bam_aux_del(right, stale);

    bam_aux_append(left, "ct", 'Z', str->l+1, (uint8_t*)str->s);
}
/*
 * Build a template cigar for two records and append it as a "CT" tag to the
 * record with the smaller coordinate.
 *
 * The text is assembled in str (its previous content is discarded).  Nothing
 * is appended when the records are on different references or have no
 * reference.  An existing "CT" tag is not removed first.
 */
void bam_template_cigar(bam1_t *b1, bam1_t *b2, kstring_t *str)
{
    bam1_t *left = b1, *right = b2;
    uint32_t *ops;
    int k, left_end;

    str->l = 0;

    // coordinateless or not on the same chr; skip
    if (b1->core.tid != b2->core.tid || b1->core.tid < 0)
        return;

    // "left" is always the record with the smaller coordinate
    if (b1->core.pos > b2->core.pos) {
        left = b2;
        right = b1;
    }

    // first segment: index, strand, cigar
    kputc((left->core.flag & BAM_FREAD1) ? '1' : '2', str);
    kputc((left->core.flag & BAM_FREVERSE) ? 'R' : 'F', str);
    ops = bam1_cigar(left);
    k = 0;
    while (k < left->core.n_cigar) {
        kputw(bam_cigar_oplen(ops[k]), str);
        kputc(bam_cigar_opchr(ops[k]), str);
        ++k;
    }

    // distance from the end of the first segment to the start of the second
    left_end = bam_calend(&left->core, ops);
    kputw(right->core.pos - left_end, str);
    kputc('T', str);

    // second segment: index, strand, cigar
    kputc((right->core.flag & BAM_FREAD1) ? '1' : '2', str);
    kputc((right->core.flag & BAM_FREVERSE) ? 'R' : 'F', str);
    ops = bam1_cigar(right);
    k = 0;
    while (k < right->core.n_cigar) {
        kputw(bam_cigar_oplen(ops[k]), str);
        kputc(bam_cigar_opchr(ops[k]), str);
        ++k;
    }

    bam_aux_append(left, "CT", 'Z', str->l+1, (uint8_t*)str->s);
}
/// Append \p svStr to \p bamRead as an "SA" string ('Z') aux tag.
///
/// The stored payload includes the terminating NUL of the string.
void
addSupplementaryAlignmentEvidence(
    bam_record& bamRead,
    const std::string& svStr)
{
    // Two-character tag name; deliberately not NUL-terminated.
    static const char svtag[] = {'S','A'};

    const std::string::size_type payloadSize(svStr.size()+1);
    const uint8_t* payload((const uint8_t*)(svStr.c_str()));

    bam_aux_append(bamRead.get_data(), svtag, 'Z', payloadSize, payload);
}
/*
 * Tag file_read with the read group state->rg_id, but only when the record
 * does not already carry an "RG" tag; existing tags are left untouched.
 *
 * The id is handed to bam_aux_append() directly: no temporary copy is needed,
 * which also removes the unchecked strdup() result that used to be passed on
 * as the tag payload.
 */
static void orphan_only_func(const state_t* state, bam1_t* file_read)
{
    int len = strlen(state->rg_id)+1;   // payload includes the terminating NUL

    // If the old exists don't do anything
    if (bam_aux_get(file_read, "RG") != NULL) {
        return;
    }
    bam_aux_append(file_read, "RG", 'Z', len, (uint8_t*)state->rg_id);
}
/*
 * Record src's mapping quality on dest as the mate mapping quality ("MQ").
 * Does nothing when src is flagged unmapped; otherwise any existing "MQ" tag
 * on dest is replaced.
 */
static void sync_mq(bam1_t* src, bam1_t* dest)
{
    uint32_t mate_mapq;
    uint8_t *previous;

    if (src->core.flag & BAM_FUNMAP)
        return;     // unmapped mate: nothing to copy

    mate_mapq = src->core.qual;

    previous = bam_aux_get(dest, "MQ");
    if (previous != NULL)
        bam_aux_del(dest, previous);

    bam_aux_append(dest, "MQ", 'i', sizeof(uint32_t), (uint8_t*)&mate_mapq);
}
/*
 * Force the read group of file_read to state->rg_id: any existing "RG" tag is
 * deleted and a new one is appended.
 *
 * The id is handed to bam_aux_append() directly: no temporary copy is needed,
 * which also removes the unchecked strdup() result that used to be passed on
 * as the tag payload.
 */
static void overwrite_all_func(const state_t* state, bam1_t* file_read)
{
    int len = strlen(state->rg_id)+1;   // payload includes the terminating NUL

    // If the old exists delete it
    uint8_t* old = bam_aux_get(file_read, "RG");
    if (old != NULL) {
        bam_aux_del(file_read, old);
    }

    bam_aux_append(file_read, "RG", 'Z', len, (uint8_t*)state->rg_id);
}
/*
 * Rewrite the identifiers of record b using the translation table tbl:
 *
 *  - tid and mtid are mapped through tbl->tid_trans when they are >= 0;
 *  - an "RG" tag is replaced by its entry in tbl->rg_trans;
 *  - a "PG" tag is replaced by its entry in tbl->pg_trans.
 *
 * A tag whose value has no entry in the corresponding table is reported on
 * pysamerr and removed from the record.
 */
static void bam_translate(bam1_t* b, trans_tbl_t* tbl)
{
    uint8_t *aux;

    // Update target id if not unmapped tid
    if (b->core.tid >= 0)
        b->core.tid = tbl->tid_trans[b->core.tid];
    if (b->core.mtid >= 0)
        b->core.mtid = tbl->tid_trans[b->core.mtid];

    // If we have a RG update it
    aux = bam_aux_get(b, "RG");
    if (aux != NULL) {
        char *old_id = bam_aux2Z(aux);
        khiter_t it = kh_get(c2c, tbl->rg_trans, old_id);
        if (it == kh_end(tbl->rg_trans)) {
            // report before deleting: old_id points into the record's data
            fprintf(pysamerr, "[bam_translate] RG tag \"%s\" on read \"%s\" encountered with no corresponding entry in header, tag lost\n",decoded_rg_placeholder_unused(old_id), bam_get_qname(b));
            bam_aux_del(b, aux);
        } else {
            char *new_id = kh_value(tbl->rg_trans, it);
            bam_aux_del(b, aux);
            bam_aux_append(b, "RG", 'Z', strlen(new_id) + 1, (uint8_t*)new_id);
        }
    }

    // If we have a PG update it (looked up only now: the RG edit may have
    // changed the record's aux data)
    aux = bam_aux_get(b, "PG");
    if (aux != NULL) {
        char *old_id = bam_aux2Z(aux);
        khiter_t it = kh_get(c2c, tbl->pg_trans, old_id);
        if (it == kh_end(tbl->pg_trans)) {
            fprintf(pysamerr, "[bam_translate] PG tag \"%s\" on read \"%s\" encountered with no corresponding entry in header, tag lost\n",old_id, bam_get_qname(b));
            bam_aux_del(b, aux);
        } else {
            char *new_id = kh_value(tbl->pg_trans, it);
            bam_aux_del(b, aux);
            bam_aux_append(b, "PG", 'Z', strlen(new_id) + 1, (uint8_t*)new_id);
        }
    }
}
#define decoded_rg_placeholder_unused(x) (x)
/*
 * Per-read callback: give read b uniform indel qualities and write it out.
 *
 * data points to a data_t_uniform.  The BI_TAG string is set to l_qseq copies
 * of tmp->iq and the BD_TAG string to l_qseq copies of tmp->dq (existing tags
 * of either name are replaced), then the record is written to tmp->out.
 *
 * Returns 0 on success.  Returns -1, without modifying or writing the record,
 * if the scratch buffer cannot be allocated.
 */
static int uniform_fetch_func(bam1_t *b, void *data)
{
    data_t_uniform *tmp = (data_t_uniform*)data;
    bam1_core_t *c = &b->core;
    uint8_t *to_delete;
    char *qbuf;

    /* One scratch buffer serves both tags: bam_aux_append() copies the
     * payload into the record, so it can be refilled after the first append. */
    qbuf = malloc((c->l_qseq+1) * sizeof(char));
    if (qbuf == NULL) {
        return -1;
    }
    qbuf[c->l_qseq] = '\0';

    memset(qbuf, tmp->iq, c->l_qseq);
    to_delete = bam_aux_get(b, BI_TAG);
    if (to_delete) {
        bam_aux_del(b, to_delete);
    }
    bam_aux_append(b, BI_TAG, 'Z', c->l_qseq+1, (uint8_t*) qbuf);

    memset(qbuf, tmp->dq, c->l_qseq);
    to_delete = bam_aux_get(b, BD_TAG);
    if (to_delete) {
        bam_aux_del(b, to_delete);
    }
    bam_aux_append(b, BD_TAG, 'Z', c->l_qseq+1, (uint8_t*) qbuf);

    bam_write1(tmp->out, b);

    free(qbuf);
    return 0;
}
/*
 * Store the score computed by calc_mate_score(src) on dest as an "ms" tag,
 * replacing any "ms" tag already present.
 *
 * Returns 0 on success, -1 if bam_aux_append() reports failure (-1).
 */
static int add_mate_score(bam1_t *src, bam1_t *dest)
{
    uint32_t score = calc_mate_score(src);
    uint8_t *stale = bam_aux_get(dest, "ms");

    if (stale != NULL)
        bam_aux_del(dest, stale);

    return (bam_aux_append(dest, "ms", 'i', sizeof(uint32_t), (uint8_t*)&score) == -1) ? -1 : 0;
}
/*
 * Merge the n BAM files named in fn into one BAM written to out ("-" means
 * stdout), always emitting the record at the top of a heap holding one
 * pending record per input.
 *
 *   by_qname  stored in the global g_is_by_qname (presumably consulted by the
 *             heap comparison, which is defined elsewhere)
 *   headers   optional SAM file whose text header replaces the output header
 *   flag      MERGE_RG: tag each record with an RG derived from its file name
 *             MERGE_UNCOMP / MERGE_LEVEL1: override level with 0 / 1
 *   reg       optional region; when given, every input is read via its index
 *   level     requested compression level
 *
 * Returns 0 on success and -1 on the error paths below.
 *
 * NOTE(review): this signature is one arm of a preprocessor conditional whose
 * opening #if lies outside this excerpt; n_threads, used further down, is not
 * a parameter of this arm.
 * NOTE(review): most early "return -1" paths do not release fp, heap, iter,
 * RG, RG_len or the headers that were already read.
 */
int bam_merge_core2(int by_qname, const char *out, const char *headers, int n, char * const *fn, int flag, const char *reg, int level)
#endif
{
    bamFile fpout, *fp;
    heap1_t *heap;
    bam_header_t *hout = 0;
    bam_header_t *hheaders = NULL;
    int i, j, *RG_len = 0;
    uint64_t idx = 0;
    char **RG = 0, mode[8];
    bam_iter_t *iter = 0;

    // Optionally read the replacement text header up front.
    if (headers) {
        tamFile fpheaders = sam_open(headers);
        if (fpheaders == 0) {
            // capture strerror() before fprintf() can disturb errno
            const char *message = strerror(errno);
            fprintf(stderr, "[bam_merge_core] cannot open '%s': %s\n", headers, message);
            return -1;
        }
        hheaders = sam_header_read(fpheaders);
        sam_close(fpheaders);
    }

    g_is_by_qname = by_qname;
    // one file handle, heap slot and iterator per input
    fp = (bamFile*)calloc(n, sizeof(bamFile));
    heap = (heap1_t*)calloc(n, sizeof(heap1_t));
    iter = (bam_iter_t*)calloc(n, sizeof(bam_iter_t));
    // prepare RG tag
    if (flag & MERGE_RG) {
        RG = (char**)calloc(n, sizeof(void*));
        RG_len = (int*)calloc(n, sizeof(int));
        for (i = 0; i != n; ++i) {
            // RG id = file name without directory part and trailing ".bam"
            int l = strlen(fn[i]);
            const char *s = fn[i];
            if (l > 4 && strcmp(s + l - 4, ".bam") == 0) l -= 4;
            for (j = l - 1; j >= 0; --j) if (s[j] == '/') break;
            ++j; l -= j;
            // calloc(l + 1) leaves the byte after the strncpy'd text as NUL
            RG[i] = calloc(l + 1, 1);
            RG_len[i] = l;
            strncpy(RG[i], s + j, l);
        }
    }
    // read the first
    for (i = 0; i != n; ++i) {
        bam_header_t *hin;
        fp[i] = bam_open(fn[i], "r");
        if (fp[i] == 0) {
            int j;
            fprintf(stderr, "[bam_merge_core] fail to open file %s\n", fn[i]);
            for (j = 0; j < i; ++j) bam_close(fp[j]);
            free(fp); free(heap);
            // FIXME: possible memory leak
            return -1;
        }
        hin = bam_header_read(fp[i]);
        if (i == 0) { // the first BAM
            hout = hin;
        } else { // validate the headers of the remaining BAM files
            // compare target names over the prefix both headers share
            int min_n_targets = hout->n_targets;
            if (hin->n_targets < min_n_targets) min_n_targets = hin->n_targets;

            for (j = 0; j < min_n_targets; ++j)
                if (strcmp(hout->target_name[j], hin->target_name[j]) != 0) {
                    fprintf(stderr, "[bam_merge_core] different target sequence name: '%s' != '%s' in file '%s'\n",
                            hout->target_name[j], hin->target_name[j], fn[i]);
                    return -1;
                }

            // If this input file has additional target reference sequences,
            // add them to the headers to be output
            if (hin->n_targets > hout->n_targets) {
                swap_header_targets(hout, hin);
                // FIXME Possibly we should also create @SQ text headers
                // for the newly added reference sequences
            }

            bam_header_destroy(hin);
        }
    }

    if (hheaders) {
        // If the text headers to be swapped in include any @SQ headers,
        // check that they are consistent with the existing binary list
        // of reference information.
        if (hheaders->n_targets > 0) {
            // with a region given, mismatches are reported but not fatal
            if (hout->n_targets != hheaders->n_targets) {
                fprintf(stderr, "[bam_merge_core] number of @SQ headers in '%s' differs from number of target sequences\n", headers);
                if (!reg) return -1;
            }
            // NOTE(review): this loop runs to hout->n_targets even when the
            // counts differ and reg is set; confirm hheaders->target_name
            // cannot be indexed past its end here.
            for (j = 0; j < hout->n_targets; ++j)
                if (strcmp(hout->target_name[j], hheaders->target_name[j]) != 0) {
                    fprintf(stderr, "[bam_merge_core] @SQ header '%s' in '%s' differs from target sequence\n", hheaders->target_name[j], headers);
                    if (!reg) return -1;
                }
        }

        swap_header_text(hout, hheaders);
        bam_header_destroy(hheaders);
    }

    if (reg) {
        int tid, beg, end;
        if (bam_parse_region(hout, reg, &tid, &beg, &end) < 0) {
            fprintf(stderr, "[%s] Malformated region string or undefined reference name\n", __func__);
            return -1;
        }
        // build one region iterator per input; the index is released right
        // after the iterator has been created
        // NOTE(review): the result of bam_index_load() is not checked.
        for (i = 0; i < n; ++i) {
            bam_index_t *idx;
            idx = bam_index_load(fn[i]);
            iter[i] = bam_iter_query(idx, tid, beg, end);
            bam_index_destroy(idx);
        }
    }

    // Prime the heap with the first record of every input.
    for (i = 0; i < n; ++i) {
        heap1_t *h = heap + i;
        h->i = i;
        h->b = (bam1_t*)calloc(1, sizeof(bam1_t));
        if (bam_iter_read(fp[i], iter[i], h->b) >= 0) {
            // sort key: tid in the upper 32 bits, then (pos+1)<<1 | strand
            h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)((int32_t)h->b->core.pos+1)<<1 | bam1_strand(h->b);
            // running counter recorded per record as it is read
            h->idx = idx++;
        }
        else h->pos = HEAP_EMPTY;
    }
    if (flag & MERGE_UNCOMP) level = 0;
    else if (flag & MERGE_LEVEL1) level = 1;
    strcpy(mode, "w");
    if (level >= 0) sprintf(mode + 1, "%d", level < 9? level : 9);
    // NOTE(review): mode is built above, but the literal "w" is what gets
    // passed to bam_open()/bam_dopen() below; confirm that is intended.
    if ((fpout = strcmp(out, "-")? bam_open(out, "w") : bam_dopen(fileno(stdout), "w")) == 0) {
        fprintf(stderr, "[%s] fail to create the output file.\n", __func__);
        return -1;
    }
    bam_header_write(fpout, hout);
    bam_header_destroy(hout);
#ifndef _PBGZF_USE
    if (!(flag & MERGE_UNCOMP)) bgzf_mt(fpout, n_threads, 256);
#endif

    ks_heapmake(heap, n, heap);
    // Emit the heap's top record, refill that slot from the same input, and
    // restore heap order, until the top slot is empty.
    while (heap->pos != HEAP_EMPTY) {
        bam1_t *b = heap->b;
        if (flag & MERGE_RG) {
            // replace any RG tag with the one derived from the file name
            uint8_t *rg = bam_aux_get(b, "RG");
            if (rg) bam_aux_del(b, rg);
            bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]);
        }
        bam_write1_core(fpout, &b->core, b->data_len, b->data);
        if ((j = bam_iter_read(fp[heap->i], iter[heap->i], b)) >= 0) {
            heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam1_strand(b);
            heap->idx = idx++;
        } else if (j == -1) {
            // this input is exhausted: retire its slot and free its record
            heap->pos = HEAP_EMPTY;
            free(heap->b->data); free(heap->b);
            heap->b = 0;
        } else fprintf(stderr, "[bam_merge_core] '%s' is truncated. Continue anyway.\n", fn[heap->i]);
        ks_heapadjust(heap, 0, n, heap);
    }

    if (flag & MERGE_RG) {
        for (i = 0; i != n; ++i) free(RG[i]);
        free(RG); free(RG_len);
    }
    for (i = 0; i != n; ++i) {
        bam_iter_destroy(iter[i]);
        bam_close(fp[i]);
    }
    bam_close(fpout);
    free(fp); free(heap); free(iter);
    return 0;
}
/*
 * Per-read callback: attach homopolymer-derived indel qualities to read b as
 * BI_TAG and BD_TAG (the same string is used for both) and write the read to
 * tmp->out.
 *
 * data points to a data_t_dindel which caches, for the reference currently
 * being processed (tmp->tid), its length (tmp->rlen) and the array filled in
 * by find_homopolymers() (tmp->hpcount).
 *
 * Reads matching BAM_DEF_MASK are written unchanged.  Aligned bases (M, =, X)
 * get a quality looked up in DINDELQ from hpcount at the following reference
 * position; inserted and soft-clipped bases get DINDELQ[0].
 *
 * Always returns 0.  Terminates the process if the reference sequence cannot
 * be fetched, the homopolymer array cannot be allocated, or an unsupported
 * cigar operation is met.
 */
static int dindel_fetch_func(bam1_t *b, void *data)
{
    data_t_dindel *tmp = (data_t_dindel*)data;
    bam1_core_t *c = &b->core;
    uint8_t *to_delete;

    /* don't change reads failing default mask: BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP */
    if (c->flag & BAM_DEF_MASK) {
        bam_write1(tmp->out, b);
        return 0;
    }

    /* get the reference sequence and compute homopolymer array */
    if (tmp->tid != c->tid) {
        int fetched_len = 0;
        int rlen;
        char *ref = fai_fetch(tmp->fai, tmp->in->header->target_name[c->tid], &fetched_len);
        if (ref == NULL) {
            LOG_FATAL("couldn't fetch reference sequence %s\n", tmp->in->header->target_name[c->tid]);
            exit(1);
        }
        strtoupper(ref);/* safeguard */
        rlen = strlen(ref);
        tmp->tid = c->tid;
        free(tmp->hpcount);
        tmp->hpcount = (int*)malloc(rlen*sizeof(int));
        if (tmp->hpcount == NULL && rlen > 0) {
            LOG_FATAL("out of memory while processing reference sequence %s\n", tmp->in->header->target_name[c->tid]);
            exit(1);
        }
        find_homopolymers(ref, tmp->hpcount, rlen);
        free(ref);
        tmp->rlen = rlen;
    }

    /* parse the cigar string */
    uint32_t *cigar = bam1_cigar(b);
    /* NOTE(review): only the entries reached while walking the cigar are
     * written; if the cigar covers fewer than l_qseq query bases the tail of
     * this buffer is appended uninitialised. */
    uint8_t indelq[c->l_qseq+1];
    int i;
    int x = c->pos; /* coordinate on reference */
    int y = 0; /* coordinate on query */
    for (i = 0; i < c->n_cigar; ++i) {
        int j, oplen = cigar[i]>>4, op = cigar[i]&0xf;
        if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
            for (j = 0; j < oplen; j++) {
                /* near the reference end, or for counts above 18, fall back
                 * to DINDELQ[0] */
                indelq[y] = (x > tmp->rlen-2) ? DINDELQ[0] : (tmp->hpcount[x+1]>18 ? DINDELQ[0] : DINDELQ[tmp->hpcount[x+1]]);
                x++;
                y++;
            }
        } else if (op == BAM_CHARD_CLIP) {
            /* do nothing */
        } else if (op == BAM_CDEL) {
            x += oplen;
        } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) {
            for (j = 0; j < oplen; j++) {
                indelq[y] = DINDELQ[0];
                y++;
            }
        } else {
            LOG_FATAL("unknown op %d for read %s\n", op, bam1_qname(b));/* FIXME skip? seen this somewhere else properly handled */
            exit(1);
        }
    }
    indelq[y] = '\0';

    to_delete = bam_aux_get(b, BI_TAG);
    if (to_delete) {
        bam_aux_del(b, to_delete);
    }
    bam_aux_append(b, BI_TAG, 'Z', c->l_qseq+1, indelq);

    to_delete = bam_aux_get(b, BD_TAG);
    if (to_delete) {
        bam_aux_del(b, to_delete);
    }
    bam_aux_append(b, BD_TAG, 'Z', c->l_qseq+1, indelq);

    bam_write1(tmp->out, b);
    return 0;
}
// Per-record callback: strip runs of N bases from both ends of read b.
//
// data points to an opts_t that supplies a scratch buffer (op->data, grown
// with op->resize), the skip_all_ns flag and min_trimmed_len.  The record is
// rebuilt in the scratch buffer (name, cigar, packed sequence, qualities) and
// the two buffers are then swapped.  The number of bases removed is recorded
// in the "NS" (start) and "NE" (end) integer tags, and the "PV" and "FA"
// array tags, when present, are cut down to the surviving positions.
//
// Returns 0 normally; the result has skip_all_ns or'ed in when the read looks
// like it is all N, and bit 0 set when the trimmed read is shorter than
// op->min_trimmed_len.
static int trim_ns(bam1_t *b, void *data) {
    int ret = 0;
    opts_t *op((opts_t *)data);
    // Snapshot of the aux block; it is copied back after the rebuild.
    std::vector<uint8_t> aux(bam_get_aux(b), bam_get_aux(b) + bam_get_l_aux(b));
    int tmp;
    uint8_t *const seq(bam_get_seq(b));
    uint32_t *const cigar(bam_get_cigar(b));
    //op->n_cigar = b->core.n_cigar;
    op->resize(b->l_data); // Make sure it's big enough to hold everything.
    // The read name goes first in the rebuilt record.
    memcpy(op->data, b->data, b->core.l_qname);

    // Get #Ns at the beginning
    // NOTE(review): neither this scan nor the one from the end is bounded by
    // l_qseq / 0; confirm what happens for an all-N or empty read.
    for(tmp = 0; bam_seqi(seq, tmp) == dlib::htseq::HTS_N; ++tmp);
    const int n_start(tmp);

    // NOTE(review): compares against l_qseq - 1, not l_qseq - confirm.
    if(tmp == b->core.l_qseq - 1) // all bases are N -- garbage read
        ret |= op->skip_all_ns;

    // Get #Ns at the end
    for(tmp = b->core.l_qseq - 1; bam_seqi(seq, tmp) == dlib::htseq::HTS_N; --tmp);
    const int n_end(b->core.l_qseq - 1 - tmp);

    // Get new length for read
    int final_len(b->core.l_qseq - n_end - n_start);
    if(final_len < 0) final_len = 0;
    if(final_len < op->min_trimmed_len) // Too short.
        ret |= 1;
    // Copy in qual and all of aux.

    // Shrink the last cigar operation by the number of bases trimmed at the
    // end, or drop it when nothing of it remains.
    // NOTE(review): the shortened operation is re-encoded as a soft clip
    // whatever its original type was - presumably the N runs are expected to
    // be soft-clipped already; confirm.
    if(n_end) {
        if((tmp = bam_cigar_oplen(cigar[b->core.n_cigar - 1]) - n_end) == 0) {
            LOG_DEBUG("Entire cigar operation is the softclip. Decrease the number of new cigar operations.\n");
            --b->core.n_cigar;
        } else {
            LOG_DEBUG("Updating second cigar operation in-place.\n");
            cigar[b->core.n_cigar - 1] = bam_cigar_gen(tmp, BAM_CSOFT_CLIP);
        }
    }

    // Get new n_cigar.
    // Same treatment for the first cigar operation, then copy the resulting
    // cigar behind the name in the scratch buffer.
    if((tmp = bam_cigar_oplen(*cigar) - n_start) == 0) {
        memcpy(op->data + b->core.l_qname, cigar + 1, (--b->core.n_cigar) << 2); // << 2: each cigar op is a 4-byte uint32_t
    } else {
        if(n_start) *cigar = bam_cigar_gen(tmp, BAM_CSOFT_CLIP);
        memcpy(op->data + b->core.l_qname, cigar, b->core.n_cigar << 2);
    }
    uint8_t *opseq(op->data + b->core.l_qname + (b->core.n_cigar << 2)); // Pointer to the seq region of new data field.
    // Re-pack the kept bases two per byte, starting at base n_start.
    for(tmp = 0; tmp < final_len >> 1; ++tmp)
        opseq[tmp] = (bam_seqi(seq, ((tmp << 1) + n_start)) << 4) | (bam_seqi(seq, (tmp << 1) + n_start + 1));
    // Odd length: the last base occupies the high nibble only.
    if(final_len & 1) opseq[tmp] = (bam_seqi(seq, ((tmp << 1) + n_start)) << 4);

    tmp = bam_get_l_aux(b);
    // Copies final_len quality values plus tmp further bytes from the old
    // record; the aux area is overwritten from the snapshot below anyway.
    memcpy(opseq + ((final_len + 1) >> 1), bam_get_qual(b) + n_start, final_len + tmp);
    // Switch data strings
    std::swap(op->data, b->data);
    b->core.l_qseq = final_len;
    // Restore the aux snapshot at its new offset and fix the record length.
    memcpy(bam_get_aux(b), aux.data(), aux.size());
    b->l_data = (bam_get_aux(b) - b->data) + aux.size();
    if(n_end) bam_aux_append(b, "NE", 'i', sizeof(int), (uint8_t *)&n_end);
    if(n_start) bam_aux_append(b, "NS", 'i', sizeof(int), (uint8_t *)&n_start);
    const uint32_t *pvar((uint32_t *)dlib::array_tag(b, "PV"));
    // Offset into the per-base arrays: n_end for reverse-flagged reads,
    // n_start otherwise.
    tmp = b->core.flag & BAM_FREVERSE ? n_end: n_start;
    if(pvar) {
        // Copy the surviving slice out first: the tag is deleted before the
        // shortened array is appended.
        std::vector<uint32_t>pvals(pvar + tmp, pvar + final_len + tmp);
        // presumably "- 6" steps back from the array payload to the pointer
        // bam_aux_del expects - verify against dlib::array_tag
        bam_aux_del(b, (uint8_t *)(pvar) - 6);
        dlib::bam_aux_array_append(b, "PV", 'I', sizeof(uint32_t), final_len, (uint8_t *)pvals.data());
    }
    const uint32_t *fvar((uint32_t *)dlib::array_tag(b, "FA"));
    if(fvar) {
        std::vector<uint32_t>fvals(fvar + tmp, fvar + final_len + tmp);
        bam_aux_del(b, (uint8_t *)(fvar) - 6);
        dlib::bam_aux_array_append(b, "FA", 'I', sizeof(uint32_t), final_len, (uint8_t *)fvals.data());
    }
    return ret;
}
// TODO soft clipping
/*
 * Build a new BAM record describing the alignment that ends at heap node
 * sw_node_best_i, following the prev_i links of the heap nodes.
 *
 *   bam_old          record being re-aligned; destroyed before returning,
 *                    unless sw_node_best_i < 0, in which case it is returned
 *                    unchanged
 *   rg_id            not referenced in this function body
 *   heap             alignment heap holding the node chain
 *   space            SRMA_SPACE_CS enables the color-space handling ("XE" tag,
 *                    qualities derived from color qualities)
 *   colors,
 *   color_qualities  color string and its qualities; read only when space is
 *                    SRMA_SPACE_CS
 *   strand           1 is handled as the reverse strand (per the inline
 *                    comments), 0 as forward
 *   correct_bases    when 1, bases that differ from bam_old get a penalised
 *                    quality and the original sequence and qualities are kept
 *                    in the "XO" and "XQ" tags
 *
 * Returns the newly allocated record (or bam_old, see above).
 */
bam1_t *sw_align_update_bam(bam1_t *bam_old, char *rg_id, sw_heap_t *heap, int32_t sw_node_best_i, uint8_t space, char *colors, char *color_qualities, uint8_t strand, uint8_t correct_bases)
{
    bam1_t *bam_new=NULL;
    int32_t sw_node_cur_i=-1, sw_node_prev_i=-1;
    int32_t i;
    int32_t cigar_cur_op, cigar_prev_op;
    int32_t cigar_cur_length, cigar_prev_length;
    uint32_t read_index;
    char *color_errors = NULL;

    if(sw_node_best_i < 0) { // none found, do not modify alignment
        return bam_old;
    }

    bam_new = srma_calloc(1, sizeof(bam1_t), __func__, "bam_new");

    // The node chain is walked once; bases are written front-to-back when
    // strand is 1 and back-to-front otherwise.
    if(1 == strand) {
        read_index = 0;
    }
    else {
        read_index = bam_old->core.l_qseq-1;
    }

    // The record's data buffer is grown section by section, in BAM layout
    // order: name, cigar, sequence, qualities, aux.
    { // query name
        bam_new->core.l_qname = bam_old->core.l_qname;
        bam_new->data_len += bam_new->core.l_qname;
        sw_align_bam_alloc_data(bam_new, bam_new->data_len);
        memcpy(bam1_qname(bam_new), bam1_qname(bam_old), bam_old->core.l_qname);
    }
    { // flag
        bam_new->core.flag = bam_old->core.flag;
    }
    { // tid, pos, qual
        bam_new->core.tid = heap->nodes[sw_node_best_i].node->contig-1; // it is one-based, we want zero-based
        if(1 == strand) { // reverse strand
            bam_new->core.pos = heap->nodes[sw_node_best_i].node->position-1;
        }
        else {
            bam_new->core.pos = heap->nodes[sw_node_best_i].start_position-1; // zero-based
        }
        bam_new->core.qual = bam_old->core.qual; // should we change the mapping quality?
        // mate information is cleared on the new record
        bam_new->core.mtid = -1;
        bam_new->core.mpos = -1;
        bam_new->core.isize = 0;
    }
    { // cigar length
        // First pass over the chain: count the runs of equal operations so
        // that the cigar can be sized before it is filled in.
        bam_new->core.n_cigar = 0;
        cigar_cur_op = cigar_prev_op = -1;
        sw_node_cur_i = sw_node_best_i;
        while(0 <= sw_node_cur_i) {
            // NOTE(review): this branch requires cigar_prev_op to already be
            // BAM_CDEL, and only this branch ever yields BAM_CDEL, so as
            // written it looks unreachable - confirm the intended condition.
            if(0 <= sw_node_prev_i
                    && BAM_CDEL == cigar_prev_op
                    && 1 < fabs(heap->nodes[sw_node_cur_i].node->position - heap->nodes[sw_node_prev_i].node->position)) {
                cigar_cur_op = BAM_CDEL;
            }
            else {
                switch(__node_type(heap->nodes[sw_node_cur_i].node)) {
                    case NODE_MATCH:
                    case NODE_MISMATCH:
                        cigar_cur_op = BAM_CMATCH;
                        break;
                    case NODE_INSERTION:
                        cigar_cur_op = BAM_CINS;
                        break;
                    default:
                        srma_error(__func__, "unknown node type", Exit, OutOfRange);
                }
            }
            if(cigar_prev_op != cigar_cur_op) {
                // update the previous cigar operator
                cigar_prev_op = cigar_cur_op;
                bam_new->core.n_cigar++;
            }
            // Update
            // a deletion does not consume a node, so only advance otherwise
            if(BAM_CDEL != cigar_cur_op) {
                sw_node_prev_i = sw_node_cur_i;
                sw_node_cur_i = heap->nodes[sw_node_cur_i].prev_i;
            }
        }
    }
    { // cigar and seq
        uint32_t *cigar_ptr=NULL;
        uint8_t *seq_ptr=NULL;
        uint32_t cigar_i = 0;
        // cigar
        bam_new->data_len += bam_new->core.n_cigar*sizeof(uint32_t);
        sw_align_bam_alloc_data(bam_new, bam_new->data_len);
        cigar_ptr = bam1_cigar(bam_new);
        // seq
        bam_new->core.l_qseq = bam_old->core.l_qseq;
        bam_new->data_len += (bam_new->core.l_qseq + 1)/2;
        sw_align_bam_alloc_data(bam_new, bam_new->data_len);
        seq_ptr = bam1_seq(bam_new);
        // fill in cigar and seq
        // Second pass: cigar slots are filled from the last slot backwards
        // when strand is 1, from the first slot forwards otherwise.
        // NOTE(review): sw_node_prev_i is not reset to -1 before this pass.
        cigar_i = (1 == strand) ? bam_new->core.n_cigar-1 : 0;
        cigar_cur_op = cigar_prev_op = -1;
        cigar_cur_length = cigar_prev_length = -1;
        sw_node_cur_i = sw_node_best_i;
        while(0 <= sw_node_cur_i) {
            if(0 <= sw_node_prev_i
                    && BAM_CDEL == cigar_prev_op
                    && 1 < fabs(heap->nodes[sw_node_cur_i].node->position - heap->nodes[sw_node_prev_i].node->position)) {
                cigar_cur_op = BAM_CDEL;
            }
            else {
                switch(__node_type(heap->nodes[sw_node_cur_i].node)) {
                    case NODE_MATCH:
                    case NODE_MISMATCH:
                        cigar_cur_op = BAM_CMATCH;
                        break;
                    case NODE_INSERTION:
                        cigar_cur_op = BAM_CINS;
                        break;
                    default:
                        srma_error(__func__, "unknown node type", Exit, OutOfRange);
                }
                // pack sequence
                // Two bases share a byte: clear the byte the first time it is
                // touched in the current direction of travel, then OR the
                // 4-bit base into the high (even index) or low (odd) nibble.
                if(1 == strand && 0 == read_index%2) {
                    seq_ptr[read_index/2] = 0;
                }
                else if(0 == strand && 1 == read_index%2) {
                    seq_ptr[read_index/2] = 0;
                }
                // DEBUG
                /*
                   fprintf(stderr, "read_index=%d base=%d\n", read_index, __node_base(heap->nodes[sw_node_cur_i].node));
                   */
                seq_ptr[read_index/2] |= int_to_nt4bit[__node_base(heap->nodes[sw_node_cur_i].node)] << 4*(1-(read_index%2));
                if(1 == strand) {
                    read_index++;
                }
                else {
                    read_index--;
                }
            }
            if(cigar_prev_op != cigar_cur_op) {
                // add the previous cigar operator
                if(-1 != cigar_prev_op) {
                    bam1_cigar(bam_new)[cigar_i] = (cigar_prev_length << BAM_CIGAR_SHIFT) | cigar_prev_op;
                    if(1 == strand) { // reverse strand
                        cigar_i--;
                    }
                    else {
                        cigar_i++;
                    }
                }

                // update the previous cigar operator
                cigar_prev_op = cigar_cur_op;
                if(cigar_cur_op == BAM_CDEL) {
                    // deletion length
                    // NOTE(review): both operands index sw_node_cur_i, so the
                    // difference is always 0 and the length -1; presumably
                    // one of them was meant to be sw_node_prev_i.
                    cigar_prev_length = (int)fabs(heap->nodes[sw_node_cur_i].node->position - heap->nodes[sw_node_cur_i].node->position) - 1;
                }
                else {
                    cigar_prev_length = 1;
                }
            }
            else {
                cigar_prev_length++;
            }
            // Update
            if(BAM_CDEL != cigar_cur_op) {
                sw_node_prev_i = sw_node_cur_i;
                sw_node_cur_i = heap->nodes[sw_node_cur_i].prev_i;
            }
        }
        // flush the run that was still open when the chain ended
        if(0 < cigar_prev_length) {
            if(-1 == cigar_prev_op || BAM_CDEL == cigar_prev_op) {
                srma_error(__func__, "Alignment ended with a null cigar or a deletion", Exit, OutOfRange);
            }
            bam1_cigar(bam_new)[cigar_i] = (cigar_prev_length << BAM_CIGAR_SHIFT) | cigar_prev_op;
            // DEBUG
            if(1 == strand) { // reverse strand
                assert(cigar_i == 0);
            }
            else {
                assert(cigar_i == bam_new->core.n_cigar-1);
            }
        }
    }
    { // qualities
        uint8_t *qual_ptr = NULL;
        char qual, q1, q2;
        uint8_t prev_base = 0, next_base;
        bam_new->data_len += bam_new->core.l_qseq;
        sw_align_bam_alloc_data(bam_new, bam_new->data_len);
        qual_ptr = bam1_qual(bam_new);
        if(space == SRMA_SPACE_CS) {
            // color_errors[i] is '-' where the color implied by two adjacent
            // bases of the new sequence equals the observed color, and the
            // observed color otherwise.
            color_errors = srma_malloc(sizeof(char)*(1 + bam_new->core.l_qseq), __func__, "color_errors");
            // colors[0] seeds the chain; colors[i+1] belongs to base i
            prev_base = nt2int_table[(int)colors[0]];
            for(i=0;i<bam_new->core.l_qseq;i++) {
                if(0 == strand) {
                    next_base = nt4bit_to_int[bam1_seqi(bam1_seq(bam_new), i)];
                }
                else {
                    // read from the far end and map 0..3 to 3..0
                    // (presumably complementing the base - confirm)
                    next_base = nt4bit_to_int[bam1_seqi(bam1_seq(bam_new), bam_new->core.l_qseq-i-1)];
                    if(next_base < 4) next_base = 3 - next_base;
                }
                if((prev_base ^ next_base) == nt2int_table[(int)colors[i+1]]) {
                    color_errors[i] = '-';
                }
                else {
                    color_errors[i] = colors[i+1];
                }
                prev_base = next_base;
            }
            color_errors[i]='\0';
            // Get new base qualities based on color qualities
            for(i=0;i<bam_new->core.l_qseq;i++) {
                // use MAQ 0.7.1 conversion
                if(i == bam_new->core.l_qseq-1) {
                    qual = srma_char2qual(color_qualities[i]);
                }
                else {
                    // m1/m2: whether the two colors used for this base
                    // agreed with the new sequence; q1/q2: their qualities
                    int m1, m2;
                    if(0 == strand) { // forward
                        m1 = ('-' == color_errors[i]) ? 1 : 0;
                        m2 = ('-' == color_errors[i+1]) ? 1 : 0;
                        q1 = color_qualities[i];
                        q2 = color_qualities[i+1];
                    }
                    else {
                        m1 = ('-' == color_errors[bam_new->core.l_qseq-i-1]) ? 1 : 0;
                        m2 = ('-' == color_errors[bam_new->core.l_qseq-i-2]) ? 1 : 0;
                        q1 = color_qualities[bam_new->core.l_qseq-i-1];
                        q2 = color_qualities[bam_new->core.l_qseq-i-2];
                    }
                    if(1 == m1 && 1 == m2) {
                        qual = srma_char2qual(q1) + srma_char2qual(q2) + 10;
                    }
                    else if(1 == m1) {
                        qual = srma_char2qual(q1) - srma_char2qual(q2);
                    }
                    else if(1 == m2) {
                        qual = srma_char2qual(q2) - srma_char2qual(q1);
                    }
                    else {
                        qual = 1;
                    }
                }
                if(0 == strand) {
                    bam1_qual(bam_new)[i] = __bound_qual(qual);
                }
                else {
                    bam1_qual(bam_new)[bam_new->core.l_qseq-i-1] = __bound_qual(qual);
                }
            }
        }
        else if(1 == correct_bases) {
            // Get new base qualities
            // unchanged bases keep their quality, changed bases are penalised
            for(i=0;i<bam_new->core.l_qseq;i++) {
                if(bam1_seqi(bam1_seq(bam_new), i) == bam1_seqi(bam1_seq(bam_old), i)) {
                    bam1_qual(bam_new)[i] = bam1_qual(bam_old)[i];
                }
                else {
                    qual = srma_char2qual(bam1_qual(bam_old)[i]) - 33;
                    bam1_qual(bam_new)[i] = srma_qual2char(__bound_qual(qual - SRMA_CORRECT_BASE_QUALITY_PENALTY));
                }
            }
        }
        else {
            // Copy old quality
            memcpy(bam1_qual(bam_new), bam1_qual(bam_old), bam_new->core.l_qseq);
        }
    }
    // TODO soft-clipping
    { // Add in any auxiliary data as necessary
        uint8_t *s;
        int32_t i = 0;
        bam_new->l_aux = 0;
        // carry over every tag listed in the NULL-terminated array
        // sw_align_save_tags (__copy_old is defined elsewhere; presumably it
        // uses s as scratch)
        while(NULL != sw_align_save_tags[i]) {
            __copy_old(sw_align_save_tags[i]);
            i++;
        }
        // TODO
        // PG
        // TODO: is AS correct
        bam_aux_append(bam_new, "AS", 'i', sizeof(uint32_t), (uint8_t*)&heap->nodes[sw_node_best_i].score);
        if(1 == correct_bases) {
            // keep the original bases ("XO") and the original qualities,
            // offset by 33 ("XQ")
            int32_t l = bam_old->core.l_qseq;
            char *str;
            str = srma_malloc(sizeof(char)*(l+1), __func__, "seq");
            for(i=0;i<l;i++) {
                str[i] = bam_nt16_rev_table[bam1_seqi(bam1_seq(bam_old), i)];
            }
            str[i] = '\0';
            bam_aux_append(bam_new, "XO", 'Z', l+1, (uint8_t*)str);
            for(i=0;i<l;i++) {
                str[i] = bam1_qual(bam_old)[i] + 33;
            }
            str[i] = '\0';
            bam_aux_append(bam_new, "XQ", 'Z', l+1, (uint8_t*)str);
            free(str);
        }
        bam_aux_append(bam_new, "XC", 'i', sizeof(uint32_t), (uint8_t*)&heap->nodes[sw_node_best_i].coverage_sum);
        if(space == SRMA_SPACE_CS) {
            bam_aux_append(bam_new, "XE", 'Z', bam_new->core.l_qseq+1, (uint8_t*)color_errors);
        }
    }

    // destroy the old bam structure
    bam_destroy1(bam_old);
    if(space == SRMA_SPACE_CS) {
        free(color_errors);
    }

    return bam_new;
}
// from bam_md.c in SAMtools // modified not fill in the NM tag, and not to start the reference a c->pos static void tmap_sam_md1_core(bam1_t *b, char *ref) { uint8_t *seq = bam1_seq(b); uint32_t *cigar = bam1_cigar(b); bam1_core_t *c = &b->core; int i, x, y, u = 0; kstring_t *str; uint8_t *old_md, *old_nm; int32_t old_nm_i=-1, nm=0; str = (kstring_t*)calloc(1, sizeof(kstring_t)); for (i = y = x = 0; i < c->n_cigar; ++i) { int j, l = cigar[i]>>4, op = cigar[i]&0xf; if (op == BAM_CMATCH) { for (j = 0; j < l; ++j) { int z = y + j; int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]]; if (ref[x+j] == 0) break; // out of boundary if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match ++u; } else { ksprintf(str, "%d", u); kputc(ref[x+j], str); u = 0; nm++; } } if (j < l) break; x += l; y += l; } else if (op == BAM_CDEL) { ksprintf(str, "%d", u); kputc('^', str); for (j = 0; j < l; ++j) { if (ref[x+j] == 0) break; kputc(ref[x+j], str); } u = 0; if (j < l) break; x += l; nm += l; } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) { y += l; if (op == BAM_CINS) nm += l; } else if (op == BAM_CREF_SKIP) { x += l; } } ksprintf(str, "%d", u); // update MD old_md = bam_aux_get(b, "MD"); if(NULL == old_md) { bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); } else { int is_diff = 0; if(strlen((char*)old_md+1) == str->l) { for(i = 0; i < str->l; ++i) { if(toupper(old_md[i+1]) != toupper(str->s[i])) { break; } } if(i < str->l) { is_diff = 1; } } else { is_diff = 1; } if(1 == is_diff) { bam_aux_del(b, old_md); bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s); } } // update NM old_nm = bam_aux_get(b, "NM"); if(NULL != old_nm) { old_nm_i = bam_aux2i(old_nm); if(old_nm_i != nm) { bam_aux_del(b, old_nm); bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); } } free(str->s); free(str); }
/*
 * Save the current first character of the tag named by key in a new "OS"
 * ('A') tag and overwrite it with new_strand.
 *
 * bam_aux_append() may reallocate the record's data buffer, so the old value
 * is copied to a local before the append and the tag is looked up again
 * afterwards; no pointer into the record is kept across the call.
 */
static void bsstrand_relabel(bam1_t *b, char key[2], char new_strand)
{
    unsigned char *tag = bam_aux_get(b, key);
    unsigned char old_strand;

    if (tag == NULL) return;
    old_strand = tag[1];    /* tag[0] is the type byte */
    bam_aux_append(b, "OS", 'A', 1, &old_strand);

    tag = bam_aux_get(b, key);
    if (tag) tag[1] = (unsigned char)new_strand;
}

/*
 * Per-read callback: check or infer the bisulfite strand of read b.
 *
 * C>T and G>A differences between the read and the reference are counted over
 * the aligned (M) bases.  If the read carries a "ZS" tag, its first character
 * is changed when the counts contradict it ('?' when both conversions occur
 * and similarity() exceeds 0.5, otherwise flipped between '+' and '-'); the
 * previous value is kept in an "OS" tag.  If there is no "ZS" tag and
 * conf->infer_bsstrand is set, a "ZS" tag is inferred from the counts and the
 * read's orientation.  Depending on conf->output_read / output_all_read a
 * report line is printed to stdout.
 *
 * The read is written to out when out is not NULL, and the counters in
 * data (a bsstrand_data_t) are updated.  Always returns 0.
 *
 * NOTE(review): cigar operations other than M, I, D and S (e.g. H, N, =, X)
 * abort the process - confirm this is intended.
 */
int bsstrand_func(bam1_t *b, const samfile_t *in, samfile_t *out, void *data) {

    bsstrand_data_t *d = (bsstrand_data_t*)data;
    bsstrand_conf_t *conf = d->conf;
    const bam1_core_t *c = &b->core;

    if (c->flag & BAM_FUNMAP){
        if (out) samwrite(out, b);
        d->n_unmapped++;
        return 0;
    }

    fetch_refseq(d->rs, in->header->target_name[c->tid], c->pos, c->pos+1);
    uint32_t rpos=c->pos+1, qpos=0;
    int i, nC2T = 0, nG2A = 0;
    uint32_t j;
    char rbase, qbase;

    /* count conversions over the aligned bases */
    for (i=0; i<c->n_cigar; ++i) {
        uint32_t op = bam_cigar_op(bam1_cigar(b)[i]);
        uint32_t oplen = bam_cigar_oplen(bam1_cigar(b)[i]);
        switch(op) {
        case BAM_CMATCH:
            for(j=0; j<oplen; ++j) {
                rbase = toupper(getbase_refseq(d->rs, rpos+j));
                qbase = bscall(bam1_seq(b), qpos+j);
                if (rbase == 'C' && qbase == 'T') nC2T += 1;
                if (rbase == 'G' && qbase == 'A') nG2A += 1;
            }
            rpos += oplen;
            qpos += oplen;
            break;
        case BAM_CINS:
            qpos += oplen;
            break;
        case BAM_CDEL:
            rpos += oplen;
            break;
        case BAM_CSOFT_CLIP:
            qpos += oplen;
            break;
        default:
            fprintf(stderr, "Unknown cigar, %u\n", op);
            abort();
        }
    }

    char key[2] = {'Z','S'};
    unsigned char *bsstrand = bam_aux_get(b, key);
    if (bsstrand) {
        bsstrand++;     /* skip the type byte */
        double s = similarity(nG2A, nC2T);
        const char *verdict = NULL;     /* first column of the report line */
        char new_strand = 0;

        if (nG2A > 1 && nC2T > 1 && s > 0.5) {
            verdict = "F";
            new_strand = '?';
        } else if (*bsstrand == '+' && nG2A > nC2T + 2) {
            verdict = "W2C";
            new_strand = '-';
        } else if (*bsstrand == '-' && nC2T > nG2A + 2) {
            verdict = "C2W";
            new_strand = '+';
        }

        if (verdict) {
            /* report with the old value first; bsstrand is not used again
             * once the record has been modified */
            if (conf->output_read || conf->output_all_read)
                printf("%s\t%s\t%d\t%d\t%d\t%s\t%s\t%1.2f\n", verdict, in->header->target_name[c->tid], c->pos, nC2T, nG2A, bam1_qname(b), bsstrand, s);
            bsstrand_relabel(b, key, new_strand);
            if (new_strand == '?') d->n_fail++;
            else d->n_corr++;
        } else if (conf->output_all_read) {
            printf("N\t%s\t%d\t%d\t%d\t%s\t%s\t%1.2f\n", in->header->target_name[c->tid], c->pos, nC2T, nG2A, bam1_qname(b), bsstrand, s);
        }
    } else if (!(c->flag & BAM_FUNMAP) && conf->infer_bsstrand) {
        char bss[3];
        if (similarity(nG2A, nC2T) < 0.5) {
            strcpy(bss, "??");
        } else if (nC2T > nG2A) {
            strcpy(bss, c->flag & BAM_FREVERSE ? "+-" : "++");
        } else {
            strcpy(bss, c->flag & BAM_FREVERSE ? "-+" : "--");
        }
        bam_aux_append(b, "ZS", 'Z', 3, (uint8_t*) bss);
    }

    if (out) samwrite(out, b);
    d->n_mapped++;

    return 0;
}
int do_grep() { #ifdef DEBUGa printf("[!]do_grep\n"); #endif BamInfo_t *pbam; kh_cstr_t BamID; khiter_t ki, bami; kstring_t ks1 = { 0, 0, NULL }; kstring_t ks2 = { 0, 0, NULL }; kstring_t ks3 = { 0, 0, NULL }; kstring_t kstr = { 0, 0, NULL }; //ksprintf(kstr, "%s/%s_grep/", myConfig.WorkDir, myConfig.ProjectID); //const char *filePrefix = strdup(ks_str(kstr)); //kputs(myConfig.WorkDir,kstr); samFile *in; bam_hdr_t *h; hts_idx_t *idx; bam1_t *b, *d, *d2, *bR1, *bR2, *bR3; bR1 = bam_init1(); bR2 = bam_init1(); bR3 = bam_init1(); htsFile *out; //hts_opt *in_opts = NULL, *out_opts = NULL; int r = 0, exit_code = 0; kvec_t(bam1_t) R1, R2, RV; pierCluster_t *pierCluster; //samdat_t tmp_samdat; #ifdef DEBUGa kstr.l = 0; ksprintf(&kstr, "%s/%s_grep/Greped.dump", myConfig.WorkDir, myConfig.ProjectID); FILE *fsdump = fopen(ks_str(&kstr),"w"); #endif kstr.l = 0; ksprintf(&kstr, "%s/%s_grep/Greped.ini", myConfig.WorkDir, myConfig.ProjectID); FILE *fs = fopen(ks_str(&kstr),"w"); uint32_t blockid = 0; for (bami = kh_begin(bamNFOp); bami != kh_end(bamNFOp); ++bami) { //printf(">[%d]:\n",bami); if (kh_exist(bamNFOp, bami)) { kv_init(R1); kv_init(R2); kv_init(RV); //tmp_samdat = (const samdat_t){ 0 }; //memset(&tmp_samdat,0,sizeof(samdat_t)); //printf("-[%d]:\n",bami); BamID = kh_key(bamNFOp, bami); pbam = &kh_value(bamNFOp, bami); fprintf(stderr, "%u [%s]=%s\t%u %u\n",bami,BamID,pbam->fileName,pbam->insertSize,pbam->SD); in = sam_open(pbam->fileName, "r"); if (in == NULL) { fprintf(stderr, "[x]Error opening \"%s\"\n", pbam->fileName); return EXIT_FAILURE; } h = sam_hdr_read(in); kstr.l = 0; ksprintf(&kstr, "%s/%s_grep/%s.bam", myConfig.WorkDir, myConfig.ProjectID, BamID); out = hts_open(ks_str(&kstr), "wb"); if (out == NULL) { fprintf(stderr, "[x]Error opening [%s]\n",ks_str(&kstr)); return EXIT_FAILURE; } if (sam_hdr_write(out, h) < 0) { fprintf(stderr, "[!]Error writing output header.\n"); exit_code = 1; } int8_t *ChrIsHum; if (h == NULL) { fprintf(stderr, "[x]Couldn't read 
header for \"%s\"\n", pbam->fileName); return EXIT_FAILURE; } else { ChrIsHum = (int8_t *) malloc(h->n_targets * sizeof(int8_t)); for (int32_t i=0; i < h->n_targets; ++i) { //ChrIsHum[i] = -1; ki = kh_get(chrNFO, chrNFOp, h->target_name[i]); if (ki == kh_end(chrNFOp)) { errx(4,"[x]Cannot find ChrID for [%s] !",h->target_name[i]); } else { ChrInfo_t * tmp = &kh_value(chrNFOp, ki); ChrIsHum[i] = tmp->isHum; //printf(">>> %d Chr:%s %d\n",i,h->target_name[i],ChrIsHum[i]); } } } h->ignore_sam_err = 0; b = bam_init1(); d = bam_init1(); d2 = bam_init1(); if ((idx = sam_index_load(in, pbam->fileName)) == 0) { fprintf(stderr, "[E::%s] fail to load the BAM index\n", __func__); return 1; } pierCluster = sam_plp_init(); while ((r = sam_read1(in, h, b)) >= 0) { int8_t flag = false; const bam1_core_t *c = &b->core; if (c->qual < myConfig.minBamQual) { continue; } if (c->n_cigar) { uint32_t *cigar = bam_get_cigar(b); int i = c->n_cigar; --i; if ( (bam_cigar_opchr(cigar[0])=='S' && bam_cigar_oplen(cigar[0]) >= myConfig.minGrepSlen) || (bam_cigar_opchr(cigar[i])=='S' && bam_cigar_oplen(cigar[i]) >= myConfig.minGrepSlen) ) { flag = true; } /* We only need /\d+S/ on both terminal, NOT inside. for (int i = 0; i < c->n_cigar; ++i) { if (bam_cigar_opchr(cigar[i])=='S') { // soft clipping if ( bam_cigar_oplen(cigar[i]) >= myConfig.minGrepSlen ) { flag = true; } } } */ } if (flag && ChrIsHum[c->tid]) { // Now, skip Virus items. //bam_copy1(bR1, b); flag = 0; // recycle //int enoughMapQ = 0; //kstring_t ks = { 0, 0, NULL }; /*if (sam_format1(h, b, &ks1) < 0) { fprintf(stderr, "Error writing output.\n"); exit_code = 1; break; } else */if ((c->mtid == c->tid && ChrIsHum[c->tid]) || (ChrIsHum[c->tid] ^ ChrIsHum[c->mtid])) { // Only grep those mapped on same Human ChrID, or diff species/一方在病毒的情况. 
//printf(">[%s]\n",ks_str(&ks1)); flag |= 1; //tmp_samdat.b = bam_dup1(b); //kv_push(samdat_t,R1,tmp_samdat); /*if (checkMapQ(ChrIsHum, b, true)) { ++enoughMapQ; }*/ } if (getPairedSam(in, idx, b, d) != 0) { flag &= ~1; continue; } else { flag |= 2; /*if (checkMapQ(ChrIsHum, d, false)) { ++enoughMapQ; }*/ if (c->flag & BAM_FSECONDARY) { if (getPairedSam(in, idx, d, d2) == 0) { //sam_format1(h, d2, &ks3); flag |= 4; /*if (checkMapQ(ChrIsHum, d2, false)) { ++enoughMapQ; }*/ } } } /* 对于 BAM_FSECONDARY(256) 的 Read,跳两次 与 读 SA 项,效果一样。 >[sf95_Ref_48245009_48245108_48245208_Vir_-_2000_2044_R_100_90 353 chr2 13996555 0 50S40M chr18 48245109 0ACACAACAATGTTCCGGAGACTCTAAGGCCTCCCGATACAGAGCAGAGGCCACACACACACACACCATGGAATACTATTCAGCCAAAAAA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC NM:i:0 MD:Z:40 AS:i:40 XS:i:40 RG:Z:Fsimout_mB SA:Z:rgi|59585|emb|X04615.1|,2000,-,40S46M4S,60,0; YC:Z:CT YD:Z:f] -[sf95_Ref_48245009_48245108_48245208_Vir_-_2000_2044_R_100_90 177 chr18 48245109 9 40S50M gi|59585|emb|X04615.1|2000 0 GTTCCGGAGACTCTAAGGCCTCCCGATACAGAGCAGAGGCCACACACACACACACCATGGAATACTATTCAGCCAAAAAAAGGAATTCAA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC NM:i:0 MD:Z:50 AS:i:50 XS:i:46 RG:Z:Fsimout_mB SA:Z:rgi|59585|emb|X04615.1|,2000,+,50S40M,9,0; YC:Z:GA YD:Z:f] +[sf95_Ref_48245009_48245108_48245208_Vir_-_2000_2044_R_100_90 113 gi|59585|emb|X04615.1| 2000 60 40S46M4S chr18 48245109 0 TTTTTTGGCTGAATAGTATTCCATGGTGTGTGTGTGTGTGGCCTCTGCTCTGTATCGGGAGGCCTTAGAGTCTCCGGAACATTGTTGTGT CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC NM:i:0 MD:Z:46 AS:i:46 XS:i:27 RG:Z:Fsimout_mB SA:Z:fchr2,13996555,+,50S40M,0,0; YC:Z:CT YD:Z:r] */ /*if (sam_format1(h, d, &ks2) < 0) { fprintf(stderr, "Error writing output.\n"); exit_code = 1; break; }*/ if (((flag & 3) == 3) /*&& enoughMapQ >= myConfig.samples*/) { /*printf(">%d[%s]\n",checkMapQ(ChrIsHum, b, true),ks_str(&ks1)); 
printf("-%d[%s]\n",checkMapQ(ChrIsHum, d, false),ks_str(&ks2)); if (flag & 4) { printf("+%d[%s]\n",checkMapQ(ChrIsHum, d2, false),ks_str(&ks3)); } printf("<--%d\n",enoughMapQ);*/ bam_aux_append(b, "Zd", 'Z', 2, (uint8_t*)"H"); if (sam_plp_push(ChrIsHum, pierCluster, b) == 0) { //printf("--HumRange=%s:%d-%d\n", h->target_name[(pierCluster->HumanRange).tid], (pierCluster->HumanRange).pos, (pierCluster->HumanRange).endpos); if ((!ChrIsHum[(d->core).tid]) && (flag & 2)) { bam_aux_append(d, "Zd", 'Z', 2, (uint8_t*)"V"); sam_plp_push(ChrIsHum, pierCluster, d); } if ((!ChrIsHum[(d2->core).tid]) && (flag & 4)) { bam_aux_append(d2, "Zd", 'Z', 2, (uint8_t*)"V"); sam_plp_push(ChrIsHum, pierCluster, d2); } } else { ++blockid; //print #ifdef DEBUGa fprintf(fsdump,"[%u %s]\nHumRange=%s:%d-%d\n", blockid, BamID, h->target_name[(pierCluster->HumanRange).tid], (pierCluster->HumanRange).pos, (pierCluster->HumanRange).endpos); fprintf(fsdump,"VirRange=%s:%d-%d\n", h->target_name[(pierCluster->VirusRange).tid], (pierCluster->VirusRange).pos, (pierCluster->VirusRange).endpos); #endif fprintf(fs,"[%u]\nBamID=%s\nHumRange=%s:%d-%d\n",blockid, BamID, h->target_name[(pierCluster->HumanRange).tid], (pierCluster->HumanRange).pos, (pierCluster->HumanRange).endpos); if ( (pierCluster->VirusRange).pos == 0 && (pierCluster->VirusRange).endpos == 0 ) { fprintf(fs,"VirRange=NA\n"); } else { fprintf(fs,"VirRange=%s:%d-%d\n", h->target_name[(pierCluster->VirusRange).tid], (pierCluster->VirusRange).pos, (pierCluster->VirusRange).endpos); } for (size_t i=0; i<kv_size(pierCluster->Reads);++i) { bam1_t *bi = kv_A(pierCluster->Reads, i); bam_aux_append(bi, "Zc", 'i', sizeof(uint32_t), (uint8_t*)&blockid); #ifdef DEBUGa if (sam_format1(h, bi, &ks1) < 0) { fprintf(stderr, "Error writing output.\n"); exit_code = 1; break; } else { fprintf(fsdump,"%s\n",ks1.s); } #endif if (sam_write1(out, h, bi) < 0) { fprintf(stderr, "[x]Error writing output.\n"); exit_code = 1; break; } } #ifdef DEBUGa 
fprintf(fsdump,"\n"); printf("HumRange=%s:%d-%d\n", h->target_name[(pierCluster->HumanRange).tid], (pierCluster->HumanRange).pos, (pierCluster->HumanRange).endpos); //fflush(fs); #endif sam_plp_dectroy(pierCluster); pierCluster = sam_plp_init(); } } } //char *qname = bam_get_qname(b); } r = sam_close(out); // stdout can only be closed once if (r < 0) { fprintf(stderr, "Error closing output.\n"); exit_code = 1; } hts_idx_destroy(idx); bam_destroy1(b); bam_destroy1(d); bam_destroy1(d2); bam_hdr_destroy(h); r = sam_close(in); free(ChrIsHum); #ifdef DEBUGa //fflush(NULL); //pressAnyKey(); #endif sam_plp_dectroy(pierCluster); //printf("<[%d]:\n",bami); } } #ifdef DEBUGa fclose(fsdump); #endif fclose(fs); getPairedSam(NULL, NULL, NULL, NULL); // sam_close(fp2); //printf("---[%d]---\n",exit_code); bam_destroy1(bR1); bam_destroy1(bR2); bam_destroy1(bR3); ks_release(&ks1); ks_release(&ks2); ks_release(&ks3); ks_release(&kstr); //free((char*)filePrefix); return exit_code; }
int main(int argc, char *argv[]) { hashtable ht=new_hashtable(HASHSIZE); bamFile in,in2; bamFile out; if (argc != 3) { fprintf(stderr, "Usage: bam_fix_NH <in.bam> <out.bam>\n"); return 1; } // Open file and exit if error //in = strcmp(argv[1], "-")? bam_open(argv[1], "rb") : bam_dopen(fileno(stdin), "rb"); in = bam_open(argv[1], "rb"); out = strcmp(argv[2], "-")? bam_open(argv[2], "w") : bam_dopen(fileno(stdout), "w"); if (in == 0 ) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[1]); return 1; } if (out == 0) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[2]); return 1; } unsigned long num_alns=0; int ref; // *********** // Copy header bam_header_t *header; header = bam_header_read(in); bam_header_write(out,header); // sorted by name? // Should not rely on the value in SO bam1_t *aln=bam_init1(); bam1_t *prev=bam_init1(); printf("Hashing...\n");flush(stdout); while(bam_read1(in,aln)>=0) { // read alignment if (aln->core.tid < 0) continue;//ignore unaligned reads ++num_alns; new_read_aln(ht,bam1_qname(aln)); } bam_close(in); printf("Hashing complete (%lu alignments)\n",num_alns); printf("Memory used in the hash: %ld MB\n",index_mem/1024/1024); flush(stdout); // reopen in2 = bam_open(argv[1], "rb"); if (in2 == 0 ) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[1]); return 1; } header = bam_header_read(in2); while(bam_read1(in2,aln)>=0) { // read alignment if (aln->core.tid < 0) continue;//ignore unaligned reads ++num_alns; READ_ALN *r=get_read_aln(ht,bam1_qname(aln)); //assert(r!=NULL); // update the NH field uint8_t *old_nh = bam_aux_get(aln, "NH"); uint8_t nh=r->ctr; if (old_nh) { if (nh!=bam_aux2i(old_nh)) { fprintf(stderr,"warning: value mismatch! 
replacing>%s %d->%d\n",bam1_qname(aln),bam_aux2i(old_nh),nh); } bam_aux_del(aln, old_nh); bam_aux_append(aln, "NH", 'i', 4, (uint8_t*)&nh); } if (!old_nh) { // add NH bam_aux_append(aln, "NH", 'i', 4, (uint8_t*)&nh); #ifdef DEBUG printf("!>%s %d\n",bam1_qname(aln),bam_aux2i(old_nh)); #endif } // in->header // Also fix the XS:A tag // BAM_FREAD1 // BAM_FREAD2 // BAM_FREVERSE the read is mapped to the reverse strand //bam1_cigar(b) //BAM_CREF_SKIP 3 CIGAR skip on the reference (e.g. spliced alignment) //BAM_FREVERSE 16 the read is mapped to the reverse strand if (aln->core.flag & BAM_FSECONDARY) continue; // skip secondary alignments if (aln->core.flag & ! BAM_FPAIRED) continue; // not paired if (aln->core.flag & ! BAM_FPROPER_PAIR) continue; // not a proper pair if (aln->core.flag & ! BAM_FMUNMAP) continue; // the mate is mapped if (aln->core.flag & BAM_FSECONDARY) continue; // secundary read if (aln->core.flag & BAM_FREAD2) continue; // only count each pair once // core.strand == 0 (f/+) 1 r/- // flag // bam1_qname(b) bam_write1(out,aln); } // bam_destroy1(aln); bam_close(in2); bam_close(out); return 0; /* uint8_t *old_nm = bam_aux_get(b, "NM"); 90 if (c->flag & BAM_FUNMAP) return; 91 if (old_nm) old_nm_i = bam_aux2i(old_nm); 92 if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); 93 else if (nm != old_nm_i) { 94 fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam1_qname(b), old_nm_i, nm); 95 bam_aux_del(b, old_nm); 96 bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm); 97 } */ }
/*
 * Recompute the MD string and the NM count of alignment `b` against the
 * reference sequence `ref`, then apply the edits selected by `flag`.
 *
 * @param b        alignment record, modified in place
 * @param ref      reference bases, indexed by reference position; scanning
 *                 stops at position ref_len or at the first '\0'
 * @param ref_len  number of usable bases in ref
 * @param flag     bit set of:
 *                   USE_EQUAL  - clear the 4-bit code of every matching base
 *                   UPDATE_NM  - add or correct the NM:i tag
 *                   UPDATE_MD  - add or correct the MD:Z tag
 *                   DROP_TAG   - drop every tag except RG
 *                   BIN_QUAL   - replace qualities >= 3 by qual/10*10 + 7
 * @param max_nm   when > 0 and the computed NM is >= max_nm, matching bases
 *                 have their 4-bit code set to 0xf and their quality set to 0
 *
 * NM and MD are only written for mapped reads.  Mismatches with an existing
 * tag are reported on stderr before the tag is replaced.
 */
void bam_fillmd1_core(bam1_t *b, char *ref, int ref_len, int flag, int max_nm)
{
    uint8_t *seq = bam_get_seq(b);
    uint32_t *cigar = bam_get_cigar(b);
    bam1_core_t *c = &b->core;
    int i, x, y, u = 0;          /* x: reference position, y: query position, u: current match run */
    kstring_t md = { 0, 0, NULL };
    int32_t old_nm_i = -1, nm = 0;

    /* Pass 1: build the MD string and count the edit distance. */
    for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
        int j, l = cigar[i]>>4, op = cigar[i]&0xf;
        if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
            for (j = 0; j < l; ++j) {
                int c1, c2, z = y + j;
                if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds
                c1 = bam_seqi(seq, z);
                /* Index through unsigned char: a plain char may be negative. */
                c2 = seq_nt16_table[(unsigned char)ref[x+j]];
                if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
                    if (flag&USE_EQUAL) seq[z/2] &= (z&1)? 0xf0 : 0x0f;
                    ++u;
                } else {
                    kputw(u, &md);
                    kputc(ref[x+j], &md);
                    u = 0;
                    ++nm;
                }
            }
            if (j < l) break;
            x += l;
            y += l;
        } else if (op == BAM_CDEL) {
            kputw(u, &md);
            kputc('^', &md);
            for (j = 0; j < l; ++j) {
                if (x+j >= ref_len || ref[x+j] == '\0') break;
                kputc(ref[x+j], &md);
            }
            u = 0;
            x += j;
            nm += j;
            if (j < l) break;
        } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) {
            y += l;
            if (op == BAM_CINS) nm += l;
        } else if (op == BAM_CREF_SKIP) {
            x += l;
        }
    }
    kputw(u, &md);

    // apply max_nm
    if (max_nm > 0 && nm >= max_nm) {
        for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
            int j, l = cigar[i]>>4, op = cigar[i]&0xf;
            if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
                for (j = 0; j < l; ++j) {
                    int c1, c2, z = y + j;
                    if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds
                    c1 = bam_seqi(seq, z);
                    c2 = seq_nt16_table[(unsigned char)ref[x+j]];
                    if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
                        seq[z/2] |= (z&1)? 0x0f : 0xf0;
                        bam_get_qual(b)[z] = 0;
                    }
                }
                if (j < l) break;
                x += l;
                y += l;
            } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) {
                x += l;
            } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) {
                y += l;
            }
        }
    }

    // update NM
    if ((flag & UPDATE_NM) && !(c->flag & BAM_FUNMAP)) {
        uint8_t *old_nm = bam_aux_get(b, "NM");
        if (old_nm) old_nm_i = bam_aux2i(old_nm);
        if (!old_nm) {
            bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
        } else if (nm != old_nm_i) {
            fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam_get_qname(b), old_nm_i, nm);
            bam_aux_del(b, old_nm);
            bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
        }
    }

    // update MD
    if ((flag & UPDATE_MD) && !(c->flag & BAM_FUNMAP)) {
        if (md.s == NULL) {
            /* The kput* calls could not allocate the buffer; there is no MD to write. */
            fprintf(stderr, "[bam_fillmd1] out of memory, MD not updated for read '%s'\n", bam_get_qname(b));
        } else {
            uint8_t *old_md = bam_aux_get(b, "MD");
            if (!old_md) {
                bam_aux_append(b, "MD", 'Z', md.l + 1, (uint8_t*)md.s);
            } else {
                /* old_md[0] is the type byte; the string starts at old_md + 1.
                 * The comparison is case-insensitive. */
                int is_diff = 0;
                if (strlen((char*)old_md+1) == md.l) {
                    size_t k;
                    for (k = 0; k < md.l; ++k)
                        if (toupper(old_md[k+1]) != toupper((unsigned char)md.s[k])) break;
                    if (k < md.l) is_diff = 1;
                } else {
                    is_diff = 1;
                }
                if (is_diff) {
                    fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam_get_qname(b), (char*)old_md+1, md.s);
                    bam_aux_del(b, old_md);
                    bam_aux_append(b, "MD", 'Z', md.l + 1, (uint8_t*)md.s);
                }
            }
        }
    }

    // drop all tags but RG
    if (flag&DROP_TAG) {
        uint8_t *q = bam_aux_get(b, "RG");
        bam_aux_drop_other(b, q);
    }

    // reduce the resolution of base quality
    if (flag&BIN_QUAL) {
        uint8_t *qual = bam_get_qual(b);
        for (i = 0; i < b->core.l_qseq; ++i)
            if (qual[i] >= 3) qual[i] = qual[i]/10*10 + 7;
    }

    free(md.s);
}
/*! @abstract Merge multiple sorted BAM. @param is_by_qname whether to sort by query name @param out output BAM file name @param mode sam_open() mode to be used to create the final output file (overrides level settings from UNCOMP and LEVEL1 flags) @param headers name of SAM file from which to copy '@' header lines, or NULL to copy them from the first file to be merged @param n number of files to be merged @param fn names of files to be merged @param flag flags that control how the merge is undertaken @param reg region to merge @param n_threads number of threads to use (passed to htslib) @discussion Padding information may NOT correctly maintained. This function is NOT thread safe. */ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char *headers, int n, char * const *fn, int flag, const char *reg, int n_threads) { samFile *fpout, **fp; heap1_t *heap; bam_hdr_t *hout = NULL; int i, j, *RG_len = NULL; uint64_t idx = 0; char **RG = NULL; hts_itr_t **iter = NULL; bam_hdr_t **hdr = NULL; trans_tbl_t *translation_tbl = NULL; // Is there a specified pre-prepared header to use for output? 
if (headers) { samFile* fpheaders = sam_open(headers, "r"); if (fpheaders == NULL) { const char *message = strerror(errno); fprintf(pysamerr, "[bam_merge_core] cannot open '%s': %s\n", headers, message); return -1; } hout = sam_hdr_read(fpheaders); sam_close(fpheaders); } g_is_by_qname = by_qname; fp = (samFile**)calloc(n, sizeof(samFile*)); heap = (heap1_t*)calloc(n, sizeof(heap1_t)); iter = (hts_itr_t**)calloc(n, sizeof(hts_itr_t*)); hdr = (bam_hdr_t**)calloc(n, sizeof(bam_hdr_t*)); translation_tbl = (trans_tbl_t*)calloc(n, sizeof(trans_tbl_t)); // prepare RG tag from file names if (flag & MERGE_RG) { RG = (char**)calloc(n, sizeof(char*)); RG_len = (int*)calloc(n, sizeof(int)); for (i = 0; i != n; ++i) { int l = strlen(fn[i]); const char *s = fn[i]; if (l > 4 && strcmp(s + l - 4, ".bam") == 0) l -= 4; for (j = l - 1; j >= 0; --j) if (s[j] == '/') break; ++j; l -= j; RG[i] = (char*)calloc(l + 1, 1); RG_len[i] = l; strncpy(RG[i], s + j, l); } } // open and read the header from each file for (i = 0; i < n; ++i) { bam_hdr_t *hin; fp[i] = sam_open(fn[i], "r"); if (fp[i] == NULL) { int j; fprintf(pysamerr, "[bam_merge_core] fail to open file %s\n", fn[i]); for (j = 0; j < i; ++j) sam_close(fp[j]); free(fp); free(heap); // FIXME: possible memory leak return -1; } hin = sam_hdr_read(fp[i]); if (hout) trans_tbl_init(hout, hin, translation_tbl+i, flag & MERGE_COMBINE_RG, flag & MERGE_COMBINE_PG); else { // As yet, no headers to merge into... 
hout = bam_hdr_dup(hin); // ...so no need to translate header into itself trans_tbl_init(hout, hin, translation_tbl+i, true, true); } // TODO sam_itr_next() doesn't yet work for SAM files, // so for those keep the headers around for use with sam_read1() if (hts_get_format(fp[i])->format == sam) hdr[i] = hin; else { bam_hdr_destroy(hin); hdr[i] = NULL; } if ((translation_tbl+i)->lost_coord_sort && !by_qname) { fprintf(pysamerr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]); } } // Transform the header into standard form pretty_header(&hout->text,hout->l_text); // If we're only merging a specified region move our iters to start at that point if (reg) { int* rtrans = rtrans_build(n, hout->n_targets, translation_tbl); int tid, beg, end; const char *name_lim = hts_parse_reg(reg, &beg, &end); char *name = malloc(name_lim - reg + 1); memcpy(name, reg, name_lim - reg); name[name_lim - reg] = '\0'; tid = bam_name2id(hout, name); free(name); if (tid < 0) { fprintf(pysamerr, "[%s] Malformated region string or undefined reference name\n", __func__); return -1; } for (i = 0; i < n; ++i) { hts_idx_t *idx = sam_index_load(fp[i], fn[i]); // (rtrans[i*n+tid]) Look up what hout tid translates to in input tid space int mapped_tid = rtrans[i*hout->n_targets+tid]; if (mapped_tid != INT32_MIN) { iter[i] = sam_itr_queryi(idx, mapped_tid, beg, end); } else { iter[i] = sam_itr_queryi(idx, HTS_IDX_NONE, 0, 0); } hts_idx_destroy(idx); if (iter[i] == NULL) break; } free(rtrans); } else { for (i = 0; i < n; ++i) { if (hdr[i] == NULL) { iter[i] = sam_itr_queryi(NULL, HTS_IDX_REST, 0, 0); if (iter[i] == NULL) break; } else iter[i] = NULL; } } if (i < n) { fprintf(pysamerr, "[%s] Memory allocation failed\n", __func__); return -1; } // Load the first read from each file into the heap for (i = 0; i < n; ++i) { heap1_t *h = heap + i; h->i = i; h->b = bam_init1(); if ((iter[i]? 
sam_itr_next(fp[i], iter[i], h->b) : sam_read1(fp[i], hdr[i], h->b)) >= 0) { bam_translate(h->b, translation_tbl + i); h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)((int32_t)h->b->core.pos+1)<<1 | bam_is_rev(h->b); h->idx = idx++; } else { h->pos = HEAP_EMPTY; bam_destroy1(h->b); h->b = NULL; } } // Open output file and write header if ((fpout = sam_open(out, mode)) == 0) { fprintf(pysamerr, "[%s] fail to create the output file.\n", __func__); return -1; } sam_hdr_write(fpout, hout); if (!(flag & MERGE_UNCOMP)) hts_set_threads(fpout, n_threads); // Begin the actual merge ks_heapmake(heap, n, heap); while (heap->pos != HEAP_EMPTY) { bam1_t *b = heap->b; if (flag & MERGE_RG) { uint8_t *rg = bam_aux_get(b, "RG"); if (rg) bam_aux_del(b, rg); bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]); } sam_write1(fpout, hout, b); if ((j = (iter[heap->i]? sam_itr_next(fp[heap->i], iter[heap->i], b) : sam_read1(fp[heap->i], hdr[heap->i], b))) >= 0) { bam_translate(b, translation_tbl + heap->i); heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam_is_rev(b); heap->idx = idx++; } else if (j == -1) { heap->pos = HEAP_EMPTY; bam_destroy1(heap->b); heap->b = NULL; } else fprintf(pysamerr, "[bam_merge_core] '%s' is truncated. Continue anyway.\n", fn[heap->i]); ks_heapadjust(heap, 0, n, heap); } // Clean up and close if (flag & MERGE_RG) { for (i = 0; i != n; ++i) free(RG[i]); free(RG); free(RG_len); } for (i = 0; i < n; ++i) { trans_tbl_destroy(translation_tbl + i); hts_itr_destroy(iter[i]); bam_hdr_destroy(hdr[i]); sam_close(fp[i]); } bam_hdr_destroy(hout); sam_close(fpout); free(translation_tbl); free(fp); free(heap); free(iter); free(hdr); return 0; }
int main(int argc, char *argv[]) { short out2stdout=0; hashtable ht=new_hashtable(HASHSIZE); bamFile in,in2; bamFile out; int paired;//1 if not paired or pair read 1, 2 otherwise index_mem=sizeof(hashtable)*sizeof(hashnode**)*HASHSIZE*2; if (argc != 3) { fprintf(stderr, "Usage: bam_fix_NH <in.bam> <out.bam or - for stdout>\n"); return 1; } // Open file and exit if error in = bam_open(argv[1], "rb"); out2stdout = strcmp(argv[2], "-")? 0 : 1; out = strcmp(argv[2], "-")? bam_open(argv[2], "w") : bam_dopen(fileno(stdout), "w"); if (in == 0 ) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[1]); return 1; } if (out == 0) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[2]); return 1; } unsigned long num_alns=0; int ref; // *********** // Copy header bam_header_t *header; header = bam_header_read(in); bam_header_write(out,header); // sorted by name? // Should not rely on the value in SO bam1_t *aln=bam_init1(); bam1_t *prev=bam_init1(); if (!out2stdout) { fprintf(stderr,"bam_fix_NH version %s\n",VERSION); fprintf(stderr,"Processing %s\n",argv[1]); fprintf(stderr,"Hashing...\n");fflush(stderr); } while(bam_read1(in,aln)>=0) { // read alignment if (aln->core.tid < 0) continue;//ignore unaligned reads if (aln->core.flag & BAM_FUNMAP) continue; if (aln->core.flag & BAM_FREAD2) paired=2; else paired=1; ++num_alns; new_read_aln(ht,fix_read_name(bam1_qname(aln),paired)); if(!out2stdout) PRINT_ALNS_PROCESSED(num_alns); } bam_close(in); if(!out2stdout) { fprintf(stderr,"%s%lu\n",BACKLINE,num_alns); fprintf(stderr,"Hashing complete (%lu alignments)\n",num_alns); fprintf(stderr,"Memory used: %ld MB\n",index_mem/1024/1024); fprintf(stderr,"Updating entries with NH and printing BAM...\n"); fflush(stderr); } // reopen in2 = bam_open(argv[1], "rb"); if (in2 == 0 ) { fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[1]); return 1; } header = bam_header_read(in2); num_alns=0; while(bam_read1(in2,aln)>=0) { // read alignment paired=1; if (aln->core.tid < 0) 
continue;//ignore unaligned reads if (aln->core.flag & BAM_FUNMAP) continue; if (aln->core.flag & BAM_FREAD2) paired=2; ++num_alns; READ_ALN *r=get_read_aln(ht,fix_read_name(bam1_qname(aln),paired)); assert(r!=NULL); // update the NH field uint8_t *old_nh = bam_aux_get(aln, "NH"); int32_t nh=r->ctr; if (old_nh) { if (nh!=bam_aux2i(old_nh)) { fprintf(stderr,"warning: value mismatch! replacing>%s %d->%d\n",bam1_qname(aln),bam_aux2i(old_nh),nh); } bam_aux_del(aln, old_nh); bam_aux_append(aln, "NH", 'i', 4, (uint8_t*)&nh); #ifdef DEBUG // printf("!>%s %d\n",bam1_qname(aln),r->ctr); #endif } if (!old_nh) { // add NH bam_aux_append(aln, "NH", 'i', 4, (uint8_t*)&nh); #ifdef DEBUG fprintf(stderr,"!>%s %d\n",bam1_qname(aln),bam_aux2i(old_nh)); #endif } bam_write1(out,aln); if(!out2stdout) PRINT_ALNS_PROCESSED(num_alns); } // bam_destroy1(aln); bam_close(in2); bam_close(out); if(!out2stdout) { fprintf(stderr,"%s%lu\n",BACKLINE,num_alns); fprintf(stderr,"Done.\n"); } return 0; }
static int aux_fields1(void) { static const char sam[] = "data:," "@SQ\tSN:one\tLN:1000\n" "@SQ\tSN:two\tLN:500\n" "r1\t0\tone\t500\t20\t8M\t*\t0\t0\tATGCATGC\tqqqqqqqq\tXA:A:k\tXi:i:37\tXf:f:" xstr(PI) "\tXd:d:" xstr(E) "\tXZ:Z:" HELLO "\tXH:H:" BEEF "\tXB:B:c,-2,0,+2\tB0:B:i,-2147483648,-1,0,1,2147483647\tB1:B:I,0,1,2147483648,4294967295\tB2:B:s,-32768,-1,0,1,32767\tB3:B:S,0,1,32768,65535\tB4:B:c,-128,-1,0,1,127\tB5:B:C,0,1,127,255\tBf:B:f,-3.14159,2.71828\tZZ:i:1000000\tF2:d:2.46801\tY1:i:-2147483648\tY2:i:-2147483647\tY3:i:-1\tY4:i:0\tY5:i:1\tY6:i:2147483647\tY7:i:2147483648\tY8:i:4294967295\n"; // Canonical form of the alignment record above, as output by sam_format1() static const char r1[] = "r1\t0\tone\t500\t20\t8M\t*\t0\t0\tATGCATGC\tqqqqqqqq\tXi:i:37\tXf:f:3.14159\tXd:d:2.71828\tXZ:Z:" NEW_HELLO "\tXH:H:" BEEF "\tXB:B:c,-2,0,2\tB0:B:i,-2147483648,-1,0,1,2147483647\tB1:B:I,0,1,2147483648,4294967295\tB2:B:s,-32768,-1,0,1,32767\tB3:B:S,0,1,32768,65535\tB4:B:c,-128,-1,0,1,127\tB5:B:C,0,1,127,255\tBf:B:f,-3.14159,2.71828\tZZ:i:1000000\tF2:f:9.8765\tY1:i:-2147483648\tY2:i:-2147483647\tY3:i:-1\tY4:i:0\tY5:i:1\tY6:i:2147483647\tY7:i:2147483648\tY8:i:4294967295\tN0:i:-1234\tN1:i:1234\tN2:i:-2\tN3:i:3\tF1:f:4.5678\tN4:B:S,65535,32768,1,0\tN5:i:4242"; samFile *in = sam_open(sam, "r"); bam_hdr_t *header = sam_hdr_read(in); bam1_t *aln = bam_init1(); uint8_t *p; kstring_t ks = { 0, 0, NULL }; int64_t b0vals[5] = { -2147483648LL,-1,0,1,2147483647LL }; // i int64_t b1vals[4] = { 0,1,2147483648LL,4294967295LL }; // I int64_t b2vals[5] = { -32768,-1,0,1,32767 }; // s int64_t b3vals[4] = { 0,1,32768,65535 }; // S int64_t b4vals[5] = { -128,-1,0,1,127 }; // c int64_t b5vals[4] = { 0,1,127,255 }; // C // NB: Floats not doubles below! 
// See https://randomascii.wordpress.com/2012/06/26/doubles-are-not-floats-so-dont-compare-them/ float bfvals[2] = { -3.14159f, 2.71828f }; int8_t n4v1[] = { -128, -64, -32, -16, -8, -4, -2, -1, 0, 1, 2, 4, 8, 16, 32, 64, 127 }; uint32_t n4v2[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1234, 5678, 1U << 31, 0 }; int16_t n4v3[] = { -32768, -1, 0, 1, 32767 }; float n4v4[] = { 0, 1, 2, 10, 20, 30, 1.5, -2.5 }; uint8_t n4v5[] = { 0, 255 }; int32_t n4v6[] = { -2147483647 - 1, 10, -1, 0, 1, 2147483647 }; uint16_t n4v7[] = { 65535, 32768, 1, 0 }; int32_t ival = -1234; uint32_t uval = 1234; float f1 = 4.5678; float f2 = 9.8765; size_t nvals, i; if (sam_read1(in, header, aln) >= 0) { if ((p = check_bam_aux_get(aln, "XA", 'A')) && bam_aux2A(p) != 'k') fail("XA field is '%c', expected 'k'", bam_aux2A(p)); bam_aux_del(aln,p); if (bam_aux_get(aln,"XA")) fail("XA field was not deleted"); if ((p = check_bam_aux_get(aln, "Xi", 'C')) && bam_aux2i(p) != 37) fail("Xi field is %"PRId64", expected 37", bam_aux2i(p)); if ((p = check_bam_aux_get(aln, "Xf", 'f')) && fabs(bam_aux2f(p) - PI) > 1E-6) fail("Xf field is %.12f, expected pi", bam_aux2f(p)); if ((p = check_bam_aux_get(aln, "Xd", 'd')) && fabs(bam_aux2f(p) - E) > 1E-6) fail("Xf field is %.12f, expected e", bam_aux2f(p)); if ((p = check_bam_aux_get(aln, "XZ", 'Z')) && strcmp(bam_aux2Z(p), HELLO) != 0) fail("XZ field is \"%s\", expected \"%s\"", bam_aux2Z(p), HELLO); bam_aux_update_str(aln,"XZ",strlen(NEW_HELLO)+1,NEW_HELLO); if ((p = check_bam_aux_get(aln, "XZ", 'Z')) && strcmp(bam_aux2Z(p), NEW_HELLO) != 0) fail("XZ field is \"%s\", expected \"%s\"", bam_aux2Z(p), NEW_HELLO); if ((p = check_bam_aux_get(aln, "XH", 'H')) && strcmp(bam_aux2Z(p), BEEF) != 0) fail("XH field is \"%s\", expected \"%s\"", bam_aux2Z(p), BEEF); if ((p = check_bam_aux_get(aln, "XB", 'B')) && ! 
(memcmp(p, "Bc", 2) == 0 && memcmp(p + 2, "\x03\x00\x00\x00\xfe\x00\x02", 7) == 0)) fail("XB field is %c,..., expected c,-2,0,+2", p[1]); check_int_B_array(aln, "B0", NELE(b0vals), b0vals); check_int_B_array(aln, "B1", NELE(b1vals), b1vals); check_int_B_array(aln, "B2", NELE(b2vals), b2vals); check_int_B_array(aln, "B3", NELE(b3vals), b3vals); check_int_B_array(aln, "B4", NELE(b4vals), b4vals); check_int_B_array(aln, "B5", NELE(b5vals), b5vals); nvals = NELE(bfvals); if ((p = check_bam_aux_get(aln, "Bf", 'B')) != NULL) { if (bam_auxB_len(p) != nvals) fail("Wrong length reported for Bf field, got %d, expected %zd\n", bam_auxB_len(p), nvals); for (i = 0; i < nvals; i++) { if (bam_auxB2f(p, i) != bfvals[i]) { fail("Wrong value from bam_auxB2f for Bf field index %zd, " "got %f expected %f\n", i, bam_auxB2f(p, i), bfvals[i]); } } } if ((p = check_bam_aux_get(aln, "ZZ", 'I')) && bam_aux2i(p) != 1000000) fail("ZZ field is %"PRId64", expected 1000000", bam_aux2i(p)); if ((p = bam_aux_get(aln, "Y1")) && bam_aux2i(p) != -2147483647-1) fail("Y1 field is %"PRId64", expected -2^31", bam_aux2i(p)); if ((p = bam_aux_get(aln, "Y2")) && bam_aux2i(p) != -2147483647) fail("Y2 field is %"PRId64", expected -2^31+1", bam_aux2i(p)); if ((p = bam_aux_get(aln, "Y3")) && bam_aux2i(p) != -1) fail("Y3 field is %"PRId64", expected -1", bam_aux2i(p)); if ((p = bam_aux_get(aln, "Y4")) && bam_aux2i(p) != 0) fail("Y4 field is %"PRId64", expected 0", bam_aux2i(p)); if ((p = bam_aux_get(aln, "Y5")) && bam_aux2i(p) != 1) fail("Y5 field is %"PRId64", expected 1", bam_aux2i(p)); if ((p = bam_aux_get(aln, "Y6")) && bam_aux2i(p) != 2147483647) fail("Y6 field is %"PRId64", expected 2^31-1", bam_aux2i(p)); if ((p = bam_aux_get(aln, "Y7")) && bam_aux2i(p) != 2147483648LL) fail("Y7 field is %"PRId64", expected 2^31", bam_aux2i(p)); if ((p = bam_aux_get(aln, "Y8")) && bam_aux2i(p) != 4294967295LL) fail("Y8 field is %"PRId64", expected 2^32-1", bam_aux2i(p)); // Try appending some new tags if 
(bam_aux_append(aln, "N0", 'i', sizeof(ival), (uint8_t *) &ival) != 0) fail("Failed to append N0:i tag"); if ((p = bam_aux_get(aln, "N0")) && bam_aux2i(p) != ival) fail("N0 field is %"PRId64", expected %d", bam_aux2i(p), ival); if (bam_aux_append(aln, "N1", 'I', sizeof(uval), (uint8_t *) &uval) != 0) fail("failed to append N1:I tag"); if ((p = bam_aux_get(aln, "N1")) && bam_aux2i(p) != uval) fail("N1 field is %"PRId64", expected %u", bam_aux2i(p), uval); // Append tags with bam_aux_update_int() if (bam_aux_update_int(aln, "N2", -2) < 0) fail("failed to append N2:c tag"); if (bam_aux_update_int(aln, "N3", 3) < 0) fail("failed to append N3:C tag"); p = bam_aux_get(aln, "N2"); if (!p) fail("failed to retrieve N2 tag"); else if (*p != 'c' || bam_aux2i(p) != -2) fail("N2 field is %c:%"PRId64", expected c:-2", *p, bam_aux2i(p)); p = bam_aux_get(aln, "N3"); if (!p) fail("failed to retrieve N3 tag"); else if (*p != 'C' || bam_aux2i(p) != 3) fail("N3 field is %c:%"PRId64", expected C:3", *p, bam_aux2i(p)); // Try changing values with bam_aux_update_int() i = test_update_int(aln, "N2", 2, 'C', "N3", 3, 'C'); if (i == 0) test_update_int(aln, "N2", 1234, 'S', "N3", 3, 'C'); if (i == 0) test_update_int(aln, "N2", -1, 's', "N3", 3, 'C'); if (i == 0) test_update_int(aln, "N2", 4294967295U, 'I', "N3", 3, 'C'); if (i == 0) test_update_int(aln, "N2", -2, 'i', "N3", 3, 'C'); // Append a value with bam_aux_update_float() if (bam_aux_update_float(aln, "F1", f1) < 0) fail("append F1:f tag"); p = bam_aux_get(aln, "F1"); if (!p) fail("retrieve F1 tag"); else if (*p != 'f' || bam_aux2f(p) != f1) fail("F1 field is %c:%e, expected f:%e", *p, bam_aux2f(p), f1); // Change a double tag to a float if (bam_aux_update_float(aln, "F2", f2) < 0) fail("update F2 tag"); p = bam_aux_get(aln, "F2"); if (!p) fail("retrieve F2 tag"); else if (*p != 'f' || bam_aux2f(p) != f2) fail("F2 field is %c:%e, expected f:%e", *p, bam_aux2f(p), f2); // Check the next one is intact too p = bam_aux_get(aln, "Y1"); if 
(!p) fail("retrieve Y1 tag"); else if (*p != 'i' && bam_aux2i(p) != -2147483647-1) fail("Y1 field is %"PRId64", expected -2^31", bam_aux2i(p)); // bam_aux_update_array tests // append a new array i = test_update_array(aln, "N4", 'c', NELE(n4v1), n4v1, "\0\0", 0, 0); // Add a sentinal to check resizes work if (i == 0) i = test_update_int(aln, "N5", 4242, 'S', "\0\0", 0, 0); // alter the array tag a few times if (i == 0) i = test_update_array(aln, "N4", 'I', NELE(n4v2), n4v2, "N5", 4242, 'S'); if (i == 0) i = test_update_array(aln, "N4", 's', NELE(n4v3), n4v3, "N5", 4242, 'S'); if (i == 0) i = test_update_array(aln, "N4", 'f', NELE(n4v4), n4v4, "N5", 4242, 'S'); if (i == 0) i = test_update_array(aln, "N4", 'c', NELE(n4v5), n4v5, "N5", 4242, 'S'); if (i == 0) i = test_update_array(aln, "N4", 'i', NELE(n4v6), n4v6, "N5", 4242, 'S'); if (i == 0) i = test_update_array(aln, "N4", 'S', NELE(n4v7), n4v7, "N5", 4242, 'S'); if (sam_format1(header, aln, &ks) < 0) fail("can't format record"); if (strcmp(ks.s, r1) != 0) fail("record formatted incorrectly: \"%s\"", ks.s); free(ks.s); } else fail("can't read record"); bam_destroy1(aln); bam_hdr_destroy(header); sam_close(in); return 1; }