Esempio n. 1
0
// Returns 0 on success, -1 on failure.
static int sync_mq_mc(bam1_t* src, bam1_t* dest)
{
    if ( (src->core.flag & BAM_FUNMAP) == 0 ) { // If mapped
        // Copy Mate Mapping Quality
        uint32_t mq = src->core.qual;
        uint8_t* data;
        if ((data = bam_aux_get(dest,"MQ")) != NULL) {
            bam_aux_del(dest, data);
        }

        bam_aux_append(dest, "MQ", 'i', sizeof(uint32_t), (uint8_t*)&mq);
    }
    // Copy mate cigar if either read is mapped
    if ( (src->core.flag & BAM_FUNMAP) == 0 || (dest->core.flag & BAM_FUNMAP) == 0 ) {
        uint8_t* data_mc;
        if ((data_mc = bam_aux_get(dest,"MC")) != NULL) {
            bam_aux_del(dest, data_mc);
        }

        // Convert cigar to string
        kstring_t mc = { 0, 0, NULL };
        if (bam_format_cigar(src, &mc) < 0) return -1;

        bam_aux_append(dest, "MC", 'Z', ks_len(&mc)+1, (uint8_t*)ks_str(&mc));
        free(mc.s);
    }
    return 0;
}
Esempio n. 2
0
/*
 * This function calculates ct tag for two bams, it assumes they are from the same template and
 * writes the tag to the first read in position terms.
 */
static void bam_template_cigar(bam1_t *b1, bam1_t *b2, kstring_t *str)
{
    bam1_t *swap;
    int i, end;
    uint32_t *cigar;
    str->l = 0;
    if (b1->core.tid != b2->core.tid || b1->core.tid < 0 || b1->core.pos < 0 || b2->core.pos < 0 || b1->core.flag&BAM_FUNMAP || b2->core.flag&BAM_FUNMAP) return; // coordinateless or not on the same chr; skip
    if (b1->core.pos > b2->core.pos) swap = b1, b1 = b2, b2 = swap; // make sure b1 has a smaller coordinate
    kputc((b1->core.flag & BAM_FREAD1)? '1' : '2', str); // segment index
    kputc((b1->core.flag & BAM_FREVERSE)? 'R' : 'F', str); // strand
    for (i = 0, cigar = bam_get_cigar(b1); i < b1->core.n_cigar; ++i) {
        kputw(bam_cigar_oplen(cigar[i]), str);
        kputc(bam_cigar_opchr(cigar[i]), str);
    }
    end = bam_endpos(b1);
    kputw(b2->core.pos - end, str);
    kputc('T', str);
    kputc((b2->core.flag & BAM_FREAD1)? '1' : '2', str); // segment index
    kputc((b2->core.flag & BAM_FREVERSE)? 'R' : 'F', str); // strand
    for (i = 0, cigar = bam_get_cigar(b2); i < b2->core.n_cigar; ++i) {
        kputw(bam_cigar_oplen(cigar[i]), str);
        kputc(bam_cigar_opchr(cigar[i]), str);
    }

    uint8_t* data;
    if ((data = bam_aux_get(b1,"ct")) != NULL) bam_aux_del(b1, data);
    if ((data = bam_aux_get(b2,"ct")) != NULL) bam_aux_del(b2, data);

    bam_aux_append(b1, "ct", 'Z', str->l+1, (uint8_t*)str->s);
}
Esempio n. 3
0
void bam_template_cigar(bam1_t *b1, bam1_t *b2, kstring_t *str)
{
	bam1_t *swap;
	int i, end;
	uint32_t *cigar;
	str->l = 0;
	if (b1->core.tid != b2->core.tid || b1->core.tid < 0) return; // coordinateless or not on the same chr; skip
	if (b1->core.pos > b2->core.pos) swap = b1, b1 = b2, b2 = swap; // make sure b1 has a smaller coordinate
	kputc((b1->core.flag & BAM_FREAD1)? '1' : '2', str); // segment index
	kputc((b1->core.flag & BAM_FREVERSE)? 'R' : 'F', str); // strand
	for (i = 0, cigar = bam1_cigar(b1); i < b1->core.n_cigar; ++i) {
		kputw(bam_cigar_oplen(cigar[i]), str);
		kputc(bam_cigar_opchr(cigar[i]), str);
	}
	end = bam_calend(&b1->core, cigar);
	kputw(b2->core.pos - end, str);
	kputc('T', str);
	kputc((b2->core.flag & BAM_FREAD1)? '1' : '2', str); // segment index
	kputc((b2->core.flag & BAM_FREVERSE)? 'R' : 'F', str); // strand
	for (i = 0, cigar = bam1_cigar(b2); i < b2->core.n_cigar; ++i) {
		kputw(bam_cigar_oplen(cigar[i]), str);
		kputc(bam_cigar_opchr(cigar[i]), str);
	}
	bam_aux_append(b1, "CT", 'Z', str->l+1, (uint8_t*)str->s); 
}
Esempio n. 4
0
void
addSupplementaryAlignmentEvidence(
    bam_record& bamRead,
    const std::string& svStr)
{
    static const char svtag[] = {'S','A'};
    bam_aux_append(bamRead.get_data(),svtag,'Z',(svStr.size()+1),
                   (const uint8_t*)(svStr.c_str()));
}
Esempio n. 5
0
static void orphan_only_func(const state_t* state, bam1_t* file_read)
{
    uint8_t* data = (uint8_t*)strdup(state->rg_id);
    int len = strlen(state->rg_id)+1;
    // If the old exists don't do anything
    uint8_t* old = bam_aux_get(file_read, "RG");
    if (old == NULL) {
        bam_aux_append(file_read, "RG",'Z',len,data);
    }
    free(data);
}
Esempio n. 6
0
static void sync_mq(bam1_t* src, bam1_t* dest)
{
    if ( (src->core.flag & BAM_FUNMAP) == 0 ) { // If mapped
        uint32_t mq = src->core.qual;
        uint8_t* data;
        if ((data = bam_aux_get(dest,"MQ")) != NULL) {
            bam_aux_del(dest, data);
        }

        bam_aux_append(dest, "MQ", 'i', sizeof(uint32_t), (uint8_t*)&mq);
    }
}
Esempio n. 7
0
static void overwrite_all_func(const state_t* state, bam1_t* file_read)
{
    uint8_t* data = (uint8_t*)strdup(state->rg_id);
    int len = strlen(state->rg_id)+1;
    // If the old exists delete it
    uint8_t* old = bam_aux_get(file_read, "RG");
    if (old != NULL) {
        bam_aux_del(file_read, old);
    }

    bam_aux_append(file_read, "RG", 'Z', len, data);
    free(data);
}
Esempio n. 8
0
static void bam_translate(bam1_t* b, trans_tbl_t* tbl)
{
    // Update target id if not unmapped tid
    if ( b->core.tid >= 0 ) { b->core.tid = tbl->tid_trans[b->core.tid]; }
    if ( b->core.mtid >= 0 ) { b->core.mtid = tbl->tid_trans[b->core.mtid]; }

    // If we have a RG update it
    uint8_t *rg = bam_aux_get(b, "RG");
    if (rg) {
        char* decoded_rg = bam_aux2Z(rg);
        khiter_t k = kh_get(c2c, tbl->rg_trans, decoded_rg);
        if (k != kh_end(tbl->rg_trans)) {
            char* translate_rg = kh_value(tbl->rg_trans,k);
            bam_aux_del(b, rg);
            bam_aux_append(b, "RG", 'Z', strlen(translate_rg) + 1, (uint8_t*)translate_rg);
        } else {
            fprintf(pysamerr, "[bam_translate] RG tag \"%s\" on read \"%s\" encountered with no corresponding entry in header, tag lost\n",decoded_rg, bam_get_qname(b));
            bam_aux_del(b, rg);
        }
    }

    // If we have a PG update it
    uint8_t *pg = bam_aux_get(b, "PG");
    if (pg) {
        char* decoded_pg = bam_aux2Z(pg);
        khiter_t k = kh_get(c2c, tbl->pg_trans, decoded_pg);
        if (k != kh_end(tbl->pg_trans)) {
            char* translate_pg = kh_value(tbl->pg_trans,k);
            bam_aux_del(b, pg);
            bam_aux_append(b, "PG", 'Z', strlen(translate_pg) + 1, (uint8_t*)translate_pg);
        } else {
            fprintf(pysamerr, "[bam_translate] PG tag \"%s\" on read \"%s\" encountered with no corresponding entry in header, tag lost\n",decoded_pg, bam_get_qname(b));
            bam_aux_del(b, pg);
        }
    }
}
Esempio n. 9
0
static int uniform_fetch_func(bam1_t *b, void *data)
{
     uint8_t *to_delete;
     data_t_uniform *tmp = (data_t_uniform*)data;
     bam1_core_t *c = &b->core;
     char *iq;
     char *dq;

     iq = malloc((c->l_qseq+1) * sizeof(char));
     memset(iq, tmp->iq, c->l_qseq);
     iq[c->l_qseq] = '\0';

     to_delete = bam_aux_get(b, BI_TAG);
     if (to_delete) {
          bam_aux_del(b, to_delete);
     }
     bam_aux_append(b, BI_TAG, 'Z', c->l_qseq+1, (uint8_t*) iq);


     dq = malloc((c->l_qseq+1) * sizeof(char));
     memset(dq, tmp->dq, c->l_qseq);
     dq[c->l_qseq] = '\0';

     to_delete = bam_aux_get(b, BD_TAG);
     if (to_delete) {
          bam_aux_del(b, to_delete);
     }
     bam_aux_append(b, BD_TAG, 'Z', c->l_qseq+1, (uint8_t*) dq);

     bam_write1(tmp->out, b);

     free(iq);
     free(dq);

     return 0;
}
Esempio n. 10
0
static int add_mate_score(bam1_t *src, bam1_t *dest)
{
    uint8_t *data_ms;
    uint32_t mate_score = calc_mate_score(src);

    if ((data_ms = bam_aux_get(dest, "ms")) != NULL) {
        bam_aux_del(dest, data_ms);
    }

    if (bam_aux_append(dest, "ms", 'i', sizeof(uint32_t), (uint8_t*)&mate_score) == -1) {
        return -1;
    }

    return 0;
}
Esempio n. 11
0
int bam_merge_core2(int by_qname, const char *out, const char *headers, int n, char * const *fn, int flag, const char *reg, int level)
#endif
{
	bamFile fpout, *fp;
	heap1_t *heap;
	bam_header_t *hout = 0;
	bam_header_t *hheaders = NULL;
	int i, j, *RG_len = 0;
	uint64_t idx = 0;
	char **RG = 0, mode[8];
	bam_iter_t *iter = 0;

	if (headers) {
		tamFile fpheaders = sam_open(headers);
		if (fpheaders == 0) {
			const char *message = strerror(errno);
			fprintf(stderr, "[bam_merge_core] cannot open '%s': %s\n", headers, message);
			return -1;
		}
		hheaders = sam_header_read(fpheaders);
		sam_close(fpheaders);
	}

	g_is_by_qname = by_qname;
	fp = (bamFile*)calloc(n, sizeof(bamFile));
	heap = (heap1_t*)calloc(n, sizeof(heap1_t));
	iter = (bam_iter_t*)calloc(n, sizeof(bam_iter_t));
	// prepare RG tag
	if (flag & MERGE_RG) {
		RG = (char**)calloc(n, sizeof(void*));
		RG_len = (int*)calloc(n, sizeof(int));
		for (i = 0; i != n; ++i) {
			int l = strlen(fn[i]);
			const char *s = fn[i];
			if (l > 4 && strcmp(s + l - 4, ".bam") == 0) l -= 4;
			for (j = l - 1; j >= 0; --j) if (s[j] == '/') break;
			++j; l -= j;
			RG[i] = calloc(l + 1, 1);
			RG_len[i] = l;
			strncpy(RG[i], s + j, l);
		}
	}
	// read the first
	for (i = 0; i != n; ++i) {
		bam_header_t *hin;
		fp[i] = bam_open(fn[i], "r");
		if (fp[i] == 0) {
			int j;
			fprintf(stderr, "[bam_merge_core] fail to open file %s\n", fn[i]);
			for (j = 0; j < i; ++j) bam_close(fp[j]);
			free(fp); free(heap);
			// FIXME: possible memory leak
			return -1;
		}
		hin = bam_header_read(fp[i]);
		if (i == 0) { // the first BAM
			hout = hin;
		} else { // validate multiple baf
			int min_n_targets = hout->n_targets;
			if (hin->n_targets < min_n_targets) min_n_targets = hin->n_targets;

			for (j = 0; j < min_n_targets; ++j)
				if (strcmp(hout->target_name[j], hin->target_name[j]) != 0) {
					fprintf(stderr, "[bam_merge_core] different target sequence name: '%s' != '%s' in file '%s'\n",
							hout->target_name[j], hin->target_name[j], fn[i]);
					return -1;
				}

			// If this input file has additional target reference sequences,
			// add them to the headers to be output
			if (hin->n_targets > hout->n_targets) {
				swap_header_targets(hout, hin);
				// FIXME Possibly we should also create @SQ text headers
				// for the newly added reference sequences
			}

			bam_header_destroy(hin);
		}
	}

	if (hheaders) {
		// If the text headers to be swapped in include any @SQ headers,
		// check that they are consistent with the existing binary list
		// of reference information.
		if (hheaders->n_targets > 0) {
			if (hout->n_targets != hheaders->n_targets) {
				fprintf(stderr, "[bam_merge_core] number of @SQ headers in '%s' differs from number of target sequences\n", headers);
				if (!reg) return -1;
			}
			for (j = 0; j < hout->n_targets; ++j)
				if (strcmp(hout->target_name[j], hheaders->target_name[j]) != 0) {
					fprintf(stderr, "[bam_merge_core] @SQ header '%s' in '%s' differs from target sequence\n", hheaders->target_name[j], headers);
					if (!reg) return -1;
				}
		}

		swap_header_text(hout, hheaders);
		bam_header_destroy(hheaders);
	}

	if (reg) {
		int tid, beg, end;
		if (bam_parse_region(hout, reg, &tid, &beg, &end) < 0) {
			fprintf(stderr, "[%s] Malformated region string or undefined reference name\n", __func__);
			return -1;
		}
		for (i = 0; i < n; ++i) {
			bam_index_t *idx;
			idx = bam_index_load(fn[i]);
			iter[i] = bam_iter_query(idx, tid, beg, end);
			bam_index_destroy(idx);
		}
	}

	for (i = 0; i < n; ++i) {
		heap1_t *h = heap + i;
		h->i = i;
		h->b = (bam1_t*)calloc(1, sizeof(bam1_t));
		if (bam_iter_read(fp[i], iter[i], h->b) >= 0) {
			h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)((int32_t)h->b->core.pos+1)<<1 | bam1_strand(h->b);
			h->idx = idx++;
		}
		else h->pos = HEAP_EMPTY;
	}
	if (flag & MERGE_UNCOMP) level = 0;
	else if (flag & MERGE_LEVEL1) level = 1;
	strcpy(mode, "w");
	if (level >= 0) sprintf(mode + 1, "%d", level < 9? level : 9);
	if ((fpout = strcmp(out, "-")? bam_open(out, "w") : bam_dopen(fileno(stdout), "w")) == 0) {
		fprintf(stderr, "[%s] fail to create the output file.\n", __func__);
		return -1;
	}
	bam_header_write(fpout, hout);
	bam_header_destroy(hout);
#ifndef _PBGZF_USE 
	if (!(flag & MERGE_UNCOMP)) bgzf_mt(fpout, n_threads, 256);
#endif

	ks_heapmake(heap, n, heap);
	while (heap->pos != HEAP_EMPTY) {
		bam1_t *b = heap->b;
		if (flag & MERGE_RG) {
			uint8_t *rg = bam_aux_get(b, "RG");
			if (rg) bam_aux_del(b, rg);
			bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]);
		}
		bam_write1_core(fpout, &b->core, b->data_len, b->data);
		if ((j = bam_iter_read(fp[heap->i], iter[heap->i], b)) >= 0) {
			heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam1_strand(b);
			heap->idx = idx++;
		} else if (j == -1) {
			heap->pos = HEAP_EMPTY;
			free(heap->b->data); free(heap->b);
			heap->b = 0;
		} else fprintf(stderr, "[bam_merge_core] '%s' is truncated. Continue anyway.\n", fn[heap->i]);
		ks_heapadjust(heap, 0, n, heap);
	}

	if (flag & MERGE_RG) {
		for (i = 0; i != n; ++i) free(RG[i]);
		free(RG); free(RG_len);
	}
	for (i = 0; i != n; ++i) {
		bam_iter_destroy(iter[i]);
		bam_close(fp[i]);
	}
	bam_close(fpout);
	free(fp); free(heap); free(iter);
	return 0;
}
Esempio n. 12
0
static int dindel_fetch_func(bam1_t *b, void *data)
{
     data_t_dindel *tmp = (data_t_dindel*)data;
     bam1_core_t *c = &b->core;
     int rlen;
     uint8_t *to_delete;

     /* don't change reads failing default mask: BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP */
     if (c->flag & BAM_DEF_MASK) {
          /* fprintf(stderr, "skipping read: %s at pos %d\n", bam1_qname(b), c->pos); */
          bam_write1(tmp->out, b);
          return 0;
     }

     /* get the reference sequence and compute homopolymer array */
     if (tmp->tid != c->tid) {
             /*fprintf(stderr, "fetching reference sequence %s\n",
               tmp->in->header->target_name[c->tid]); */
          char *ref = fai_fetch(tmp->fai, tmp->in->header->target_name[c->tid], &rlen);
          strtoupper(ref);/* safeguard */
          int rlen = strlen(ref);
          tmp->tid = c->tid;
          if (tmp->hpcount) free(tmp->hpcount);
          tmp->hpcount = (int*)malloc(rlen*sizeof(int));
          find_homopolymers(ref, tmp->hpcount, rlen);
          free(ref);
          tmp->rlen = rlen;
          /* fprintf(stderr, "fetched reference sequence\n");*/
     }

     /* parse the cigar string */
     uint32_t *cigar = bam1_cigar(b);
     uint8_t indelq[c->l_qseq+1];
     /* fprintf(stderr, "l_qseq:%d\n", c->l_qseq); */
     int i;
     int x = c->pos; /* coordinate on reference */
     int y = 0; /* coordinate on query */
     for (i = 0; i < c->n_cigar; ++i) {
          int j, oplen = cigar[i]>>4, op = cigar[i]&0xf;
          if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
               for (j = 0; j < oplen; j++) {
                       /*fprintf(stderr, "query:%d, ref:%d, count:%d\n", 
                         y, x, tmp->hpcount[x+1]); */
                    /* FIXME clang complains: The left operand of '>' is a garbage value */
                    indelq[y] = (x > tmp->rlen-2) ? DINDELQ[0] : (tmp->hpcount[x+1]>18 ?
                         DINDELQ[0] : DINDELQ[tmp->hpcount[x+1]]);
                    x++; 
                    y++;
               }
          } else if (op == BAM_CHARD_CLIP) { /* do nothing */
          } else if (op == BAM_CDEL) {
               x += oplen;
          } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) { 
               for (j = 0; j < oplen; j++) {
                       /* fprintf(stderr, "query:%d, ref:%d\n", y, x); */
                    indelq[y] = DINDELQ[0];
                    y++;
               }
          } else {
               LOG_FATAL("unknown op %d for read %s\n", op, bam1_qname(b));/* FIXME skip? seen this somewhere else properly handled */
               exit(1);
          }
     }
     indelq[y] = '\0';

     to_delete = bam_aux_get(b, BI_TAG);
     if (to_delete) {
          bam_aux_del(b, to_delete);
     }
     bam_aux_append(b, BI_TAG, 'Z', c->l_qseq+1, indelq);

     to_delete = bam_aux_get(b, BD_TAG);
     if (to_delete) {
          bam_aux_del(b, to_delete);
     }
     bam_aux_append(b, BD_TAG, 'Z', c->l_qseq+1, indelq);

     bam_write1(tmp->out, b);
     return 0;
}
Esempio n. 13
0
static int trim_ns(bam1_t *b, void *data) {
    int ret = 0;
    opts_t *op((opts_t *)data);
    std::vector<uint8_t> aux(bam_get_aux(b), bam_get_aux(b) + bam_get_l_aux(b));
    int tmp;
    uint8_t *const seq(bam_get_seq(b));
    uint32_t *const cigar(bam_get_cigar(b));
    //op->n_cigar = b->core.n_cigar;
    op->resize(b->l_data); // Make sure it's big enough to hold everything.
    memcpy(op->data, b->data, b->core.l_qname);

    // Get #Ns at the beginning
    for(tmp = 0; bam_seqi(seq, tmp) == dlib::htseq::HTS_N; ++tmp);
    const int n_start(tmp);

    if(tmp == b->core.l_qseq - 1) // all bases are N -- garbage read
         ret |= op->skip_all_ns;

    // Get #Ns at the end
    for(tmp = b->core.l_qseq - 1; bam_seqi(seq, tmp) == dlib::htseq::HTS_N; --tmp);
    const int n_end(b->core.l_qseq - 1 - tmp);

    // Get new length for read
    int final_len(b->core.l_qseq - n_end - n_start);
    if(final_len < 0) final_len = 0;
    if(final_len < op->min_trimmed_len) // Too short.
        ret |= 1;
    // Copy in qual and all of aux.

    if(n_end) {
        if((tmp = bam_cigar_oplen(cigar[b->core.n_cigar - 1]) - n_end) == 0) {
            LOG_DEBUG("Entire cigar operation is the softclip. Decrease the number of new cigar operations.\n");
            --b->core.n_cigar;
        } else {
            LOG_DEBUG("Updating second cigar operation in-place.\n");
            cigar[b->core.n_cigar - 1] = bam_cigar_gen(tmp, BAM_CSOFT_CLIP);
        }
    }

    // Get new n_cigar.
    if((tmp = bam_cigar_oplen(*cigar) - n_start) == 0) {
        memcpy(op->data + b->core.l_qname, cigar + 1, (--b->core.n_cigar) << 2); // << 2 for 4 bit per cigar op
    } else {
        if(n_start) *cigar = bam_cigar_gen(tmp, BAM_CSOFT_CLIP);
        memcpy(op->data + b->core.l_qname, cigar, b->core.n_cigar << 2);
    }
    uint8_t *opseq(op->data + b->core.l_qname + (b->core.n_cigar << 2)); // Pointer to the seq region of new data field.
    for(tmp = 0; tmp < final_len >> 1; ++tmp)
        opseq[tmp] = (bam_seqi(seq, ((tmp << 1) + n_start)) << 4) | (bam_seqi(seq, (tmp << 1) + n_start + 1));
    if(final_len & 1)
        opseq[tmp] = (bam_seqi(seq, ((tmp << 1) + n_start)) << 4);

    tmp = bam_get_l_aux(b);
    memcpy(opseq + ((final_len + 1) >> 1), bam_get_qual(b) + n_start, final_len + tmp);
    // Switch data strings
    std::swap(op->data, b->data);
    b->core.l_qseq = final_len;
    memcpy(bam_get_aux(b), aux.data(), aux.size());
    b->l_data = (bam_get_aux(b) - b->data) + aux.size();
    if(n_end) bam_aux_append(b, "NE", 'i', sizeof(int), (uint8_t *)&n_end);
    if(n_start) bam_aux_append(b, "NS", 'i', sizeof(int), (uint8_t *)&n_start);
    const uint32_t *pvar((uint32_t *)dlib::array_tag(b, "PV"));
    tmp = b->core.flag & BAM_FREVERSE ? n_end: n_start;
    if(pvar) {
        std::vector<uint32_t>pvals(pvar + tmp, pvar + final_len + tmp);
        bam_aux_del(b, (uint8_t *)(pvar) - 6);
        dlib::bam_aux_array_append(b, "PV", 'I', sizeof(uint32_t), final_len, (uint8_t *)pvals.data());
    }
    const uint32_t *fvar((uint32_t *)dlib::array_tag(b, "FA"));
    if(fvar) {
        std::vector<uint32_t>fvals(fvar + tmp, fvar + final_len + tmp);
        bam_aux_del(b, (uint8_t *)(fvar) - 6);
        dlib::bam_aux_array_append(b, "FA", 'I', sizeof(uint32_t), final_len, (uint8_t *)fvals.data());
    }
    return ret;
}
Esempio n. 14
0
File: sw_align.c Progetto: nh13/SRMA
// TODO soft clipping
bam1_t *sw_align_update_bam(bam1_t *bam_old, char *rg_id, sw_heap_t *heap, int32_t sw_node_best_i, uint8_t space, char *colors, char *color_qualities, uint8_t strand, uint8_t correct_bases)
{
	bam1_t *bam_new=NULL;
	int32_t sw_node_cur_i=-1, sw_node_prev_i=-1;
	int32_t i;
	int32_t cigar_cur_op, cigar_prev_op;
	int32_t cigar_cur_length, cigar_prev_length;
	uint32_t read_index;
	char *color_errors = NULL;

	if(sw_node_best_i < 0) { // none found, do not modify alignment
		return bam_old;
	}

	bam_new = srma_calloc(1, sizeof(bam1_t), __func__, "bam_new");

	if(1 == strand) {
		read_index = 0;
	}
	else {
		read_index = bam_old->core.l_qseq-1;
	}

	{ // query name
		bam_new->core.l_qname = bam_old->core.l_qname;
		bam_new->data_len += bam_new->core.l_qname;
		sw_align_bam_alloc_data(bam_new, bam_new->data_len);
		memcpy(bam1_qname(bam_new), bam1_qname(bam_old), bam_old->core.l_qname);
	}
	{ // flag
		bam_new->core.flag = bam_old->core.flag;
	}
	{ // tid, pos, qual
		bam_new->core.tid = heap->nodes[sw_node_best_i].node->contig-1; // it is one-based, we want zero-based
		if(1 == strand) { // reverse strand
			bam_new->core.pos = heap->nodes[sw_node_best_i].node->position-1;
		}
		else {
			bam_new->core.pos = heap->nodes[sw_node_best_i].start_position-1; // zero-based
		}
		bam_new->core.qual = bam_old->core.qual; // should we change the mapping quality?
		bam_new->core.mtid = -1;
		bam_new->core.mpos = -1;
		bam_new->core.isize = 0;
	}
	{ // cigar length
		bam_new->core.n_cigar = 0;
		cigar_cur_op = cigar_prev_op = -1;
		sw_node_cur_i = sw_node_best_i;
		while(0 <= sw_node_cur_i) {
			if(0 <= sw_node_prev_i  && BAM_CDEL == cigar_prev_op && 1 < fabs(heap->nodes[sw_node_cur_i].node->position - heap->nodes[sw_node_prev_i].node->position)) {
				cigar_cur_op = BAM_CDEL;
			}	
			else {
				switch(__node_type(heap->nodes[sw_node_cur_i].node)) {
					case NODE_MATCH:
					case NODE_MISMATCH:
						cigar_cur_op = BAM_CMATCH;
						break;
					case NODE_INSERTION:
						cigar_cur_op = BAM_CINS;
						break;
					default:
						srma_error(__func__, "unknown node type", Exit, OutOfRange);
				}
			}
			if(cigar_prev_op != cigar_cur_op) {
				// update the previous cigar operator
				cigar_prev_op = cigar_cur_op;
				bam_new->core.n_cigar++;
			}
			// Update
			if(BAM_CDEL != cigar_cur_op) {
				sw_node_prev_i = sw_node_cur_i;
				sw_node_cur_i = heap->nodes[sw_node_cur_i].prev_i;
			}
		}
	}

	{ // cigar and seq
		uint32_t *cigar_ptr=NULL;
		uint8_t *seq_ptr=NULL;
		uint32_t cigar_i = 0;
		// cigar
		bam_new->data_len += bam_new->core.n_cigar*sizeof(uint32_t);
		sw_align_bam_alloc_data(bam_new, bam_new->data_len);
		cigar_ptr = bam1_cigar(bam_new);
		// seq
		bam_new->core.l_qseq = bam_old->core.l_qseq;
		bam_new->data_len += (bam_new->core.l_qseq + 1)/2;
		sw_align_bam_alloc_data(bam_new, bam_new->data_len);
		seq_ptr = bam1_seq(bam_new);
		// fill in cigar and seq
		cigar_i = (1 == strand) ? bam_new->core.n_cigar-1 : 0;
		cigar_cur_op = cigar_prev_op = -1;
		cigar_cur_length = cigar_prev_length = -1;
		sw_node_cur_i = sw_node_best_i;
		while(0 <= sw_node_cur_i) {
			if(0 <= sw_node_prev_i && BAM_CDEL == cigar_prev_op && 1 < fabs(heap->nodes[sw_node_cur_i].node->position - heap->nodes[sw_node_prev_i].node->position)) {
				cigar_cur_op = BAM_CDEL;
			}	
			else {
				switch(__node_type(heap->nodes[sw_node_cur_i].node)) {
					case NODE_MATCH:
					case NODE_MISMATCH:
						cigar_cur_op = BAM_CMATCH;
						break;
					case NODE_INSERTION:
						cigar_cur_op = BAM_CINS;
						break;
					default:
						srma_error(__func__, "unknown node type", Exit, OutOfRange);
				}
				// pack sequence
				if(1 == strand && 0 == read_index%2) {
					seq_ptr[read_index/2] = 0;
				}
				else if(0 == strand && 1 == read_index%2) {
					seq_ptr[read_index/2] = 0;
				}
				// DEBUG
				/*
				   fprintf(stderr, "read_index=%d base=%d\n",
				   read_index, __node_base(heap->nodes[sw_node_cur_i].node));
				   */
				seq_ptr[read_index/2] |= int_to_nt4bit[__node_base(heap->nodes[sw_node_cur_i].node)] << 4*(1-(read_index%2));
				if(1 == strand) {
					read_index++;
				}
				else {
					read_index--;
				}
			}
			if(cigar_prev_op != cigar_cur_op) {
				// add the previous cigar operator
				if(-1 != cigar_prev_op) {
					bam1_cigar(bam_new)[cigar_i] = (cigar_prev_length << BAM_CIGAR_SHIFT) | cigar_prev_op;
					if(1 == strand) { // reverse strand
						cigar_i--;
					}
					else {
						cigar_i++;
					}
				}

				// update the previous cigar operator
				cigar_prev_op = cigar_cur_op;
				if(cigar_cur_op == BAM_CDEL) {
					// deletion length
					cigar_prev_length = (int)fabs(heap->nodes[sw_node_cur_i].node->position - heap->nodes[sw_node_cur_i].node->position) - 1;
				}
				else {
					cigar_prev_length = 1;
				}
			}
			else {
				cigar_prev_length++;
			}
			// Update
			if(BAM_CDEL != cigar_cur_op) {
				sw_node_prev_i = sw_node_cur_i;
				sw_node_cur_i = heap->nodes[sw_node_cur_i].prev_i;
			}
		}
		if(0 < cigar_prev_length) {
			if(-1 == cigar_prev_op || BAM_CDEL == cigar_prev_op) {
				srma_error(__func__, "Alignment ended with a null cigar or a deletion", Exit, OutOfRange);
			}	
			bam1_cigar(bam_new)[cigar_i] = (cigar_prev_length << BAM_CIGAR_SHIFT) | cigar_prev_op;
			// DEBUG
			if(1 == strand) { // reverse strand
				assert(cigar_i == 0);
			}
			else {
				assert(cigar_i == bam_new->core.n_cigar-1);
			}
		}
	}

	{ // qualities
		uint8_t *qual_ptr = NULL;
		char qual, q1, q2;
		uint8_t prev_base = 0, next_base;

		bam_new->data_len += bam_new->core.l_qseq;
		sw_align_bam_alloc_data(bam_new, bam_new->data_len);
		qual_ptr = bam1_qual(bam_new);

		if(space == SRMA_SPACE_CS) {
			color_errors = srma_malloc(sizeof(char)*(1 + bam_new->core.l_qseq), __func__, "color_errors");
			prev_base = nt2int_table[(int)colors[0]];
			for(i=0;i<bam_new->core.l_qseq;i++) {
				if(0 == strand) {
					next_base = nt4bit_to_int[bam1_seqi(bam1_seq(bam_new), i)];
				}
				else {
					next_base = nt4bit_to_int[bam1_seqi(bam1_seq(bam_new), bam_new->core.l_qseq-i-1)];
					if(next_base < 4) next_base = 3 - next_base;
				}
				if((prev_base ^ next_base) == nt2int_table[(int)colors[i+1]]) {
					color_errors[i] = '-';
				}
				else {
					color_errors[i] = colors[i+1];
				}
				prev_base = next_base;
			}
			color_errors[i]='\0';

			// Get new base qualities based on color qualities
			for(i=0;i<bam_new->core.l_qseq;i++) {
				// use MAQ 0.7.1 conversion
				if(i == bam_new->core.l_qseq-1) { 
					qual = srma_char2qual(color_qualities[i]);
				}
				else {
					int m1, m2;
					if(0 == strand) { // forward
						m1 = ('-' == color_errors[i]) ? 1 : 0;
						m2 = ('-' == color_errors[i+1]) ? 1 : 0;
						q1 = color_qualities[i];
						q2 = color_qualities[i+1];
					}
					else {
						m1 = ('-' == color_errors[bam_new->core.l_qseq-i-1]) ? 1 : 0;
						m2 = ('-' == color_errors[bam_new->core.l_qseq-i-2]) ? 1 : 0;
						q1 = color_qualities[bam_new->core.l_qseq-i-1];
						q2 = color_qualities[bam_new->core.l_qseq-i-2];
					}
					if(1 == m1 && 1 == m2) {
						qual = srma_char2qual(q1) + srma_char2qual(q2) + 10; 
					}
					else if(1 == m1) {
						qual = srma_char2qual(q1) - srma_char2qual(q2);
					}
					else if(1 == m2) {
						qual = srma_char2qual(q2) - srma_char2qual(q1);
					}
					else {
						qual = 1;
					}
				}
				if(0 == strand) {
					bam1_qual(bam_new)[i] = __bound_qual(qual);
				}
				else {
					bam1_qual(bam_new)[bam_new->core.l_qseq-i-1] = __bound_qual(qual);
				}
			}
		}
		else if(1 == correct_bases) {
			// Get new base qualities
			for(i=0;i<bam_new->core.l_qseq;i++) {
				if(bam1_seqi(bam1_seq(bam_new), i) == bam1_seqi(bam1_seq(bam_old), i)) {
					bam1_qual(bam_new)[i] = bam1_qual(bam_old)[i];
				}
				else {
					qual = srma_char2qual(bam1_qual(bam_old)[i]) -  33; 
					bam1_qual(bam_new)[i] = srma_qual2char(__bound_qual(qual - SRMA_CORRECT_BASE_QUALITY_PENALTY));
				}
			}
		}
		else {
			// Copy old quality
			memcpy(bam1_qual(bam_new), bam1_qual(bam_old), bam_new->core.l_qseq);
		}
	}

	// TODO soft-clipping

	{ // Add in any auxiliary data as necessary
		uint8_t *s;
		int32_t i = 0;
		bam_new->l_aux = 0;
			
		while(NULL != sw_align_save_tags[i]) {
			__copy_old(sw_align_save_tags[i]);
			i++;
		}
		
		// TODO 
		// PG

		// TODO: is AS correct
		bam_aux_append(bam_new, "AS", 'i', sizeof(uint32_t), (uint8_t*)&heap->nodes[sw_node_best_i].score);
		if(1 == correct_bases) {
			int32_t l = bam_old->core.l_qseq;
			char *str;

			str = srma_malloc(sizeof(char)*(l+1), __func__, "seq");
			for(i=0;i<l;i++) {
				str[i] = bam_nt16_rev_table[bam1_seqi(bam1_seq(bam_old), i)];
			}
			str[i] = '\0';
			bam_aux_append(bam_new, "XO", 'Z', l+1, (uint8_t*)str);

			for(i=0;i<l;i++) {
				str[i] = bam1_qual(bam_old)[i] + 33;  
			}
			str[i] = '\0';
			bam_aux_append(bam_new, "XQ", 'Z', l+1, (uint8_t*)str);

			free(str);
		}
		bam_aux_append(bam_new, "XC", 'i', sizeof(uint32_t), (uint8_t*)&heap->nodes[sw_node_best_i].coverage_sum);
		if(space == SRMA_SPACE_CS) {
			bam_aux_append(bam_new, "XE", 'Z', bam_new->core.l_qseq+1, (uint8_t*)color_errors);
		}
	}

	// destroy the old bam structure
	bam_destroy1(bam_old);
		
	if(space == SRMA_SPACE_CS) {
		free(color_errors);
	}

	return bam_new;
}	
Esempio n. 15
0
// from bam_md.c in SAMtools
// modified not fill in the NM tag, and not to start the reference a c->pos
static void 
tmap_sam_md1_core(bam1_t *b, char *ref)
{
  uint8_t *seq = bam1_seq(b);
  uint32_t *cigar = bam1_cigar(b);
  bam1_core_t *c = &b->core;
  int i, x, y, u = 0;
  kstring_t *str;
  uint8_t *old_md, *old_nm;
  int32_t old_nm_i=-1, nm=0;

  str = (kstring_t*)calloc(1, sizeof(kstring_t));
  for (i = y = x = 0; i < c->n_cigar; ++i) {
      int j, l = cigar[i]>>4, op = cigar[i]&0xf;
      if (op == BAM_CMATCH) {
          for (j = 0; j < l; ++j) {
              int z = y + j;
              int c1 = bam1_seqi(seq, z), c2 = bam_nt16_table[(int)ref[x+j]];
              if (ref[x+j] == 0) break; // out of boundary
              if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
                  ++u;
              } else {
                  ksprintf(str, "%d", u);
                  kputc(ref[x+j], str);
                  u = 0; 
                  nm++;
              }
          }
          if (j < l) break;
          x += l; y += l;
      } else if (op == BAM_CDEL) {
          ksprintf(str, "%d", u);
          kputc('^', str);
          for (j = 0; j < l; ++j) {
              if (ref[x+j] == 0) break;
              kputc(ref[x+j], str);
          }
          u = 0;
          if (j < l) break;
          x += l; 
          nm += l;
      } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) {
          y += l;
          if (op == BAM_CINS) nm += l;
      } else if (op == BAM_CREF_SKIP) {
          x += l;
      }
  }
  ksprintf(str, "%d", u);

  // update MD
  old_md = bam_aux_get(b, "MD");
  if(NULL == old_md) {
      bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
  }
  else {
      int is_diff = 0;
      if(strlen((char*)old_md+1) == str->l) {
          for(i = 0; i < str->l; ++i) {
            if(toupper(old_md[i+1]) != toupper(str->s[i])) {
              break;
            }
          }
          if(i < str->l) {
              is_diff = 1;
          }
      } 
      else {
          is_diff = 1;
      }
      if(1 == is_diff) {
          bam_aux_del(b, old_md);
          bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
      }
  }

  // update NM
  old_nm = bam_aux_get(b, "NM");
  if(NULL != old_nm) {
      old_nm_i = bam_aux2i(old_nm);
      if(old_nm_i != nm) {
          bam_aux_del(b, old_nm);
          bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
      }
  }

  free(str->s); free(str);
}
Esempio n. 16
0
int bsstrand_func(bam1_t *b, const samfile_t *in, samfile_t *out, void *data) {

	bsstrand_data_t *d = (bsstrand_data_t*)data;
	bsstrand_conf_t *conf = d->conf;
	const bam1_core_t *c = &b->core;

	if (c->flag & BAM_FUNMAP){
		if (out) samwrite(out, b);
		d->n_unmapped++;
		return 0;
	}
	
	fetch_refseq(d->rs, in->header->target_name[c->tid], c->pos, c->pos+1);
	uint32_t rpos=c->pos+1, qpos=0;
	int i, nC2T = 0, nG2A = 0;
	uint32_t j;
	char rbase, qbase;

	for (i=0; i<c->n_cigar; ++i) {
		uint32_t op = bam_cigar_op(bam1_cigar(b)[i]);
		uint32_t oplen = bam_cigar_oplen(bam1_cigar(b)[i]);
		switch(op) {
		case BAM_CMATCH:
			for(j=0; j<oplen; ++j) {
				rbase = toupper(getbase_refseq(d->rs, rpos+j));
				qbase = bscall(bam1_seq(b), qpos+j);
				if (rbase == 'C' && qbase == 'T') nC2T += 1;
				if (rbase == 'G' && qbase == 'A') nG2A += 1;
				/* printf("%c vs %c\n", toupper(rbase), qbase); */
			}
			rpos += oplen;
			qpos += oplen;
			break;
		case BAM_CINS:
			qpos += oplen;
			break;
		case BAM_CDEL:
			rpos += oplen;
			break;
		case BAM_CSOFT_CLIP:
			qpos += oplen;
			break;
		default:
			fprintf(stderr, "Unknown cigar, %u\n", op);
			abort();
		}
	}

	char key[2] = {'Z','S'};
	unsigned char *bsstrand = bam_aux_get(b, key);
	if (bsstrand) {
		bsstrand++;
		double s = similarity(nG2A, nC2T);
		if (nG2A > 1 && nC2T > 1 && s > 0.5) {
			if (conf->output_read || conf->output_all_read)
				printf("F\t%s\t%d\t%d\t%d\t%s\t%s\t%1.2f\n", in->header->target_name[c->tid], c->pos, nC2T, nG2A, bam1_qname(b), bsstrand, s);
			bam_aux_append(b, "OS", 'A', 1, bsstrand);
			bsstrand[0] = '?';
			d->n_fail++;
		} else if (*bsstrand == '+' && nG2A > nC2T + 2) {
			if (conf->output_read || conf->output_all_read)
				printf("W2C\t%s\t%d\t%d\t%d\t%s\t%s\t%1.2f\n", in->header->target_name[c->tid], c->pos, nC2T, nG2A, bam1_qname(b), bsstrand, s);
			bam_aux_append(b, "OS", 'A', 1, bsstrand);
			bsstrand[0] = '-';
			d->n_corr++;
		} else if (*bsstrand == '-' && nC2T > nG2A + 2) {
			if (conf->output_read || conf->output_all_read)
				printf("C2W\t%s\t%d\t%d\t%d\t%s\t%s\t%1.2f\n", in->header->target_name[c->tid], c->pos, nC2T, nG2A, bam1_qname(b), bsstrand, s);
			bam_aux_append(b, "OS", 'A', 1, bsstrand);
			bsstrand[0] = '+';
			d->n_corr++;
		} else if (conf->output_all_read) {
			printf("N\t%s\t%d\t%d\t%d\t%s\t%s\t%1.2f\n", in->header->target_name[c->tid], c->pos, nC2T, nG2A, bam1_qname(b), bsstrand, s);
		}
	} else if (!(c->flag & BAM_FUNMAP) && conf->infer_bsstrand) {
		char bss[3];
		if (similarity(nG2A, nC2T) < 0.5) {
			strcpy(bss, "??");
		} else if (nC2T > nG2A) {
			strcpy(bss, c->flag & BAM_FREVERSE ? "+-" : "++");
		} else {
			strcpy(bss, c->flag & BAM_FREVERSE ? "-+" : "--");
		}
		bam_aux_append(b, "ZS", 'Z', 3, (uint8_t*) bss);
	}

	
	if (out) samwrite(out, b);
	d->n_mapped++;

	return 0;
}
Esempio n. 17
0
int do_grep() {
#ifdef DEBUGa
	printf("[!]do_grep\n");
#endif
	BamInfo_t *pbam;
	kh_cstr_t BamID;
	khiter_t ki, bami;
	kstring_t ks1 = { 0, 0, NULL };
	kstring_t ks2 = { 0, 0, NULL };
	kstring_t ks3 = { 0, 0, NULL };
	kstring_t kstr = { 0, 0, NULL };
	//ksprintf(kstr, "%s/%s_grep/", myConfig.WorkDir, myConfig.ProjectID);
	//const char *filePrefix = strdup(ks_str(kstr));
	//kputs(myConfig.WorkDir,kstr);

	samFile *in;
	bam_hdr_t *h;
	hts_idx_t *idx;
	bam1_t *b, *d, *d2, *bR1, *bR2, *bR3;
	bR1 = bam_init1(); bR2 = bam_init1(); bR3 = bam_init1();
	htsFile *out;
	//hts_opt *in_opts = NULL, *out_opts = NULL;
	int r = 0, exit_code = 0;

	kvec_t(bam1_t) R1, R2, RV;
	pierCluster_t *pierCluster;
	//samdat_t tmp_samdat;
#ifdef DEBUGa
	kstr.l = 0;
	ksprintf(&kstr, "%s/%s_grep/Greped.dump", myConfig.WorkDir, myConfig.ProjectID);
	FILE *fsdump = fopen(ks_str(&kstr),"w");
#endif
	kstr.l = 0;
	ksprintf(&kstr, "%s/%s_grep/Greped.ini", myConfig.WorkDir, myConfig.ProjectID);
	FILE *fs = fopen(ks_str(&kstr),"w");
	uint32_t blockid = 0;

	for (bami = kh_begin(bamNFOp); bami != kh_end(bamNFOp); ++bami) {
		//printf(">[%d]:\n",bami);
		if (kh_exist(bamNFOp, bami)) {
			kv_init(R1); kv_init(R2); kv_init(RV);
			//tmp_samdat = (const samdat_t){ 0 };
			//memset(&tmp_samdat,0,sizeof(samdat_t));
			//printf("-[%d]:\n",bami);
			BamID = kh_key(bamNFOp, bami);
			pbam = &kh_value(bamNFOp, bami);
			fprintf(stderr, "%u [%s]=%s\t%u %u\n",bami,BamID,pbam->fileName,pbam->insertSize,pbam->SD);

			in = sam_open(pbam->fileName, "r");
			if (in == NULL) {
				fprintf(stderr, "[x]Error opening \"%s\"\n", pbam->fileName);
				return EXIT_FAILURE;
			}
			h = sam_hdr_read(in);
			kstr.l = 0;
			ksprintf(&kstr, "%s/%s_grep/%s.bam", myConfig.WorkDir, myConfig.ProjectID, BamID);
			out = hts_open(ks_str(&kstr), "wb");
			if (out == NULL) {
				fprintf(stderr, "[x]Error opening [%s]\n",ks_str(&kstr));
				return EXIT_FAILURE;
			}
			if (sam_hdr_write(out, h) < 0) {
				fprintf(stderr, "[!]Error writing output header.\n");
				exit_code = 1;
			}

			int8_t *ChrIsHum;
			if (h == NULL) {
				fprintf(stderr, "[x]Couldn't read header for \"%s\"\n", pbam->fileName);
				return EXIT_FAILURE;
			} else {
				ChrIsHum = (int8_t *) malloc(h->n_targets * sizeof(int8_t));
				for (int32_t i=0; i < h->n_targets; ++i) {
					//ChrIsHum[i] = -1;
					ki = kh_get(chrNFO, chrNFOp, h->target_name[i]);
					if (ki == kh_end(chrNFOp)) {
						errx(4,"[x]Cannot find ChrID for [%s] !",h->target_name[i]);
					} else {
						ChrInfo_t * tmp = &kh_value(chrNFOp, ki);
						ChrIsHum[i] = tmp->isHum;
						//printf(">>> %d Chr:%s %d\n",i,h->target_name[i],ChrIsHum[i]);
					}
				}
			}
			h->ignore_sam_err = 0;
			b = bam_init1();
			d = bam_init1();
			d2 = bam_init1();
			if ((idx = sam_index_load(in, pbam->fileName)) == 0) {
				fprintf(stderr, "[E::%s] fail to load the BAM index\n", __func__);
				return 1;
			}
			pierCluster = sam_plp_init();
			while ((r = sam_read1(in, h, b)) >= 0) {
				int8_t flag = false;
				const bam1_core_t *c = &b->core;
				if (c->qual < myConfig.minBamQual) {
					continue;
				}
				if (c->n_cigar) {
					uint32_t *cigar = bam_get_cigar(b);
					int i = c->n_cigar; --i;
					if ( (bam_cigar_opchr(cigar[0])=='S' && bam_cigar_oplen(cigar[0]) >= myConfig.minGrepSlen) || 
						 (bam_cigar_opchr(cigar[i])=='S' && bam_cigar_oplen(cigar[i]) >= myConfig.minGrepSlen)    ) {
						flag = true;
					}
/* We only need /\d+S/ on both terminal, NOT inside.
					for (int i = 0; i < c->n_cigar; ++i) {
						if (bam_cigar_opchr(cigar[i])=='S') {	// soft clipping
							if ( bam_cigar_oplen(cigar[i]) >= myConfig.minGrepSlen ) {
								flag = true;
							}
						}
					}
*/
				}
				if (flag && ChrIsHum[c->tid]) {	// Now, skip Virus items.
					//bam_copy1(bR1, b);
					flag = 0;	// recycle
					//int enoughMapQ = 0;
					//kstring_t ks = { 0, 0, NULL };
					/*if (sam_format1(h, b, &ks1) < 0) {
						fprintf(stderr, "Error writing output.\n");
						exit_code = 1;
						break;
					} else */if ((c->mtid == c->tid && ChrIsHum[c->tid]) || (ChrIsHum[c->tid] ^ ChrIsHum[c->mtid])) {	// Only grep those mapped on same Human ChrID, or diff species/一方在病毒的情况.
						//printf(">[%s]\n",ks_str(&ks1));
						flag |= 1;
						//tmp_samdat.b = bam_dup1(b);
						//kv_push(samdat_t,R1,tmp_samdat);
						/*if (checkMapQ(ChrIsHum, b, true)) {
							++enoughMapQ;
						}*/
					}
					if (getPairedSam(in, idx, b, d) != 0) {
						flag &= ~1;
						continue;
					} else {
						flag |= 2;
						/*if (checkMapQ(ChrIsHum, d, false)) {
							++enoughMapQ;
						}*/
						if (c->flag & BAM_FSECONDARY) {
							if (getPairedSam(in, idx, d, d2) == 0) {
								//sam_format1(h, d2, &ks3);
								flag |= 4;
								/*if (checkMapQ(ChrIsHum, d2, false)) {
									++enoughMapQ;
								}*/
							}
						}
					}
/*
对于 BAM_FSECONDARY(256) 的 Read,跳两次 与 读 SA 项,效果一样。
>[sf95_Ref_48245009_48245108_48245208_Vir_-_2000_2044_R_100_90	353	chr2	13996555	0	50S40M	chr18	48245109	0ACACAACAATGTTCCGGAGACTCTAAGGCCTCCCGATACAGAGCAGAGGCCACACACACACACACCATGGAATACTATTCAGCCAAAAAA	CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC	NM:i:0	MD:Z:40	AS:i:40	XS:i:40	RG:Z:Fsimout_mB	SA:Z:rgi|59585|emb|X04615.1|,2000,-,40S46M4S,60,0;	YC:Z:CT	YD:Z:f]
-[sf95_Ref_48245009_48245108_48245208_Vir_-_2000_2044_R_100_90	177	chr18	48245109	9	40S50M	gi|59585|emb|X04615.1|2000	0	GTTCCGGAGACTCTAAGGCCTCCCGATACAGAGCAGAGGCCACACACACACACACCATGGAATACTATTCAGCCAAAAAAAGGAATTCAA	CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC	NM:i:0	MD:Z:50	AS:i:50	XS:i:46	RG:Z:Fsimout_mB	SA:Z:rgi|59585|emb|X04615.1|,2000,+,50S40M,9,0;	YC:Z:GA	YD:Z:f]
+[sf95_Ref_48245009_48245108_48245208_Vir_-_2000_2044_R_100_90	113	gi|59585|emb|X04615.1|	2000	60	40S46M4S	chr18	48245109	0	TTTTTTGGCTGAATAGTATTCCATGGTGTGTGTGTGTGTGGCCTCTGCTCTGTATCGGGAGGCCTTAGAGTCTCCGGAACATTGTTGTGT	CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC	NM:i:0	MD:Z:46	AS:i:46	XS:i:27	RG:Z:Fsimout_mB	SA:Z:fchr2,13996555,+,50S40M,0,0;	YC:Z:CT	YD:Z:r]
*/
					/*if (sam_format1(h, d, &ks2) < 0) {
						fprintf(stderr, "Error writing output.\n");
						exit_code = 1;
						break;
					}*/
					if (((flag & 3) == 3) /*&& enoughMapQ >= myConfig.samples*/) {
						/*printf(">%d[%s]\n",checkMapQ(ChrIsHum, b, true),ks_str(&ks1));
						printf("-%d[%s]\n",checkMapQ(ChrIsHum, d, false),ks_str(&ks2));
						if (flag & 4) {
							printf("+%d[%s]\n",checkMapQ(ChrIsHum, d2, false),ks_str(&ks3));
						}
						printf("<--%d\n",enoughMapQ);*/
						bam_aux_append(b, "Zd", 'Z', 2, (uint8_t*)"H");
						if (sam_plp_push(ChrIsHum, pierCluster, b) == 0) {
							//printf("--HumRange=%s:%d-%d\n", h->target_name[(pierCluster->HumanRange).tid], (pierCluster->HumanRange).pos, (pierCluster->HumanRange).endpos);
							if ((!ChrIsHum[(d->core).tid]) && (flag & 2)) {
								bam_aux_append(d, "Zd", 'Z', 2, (uint8_t*)"V");
								sam_plp_push(ChrIsHum, pierCluster, d);
							}
							if ((!ChrIsHum[(d2->core).tid]) && (flag & 4)) {
								bam_aux_append(d2, "Zd", 'Z', 2, (uint8_t*)"V");
								sam_plp_push(ChrIsHum, pierCluster, d2);
							}
						} else {
							++blockid;
							//print
#ifdef DEBUGa
							fprintf(fsdump,"[%u %s]\nHumRange=%s:%d-%d\n", blockid, BamID, h->target_name[(pierCluster->HumanRange).tid], (pierCluster->HumanRange).pos, (pierCluster->HumanRange).endpos);
							fprintf(fsdump,"VirRange=%s:%d-%d\n", h->target_name[(pierCluster->VirusRange).tid], (pierCluster->VirusRange).pos, (pierCluster->VirusRange).endpos);
#endif
							fprintf(fs,"[%u]\nBamID=%s\nHumRange=%s:%d-%d\n",blockid, BamID,
							h->target_name[(pierCluster->HumanRange).tid], (pierCluster->HumanRange).pos, (pierCluster->HumanRange).endpos);
							if ( (pierCluster->VirusRange).pos == 0 && (pierCluster->VirusRange).endpos == 0 ) {
								fprintf(fs,"VirRange=NA\n");
							} else {
								fprintf(fs,"VirRange=%s:%d-%d\n", h->target_name[(pierCluster->VirusRange).tid],
									(pierCluster->VirusRange).pos, (pierCluster->VirusRange).endpos);
							}
							for (size_t i=0; i<kv_size(pierCluster->Reads);++i) {
								bam1_t *bi = kv_A(pierCluster->Reads, i);
								bam_aux_append(bi, "Zc", 'i', sizeof(uint32_t), (uint8_t*)&blockid);
#ifdef DEBUGa
								if (sam_format1(h, bi, &ks1) < 0) {
									fprintf(stderr, "Error writing output.\n");
									exit_code = 1;
									break;
								} else {
									fprintf(fsdump,"%s\n",ks1.s);
								}
#endif
								if (sam_write1(out, h, bi) < 0) {
									fprintf(stderr, "[x]Error writing output.\n");
									exit_code = 1;
									break;
								}
							}
#ifdef DEBUGa
							fprintf(fsdump,"\n");
							printf("HumRange=%s:%d-%d\n", h->target_name[(pierCluster->HumanRange).tid], (pierCluster->HumanRange).pos, (pierCluster->HumanRange).endpos);
							//fflush(fs);
#endif
							sam_plp_dectroy(pierCluster);
							pierCluster = sam_plp_init();
						}
					}
				}
				//char *qname = bam_get_qname(b);
			}
			r = sam_close(out);   // stdout can only be closed once
			if (r < 0) {
				fprintf(stderr, "Error closing output.\n");
				exit_code = 1;
			}

			hts_idx_destroy(idx);
			bam_destroy1(b);
			bam_destroy1(d);
			bam_destroy1(d2);
			bam_hdr_destroy(h);
			r = sam_close(in);
			free(ChrIsHum);
#ifdef DEBUGa
			//fflush(NULL);
			//pressAnyKey();
#endif
			sam_plp_dectroy(pierCluster);
			//printf("<[%d]:\n",bami);
		}
	}
#ifdef DEBUGa
	fclose(fsdump);
#endif
	fclose(fs);
	getPairedSam(NULL, NULL, NULL, NULL);	// sam_close(fp2);
	//printf("---[%d]---\n",exit_code);
	bam_destroy1(bR1); bam_destroy1(bR2); bam_destroy1(bR3);
	ks_release(&ks1);
	ks_release(&ks2);
	ks_release(&ks3);
	ks_release(&kstr);
	//free((char*)filePrefix);
	return exit_code;
}
Esempio n. 18
0
int main(int argc, char *argv[])  
{  
  hashtable ht=new_hashtable(HASHSIZE);
  bamFile in,in2; 
  bamFile out; 
  
  if (argc != 3) {  
    fprintf(stderr, "Usage: bam_fix_NH <in.bam> <out.bam>\n");  
    return 1;  
  }  
  
  // Open file and exit if error
  //in = strcmp(argv[1], "-")? bam_open(argv[1], "rb") : bam_dopen(fileno(stdin), "rb");
  in = bam_open(argv[1], "rb");
  out = strcmp(argv[2], "-")? bam_open(argv[2], "w") : bam_dopen(fileno(stdout), "w"); 
  if (in == 0 ) {  
    fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[1]);  
    return 1;  
  }  
  if (out == 0) {  
    fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[2]);  
    return 1;  
  }  

  unsigned long num_alns=0;
  int ref;  

  // ***********
  // Copy header
  bam_header_t *header;
  header = bam_header_read(in);
  bam_header_write(out,header);

  // sorted by name?
  // Should not rely on the value in SO 
  bam1_t *aln=bam_init1();
  bam1_t *prev=bam_init1();

  printf("Hashing...\n");flush(stdout);
  while(bam_read1(in,aln)>=0) { // read alignment
    if (aln->core.tid < 0) continue;//ignore unaligned reads
    ++num_alns;
    new_read_aln(ht,bam1_qname(aln));
  }
  bam_close(in);  
  printf("Hashing complete (%lu alignments)\n",num_alns);
  printf("Memory used in the hash: %ld MB\n",index_mem/1024/1024);  
  flush(stdout);
  // reopen
  in2 = bam_open(argv[1], "rb");
  if (in2 == 0 ) {  
    fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[1]);  
    return 1;  
  }  

  header = bam_header_read(in2);
  
  while(bam_read1(in2,aln)>=0) { // read alignment
    if (aln->core.tid < 0) continue;//ignore unaligned reads
    ++num_alns;
    
    READ_ALN *r=get_read_aln(ht,bam1_qname(aln));

    //assert(r!=NULL);
    // update the NH field
    uint8_t *old_nh = bam_aux_get(aln, "NH");    
    uint8_t nh=r->ctr;
    if (old_nh) {
      if (nh!=bam_aux2i(old_nh)) {
	fprintf(stderr,"warning: value mismatch! replacing>%s %d->%d\n",bam1_qname(aln),bam_aux2i(old_nh),nh);
      }
      bam_aux_del(aln, old_nh);
      bam_aux_append(aln, "NH", 'i', 4, (uint8_t*)&nh);
    }
    if (!old_nh) { // add NH  
      bam_aux_append(aln, "NH", 'i', 4, (uint8_t*)&nh);
#ifdef DEBUG
      printf("!>%s %d\n",bam1_qname(aln),bam_aux2i(old_nh));
#endif
    }
    // in->header
    // Also fix the XS:A tag
    // BAM_FREAD1
    // BAM_FREAD2
    // BAM_FREVERSE the read is mapped to the reverse strand 
    //bam1_cigar(b) 
      //BAM_CREF_SKIP 3 CIGAR skip on the reference (e.g. spliced alignment)
      //BAM_FREVERSE 16 the read is mapped to the reverse strand
    if (aln->core.flag & BAM_FSECONDARY) continue; // skip secondary alignments
    if (aln->core.flag & ! BAM_FPAIRED) continue; // not paired
    if (aln->core.flag & ! BAM_FPROPER_PAIR) continue; // not a proper pair
    if (aln->core.flag & ! BAM_FMUNMAP) continue; // the mate is mapped
    if (aln->core.flag & BAM_FSECONDARY) continue; // secundary read
    if (aln->core.flag & BAM_FREAD2) continue; // only count each pair once
    // core.strand == 0 (f/+) 1 r/-
    // flag
    // bam1_qname(b)
    bam_write1(out,aln);
  }
  // 
  bam_destroy1(aln);
  bam_close(in2);  
  bam_close(out);  
  return 0;  
/*
uint8_t *old_nm = bam_aux_get(b, "NM");
90 	if (c->flag & BAM_FUNMAP) return;
91 	if (old_nm) old_nm_i = bam_aux2i(old_nm);
92 	if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
93 	else if (nm != old_nm_i) {
94 	fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam1_qname(b), old_nm_i, nm);
95 	bam_aux_del(b, old_nm);
96 	bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
97 	}
*/
}  
Esempio n. 19
0
void bam_fillmd1_core(bam1_t *b, char *ref, int ref_len, int flag, int max_nm)
{
    uint8_t *seq = bam_get_seq(b);
    uint32_t *cigar = bam_get_cigar(b);
    bam1_core_t *c = &b->core;
    int i, x, y, u = 0;
    kstring_t *str;
    int32_t old_nm_i = -1, nm = 0;

    str = (kstring_t*)calloc(1, sizeof(kstring_t));
    for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
        int j, l = cigar[i]>>4, op = cigar[i]&0xf;
        if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
            for (j = 0; j < l; ++j) {
                int c1, c2, z = y + j;
                if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds
                c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(int)ref[x+j]];
                if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
                    if (flag&USE_EQUAL) seq[z/2] &= (z&1)? 0xf0 : 0x0f;
                    ++u;
                } else {
                    kputw(u, str);
                    kputc(ref[x+j], str);
                    u = 0;
                    ++nm;
                }
            }
            if (j < l) break;
            x += l;
            y += l;
        } else if (op == BAM_CDEL) {
            kputw(u, str);
            kputc('^', str);
            for (j = 0; j < l; ++j) {
                if (x+j >= ref_len || ref[x+j] == '\0') break;
                kputc(ref[x+j], str);
            }
            u = 0;
            x += j;
            nm += j;
            if (j < l) break;
        } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) {
            y += l;
            if (op == BAM_CINS) nm += l;
        } else if (op == BAM_CREF_SKIP) {
            x += l;
        }
    }
    kputw(u, str);
    // apply max_nm
    if (max_nm > 0 && nm >= max_nm) {
        for (i = y = 0, x = c->pos; i < c->n_cigar; ++i) {
            int j, l = cigar[i]>>4, op = cigar[i]&0xf;
            if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
                for (j = 0; j < l; ++j) {
                    int c1, c2, z = y + j;
                    if (x+j >= ref_len || ref[x+j] == '\0') break; // out of bounds
                    c1 = bam_seqi(seq, z), c2 = seq_nt16_table[(int)ref[x+j]];
                    if ((c1 == c2 && c1 != 15 && c2 != 15) || c1 == 0) { // a match
                        seq[z/2] |= (z&1)? 0x0f : 0xf0;
                        bam_get_qual(b)[z] = 0;
                    }
                }
                if (j < l) break;
                x += l;
                y += l;
            } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) x += l;
            else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) y += l;
        }
    }
    // update NM
    if ((flag & UPDATE_NM) && !(c->flag & BAM_FUNMAP)) {
        uint8_t *old_nm = bam_aux_get(b, "NM");
        if (old_nm) old_nm_i = bam_aux2i(old_nm);
        if (!old_nm) bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
        else if (nm != old_nm_i) {
            fprintf(stderr, "[bam_fillmd1] different NM for read '%s': %d -> %d\n", bam_get_qname(b), old_nm_i, nm);
            bam_aux_del(b, old_nm);
            bam_aux_append(b, "NM", 'i', 4, (uint8_t*)&nm);
        }
    }
    // update MD
    if ((flag & UPDATE_MD) && !(c->flag & BAM_FUNMAP)) {
        uint8_t *old_md = bam_aux_get(b, "MD");
        if (!old_md) bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
        else {
            int is_diff = 0;
            if (strlen((char*)old_md+1) == str->l) {
                for (i = 0; i < str->l; ++i)
                    if (toupper(old_md[i+1]) != toupper(str->s[i]))
                        break;
                if (i < str->l) is_diff = 1;
            } else is_diff = 1;
            if (is_diff) {
                fprintf(stderr, "[bam_fillmd1] different MD for read '%s': '%s' -> '%s'\n", bam_get_qname(b), old_md+1, str->s);
                bam_aux_del(b, old_md);
                bam_aux_append(b, "MD", 'Z', str->l + 1, (uint8_t*)str->s);
            }
        }
    }

    // drop all tags but RG
    if (flag&DROP_TAG) {
        uint8_t *q = bam_aux_get(b, "RG");
        bam_aux_drop_other(b, q);
    }
    // reduce the resolution of base quality
    if (flag&BIN_QUAL) {
        uint8_t *qual = bam_get_qual(b);
        for (i = 0; i < b->core.l_qseq; ++i)
            if (qual[i] >= 3) qual[i] = qual[i]/10*10 + 7;
    }

    free(str->s);
    free(str);
}
Esempio n. 20
0
/*!
  @abstract    Merge multiple sorted BAM.
  @param  is_by_qname whether to sort by query name
  @param  out         output BAM file name
  @param  mode        sam_open() mode to be used to create the final output file
                      (overrides level settings from UNCOMP and LEVEL1 flags)
  @param  headers     name of SAM file from which to copy '@' header lines,
                      or NULL to copy them from the first file to be merged
  @param  n           number of files to be merged
  @param  fn          names of files to be merged
  @param  flag        flags that control how the merge is undertaken
  @param  reg         region to merge
  @param  n_threads   number of threads to use (passed to htslib)
  @discussion Padding information may NOT correctly maintained. This
  function is NOT thread safe.
 */
int bam_merge_core2(int by_qname, const char *out, const char *mode, const char *headers, int n, char * const *fn, int flag, const char *reg, int n_threads)
{
    samFile *fpout, **fp;
    heap1_t *heap;
    bam_hdr_t *hout = NULL;
    int i, j, *RG_len = NULL;
    uint64_t idx = 0;
    char **RG = NULL;
    hts_itr_t **iter = NULL;
    bam_hdr_t **hdr = NULL;
    trans_tbl_t *translation_tbl = NULL;

    // Is there a specified pre-prepared header to use for output?
    if (headers) {
        samFile* fpheaders = sam_open(headers, "r");
        if (fpheaders == NULL) {
            const char *message = strerror(errno);
            fprintf(pysamerr, "[bam_merge_core] cannot open '%s': %s\n", headers, message);
            return -1;
        }
        hout = sam_hdr_read(fpheaders);
        sam_close(fpheaders);
    }

    g_is_by_qname = by_qname;
    fp = (samFile**)calloc(n, sizeof(samFile*));
    heap = (heap1_t*)calloc(n, sizeof(heap1_t));
    iter = (hts_itr_t**)calloc(n, sizeof(hts_itr_t*));
    hdr = (bam_hdr_t**)calloc(n, sizeof(bam_hdr_t*));
    translation_tbl = (trans_tbl_t*)calloc(n, sizeof(trans_tbl_t));
    // prepare RG tag from file names
    if (flag & MERGE_RG) {
        RG = (char**)calloc(n, sizeof(char*));
        RG_len = (int*)calloc(n, sizeof(int));
        for (i = 0; i != n; ++i) {
            int l = strlen(fn[i]);
            const char *s = fn[i];
            if (l > 4 && strcmp(s + l - 4, ".bam") == 0) l -= 4;
            for (j = l - 1; j >= 0; --j) if (s[j] == '/') break;
            ++j; l -= j;
            RG[i] = (char*)calloc(l + 1, 1);
            RG_len[i] = l;
            strncpy(RG[i], s + j, l);
        }
    }
    // open and read the header from each file
    for (i = 0; i < n; ++i) {
        bam_hdr_t *hin;
        fp[i] = sam_open(fn[i], "r");
        if (fp[i] == NULL) {
            int j;
            fprintf(pysamerr, "[bam_merge_core] fail to open file %s\n", fn[i]);
            for (j = 0; j < i; ++j) sam_close(fp[j]);
            free(fp); free(heap);
            // FIXME: possible memory leak
            return -1;
        }
        hin = sam_hdr_read(fp[i]);
        if (hout)
            trans_tbl_init(hout, hin, translation_tbl+i, flag & MERGE_COMBINE_RG, flag & MERGE_COMBINE_PG);
        else {
            // As yet, no headers to merge into...
            hout = bam_hdr_dup(hin);
            // ...so no need to translate header into itself
            trans_tbl_init(hout, hin, translation_tbl+i, true, true);
        }

        // TODO sam_itr_next() doesn't yet work for SAM files,
        // so for those keep the headers around for use with sam_read1()
        if (hts_get_format(fp[i])->format == sam) hdr[i] = hin;
        else { bam_hdr_destroy(hin); hdr[i] = NULL; }

        if ((translation_tbl+i)->lost_coord_sort && !by_qname) {
            fprintf(pysamerr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]);
        }
    }

    // Transform the header into standard form
    pretty_header(&hout->text,hout->l_text);

    // If we're only merging a specified region move our iters to start at that point
    if (reg) {
        int* rtrans = rtrans_build(n, hout->n_targets, translation_tbl);

        int tid, beg, end;
        const char *name_lim = hts_parse_reg(reg, &beg, &end);
        char *name = malloc(name_lim - reg + 1);
        memcpy(name, reg, name_lim - reg);
        name[name_lim - reg] = '\0';
        tid = bam_name2id(hout, name);
        free(name);
        if (tid < 0) {
            fprintf(pysamerr, "[%s] Malformated region string or undefined reference name\n", __func__);
            return -1;
        }
        for (i = 0; i < n; ++i) {
            hts_idx_t *idx = sam_index_load(fp[i], fn[i]);
            // (rtrans[i*n+tid]) Look up what hout tid translates to in input tid space
            int mapped_tid = rtrans[i*hout->n_targets+tid];
            if (mapped_tid != INT32_MIN) {
                iter[i] = sam_itr_queryi(idx, mapped_tid, beg, end);
            } else {
                iter[i] = sam_itr_queryi(idx, HTS_IDX_NONE, 0, 0);
            }
            hts_idx_destroy(idx);
            if (iter[i] == NULL) break;
        }
        free(rtrans);
    } else {
        for (i = 0; i < n; ++i) {
            if (hdr[i] == NULL) {
                iter[i] = sam_itr_queryi(NULL, HTS_IDX_REST, 0, 0);
                if (iter[i] == NULL) break;
            }
            else iter[i] = NULL;
        }
    }

    if (i < n) {
        fprintf(pysamerr, "[%s] Memory allocation failed\n", __func__);
        return -1;
    }

    // Load the first read from each file into the heap
    for (i = 0; i < n; ++i) {
        heap1_t *h = heap + i;
        h->i = i;
        h->b = bam_init1();
        if ((iter[i]? sam_itr_next(fp[i], iter[i], h->b) : sam_read1(fp[i], hdr[i], h->b)) >= 0) {
            bam_translate(h->b, translation_tbl + i);
            h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)((int32_t)h->b->core.pos+1)<<1 | bam_is_rev(h->b);
            h->idx = idx++;
        }
        else {
            h->pos = HEAP_EMPTY;
            bam_destroy1(h->b);
            h->b = NULL;
        }
    }

    // Open output file and write header
    if ((fpout = sam_open(out, mode)) == 0) {
        fprintf(pysamerr, "[%s] fail to create the output file.\n", __func__);
        return -1;
    }
    sam_hdr_write(fpout, hout);
    if (!(flag & MERGE_UNCOMP)) hts_set_threads(fpout, n_threads);

    // Begin the actual merge
    ks_heapmake(heap, n, heap);
    while (heap->pos != HEAP_EMPTY) {
        bam1_t *b = heap->b;
        if (flag & MERGE_RG) {
            uint8_t *rg = bam_aux_get(b, "RG");
            if (rg) bam_aux_del(b, rg);
            bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]);
        }
        sam_write1(fpout, hout, b);
        if ((j = (iter[heap->i]? sam_itr_next(fp[heap->i], iter[heap->i], b) : sam_read1(fp[heap->i], hdr[heap->i], b))) >= 0) {
            bam_translate(b, translation_tbl + heap->i);
            heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam_is_rev(b);
            heap->idx = idx++;
        } else if (j == -1) {
            heap->pos = HEAP_EMPTY;
            bam_destroy1(heap->b);
            heap->b = NULL;
        } else fprintf(pysamerr, "[bam_merge_core] '%s' is truncated. Continue anyway.\n", fn[heap->i]);
        ks_heapadjust(heap, 0, n, heap);
    }

    // Clean up and close
    if (flag & MERGE_RG) {
        for (i = 0; i != n; ++i) free(RG[i]);
        free(RG); free(RG_len);
    }
    for (i = 0; i < n; ++i) {
        trans_tbl_destroy(translation_tbl + i);
        hts_itr_destroy(iter[i]);
        bam_hdr_destroy(hdr[i]);
        sam_close(fp[i]);
    }
    bam_hdr_destroy(hout);
    sam_close(fpout);
    free(translation_tbl); free(fp); free(heap); free(iter); free(hdr);
    return 0;
}
Esempio n. 21
0
int main(int argc, char *argv[])  
{  
  short out2stdout=0;
  hashtable ht=new_hashtable(HASHSIZE);
  bamFile in,in2; 
  bamFile out; 
  int paired;//1 if not paired or pair read 1, 2 otherwise
  index_mem=sizeof(hashtable)*sizeof(hashnode**)*HASHSIZE*2;

  if (argc != 3) {  
    fprintf(stderr, "Usage: bam_fix_NH <in.bam> <out.bam or - for stdout>\n");  
    return 1;  
  }  
  // Open file and exit if error
  in = bam_open(argv[1], "rb");
  out2stdout = strcmp(argv[2], "-")? 0 : 1; 
  out = strcmp(argv[2], "-")? bam_open(argv[2], "w") : bam_dopen(fileno(stdout), "w"); 
  if (in == 0 ) {  
    fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[1]);  
    return 1;  
  }  
  if (out == 0) {  
    fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[2]);  
    return 1;  
  }  

  unsigned long num_alns=0;
  int ref;  

  // ***********
  // Copy header
  bam_header_t *header;
  header = bam_header_read(in);
  bam_header_write(out,header);

  // sorted by name?
  // Should not rely on the value in SO 
  bam1_t *aln=bam_init1();
  bam1_t *prev=bam_init1();

  if (!out2stdout) {
    fprintf(stderr,"bam_fix_NH version %s\n",VERSION);
    fprintf(stderr,"Processing %s\n",argv[1]);
    fprintf(stderr,"Hashing...\n");fflush(stderr);
  }

  while(bam_read1(in,aln)>=0) { // read alignment
    if (aln->core.tid < 0) continue;//ignore unaligned reads
    if (aln->core.flag & BAM_FUNMAP) continue;
    if (aln->core.flag & BAM_FREAD2) paired=2;
    else paired=1;
    ++num_alns;
    new_read_aln(ht,fix_read_name(bam1_qname(aln),paired));
    if(!out2stdout) PRINT_ALNS_PROCESSED(num_alns);
  }
  bam_close(in);  
  if(!out2stdout) {
    fprintf(stderr,"%s%lu\n",BACKLINE,num_alns);
    fprintf(stderr,"Hashing complete (%lu alignments)\n",num_alns);
    fprintf(stderr,"Memory used: %ld MB\n",index_mem/1024/1024);  
    fprintf(stderr,"Updating entries with NH and printing BAM...\n");
    fflush(stderr);
  }
  // reopen
  in2 = bam_open(argv[1], "rb");
  if (in2 == 0 ) {  
    fprintf(stderr, "ERROR: Fail to open BAM file %s\n", argv[1]);  
    return 1;  
  }  

  header = bam_header_read(in2);
  num_alns=0;
  while(bam_read1(in2,aln)>=0) { // read alignment
    paired=1;
    if (aln->core.tid < 0) continue;//ignore unaligned reads
    if (aln->core.flag & BAM_FUNMAP) continue;
    if (aln->core.flag & BAM_FREAD2) paired=2;
    ++num_alns;
    READ_ALN *r=get_read_aln(ht,fix_read_name(bam1_qname(aln),paired));

    assert(r!=NULL);
    // update the NH field
    uint8_t *old_nh = bam_aux_get(aln, "NH");    
    int32_t nh=r->ctr;
    if (old_nh) {
      if (nh!=bam_aux2i(old_nh)) {
	fprintf(stderr,"warning: value mismatch! replacing>%s %d->%d\n",bam1_qname(aln),bam_aux2i(old_nh),nh);
      }
      bam_aux_del(aln, old_nh);
      bam_aux_append(aln, "NH", 'i', 4, (uint8_t*)&nh);
#ifdef DEBUG
      //      printf("!>%s %d\n",bam1_qname(aln),r->ctr);
#endif
    }
    if (!old_nh) { // add NH  
      bam_aux_append(aln, "NH", 'i', 4, (uint8_t*)&nh);
#ifdef DEBUG
      fprintf(stderr,"!>%s %d\n",bam1_qname(aln),bam_aux2i(old_nh));
#endif
    }
    bam_write1(out,aln);
    if(!out2stdout) PRINT_ALNS_PROCESSED(num_alns);
  }
  // 
  bam_destroy1(aln);
  bam_close(in2);  
  bam_close(out);  
  if(!out2stdout) {
    fprintf(stderr,"%s%lu\n",BACKLINE,num_alns);
    fprintf(stderr,"Done.\n");
  }
  return 0;  
}  
Esempio n. 22
0
File: sam.c Progetto: atks/vt
static int aux_fields1(void)
{
    static const char sam[] = "data:,"
"@SQ\tSN:one\tLN:1000\n"
"@SQ\tSN:two\tLN:500\n"
"r1\t0\tone\t500\t20\t8M\t*\t0\t0\tATGCATGC\tqqqqqqqq\tXA:A:k\tXi:i:37\tXf:f:" xstr(PI) "\tXd:d:" xstr(E) "\tXZ:Z:" HELLO "\tXH:H:" BEEF "\tXB:B:c,-2,0,+2\tB0:B:i,-2147483648,-1,0,1,2147483647\tB1:B:I,0,1,2147483648,4294967295\tB2:B:s,-32768,-1,0,1,32767\tB3:B:S,0,1,32768,65535\tB4:B:c,-128,-1,0,1,127\tB5:B:C,0,1,127,255\tBf:B:f,-3.14159,2.71828\tZZ:i:1000000\tF2:d:2.46801\tY1:i:-2147483648\tY2:i:-2147483647\tY3:i:-1\tY4:i:0\tY5:i:1\tY6:i:2147483647\tY7:i:2147483648\tY8:i:4294967295\n";

    // Canonical form of the alignment record above, as output by sam_format1()
    static const char r1[] = "r1\t0\tone\t500\t20\t8M\t*\t0\t0\tATGCATGC\tqqqqqqqq\tXi:i:37\tXf:f:3.14159\tXd:d:2.71828\tXZ:Z:" NEW_HELLO "\tXH:H:" BEEF "\tXB:B:c,-2,0,2\tB0:B:i,-2147483648,-1,0,1,2147483647\tB1:B:I,0,1,2147483648,4294967295\tB2:B:s,-32768,-1,0,1,32767\tB3:B:S,0,1,32768,65535\tB4:B:c,-128,-1,0,1,127\tB5:B:C,0,1,127,255\tBf:B:f,-3.14159,2.71828\tZZ:i:1000000\tF2:f:9.8765\tY1:i:-2147483648\tY2:i:-2147483647\tY3:i:-1\tY4:i:0\tY5:i:1\tY6:i:2147483647\tY7:i:2147483648\tY8:i:4294967295\tN0:i:-1234\tN1:i:1234\tN2:i:-2\tN3:i:3\tF1:f:4.5678\tN4:B:S,65535,32768,1,0\tN5:i:4242";

    samFile *in = sam_open(sam, "r");
    bam_hdr_t *header = sam_hdr_read(in);
    bam1_t *aln = bam_init1();
    uint8_t *p;
    kstring_t ks = { 0, 0, NULL };
    int64_t b0vals[5] = { -2147483648LL,-1,0,1,2147483647LL }; // i
    int64_t b1vals[4] = { 0,1,2147483648LL,4294967295LL };     // I
    int64_t b2vals[5] = { -32768,-1,0,1,32767 };           // s
    int64_t b3vals[4] = { 0,1,32768,65535 };               // S
    int64_t b4vals[5] = { -128,-1,0,1,127 };               // c
    int64_t b5vals[4] = { 0,1,127,255 };                   // C
    // NB: Floats not doubles below!
    // See https://randomascii.wordpress.com/2012/06/26/doubles-are-not-floats-so-dont-compare-them/
    float bfvals[2] = { -3.14159f, 2.71828f };

    int8_t n4v1[] = { -128, -64, -32, -16, -8, -4, -2, -1,
                      0, 1, 2, 4, 8, 16, 32, 64, 127 };
    uint32_t n4v2[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1234, 5678, 1U << 31, 0 };
    int16_t n4v3[] = { -32768, -1, 0, 1, 32767 };
    float n4v4[] = { 0, 1, 2, 10, 20, 30, 1.5, -2.5 };
    uint8_t n4v5[] = { 0, 255 };
    int32_t n4v6[] = { -2147483647 - 1, 10, -1, 0, 1, 2147483647 };
    uint16_t n4v7[] = { 65535, 32768, 1, 0 };

    int32_t ival = -1234;
    uint32_t uval = 1234;
    float f1 = 4.5678;
    float f2 = 9.8765;

    size_t nvals, i;

    if (sam_read1(in, header, aln) >= 0) {
        if ((p = check_bam_aux_get(aln, "XA", 'A')) && bam_aux2A(p) != 'k')
            fail("XA field is '%c', expected 'k'", bam_aux2A(p));

        bam_aux_del(aln,p);
        if (bam_aux_get(aln,"XA"))
            fail("XA field was not deleted");

        if ((p = check_bam_aux_get(aln, "Xi", 'C')) && bam_aux2i(p) != 37)
            fail("Xi field is %"PRId64", expected 37", bam_aux2i(p));

        if ((p = check_bam_aux_get(aln, "Xf", 'f')) && fabs(bam_aux2f(p) - PI) > 1E-6)
            fail("Xf field is %.12f, expected pi", bam_aux2f(p));

        if ((p = check_bam_aux_get(aln, "Xd", 'd')) && fabs(bam_aux2f(p) - E) > 1E-6)
            fail("Xf field is %.12f, expected e", bam_aux2f(p));

        if ((p = check_bam_aux_get(aln, "XZ", 'Z')) && strcmp(bam_aux2Z(p), HELLO) != 0)
            fail("XZ field is \"%s\", expected \"%s\"", bam_aux2Z(p), HELLO);

        bam_aux_update_str(aln,"XZ",strlen(NEW_HELLO)+1,NEW_HELLO);
        if ((p = check_bam_aux_get(aln, "XZ", 'Z')) && strcmp(bam_aux2Z(p), NEW_HELLO) != 0)
            fail("XZ field is \"%s\", expected \"%s\"", bam_aux2Z(p), NEW_HELLO);


        if ((p = check_bam_aux_get(aln, "XH", 'H')) && strcmp(bam_aux2Z(p), BEEF) != 0)
            fail("XH field is \"%s\", expected \"%s\"", bam_aux2Z(p), BEEF);

        if ((p = check_bam_aux_get(aln, "XB", 'B'))
            && ! (memcmp(p, "Bc", 2) == 0
                  && memcmp(p + 2, "\x03\x00\x00\x00\xfe\x00\x02", 7) == 0))
            fail("XB field is %c,..., expected c,-2,0,+2", p[1]);

        check_int_B_array(aln, "B0", NELE(b0vals), b0vals);
        check_int_B_array(aln, "B1", NELE(b1vals), b1vals);
        check_int_B_array(aln, "B2", NELE(b2vals), b2vals);
        check_int_B_array(aln, "B3", NELE(b3vals), b3vals);
        check_int_B_array(aln, "B4", NELE(b4vals), b4vals);
        check_int_B_array(aln, "B5", NELE(b5vals), b5vals);

        nvals = NELE(bfvals);
        if ((p = check_bam_aux_get(aln, "Bf", 'B')) != NULL) {
            if (bam_auxB_len(p) != nvals)
                fail("Wrong length reported for Bf field, got %d, expected %zd\n",
                     bam_auxB_len(p), nvals);

            for (i = 0; i < nvals; i++) {
                if (bam_auxB2f(p, i) != bfvals[i]) {
                    fail("Wrong value from bam_auxB2f for Bf field index %zd, "
                         "got %f expected %f\n",
                         i, bam_auxB2f(p, i), bfvals[i]);
                }
            }
        }

        if ((p = check_bam_aux_get(aln, "ZZ", 'I')) && bam_aux2i(p) != 1000000)
            fail("ZZ field is %"PRId64", expected 1000000", bam_aux2i(p));

        if ((p = bam_aux_get(aln, "Y1")) && bam_aux2i(p) != -2147483647-1)
            fail("Y1 field is %"PRId64", expected -2^31", bam_aux2i(p));

        if ((p = bam_aux_get(aln, "Y2")) && bam_aux2i(p) != -2147483647)
            fail("Y2 field is %"PRId64", expected -2^31+1", bam_aux2i(p));

        if ((p = bam_aux_get(aln, "Y3")) && bam_aux2i(p) != -1)
            fail("Y3 field is %"PRId64", expected -1", bam_aux2i(p));

        if ((p = bam_aux_get(aln, "Y4")) && bam_aux2i(p) != 0)
            fail("Y4 field is %"PRId64", expected 0", bam_aux2i(p));

        if ((p = bam_aux_get(aln, "Y5")) && bam_aux2i(p) != 1)
            fail("Y5 field is %"PRId64", expected 1", bam_aux2i(p));

        if ((p = bam_aux_get(aln, "Y6")) && bam_aux2i(p) != 2147483647)
            fail("Y6 field is %"PRId64", expected 2^31-1", bam_aux2i(p));

        if ((p = bam_aux_get(aln, "Y7")) && bam_aux2i(p) != 2147483648LL)
            fail("Y7 field is %"PRId64", expected 2^31", bam_aux2i(p));

        if ((p = bam_aux_get(aln, "Y8")) && bam_aux2i(p) != 4294967295LL)
            fail("Y8 field is %"PRId64", expected 2^32-1", bam_aux2i(p));

        // Try appending some new tags
        if (bam_aux_append(aln, "N0", 'i', sizeof(ival), (uint8_t *) &ival) != 0)
            fail("Failed to append N0:i tag");

        if ((p = bam_aux_get(aln, "N0")) && bam_aux2i(p) != ival)
            fail("N0 field is %"PRId64", expected %d", bam_aux2i(p), ival);

        if (bam_aux_append(aln, "N1", 'I', sizeof(uval), (uint8_t *) &uval) != 0)
            fail("failed to append N1:I tag");

        if ((p = bam_aux_get(aln, "N1")) && bam_aux2i(p) != uval)
            fail("N1 field is %"PRId64", expected %u", bam_aux2i(p), uval);

        // Append tags with bam_aux_update_int()
        if (bam_aux_update_int(aln, "N2", -2) < 0)
            fail("failed to append N2:c tag");

        if (bam_aux_update_int(aln, "N3", 3) < 0)
            fail("failed to append N3:C tag");

        p = bam_aux_get(aln, "N2");
        if (!p)
            fail("failed to retrieve N2 tag");
        else if (*p != 'c' || bam_aux2i(p) != -2)
            fail("N2 field is %c:%"PRId64", expected c:-2", *p, bam_aux2i(p));

        p = bam_aux_get(aln, "N3");
        if (!p)
            fail("failed to retrieve N3 tag");
        else if (*p != 'C' || bam_aux2i(p) != 3)
            fail("N3 field is %c:%"PRId64", expected C:3", *p, bam_aux2i(p));

        // Try changing values with bam_aux_update_int()
        i = test_update_int(aln, "N2", 2, 'C', "N3", 3, 'C');
        if (i == 0) test_update_int(aln, "N2", 1234, 'S', "N3", 3, 'C');
        if (i == 0) test_update_int(aln, "N2", -1, 's', "N3", 3, 'C');
        if (i == 0) test_update_int(aln, "N2", 4294967295U, 'I', "N3", 3, 'C');
        if (i == 0) test_update_int(aln, "N2", -2, 'i', "N3", 3, 'C');

        // Append a value with bam_aux_update_float()
        if (bam_aux_update_float(aln, "F1", f1) < 0)
            fail("append F1:f tag");

        p = bam_aux_get(aln, "F1");
        if (!p)
            fail("retrieve F1 tag");
        else if (*p != 'f' || bam_aux2f(p) != f1)
            fail("F1 field is %c:%e, expected f:%e", *p, bam_aux2f(p), f1);

        // Change a double tag to a float
        if (bam_aux_update_float(aln, "F2", f2) < 0)
            fail("update F2 tag");

        p = bam_aux_get(aln, "F2");
        if (!p)
            fail("retrieve F2 tag");
        else if (*p != 'f' || bam_aux2f(p) != f2)
            fail("F2 field is %c:%e, expected f:%e", *p, bam_aux2f(p), f2);

        // Check the next one is intact too
        p = bam_aux_get(aln, "Y1");
        if (!p)
            fail("retrieve Y1 tag");
        else if (*p != 'i' && bam_aux2i(p) != -2147483647-1)
            fail("Y1 field is %"PRId64", expected -2^31", bam_aux2i(p));

        // bam_aux_update_array tests
        // append a new array
        i = test_update_array(aln, "N4", 'c', NELE(n4v1), n4v1, "\0\0", 0, 0);

        // Add a sentinal to check resizes work
        if (i == 0) i = test_update_int(aln, "N5", 4242, 'S', "\0\0", 0, 0);

        // alter the array tag a few times
        if (i == 0)
            i = test_update_array(aln, "N4", 'I', NELE(n4v2), n4v2,
                                  "N5", 4242, 'S');
        if (i == 0)
            i = test_update_array(aln, "N4", 's', NELE(n4v3), n4v3,
                                  "N5", 4242, 'S');
        if (i == 0)
            i = test_update_array(aln, "N4", 'f', NELE(n4v4), n4v4,
                                  "N5", 4242, 'S');
        if (i == 0)
            i = test_update_array(aln, "N4", 'c', NELE(n4v5), n4v5,
                                  "N5", 4242, 'S');
        if (i == 0)
            i = test_update_array(aln, "N4", 'i', NELE(n4v6), n4v6,
                                  "N5", 4242, 'S');
        if (i == 0)
            i = test_update_array(aln, "N4", 'S', NELE(n4v7), n4v7,
                                  "N5", 4242, 'S');

        if (sam_format1(header, aln, &ks) < 0)
            fail("can't format record");

        if (strcmp(ks.s, r1) != 0)
            fail("record formatted incorrectly: \"%s\"", ks.s);

        free(ks.s);
    }
    else fail("can't read record");

    bam_destroy1(aln);
    bam_hdr_destroy(header);
    sam_close(in);

    return 1;
}