Beispiel #1
0
/*
 * Reads a file and outputs a new CRAM file to stdout with 'h'
 * replaced as the header.  No checks are made to the validity.
 *
 * FIXME: error checking
 */
int cram_reheader(cram_fd *in, bam_hdr_t *h, const char *arg_list, int add_PG)
{
    htsFile *h_out = hts_open("-", "wc");
    cram_fd *out = h_out->fp.cram;
    cram_container *c = NULL;
    int ret = -1;

    // Attempt to fill out a cram->refs[] array from @SQ headers
    cram_fd_set_header(out, sam_hdr_parse_(h->text, h->l_text));
    if (add_PG) {
        if (sam_hdr_add_PG(cram_fd_get_header(out), "samtools",
                           "VN", samtools_version(),
                           arg_list ? "CL": NULL,
                           arg_list ? arg_list : NULL,
                           NULL) != 0)
            goto err;

        // Covert back to bam_hdr_t struct
        free(h->text);
        h->text = strdup(sam_hdr_str(cram_fd_get_header(out)));
        h->l_text = sam_hdr_length(cram_fd_get_header(out));
        if (!h->text)
            goto err;
    }

    if (sam_hdr_write(h_out, h) != 0)
        goto err;
    cram_set_option(out, CRAM_OPT_REFERENCE, NULL);

    while ((c = cram_read_container(in))) {
        int32_t i, num_blocks = cram_container_get_num_blocks(c);
        if (cram_write_container(out, c) != 0)
            goto err;

        for (i = 0; i < num_blocks; i++) {
            cram_block *blk = cram_read_block(in);
            if (!blk || cram_write_block(out, blk) != 0) {
                if (blk) cram_free_block(blk);
                goto err;
            }
            cram_free_block(blk);
        }
        cram_free_container(c);
    }

    ret = 0;

 err:
    if (hts_close(h_out) != 0)
        ret = -1;

    return ret;
}
Beispiel #2
0
/*
 * Reads a file and outputs a new BAM file to fd with 'h' replaced as
 * the header.    No checks are made to the validity.
 */
int bam_reheader(BGZF *in, bam_hdr_t *h, int fd,
                 const char *arg_list, int add_PG)
{
    BGZF *fp;
    ssize_t len;
    uint8_t *buf;
    if (in->is_write) return -1;
    buf = malloc(BUF_SIZE);
    if (bam_hdr_read(in) == NULL) {
        fprintf(stderr, "Couldn't read header\n");
        free(buf);
        return -1;
    }
    fp = bgzf_fdopen(fd, "w");

    if (add_PG) {
        // Around the houses, but it'll do until we can manipulate bam_hdr_t natively.
        SAM_hdr *sh = sam_hdr_parse_(h->text, h->l_text);
        if (sam_hdr_add_PG(sh, "samtools",
                           "VN", samtools_version(),
                           arg_list ? "CL": NULL,
                           arg_list ? arg_list : NULL,
                           NULL) != 0)
            return -1;

        free(h->text);
        h->text = strdup(sam_hdr_str(sh));
        h->l_text = sam_hdr_length(sh);
        if (!h->text)
            return -1;
        sam_hdr_free(sh);
    }

    bam_hdr_write(fp, h);
    if (in->block_offset < in->block_length) {
        bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset);
        bgzf_flush(fp);
    }
    while ((len = bgzf_raw_read(in, buf, BUF_SIZE)) > 0)
        bgzf_raw_write(fp, buf, len);
    free(buf);
    fp->block_offset = in->block_offset = 0;
    bgzf_close(fp);
    return 0;
}
Beispiel #3
0
SAM_hdr *bam_header_to_cram(bam_hdr_t *h) {
    return sam_hdr_parse_(h->text, h->l_text);
}
Beispiel #4
0
int main(int argc, char *argv[])
{
    samFile *in;
    char *fn_ref = 0;
    int flag = 0, c, clevel = -1, ignore_sam_err = 0;
    char moder[8];
    bam_hdr_t *h;
    bam1_t *b;
    htsFile *out;
    char modew[8];
    int r = 0, exit_code = 0;
    hts_opt *in_opts = NULL, *out_opts = NULL, *last = NULL;
    int nreads = 0;
    int benchmark = 0;

    while ((c = getopt(argc, argv, "IbDCSl:t:i:o:N:B")) >= 0) {
        switch (c) {
        case 'S': flag |= 1; break;
        case 'b': flag |= 2; break;
        case 'D': flag |= 4; break;
        case 'C': flag |= 8; break;
        case 'B': benchmark = 1; break;
        case 'l': clevel = atoi(optarg); flag |= 2; break;
        case 't': fn_ref = optarg; break;
        case 'I': ignore_sam_err = 1; break;
        case 'i': if (add_option(&in_opts,  optarg)) return 1; break;
        case 'o': if (add_option(&out_opts, optarg)) return 1; break;
        case 'N': nreads = atoi(optarg);
        }
    }
    if (argc == optind) {
        fprintf(stderr, "Usage: samview [-bSCSIB] [-N num_reads] [-l level] [-o option=value] <in.bam>|<in.sam>|<in.cram> [region]\n");
        return 1;
    }
    strcpy(moder, "r");
    if (flag&4) strcat(moder, "c");
    else if ((flag&1) == 0) strcat(moder, "b");

    in = sam_open(argv[optind], moder);
    if (in == NULL) {
        fprintf(stderr, "Error opening \"%s\"\n", argv[optind]);
        return EXIT_FAILURE;
    }
    h = sam_hdr_read(in);
    h->ignore_sam_err = ignore_sam_err;
    b = bam_init1();

    strcpy(modew, "w");
    if (clevel >= 0 && clevel <= 9) sprintf(modew + 1, "%d", clevel);
    if (flag&8) strcat(modew, "c");
    else if (flag&2) strcat(modew, "b");
    out = hts_open("-", modew);
    if (out == NULL) {
        fprintf(stderr, "Error opening standard output\n");
        return EXIT_FAILURE;
    }

    /* CRAM output */
    if (flag & 8) {
        int ret;

        // Parse input header and use for CRAM output
        out->fp.cram->header = sam_hdr_parse_(h->text, h->l_text);

        // Create CRAM references arrays
        if (fn_ref)
            ret = cram_set_option(out->fp.cram, CRAM_OPT_REFERENCE, fn_ref);
        else
            // Attempt to fill out a cram->refs[] array from @SQ headers
            ret = cram_set_option(out->fp.cram, CRAM_OPT_REFERENCE, NULL);

        if (ret != 0)
            return EXIT_FAILURE;
    }

    // Process any options; currently cram only.
    for (; in_opts;  in_opts = (last=in_opts)->next, free(last)) {
        hts_set_opt(in,  in_opts->opt,  in_opts->val);
        if (in_opts->opt == CRAM_OPT_REFERENCE)
            if (hts_set_opt(out,  in_opts->opt,  in_opts->val) != 0)
                return EXIT_FAILURE;
    }
    for (; out_opts;  out_opts = (last=out_opts)->next, free(last))
        if (hts_set_opt(out, out_opts->opt,  out_opts->val) != 0)
            return EXIT_FAILURE;

    if (!benchmark)
        sam_hdr_write(out, h);
    if (optind + 1 < argc && !(flag&1)) { // BAM input and has a region
        int i;
        hts_idx_t *idx;
        if ((idx = sam_index_load(in, argv[optind])) == 0) {
            fprintf(stderr, "[E::%s] fail to load the BAM index\n", __func__);
            return 1;
        }
        for (i = optind + 1; i < argc; ++i) {
            hts_itr_t *iter;
            if ((iter = sam_itr_querys(idx, h, argv[i])) == 0) {
                fprintf(stderr, "[E::%s] fail to parse region '%s'\n", __func__, argv[i]);
                continue;
            }
            while ((r = sam_itr_next(in, iter, b)) >= 0) {
                if (!benchmark && sam_write1(out, h, b) < 0) {
                    fprintf(stderr, "Error writing output.\n");
                    exit_code = 1;
                    break;
                }
                if (nreads && --nreads == 0)
                    break;
            }
            hts_itr_destroy(iter);
        }
        hts_idx_destroy(idx);
    } else while ((r = sam_read1(in, h, b)) >= 0) {
        if (!benchmark && sam_write1(out, h, b) < 0) {
            fprintf(stderr, "Error writing output.\n");
            exit_code = 1;
            break;
        }
        if (nreads && --nreads == 0)
            break;
    }

    if (r < -1) {
        fprintf(stderr, "Error parsing input.\n");
        exit_code = 1;
    }

    r = sam_close(out);
    if (r < 0) {
        fprintf(stderr, "Error closing output.\n");
        exit_code = 1;
    }

    bam_destroy1(b);
    bam_hdr_destroy(h);

    r = sam_close(in);
    if (r < 0) {
        fprintf(stderr, "Error closing input.\n");
        exit_code = 1;
    }

    return exit_code;
}
Beispiel #5
0
int main(int argc, char *argv[])
{
	samFile *in;
	char *fn_ref = 0;
	int flag = 0, c, clevel = -1, ignore_sam_err = 0;
	char moder[8];
	bam_hdr_t *h;
	bam1_t *b;
	htsFile *out;
	char modew[8];
	int r = 0, exit_code = 0;

	while ((c = getopt(argc, argv, "IbDCSl:t:")) >= 0) {
		switch (c) {
		case 'S': flag |= 1; break;
		case 'b': flag |= 2; break;
		case 'D': flag |= 4; break;
		case 'C': flag |= 8; break;
		case 'l': clevel = atoi(optarg); flag |= 2; break;
		case 't': fn_ref = optarg; break;
		case 'I': ignore_sam_err = 1; break;
		}
	}
	if (argc == optind) {
		fprintf(stderr, "Usage: samview [-bSCSI] [-l level] <in.bam>|<in.sam>|<in.cram> [region]\n");
		return 1;
	}
	strcpy(moder, "r");
	if (flag&4) strcat(moder, "c");
	else if ((flag&1) == 0) strcat(moder, "b");

	in = sam_open(argv[optind], moder);
	h = sam_hdr_read(in);
	h->ignore_sam_err = ignore_sam_err;
	b = bam_init1();

	strcpy(modew, "w");
	if (clevel >= 0 && clevel <= 9) sprintf(modew + 1, "%d", clevel);
	if (flag&8) strcat(modew, "c");
	else if (flag&2) strcat(modew, "b");
	out = hts_open("-", modew);

	/* CRAM output */
	if (flag & 8) {
	    // Parse input header and use for CRAM output
	    out->fp.cram->header = sam_hdr_parse_(h->text, h->l_text);

	    // Create CRAM references arrays
	    if (fn_ref)
		cram_set_option(out->fp.cram, CRAM_OPT_REFERENCE, fn_ref);
	    else
		// Attempt to fill out a cram->refs[] array from @SQ headers
		cram_set_option(out->fp.cram, CRAM_OPT_REFERENCE, NULL);
	}

	sam_hdr_write(out, h);
	if (optind + 1 < argc && !(flag&1)) { // BAM input and has a region
	    int i;
	    hts_idx_t *idx;
	    if ((idx = bam_index_load(argv[optind])) == 0) {
		fprintf(stderr, "[E::%s] fail to load the BAM index\n", __func__);
		return 1;
	    }
	    for (i = optind + 1; i < argc; ++i) {
		hts_itr_t *iter;
		if ((iter = bam_itr_querys(idx, h, argv[i])) == 0) {
		    fprintf(stderr, "[E::%s] fail to parse region '%s'\n", __func__, argv[i]);
		    continue;
		}
		while ((r = bam_itr_next(in, iter, b)) >= 0) {
		    if (sam_write1(out, h, b) < 0) {
			fprintf(stderr, "Error writing output.\n");
			exit_code = 1;
			break;
		    }
		}
		hts_itr_destroy(iter);
	    }
	    hts_idx_destroy(idx);
	} else while ((r = sam_read1(in, h, b)) >= 0) {
		if (sam_write1(out, h, b) < 0) {
			fprintf(stderr, "Error writing output.\n");
			exit_code = 1;
			break;
		}
	}
	sam_close(out);

	if (r < -1) {
	    fprintf(stderr, "Error parsing input.\n");
	    exit_code = 1;
	}

	bam_destroy1(b);
	bam_hdr_destroy(h);
	sam_close(in);
	return exit_code;
}
Beispiel #6
0
/*
 * Reads a version 3 CRAM file and replaces the header in-place,
 * provided the header is small enough to fit without growing the
 * entire file.
 *
 * Version 3 format has a SAM header held as an (optionally)
 * compressed block within the header container.  Additional
 * uncompressed blocks or simply unallocated space (the difference
 * between total block sizes and the container size) are used to
 * provide room for growth or contraction of the compressed header.
 *
 * Returns 0 on success;
 *        -1 on general failure;
 *        -2 on failure due to insufficient size
 */
int cram_reheader_inplace3(cram_fd *fd, const bam_hdr_t *h, const char *arg_list,
                          int add_PG)
{
    cram_container *c = NULL;
    cram_block *b = NULL;
    SAM_hdr *hdr = NULL;
    off_t start, sz, end;
    int container_sz, max_container_sz;
    char *buf = NULL;
    int ret = -1;

    if (cram_major_vers(fd) < 2 ||
        cram_major_vers(fd) > 3) {
        fprintf(stderr, "[%s] unsupported CRAM version %d\n", __func__,
                cram_major_vers(fd));
        goto err;
    }

    if (!(hdr = sam_hdr_parse_(h->text, h->l_text)))
        goto err;

    if (add_PG && sam_hdr_add_PG(hdr, "samtools", "VN", samtools_version(),
                                 arg_list ? "CL": NULL,
                                 arg_list ? arg_list : NULL,
                                 NULL))
        goto err;

    int header_len = sam_hdr_length(hdr);
    /* Fix M5 strings? Maybe out of scope for this tool */

    // Find current size of SAM header block
    if ((start = hseek(cram_fd_get_fp(fd), 26, SEEK_SET)) != 26)
        goto err;

    if (!(c = cram_read_container(fd)))
        goto err;

    // +5 allows num_landmarks to increase from 0 to 1 (Cramtools)
    max_container_sz = cram_container_size(c)+5;

    sz = htell(cram_fd_get_fp(fd)) + cram_container_get_length(c) - start;
    end = htell(cram_fd_get_fp(fd)) + cram_container_get_length(c);

    // We force 1 block instead of (optionally) 2.  C CRAM
    // implementations for v3 were writing 1 compressed block followed
    // by 1 uncompressed block.  However this is tricky to deal with
    // as changing block sizes can mean the block header also changes
    // size due to itf8 and variable size integers.
    //
    // If we had 1 block, this doesn't change anything.
    // If we had 2 blocks, the new container header will be smaller by
    // 1+ bytes, requiring the cram_container_get_length(c) to be larger in value.
    // However this is an int32 instead of itf8 so the container
    // header structure stays the same size.  This means we can always
    // reduce the number of blocks without running into size problems.
    cram_container_set_num_blocks(c, 1);
    int32_t *landmark;
    int32_t num_landmarks;
    landmark = cram_container_get_landmarks(c, &num_landmarks);
    if (num_landmarks && landmark) {
        num_landmarks = 1;
        landmark[0] = 0;
    } else {
        num_landmarks = 0;
    }
    cram_container_set_landmarks(c, num_landmarks, landmark);

    buf = malloc(max_container_sz);
    container_sz = max_container_sz;
    if (cram_store_container(fd, c, buf, &container_sz) != 0)
        goto err;

    if (!buf)
        goto err;

    // Proposed new length, but changing cram_container_get_length(c) may change the
    // container_sz and thus the remainder (cram_container_get_length(c) itself).
    cram_container_set_length(c, sz - container_sz);

    int old_container_sz = container_sz;
    container_sz = max_container_sz;
    if (cram_store_container(fd, c, buf, &container_sz) != 0)
        goto err;

    if (old_container_sz != container_sz) {
        fprintf(stderr, "Quirk of fate makes this troublesome! "
                "Please use non-inplace version.\n");
        goto err;
    }



    // Version 3.0 supports compressed header
    b = cram_new_block(FILE_HEADER, 0);
    int32_put_blk(b, header_len);
    cram_block_append(b, sam_hdr_str(hdr), header_len);
    cram_block_update_size(b);

    cram_compress_block(fd, b, NULL, -1, -1);

    if (hseek(cram_fd_get_fp(fd), 26, SEEK_SET) != 26)
        goto err;

    if (cram_block_size(b) > cram_container_get_length(c)) {
        fprintf(stderr, "New header will not fit. Use non-inplace version"
                " (%d > %d)\n",
                (int)cram_block_size(b), cram_container_get_length(c));
        ret = -2;
        goto err;
    }

    if (cram_write_container(fd, c) == -1)
        goto err;

    if (cram_write_block(fd, b) == -1)
        goto err;

    // Blank out the remainder
    int rsz = end - htell(cram_fd_get_fp(fd));
    assert(rsz >= 0);
    if (rsz) {
        char *rem = calloc(1, rsz);
        ret = hwrite(cram_fd_get_fp(fd), rem, rsz) == rsz ? 0 : -1;
        free(rem);
    }

 err:
    if (c) cram_free_container(c);
    if (buf) free(buf);
    if (b) cram_free_block(b);
    if (hdr) sam_hdr_free(hdr);

    return ret;
}
Beispiel #7
0
/*
 * Reads a version 2 CRAM file and replaces the header in-place,
 * provided the header is small enough to fit without growing the
 * entire file.
 *
 * Version 2 format has an uncompressed SAM header with multiple nul
 * termination bytes to permit inline header editing.
 *
 * Returns 0 on success;
 *        -1 on general failure;
 *        -2 on failure due to insufficient size
 */
int cram_reheader_inplace2(cram_fd *fd, const bam_hdr_t *h, const char *arg_list,
                          int add_PG)
{
    cram_container *c = NULL;
    cram_block *b = NULL;
    SAM_hdr *hdr = NULL;
    off_t start;
    int ret = -1;

    if (cram_major_vers(fd) < 2 ||
        cram_major_vers(fd) > 3) {
        fprintf(stderr, "[%s] unsupported CRAM version %d\n", __func__,
                cram_major_vers(fd));
        goto err;
    }

    if (!(hdr = sam_hdr_parse_(h->text, h->l_text)))
        goto err;

    if (add_PG && sam_hdr_add_PG(hdr, "samtools", "VN", samtools_version(),
                                 arg_list ? "CL": NULL,
                                 arg_list ? arg_list : NULL,
                                 NULL))
        goto err;

    int header_len = sam_hdr_length(hdr);
    /* Fix M5 strings? Maybe out of scope for this tool */

    // Load the existing header
    if ((start = hseek(cram_fd_get_fp(fd), 26, SEEK_SET)) != 26)
        goto err;

    if (!(c = cram_read_container(fd)))
        goto err;

    // Version 2.1 has a single uncompressed block which is nul
    // terminated with many nuls to permit growth.
    //
    // So load old block and keep all contents identical bar the
    // header text itself
    if (!(b = cram_read_block(fd)))
        goto err;

    if (cram_block_get_uncomp_size(b) < header_len+4) {
        fprintf(stderr, "New header will not fit. Use non-inplace version (%d > %d)\n",
                header_len+4, cram_block_get_uncomp_size(b));
        ret = -2;
        goto err;
    }

    cram_block_set_offset(b, 0);   // rewind block
    int32_put_blk(b, header_len);
    cram_block_append(b, sam_hdr_str(hdr), header_len);
    // Zero the remaining block
    memset(cram_block_get_data(b)+cram_block_get_offset(b), 0,
           cram_block_get_uncomp_size(b) - cram_block_get_offset(b));
    // Make sure all sizes and byte-offsets are consistent after memset
    cram_block_set_offset(b, cram_block_get_uncomp_size(b));
    cram_block_set_comp_size(b, cram_block_get_uncomp_size(b));

    if (hseek(cram_fd_get_fp(fd), start, SEEK_SET) != start)
        goto err;

    if (cram_write_container(fd, c) == -1)
        goto err;

    if (cram_write_block(fd, b) == -1)
        goto err;

    ret = 0;
 err:
    if (c) cram_free_container(c);
    if (b) cram_free_block(b);
    if (hdr) sam_hdr_free(hdr);

    return ret;
}
Beispiel #8
0
/*
 * CRAM files don't store the RG:Z:ID per read in the aux field.
 * Instead they have a numerical data series (RG) to point each read
 * back to the Nth @RG line in the file.  This means that we may need
 * to edit the RG data series (if the files were produced from
 * "samtools split" for example).
 *
 * The encoding method is stored in the compression header. Typical
 * examples:
 *
 * RG => EXTERNAL {18}           # Block content-id 18 holds RG values
 *                               # as a series of ITF8 encoded values
 *
 * RG => HUFFMAN {1, 255, 255, 255, 255, 255, 1, 0}
 *                               # One RG value #-1.  (No RG)
 *
 * RG => HUFFMAN {1, 0, 1, 0}    # One RG value #0 (always first RG)
 *
 * RG => HUFFMAN {2, 0, 1, 2, 1, 1}
 *                               # Two RG values, #0 and #1, written
 *                               # to the CORE block and possibly
 *                               # mixed with other data series.
 *
 * A single value can (but may not be) implemented as a zero bit
 * huffman code.  In this situation we can change the meta-data in the
 * compression header to renumber an RG value..
 */
int cram_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outcram)
{
    samFile *out;
    cram_fd *out_c;
    int i, vers_maj, vers_min;
    khash_s2i *rg2id = NULL;
    bam_hdr_t *new_h = NULL;

    /* Check consistent versioning and compatible headers */
    if (!(new_h = cram_cat_check_hdr(nfn, fn, h, &rg2id, &vers_maj, &vers_min)))
        return -1;

    /* Open the file with cram_vers */
    char vers[100];
    sprintf(vers, "%d.%d", vers_maj, vers_min);
    out = sam_open(outcram, "wc");
    if (out == 0) {
        fprintf(stderr, "[%s] ERROR: fail to open output file '%s'.\n", __func__, outcram);
        return 1;
    }
    out_c = out->fp.cram;
    cram_set_option(out_c, CRAM_OPT_VERSION, vers);
    //fprintf(stderr, "Creating cram vers %s\n", vers);

    cram_fd_set_header(out_c, sam_hdr_parse_(new_h->text,  new_h->l_text)); // needed?
    sam_hdr_write(out, new_h);

    for (i = 0; i < nfn; ++i) {
        samFile *in;
        cram_fd *in_c;
        cram_container *c;
        bam_hdr_t *old;
        int new_rg = -1;

        in = sam_open(fn[i], "rc");
        if (in == 0) {
            fprintf(stderr, "[%s] ERROR: fail to open file '%s'.\n", __func__, fn[i]);
            return -1;
        }
        in_c = in->fp.cram;

        old = sam_hdr_read(in);
        khash_s2i *rg2id_in = hash_rg(old);

        // Compute RG mapping if suitable for changing.
        if (rg2id_in->n_id == 1) {
            int _;
            new_rg = hash_s2i_inc(rg2id, rg2id_in->id[0], NULL, &_);
        } else {
            new_rg = 0;
        }

        hash_s2i_free(rg2id_in);


        // Copy contains and blocks within them
        while ((c = cram_read_container(in_c))) {
            cram_block *blk;

           if (cram_container_is_empty(in_c)) {
                if (cram_write_container(out_c, c) != 0)
                    return -1;

                // Container compression header
                if (!(blk = cram_read_block(in_c)))
                    return -1;
                if (cram_write_block(out_c, blk) != 0) {
                    cram_free_block(blk);
                    return -1;
                }
                cram_free_block(blk);
                cram_free_container(c);

                continue;
            }

            // If we have just one RG key and new_rg != 0 then
            // we need to edit the compression header. IF WE CAN.
            if (new_rg) {
                int zero = 0;
                //fprintf(stderr, "Transcode RG %d to %d\n", 0, new_rg);
                cram_transcode_rg(in_c, out_c, c, 1, &zero, &new_rg);
            } else {
                int32_t num_slices;

                // Not switching rg so do the usual read/write loop
                if (cram_write_container(out_c, c) != 0)
                    return -1;

                // Container compression header
                if (!(blk = cram_read_block(in_c)))
                    return -1;
                if (cram_write_block(out_c, blk) != 0) {
                    cram_free_block(blk);
                    return -1;
                }
                cram_free_block(blk);


                // Container num_blocks can be invalid, due to a bug.
                // Instead we iterate in slice context instead.
                (void)cram_container_get_landmarks(c, &num_slices);
                cram_copy_slice(in_c, out_c, num_slices);
            }

            cram_free_container(c);
        }

        bam_hdr_destroy(old);
        sam_close(in);
    }
    sam_close(out);

    hash_s2i_free(rg2id);
    bam_hdr_destroy(new_h);

    return 0;
}