コード例 #1
0
ファイル: bam_reheader.c プロジェクト: BIGLabHYU/samtools
/*
 * Reads a file and outputs a new CRAM file to stdout with 'h'
 * replaced as the header.  No checks are made to the validity.
 *
 * FIXME: error checking
 */
int cram_reheader(cram_fd *in, bam_hdr_t *h, const char *arg_list, int add_PG)
{
    htsFile *h_out = hts_open("-", "wc");
    cram_fd *out = h_out->fp.cram;
    cram_container *c = NULL;
    int ret = -1;

    // Attempt to fill out a cram->refs[] array from @SQ headers
    cram_fd_set_header(out, sam_hdr_parse_(h->text, h->l_text));
    if (add_PG) {
        if (sam_hdr_add_PG(cram_fd_get_header(out), "samtools",
                           "VN", samtools_version(),
                           arg_list ? "CL": NULL,
                           arg_list ? arg_list : NULL,
                           NULL) != 0)
            goto err;

        // Covert back to bam_hdr_t struct
        free(h->text);
        h->text = strdup(sam_hdr_str(cram_fd_get_header(out)));
        h->l_text = sam_hdr_length(cram_fd_get_header(out));
        if (!h->text)
            goto err;
    }

    if (sam_hdr_write(h_out, h) != 0)
        goto err;
    cram_set_option(out, CRAM_OPT_REFERENCE, NULL);

    while ((c = cram_read_container(in))) {
        int32_t i, num_blocks = cram_container_get_num_blocks(c);
        if (cram_write_container(out, c) != 0)
            goto err;

        for (i = 0; i < num_blocks; i++) {
            cram_block *blk = cram_read_block(in);
            if (!blk || cram_write_block(out, blk) != 0) {
                if (blk) cram_free_block(blk);
                goto err;
            }
            cram_free_block(blk);
        }
        cram_free_container(c);
    }

    ret = 0;

 err:
    if (hts_close(h_out) != 0)
        ret = -1;

    return ret;
}
コード例 #2
0
ファイル: cram_index.c プロジェクト: svn2github/staden-io_lib
/*
 * Skips to a container overlapping the start coordinate listed in
 * cram_range.
 *
 * In theory we call cram_index_query multiple times, once per slice
 * overlapping the range. However slices may be absent from the index
 * which makes this problematic. Instead we find the left-most slice
 * and then read from then on, skipping decoding of slices and/or
 * whole containers when they don't overlap the specified cram_range.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int cram_seek_to_refpos(cram_fd *fd, cram_range *r) {
    cram_index *e;

    // Ideally use an index, so see if we have one.
    if ((e = cram_index_query(fd, r->refid, r->start, NULL))) {
	if (0 != cram_seek(fd, e->offset, SEEK_SET))
	    if (0 != cram_seek(fd, e->offset - fd->first_container, SEEK_CUR))
		return -1;
    } else {
	fprintf(stderr, "Unknown reference ID. Missing from index?\n");
	return -1;
    }

    if (fd->ctr) {
	cram_free_container(fd->ctr);
	fd->ctr = NULL;
    }

    return 0;
}
コード例 #3
0
ファイル: cram_index.c プロジェクト: mtmorgan/Rhtslib
/*
 * Builds an index file.
 *
 * fd is a newly opened cram file that we wish to index.
 * fn_base is the filename of the associated CRAM file. Internally we
 * add ".crai" to this to get the index filename.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int cram_index_build(cram_fd *fd, const char *fn_base) {
    cram_container *c;
    off_t cpos, spos, hpos;
    zfp *fp;
    char fn_idx[PATH_MAX];

    if (strlen(fn_base) > PATH_MAX-6)
	return -1;

    sprintf(fn_idx, "%s.crai", fn_base);
    if (!(fp = zfopen(fn_idx, "wz"))) {
        perror(fn_idx);
        return -1;
    }

    cpos = htell(fd->fp);
    while ((c = cram_read_container(fd))) {
        int j;

        if (fd->err) {
            perror("Cram container read");
            return 1;
        }

        hpos = htell(fd->fp);

        if (!(c->comp_hdr_block = cram_read_block(fd)))
            return 1;
        assert(c->comp_hdr_block->content_type == COMPRESSION_HEADER);

        c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block);
        if (!c->comp_hdr)
            return -1;

        // 2.0 format
        for (j = 0; j < c->num_landmarks; j++) {
            char buf[1024];
            cram_slice *s;
            int sz;

            spos = htell(fd->fp);
            assert(spos - cpos - c->offset == c->landmark[j]);

            if (!(s = cram_read_slice(fd))) {
		zfclose(fp);
		return -1;
	    }

            sz = (int)(htell(fd->fp) - spos);

	    if (s->hdr->ref_seq_id == -2) {
		cram_index_build_multiref(fd, c, s, fp,
					  cpos, c->landmark[j], sz);
	    } else {
		sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n",
			s->hdr->ref_seq_id, s->hdr->ref_seq_start,
			s->hdr->ref_seq_span, (int64_t)cpos,
			c->landmark[j], sz);
		zfputs(buf, fp);
	    }

            cram_free_slice(s);
        }

        cpos = htell(fd->fp);
        assert(cpos == hpos + c->length);

        cram_free_container(c);
    }
    if (fd->err) {
	zfclose(fp);
	return -1;
    }
	

    return zfclose(fp);
}
コード例 #4
0
ファイル: bam_reheader.c プロジェクト: BIGLabHYU/samtools
/*
 * Reads a version 3 CRAM file and replaces the header in-place,
 * provided the header is small enough to fit without growing the
 * entire file.
 *
 * Version 3 format has a SAM header held as an (optionally)
 * compressed block within the header container.  Additional
 * uncompressed blocks or simply unallocated space (the difference
 * between total block sizes and the container size) are used to
 * provide room for growth or contraction of the compressed header.
 *
 * Returns 0 on success;
 *        -1 on general failure;
 *        -2 on failure due to insufficient size
 */
int cram_reheader_inplace3(cram_fd *fd, const bam_hdr_t *h, const char *arg_list,
                          int add_PG)
{
    cram_container *c = NULL;
    cram_block *b = NULL;
    SAM_hdr *hdr = NULL;
    off_t start, sz, end;
    int container_sz, max_container_sz;
    char *buf = NULL;
    int ret = -1;

    if (cram_major_vers(fd) < 2 ||
        cram_major_vers(fd) > 3) {
        fprintf(stderr, "[%s] unsupported CRAM version %d\n", __func__,
                cram_major_vers(fd));
        goto err;
    }

    if (!(hdr = sam_hdr_parse_(h->text, h->l_text)))
        goto err;

    if (add_PG && sam_hdr_add_PG(hdr, "samtools", "VN", samtools_version(),
                                 arg_list ? "CL": NULL,
                                 arg_list ? arg_list : NULL,
                                 NULL))
        goto err;

    int header_len = sam_hdr_length(hdr);
    /* Fix M5 strings? Maybe out of scope for this tool */

    // Find current size of SAM header block
    if ((start = hseek(cram_fd_get_fp(fd), 26, SEEK_SET)) != 26)
        goto err;

    if (!(c = cram_read_container(fd)))
        goto err;

    // +5 allows num_landmarks to increase from 0 to 1 (Cramtools)
    max_container_sz = cram_container_size(c)+5;

    sz = htell(cram_fd_get_fp(fd)) + cram_container_get_length(c) - start;
    end = htell(cram_fd_get_fp(fd)) + cram_container_get_length(c);

    // We force 1 block instead of (optionally) 2.  C CRAM
    // implementations for v3 were writing 1 compressed block followed
    // by 1 uncompressed block.  However this is tricky to deal with
    // as changing block sizes can mean the block header also changes
    // size due to itf8 and variable size integers.
    //
    // If we had 1 block, this doesn't change anything.
    // If we had 2 blocks, the new container header will be smaller by
    // 1+ bytes, requiring the cram_container_get_length(c) to be larger in value.
    // However this is an int32 instead of itf8 so the container
    // header structure stays the same size.  This means we can always
    // reduce the number of blocks without running into size problems.
    cram_container_set_num_blocks(c, 1);
    int32_t *landmark;
    int32_t num_landmarks;
    landmark = cram_container_get_landmarks(c, &num_landmarks);
    if (num_landmarks && landmark) {
        num_landmarks = 1;
        landmark[0] = 0;
    } else {
        num_landmarks = 0;
    }
    cram_container_set_landmarks(c, num_landmarks, landmark);

    buf = malloc(max_container_sz);
    container_sz = max_container_sz;
    if (cram_store_container(fd, c, buf, &container_sz) != 0)
        goto err;

    if (!buf)
        goto err;

    // Proposed new length, but changing cram_container_get_length(c) may change the
    // container_sz and thus the remainder (cram_container_get_length(c) itself).
    cram_container_set_length(c, sz - container_sz);

    int old_container_sz = container_sz;
    container_sz = max_container_sz;
    if (cram_store_container(fd, c, buf, &container_sz) != 0)
        goto err;

    if (old_container_sz != container_sz) {
        fprintf(stderr, "Quirk of fate makes this troublesome! "
                "Please use non-inplace version.\n");
        goto err;
    }



    // Version 3.0 supports compressed header
    b = cram_new_block(FILE_HEADER, 0);
    int32_put_blk(b, header_len);
    cram_block_append(b, sam_hdr_str(hdr), header_len);
    cram_block_update_size(b);

    cram_compress_block(fd, b, NULL, -1, -1);

    if (hseek(cram_fd_get_fp(fd), 26, SEEK_SET) != 26)
        goto err;

    if (cram_block_size(b) > cram_container_get_length(c)) {
        fprintf(stderr, "New header will not fit. Use non-inplace version"
                " (%d > %d)\n",
                (int)cram_block_size(b), cram_container_get_length(c));
        ret = -2;
        goto err;
    }

    if (cram_write_container(fd, c) == -1)
        goto err;

    if (cram_write_block(fd, b) == -1)
        goto err;

    // Blank out the remainder
    int rsz = end - htell(cram_fd_get_fp(fd));
    assert(rsz >= 0);
    if (rsz) {
        char *rem = calloc(1, rsz);
        ret = hwrite(cram_fd_get_fp(fd), rem, rsz) == rsz ? 0 : -1;
        free(rem);
    }

 err:
    if (c) cram_free_container(c);
    if (buf) free(buf);
    if (b) cram_free_block(b);
    if (hdr) sam_hdr_free(hdr);

    return ret;
}
コード例 #5
0
ファイル: bam_reheader.c プロジェクト: BIGLabHYU/samtools
/*
 * Reads a version 2 CRAM file and replaces the header in-place,
 * provided the header is small enough to fit without growing the
 * entire file.
 *
 * Version 2 format has an uncompressed SAM header with multiple nul
 * termination bytes to permit inline header editing.
 *
 * Returns 0 on success;
 *        -1 on general failure;
 *        -2 on failure due to insufficient size
 */
int cram_reheader_inplace2(cram_fd *fd, const bam_hdr_t *h, const char *arg_list,
                          int add_PG)
{
    cram_container *c = NULL;
    cram_block *b = NULL;
    SAM_hdr *hdr = NULL;
    off_t start;
    int ret = -1;

    if (cram_major_vers(fd) < 2 ||
        cram_major_vers(fd) > 3) {
        fprintf(stderr, "[%s] unsupported CRAM version %d\n", __func__,
                cram_major_vers(fd));
        goto err;
    }

    if (!(hdr = sam_hdr_parse_(h->text, h->l_text)))
        goto err;

    if (add_PG && sam_hdr_add_PG(hdr, "samtools", "VN", samtools_version(),
                                 arg_list ? "CL": NULL,
                                 arg_list ? arg_list : NULL,
                                 NULL))
        goto err;

    int header_len = sam_hdr_length(hdr);
    /* Fix M5 strings? Maybe out of scope for this tool */

    // Load the existing header
    if ((start = hseek(cram_fd_get_fp(fd), 26, SEEK_SET)) != 26)
        goto err;

    if (!(c = cram_read_container(fd)))
        goto err;

    // Version 2.1 has a single uncompressed block which is nul
    // terminated with many nuls to permit growth.
    //
    // So load old block and keep all contents identical bar the
    // header text itself
    if (!(b = cram_read_block(fd)))
        goto err;

    if (cram_block_get_uncomp_size(b) < header_len+4) {
        fprintf(stderr, "New header will not fit. Use non-inplace version (%d > %d)\n",
                header_len+4, cram_block_get_uncomp_size(b));
        ret = -2;
        goto err;
    }

    cram_block_set_offset(b, 0);   // rewind block
    int32_put_blk(b, header_len);
    cram_block_append(b, sam_hdr_str(hdr), header_len);
    // Zero the remaining block
    memset(cram_block_get_data(b)+cram_block_get_offset(b), 0,
           cram_block_get_uncomp_size(b) - cram_block_get_offset(b));
    // Make sure all sizes and byte-offsets are consistent after memset
    cram_block_set_offset(b, cram_block_get_uncomp_size(b));
    cram_block_set_comp_size(b, cram_block_get_uncomp_size(b));

    if (hseek(cram_fd_get_fp(fd), start, SEEK_SET) != start)
        goto err;

    if (cram_write_container(fd, c) == -1)
        goto err;

    if (cram_write_block(fd, b) == -1)
        goto err;

    ret = 0;
 err:
    if (c) cram_free_container(c);
    if (b) cram_free_block(b);
    if (hdr) sam_hdr_free(hdr);

    return ret;
}
コード例 #6
0
ファイル: bam_cat.c プロジェクト: dozy/samtools
/*
 * CRAM files don't store the RG:Z:ID per read in the aux field.
 * Instead they have a numerical data series (RG) to point each read
 * back to the Nth @RG line in the file.  This means that we may need
 * to edit the RG data series (if the files were produced from
 * "samtools split" for example).
 *
 * The encoding method is stored in the compression header. Typical
 * examples:
 *
 * RG => EXTERNAL {18}           # Block content-id 18 holds RG values
 *                               # as a series of ITF8 encoded values
 *
 * RG => HUFFMAN {1, 255, 255, 255, 255, 255, 1, 0}
 *                               # One RG value #-1.  (No RG)
 *
 * RG => HUFFMAN {1, 0, 1, 0}    # One RG value #0 (always first RG)
 *
 * RG => HUFFMAN {2, 0, 1, 2, 1, 1}
 *                               # Two RG values, #0 and #1, written
 *                               # to the CORE block and possibly
 *                               # mixed with other data series.
 *
 * A single value can (but may not be) implemented as a zero bit
 * huffman code.  In this situation we can change the meta-data in the
 * compression header to renumber an RG value..
 */
int cram_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outcram)
{
    samFile *out;
    cram_fd *out_c;
    int i, vers_maj, vers_min;
    khash_s2i *rg2id = NULL;
    bam_hdr_t *new_h = NULL;

    /* Check consistent versioning and compatible headers */
    if (!(new_h = cram_cat_check_hdr(nfn, fn, h, &rg2id, &vers_maj, &vers_min)))
        return -1;

    /* Open the file with cram_vers */
    char vers[100];
    sprintf(vers, "%d.%d", vers_maj, vers_min);
    out = sam_open(outcram, "wc");
    if (out == 0) {
        fprintf(stderr, "[%s] ERROR: fail to open output file '%s'.\n", __func__, outcram);
        return 1;
    }
    out_c = out->fp.cram;
    cram_set_option(out_c, CRAM_OPT_VERSION, vers);
    //fprintf(stderr, "Creating cram vers %s\n", vers);

    cram_fd_set_header(out_c, sam_hdr_parse_(new_h->text,  new_h->l_text)); // needed?
    sam_hdr_write(out, new_h);

    for (i = 0; i < nfn; ++i) {
        samFile *in;
        cram_fd *in_c;
        cram_container *c;
        bam_hdr_t *old;
        int new_rg = -1;

        in = sam_open(fn[i], "rc");
        if (in == 0) {
            fprintf(stderr, "[%s] ERROR: fail to open file '%s'.\n", __func__, fn[i]);
            return -1;
        }
        in_c = in->fp.cram;

        old = sam_hdr_read(in);
        khash_s2i *rg2id_in = hash_rg(old);

        // Compute RG mapping if suitable for changing.
        if (rg2id_in->n_id == 1) {
            int _;
            new_rg = hash_s2i_inc(rg2id, rg2id_in->id[0], NULL, &_);
        } else {
            new_rg = 0;
        }

        hash_s2i_free(rg2id_in);


        // Copy contains and blocks within them
        while ((c = cram_read_container(in_c))) {
            cram_block *blk;

           if (cram_container_is_empty(in_c)) {
                if (cram_write_container(out_c, c) != 0)
                    return -1;

                // Container compression header
                if (!(blk = cram_read_block(in_c)))
                    return -1;
                if (cram_write_block(out_c, blk) != 0) {
                    cram_free_block(blk);
                    return -1;
                }
                cram_free_block(blk);
                cram_free_container(c);

                continue;
            }

            // If we have just one RG key and new_rg != 0 then
            // we need to edit the compression header. IF WE CAN.
            if (new_rg) {
                int zero = 0;
                //fprintf(stderr, "Transcode RG %d to %d\n", 0, new_rg);
                cram_transcode_rg(in_c, out_c, c, 1, &zero, &new_rg);
            } else {
                int32_t num_slices;

                // Not switching rg so do the usual read/write loop
                if (cram_write_container(out_c, c) != 0)
                    return -1;

                // Container compression header
                if (!(blk = cram_read_block(in_c)))
                    return -1;
                if (cram_write_block(out_c, blk) != 0) {
                    cram_free_block(blk);
                    return -1;
                }
                cram_free_block(blk);


                // Container num_blocks can be invalid, due to a bug.
                // Instead we iterate in slice context instead.
                (void)cram_container_get_landmarks(c, &num_slices);
                cram_copy_slice(in_c, out_c, num_slices);
            }

            cram_free_container(c);
        }

        bam_hdr_destroy(old);
        sam_close(in);
    }
    sam_close(out);

    hash_s2i_free(rg2id);
    bam_hdr_destroy(new_h);

    return 0;
}