Exemplo n.º 1
0
/*
 * Builds an index file.
 *
 * fd is a newly opened cram file that we wish to index.
 * fn_base is the filename of the associated CRAM file. Internally we
 * add ".crai" to this to get the index filename.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int cram_index_build(cram_fd *fd, const char *fn_base) {
    cram_container *c;
    off_t cpos, spos, hpos;
    zfp *fp;
    char fn_idx[PATH_MAX];

    if (strlen(fn_base) > PATH_MAX-6)
	return -1;

    sprintf(fn_idx, "%s.crai", fn_base);
    if (!(fp = zfopen(fn_idx, "wz"))) {
        perror(fn_idx);
        return -1;
    }

    cpos = htell(fd->fp);
    while ((c = cram_read_container(fd))) {
        int j;

        if (fd->err) {
            perror("Cram container read");
            return 1;
        }

        hpos = htell(fd->fp);

        if (!(c->comp_hdr_block = cram_read_block(fd)))
            return 1;
        assert(c->comp_hdr_block->content_type == COMPRESSION_HEADER);

        c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block);
        if (!c->comp_hdr)
            return -1;

        // 2.0 format
        for (j = 0; j < c->num_landmarks; j++) {
            char buf[1024];
            cram_slice *s;
            int sz;

            spos = htell(fd->fp);
            assert(spos - cpos - c->offset == c->landmark[j]);

            if (!(s = cram_read_slice(fd))) {
		zfclose(fp);
		return -1;
	    }

            sz = (int)(htell(fd->fp) - spos);

	    if (s->hdr->ref_seq_id == -2) {
		cram_index_build_multiref(fd, c, s, fp,
					  cpos, c->landmark[j], sz);
	    } else {
		sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n",
			s->hdr->ref_seq_id, s->hdr->ref_seq_start,
			s->hdr->ref_seq_span, (int64_t)cpos,
			c->landmark[j], sz);
		zfputs(buf, fp);
	    }

            cram_free_slice(s);
        }

        cpos = htell(fd->fp);
        assert(cpos == hpos + c->length);

        cram_free_container(c);
    }
    if (fd->err) {
	zfclose(fp);
	return -1;
    }
	

    return zfclose(fp);
}
Exemplo n.º 2
0
/*
 * Renumbers RG numbers in a cram compression header.
 *
 * CRAM stores RG as the Nth number in the header, rather than a
 * string holding the ID: tag.  This is smaller in space, but means
 * "samtools cat" to join files together that contain single but
 * different RG lines needs a way of renumbering them.
 *
 * The file descriptor is expected to be immediately after the
 * cram_container structure (ie before the cram compression header).
 * Due to the nature of the CRAM format, this needs to read and write
 * the blocks itself.  Note that there may be multiple slices within
 * the container, meaning multiple compression headers to manipulate.
 * Changing RG may change the size of the compression header and
 * therefore the length field in the container.  Hence we rewrite all
 * blocks just incase and also emit the adjusted container.
 *
 * The current implementation can only cope with renumbering a single
 * RG (and only then if it is using HUFFMAN or BETA codecs).  In
 * theory it *may* be possible to renumber multiple RGs if they use
 * HUFFMAN to the CORE block or use an external block unshared by any
 * other data series.  So we have an API that can be upgraded to
 * support this, but do not implement it for now.  An example
 * implementation of RG as an EXTERNAL block would be to find that
 * block and rewrite it, returning the number of blocks consumed.
 *
 * Returns 0 on success;
 *        -1 if unable to edit;
 *        -2 on other errors (eg I/O).
 */
int cram_transcode_rg(cram_fd *in, cram_fd *out,
                      cram_container *c,
                      int nrg, int *in_rg, int *out_rg) {
    int new_rg = *out_rg, old_size, new_size;
    cram_block *o_blk, *n_blk;
    cram_block_compression_hdr *ch;

    if (nrg != 1) {
        hts_log_error("CRAM transcode supports only a single RG");
        return -2;
    }

    // Produce a new block holding the updated compression header,
    // with RG transcoded to a new value. (Single only supported.)
    o_blk = cram_read_block(in);
    old_size = cram_block_size(o_blk);
    ch = cram_decode_compression_header(in, o_blk);
    if (cram_block_compression_hdr_set_rg(ch, new_rg) != 0)
        return -1;
    cram_block_compression_hdr_decoder2encoder(in, ch);
    n_blk = cram_encode_compression_header(in, c, ch);
    cram_free_compression_header(ch);

    /*
     * Warning: this has internal knowledge of the cram compression
     * header format.
     *
     * The decoder doesn't set c->tags_used, so the encoder puts a two
     * byte blank segment.  This means n_blk is too short.  We skip
     * through the decoded old block (o_blk) and copy from there.
     */
    char *cp = cram_block_get_data(o_blk);
    char *op = cp;
    char *endp = cp + cram_block_get_uncomp_size(o_blk);
    //fprintf(stderr, "sz = %d\n", (int)(endp-cp));
    int32_t i32;

    cp += safe_itf8_get(cp, endp, &i32);
    cp += i32;
    cp += safe_itf8_get(cp, endp, &i32);
    cp += i32;
    op = cp;
    cp += safe_itf8_get(cp, endp, &i32);
    i32 += (cp-op);

    //fprintf(stderr, "remaining %d bytes\n", i32);
    cram_block_set_size(n_blk, cram_block_get_size(n_blk)-2);
    cram_block_append(n_blk, op, i32);
    cram_block_update_size(n_blk);

    new_size = cram_block_size(n_blk);

    //fprintf(stderr, "size %d -> %d\n", old_size, new_size);

    // Now we've constructedthe updated compression header,
    // amend the container too (it may have changed size).
    int32_t *landmarks, num_landmarks;
    landmarks = cram_container_get_landmarks(c, &num_landmarks);

    if (old_size != new_size) {
        int diff = new_size - old_size, j;

        for (j = 0; j < num_landmarks; j++)
            landmarks[j] += diff;
        //cram_container_set_landmarks(c, num_landmarks, landmarks);
        cram_container_set_length(c, cram_container_get_length(c) + diff);
    }

    // Finally write it all out; container, compression header,
    // and then all the remaining slice blocks.
    if (cram_write_container(out, c) != 0)
        return -2;

    cram_write_block(out, n_blk);
    cram_free_block(o_blk);
    cram_free_block(n_blk);

    // Container num_blocks can be invalid, due to a bug.
    // Instead we iterate in slice context instead.
    return cram_copy_slice(in, out, num_landmarks);
}