Ejemplo n.º 1
0
/*
 * Renumbers RG numbers in a cram compression header.
 *
 * CRAM stores RG as the Nth number in the header, rather than a
 * string holding the ID: tag.  This is smaller in space, but means
 * "samtools cat" to join files together that contain single but
 * different RG lines needs a way of renumbering them.
 *
 * The file descriptor is expected to be immediately after the
 * cram_container structure (ie before the cram compression header).
 * Due to the nature of the CRAM format, this needs to read and write
 * the blocks itself.  Note that there may be multiple slices within
 * the container, meaning multiple compression headers to manipulate.
 * Changing RG may change the size of the compression header and
 * therefore the length field in the container.  Hence we rewrite all
 * blocks just in case and also emit the adjusted container.
 *
 * The current implementation can only cope with renumbering a single
 * RG (and only then if it is using HUFFMAN or BETA codecs).  In
 * theory it *may* be possible to renumber multiple RGs if they use
 * HUFFMAN to the CORE block or use an external block unshared by any
 * other data series.  So we have an API that can be upgraded to
 * support this, but do not implement it for now.  An example
 * implementation of RG as an EXTERNAL block would be to find that
 * block and rewrite it, returning the number of blocks consumed.
 *
 * Parameters:
 *   in, out  input and output CRAM file descriptors
 *   c        the container whose compression header is being edited;
 *            its landmarks and length are adjusted in place
 *   nrg      number of RG entries to renumber (only 1 is supported)
 *   in_rg    currently unused (reserved for multi-RG support)
 *   out_rg   out_rg[0] is the new RG number to store
 *
 * All blocks allocated here are released on every exit path.
 *
 * Returns 0 on success;
 *        -1 if unable to edit;
 *        -2 on other errors (eg I/O, a malformed compression header,
 *           or nrg != 1).
 */
int cram_transcode_rg(cram_fd *in, cram_fd *out,
                      cram_container *c,
                      int nrg, int *in_rg, int *out_rg) {
    int new_rg, old_size, new_size;
    cram_block *o_blk = NULL, *n_blk = NULL;
    cram_block_compression_hdr *ch = NULL;
    char *cp, *op, *endp;
    int32_t i32;
    int32_t *landmarks, num_landmarks;
    int n, k;
    int ret = -2;

    if (nrg != 1) {
        hts_log_error("CRAM transcode supports only a single RG");
        return -2;
    }

    // Only dereference out_rg once we know exactly one entry is expected.
    new_rg = *out_rg;

    // Produce a new block holding the updated compression header,
    // with RG transcoded to a new value. (Single only supported.)
    if (!(o_blk = cram_read_block(in)))
        goto err;
    old_size = cram_block_size(o_blk);

    if (!(ch = cram_decode_compression_header(in, o_blk)))
        goto err;

    if (cram_block_compression_hdr_set_rg(ch, new_rg) != 0) {
        ret = -1;
        goto err;
    }
    if (cram_block_compression_hdr_decoder2encoder(in, ch) != 0) {
        ret = -1;
        goto err;
    }

    n_blk = cram_encode_compression_header(in, c, ch);
    cram_free_compression_header(ch);
    ch = NULL;
    if (!n_blk)
        goto err;

    /*
     * Warning: this has internal knowledge of the cram compression
     * header format.
     *
     * The decoder doesn't set c->tags_used, so the encoder puts a two
     * byte blank segment.  This means n_blk is too short.  We skip
     * through the decoded old block (o_blk) and copy from there.
     */
    cp = cram_block_get_data(o_blk);
    if (!cp)
        goto err;
    endp = cp + cram_block_get_uncomp_size(o_blk);

    // Step over the first two length-prefixed maps, refusing any
    // length that would take us outside the old block.
    for (k = 0; k < 2; k++) {
        n = safe_itf8_get(cp, endp, &i32);
        if (n <= 0 || i32 < 0 || i32 > endp - (cp + n))
            goto err;
        cp += n + i32;
    }

    // The third map is copied verbatim, including its length prefix.
    op = cp;
    n = safe_itf8_get(cp, endp, &i32);
    if (n <= 0 || i32 < 0 || i32 > endp - (cp + n))
        goto err;
    i32 += n;

    // Drop the encoder's two byte blank segment and splice in the
    // original bytes instead.
    cram_block_set_size(n_blk, cram_block_get_size(n_blk)-2);
    cram_block_append(n_blk, op, i32);
    cram_block_update_size(n_blk);

    new_size = cram_block_size(n_blk);

    // Now we've constructed the updated compression header,
    // amend the container too (it may have changed size).
    landmarks = cram_container_get_landmarks(c, &num_landmarks);

    if (old_size != new_size) {
        int diff = new_size - old_size, j;

        // The landmarks array is adjusted in place.
        for (j = 0; j < num_landmarks; j++)
            landmarks[j] += diff;
        cram_container_set_length(c, cram_container_get_length(c) + diff);
    }

    // Finally write it all out; container, compression header,
    // and then all the remaining slice blocks.
    if (cram_write_container(out, c) != 0)
        goto err;

    if (cram_write_block(out, n_blk) != 0)
        goto err;

    cram_free_block(o_blk);
    cram_free_block(n_blk);

    // Container num_blocks can be invalid, due to a bug.
    // Instead we iterate in slice context instead.
    return cram_copy_slice(in, out, num_landmarks);

 err:
    if (ch) cram_free_compression_header(ch);
    if (n_blk) cram_free_block(n_blk);
    if (o_blk) cram_free_block(o_blk);

    return ret;
}
Ejemplo n.º 2
0
/*
 * Reads a version 3 CRAM file and replaces the header in-place,
 * provided the header is small enough to fit without growing the
 * entire file.  (The version check below admits major versions 2
 * and 3.)
 *
 * Version 3 format has a SAM header held as an (optionally)
 * compressed block within the header container.  Additional
 * uncompressed blocks or simply unallocated space (the difference
 * between total block sizes and the container size) are used to
 * provide room for growth or contraction of the compressed header.
 *
 * Parameters:
 *   fd        CRAM file descriptor, rewritten in place
 *   h         header whose text replaces the existing SAM header
 *   arg_list  command line recorded in the @PG "CL" field; may be NULL
 *   add_PG    if non-zero, a "samtools" @PG line is added first
 *
 * Returns 0 on success;
 *        -1 on general failure;
 *        -2 on failure due to insufficient size
 */
int cram_reheader_inplace3(cram_fd *fd, const bam_hdr_t *h, const char *arg_list,
                          int add_PG)
{
    cram_container *c = NULL;
    cram_block *b = NULL;
    SAM_hdr *hdr = NULL;
    off_t start, sz, end;
    int container_sz, max_container_sz;
    char *buf = NULL;
    int ret = -1;

    if (cram_major_vers(fd) < 2 ||
        cram_major_vers(fd) > 3) {
        fprintf(stderr, "[%s] unsupported CRAM version %d\n", __func__,
                cram_major_vers(fd));
        goto err;
    }

    if (!(hdr = sam_hdr_parse_(h->text, h->l_text)))
        goto err;

    if (add_PG && sam_hdr_add_PG(hdr, "samtools", "VN", samtools_version(),
                                 arg_list ? "CL": NULL,
                                 arg_list ? arg_list : NULL,
                                 NULL))
        goto err;

    int header_len = sam_hdr_length(hdr);
    /* Fix M5 strings? Maybe out of scope for this tool */

    // Find current size of SAM header block
    if ((start = hseek(cram_fd_get_fp(fd), 26, SEEK_SET)) != 26)
        goto err;

    if (!(c = cram_read_container(fd)))
        goto err;

    // +5 allows num_landmarks to increase from 0 to 1 (Cramtools)
    max_container_sz = cram_container_size(c)+5;

    // sz:  bytes from the start of the container header to the end of
    //      the container's data.
    // end: file offset one past the container's data.
    sz = htell(cram_fd_get_fp(fd)) + cram_container_get_length(c) - start;
    end = htell(cram_fd_get_fp(fd)) + cram_container_get_length(c);

    // We force 1 block instead of (optionally) 2.  C CRAM
    // implementations for v3 were writing 1 compressed block followed
    // by 1 uncompressed block.  However this is tricky to deal with
    // as changing block sizes can mean the block header also changes
    // size due to itf8 and variable size integers.
    //
    // If we had 1 block, this doesn't change anything.
    // If we had 2 blocks, the new container header will be smaller by
    // 1+ bytes, requiring the cram_container_get_length(c) to be larger in value.
    // However this is an int32 instead of itf8 so the container
    // header structure stays the same size.  This means we can always
    // reduce the number of blocks without running into size problems.
    cram_container_set_num_blocks(c, 1);
    int32_t *landmark;
    int32_t num_landmarks;
    landmark = cram_container_get_landmarks(c, &num_landmarks);
    if (num_landmarks && landmark) {
        num_landmarks = 1;
        landmark[0] = 0;
    } else {
        num_landmarks = 0;
    }
    cram_container_set_landmarks(c, num_landmarks, landmark);

    // Check the allocation before anything writes into it.
    buf = malloc(max_container_sz);
    if (!buf)
        goto err;

    container_sz = max_container_sz;
    if (cram_store_container(fd, c, buf, &container_sz) != 0)
        goto err;

    // Proposed new length, but changing cram_container_get_length(c) may change the
    // container_sz and thus the remainder (cram_container_get_length(c) itself).
    cram_container_set_length(c, sz - container_sz);

    int old_container_sz = container_sz;
    container_sz = max_container_sz;
    if (cram_store_container(fd, c, buf, &container_sz) != 0)
        goto err;

    if (old_container_sz != container_sz) {
        fprintf(stderr, "Quirk of fate makes this troublesome! "
                "Please use non-inplace version.\n");
        goto err;
    }

    // Version 3.0 supports compressed header
    b = cram_new_block(FILE_HEADER, 0);
    if (!b)
        goto err;
    int32_put_blk(b, header_len);
    cram_block_append(b, sam_hdr_str(hdr), header_len);
    cram_block_update_size(b);

    if (cram_compress_block(fd, b, NULL, -1, -1) < 0)
        goto err;

    if (hseek(cram_fd_get_fp(fd), 26, SEEK_SET) != 26)
        goto err;

    // Nothing has been written yet, so bailing out here leaves the
    // file untouched.
    if (cram_block_size(b) > cram_container_get_length(c)) {
        fprintf(stderr, "New header will not fit. Use non-inplace version"
                " (%d > %d)\n",
                (int)cram_block_size(b), cram_container_get_length(c));
        ret = -2;
        goto err;
    }

    if (cram_write_container(fd, c) == -1)
        goto err;

    if (cram_write_block(fd, b) == -1)
        goto err;

    // Blank out the remainder
    int rsz = end - htell(cram_fd_get_fp(fd));
    assert(rsz >= 0);
    if (rsz) {
        char *rem = calloc(1, rsz);
        if (!rem)
            goto err;
        ret = hwrite(cram_fd_get_fp(fd), rem, rsz) == rsz ? 0 : -1;
        free(rem);
    } else {
        // The new header filled the space exactly; no padding needed,
        // and this is still a success.
        ret = 0;
    }

 err:
    if (c) cram_free_container(c);
    free(buf);
    if (b) cram_free_block(b);
    if (hdr) sam_hdr_free(hdr);

    return ret;
}