/*
 * Renumbers RG numbers in a cram compression header.
 *
 * CRAM stores RG as the Nth number in the header, rather than a
 * string holding the ID: tag.  This is smaller in space, but means
 * "samtools cat" to join files together that contain single but
 * different RG lines needs a way of renumbering them.
 *
 * The file descriptor is expected to be immediately after the
 * cram_container structure (ie before the cram compression header).
 * Due to the nature of the CRAM format, this needs to read and write
 * the blocks itself.  Note that there may be multiple slices within
 * the container, meaning multiple compression headers to manipulate.
 * Changing RG may change the size of the compression header and
 * therefore the length field in the container.  Hence we rewrite all
 * blocks just in case and also emit the adjusted container.
 *
 * The current implementation can only cope with renumbering a single
 * RG (and only then if it is using HUFFMAN or BETA codecs).  In
 * theory it *may* be possible to renumber multiple RGs if they use
 * HUFFMAN to the CORE block or use an external block unshared by any
 * other data series.  So we have an API that can be upgraded to
 * support this, but do not implement it for now.  An example
 * implementation of RG as an EXTERNAL block would be to find that
 * block and rewrite it, returning the number of blocks consumed.
 *
 * Parameters:
 *   in      CRAM stream to read the compression header and slices from.
 *   out     CRAM stream the container, new header and slices are written to.
 *   c       Container already read from "in"; its landmarks and length
 *           are adjusted in place when the header changes size.
 *   nrg     Number of RG values to renumber; must be 1.
 *   in_rg   Currently unused (reserved for multi-RG support).
 *   out_rg  Pointer to the replacement RG number.
 *
 * Returns 0 on success;
 *        -1 if unable to edit;
 *        -2 on other errors (eg I/O, malformed header, nrg != 1).
 *
 * All blocks allocated here are released on every return path.
 */
int cram_transcode_rg(cram_fd *in, cram_fd *out,
                      cram_container *c,
                      int nrg, int *in_rg, int *out_rg) {
    cram_block *o_blk = NULL, *n_blk = NULL;
    cram_block_compression_hdr *ch = NULL;
    int32_t *landmarks, num_landmarks;
    int32_t i32;
    int new_rg, old_size, new_size;
    int ret = -2;
    char *cp, *op, *endp;

    (void) in_rg; // Reserved for future multi-RG support.

    if (nrg != 1) {
        hts_log_error("CRAM transcode supports only a single RG");
        return -2;
    }

    // Only dereference out_rg once we know exactly one value is expected.
    if (!out_rg)
        return -2;
    new_rg = *out_rg;

    // Produce a new block holding the updated compression header,
    // with RG transcoded to a new value.  (Single only supported.)
    o_blk = cram_read_block(in);
    if (!o_blk)
        return -2;
    old_size = cram_block_size(o_blk);

    ch = cram_decode_compression_header(in, o_blk);
    if (!ch)
        goto fail;

    if (cram_block_compression_hdr_set_rg(ch, new_rg) != 0) {
        ret = -1; // The RG encoding cannot be edited.
        goto fail;
    }

    cram_block_compression_hdr_decoder2encoder(in, ch);
    n_blk = cram_encode_compression_header(in, c, ch);
    cram_free_compression_header(ch);
    ch = NULL;
    if (!n_blk)
        goto fail;

    /*
     * Warning: this has internal knowledge of the cram compression
     * header format.
     *
     * The decoder doesn't set c->tags_used, so the encoder puts a two
     * byte blank segment.  This means n_blk is too short.  We skip
     * through the decoded old block (o_blk) and copy from there.
     *
     * Every length read from the old block is validated against the
     * end of the block before it is used to advance or copy, so a
     * malformed header cannot make us read out of bounds.
     */
    cp   = cram_block_get_data(o_blk);
    endp = cp + cram_block_get_uncomp_size(o_blk);

    // Skip the first length-prefixed section.
    cp += safe_itf8_get(cp, endp, &i32);
    if (i32 < 0 || i32 > endp - cp)
        goto fail;
    cp += i32;

    // Skip the second length-prefixed section.
    cp += safe_itf8_get(cp, endp, &i32);
    if (i32 < 0 || i32 > endp - cp)
        goto fail;
    cp += i32;

    // The third section is copied verbatim, including its length prefix.
    op = cp;
    cp += safe_itf8_get(cp, endp, &i32);
    if (i32 < 0 || i32 > endp - cp)
        goto fail;
    i32 += (cp - op);

    // Drop the encoder's two byte blank segment before appending.
    if (cram_block_get_size(n_blk) < 2)
        goto fail;
    cram_block_set_size(n_blk, cram_block_get_size(n_blk) - 2);
    cram_block_append(n_blk, op, i32);
    cram_block_update_size(n_blk);

    new_size = cram_block_size(n_blk);

    // Now we've constructed the updated compression header,
    // amend the container too (it may have changed size).
    landmarks = cram_container_get_landmarks(c, &num_landmarks);
    if (old_size != new_size) {
        int diff = new_size - old_size, j;

        // The landmark array returned by the getter is edited in place.
        for (j = 0; j < num_landmarks; j++)
            landmarks[j] += diff;
        cram_container_set_length(c, cram_container_get_length(c) + diff);
    }

    // Finally write it all out; container, compression header,
    // and then all the remaining slice blocks.
    if (cram_write_container(out, c) != 0)
        goto fail;
    if (cram_write_block(out, n_blk) != 0)
        goto fail;

    cram_free_block(o_blk);
    cram_free_block(n_blk);

    // Container num_blocks can be invalid, due to a bug.
    // Instead we iterate in slice context.
    return cram_copy_slice(in, out, num_landmarks);

 fail:
    if (ch)
        cram_free_compression_header(ch);
    if (n_blk)
        cram_free_block(n_blk);
    if (o_blk)
        cram_free_block(o_blk);
    return ret;
}
/*
 * Replaces the SAM header of a version 2 CRAM file in place.
 *
 * The version 2 format stores the SAM header uncompressed and pads it
 * with multiple nul bytes so that it can be edited without moving the
 * rest of the file.  The new header text (optionally with an added
 * @PG line for samtools) must therefore fit in the space occupied by
 * the existing header block; the file is never grown.
 *
 * fd        CRAM file to edit; major version must be 2 or 3.
 * h         Header whose text replaces the existing one.
 * arg_list  Command line recorded as the CL field of the @PG line,
 *           or NULL to omit CL.
 * add_PG    Non-zero to append a samtools @PG line to the new header.
 *
 * Returns 0 on success;
 *        -1 on general failure;
 *        -2 on failure due to insufficient size
 */
int cram_reheader_inplace2(cram_fd *fd, const bam_hdr_t *h, const char *arg_list,
                          int add_PG)
{
    cram_container *container = NULL;
    cram_block *blk = NULL;
    SAM_hdr *sam = NULL;
    off_t start;
    int header_len;
    int status = -1;

    if (cram_major_vers(fd) < 2 ||
        cram_major_vers(fd) > 3) {
        fprintf(stderr, "[%s] unsupported CRAM version %d\n", __func__,
                cram_major_vers(fd));
        goto cleanup;
    }

    sam = sam_hdr_parse_(h->text, h->l_text);
    if (!sam)
        goto cleanup;

    if (add_PG) {
        // CL is only emitted when a command line was supplied.
        const char *cl_key = arg_list ? "CL" : NULL;

        if (sam_hdr_add_PG(sam, "samtools", "VN", samtools_version(),
                           cl_key, arg_list, NULL))
            goto cleanup;
    }

    header_len = sam_hdr_length(sam);
    /* Fix M5 strings? Maybe out of scope for this tool */

    // Load the existing header; the container is read from file offset 26.
    start = hseek(cram_fd_get_fp(fd), 26, SEEK_SET);
    if (start != 26)
        goto cleanup;

    container = cram_read_container(fd);
    if (!container)
        goto cleanup;

    // Version 2.1 has a single uncompressed block which is nul
    // terminated with many nuls to permit growth.
    //
    // So load the old block and keep all contents identical bar the
    // header text itself.
    blk = cram_read_block(fd);
    if (!blk)
        goto cleanup;

    // Four extra bytes are needed for the length written ahead of the text.
    if (cram_block_get_uncomp_size(blk) < header_len+4) {
        fprintf(stderr, "New header will not fit. Use non-inplace version (%d > %d)\n",
                header_len+4, cram_block_get_uncomp_size(blk));
        status = -2;
        goto cleanup;
    }

    // Rewind the block, then lay down the length followed by the text.
    cram_block_set_offset(blk, 0);
    int32_put_blk(blk, header_len);
    cram_block_append(blk, sam_hdr_str(sam), header_len);

    // Zero whatever remains of the block after the new text.
    memset(cram_block_get_data(blk)+cram_block_get_offset(blk), 0,
           cram_block_get_uncomp_size(blk) - cram_block_get_offset(blk));

    // Make sure all sizes and byte-offsets are consistent after memset.
    cram_block_set_offset(blk, cram_block_get_uncomp_size(blk));
    cram_block_set_comp_size(blk, cram_block_get_uncomp_size(blk));

    // Seek back and overwrite the container and block where they were read.
    if (hseek(cram_fd_get_fp(fd), start, SEEK_SET) != start)
        goto cleanup;

    if (cram_write_container(fd, container) == -1)
        goto cleanup;

    if (cram_write_block(fd, blk) == -1)
        goto cleanup;

    status = 0;

 cleanup:
    if (container)
        cram_free_container(container);
    if (blk)
        cram_free_block(blk);
    if (sam)
        sam_hdr_free(sam);

    return status;
}