/* * Copies the blocks representing the next num_slice slices from a * container from 'in' to 'out'. It is expected that the file pointer * is just after the read of the cram_container and cram compression * header. * * Returns 0 on success * -1 on failure */ int cram_copy_slice(cram_fd *in, cram_fd *out, int32_t num_slice) { int32_t i, j; for (i = 0; i < num_slice; i++) { cram_block *blk; cram_block_slice_hdr *hdr; if (!(blk = cram_read_block(in))) return -1; if (!(hdr = cram_decode_slice_header(in, blk))) { cram_free_block(blk); return -1; } if (cram_write_block(out, blk) != 0) { cram_free_block(blk); return -1; } cram_free_block(blk); int num_blocks = cram_slice_hdr_get_num_blocks(hdr); for (j = 0; j < num_blocks; j++) { blk = cram_read_block(in); if (!blk || cram_write_block(out, blk) != 0) { if (blk) cram_free_block(blk); return -1; } cram_free_block(blk); } cram_free_slice_header(hdr); } return 0; }
/* * Reads a file and outputs a new CRAM file to stdout with 'h' * replaced as the header. No checks are made to the validity. * * FIXME: error checking */ int cram_reheader(cram_fd *in, bam_hdr_t *h, const char *arg_list, int add_PG) { htsFile *h_out = hts_open("-", "wc"); cram_fd *out = h_out->fp.cram; cram_container *c = NULL; int ret = -1; // Attempt to fill out a cram->refs[] array from @SQ headers cram_fd_set_header(out, sam_hdr_parse_(h->text, h->l_text)); if (add_PG) { if (sam_hdr_add_PG(cram_fd_get_header(out), "samtools", "VN", samtools_version(), arg_list ? "CL": NULL, arg_list ? arg_list : NULL, NULL) != 0) goto err; // Covert back to bam_hdr_t struct free(h->text); h->text = strdup(sam_hdr_str(cram_fd_get_header(out))); h->l_text = sam_hdr_length(cram_fd_get_header(out)); if (!h->text) goto err; } if (sam_hdr_write(h_out, h) != 0) goto err; cram_set_option(out, CRAM_OPT_REFERENCE, NULL); while ((c = cram_read_container(in))) { int32_t i, num_blocks = cram_container_get_num_blocks(c); if (cram_write_container(out, c) != 0) goto err; for (i = 0; i < num_blocks; i++) { cram_block *blk = cram_read_block(in); if (!blk || cram_write_block(out, blk) != 0) { if (blk) cram_free_block(blk); goto err; } cram_free_block(blk); } cram_free_container(c); } ret = 0; err: if (hts_close(h_out) != 0) ret = -1; return ret; }
/* * Renumbers RG numbers in a cram compression header. * * CRAM stores RG as the Nth number in the header, rather than a * string holding the ID: tag. This is smaller in space, but means * "samtools cat" to join files together that contain single but * different RG lines needs a way of renumbering them. * * The file descriptor is expected to be immediately after the * cram_container structure (ie before the cram compression header). * Due to the nature of the CRAM format, this needs to read and write * the blocks itself. Note that there may be multiple slices within * the container, meaning multiple compression headers to manipulate. * Changing RG may change the size of the compression header and * therefore the length field in the container. Hence we rewrite all * blocks just incase and also emit the adjusted container. * * The current implementation can only cope with renumbering a single * RG (and only then if it is using HUFFMAN or BETA codecs). In * theory it *may* be possible to renumber multiple RGs if they use * HUFFMAN to the CORE block or use an external block unshared by any * other data series. So we have an API that can be upgraded to * support this, but do not implement it for now. An example * implementation of RG as an EXTERNAL block would be to find that * block and rewrite it, returning the number of blocks consumed. * * Returns 0 on success; * -1 if unable to edit; * -2 on other errors (eg I/O). */ int cram_transcode_rg(cram_fd *in, cram_fd *out, cram_container *c, int nrg, int *in_rg, int *out_rg) { int new_rg = *out_rg, old_size, new_size; cram_block *o_blk, *n_blk; cram_block_compression_hdr *ch; if (nrg != 1) { hts_log_error("CRAM transcode supports only a single RG"); return -2; } // Produce a new block holding the updated compression header, // with RG transcoded to a new value. (Single only supported.) o_blk = cram_read_block(in); old_size = cram_block_size(o_blk); ch = cram_decode_compression_header(in, o_blk); if (cram_block_compression_hdr_set_rg(ch, new_rg) != 0) return -1; cram_block_compression_hdr_decoder2encoder(in, ch); n_blk = cram_encode_compression_header(in, c, ch); cram_free_compression_header(ch); /* * Warning: this has internal knowledge of the cram compression * header format. * * The decoder doesn't set c->tags_used, so the encoder puts a two * byte blank segment. This means n_blk is too short. We skip * through the decoded old block (o_blk) and copy from there. */ char *cp = cram_block_get_data(o_blk); char *op = cp; char *endp = cp + cram_block_get_uncomp_size(o_blk); //fprintf(stderr, "sz = %d\n", (int)(endp-cp)); int32_t i32; cp += safe_itf8_get(cp, endp, &i32); cp += i32; cp += safe_itf8_get(cp, endp, &i32); cp += i32; op = cp; cp += safe_itf8_get(cp, endp, &i32); i32 += (cp-op); //fprintf(stderr, "remaining %d bytes\n", i32); cram_block_set_size(n_blk, cram_block_get_size(n_blk)-2); cram_block_append(n_blk, op, i32); cram_block_update_size(n_blk); new_size = cram_block_size(n_blk); //fprintf(stderr, "size %d -> %d\n", old_size, new_size); // Now we've constructedthe updated compression header, // amend the container too (it may have changed size). int32_t *landmarks, num_landmarks; landmarks = cram_container_get_landmarks(c, &num_landmarks); if (old_size != new_size) { int diff = new_size - old_size, j; for (j = 0; j < num_landmarks; j++) landmarks[j] += diff; //cram_container_set_landmarks(c, num_landmarks, landmarks); cram_container_set_length(c, cram_container_get_length(c) + diff); } // Finally write it all out; container, compression header, // and then all the remaining slice blocks. if (cram_write_container(out, c) != 0) return -2; cram_write_block(out, n_blk); cram_free_block(o_blk); cram_free_block(n_blk); // Container num_blocks can be invalid, due to a bug. // Instead we iterate in slice context instead. return cram_copy_slice(in, out, num_landmarks); }
/* * Reads a version 3 CRAM file and replaces the header in-place, * provided the header is small enough to fit without growing the * entire file. * * Version 3 format has a SAM header held as an (optionally) * compressed block within the header container. Additional * uncompressed blocks or simply unallocated space (the difference * between total block sizes and the container size) are used to * provide room for growth or contraction of the compressed header. * * Returns 0 on success; * -1 on general failure; * -2 on failure due to insufficient size */ int cram_reheader_inplace3(cram_fd *fd, const bam_hdr_t *h, const char *arg_list, int add_PG) { cram_container *c = NULL; cram_block *b = NULL; SAM_hdr *hdr = NULL; off_t start, sz, end; int container_sz, max_container_sz; char *buf = NULL; int ret = -1; if (cram_major_vers(fd) < 2 || cram_major_vers(fd) > 3) { fprintf(stderr, "[%s] unsupported CRAM version %d\n", __func__, cram_major_vers(fd)); goto err; } if (!(hdr = sam_hdr_parse_(h->text, h->l_text))) goto err; if (add_PG && sam_hdr_add_PG(hdr, "samtools", "VN", samtools_version(), arg_list ? "CL": NULL, arg_list ? arg_list : NULL, NULL)) goto err; int header_len = sam_hdr_length(hdr); /* Fix M5 strings? Maybe out of scope for this tool */ // Find current size of SAM header block if ((start = hseek(cram_fd_get_fp(fd), 26, SEEK_SET)) != 26) goto err; if (!(c = cram_read_container(fd))) goto err; // +5 allows num_landmarks to increase from 0 to 1 (Cramtools) max_container_sz = cram_container_size(c)+5; sz = htell(cram_fd_get_fp(fd)) + cram_container_get_length(c) - start; end = htell(cram_fd_get_fp(fd)) + cram_container_get_length(c); // We force 1 block instead of (optionally) 2. C CRAM // implementations for v3 were writing 1 compressed block followed // by 1 uncompressed block. However this is tricky to deal with // as changing block sizes can mean the block header also changes // size due to itf8 and variable size integers. // // If we had 1 block, this doesn't change anything. // If we had 2 blocks, the new container header will be smaller by // 1+ bytes, requiring the cram_container_get_length(c) to be larger in value. // However this is an int32 instead of itf8 so the container // header structure stays the same size. This means we can always // reduce the number of blocks without running into size problems. cram_container_set_num_blocks(c, 1); int32_t *landmark; int32_t num_landmarks; landmark = cram_container_get_landmarks(c, &num_landmarks); if (num_landmarks && landmark) { num_landmarks = 1; landmark[0] = 0; } else { num_landmarks = 0; } cram_container_set_landmarks(c, num_landmarks, landmark); buf = malloc(max_container_sz); container_sz = max_container_sz; if (cram_store_container(fd, c, buf, &container_sz) != 0) goto err; if (!buf) goto err; // Proposed new length, but changing cram_container_get_length(c) may change the // container_sz and thus the remainder (cram_container_get_length(c) itself). cram_container_set_length(c, sz - container_sz); int old_container_sz = container_sz; container_sz = max_container_sz; if (cram_store_container(fd, c, buf, &container_sz) != 0) goto err; if (old_container_sz != container_sz) { fprintf(stderr, "Quirk of fate makes this troublesome! " "Please use non-inplace version.\n"); goto err; } // Version 3.0 supports compressed header b = cram_new_block(FILE_HEADER, 0); int32_put_blk(b, header_len); cram_block_append(b, sam_hdr_str(hdr), header_len); cram_block_update_size(b); cram_compress_block(fd, b, NULL, -1, -1); if (hseek(cram_fd_get_fp(fd), 26, SEEK_SET) != 26) goto err; if (cram_block_size(b) > cram_container_get_length(c)) { fprintf(stderr, "New header will not fit. Use non-inplace version" " (%d > %d)\n", (int)cram_block_size(b), cram_container_get_length(c)); ret = -2; goto err; } if (cram_write_container(fd, c) == -1) goto err; if (cram_write_block(fd, b) == -1) goto err; // Blank out the remainder int rsz = end - htell(cram_fd_get_fp(fd)); assert(rsz >= 0); if (rsz) { char *rem = calloc(1, rsz); ret = hwrite(cram_fd_get_fp(fd), rem, rsz) == rsz ? 0 : -1; free(rem); } err: if (c) cram_free_container(c); if (buf) free(buf); if (b) cram_free_block(b); if (hdr) sam_hdr_free(hdr); return ret; }
/* * Reads a version 2 CRAM file and replaces the header in-place, * provided the header is small enough to fit without growing the * entire file. * * Version 2 format has an uncompressed SAM header with multiple nul * termination bytes to permit inline header editing. * * Returns 0 on success; * -1 on general failure; * -2 on failure due to insufficient size */ int cram_reheader_inplace2(cram_fd *fd, const bam_hdr_t *h, const char *arg_list, int add_PG) { cram_container *c = NULL; cram_block *b = NULL; SAM_hdr *hdr = NULL; off_t start; int ret = -1; if (cram_major_vers(fd) < 2 || cram_major_vers(fd) > 3) { fprintf(stderr, "[%s] unsupported CRAM version %d\n", __func__, cram_major_vers(fd)); goto err; } if (!(hdr = sam_hdr_parse_(h->text, h->l_text))) goto err; if (add_PG && sam_hdr_add_PG(hdr, "samtools", "VN", samtools_version(), arg_list ? "CL": NULL, arg_list ? arg_list : NULL, NULL)) goto err; int header_len = sam_hdr_length(hdr); /* Fix M5 strings? Maybe out of scope for this tool */ // Load the existing header if ((start = hseek(cram_fd_get_fp(fd), 26, SEEK_SET)) != 26) goto err; if (!(c = cram_read_container(fd))) goto err; // Version 2.1 has a single uncompressed block which is nul // terminated with many nuls to permit growth. // // So load old block and keep all contents identical bar the // header text itself if (!(b = cram_read_block(fd))) goto err; if (cram_block_get_uncomp_size(b) < header_len+4) { fprintf(stderr, "New header will not fit. Use non-inplace version (%d > %d)\n", header_len+4, cram_block_get_uncomp_size(b)); ret = -2; goto err; } cram_block_set_offset(b, 0); // rewind block int32_put_blk(b, header_len); cram_block_append(b, sam_hdr_str(hdr), header_len); // Zero the remaining block memset(cram_block_get_data(b)+cram_block_get_offset(b), 0, cram_block_get_uncomp_size(b) - cram_block_get_offset(b)); // Make sure all sizes and byte-offsets are consistent after memset cram_block_set_offset(b, cram_block_get_uncomp_size(b)); cram_block_set_comp_size(b, cram_block_get_uncomp_size(b)); if (hseek(cram_fd_get_fp(fd), start, SEEK_SET) != start) goto err; if (cram_write_container(fd, c) == -1) goto err; if (cram_write_block(fd, b) == -1) goto err; ret = 0; err: if (c) cram_free_container(c); if (b) cram_free_block(b); if (hdr) sam_hdr_free(hdr); return ret; }
/* * CRAM files don't store the RG:Z:ID per read in the aux field. * Instead they have a numerical data series (RG) to point each read * back to the Nth @RG line in the file. This means that we may need * to edit the RG data series (if the files were produced from * "samtools split" for example). * * The encoding method is stored in the compression header. Typical * examples: * * RG => EXTERNAL {18} # Block content-id 18 holds RG values * # as a series of ITF8 encoded values * * RG => HUFFMAN {1, 255, 255, 255, 255, 255, 1, 0} * # One RG value #-1. (No RG) * * RG => HUFFMAN {1, 0, 1, 0} # One RG value #0 (always first RG) * * RG => HUFFMAN {2, 0, 1, 2, 1, 1} * # Two RG values, #0 and #1, written * # to the CORE block and possibly * # mixed with other data series. * * A single value can (but may not be) implemented as a zero bit * huffman code. In this situation we can change the meta-data in the * compression header to renumber an RG value.. */ int cram_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outcram) { samFile *out; cram_fd *out_c; int i, vers_maj, vers_min; khash_s2i *rg2id = NULL; bam_hdr_t *new_h = NULL; /* Check consistent versioning and compatible headers */ if (!(new_h = cram_cat_check_hdr(nfn, fn, h, &rg2id, &vers_maj, &vers_min))) return -1; /* Open the file with cram_vers */ char vers[100]; sprintf(vers, "%d.%d", vers_maj, vers_min); out = sam_open(outcram, "wc"); if (out == 0) { fprintf(stderr, "[%s] ERROR: fail to open output file '%s'.\n", __func__, outcram); return 1; } out_c = out->fp.cram; cram_set_option(out_c, CRAM_OPT_VERSION, vers); //fprintf(stderr, "Creating cram vers %s\n", vers); cram_fd_set_header(out_c, sam_hdr_parse_(new_h->text, new_h->l_text)); // needed? sam_hdr_write(out, new_h); for (i = 0; i < nfn; ++i) { samFile *in; cram_fd *in_c; cram_container *c; bam_hdr_t *old; int new_rg = -1; in = sam_open(fn[i], "rc"); if (in == 0) { fprintf(stderr, "[%s] ERROR: fail to open file '%s'.\n", __func__, fn[i]); return -1; } in_c = in->fp.cram; old = sam_hdr_read(in); khash_s2i *rg2id_in = hash_rg(old); // Compute RG mapping if suitable for changing. if (rg2id_in->n_id == 1) { int _; new_rg = hash_s2i_inc(rg2id, rg2id_in->id[0], NULL, &_); } else { new_rg = 0; } hash_s2i_free(rg2id_in); // Copy contains and blocks within them while ((c = cram_read_container(in_c))) { cram_block *blk; if (cram_container_is_empty(in_c)) { if (cram_write_container(out_c, c) != 0) return -1; // Container compression header if (!(blk = cram_read_block(in_c))) return -1; if (cram_write_block(out_c, blk) != 0) { cram_free_block(blk); return -1; } cram_free_block(blk); cram_free_container(c); continue; } // If we have just one RG key and new_rg != 0 then // we need to edit the compression header. IF WE CAN. if (new_rg) { int zero = 0; //fprintf(stderr, "Transcode RG %d to %d\n", 0, new_rg); cram_transcode_rg(in_c, out_c, c, 1, &zero, &new_rg); } else { int32_t num_slices; // Not switching rg so do the usual read/write loop if (cram_write_container(out_c, c) != 0) return -1; // Container compression header if (!(blk = cram_read_block(in_c))) return -1; if (cram_write_block(out_c, blk) != 0) { cram_free_block(blk); return -1; } cram_free_block(blk); // Container num_blocks can be invalid, due to a bug. // Instead we iterate in slice context instead. (void)cram_container_get_landmarks(c, &num_slices); cram_copy_slice(in_c, out_c, num_slices); } cram_free_container(c); } bam_hdr_destroy(old); sam_close(in); } sam_close(out); hash_s2i_free(rg2id); bam_hdr_destroy(new_h); return 0; }