/*
 * Replaces the header of a CRAM file in-place, picking the
 * implementation that matches the file's major version.
 *
 * Returns the result of the version-specific routine;
 *        -1 if the major version is neither 2 nor 3.
 */
int cram_reheader_inplace(cram_fd *fd, const bam_hdr_t *h, const char *arg_list,
                          int add_PG)
{
    const int major = cram_major_vers(fd);

    if (major == 2)
        return cram_reheader_inplace2(fd, h, arg_list, add_PG);

    if (major == 3)
        return cram_reheader_inplace3(fd, h, arg_list, add_PG);

    fprintf(stderr, "[%s] unsupported CRAM version %d\n", __func__, major);
    return -1;
}
/*
 * In-place header replacement for a version 2 CRAM file.
 *
 * The v2 layout keeps the SAM header uncompressed and padded with
 * nul bytes, so a replacement header can be written over the old one
 * as long as it is no bigger than the existing block.  The file is
 * never grown.
 *
 * fd        CRAM file whose header is rewritten
 * h         header supplying the new text (h->text, h->l_text)
 * arg_list  command line recorded in the @PG "CL" field; may be NULL
 * add_PG    when non-zero, a "samtools" @PG line is added first
 *
 * Returns 0 on success;
 *        -1 on general failure;
 *        -2 when the new header does not fit in the existing block
 */
int cram_reheader_inplace2(cram_fd *fd, const bam_hdr_t *h, const char *arg_list,
                           int add_PG)
{
    cram_container *cont = NULL;
    cram_block *blk = NULL;
    SAM_hdr *sh = NULL;
    off_t hdr_pos;
    int status = -1;

    const int vers = cram_major_vers(fd);
    if (!(vers >= 2 && vers <= 3)) {
        fprintf(stderr, "[%s] unsupported CRAM version %d\n", __func__, vers);
        goto err;
    }

    sh = sam_hdr_parse_(h->text, h->l_text);
    if (!sh)
        goto err;

    if (add_PG) {
        // The CL key/value pair is only supplied when a command line exists.
        if (sam_hdr_add_PG(sh, "samtools", "VN", samtools_version(),
                           arg_list ? "CL" : NULL,
                           arg_list ? arg_list : NULL,
                           NULL))
            goto err;
    }

    int new_len = sam_hdr_length(sh);

    /* Fix M5 strings? Maybe out of scope for this tool */

    // Read the container currently stored at byte offset 26.
    hdr_pos = hseek(cram_fd_get_fp(fd), 26, SEEK_SET);
    if (hdr_pos != 26)
        goto err;

    cont = cram_read_container(fd);
    if (!cont)
        goto err;

    // A 2.1 header is one uncompressed block, nul padded to leave room
    // for growth.  Load it and change nothing except the header text.
    blk = cram_read_block(fd);
    if (!blk)
        goto err;

    // The block must hold the text plus its 4-byte length prefix.
    if (cram_block_get_uncomp_size(blk) < new_len + 4) {
        fprintf(stderr,
                "New header will not fit. Use non-inplace version (%d > %d)\n",
                new_len + 4, cram_block_get_uncomp_size(blk));
        status = -2;
        goto err;
    }

    // Rewind, then lay down <length><text>.
    cram_block_set_offset(blk, 0);
    int32_put_blk(blk, new_len);
    cram_block_append(blk, sam_hdr_str(sh), new_len);

    // Everything after the new text becomes nul padding.
    memset(cram_block_get_data(blk) + cram_block_get_offset(blk), 0,
           cram_block_get_uncomp_size(blk) - cram_block_get_offset(blk));

    // Keep offset and compressed size equal to the (unchanged)
    // uncompressed size so the block is written at its original length.
    cram_block_set_offset(blk, cram_block_get_uncomp_size(blk));
    cram_block_set_comp_size(blk, cram_block_get_uncomp_size(blk));

    // Write container and block back over the originals.
    if (hseek(cram_fd_get_fp(fd), hdr_pos, SEEK_SET) != hdr_pos)
        goto err;

    if (cram_write_container(fd, cont) == -1 ||
        cram_write_block(fd, blk) == -1)
        goto err;

    status = 0;

 err:
    if (cont)
        cram_free_container(cont);
    if (blk)
        cram_free_block(blk);
    if (sh)
        sam_hdr_free(sh);

    return status;
}
/*
 * Reads a version 3 CRAM file and replaces the header in-place,
 * provided the header is small enough to fit without growing the
 * entire file.
 *
 * Version 3 format has a SAM header held as an (optionally)
 * compressed block within the header container.  Additional
 * uncompressed blocks or simply unallocated space (the difference
 * between total block sizes and the container size) are used to
 * provide room for growth or contraction of the compressed header.
 *
 * fd        CRAM file whose header is rewritten
 * h         header supplying the new text (h->text, h->l_text)
 * arg_list  command line recorded in the @PG "CL" field; may be NULL
 * add_PG    when non-zero, a "samtools" @PG line is added first
 *
 * Returns 0 on success (including when the new header exactly fills
 *           the available space and no padding has to be written);
 *        -1 on general failure;
 *        -2 on failure due to insufficient size
 */
int cram_reheader_inplace3(cram_fd *fd, const bam_hdr_t *h, const char *arg_list,
                           int add_PG)
{
    cram_container *c = NULL;
    cram_block *b = NULL;
    SAM_hdr *hdr = NULL;
    off_t start, sz, end;
    int container_sz, max_container_sz;
    char *buf = NULL;
    int ret = -1;

    if (cram_major_vers(fd) < 2 || cram_major_vers(fd) > 3) {
        fprintf(stderr, "[%s] unsupported CRAM version %d\n", __func__,
                cram_major_vers(fd));
        goto err;
    }

    if (!(hdr = sam_hdr_parse_(h->text, h->l_text)))
        goto err;

    if (add_PG && sam_hdr_add_PG(hdr, "samtools", "VN", samtools_version(),
                                 arg_list ? "CL": NULL,
                                 arg_list ? arg_list : NULL,
                                 NULL))
        goto err;

    int header_len = sam_hdr_length(hdr);
    /* Fix M5 strings? Maybe out of scope for this tool */

    // Find current size of SAM header block
    if ((start = hseek(cram_fd_get_fp(fd), 26, SEEK_SET)) != 26)
        goto err;

    if (!(c = cram_read_container(fd)))
        goto err;

    // +5 allows num_landmarks to increase from 0 to 1 (Cramtools)
    max_container_sz = cram_container_size(c)+5;

    // sz:  bytes occupied by the old container header plus its payload.
    // end: file offset just past the old container's payload.
    sz = htell(cram_fd_get_fp(fd)) + cram_container_get_length(c) - start;
    end = htell(cram_fd_get_fp(fd)) + cram_container_get_length(c);

    // We force 1 block instead of (optionally) 2.  C CRAM
    // implementations for v3 were writing 1 compressed block followed
    // by 1 uncompressed block.  However this is tricky to deal with
    // as changing block sizes can mean the block header also changes
    // size due to itf8 and variable size integers.
    //
    // If we had 1 block, this doesn't change anything.
    // If we had 2 blocks, the new container header will be smaller by
    // 1+ bytes, requiring the cram_container_get_length(c) to be larger in value.
    // However this is an int32 instead of itf8 so the container
    // header structure stays the same size.  This means we can always
    // reduce the number of blocks without running into size problems.
    cram_container_set_num_blocks(c, 1);

    int32_t *landmark;
    int32_t num_landmarks;
    landmark = cram_container_get_landmarks(c, &num_landmarks);
    if (num_landmarks && landmark) {
        num_landmarks = 1;
        landmark[0] = 0;
    } else {
        num_landmarks = 0;
    }
    cram_container_set_landmarks(c, num_landmarks, landmark);

    // Scratch buffer used only to measure the serialised container header.
    // Check the allocation before handing it to cram_store_container().
    buf = malloc(max_container_sz);
    if (!buf)
        goto err;

    container_sz = max_container_sz;
    if (cram_store_container(fd, c, buf, &container_sz) != 0)
        goto err;

    // Proposed new length, but changing cram_container_get_length(c) may change the
    // container_sz and thus the remainder (cram_container_get_length(c) itself).
    cram_container_set_length(c, sz - container_sz);

    int old_container_sz = container_sz;
    container_sz = max_container_sz;
    if (cram_store_container(fd, c, buf, &container_sz) != 0)
        goto err;

    if (old_container_sz != container_sz) {
        fprintf(stderr, "Quirk of fate makes this troublesome! "
                "Please use non-inplace version.\n");
        goto err;
    }

    // Version 3.0 supports compressed header
    b = cram_new_block(FILE_HEADER, 0);
    if (!b)
        goto err;

    int32_put_blk(b, header_len);
    cram_block_append(b, sam_hdr_str(hdr), header_len);
    cram_block_update_size(b);

    // NOTE(review): the result of cram_compress_block() is not checked,
    // as in the original code; confirm whether a failure here still
    // leaves the block in a writable state.
    cram_compress_block(fd, b, NULL, -1, -1);

    if (hseek(cram_fd_get_fp(fd), 26, SEEK_SET) != 26)
        goto err;

    if (cram_block_size(b) > cram_container_get_length(c)) {
        fprintf(stderr, "New header will not fit. Use non-inplace version"
                " (%d > %d)\n",
                (int)cram_block_size(b), cram_container_get_length(c));
        ret = -2;
        goto err;
    }

    if (cram_write_container(fd, c) == -1)
        goto err;

    if (cram_write_block(fd, b) == -1)
        goto err;

    // Blank out the remainder
    int rsz = end - htell(cram_fd_get_fp(fd));
    assert(rsz >= 0);
    if (rsz) {
        char *rem = calloc(1, rsz);
        if (!rem)
            goto err;
        ret = hwrite(cram_fd_get_fp(fd), rem, rsz) == rsz ? 0 : -1;
        free(rem);
    } else {
        // The new header filled the space exactly: nothing left to
        // blank, and both writes above succeeded.
        ret = 0;
    }

 err:
    if (c) cram_free_container(c);
    free(buf);
    if (b) cram_free_block(b);
    if (hdr) sam_hdr_free(hdr);

    return ret;
}
/*
 * Check the files are consistent and capable of being concatenated.
 * Also fills out the rg2id read-group hash and the version numbers
 * and produces a new bam_hdr_t structure with merged RG lines.
 * Note it is only a simple merge, as we lack the niceties of a proper
 * header API.
 *
 * nfn, fn     number of input files and their names
 * h           optional header to start from; when NULL the header of
 *             the first input file is duplicated instead
 * rg2id       receives the hash built by hash_rg() on the merged header
 * vers_maj_p  receives the common CRAM major version (success only)
 * vers_min_p  receives the common CRAM minor version (success only)
 *
 * Every input must share the same major and minor version.  An input
 * with more than one @RG entry must have each entry land at the same
 * index in the merged hash as in its own.
 *
 * Returns updated header on success;
 *         NULL on failure.  On failure the file, header and hash of
 *         the input being processed are released, as is the partially
 *         merged header.
 *
 * NOTE(review): *rg2id is left untouched on failure and may still hold
 * pointers into the destroyed header text; confirm callers do not read
 * its entries after a NULL return.
 */
static bam_hdr_t *cram_cat_check_hdr(int nfn, char * const *fn, const bam_hdr_t *h,
                                     khash_s2i **rg2id,
                                     int *vers_maj_p, int *vers_min_p)
{
    int i, vers_maj = -1, vers_min = -1;
    bam_hdr_t *new_h = NULL, *old = NULL;
    samFile *in = NULL;
    khash_s2i *rg2id_in = NULL;

    if (h) {
        new_h = bam_hdr_dup(h);
        if (!new_h)
            return NULL;
        *rg2id = hash_rg(new_h);
    }

    for (i = 0; i < nfn; ++i) {
        cram_fd *in_c;
        khint_t ki;
        int new_rg = -1;

        in = sam_open(fn[i], "rc");
        if (in == 0) {
            fprintf(stderr, "[%s] ERROR: fail to open file '%s'.\n", __func__, fn[i]);
            goto fail;
        }
        in_c = in->fp.cram;

        int vmaj = cram_major_vers(in_c);
        int vmin = cram_minor_vers(in_c);
        if ((vers_maj != -1 && vers_maj != vmaj) ||
            (vers_min != -1 && vers_min != vmin)) {
            fprintf(stderr, "[%s] ERROR: input files have differing version numbers.\n",
                    __func__);
            goto fail;
        }
        vers_maj = vmaj;
        vers_min = vmin;

        old = sam_hdr_read(in);
        if (!old) {
            fprintf(stderr, "[%s] ERROR: fail to read the header of file '%s'.\n",
                    __func__, fn[i]);
            goto fail;
        }

        // rg2id_in is dereferenced below, so a NULL hash must be fatal.
        rg2id_in = hash_rg(old);
        if (!rg2id_in)
            goto fail;

        if (!new_h) {
            new_h = bam_hdr_dup(old);
            if (!new_h)
                goto fail;
            *rg2id = hash_rg(new_h);
        }

        // Add any existing @RG entries to our global @RG hash.
        for (ki = 0; ki < rg2id_in->n_id; ki++) {
            // Pre-set so the test below is defined even if
            // hash_s2i_inc() leaves it untouched.
            int added = 0;

            new_rg = hash_s2i_inc(*rg2id, rg2id_in->id[ki], rg2id_in->line[ki], &added);
            //fprintf(stderr, "RG %s: #%d -> #%d\n",
            //        rg2id_in->id[ki], ki, new_rg);

            if (added) {
                // Also add to new_h
                const char *line = rg2id_in->line[ki];
                const char *line_end = line;

                // Advance to just past the newline, stopping at the
                // terminator if the line has no newline.
                while (*line_end && *line_end++ != '\n')
                    ;

                size_t add_len = (size_t)(line_end - line);
                size_t old_len = new_h->l_text;

                // Grow through a temporary so the existing text is
                // neither lost nor leaked when realloc() fails.
                char *text = realloc(new_h->text, old_len + add_len + 1);
                if (!text)
                    goto fail;

                memcpy(text + old_len, line, add_len);
                text[old_len + add_len] = '\0';

                new_h->text = text;
                new_h->l_text += add_len;
            }

            if (new_rg != ki && rg2id_in->n_id > 1) {
                fprintf(stderr, "[%s] ERROR: Same size @RG lists but differing order / contents\n",
                        __func__);
                goto fail;
            }
        }

        // Clear each pointer after release so the failure path cannot
        // free it a second time.
        hash_s2i_free(rg2id_in);
        rg2id_in = NULL;
        bam_hdr_destroy(old);
        old = NULL;
        sam_close(in);
        in = NULL;
    }

    *vers_maj_p = vers_maj;
    *vers_min_p = vers_min;

    return new_h;

 fail:
    if (rg2id_in) hash_s2i_free(rg2id_in);
    if (old) bam_hdr_destroy(old);
    if (in) sam_close(in);
    if (new_h) bam_hdr_destroy(new_h);

    return NULL;
}