/*
 * Reads a file and outputs a new CRAM file to stdout with 'h'
 * replaced as the header.  No checks are made to the validity of the
 * new header beyond it being parseable.
 *
 * If add_PG is non-zero a "samtools" @PG line is added (with a CL
 * field when arg_list is non-NULL) and h->text / h->l_text are
 * replaced with the regenerated header text.
 *
 * Containers and their blocks are copied verbatim from 'in'.
 *
 * Returns 0 on success
 *        -1 on failure
 *
 * NOTE(review): cram_read_container() returning NULL ends the copy
 * loop and is treated as end of input; a read error is not
 * distinguished from EOF here.
 */
int cram_reheader(cram_fd *in, bam_hdr_t *h, const char *arg_list, int add_PG)
{
    htsFile *h_out = hts_open("-", "wc");
    cram_fd *out;
    cram_container *c = NULL;
    SAM_hdr *sh;
    int ret = -1;

    // Nothing to clean up yet if stdout could not be opened.
    if (!h_out)
        return -1;
    out = h_out->fp.cram;

    // Attempt to fill out a cram->refs[] array from @SQ headers
    sh = sam_hdr_parse_(h->text, h->l_text);
    if (!sh)
        goto err;
    cram_fd_set_header(out, sh);

    if (add_PG) {
        char *text;

        if (sam_hdr_add_PG(cram_fd_get_header(out), "samtools",
                           "VN", samtools_version(),
                           arg_list ? "CL": NULL,
                           arg_list ? arg_list : NULL,
                           NULL) != 0)
            goto err;

        // Convert back to bam_hdr_t struct.  Duplicate first so that
        // h->text is left intact if the allocation fails.
        text = strdup(sam_hdr_str(cram_fd_get_header(out)));
        if (!text)
            goto err;
        free(h->text);
        h->text = text;
        h->l_text = sam_hdr_length(cram_fd_get_header(out));
    }

    if (sam_hdr_write(h_out, h) != 0)
        goto err;
    cram_set_option(out, CRAM_OPT_REFERENCE, NULL);

    while ((c = cram_read_container(in))) {
        int32_t i, num_blocks = cram_container_get_num_blocks(c);

        if (cram_write_container(out, c) != 0)
            goto err;

        for (i = 0; i < num_blocks; i++) {
            cram_block *blk = cram_read_block(in);
            int wr;

            if (!blk)
                goto err;
            wr = cram_write_block(out, blk);
            cram_free_block(blk);
            if (wr != 0)
                goto err;
        }

        cram_free_container(c);
    }

    ret = 0;

 err:
    // c is only non-NULL here when we bailed out mid-container.
    if (c)
        cram_free_container(c);
    if (hts_close(h_out) != 0)
        ret = -1;

    return ret;
}
/*
 * Skips to a container overlapping the start coordinate listed in
 * cram_range.
 *
 * In theory we call cram_index_query multiple times, once per slice
 * overlapping the range.  However slices may be absent from the index
 * which makes this problematic.  Instead we find the left-most slice
 * and then read from then on, skipping decoding of slices and/or
 * whole containers when they don't overlap the specified cram_range.
 *
 * Any container cached in fd->ctr is released once the seek succeeds.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int cram_seek_to_refpos(cram_fd *fd, cram_range *r)
{
    // Ideally use an index, so see if we have one.
    cram_index *hit = cram_index_query(fd, r->refid, r->start, NULL);

    if (!hit) {
        fprintf(stderr, "Unknown reference ID. Missing from index?\n");
        return -1;
    }

    // Try an absolute seek first; only if that fails fall back to a
    // seek relative to the current position.
    if (cram_seek(fd, hit->offset, SEEK_SET) != 0 &&
        cram_seek(fd, hit->offset - fd->first_container, SEEK_CUR) != 0)
        return -1;

    // Drop the cached container, if any, now that we have moved.
    if (fd->ctr) {
        cram_free_container(fd->ctr);
        fd->ctr = NULL;
    }

    return 0;
}
/*
 * Builds an index file.
 *
 * fd is a newly opened cram file that we wish to index.
 * fn_base is the filename of the associated CRAM file. Internally we
 * add ".crai" to this to get the index filename.
 *
 * For every slice one tab separated line is written holding the
 * reference id, start, span, container offset, landmark (slice offset
 * within the container) and slice size.  Multi-reference slices
 * (ref_seq_id == -2) are handed to cram_index_build_multiref instead.
 *
 * All failure paths close the index file and free the container
 * currently being processed.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int cram_index_build(cram_fd *fd, const char *fn_base)
{
    cram_container *c = NULL;
    off_t cpos, spos, hpos;
    zfp *fp;
    char fn_idx[PATH_MAX];

    // Room is needed for ".crai" (5 bytes) plus the terminating nul.
    if (strlen(fn_base) > PATH_MAX-6)
        return -1;

    snprintf(fn_idx, sizeof fn_idx, "%s.crai", fn_base);
    if (!(fp = zfopen(fn_idx, "wz"))) {
        perror(fn_idx);
        return -1;
    }

    cpos = htell(fd->fp);
    while ((c = cram_read_container(fd))) {
        int j;

        if (fd->err) {
            perror("Cram container read");
            goto err;
        }

        hpos = htell(fd->fp);

        if (!(c->comp_hdr_block = cram_read_block(fd)))
            goto err;
        assert(c->comp_hdr_block->content_type == COMPRESSION_HEADER);

        c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block);
        if (!c->comp_hdr)
            goto err;

        // 2.0 format
        for (j = 0; j < c->num_landmarks; j++) {
            char buf[1024];
            cram_slice *s;
            int sz;

            spos = htell(fd->fp);
            assert(spos - cpos - c->offset == c->landmark[j]);

            if (!(s = cram_read_slice(fd)))
                goto err;

            sz = (int)(htell(fd->fp) - spos);

            if (s->hdr->ref_seq_id == -2) {
                cram_index_build_multiref(fd, c, s, fp,
                                          cpos, c->landmark[j], sz);
            } else {
                snprintf(buf, sizeof buf,
                         "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n",
                         s->hdr->ref_seq_id, s->hdr->ref_seq_start,
                         s->hdr->ref_seq_span, (int64_t)cpos,
                         c->landmark[j], sz);
                zfputs(buf, fp);
            }

            cram_free_slice(s);
        }

        cpos = htell(fd->fp);
        assert(cpos == hpos + c->length);

        cram_free_container(c);
        c = NULL;
    }

    // A NULL container with fd->err set is a read failure, not EOF.
    if (fd->err)
        goto err;

    return zfclose(fp);

 err:
    if (c)
        cram_free_container(c);
    zfclose(fp);
    return -1;
}
/*
 * Reads a version 3 CRAM file and replaces the header in-place,
 * provided the header is small enough to fit without growing the
 * entire file.
 *
 * Version 3 format has a SAM header held as an (optionally)
 * compressed block within the header container.  Additional
 * uncompressed blocks or simply unallocated space (the difference
 * between total block sizes and the container size) are used to
 * provide room for growth or contraction of the compressed header.
 *
 * If add_PG is non-zero a "samtools" @PG line is added to the new
 * header (with a CL field when arg_list is non-NULL).
 *
 * NOTE(review): the version guard below accepts major versions 2 and
 * 3 even though this routine is described as version 3 only; confirm
 * callers dispatch on the version before calling.
 *
 * Returns 0 on success;
 *        -1 on general failure;
 *        -2 on failure due to insufficient size
 */
int cram_reheader_inplace3(cram_fd *fd, const bam_hdr_t *h, const char *arg_list,
                          int add_PG)
{
    cram_container *c = NULL;
    cram_block *b = NULL;
    SAM_hdr *hdr = NULL;
    off_t start, sz, end;
    int container_sz, max_container_sz, old_container_sz;
    int header_len, rsz;
    int32_t *landmark;
    int32_t num_landmarks;
    char *buf = NULL;
    int ret = -1;

    if (cram_major_vers(fd) < 2 ||
        cram_major_vers(fd) > 3) {
        fprintf(stderr, "[%s] unsupported CRAM version %d\n", __func__,
                cram_major_vers(fd));
        goto err;
    }

    if (!(hdr = sam_hdr_parse_(h->text, h->l_text)))
        goto err;

    if (add_PG && sam_hdr_add_PG(hdr, "samtools", "VN", samtools_version(),
                                 arg_list ? "CL": NULL,
                                 arg_list ? arg_list : NULL,
                                 NULL))
        goto err;

    header_len = sam_hdr_length(hdr);
    /* Fix M5 strings? Maybe out of scope for this tool */

    // Find current size of SAM header block.  The header container is
    // expected at byte offset 26 (presumably just past the fixed size
    // CRAM file definition - see the CRAM specification).
    if ((start = hseek(cram_fd_get_fp(fd), 26, SEEK_SET)) != 26)
        goto err;

    if (!(c = cram_read_container(fd)))
        goto err;

    // +5 allows num_landmarks to increase from 0 to 1 (Cramtools)
    max_container_sz = cram_container_size(c)+5;

    sz = htell(cram_fd_get_fp(fd)) + cram_container_get_length(c) - start;
    end = htell(cram_fd_get_fp(fd)) + cram_container_get_length(c);

    // We force 1 block instead of (optionally) 2.  C CRAM
    // implementations for v3 were writing 1 compressed block followed
    // by 1 uncompressed block.  However this is tricky to deal with
    // as changing block sizes can mean the block header also changes
    // size due to itf8 and variable size integers.
    //
    // If we had 1 block, this doesn't change anything.
    // If we had 2 blocks, the new container header will be smaller by
    // 1+ bytes, requiring the cram_container_get_length(c) to be larger in value.
    // However this is an int32 instead of itf8 so the container
    // header structure stays the same size.  This means we can always
    // reduce the number of blocks without running into size problems.
    cram_container_set_num_blocks(c, 1);
    landmark = cram_container_get_landmarks(c, &num_landmarks);
    if (num_landmarks && landmark) {
        num_landmarks = 1;
        landmark[0] = 0;
    } else {
        num_landmarks = 0;
    }
    cram_container_set_landmarks(c, num_landmarks, landmark);

    // Check the allocation before anything writes through buf.
    buf = malloc(max_container_sz);
    if (!buf)
        goto err;

    container_sz = max_container_sz;
    if (cram_store_container(fd, c, buf, &container_sz) != 0)
        goto err;

    // Proposed new length, but changing cram_container_get_length(c) may change the
    // container_sz and thus the remainder (cram_container_get_length(c) itself).
    cram_container_set_length(c, sz - container_sz);

    old_container_sz = container_sz;
    container_sz = max_container_sz;
    if (cram_store_container(fd, c, buf, &container_sz) != 0)
        goto err;

    if (old_container_sz != container_sz) {
        fprintf(stderr, "Quirk of fate makes this troublesome! "
                "Please use non-inplace version.\n");
        goto err;
    }

    // Version 3.0 supports compressed header
    b = cram_new_block(FILE_HEADER, 0);
    if (!b)
        goto err;
    int32_put_blk(b, header_len);
    cram_block_append(b, sam_hdr_str(hdr), header_len);
    cram_block_update_size(b);

    if (cram_compress_block(fd, b, NULL, -1, -1) != 0)
        goto err;

    if (hseek(cram_fd_get_fp(fd), 26, SEEK_SET) != 26)
        goto err;

    if (cram_block_size(b) > cram_container_get_length(c)) {
        fprintf(stderr, "New header will not fit. Use non-inplace version"
                " (%d > %d)\n",
                (int)cram_block_size(b), cram_container_get_length(c));
        ret = -2;
        goto err;
    }

    if (cram_write_container(fd, c) == -1)
        goto err;

    if (cram_write_block(fd, b) == -1)
        goto err;

    // Blank out the remainder
    rsz = end - htell(cram_fd_get_fp(fd));
    assert(rsz >= 0);
    if (rsz) {
        char *rem = calloc(1, rsz);
        if (!rem)
            goto err;
        ret = hwrite(cram_fd_get_fp(fd), rem, rsz) == rsz ? 0 : -1;
        free(rem);
    } else {
        // New header filled the space exactly: nothing left to zero,
        // but the rewrite itself succeeded.
        ret = 0;
    }

 err:
    if (c) cram_free_container(c);
    free(buf);
    if (b) cram_free_block(b);
    if (hdr) sam_hdr_free(hdr);

    return ret;
}
/*
 * In-place header replacement for version 2 CRAM files.
 *
 * Version 2 format has an uncompressed SAM header with multiple nul
 * termination bytes to permit inline header editing.  The existing
 * header block is loaded, its text is overwritten with the new header
 * and the unused tail is zeroed, so the block (and hence the file)
 * keeps its size.  This only works when the new header, plus its
 * 4 byte length field, fits within the old block.
 *
 * If add_PG is non-zero a "samtools" @PG line is added to the new
 * header (with a CL field when arg_list is non-NULL).
 *
 * Returns 0 on success;
 *        -1 on general failure;
 *        -2 on failure due to insufficient size
 */
int cram_reheader_inplace2(cram_fd *fd, const bam_hdr_t *h, const char *arg_list,
                          int add_PG)
{
    cram_container *ctr = NULL;
    cram_block *b = NULL;
    SAM_hdr *sh = NULL;
    off_t start;
    int header_len;
    int status = -1;

    if (!(cram_major_vers(fd) >= 2 && cram_major_vers(fd) <= 3)) {
        fprintf(stderr, "[%s] unsupported CRAM version %d\n", __func__,
                cram_major_vers(fd));
        goto done;
    }

    sh = sam_hdr_parse_(h->text, h->l_text);
    if (!sh)
        goto done;

    if (add_PG) {
        if (sam_hdr_add_PG(sh, "samtools", "VN", samtools_version(),
                           arg_list ? "CL": NULL,
                           arg_list,
                           NULL))
            goto done;
    }

    header_len = sam_hdr_length(sh);
    /* Fix M5 strings? Maybe out of scope for this tool */

    // Load the existing header
    start = hseek(cram_fd_get_fp(fd), 26, SEEK_SET);
    if (start != 26)
        goto done;

    ctr = cram_read_container(fd);
    if (!ctr)
        goto done;

    // Version 2.1 has a single uncompressed block which is nul
    // terminated with many nuls to permit growth.
    //
    // So load old block and keep all contents identical bar the
    // header text itself
    b = cram_read_block(fd);
    if (!b)
        goto done;

    if (cram_block_get_uncomp_size(b) < header_len+4) {
        fprintf(stderr, "New header will not fit. Use non-inplace version (%d > %d)\n",
                header_len+4, cram_block_get_uncomp_size(b));
        status = -2;
        goto done;
    }

    // Rewind the block, then lay down the length field and the text.
    cram_block_set_offset(b, 0);
    int32_put_blk(b, header_len);
    cram_block_append(b, sam_hdr_str(sh), header_len);

    // Zero the remaining block
    memset(cram_block_get_data(b)+cram_block_get_offset(b), 0,
           cram_block_get_uncomp_size(b) - cram_block_get_offset(b));

    // Make sure all sizes and byte-offsets are consistent after memset
    cram_block_set_offset(b, cram_block_get_uncomp_size(b));
    cram_block_set_comp_size(b, cram_block_get_uncomp_size(b));

    // Go back to where the container started and write it all out.
    if (hseek(cram_fd_get_fp(fd), start, SEEK_SET) != start)
        goto done;

    if (cram_write_container(fd, ctr) == -1)
        goto done;

    if (cram_write_block(fd, b) == -1)
        goto done;

    status = 0;

 done:
    if (ctr)
        cram_free_container(ctr);
    if (b)
        cram_free_block(b);
    if (sh)
        sam_hdr_free(sh);

    return status;
}
/* * CRAM files don't store the RG:Z:ID per read in the aux field. * Instead they have a numerical data series (RG) to point each read * back to the Nth @RG line in the file. This means that we may need * to edit the RG data series (if the files were produced from * "samtools split" for example). * * The encoding method is stored in the compression header. Typical * examples: * * RG => EXTERNAL {18} # Block content-id 18 holds RG values * # as a series of ITF8 encoded values * * RG => HUFFMAN {1, 255, 255, 255, 255, 255, 1, 0} * # One RG value #-1. (No RG) * * RG => HUFFMAN {1, 0, 1, 0} # One RG value #0 (always first RG) * * RG => HUFFMAN {2, 0, 1, 2, 1, 1} * # Two RG values, #0 and #1, written * # to the CORE block and possibly * # mixed with other data series. * * A single value can (but may not be) implemented as a zero bit * huffman code. In this situation we can change the meta-data in the * compression header to renumber an RG value.. */ int cram_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outcram) { samFile *out; cram_fd *out_c; int i, vers_maj, vers_min; khash_s2i *rg2id = NULL; bam_hdr_t *new_h = NULL; /* Check consistent versioning and compatible headers */ if (!(new_h = cram_cat_check_hdr(nfn, fn, h, &rg2id, &vers_maj, &vers_min))) return -1; /* Open the file with cram_vers */ char vers[100]; sprintf(vers, "%d.%d", vers_maj, vers_min); out = sam_open(outcram, "wc"); if (out == 0) { fprintf(stderr, "[%s] ERROR: fail to open output file '%s'.\n", __func__, outcram); return 1; } out_c = out->fp.cram; cram_set_option(out_c, CRAM_OPT_VERSION, vers); //fprintf(stderr, "Creating cram vers %s\n", vers); cram_fd_set_header(out_c, sam_hdr_parse_(new_h->text, new_h->l_text)); // needed? 
sam_hdr_write(out, new_h); for (i = 0; i < nfn; ++i) { samFile *in; cram_fd *in_c; cram_container *c; bam_hdr_t *old; int new_rg = -1; in = sam_open(fn[i], "rc"); if (in == 0) { fprintf(stderr, "[%s] ERROR: fail to open file '%s'.\n", __func__, fn[i]); return -1; } in_c = in->fp.cram; old = sam_hdr_read(in); khash_s2i *rg2id_in = hash_rg(old); // Compute RG mapping if suitable for changing. if (rg2id_in->n_id == 1) { int _; new_rg = hash_s2i_inc(rg2id, rg2id_in->id[0], NULL, &_); } else { new_rg = 0; } hash_s2i_free(rg2id_in); // Copy contains and blocks within them while ((c = cram_read_container(in_c))) { cram_block *blk; if (cram_container_is_empty(in_c)) { if (cram_write_container(out_c, c) != 0) return -1; // Container compression header if (!(blk = cram_read_block(in_c))) return -1; if (cram_write_block(out_c, blk) != 0) { cram_free_block(blk); return -1; } cram_free_block(blk); cram_free_container(c); continue; } // If we have just one RG key and new_rg != 0 then // we need to edit the compression header. IF WE CAN. if (new_rg) { int zero = 0; //fprintf(stderr, "Transcode RG %d to %d\n", 0, new_rg); cram_transcode_rg(in_c, out_c, c, 1, &zero, &new_rg); } else { int32_t num_slices; // Not switching rg so do the usual read/write loop if (cram_write_container(out_c, c) != 0) return -1; // Container compression header if (!(blk = cram_read_block(in_c))) return -1; if (cram_write_block(out_c, blk) != 0) { cram_free_block(blk); return -1; } cram_free_block(blk); // Container num_blocks can be invalid, due to a bug. // Instead we iterate in slice context instead. (void)cram_container_get_landmarks(c, &num_slices); cram_copy_slice(in_c, out_c, num_slices); } cram_free_container(c); } bam_hdr_destroy(old); sam_close(in); } sam_close(out); hash_s2i_free(rg2id); bam_hdr_destroy(new_h); return 0; }