/*
 * Builds an index file.
 *
 * fd is a newly opened cram file that we wish to index.
 * fn_base is the filename of the associated CRAM file.  Internally we
 * add ".crai" to this to get the index filename.
 *
 * The file is walked container by container.  For every slice one
 * tab-separated line is written holding: ref_seq_id, ref_seq_start,
 * ref_seq_span, container offset, landmark and slice size in bytes.
 * Slices whose ref_seq_id is -2 are delegated to
 * cram_index_build_multiref() instead.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int cram_index_build(cram_fd *fd, const char *fn_base) {
    cram_container *c = NULL;
    off_t cpos, spos, hpos;
    zfp *fp;
    char fn_idx[PATH_MAX];

    /* Room is needed for ".crai" (5 bytes) plus the terminating NUL. */
    if (strlen(fn_base) > PATH_MAX-6)
        return -1;

    snprintf(fn_idx, sizeof fn_idx, "%s.crai", fn_base);
    if (!(fp = zfopen(fn_idx, "wz"))) {
        perror(fn_idx);
        return -1;
    }

    /* cpos tracks the file offset at which the current container starts. */
    cpos = htell(fd->fp);
    while ((c = cram_read_container(fd))) {
        int j;

        if (fd->err) {
            perror("Cram container read");
            goto fail;
        }

        /* hpos: offset just past the container header. */
        hpos = htell(fd->fp);

        if (!(c->comp_hdr_block = cram_read_block(fd)))
            goto fail;
        /*
         * NOTE(review): these asserts validate file content; with NDEBUG
         * they vanish and without it a malformed file aborts the process.
         * Consider turning them into regular error returns.
         */
        assert(c->comp_hdr_block->content_type == COMPRESSION_HEADER);

        c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block);
        if (!c->comp_hdr)
            goto fail;

        // 2.0 format
        for (j = 0; j < c->num_landmarks; j++) {
            char buf[1024];
            cram_slice *s;
            int sz;

            spos = htell(fd->fp);
            assert(spos - cpos - c->offset == c->landmark[j]);

            if (!(s = cram_read_slice(fd)))
                goto fail;

            /* Size of the slice as stored on disk. */
            sz = (int)(htell(fd->fp) - spos);

            if (s->hdr->ref_seq_id == -2) {
                /*
                 * NOTE(review): any status reported by
                 * cram_index_build_multiref() is not inspected here;
                 * its declaration is not visible from this function.
                 */
                cram_index_build_multiref(fd, c, s, fp,
                                          cpos, c->landmark[j], sz);
            } else {
                snprintf(buf, sizeof buf,
                         "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n",
                         s->hdr->ref_seq_id, s->hdr->ref_seq_start,
                         s->hdr->ref_seq_span, (int64_t)cpos,
                         c->landmark[j], sz);
                zfputs(buf, fp);
            }

            cram_free_slice(s);
        }

        cpos = htell(fd->fp);
        assert(cpos == hpos + c->length);

        cram_free_container(c);
    }

    /* cram_read_container() returned NULL: distinguish EOF from error. */
    if (fd->err) {
        zfclose(fp);
        return -1;
    }

    return zfclose(fp);

 fail:
    /*
     * Common failure exit: release the container being processed and
     * close the index so neither leaks.
     * NOTE(review): relies on cram_free_container() coping with a
     * container whose comp_hdr_block / comp_hdr are still unset - confirm.
     */
    cram_free_container(c);
    zfclose(fp);
    return -1;
}
/*
 * Renumbers RG numbers in a cram compression header.
 *
 * CRAM stores RG as the Nth number in the header, rather than a
 * string holding the ID: tag.  This is smaller in space, but means
 * "samtools cat" to join files together that contain single but
 * different RG lines needs a way of renumbering them.
 *
 * The file descriptor is expected to be immediately after the
 * cram_container structure (ie before the cram compression header).
 * Due to the nature of the CRAM format, this needs to read and write
 * the blocks itself.  Note that there may be multiple slices within
 * the container, meaning multiple compression headers to manipulate.
 * Changing RG may change the size of the compression header and
 * therefore the length field in the container.  Hence we rewrite all
 * blocks just in case and also emit the adjusted container.
 *
 * The current implementation can only cope with renumbering a single
 * RG (and only then if it is using HUFFMAN or BETA codecs).  In
 * theory it *may* be possible to renumber multiple RGs if they use
 * HUFFMAN to the CORE block or use an external block unshared by any
 * other data series.  So we have an API that can be upgraded to
 * support this, but do not implement it for now.  An example
 * implementation of RG as an EXTERNAL block would be to find that
 * block and rewrite it, returning the number of blocks consumed.
 *
 * in_rg is currently unused; only out_rg[0] is read.  nrg values
 * other than 1 are rejected with -2.
 *
 * Returns 0 on success;
 *        -1 if unable to edit;
 *        -2 on other errors (eg I/O).
 */
int cram_transcode_rg(cram_fd *in, cram_fd *out,
                      cram_container *c,
                      int nrg, int *in_rg, int *out_rg) {
    int new_rg, old_size, new_size;
    int ret = -2;                 /* default for I/O and format errors */
    cram_block *o_blk = NULL, *n_blk = NULL;
    cram_block_compression_hdr *ch = NULL;
    char *cp, *op, *endp;
    int32_t i32, *landmarks, num_landmarks;
    int n, k;

    if (nrg != 1) {
        hts_log_error("CRAM transcode supports only a single RG");
        return -2;
    }
    /* Only dereference out_rg once we know the request is supported. */
    new_rg = *out_rg;

    // Produce a new block holding the updated compression header,
    // with RG transcoded to a new value.  (Single only supported.)
    if (!(o_blk = cram_read_block(in)))
        goto done;
    old_size = cram_block_size(o_blk);

    if (!(ch = cram_decode_compression_header(in, o_blk)))
        goto done;

    if (cram_block_compression_hdr_set_rg(ch, new_rg) != 0) {
        ret = -1;
        goto done;
    }
    /*
     * NOTE(review): assumes cram_block_compression_hdr_decoder2encoder()
     * returns an int that is 0 on success - confirm against its
     * declaration.
     */
    if (cram_block_compression_hdr_decoder2encoder(in, ch) != 0) {
        ret = -1;
        goto done;
    }

    n_blk = cram_encode_compression_header(in, c, ch);
    cram_free_compression_header(ch);
    ch = NULL;
    if (!n_blk)
        goto done;

    /*
     * Warning: this has internal knowledge of the cram compression
     * header format.
     *
     * The decoder doesn't set c->tags_used, so the encoder puts a two
     * byte blank segment.  This means n_blk is too short.  We skip
     * through the decoded old block (o_blk) and copy from there.
     *
     * The old block is treated as three consecutive sections, each an
     * ITF8 byte count followed by that many bytes.  The first two are
     * skipped; the third (length prefix included) replaces the blank
     * two byte segment at the end of n_blk.  Every length is checked
     * against the end of the block so corrupt input cannot push the
     * cursor out of bounds.
     */
    cp   = cram_block_get_data(o_blk);
    endp = cp + cram_block_get_uncomp_size(o_blk);

    for (k = 0; k < 2; k++) {
        /* A successful decode consumes at least one byte. */
        n = safe_itf8_get(cp, endp, &i32);
        if (n <= 0 || i32 < 0 || i32 > endp - (cp + n))
            goto done;
        cp += n + i32;
    }

    op = cp;
    n = safe_itf8_get(cp, endp, &i32);
    if (n <= 0 || i32 < 0 || i32 > endp - (cp + n))
        goto done;
    cp += n;
    i32 += (cp-op);               /* payload plus its own length prefix */

    cram_block_set_size(n_blk, cram_block_get_size(n_blk)-2);
    cram_block_append(n_blk, op, i32);
    cram_block_update_size(n_blk);

    new_size = cram_block_size(n_blk);

    // Now we've constructed the updated compression header,
    // amend the container too (it may have changed size).
    landmarks = cram_container_get_landmarks(c, &num_landmarks);

    if (old_size != new_size) {
        int diff = new_size - old_size, j;

        /*
         * The landmark array is adjusted in place and never written
         * back through a setter; presumably it aliases the container's
         * own storage - verify against cram_container_get_landmarks().
         */
        for (j = 0; j < num_landmarks; j++)
            landmarks[j] += diff;
        cram_container_set_length(c, cram_container_get_length(c) + diff);
    }

    // Finally write it all out; container, compression header,
    // and then all the remaining slice blocks.
    if (cram_write_container(out, c) != 0)
        goto done;

    /*
     * NOTE(review): assumes cram_write_block() reports 0 on success in
     * the same way as cram_write_container() - confirm.
     */
    if (cram_write_block(out, n_blk) != 0)
        goto done;

    // Container num_blocks can be invalid, due to a bug.
    // Instead we iterate in slice context instead.
    ret = cram_copy_slice(in, out, num_landmarks);

 done:
    /*
     * Single exit: release whatever was acquired.  The guards are kept
     * because NULL-safety of these project helpers is not visible here.
     */
    if (ch)
        cram_free_compression_header(ch);
    if (n_blk)
        cram_free_block(n_blk);
    if (o_blk)
        cram_free_block(o_blk);
    return ret;
}