/*
 * Repositions the stream position of an hFILE.
 *
 * offset and whence are interpreted as for fseek(): whence is one of
 * SEEK_SET, SEEK_CUR or SEEK_END.
 *
 * Returns the new stream position on success.  On failure a negative value
 * is returned: -1 with errno (and fp->has_errno) set to EINVAL or EOVERFLOW
 * when the requested position is out of range, the negative result of
 * flush_buffer() when flushing pending writes fails, or the negative result
 * of the backend's seek (with fp->has_errno set from errno).
 */
off_t hseek(hFILE *fp, off_t offset, int whence)
{
    off_t curpos, pos;

    // Pending written data must reach the backend before the position moves.
    if (writebuffer_is_nonempty(fp) && fp->mobile) {
        int ret = flush_buffer(fp);
        if (ret < 0) return ret;
    }

    curpos = htell(fp);

    // Relative offsets are given relative to the hFILE's stream position,
    // which may differ from the backend's physical position due to buffering
    // read-ahead.  Correct for this by converting to an absolute position.
    if (whence == SEEK_CUR) {
        // Largest off_t value, computed without relying on a non-standard
        // OFF_MAX macro.  Assumes 8-bit bytes (as POSIX requires) and that
        // off_t has no padding bits and is no wider than unsigned long long.
        const off_t off_max = (off_t)
            (~0ULL >> (8 * (sizeof (unsigned long long) - sizeof (off_t)) + 1));

        // Test for overflow *before* adding: signed overflow is undefined
        // behaviour, so the sum itself cannot be used to detect it.
        int overflows = offset > 0 && curpos > 0 && offset > off_max - curpos;

        if (overflows || curpos + offset < 0) {
            // Either a negative offset resulted in a position before the
            // start of the file, or a positive offset would have overflowed
            fp->has_errno = errno = (offset < 0)? EINVAL : EOVERFLOW;
            return -1;
        }

        whence = SEEK_SET;
        offset = curpos + offset;
    }
    // For fixed immobile buffers, convert everything else to SEEK_SET too
    // so that seeking can be avoided for all (within range) requests.
    else if (! fp->mobile && whence == SEEK_END) {
        size_t length = fp->end - fp->buffer;

        // Compare against -length rather than negating offset, which would
        // overflow for the most negative off_t value.
        if (offset > 0 || offset < -(off_t) length) {
            fp->has_errno = errno = EINVAL;
            return -1;
        }

        whence = SEEK_SET;
        offset = (off_t) length + offset;
    }

    // Avoid seeking if the desired position is within our read buffer.
    // (But not when the next operation may be a write on a mobile buffer.)
    if (whence == SEEK_SET && (! fp->mobile || fp->readonly) &&
        offset >= fp->offset && offset - fp->offset <= fp->end - fp->buffer) {
        fp->begin = &fp->buffer[offset - fp->offset];
        return offset;
    }

    pos = fp->backend->seek(fp, offset, whence);
    if (pos < 0) { fp->has_errno = errno; return pos; }

    // Seeking succeeded, so discard any non-empty read buffer
    fp->begin = fp->end = fp->buffer;
    fp->at_eof = 0;

    fp->offset = pos;
    return pos;
}
/*
 * Verifies that the current stream position of f, as reported by htell(),
 * equals off.  message labels the check in any diagnostic output.
 *
 * A negative htell() result is reported through fail(); a mismatch prints
 * the expected and actual offsets to stderr and exits with EXIT_FAILURE.
 */
void check_offset(hFILE *f, off_t off, const char *message)
{
    const off_t actual = htell(f);

    if (actual < 0)
        fail("htell(%s)", message);

    if (actual != off) {
        fprintf(stderr, "%s offset incorrect: expected %ld but got %ld\n",
                message, (long)off, (long)actual);
        exit(EXIT_FAILURE);
    }
}
/*
 * Builds an index file.
 *
 * fd is a newly opened cram file that we wish to index.
 * fn_base is the filename of the associated CRAM file. Internally we
 * add ".crai" to this to get the index filename.
 *
 * One index line is written per slice, except for slices whose reference
 * id is -2, which are handed to cram_index_build_multiref().
 *
 * Every failure path closes the index stream and frees the container
 * currently being processed.
 *
 * Returns 0 on success
 *        -1 on failure
 */
int cram_index_build(cram_fd *fd, const char *fn_base) {
    cram_container *c = NULL;
    off_t cpos, spos, hpos;
    zfp *fp;
    char fn_idx[PATH_MAX];

    // Leave room for ".crai" plus the terminating NUL.
    if (strlen(fn_base) > PATH_MAX-6)
        return -1;

    snprintf(fn_idx, sizeof fn_idx, "%s.crai", fn_base);
    if (!(fp = zfopen(fn_idx, "wz"))) {
        perror(fn_idx);
        return -1;
    }

    cpos = htell(fd->fp);
    while ((c = cram_read_container(fd))) {
        int j;

        if (fd->err) {
            perror("Cram container read");
            goto fail;
        }

        // Position just after the container header; used below to check
        // that the container's recorded length matches what was consumed.
        hpos = htell(fd->fp);

        if (!(c->comp_hdr_block = cram_read_block(fd)))
            goto fail;
        assert(c->comp_hdr_block->content_type == COMPRESSION_HEADER);

        c->comp_hdr = cram_decode_compression_header(fd, c->comp_hdr_block);
        if (!c->comp_hdr)
            goto fail;

        // 2.0 format
        for (j = 0; j < c->num_landmarks; j++) {
            char buf[1024];
            cram_slice *s;
            int sz;

            spos = htell(fd->fp);
            assert(spos - cpos - c->offset == c->landmark[j]);

            if (!(s = cram_read_slice(fd)))
                goto fail;

            // Number of bytes the slice occupies in the file.
            sz = (int)(htell(fd->fp) - spos);

            if (s->hdr->ref_seq_id == -2) {
                // NOTE(review): the outcome of cram_index_build_multiref()
                // is not checked here; confirm whether it can fail.
                cram_index_build_multiref(fd, c, s, fp,
                                          cpos, c->landmark[j], sz);
            } else {
                sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n",
                        s->hdr->ref_seq_id, s->hdr->ref_seq_start,
                        s->hdr->ref_seq_span, (int64_t)cpos,
                        c->landmark[j], sz);
                zfputs(buf, fp);
            }

            cram_free_slice(s);
        }

        cpos = htell(fd->fp);
        assert(cpos == hpos + c->length);

        cram_free_container(c);
    }

    // cram_read_container() returned NULL: distinguish EOF from a read error.
    if (fd->err)
        goto fail;

    return zfclose(fp);

 fail:
    if (c)
        cram_free_container(c);
    zfclose(fp);
    return -1;
}
/*
 * Reads a version 3 CRAM file and replaces the header in-place,
 * provided the header is small enough to fit without growing the
 * entire file.
 *
 * Version 3 format has a SAM header held as an (optionally)
 * compressed block within the header container.  Additional
 * uncompressed blocks or simply unallocated space (the difference
 * between total block sizes and the container size) are used to
 * provide room for growth or contraction of the compressed header.
 *
 * fd       is the CRAM file to rewrite.
 * h        supplies the replacement header text.
 * arg_list if non-NULL, is recorded as the CL field of the added @PG line.
 * add_PG   if non-zero, a "samtools" @PG line is added to the new header.
 *
 * Returns 0 on success;
 *        -1 on general failure;
 *        -2 on failure due to insufficient size
 */
int cram_reheader_inplace3(cram_fd *fd, const bam_hdr_t *h, const char *arg_list,
                           int add_PG)
{
    cram_container *c = NULL;
    cram_block *b = NULL;
    SAM_hdr *hdr = NULL;
    off_t start, sz, end;
    int container_sz, max_container_sz;
    char *buf = NULL;
    int ret = -1;

    if (cram_major_vers(fd) < 2 ||
        cram_major_vers(fd) > 3) {
        fprintf(stderr, "[%s] unsupported CRAM version %d\n", __func__,
                cram_major_vers(fd));
        goto err;
    }

    if (!(hdr = sam_hdr_parse_(h->text, h->l_text)))
        goto err;

    if (add_PG && sam_hdr_add_PG(hdr, "samtools", "VN", samtools_version(),
                                 arg_list ? "CL": NULL,
                                 arg_list ? arg_list : NULL,
                                 NULL))
        goto err;

    int header_len = sam_hdr_length(hdr);
    /* Fix M5 strings? Maybe out of scope for this tool */

    // Find current size of SAM header block
    if ((start = hseek(cram_fd_get_fp(fd), 26, SEEK_SET)) != 26)
        goto err;

    if (!(c = cram_read_container(fd)))
        goto err;

    // +5 allows num_landmarks to increase from 0 to 1 (Cramtools)
    max_container_sz = cram_container_size(c)+5;

    sz = htell(cram_fd_get_fp(fd)) + cram_container_get_length(c) - start;
    end = htell(cram_fd_get_fp(fd)) + cram_container_get_length(c);

    // We force 1 block instead of (optionally) 2.  C CRAM
    // implementations for v3 were writing 1 compressed block followed
    // by 1 uncompressed block.  However this is tricky to deal with
    // as changing block sizes can mean the block header also changes
    // size due to itf8 and variable size integers.
    //
    // If we had 1 block, this doesn't change anything.
    // If we had 2 blocks, the new container header will be smaller by
    // 1+ bytes, requiring the cram_container_get_length(c) to be larger in value.
    // However this is an int32 instead of itf8 so the container
    // header structure stays the same size.  This means we can always
    // reduce the number of blocks without running into size problems.
    cram_container_set_num_blocks(c, 1);

    int32_t *landmark;
    int32_t num_landmarks;
    landmark = cram_container_get_landmarks(c, &num_landmarks);
    if (num_landmarks && landmark) {
        num_landmarks = 1;
        landmark[0] = 0;
    } else {
        num_landmarks = 0;
    }
    cram_container_set_landmarks(c, num_landmarks, landmark);

    // Check the allocation before the buffer is handed to
    // cram_store_container().
    buf = malloc(max_container_sz);
    if (!buf)
        goto err;

    container_sz = max_container_sz;
    if (cram_store_container(fd, c, buf, &container_sz) != 0)
        goto err;

    // Proposed new length, but changing cram_container_get_length(c) may change the
    // container_sz and thus the remainder (cram_container_get_length(c) itself).
    cram_container_set_length(c, sz - container_sz);

    int old_container_sz = container_sz;
    container_sz = max_container_sz;
    if (cram_store_container(fd, c, buf, &container_sz) != 0)
        goto err;

    if (old_container_sz != container_sz) {
        fprintf(stderr, "Quirk of fate makes this troublesome! "
                "Please use non-inplace version.\n");
        goto err;
    }

    // Version 3.0 supports compressed header
    b = cram_new_block(FILE_HEADER, 0);
    if (!b)
        goto err;
    int32_put_blk(b, header_len);
    cram_block_append(b, sam_hdr_str(hdr), header_len);
    cram_block_update_size(b);

    // NOTE(review): the outcome of cram_compress_block() is not checked;
    // confirm whether it can fail.
    cram_compress_block(fd, b, NULL, -1, -1);

    if (hseek(cram_fd_get_fp(fd), 26, SEEK_SET) != 26)
        goto err;

    if (cram_block_size(b) > cram_container_get_length(c)) {
        fprintf(stderr, "New header will not fit. Use non-inplace version"
                " (%d > %d)\n",
                (int)cram_block_size(b), cram_container_get_length(c));
        ret = -2;
        goto err;
    }

    if (cram_write_container(fd, c) == -1)
        goto err;

    if (cram_write_block(fd, b) == -1)
        goto err;

    // Blank out the remainder
    int rsz = end - htell(cram_fd_get_fp(fd));
    assert(rsz >= 0);
    if (rsz < 0)
        goto err;   // still guarded when assertions are compiled out

    if (rsz) {
        char *rem = calloc(1, rsz);
        if (!rem)
            goto err;
        ret = hwrite(cram_fd_get_fp(fd), rem, rsz) == rsz ? 0 : -1;
        free(rem);
    } else {
        // The new header fills the available space exactly; nothing is
        // left to blank out, and everything was written successfully.
        ret = 0;
    }

 err:
    // Shared cleanup for success and failure; ret holds the outcome.
    if (c) cram_free_container(c);
    free(buf);
    if (b) cram_free_block(b);
    if (hdr) sam_hdr_free(hdr);

    return ret;
}