static void naive_concat(args_t *args) { // only compressed BCF atm BGZF *bgzf_out = bgzf_open(args->output_fname,"w");; const size_t page_size = 32768; char *buf = (char*) malloc(page_size); kstring_t tmp = {0,0,0}; int i; for (i=0; i<args->nfnames; i++) { htsFile *hts_fp = hts_open(args->fnames[i],"r"); if ( !hts_fp ) error("Failed to open: %s\n", args->fnames[i]); htsFormat type = *hts_get_format(hts_fp); if ( type.format==vcf ) error("The --naive option currently works only for compressed BCFs, sorry :-/\n"); if ( type.compression!=bgzf ) error("The --naive option currently works only for compressed BCFs, sorry :-/\n"); BGZF *fp = hts_get_bgzfp(hts_fp); if ( !fp || bgzf_read_block(fp) != 0 || !fp->block_length ) error("Failed to read %s: %s\n", args->fnames[i], strerror(errno)); uint8_t magic[5]; if ( bgzf_read(fp, magic, 5) != 5 ) error("Failed to read the BCF header in %s\n", args->fnames[i]); if (strncmp((char*)magic, "BCF\2\2", 5) != 0) error("Invalid BCF magic string in %s\n", args->fnames[i]); if ( bgzf_read(fp, &tmp.l, 4) != 4 ) error("Failed to read the BCF header in %s\n", args->fnames[i]); hts_expand(char,tmp.l,tmp.m,tmp.s); if ( bgzf_read(fp, tmp.s, tmp.l) != tmp.l ) error("Failed to read the BCF header in %s\n", args->fnames[i]); // write only the first header if ( i==0 ) { if ( bgzf_write(bgzf_out, "BCF\2\2", 5) !=5 ) error("Failed to write %d bytes to %s\n", 5,args->output_fname); if ( bgzf_write(bgzf_out, &tmp.l, 4) !=4 ) error("Failed to write %d bytes to %s\n", 4,args->output_fname); if ( bgzf_write(bgzf_out, tmp.s, tmp.l) != tmp.l) error("Failed to write %d bytes to %s\n", tmp.l,args->output_fname); } // Output all non-header data that were read together with the header block int nskip = fp->block_offset; if ( fp->block_length - nskip > 0 ) { if ( bgzf_write(bgzf_out, fp->uncompressed_block+nskip, fp->block_length-nskip)<0 ) error("Error: %d\n",fp->errcode); } if ( bgzf_flush(bgzf_out)<0 ) error("Error: %d\n",bgzf_out->errcode); // Stream the 
rest of the file as it is, without recompressing, but remove BGZF EOF blocks ssize_t nread, ncached = 0, nwr; const int neof = 28; char cached[neof]; while (1) { nread = bgzf_raw_read(fp, buf, page_size); // page_size boundary may occur in the middle of the EOF block, so we need to cache the blocks' ends if ( nread<=0 ) break; if ( nread<=neof ) // last block { if ( ncached ) { // flush the part of the cache that won't be needed nwr = bgzf_raw_write(bgzf_out, cached, nread); if (nwr != nread) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)nread); // make space in the cache so that we can append to the end if ( nread!=neof ) memmove(cached,cached+nread,neof-nread); } // fill the cache and check for eof outside this loop memcpy(cached+neof-nread,buf,nread); break; } // not the last block, flush the cache if full if ( ncached ) { nwr = bgzf_raw_write(bgzf_out, cached, ncached); if (nwr != ncached) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)ncached); ncached = 0; } // fill the cache nread -= neof; memcpy(cached,buf+nread,neof); ncached = neof; nwr = bgzf_raw_write(bgzf_out, buf, nread); if (nwr != nread) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)nread); } if ( ncached && memcmp(cached,"\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0",neof) ) { nwr = bgzf_raw_write(bgzf_out, cached, neof); if (nwr != neof) error("Write failed, wrote %d instead of %d bytes.\n", nwr,(int)neof); } if (hts_close(hts_fp)) error("Close failed: %s\n",args->fnames[i]); } free(buf); free(tmp.s); if (bgzf_close(bgzf_out) < 0) error("Error: %d\n",bgzf_out->errcode); }
/*
 *  Replace the header of a BGZF-compressed text file and write the result to
 *  the output opened with bgzf_open("-", "w").
 *
 *  fname   .. input file
 *  header  .. plain-text file with the new header
 *  ftype   .. file type; only IS_TXT (or 0) is handled
 *  conf    .. conf->meta_char is the character which starts a header line
 *
 *  The header lines are skipped in the input, the new header is written
 *  followed by the data which shared a BGZF block with the old header, and
 *  the remaining compressed blocks are copied as raw bytes.
 *
 *  Returns 0 on success and -1 when the input cannot be opened or its first
 *  block cannot be read; all other failures go through error().
 */
int reheader_file(const char *fname, const char *header, int ftype, tbx_conf_t *conf)
{
    if ( ftype & IS_TXT || !ftype )
    {
        BGZF *fp = bgzf_open(fname,"r");
        if ( !fp ) return -1;
        if ( bgzf_read_block(fp) != 0 || !fp->block_length )
        {
            bgzf_close(fp);     // do not leak the handle on the early return
            return -1;
        }

        char *buffer = (char*) fp->uncompressed_block;
        int skip_until = 0;

        // Skip the header: find out the position of the data block
        if ( buffer[0]==conf->meta_char )
        {
            skip_until = 1;
            while (1)
            {
                if ( buffer[skip_until]=='\n' )
                {
                    skip_until++;
                    if ( skip_until>=fp->block_length )
                    {
                        if ( bgzf_read_block(fp) != 0 || !fp->block_length )
                            error("FIXME: No body in the file: %s\n", fname);
                        skip_until = 0;
                    }
                    // The header has finished
                    if ( buffer[skip_until]!=conf->meta_char ) break;
                }
                skip_until++;
                if ( skip_until>=fp->block_length )
                {
                    if (bgzf_read_block(fp) != 0 || !fp->block_length)
                        error("FIXME: No body in the file: %s\n", fname);
                    skip_until = 0;
                }
            }
        }

        // Output the new header
        FILE *hdr = fopen(header,"r");
        if ( !hdr ) error("%s: %s", header,strerror(errno));
        const size_t page_size = 32768;
        char *buf = malloc(page_size);
        if ( !buf ) error("Failed to allocate %d bytes\n", (int)page_size);
        BGZF *bgzf_out = bgzf_open("-", "w");
        if ( !bgzf_out ) error("Failed to open the output stream for writing\n");

        // Remember the last byte written so that the missing trailing newline
        // is detected regardless of how the file size relates to page_size.
        char last_char = '\n';
        size_t nhdr;
        while ( (nhdr=fread(buf,1,page_size,hdr))>0 )
        {
            if (bgzf_write(bgzf_out, buf, nhdr) < 0) error("Error: %d\n",bgzf_out->errcode);
            last_char = buf[nhdr-1];
        }
        if ( ferror(hdr) ) error("Failed to read %s\n", header);
        if ( last_char!='\n' && bgzf_write(bgzf_out, "\n", 1) < 0 ) error("Error: %d\n",bgzf_out->errcode);
        if ( fclose(hdr) ) error("close failed: %s\n", header);

        // Output all remaining data read with the header block
        if ( fp->block_length - skip_until > 0 )
        {
            // the write goes to bgzf_out, report its error code rather than the input's
            if (bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until) < 0)
                error("Error: %d\n",bgzf_out->errcode);
        }
        if (bgzf_flush(bgzf_out) < 0) error("Error: %d\n",bgzf_out->errcode);

        // Copy the rest of the file as raw bytes
        while (1)
        {
            ssize_t nread = bgzf_raw_read(fp, buf, page_size);
            if ( nread<0 ) error("Failed to read %s\n", fname);
            if ( nread==0 ) break;

            ssize_t count = bgzf_raw_write(bgzf_out, buf, nread);
            if (count != nread) error("Write failed, wrote %d instead of %d bytes.\n", (int)count,(int)nread);
        }

        // The handles must not be inspected after bgzf_close(): they may
        // already have been released by then, whether or not the call succeeded.
        if (bgzf_close(bgzf_out) < 0) error("Error: failed to close the output stream\n");
        if (bgzf_close(fp) < 0) error("Error: failed to close %s\n", fname);
        free(buf);
    }
    else
        error("todo: reheader BCF, BAM\n");  // BCF is difficult, records contain pointers to the header.
    return 0;
}
int reheader_file(const char *header, const char *file, int meta) { BGZF *fp = bgzf_open(file,"r"); if (bgzf_read_block(fp) != 0 || !fp->block_length) return -1; char *buffer = fp->uncompressed_block; int skip_until = 0; if ( buffer[0]==meta ) { skip_until = 1; // Skip the header while (1) { if ( buffer[skip_until]=='\n' ) { skip_until++; if ( skip_until>=fp->block_length ) { if (bgzf_read_block(fp) != 0 || !fp->block_length) error("no body?\n"); skip_until = 0; } // The header has finished if ( buffer[skip_until]!=meta ) break; } skip_until++; if ( skip_until>=fp->block_length ) { if (bgzf_read_block(fp) != 0 || !fp->block_length) error("no body?\n"); skip_until = 0; } } } FILE *fh = fopen(header,"r"); if ( !fh ) error("%s: %s", header,strerror(errno)); int page_size = getpagesize(); char *buf = valloc(page_size); BGZF *bgzf_out = bgzf_fdopen(fileno(stdout), "w"); ssize_t nread; while ( (nread=fread(buf,1,page_size-1,fh))>0 ) { if ( nread<page_size-1 && buf[nread-1]!='\n' ) buf[nread++] = '\n'; if (bgzf_write(bgzf_out, buf, nread) < 0) error("Error: %s\n",bgzf_out->error); } fclose(fh); if ( fp->block_length - skip_until > 0 ) { if (bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until) < 0) error("Error: %s\n",fp->error); } if (bgzf_flush(bgzf_out) < 0) error("Error: %s\n",bgzf_out->error); while (1) { #ifdef _USE_KNETFILE nread = knet_read(fp->x.fpr, buf, page_size); #else nread = fread(buf, 1, page_size, fp->file); #endif if ( nread<=0 ) break; #ifdef _USE_KNETFILE int count = fwrite(buf, 1, nread, bgzf_out->x.fpw); #else int count = fwrite(buf, 1, nread, bgzf_out->file); #endif if (count != nread) error("Write failed, wrote %d instead of %d bytes.\n", count,(int)nread); } if (bgzf_close(bgzf_out) < 0) error("Error: %s\n",bgzf_out->error); return 0; }
/*
 *  Rewrite the header of the BGZF-compressed VCF args->fname and write the
 *  result to stdout.
 *
 *  The existing header is collected from the input. It is replaced by the
 *  contents of args->header_fname when that is set, and passed together
 *  with the lines of args->samples_fname to set_samples() when that is set.
 *  The data which shared a BGZF block with the old header are recompressed,
 *  the remaining compressed blocks are copied as raw bytes.
 *
 *  All failures are reported through error(); nothing is returned.
 */
static void reheader_vcf_gz(args_t *args)
{
    BGZF *fp = bgzf_open(args->fname,"r");
    if ( !fp || bgzf_read_block(fp) != 0 || !fp->block_length )
        error("Failed to read %s: %s\n", args->fname, strerror(errno));

    kstring_t hdr = {0,0,0};
    char *buffer = (char*) fp->uncompressed_block;

    // Read the header and find the position of the data block
    if ( buffer[0]!='#' )
        error("Could not parse the header, expected '#', found '%c'\n", buffer[0]);

    int skip_until = 1;     // end of the header in the current uncompressed block
    while (1)
    {
        if ( buffer[skip_until]=='\n' )
        {
            skip_until++;
            if ( skip_until>=fp->block_length )
            {
                // the block ends with a complete line: save it before it is overwritten
                kputsn(buffer,skip_until,&hdr);
                if ( bgzf_read_block(fp) != 0 || !fp->block_length )
                    error("FIXME: No body in the file: %s\n", args->fname);
                skip_until = 0;
            }
            // The header has finished
            if ( buffer[skip_until]!='#' )
            {
                kputsn(buffer,skip_until,&hdr);
                break;
            }
        }
        skip_until++;
        if ( skip_until>=fp->block_length )
        {
            // the block ends in the middle of a header line
            kputsn(buffer,fp->block_length,&hdr);
            if (bgzf_read_block(fp) != 0 || !fp->block_length)
                error("FIXME: No body in the file: %s\n", args->fname);
            skip_until = 0;
        }
    }

    int nsamples = 0;
    char **samples = NULL;
    if ( args->samples_fname )
    {
        samples = hts_readlines(args->samples_fname, &nsamples);
        // An explicitly requested samples file must not be ignored silently
        if ( !samples ) error("Could not read %s\n", args->samples_fname);
    }
    if ( args->header_fname )
    {
        free(hdr.s); hdr.s = NULL; hdr.l = hdr.m = 0;
        read_header_file(args->header_fname, &hdr);
    }
    if ( samples )
    {
        set_samples(samples, nsamples, &hdr);
        int i;
        for (i=0; i<nsamples; i++) free(samples[i]);
        free(samples);
    }

    // Output the modified header
    BGZF *bgzf_out = bgzf_dopen(fileno(stdout), "w");
    if ( !bgzf_out ) error("Failed to open the standard output for writing\n");
    if ( bgzf_write(bgzf_out, hdr.s, hdr.l)<0 ) error("Error: %d\n",bgzf_out->errcode);
    free(hdr.s);

    // Output all remaining data read with the header block
    if ( fp->block_length - skip_until > 0 )
    {
        // the write goes to bgzf_out, report its error code rather than the input's
        if ( bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until)<0 )
            error("Error: %d\n",bgzf_out->errcode);
    }
    if ( bgzf_flush(bgzf_out)<0 ) error("Error: %d\n",bgzf_out->errcode);

    // Stream the rest of the file as it is, without decompressing
    int page_size = getpagesize();
    char *buf = (char*) malloc(page_size);
    if ( !buf ) error("Failed to allocate %d bytes\n", page_size);
    while (1)
    {
        ssize_t nread = bgzf_raw_read(fp, buf, page_size);
        if ( nread<0 ) error("Failed to read %s\n", args->fname);
        if ( nread==0 ) break;

        ssize_t count = bgzf_raw_write(bgzf_out, buf, nread);
        if (count != nread) error("Write failed, wrote %d instead of %d bytes.\n", (int)count,(int)nread);
    }

    // The handles must not be inspected after bgzf_close(): they may already
    // have been released by then, whether or not the call succeeded.
    if (bgzf_close(bgzf_out) < 0) error("Error: failed to close the output stream\n");
    if (bgzf_close(fp) < 0) error("Error: failed to close %s\n", args->fname);
    free(buf);
}