int SingleChromosomeBCFIndex::nextLine(uint32_t* l_shared, uint32_t* l_indiv, std::vector<char>* line) { if (4 != bgzf_read(fBcfFile_, l_shared, sizeof(uint32_t)) || 4 != bgzf_read(fBcfFile_, l_indiv, sizeof(uint32_t))) { REprintf("readLine error!\n"); } uint32_t totalLen = *l_shared + *l_indiv; line->resize(totalLen); if ( totalLen != bgzf_read(fBcfFile_, line->data(), totalLen)) { REprintf("readLine bgzf_read error!\n"); } return totalLen; }
/*
 * Count the '@' characters in a BGZF-compressed FASTQ file.
 *
 * The file is rewound before and after the scan. Every '@' byte is counted,
 * including any that occur inside quality strings, so the result is an upper
 * bound on the number of records rather than an exact record count.
 *
 * The counter is a size_t (the declared return type); the previous 32-bit
 * counter silently wrapped on very large inputs.
 *
 * A read error (negative return from bgzf_read) ends the scan like EOF does.
 */
size_t count_fastq_sequences(BGZF *fastq_file)
{
    char buffer[ BUFSIZ ];
    size_t count = 0;

    bgzf_seek( fastq_file, 0, SEEK_SET );

    for( ;; )
    {
        ssize_t bytes_read = bgzf_read( fastq_file, buffer, BUFSIZ );
        ssize_t i;

        if( bytes_read <= 0 )
        {
            break;
        }

        for( i = 0; i < bytes_read; i++ )
        {
            if( buffer[ i ] == '@' )
            {
                count++;
            }
        }
    }

    bgzf_seek( fastq_file, 0, SEEK_SET );

    return count;
}
//this function will print a samfile from the bamfile int motherView(bufReader *rd,int nFiles,std::vector<regs>regions) { aRead b; b.vDat=new uint8_t[RLEN]; kstring_t str; str.s=NULL; str.l=str.m=0; if(regions.size()==0) {//print all int block_size; while(SIG_COND && bgzf_read(rd[0].fp,&block_size,sizeof(int))){ getAlign(rd[0].fp,block_size,b); printReadBuffered(b,rd[0].hd,str); fprintf(stdout,"%s",str.s); } }else { for(int i=0;i<(int)regions.size();i++){ int tmpRef = regions[i].refID; int tmpStart = regions[i].start; int tmpStop = regions[i].stop; getOffsets(rd[0].bamfname,rd[0].hd,rd[0].it,tmpRef,tmpStart,tmpStop); int ret =0; while(SIG_COND){ ret = bam_iter_read(rd[0].fp, &rd[0].it, b); if(ret<0) break; printReadBuffered(b,rd[0].hd,str); fprintf(stdout,"%s",str.s); } free(rd[0].it.off);//the offsets } free(str.s); delete [] b.vDat; } return 0; }
/*
 * Read a BCF header from fp.
 *
 * Expects the 5-byte magic "BCF\2\1", then a 4-byte text length, then that
 * many bytes of header text, which are handed to bcf_hdr_parse().
 *
 * Returns the new header, or 0 when the magic is wrong or the stream is
 * truncated / the text buffer cannot be allocated. Previously none of the
 * reads nor the allocation were checked, so a short file led to parsing an
 * uninitialised or NULL buffer.
 */
bcf_hdr_t *bcf_hdr_read(BGZF *fp)
{
    uint8_t magic[5];
    bcf_hdr_t *h;
    int64_t l_text;

    h = bcf_hdr_init();
    if (h == 0) return 0;

    /* memcmp, not strncmp: compare all 5 bytes and only after a full read */
    if (bgzf_read(fp, magic, 5) != 5 || memcmp(magic, "BCF\2\1", 5) != 0) {
        if (hts_verbose >= 2)
            fprintf(stderr, "[E::%s] invalid BCF2 magic string\n", __func__);
        bcf_hdr_destroy(h);
        return 0;
    }

    if (bgzf_read(fp, &h->l_text, 4) != 4) goto truncated;
    l_text = (int64_t)h->l_text;
    if (l_text <= 0) goto truncated;

    /* One spare byte so the text is NUL-terminated even if the file's copy
     * is not. */
    h->text = (char*)malloc((size_t)l_text + 1);
    if (h->text == 0) goto truncated;
    if ((int64_t)bgzf_read(fp, h->text, h->l_text) != l_text) goto truncated;
    h->text[l_text] = '\0';

    bcf_hdr_parse(h);
    return h;

truncated:
    if (hts_verbose >= 2)
        fprintf(stderr, "[E::%s] failed to read the BCF2 header text\n", __func__);
    /* NOTE(review): relies on bcf_hdr_destroy() releasing h->text - confirm */
    bcf_hdr_destroy(h);
    return 0;
}
/*
 * Read the fixed part and the two variable-length payloads of one BCF record.
 *
 * Layout consumed: eight 32-bit words (x[0] = shared length including 24
 * bytes of fixed fields, x[1] = individual length, x[2..5] copied to the
 * start of *v, x[6]/x[7] = packed allele/info/format/sample counts), then the
 * shared and individual payloads.
 *
 * Returns 0 on success, -1 on clean end of file (nothing read), -2 on a
 * truncated or malformed record or an allocation failure.
 */
static inline int bcf_read1_core(BGZF *fp, bcf1_t *v)
{
    uint32_t x[8];
    int ret;
    if ((ret = bgzf_read(fp, x, 32)) != 32) {
        if (ret == 0) return -1;
        return -2;
    }
    /* Guard the subtraction below: a shared length < 24 would wrap around
     * to ~4 GiB and drive a huge allocation and read. */
    if (x[0] < 24) return -2;
    x[0] -= 24; // to exclude six 32-bit integers
    if (ks_resize(&v->shared, x[0]) != 0) return -2;
    if (ks_resize(&v->indiv, x[1]) != 0) return -2;
    memcpy(v, x + 2, 16);
    v->n_allele = x[6]>>16; v->n_info = x[6]&0xffff;
    v->n_fmt = x[7]>>24; v->n_sample = x[7]&0xffffff;
    v->shared.l = x[0], v->indiv.l = x[1];
    v->unpacked = 0;
    v->unpack_ptr = NULL;
    /* A short payload read means a truncated file; report it instead of
     * leaving stale bytes in the buffers. */
    if ((int64_t)bgzf_read(fp, v->shared.s, v->shared.l) != (int64_t)v->shared.l) return -2;
    if ((int64_t)bgzf_read(fp, v->indiv.s, v->indiv.l) != (int64_t)v->indiv.l) return -2;
    return 0;
}
//hint is the suggested newsize void filt_readSites(filt*fl,char *chr,size_t hint) { assert(fl!=NULL); std::map<char*,asdf_dats,ltstr> ::iterator it = fl->offs.find(chr); if(it==fl->offs.end()){ fprintf(stderr,"\n\t-> Potential problem: The filereading has reached a chromsome: \'%s\', which is not included in your \'-sites\' file.\n\t-> Please consider limiting your analysis to the chromsomes of interest \n",chr); fprintf(stderr,"\t-> see \'http://www.popgen.dk/angsd/index.php/Sites\' for more information\n"); fprintf(stderr,"\t-> Program will continue reading this chromosome... \n"); //exit(0); free(fl->keeps); free(fl->minor); free(fl->major); fl->keeps=fl->minor=fl->major=NULL; fl->curLen =0; return; } bgzf_seek(fl->bg,it->second.offs,SEEK_SET); size_t nsize = std::max(fl->curLen,hint); nsize = std::max(nsize,it->second.len); if(nsize>fl->curLen) fl->keeps=(char*) realloc(fl->keeps,nsize); memset(fl->keeps,0,nsize); //fprintf(stderr,"it->second.len:%lu fl->curLen:%lu fl->keeps:%p\n",it->second.len,fl->curLen,fl->keeps); bgzf_read(fl->bg,fl->keeps,it->second.len); if(fl->hasMajMin==1){ if(nsize>fl->curLen) { fl->major = (char*) realloc(fl->major,nsize); fl->minor = (char*) realloc(fl->minor,nsize); memset(fl->major,0,nsize); memset(fl->minor,0,nsize); } bgzf_read(fl->bg,fl->major,it->second.len); bgzf_read(fl->bg,fl->minor,it->second.len); } fl->curNam=chr; fl->curLen = nsize; }
perChr getPerChr(BGZF *fp){ perChr ret; ret.nSites =0; ret.posi=NULL; ret.tW=ret.tP=ret.tF=ret.tH=ret.tL=NULL; size_t clen; if(bgzf_read(fp,&clen,sizeof(size_t))==0) return ret; ret.chr = new char[clen+1]; bgzf_read(fp,ret.chr,clen); ret.chr[clen] = '\0'; bgzf_read(fp,&ret.nSites,sizeof(size_t)); ret.posi = new int[ret.nSites]; ret.tW = new float[ret.nSites]; ret.tP = new float[ret.nSites]; ret.tF = new float[ret.nSites]; ret.tH = new float[ret.nSites]; ret.tL = new float[ret.nSites]; //read positions and thetas bgzf_read(fp,ret.posi,ret.nSites*sizeof(int)); bgzf_read(fp,ret.tW,ret.nSites*sizeof(float)); bgzf_read(fp,ret.tP,ret.nSites*sizeof(float)); bgzf_read(fp,ret.tF,ret.nSites*sizeof(float)); bgzf_read(fp,ret.tH,ret.nSites*sizeof(float)); bgzf_read(fp,ret.tL,ret.nSites*sizeof(float)); //make thetas into normal space for(size_t i=0;i<ret.nSites;i++){ ret.tW[i] = exp(ret.tW[i]); ret.tP[i] = exp(ret.tP[i]); ret.tF[i] = exp(ret.tF[i]); ret.tH[i] = exp(ret.tH[i]); ret.tL[i] = exp(ret.tL[i]); } return ret; }
uint64_t read_chunk(double **chunk_data, params *pars, uint64_t chunk) { uint64_t total_elems_read = 0; if(chunk >= pars->n_chunks) error("invalid chunk number!"); // Define chunk start and end positions uint64_t start_pos = chunk * pars->max_chunk_size; uint64_t end_pos = start_pos + pars->max_chunk_size - 1; if(end_pos >= pars->n_sites) end_pos = pars->n_sites - 1; uint64_t chunk_size = end_pos - start_pos + 1; if( pars->verbose >= 6 ) printf("\tReading chunk %lu from position %lu to %lu (%lu)\n", chunk+1, start_pos, end_pos, chunk_size); // Search start position #ifdef _USE_BGZF if( bgzf_seek(pars->in_glf_fh, pars->chunks_voffset[chunk], SEEK_SET) < 0 ) error("cannot seek GLF file (BGZF)!"); #endif // Read data from file for(uint64_t c = 0; c < chunk_size; c++) { #ifdef _USE_BGZF int bytes_read = bgzf_read(pars->in_glf_fh, chunk_data[c], (int) pars->n_ind * 3 * sizeof(double)); if(pars->call_geno) call_geno(chunk_data[c], pars->n_ind, 3); uint64_t elems_read = (uint64_t) bytes_read / sizeof(double); #else chunk_data[c] = pars->data[start_pos+c]; uint64_t elems_read = pars->n_ind * 3; #endif if( elems_read != pars->n_ind * 3 ) error("cannot read GLF file!"); total_elems_read += elems_read; } #ifdef _USE_BGZF // Update index for next chunk if( chunk+1 != pars->n_chunks && pars->chunks_voffset[chunk+1] == 0 ) pars->chunks_voffset[chunk+1] = bgzf_tell(pars->in_glf_fh); #endif return( total_elems_read/(pars->n_ind * 3) ); }
int main(int argc, char **argv) { int c, compress, pstdout, is_forced, index = 0, reindex = 0; BGZF *fp; void *buffer; long start, end, size; char *index_fname = NULL; static struct option loptions[] = { {"help",0,0,'h'}, {"offset",1,0,'b'}, {"stdout",0,0,'c'}, {"decompress",0,0,'d'}, {"force",0,0,'f'}, {"index",0,0,'i'}, {"index-name",1,0,'I'}, {"reindex",0,0,'r'}, {"size",1,0,'s'}, {0,0,0,0} }; compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0; while((c = getopt_long(argc, argv, "cdh?fb:s:iI:r",loptions,NULL)) >= 0){ switch(c){ case 'd': compress = 0; break; case 'c': pstdout = 1; break; case 'b': start = atol(optarg); compress = 0; pstdout = 1; break; case 's': size = atol(optarg); pstdout = 1; break; case 'f': is_forced = 1; break; case 'i': index = 1; break; case 'I': index_fname = optarg; break; case 'r': reindex = 1; compress = 0; break; case 'h': case '?': return bgzip_main_usage(); } } if (size >= 0) end = start + size; if (end >= 0 && end < start) { fprintf(stderr, "[bgzip] Illegal region: [%ld, %ld]\n", start, end); return 1; } if (compress == 1) { struct stat sbuf; int f_src = fileno(stdin); int f_dst = fileno(stdout); if ( argc>optind ) { if ( stat(argv[optind],&sbuf)<0 ) { fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); return 1; } if ((f_src = open(argv[optind], O_RDONLY)) < 0) { fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); return 1; } if (pstdout) f_dst = fileno(stdout); else { char *name = malloc(strlen(argv[optind]) + 5); strcpy(name, argv[optind]); strcat(name, ".gz"); f_dst = write_open(name, is_forced); if (f_dst < 0) return 1; free(name); } } else if (!pstdout && isatty(fileno((FILE *)stdout)) ) return bgzip_main_usage(); else if ( index && !index_fname ) { fprintf(stderr, "[bgzip] Index file name expected when writing to stdout\n"); return 1; } fp = bgzf_fdopen(f_dst, "w"); if ( index ) bgzf_index_build_init(fp); buffer = malloc(WINDOW_SIZE); while ((c = read(f_src, buffer, 
WINDOW_SIZE)) > 0) if (bgzf_write(fp, buffer, c) < 0) error("Could not write %d bytes: Error %d\n", c, fp->errcode); // f_dst will be closed here if ( index ) { if ( index_fname ) bgzf_index_dump(fp, index_fname, NULL); else bgzf_index_dump(fp, argv[optind], ".gz.gzi"); } if (bgzf_close(fp) < 0) error("Close failed: Error %d", fp->errcode); if (argc > optind && !pstdout) unlink(argv[optind]); free(buffer); close(f_src); return 0; } else if ( reindex ) { if ( argc>optind ) { fp = bgzf_open(argv[optind], "r"); if ( !fp ) error("[bgzip] Could not open file: %s\n", argv[optind]); } else { if ( !index_fname ) error("[bgzip] Index file name expected when reading from stdin\n"); fp = bgzf_fdopen(fileno(stdin), "r"); if ( !fp ) error("[bgzip] Could not read from stdin: %s\n", strerror(errno)); } buffer = malloc(BGZF_BLOCK_SIZE); bgzf_index_build_init(fp); int ret; while ( (ret=bgzf_read(fp, buffer, BGZF_BLOCK_SIZE))>0 ) ; free(buffer); if ( ret<0 ) error("Is the file gzipped or bgzipped? The latter is required for indexing.\n"); if ( index_fname ) bgzf_index_dump(fp, index_fname, NULL); else bgzf_index_dump(fp, argv[optind], ".gzi"); if ( bgzf_close(fp)<0 ) error("Close failed: Error %d\n",fp->errcode); return 0; } else { struct stat sbuf; int f_dst; if ( argc>optind ) { if ( stat(argv[optind],&sbuf)<0 ) { fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); return 1; } char *name; int len = strlen(argv[optind]); if ( strcmp(argv[optind]+len-3,".gz") ) { fprintf(stderr, "[bgzip] %s: unknown suffix -- ignored\n", argv[optind]); return 1; } fp = bgzf_open(argv[optind], "r"); if (fp == NULL) { fprintf(stderr, "[bgzip] Could not open file: %s\n", argv[optind]); return 1; } if (pstdout) { f_dst = fileno(stdout); } else { name = strdup(argv[optind]); name[strlen(name) - 3] = '\0'; f_dst = write_open(name, is_forced); free(name); } } else if (!pstdout && isatty(fileno((FILE *)stdin)) ) return bgzip_main_usage(); else { f_dst = fileno(stdout); fp = 
bgzf_fdopen(fileno(stdin), "r"); if (fp == NULL) { fprintf(stderr, "[bgzip] Could not read from stdin: %s\n", strerror(errno)); return 1; } } buffer = malloc(WINDOW_SIZE); if ( start>0 ) { if ( bgzf_index_load(fp, argv[optind], ".gzi") < 0 ) error("Could not load index: %s.gzi\n", argv[optind]); if ( bgzf_useek(fp, start, SEEK_SET) < 0 ) error("Could not seek to %d-th (uncompressd) byte\n", start); } while (1) { if (end < 0) c = bgzf_read(fp, buffer, WINDOW_SIZE); else c = bgzf_read(fp, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start)); if (c == 0) break; if (c < 0) error("Could not read %d bytes: Error %d\n", (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start), fp->errcode); start += c; if ( write(f_dst, buffer, c) != c ) error("Could not write %d bytes\n", c); if (end >= 0 && start >= end) break; } free(buffer); if (bgzf_close(fp) < 0) error("Close failed: Error %d\n",fp->errcode); if (!pstdout) unlink(argv[optind]); return 0; } return 0; }
/*
 * Concatenate BGZF-compressed BCF files without recompressing them.
 *
 * For every input: the first BGZF block(s) holding the BCF header are
 * decoded, the header is written (first file only), any record data that
 * shared a block with the header is re-written through bgzf_write, and the
 * rest of the file is copied as raw bytes with a trailing BGZF EOF block
 * (28 bytes) removed. bgzf_close() on the output then ends the stream.
 *
 * Fixes relative to the previous version: the output handle and copy buffer
 * are checked for NULL, the 4-byte header length is read into a 32-bit
 * variable instead of directly into a size_t, raw read errors are reported,
 * the leftover-data write reports the output handle's error code, and
 * format/argument types match.
 */
static void naive_concat(args_t *args)
{
    // only compressed BCF atm
    BGZF *bgzf_out = bgzf_open(args->output_fname,"w");
    if ( !bgzf_out ) error("Failed to open: %s\n", args->output_fname);

    const size_t page_size = 32768;
    char *buf = (char*) malloc(page_size);
    if ( !buf ) error("Failed to allocate %d bytes\n", (int)page_size);
    kstring_t tmp = {0,0,0};
    int i;
    for (i=0; i<args->nfnames; i++)
    {
        htsFile *hts_fp = hts_open(args->fnames[i],"r");
        if ( !hts_fp ) error("Failed to open: %s\n", args->fnames[i]);
        htsFormat type = *hts_get_format(hts_fp);

        if ( type.format==vcf ) error("The --naive option currently works only for compressed BCFs, sorry :-/\n");
        if ( type.compression!=bgzf ) error("The --naive option currently works only for compressed BCFs, sorry :-/\n");

        BGZF *fp = hts_get_bgzfp(hts_fp);
        if ( !fp || bgzf_read_block(fp) != 0 || !fp->block_length )
            error("Failed to read %s: %s\n", args->fnames[i], strerror(errno));

        uint8_t magic[5];
        if ( bgzf_read(fp, magic, 5) != 5 ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
        if (strncmp((char*)magic, "BCF\2\2", 5) != 0) error("Invalid BCF magic string in %s\n", args->fnames[i]);

        // The on-disk length field is 4 bytes wide; keep it in a 4-byte object
        uint32_t l_text = 0;
        if ( bgzf_read(fp, &l_text, 4) != 4 ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
        tmp.l = l_text;
        hts_expand(char,tmp.l,tmp.m,tmp.s);
        if ( (int64_t)bgzf_read(fp, tmp.s, tmp.l) != (int64_t)tmp.l ) error("Failed to read the BCF header in %s\n", args->fnames[i]);

        // write only the first header
        if ( i==0 )
        {
            if ( bgzf_write(bgzf_out, "BCF\2\2", 5) !=5 ) error("Failed to write %d bytes to %s\n", 5,args->output_fname);
            if ( bgzf_write(bgzf_out, &l_text, 4) !=4 ) error("Failed to write %d bytes to %s\n", 4,args->output_fname);
            if ( (int64_t)bgzf_write(bgzf_out, tmp.s, tmp.l) != (int64_t)tmp.l ) error("Failed to write %d bytes to %s\n", (int)tmp.l,args->output_fname);
        }

        // Output all non-header data that were read together with the header block
        int nskip = fp->block_offset;
        if ( fp->block_length - nskip > 0 )
        {
            if ( bgzf_write(bgzf_out, (char*)fp->uncompressed_block+nskip, fp->block_length-nskip)<0 ) error("Error: %d\n",bgzf_out->errcode);
        }
        if ( bgzf_flush(bgzf_out)<0 ) error("Error: %d\n",bgzf_out->errcode);

        // Stream the rest of the file as it is, without recompressing, but remove BGZF EOF blocks
        ssize_t nread, ncached = 0, nwr;
        const int neof = 28;
        char cached[neof];
        while (1)
        {
            nread = bgzf_raw_read(fp, buf, page_size);
            if ( nread<0 ) error("Failed to read %s: %s\n", args->fnames[i], strerror(errno));

            // page_size boundary may occur in the middle of the EOF block, so we need to cache the blocks' ends
            if ( nread==0 ) break;
            if ( nread<=neof )      // last block
            {
                if ( ncached )
                {
                    // flush the part of the cache that won't be needed
                    nwr = bgzf_raw_write(bgzf_out, cached, nread);
                    if (nwr != nread) error("Write failed, wrote %d instead of %d bytes.\n", (int)nwr,(int)nread);

                    // make space in the cache so that we can append to the end
                    if ( nread!=neof ) memmove(cached,cached+nread,neof-nread);
                }

                // fill the cache and check for eof outside this loop
                memcpy(cached+neof-nread,buf,nread);
                break;
            }

            // not the last block, flush the cache if full
            if ( ncached )
            {
                nwr = bgzf_raw_write(bgzf_out, cached, ncached);
                if (nwr != ncached) error("Write failed, wrote %d instead of %d bytes.\n", (int)nwr,(int)ncached);
                ncached = 0;
            }

            // fill the cache: hold back the final neof bytes, they may be the EOF marker
            nread -= neof;
            memcpy(cached,buf+nread,neof);
            ncached = neof;

            nwr = bgzf_raw_write(bgzf_out, buf, nread);
            if (nwr != nread) error("Write failed, wrote %d instead of %d bytes.\n", (int)nwr,(int)nread);
        }
        // Emit the held-back tail only when it is NOT the BGZF EOF block
        if ( ncached && memcmp(cached,"\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0",neof) )
        {
            nwr = bgzf_raw_write(bgzf_out, cached, neof);
            if (nwr != neof) error("Write failed, wrote %d instead of %d bytes.\n", (int)nwr,(int)neof);
        }
        if (hts_close(hts_fp)) error("Close failed: %s\n",args->fnames[i]);
    }
    free(buf);
    free(tmp.s);
    if (bgzf_close(bgzf_out) < 0) error("Error: %d\n",bgzf_out->errcode);
}
int main(int argc, char **argv) { int c, compress, pstdout, is_forced, index = 0, rebgzip = 0, reindex = 0; BGZF *fp; void *buffer; long start, end, size; char *index_fname = NULL; int threads = 1; static const struct option loptions[] = { {"help", no_argument, NULL, 'h'}, {"offset", required_argument, NULL, 'b'}, {"stdout", no_argument, NULL, 'c'}, {"decompress", no_argument, NULL, 'd'}, {"force", no_argument, NULL, 'f'}, {"index", no_argument, NULL, 'i'}, {"index-name", required_argument, NULL, 'I'}, {"reindex", no_argument, NULL, 'r'}, {"rebgzip",no_argument,NULL,'g'}, {"size", required_argument, NULL, 's'}, {"threads", required_argument, NULL, '@'}, {"version", no_argument, NULL, 1}, {NULL, 0, NULL, 0} }; compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0; while((c = getopt_long(argc, argv, "cdh?fb:@:s:iI:gr",loptions,NULL)) >= 0){ switch(c){ case 'd': compress = 0; break; case 'c': pstdout = 1; break; case 'b': start = atol(optarg); compress = 0; pstdout = 1; break; case 's': size = atol(optarg); pstdout = 1; break; case 'f': is_forced = 1; break; case 'i': index = 1; break; case 'I': index_fname = optarg; break; case 'g': rebgzip = 1; break; case 'r': reindex = 1; compress = 0; break; case '@': threads = atoi(optarg); break; case 1: printf( "bgzip (htslib) %s\n" "Copyright (C) 2017 Genome Research Ltd.\n", hts_version()); return EXIT_SUCCESS; case 'h': case '?': return bgzip_main_usage(); } } if (size >= 0) end = start + size; if (end >= 0 && end < start) { fprintf(stderr, "[bgzip] Illegal region: [%ld, %ld]\n", start, end); return 1; } if (compress == 1) { struct stat sbuf; int f_src = fileno(stdin); if ( argc>optind ) { if ( stat(argv[optind],&sbuf)<0 ) { fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); return 1; } if ((f_src = open(argv[optind], O_RDONLY)) < 0) { fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); return 1; } if (pstdout) fp = bgzf_open("-", "w"); else { char *name = 
malloc(strlen(argv[optind]) + 5); strcpy(name, argv[optind]); strcat(name, ".gz"); fp = bgzf_open(name, is_forced? "w" : "wx"); if (fp == NULL && errno == EEXIST && confirm_overwrite(name)) fp = bgzf_open(name, "w"); if (fp == NULL) { fprintf(stderr, "[bgzip] can't create %s: %s\n", name, strerror(errno)); free(name); return 1; } free(name); } } else if (!pstdout && isatty(fileno((FILE *)stdout)) ) return bgzip_main_usage(); else if ( index && !index_fname ) { fprintf(stderr, "[bgzip] Index file name expected when writing to stdout\n"); return 1; } else fp = bgzf_open("-", "w"); if ( index && rebgzip ) { fprintf(stderr, "[bgzip] Can't produce a index and rebgzip simultaneously\n"); return 1; } if ( rebgzip && !index_fname ) { fprintf(stderr, "[bgzip] Index file name expected when writing to stdout\n"); return 1; } if (threads > 1) bgzf_mt(fp, threads, 256); if ( index ) bgzf_index_build_init(fp); buffer = malloc(WINDOW_SIZE); #ifdef _WIN32 _setmode(f_src, O_BINARY); #endif if (rebgzip){ if ( bgzf_index_load(fp, index_fname, NULL) < 0 ) error("Could not load index: %s.gzi\n", argv[optind]); while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0) if (bgzf_block_write(fp, buffer, c) < 0) error("Could not write %d bytes: Error %d\n", c, fp->errcode); } else { while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0) if (bgzf_write(fp, buffer, c) < 0) error("Could not write %d bytes: Error %d\n", c, fp->errcode); } if ( index ) { if (index_fname) { if (bgzf_index_dump(fp, index_fname, NULL) < 0) error("Could not write index to '%s'\n", index_fname); } else { if (bgzf_index_dump(fp, argv[optind], ".gz.gzi") < 0) error("Could not write index to '%s.gz.gzi'", argv[optind]); } } if (bgzf_close(fp) < 0) error("Close failed: Error %d", fp->errcode); if (argc > optind && !pstdout) unlink(argv[optind]); free(buffer); close(f_src); return 0; } else if ( reindex ) { if ( argc>optind ) { fp = bgzf_open(argv[optind], "r"); if ( !fp ) error("[bgzip] Could not open file: %s\n", argv[optind]); } 
else { if ( !index_fname ) error("[bgzip] Index file name expected when reading from stdin\n"); fp = bgzf_open("-", "r"); if ( !fp ) error("[bgzip] Could not read from stdin: %s\n", strerror(errno)); } buffer = malloc(BGZF_BLOCK_SIZE); bgzf_index_build_init(fp); int ret; while ( (ret=bgzf_read(fp, buffer, BGZF_BLOCK_SIZE))>0 ) ; free(buffer); if ( ret<0 ) error("Is the file gzipped or bgzipped? The latter is required for indexing.\n"); if ( index_fname ) { if (bgzf_index_dump(fp, index_fname, NULL) < 0) error("Could not write index to '%s'\n", index_fname); } else { if (bgzf_index_dump(fp, argv[optind], ".gzi") < 0) error("Could not write index to '%s.gzi'\n", argv[optind]); } if ( bgzf_close(fp)<0 ) error("Close failed: Error %d\n",fp->errcode); return 0; } else { struct stat sbuf; int f_dst; if ( argc>optind ) { if ( stat(argv[optind],&sbuf)<0 ) { fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); return 1; } char *name; int len = strlen(argv[optind]); if ( strcmp(argv[optind]+len-3,".gz") ) { fprintf(stderr, "[bgzip] %s: unknown suffix -- ignored\n", argv[optind]); return 1; } fp = bgzf_open(argv[optind], "r"); if (fp == NULL) { fprintf(stderr, "[bgzip] Could not open file: %s\n", argv[optind]); return 1; } if (pstdout) { f_dst = fileno(stdout); } else { const int wrflags = O_WRONLY | O_CREAT | O_TRUNC; name = strdup(argv[optind]); name[strlen(name) - 3] = '\0'; f_dst = open(name, is_forced? 
wrflags : wrflags|O_EXCL, 0666); if (f_dst < 0 && errno == EEXIST && confirm_overwrite(name)) f_dst = open(name, wrflags, 0666); if (f_dst < 0) { fprintf(stderr, "[bgzip] can't create %s: %s\n", name, strerror(errno)); free(name); return 1; } free(name); } } else if (!pstdout && isatty(fileno((FILE *)stdin)) ) return bgzip_main_usage(); else { f_dst = fileno(stdout); fp = bgzf_open("-", "r"); if (fp == NULL) { fprintf(stderr, "[bgzip] Could not read from stdin: %s\n", strerror(errno)); return 1; } } if (threads > 1) bgzf_mt(fp, threads, 256); buffer = malloc(WINDOW_SIZE); if ( start>0 ) { if ( bgzf_index_load(fp, argv[optind], ".gzi") < 0 ) error("Could not load index: %s.gzi\n", argv[optind]); if ( bgzf_useek(fp, start, SEEK_SET) < 0 ) error("Could not seek to %d-th (uncompressd) byte\n", start); } #ifdef _WIN32 _setmode(f_dst, O_BINARY); #endif while (1) { if (end < 0) c = bgzf_read(fp, buffer, WINDOW_SIZE); else c = bgzf_read(fp, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start)); if (c == 0) break; if (c < 0) error("Could not read %d bytes: Error %d\n", (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start), fp->errcode); start += c; if ( write(f_dst, buffer, c) != c ) { #ifdef _WIN32 if (GetLastError() != ERROR_NO_DATA) #endif error("Could not write %d bytes\n", c); } if (end >= 0 && start >= end) break; } free(buffer); if (bgzf_close(fp) < 0) error("Close failed: Error %d\n",fp->errcode); if (!pstdout) unlink(argv[optind]); return 0; } }
int main(int argc, char **argv) { int c, compress, pstdout, is_forced; BGZF *fp; void *buffer; long start, end, size; compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0; while((c = getopt(argc, argv, "cdhfb:s:")) >= 0){ switch(c){ case 'h': return bgzip_main_usage(); case 'd': compress = 0; break; case 'c': pstdout = 1; break; case 'b': start = atol(optarg); break; case 's': size = atol(optarg); break; case 'f': is_forced = 1; break; } } if (size >= 0) end = start + size; if (end >= 0 && end < start) { fprintf(stderr, "[bgzip] Illegal region: [%ld, %ld]\n", start, end); return 1; } if (compress == 1) { struct stat sbuf; int f_src = fileno(stdin); int f_dst = fileno(stdout); if ( argc>optind ) { if ( stat(argv[optind],&sbuf)<0 ) { fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); return 1; } if ((f_src = open(argv[optind], O_RDONLY)) < 0) { fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); return 1; } if (pstdout) f_dst = fileno(stdout); else { char *name = malloc(strlen(argv[optind]) + 5); strcpy(name, argv[optind]); strcat(name, ".gz"); f_dst = write_open(name, is_forced); if (f_dst < 0) return 1; free(name); } } else if (!pstdout && isatty(fileno((FILE *)stdout)) ) return bgzip_main_usage(); fp = bgzf_fdopen(f_dst, "w"); buffer = malloc(WINDOW_SIZE); while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0) if (bgzf_write(fp, buffer, c) < 0) fail(fp); // f_dst will be closed here if (bgzf_close(fp) < 0) fail(fp); if (argc > optind && !pstdout) unlink(argv[optind]); free(buffer); close(f_src); return 0; } else { struct stat sbuf; int f_dst; if ( argc>optind ) { if ( stat(argv[optind],&sbuf)<0 ) { fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); return 1; } char *name; int len = strlen(argv[optind]); if ( strcmp(argv[optind]+len-3,".gz") ) { fprintf(stderr, "[bgzip] %s: unknown suffix -- ignored\n", argv[optind]); return 1; } fp = bgzf_open(argv[optind], "r"); if (fp == NULL) { fprintf(stderr, 
"[bgzip] Could not open file: %s\n", argv[optind]); return 1; } if (pstdout) { f_dst = fileno(stdout); } else { name = strdup(argv[optind]); name[strlen(name) - 3] = '\0'; f_dst = write_open(name, is_forced); free(name); } } else if (!pstdout && isatty(fileno((FILE *)stdin)) ) return bgzip_main_usage(); else { f_dst = fileno(stdout); fp = bgzf_fdopen(fileno(stdin), "r"); if (fp == NULL) { fprintf(stderr, "[bgzip] Could not read from stdin: %s\n", strerror(errno)); return 1; } } buffer = malloc(WINDOW_SIZE); if (bgzf_seek(fp, start, SEEK_SET) < 0) fail(fp); while (1) { if (end < 0) c = bgzf_read(fp, buffer, WINDOW_SIZE); else c = bgzf_read(fp, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start)); if (c == 0) break; if (c < 0) fail(fp); start += c; write(f_dst, buffer, c); if (end >= 0 && start >= end) break; } free(buffer); if (bgzf_close(fp) < 0) fail(fp); if (!pstdout) unlink(argv[optind]); return 0; } }
int fst_stat(int argc,char **argv){ char *bname = *argv; fprintf(stderr,"\t-> Assuming idxname:%s\n",bname); perfst *pf = perfst_init(bname); args *pars = getArgs(--argc,++argv); int *ppos = NULL; int chs = choose(pf->names.size(),2); // fprintf(stderr,"choose:%d \n",chs); double **ares = new double*[chs]; double **bres = new double*[chs]; double unweight[chs]; double wa[chs]; double wb[chs]; size_t nObs[chs]; for(int i=0;i<chs;i++){ unweight[i] = wa[i] = wb[i] =0.0; nObs[i] = 0; } for(myFstMap::iterator it=pf->mm.begin();it!=pf->mm.end();++it){ if(pars->chooseChr!=NULL){ it = pf->mm.find(pars->chooseChr); if(it==pf->mm.end()){ fprintf(stderr,"Problem finding chr: %s\n",pars->chooseChr); break; } } if(it->second.nSites==0) continue; bgzf_seek(pf->fp,it->second.off,SEEK_SET); ppos = new int[it->second.nSites]; bgzf_read(pf->fp,ppos,sizeof(int)*it->second.nSites); for(int i=0;i<choose(pf->names.size(),2);i++){ ares[i] = new double[it->second.nSites]; bres[i] = new double[it->second.nSites]; bgzf_read(pf->fp,ares[i],sizeof(double)*it->second.nSites); bgzf_read(pf->fp,bres[i],sizeof(double)*it->second.nSites); } int first=0; if(pars->start!=-1) while(ppos[first]<pars->start) first++; int last=it->second.nSites; if(pars->stop!=-1&&pars->stop<=ppos[last-1]){ last=first; while(ppos[last]<pars->stop) last++; } // fprintf(stderr,"pars->stop:%d ppos:%d first:%d last:%d\n",pars->stop,ppos[last-1],first,last); for(int s=first;s<last;s++){ #if 0 fprintf(stdout,"%s\t%d",it->first,ppos[s]+1); for(int i=0;i<choose(pf->names.size(),2);i++) fprintf(stdout,"\t%f\t%f",ares[i][s],bres[i][s]); fprintf(stdout,"\n"); #endif for(int i=0;i<choose(pf->names.size(),2);i++){ if(bres[i][s]!=0){ unweight[i] += ares[i][s]/bres[i][s]; nObs[i]++; } wa[i] += ares[i][s]; wb[i] += bres[i][s]; } } for(int i=0;i<choose(pf->names.size(),2);i++){ delete [] ares[i]; delete [] bres[i]; } delete [] ppos; if(pars->chooseChr!=NULL) break; } double fstUW[chs]; double fstW[chs]; for(int i=0;i<chs;i++){ fstUW[i] 
= unweight[i]/(1.0*nObs[i]); fstW[i] = wa[i]/wb[i]; fprintf(stderr,"\t-> FST.Unweight[nObs:%lu]:%f Fst.Weight:%f\n",nObs[i],fstUW[i],fstW[i]); fprintf(stdout,"%f %f\n",fstUW[i],fstW[i]); } if(chs==3){ //if chr==3 then we have 3pops and we will also calculate pbs statistics calcpbs(fstW);//<- NOTE: the pbs values will replace the fstW values for(int i=0;i<3;i++) fprintf(stderr,"\t-> pbs.pop%d\t%f\n",i+1,fstW[i]); } delete [] ares; delete [] bres; destroy_args(pars); perfst_destroy(pf); return 0; }
int fst_print(int argc,char **argv){ char *bname = *argv; fprintf(stderr,"\t-> Assuming idxname:%s\n",bname); perfst *pf = perfst_init(bname); writefst_header(stderr,pf); args *pars = getArgs(--argc,++argv); int *ppos = NULL; fprintf(stderr,"choose:%d \n",choose(pf->names.size(),2)); double **ares = new double*[choose(pf->names.size(),2)]; double **bres = new double*[choose(pf->names.size(),2)]; for(myFstMap::iterator it=pf->mm.begin();it!=pf->mm.end();++it){ if(pars->chooseChr!=NULL){ it = pf->mm.find(pars->chooseChr); if(it==pf->mm.end()){ fprintf(stderr,"Problem finding chr: %s\n",pars->chooseChr); break; } } if(it->second.nSites==0) continue; bgzf_seek(pf->fp,it->second.off,SEEK_SET); ppos = new int[it->second.nSites]; bgzf_read(pf->fp,ppos,sizeof(int)*it->second.nSites); for(int i=0;i<choose(pf->names.size(),2);i++){ ares[i] = new double[it->second.nSites]; bres[i] = new double[it->second.nSites]; bgzf_read(pf->fp,ares[i],sizeof(double)*it->second.nSites); bgzf_read(pf->fp,bres[i],sizeof(double)*it->second.nSites); } int first=0; if(pars->start!=-1) while(ppos[first]<pars->start) first++; int last=it->second.nSites; if(pars->stop!=-1&&pars->stop<=ppos[last-1]){ last=first; while(ppos[last]<pars->stop) last++; } fprintf(stderr,"pars->stop:%d ppos:%d first:%d last:%d\n",pars->stop,ppos[last-1],first,last); for(int s=first;s<last;s++){ fprintf(stdout,"%s\t%d",it->first,ppos[s]+1); for(int i=0;i<choose(pf->names.size(),2);i++) fprintf(stdout,"\t%f\t%f",ares[i][s],bres[i][s]); fprintf(stdout,"\n"); } for(int i=0;i<choose(pf->names.size(),2);i++){ delete [] ares[i]; delete [] bres[i]; } delete [] ppos; if(pars->chooseChr!=NULL) break; } delete [] ares; delete [] bres; destroy_args(pars); perfst_destroy(pf); return 0; }
int fst_stat2(int argc,char **argv){ int pS,pE;//physical start,physical end int begI,endI;//position in array for pS, pE; char *bname = *argv; fprintf(stderr,"\t-> Assuming idxname:%s\n",bname); perfst *pf = perfst_init(bname); args *pars = getArgs(--argc,++argv); fprintf(stderr,"win:%d step:%d\n",pars->win,pars->step); int *ppos = NULL; int chs = choose(pf->names.size(),2); // fprintf(stderr,"choose:%d \n",chs); double **ares = new double*[chs]; double **bres = new double*[chs]; double unweight[chs]; double wa[chs]; double wb[chs]; size_t nObs =0; for(myFstMap::iterator it=pf->mm.begin();it!=pf->mm.end();++it){ if(pars->chooseChr!=NULL){ it = pf->mm.find(pars->chooseChr); if(it==pf->mm.end()){ fprintf(stderr,"Problem finding chr: %s\n",pars->chooseChr); break; } } fprintf(stderr,"nSites:%lu\n",it->second.nSites); if(it->second.nSites==0&&pars->chooseChr!=NULL) break; else if(it->second.nSites==0&&pars->chooseChr==NULL) continue; bgzf_seek(pf->fp,it->second.off,SEEK_SET); ppos = new int[it->second.nSites]; bgzf_read(pf->fp,ppos,sizeof(int)*it->second.nSites); for(int i=0;i<it->second.nSites;i++) ppos[i]++; for(int i=0;i<choose(pf->names.size(),2);i++){ ares[i] = new double[it->second.nSites]; bres[i] = new double[it->second.nSites]; bgzf_read(pf->fp,ares[i],sizeof(double)*it->second.nSites); bgzf_read(pf->fp,bres[i],sizeof(double)*it->second.nSites); } if(pars->type==0) pS = ((pars->start!=-1?pars->start:ppos[0])/pars->step)*pars->step +pars->step; else if(pars->type==1) pS = (pars->start!=-1?pars->start:ppos[0]); else if(pars->type==2) pS = 1; pE = pS+pars->win; begI=endI=0; // fprintf(stderr,"ps:%d\n",pS);exit(0); if(pE>(pars->stop!=-1?pars->stop:ppos[it->second.nSites-1])){ fprintf(stderr,"end of dataset is before end of window: end of window:%d last position in chr:%d\n",pE,ppos[it->second.nSites-1]); // return str; } while(ppos[begI]<pS) begI++; endI=begI; while(ppos[endI]<pE) endI++; //fprintf(stderr,"begI:%d endI:%d\n",begI,endI); while(1){ for(int 
i=0;i<chs;i++) unweight[i] = wa[i] = wb[i] =0.0; nObs=0; fprintf(stdout,"(%d,%d)(%d,%d)(%d,%d)\t%s\t%d",begI,endI-1,ppos[begI],ppos[endI-1],pS,pE,it->first,pS+(pE-pS)/2); for(int s=begI;s<endI;s++){ #if 0 fprintf(stdout,"%s\t%d",it->first,ppos[s]+1); for(int i=0;i<choose(pf->names.size(),2);i++) fprintf(stdout,"\t%f\t%f",ares[i][s],bres[i][s]); fprintf(stdout,"\n"); #endif for(int i=0;i<choose(pf->names.size(),2);i++){ unweight[i] += ares[i][s]/bres[i][s]; wa[i] += ares[i][s]; wb[i] += bres[i][s]; } nObs++; } double fstW[chs]; for(int i=0;nObs>0&&i<chs;i++){ fstW[i] = wa[i]/wb[i]; fprintf(stdout,"\t%f\t%f",unweight[i]/(1.0*nObs),fstW[i]); } if(chs==3){ //if chr==3 then we have 3pops and we will also calculate pbs statistics calcpbs(fstW);//<- NOTE: the pbs values will replace the fstW values for(int i=0;i<3;i++) fprintf(stdout,"\t%f",fstW[i]); } fprintf(stdout,"\n"); pS += pars->step; pE =pS+pars->win; if(pE>(pars->stop!=-1?pars->stop:ppos[it->second.nSites-1])) break; while(ppos[begI]<pS) begI++; while(ppos[endI]<pE) endI++; } for(int i=0;i<choose(pf->names.size(),2);i++){ delete [] ares[i]; delete [] bres[i]; } delete [] ppos; if(pars->chooseChr!=NULL) break; } delete [] ares; delete [] bres; destroy_args(pars); perfst_destroy(pf); return 0; }
int main(int argc, char *argv[]) { if (argc <= 1) { fprintf(stderr, "Usage: thrash_threads4 input.bam\n"); exit(1); } // Find a valid seek location ~64M into the file int i; ssize_t got; BGZF *fpin = bgzf_open(argv[1], "r"); uint64_t upos = 0, uend = 0; char buf[100000]; for (i = 0; i < 100; i++) { if ((got = bgzf_read(fpin, buf, 65536)) < 0) abort(); upos += got; } int64_t pos = bgzf_tell(fpin); while ((got = bgzf_read(fpin, buf, 65536)) > 0) { uend += got; } if (got < 0) abort(); int64_t end = bgzf_tell(fpin); bgzf_close(fpin); // Ensure input is big enough to avoid case 3,4 below going off the end // of the file if (uend < upos + 10000000) { fprintf(stderr, "Please supply a bigger input file\n"); exit(1); } #define N 1000 // Spam random seeks & reads for (i = 0; i < 1000; i++) { printf("i=%d\t", i); fpin = bgzf_open(argv[1], "r"); int j, eof = 0, mt = 0; for (j = 0; j < 80; j++) { int n = rand() % 7; putchar('0'+n); fflush(stdout); switch (n) { case 0: // start if (bgzf_seek(fpin, 0LL, SEEK_SET) < 0) puts("!");//abort(); eof = 0; break; case 1: // mid if (bgzf_seek(fpin, pos, SEEK_SET) < 0) puts("!");//abort(); eof = 0; break; case 2: // eof if (bgzf_seek(fpin, end, SEEK_SET) < 0) puts("!");//abort(); eof = 1; break; case 3: case 4: { int l = rand()%(n==3?100000:100); if (bgzf_read(fpin, buf, l) != l*(1-eof)) abort(); break; } case 5: usleep(N); break; case 6: if (!mt) bgzf_mt(fpin, 8, 256); mt = 1; break; } } printf("\n"); if (bgzf_close(fpin)) abort(); } return 0; }
/*
 * OCaml stub: reads from the BGZF handle wrapped in `bgzf` into the byte
 * buffer `buf`, starting at byte offset `ofs` and asking for `len` bytes.
 * The result of bgzf_read() is handed back as an OCaml integer.
 * NOTE(review): ofs/len are not validated against the size of buf here;
 * presumably the OCaml caller does that -- confirm.
 */
value caml_bgzf_input(value bgzf, value buf, value ofs, value len)
{
    CAMLparam4(bgzf, buf, ofs, len);

    /* destination inside the OCaml buffer; nothing allocates between taking
       this address and using it */
    unsigned char *dest = &Byte_u(buf, Long_val(ofs));
    const int wanted = Int_val(len);
    const long got = bgzf_read(BGZF_val(bgzf), dest, wanted);

    CAMLreturn(Val_long(got));
}
/** * Create single chromosome index file * the file content is a 2-column matrix of int64_t type * line1: num_sample num_marker * line2: 0 bgzf_offset_for_#CHROM_line * line3: var_1_pos bgzf_offset_for_var_1 * ... */ int SingleChromosomeBCFIndex::createIndex() { // const char* fn = bcfFile_.c_str(); BGZF* fp = fBcfFile_; // bgzf_open(fn, "rb"); bgzf_seek(fp, 0, SEEK_SET); // check magic number char magic[5]; if (5 != bgzf_read(fp, magic, 5)) { return -1; // exit(1); } if (!(magic[0] == 'B' && magic[1] == 'C' && magic[2] == 'F' && magic[3] == 2 && (magic[4] == 1 || magic[4] == 2))) { return -1; // exit(1); } // read header uint32_t l_text; if (4 != bgzf_read(fp, &l_text, 4)) { return -1; // exit(1); } Rprintf("l_text = %d\n", l_text); std::string s; int64_t bgzf_offset_before_header = bgzf_tell(fp); // the beginning of header block s.resize(l_text); if (bgzf_read(fp, (void*)s.data(), l_text) != l_text) { REprintf( "Read failed!\n"); } BCFHeader bcfHeader; if (bcfHeader.parseHeader(s, &bcfHeader.header_contig_id, &bcfHeader.header_id, &bcfHeader.header_number, &bcfHeader.header_type, &bcfHeader.header_description)) { REprintf( "Parse header failed!\n"); return -1; // exit(1); } // locate #CHROM line int64_t bgzf_offset_after_header = bgzf_tell(fp); // the end of header block size_t ptr_chrom_line = s.find("#CHROM"); // the index of "#CHROM", also the size between beginning of header to '#CHROM' if (ptr_chrom_line == std::string::npos) { REprintf( "Cannot find the \"#CHROM\" line!\n"); return -1; // exit(1); } Rprintf("offset_header = %d\n", (int) ptr_chrom_line); bgzf_seek(fp, bgzf_offset_before_header, SEEK_SET); // rewind fp to the beginning of header s.resize(ptr_chrom_line); int64_t before_chrom_size = bgzf_read(fp, (void*) s.data(), ptr_chrom_line); int64_t bgzf_offset_before_chrom = bgzf_tell(fp); // the offset to #CHROM s.resize(l_text - before_chrom_size); int64_t after_chrom_size = bgzf_read(fp, (void*) s.data(), l_text - before_chrom_size); // load sample 
names while (s.back() == '\n' || s.back() == '\0') { s.resize(s.size() - 1); } stringTokenize(s, "\t", &bcfHeader.sample_names); const int64_t num_sample = (int)bcfHeader.sample_names.size() - 9; // vcf header has 9 columns CHROM...FORMAT before actual sample names Rprintf("sample size = %ld\n", num_sample); Rprintf("last character is s[after_chrom_size-1] = %d\n", s[after_chrom_size - 1]); // should be 0, the null terminator character // quality check if (bgzf_offset_after_header != bgzf_tell(fp)) { REprintf( "Messed up bgzf header\n"); return -1; // exit(1); } // create index file FILE* fIndex = fopen(indexFile_.c_str(), "wb"); int64_t num_marker = 0; int64_t pos = 0; fwrite(&num_sample, sizeof(int64_t), 1, fIndex); fwrite(&num_marker, sizeof(int64_t), 1, fIndex); fwrite(&pos, sizeof(int64_t), 1, fIndex); fwrite(&bgzf_offset_before_chrom, sizeof(int64_t), 1, fIndex); uint32_t l_shared; uint32_t l_indiv; std::vector<char> data; int64_t offset; do { offset = bgzf_tell(fp); if (4 != bgzf_read(fp, &l_shared, sizeof(uint32_t))) { break; // REprintf( "Wrong read!\n"); exit(1); } if (4 != bgzf_read(fp, &l_indiv, sizeof(uint32_t))) { break; // REprintf( "Wrong read!\n"); exit(1); } data.resize(l_shared + l_indiv); if (l_shared + l_indiv != bgzf_read(fp, data.data(), (l_shared+l_indiv) * sizeof(char))) { break; // REprintf( "Wrong read!\n"); exit(1); } memcpy(&pos, data.data() + 4, 4); fwrite(&pos, sizeof(int64_t), 1, fIndex); fwrite(&offset, sizeof(int64_t), 1, fIndex); num_marker++; if (num_marker % 10000 == 0) { Rprintf("\rprocessed %ld markers", num_marker); } } while (true); if (fseek(fIndex, 0, SEEK_SET)) { REprintf( "fseek failed\n!"); } fwrite(&num_sample, sizeof(int64_t), 1, fIndex); fwrite(&num_marker, sizeof(int64_t), 1, fIndex); fclose(fIndex); Rprintf("Indexing finished with %ld samples and %ld markers\n", num_sample, num_marker); return 0; }
// ClassifyFileType // Attempt to classify the alignment file as one of CSV, BED or SAM from it's initial 8k char contents // Currently processes CSV, BED and SAM format file types // Assumes must be SAM if initial lines have at least one prefixed by a '@' followed by a 2 letter record type code // etClassifyFileType CUtility::ClassifyFileType(char *pszFileName) { int hFile; gzFile gz; BGZF* pInBGZF; int BuffLen; int BuffIdx; UINT8 Buffer[cFileClassifyBuffLen]; UINT8 *pBuff; bool bStartNL; bool bSkipEOL; UINT8 Chr; int NumLines; int FieldCnt; int TabCnt; int CommaCnt; int FldLen; bool bInQuotes; int LikelyCSV; int LikelyBED; int LikelySAM; int LikelyNonCSVSAMBED; bool bSeenSAMHdrs; int FileNameLen; bool bGZd; FileNameLen = (int)strlen(pszFileName); bGZd = false; if(FileNameLen >= 4) { if(!stricmp(&pszFileName[FileNameLen-3],".gz")) bGZd = true; else { if(FileNameLen >= 5 && !stricmp(&pszFileName[FileNameLen-4],".bam")) { hFile = open(pszFileName,O_READSEQ); if(hFile == -1) return(eCFTopenerr); // BAM will using BGZF compression .. if((pInBGZF = bgzf_dopen(hFile, "r"))==NULL) { gDiagnostics.DiagOut(eDLFatal,gszProcName,"ClassifyFileType: unable to initialise for BGZF processing on file '%s'",pszFileName); close(hFile); return(eCFTopenerr); } hFile = -1; // try reading the header, bgzf_read will confirm it does start with "BAM\1" .... if((BuffLen = (int)bgzf_read(pInBGZF,Buffer,100)) < 100) // will be < 100 if errors ... 
{ gDiagnostics.DiagOut(eDLFatal,gszProcName,"ClassifyFileType: Not a BAM format file '%s'",pszFileName); bgzf_close(pInBGZF); return(eCFTopenerr); } bgzf_close(pInBGZF); return(eCFTSAM); } } } // now can try to actually open file and read in first cFileClassifyBuffLen chars if(bGZd) { gz = gzopen(pszFileName,"rb"); if(gz == NULL) { gDiagnostics.DiagOut(eDLFatal,gszProcName,"Open: unable to open for reading gzip'd file '%s'",pszFileName); return(eCFTopenerr); } BuffLen = gzread(gz,Buffer,sizeof(Buffer)-1); gzclose(gz); } else { hFile = open(pszFileName,O_READSEQ); if(hFile == -1) return(eCFTopenerr); // read the 1st cFileTypeBuffLen into buffer BuffLen = read(hFile,Buffer,sizeof(Buffer)-1); close(hFile); } if(BuffLen < cMinFileClassifyLen) // an arbitary lower limit! return(eCFTlenerr); Buffer[BuffLen] = '\0'; pBuff = Buffer; NumLines = 0; LikelyCSV = 0; LikelyBED = 0; LikelySAM = 0; LikelyNonCSVSAMBED = 0; BuffIdx = 0; bStartNL = true; bSeenSAMHdrs = false; while(Chr = *pBuff++) { BuffIdx += 1; if(bStartNL) { FieldCnt = 0; TabCnt = 0; CommaCnt = 0; FldLen = 0; bInQuotes = false; bStartNL = false; bSkipEOL = false; NumLines += 1; } if(Chr == '\n' || Chr == '\r') // if at end of line { bStartNL = true; bSkipEOL = false; if(FieldCnt < 3) // BED can have down to 3 fields, CSV alignment and SAM should have more continue; if(!bInQuotes) { if(CommaCnt >= 3 && CommaCnt > TabCnt) // if at least as many commas as tabs as assumed field separators then most likely a CSV file LikelyCSV += 10; else // if more tabs than commas then could be either BED or SAM { if(bSeenSAMHdrs) { LikelyBED += 5; LikelySAM += 10; // SAM would be distinguished by it's header lines starting with '@" } else { LikelyBED += 20; LikelySAM += 5; } } } continue; } if(bSkipEOL) continue; if(!FieldCnt && !FldLen && (Chr == ' ' || Chr == '\t')) // simply slough all leading whitespaces before intial field starts continue; // nested quotes are potentially a problem; currently quotes are simply sloughed if(Chr 
== '\'' || Chr == '"') { bInQuotes = !bInQuotes; continue; } if(!FieldCnt && !bInQuotes && Chr == '@' || Chr == '>') { if(Chr == '@') // if SAM then header line(s) should be present and can be expected to start with "@HD", "@SQ", "@RG", "@PG", "@CO" { if(BuffIdx < (BuffLen - 3)) { if(((*pBuff == 'H' && pBuff[1] == 'D') || (*pBuff == 'S' && pBuff[1] == 'Q') || (*pBuff == 'R' && pBuff[1] == 'G') || (*pBuff == 'P' && pBuff[1] == 'G') || (*pBuff == 'C' && pBuff[1] == 'O')) && (pBuff[2] == ' ' || pBuff[2] == '\t' )) { bSeenSAMHdrs = true; LikelyNonCSVSAMBED = 0; LikelySAM += 10000; bSkipEOL = true; continue; } else { if(!bSeenSAMHdrs) // if no SAM headers parsed then could easily be a fastq... { LikelyNonCSVSAMBED += 50; bSkipEOL = true; continue; } } } } if(Chr == '>') // if at start of line then could easily be fasta... LikelyNonCSVSAMBED += 50; } switch(Chr) { case ' ': // simply slough spaces continue; case ',': // if comma then likely is a csv, but could still be BED if in optional fields 9 (itemRgb) onwards if(TabCnt < 8 && FieldCnt >= TabCnt) { FieldCnt += 1; CommaCnt += 1; FldLen = 0; } break; case '\t': // tabs are in BED and SAM as field separators, but could also be present in CSV as spacers if(CommaCnt < 3 && FieldCnt >= CommaCnt) { FieldCnt += 1; TabCnt += 1; FldLen = 0; } break; default: // any other char is assumed to be part of an actual field value FldLen += 1; break; } } if(LikelyNonCSVSAMBED >= 250 || (LikelyCSV < 10 && LikelyBED < 10 && LikelySAM < 500)) return(eCFTunknown); if(LikelyCSV >= LikelyBED && LikelyCSV >= LikelySAM) return(eCFTCSV); if(LikelyBED >= LikelySAM) return(eCFTBED); return(eCFTSAM); }