/*
 * Rewinds fastq_file to offset 0, counts every '@' byte in the data returned
 * by bgzf_read(), rewinds the file again and returns the count.
 *
 * NOTE(review): every '@' byte is counted, not only those that start a
 * line - presumably this is meant as an upper bound on the number of
 * records; confirm against the caller.
 */
size_t count_fastq_sequences(BGZF *fastq_file) {
    char chunk[ BUFSIZ ];
    cmph_uint32 n_markers = 0;
    ssize_t got;

    bgzf_seek( fastq_file, 0, SEEK_SET );

    /* A zero or negative read result ends the scan. */
    while( ( got = bgzf_read( fastq_file, chunk, BUFSIZ ) ) > 0 ) {
        const char *cursor = chunk;
        const char *stop = chunk + got;
        for( ; cursor != stop; ++cursor ) {
            if( *cursor == '@' ) {
                n_markers++;
            }
        }
    }

    bgzf_seek( fastq_file, 0, SEEK_SET );
    return n_markers;
}
/*
 * Looks `query` up in `index` and fills `record` from the FASTQ file.
 *
 * The hash value of `query` selects a slot in index->table, whose content is
 * used as the seek target in index->fastq_file. The line found there is read
 * into record->name and compared with `query` over the length reported by
 * read_one_line(); on a match three more lines are read.
 *
 * Returns IFQ_OK on a match, IFQ_NOT_FOUND when the seek fails or the name
 * line does not compare equal.
 */
ifq_codes_t ifq_query_index(ifq_index_t *index, char *query, ifq_record_t *record) {
    cmph_uint32 line_len;

    /* Find key */
    unsigned int slot = cmph_search( index->hash, query, (cmph_uint32) strlen( query ) );
    uint64_t file_pos = index->table[ slot ];

    if( bgzf_seek( index->fastq_file, file_pos, SEEK_SET ) < 0 ) {
        return IFQ_NOT_FOUND;
    }

    read_one_line( &record->name, &line_len, index->fastq_file );
    if( strncmp( record->name, query, line_len ) != 0 ) {
        return IFQ_NOT_FOUND;
    }

    read_one_line( &record->sequence, &line_len, index->fastq_file );
    /* The third line is read into `quality` and then replaced by the fourth. */
    read_one_line( &record->quality, &line_len, index->fastq_file );
    read_one_line( &record->quality, &line_len, index->fastq_file );

    return IFQ_OK;
}
// Opens every BAM file in bam_fnames, optionally seeks each one to the
// matching entry of file_offsets, and queues the first record of each file.
//
// bam_fnames   - paths of the BAM files to merge.
// file_offsets - per-file BGZF offsets; only honoured when it has exactly one
//                entry per file name, and only for entries > 0.
//
// Exits the process when a file cannot be opened or when no file yields a
// record. Files that yield no first record are closed immediately instead of
// being leaked; only files with a queued record are kept in _src_files.
BamMerge::BamMerge(const vector<string>& bam_fnames, vector<int64_t> file_offsets)
    : _bam_fnames(bam_fnames), _lines(less_bam(true)), _last_id(0) {
  if (bam_fnames.empty()) return;

  const bool use_offsets = (bam_fnames.size() == file_offsets.size());

  for (size_t i = 0; i < _bam_fnames.size(); ++i) {
    const char* fname = _bam_fnames[i].c_str();
    samfile_t* fp = samopen(fname, "rb", 0);
    if (fp == 0) {
      warn_msg(ERR_BAM_OPEN, fname);
      exit(1);
    }
    if (use_offsets && file_offsets[i] > 0)
      bgzf_seek(fp->x.bam, file_offsets[i], SEEK_SET);

    bam1_t* b = bam_init1();
    if (samread(fp, b) > 0) {
      _src_files.push_back(fp);
      // The record is tagged with the current queue size at insertion time.
      CBamLine brec(_lines.size(), b, fp->header);
      _lines.push(brec);
    } else {
      // Nothing to merge from this file: release the record and the handle.
      bam_destroy1(b);
      samclose(fp);
    }
  }

  if (_lines.size() == 0) {
    warn_msg("Warning: no input BAM records found.\n");
    exit(1);
  }
}
int print(int argc, char**argv){ if(argc==0){ fprintf(stderr,"print FILE [-r chrName]\n"); exit(0); } char *base = *argv; char* outnames_bin = append(base,BIN); char* outnames_idx = append(base,IDX); fprintf(stderr,"Assuming binfile:%s and indexfile:%s\n",outnames_bin,outnames_idx); myMap mm = getMap(outnames_idx); writemap(stderr,mm); BGZF *fp = bgzf_open(outnames_bin,"r"); --argc;++argv; // fprintf(stderr,"argc=%d\n",argc); int argP =0; char *chr=NULL; while(argP<argc){ // fprintf(stderr,"args=%s\n",argv[argP]); if(argP==argc){ fprintf(stderr,"incomplete arguments list\n"); exit(0); } if(strcmp("-r",argv[argP])==0) chr = argv[argP+1]; else { fprintf(stderr,"Unknown argument:%s\n",argv[argP]); exit(0); } argP +=2; } if(chr!=NULL){ myMap::iterator it = mm.find(chr); if(it==mm.end()){ fprintf(stderr,"Problem finding chr: %s in index\n",chr); exit(0); } datum d = it->second; bgzf_seek(fp,d.fpos,SEEK_SET); } while(1){ perChr pc = getPerChr(fp); if(pc.nSites==0) break; fprintf(stderr,"pc.chr=%s pc.nSites=%zu firstpos=%d lastpos=%d\n",pc.chr,pc.nSites,pc.posi[0],pc.posi[pc.nSites-1]); print_main(pc,stdout); if(chr!=NULL) break; dalloc(pc); } return 0; }
uint64_t read_chunk(double **chunk_data, params *pars, uint64_t chunk) { uint64_t total_elems_read = 0; if(chunk >= pars->n_chunks) error("invalid chunk number!"); // Define chunk start and end positions uint64_t start_pos = chunk * pars->max_chunk_size; uint64_t end_pos = start_pos + pars->max_chunk_size - 1; if(end_pos >= pars->n_sites) end_pos = pars->n_sites - 1; uint64_t chunk_size = end_pos - start_pos + 1; if( pars->verbose >= 6 ) printf("\tReading chunk %lu from position %lu to %lu (%lu)\n", chunk+1, start_pos, end_pos, chunk_size); // Search start position #ifdef _USE_BGZF if( bgzf_seek(pars->in_glf_fh, pars->chunks_voffset[chunk], SEEK_SET) < 0 ) error("cannot seek GLF file (BGZF)!"); #endif // Read data from file for(uint64_t c = 0; c < chunk_size; c++) { #ifdef _USE_BGZF int bytes_read = bgzf_read(pars->in_glf_fh, chunk_data[c], (int) pars->n_ind * 3 * sizeof(double)); if(pars->call_geno) call_geno(chunk_data[c], pars->n_ind, 3); uint64_t elems_read = (uint64_t) bytes_read / sizeof(double); #else chunk_data[c] = pars->data[start_pos+c]; uint64_t elems_read = pars->n_ind * 3; #endif if( elems_read != pars->n_ind * 3 ) error("cannot read GLF file!"); total_elems_read += elems_read; } #ifdef _USE_BGZF // Update index for next chunk if( chunk+1 != pars->n_chunks && pars->chunks_voffset[chunk+1] == 0 ) pars->chunks_voffset[chunk+1] = bgzf_tell(pars->in_glf_fh); #endif return( total_elems_read/(pars->n_ind * 3) ); }
int SingleChromosomeBCFIndex::readLine(int64_t offset, uint32_t* l_shared, uint32_t* l_indiv, std::vector<char>* line) { if (bgzf_seek(fBcfFile_, offset, SEEK_SET)) { REprintf("seek error!\n"); } if (4 != bgzf_read(fBcfFile_, l_shared, sizeof(uint32_t)) || 4 != bgzf_read(fBcfFile_, l_indiv, sizeof(uint32_t))) { REprintf("readLine error!\n"); } uint32_t totalLen = *l_shared + *l_indiv; line->resize(totalLen); if ( totalLen != bgzf_read(fBcfFile_, line->data(), totalLen)) { REprintf("readLine bgzf_read error!\n"); } return totalLen; }
//hint is the suggested newsize void filt_readSites(filt*fl,char *chr,size_t hint) { assert(fl!=NULL); std::map<char*,asdf_dats,ltstr> ::iterator it = fl->offs.find(chr); if(it==fl->offs.end()){ fprintf(stderr,"\n\t-> Potential problem: The filereading has reached a chromsome: \'%s\', which is not included in your \'-sites\' file.\n\t-> Please consider limiting your analysis to the chromsomes of interest \n",chr); fprintf(stderr,"\t-> see \'http://www.popgen.dk/angsd/index.php/Sites\' for more information\n"); fprintf(stderr,"\t-> Program will continue reading this chromosome... \n"); //exit(0); free(fl->keeps); free(fl->minor); free(fl->major); fl->keeps=fl->minor=fl->major=NULL; fl->curLen =0; return; } bgzf_seek(fl->bg,it->second.offs,SEEK_SET); size_t nsize = std::max(fl->curLen,hint); nsize = std::max(nsize,it->second.len); if(nsize>fl->curLen) fl->keeps=(char*) realloc(fl->keeps,nsize); memset(fl->keeps,0,nsize); //fprintf(stderr,"it->second.len:%lu fl->curLen:%lu fl->keeps:%p\n",it->second.len,fl->curLen,fl->keeps); bgzf_read(fl->bg,fl->keeps,it->second.len); if(fl->hasMajMin==1){ if(nsize>fl->curLen) { fl->major = (char*) realloc(fl->major,nsize); fl->minor = (char*) realloc(fl->minor,nsize); memset(fl->major,0,nsize); memset(fl->minor,0,nsize); } bgzf_read(fl->bg,fl->major,it->second.len); bgzf_read(fl->bg,fl->minor,it->second.len); } fl->curNam=chr; fl->curLen = nsize; }
ifq_codes_t ifq_create_index(char *fastq_path, char *index_prefix) { char *hash_path = concatenate( index_prefix, ".hsh" ); char *seek_path = concatenate( index_prefix, ".lup" ); ifq_codes_t ret = IFQ_OK; /* Open output files */ BGZF *fastq_file = bgzf_open( fastq_path, "r" ); if( fastq_file == NULL ) { ret = IFQ_BAD_FASTQ; goto index_fastq_fail; } FILE *hash_file = fopen( hash_path, "w" ); if( hash_file == NULL ) { ret = IFQ_BAD_PREFIX; goto index_prefix_fail; } /* Create hash function */ cmph_io_adapter_t *source = cmph_io_fastq_adapter( fastq_file ); if( source == NULL ) { ret = IFQ_BAD_HASH; goto index_prefix_fail; } cmph_config_t *config = cmph_config_new( source ); cmph_config_set_algo( config, CMPH_CHD ); cmph_config_set_mphf_fd( config, hash_file ); cmph_t *hash = cmph_new( config ); if( hash == NULL ) { ret = IFQ_BAD_HASH; goto index_hash_fail; } /* Create the file index using the hash */ bgzf_seek( fastq_file, 0, SEEK_SET ); if( create_index( fastq_file, hash, seek_path ) != 1 ) { ret = IFQ_BAD_INDEX; goto index_create_fail; } index_fastq_fail: free( hash_path ); free( seek_path ); index_create_fail: cmph_config_destroy( config ); cmph_dump( hash, hash_file ); cmph_destroy( hash ); free( source ); index_hash_fail: fclose( hash_file ); index_prefix_fail: bgzf_close( fastq_file ); return ret; }
int do_stat(int argc, char**argv){ if(argc==0){ fprintf(stderr,"do_stat FILE -win -step -nChr [-r chrName -type [0,1,2]]\n"); exit(0); } char *base = *argv; char* outnames_bin = append(base,BIN); char* outnames_idx = append(base,IDX); fprintf(stderr,"\tAssuming binfile:%s and indexfile:%s\n",outnames_bin,outnames_idx); myMap mm = getMap(outnames_idx); writemap(stderr,mm); BGZF *fp = bgzf_open(outnames_bin,"r"); --argc;++argv; // fprintf(stderr,"argc=%d\n",argc); int argP =0; char *chr=NULL; char *outnames = NULL; int nChr =0; int win =0; int step =0; int type =0; while(argP<argc){ // fprintf(stderr,"args=%s\n",argv[argP]); if(argP==argc){ fprintf(stderr,"incomplete arguments list\n"); exit(0); } if(strcmp("-r",argv[argP])==0) chr = argv[argP+1]; else if(strcmp("-outnames",argv[argP])==0) outnames = argv[argP+1]; else if(strcmp("-step",argv[argP])==0) step = atoi(argv[argP+1]); else if(strcmp("-win",argv[argP])==0) win = atoi(argv[argP+1]); else if(strcmp("-nChr",argv[argP])==0) nChr = atoi(argv[argP+1]); else if(strcmp("-type",argv[argP])==0) type = atoi(argv[argP+1]); else { fprintf(stderr,"Unknown argument:%s\n",argv[argP]); exit(0); } argP +=2; } fprintf(stderr,"\t -r=%s outnames=%s step: %d win: %d nChr:%d\n",chr,outnames,step,win,nChr); if(nChr==0){ fprintf(stderr,"nChr must be different from zero\n"); exit(0); } if(win==0||step==0){ fprintf(stderr,"\tWinsize equals zero or step size equals zero. 
Will use entire chromosome as window\n"); win=step=0; } if(chr!=NULL){ myMap::iterator it = mm.find(chr); if(it==mm.end()){ fprintf(stderr,"\tProblem finding chr: %s in index\n",chr); exit(0); } datum d = it->second; bgzf_seek(fp,d.fpos,SEEK_SET); } if(outnames==NULL) outnames = base; char *resname = append(outnames,RES); FILE *fpres = fopen(resname,"w"); //fprintf(fpres,"## thetaStat VERSION: %s build:(%s,%s)\n",VERSION,__DATE__,__TIME__); fprintf(fpres,"#(indexStart,indexStop)(firstPos_withData,lastPos_withData)(WinStart,WinStop)\t"); fprintf(fpres,"Chr\tWinCenter\t"); fprintf(fpres,"tW\ttP\ttF\ttH\ttL\t"); fprintf(fpres,"Tajima\tfuf\tfud\tfayh\tzeng\tnSites\n"); while(1){ perChr pc = getPerChr(fp); if(pc.nSites==0) break; fprintf(stderr,"\tpc.chr=%s pc.nSites=%zu firstpos=%d lastpos=%d\n",pc.chr,pc.nSites,pc.posi[0],pc.posi[pc.nSites-1]); kstring_t str = do_stat_main(pc,step,win,nChr,type); fwrite(str.s,1,str.l,fpres);//should clean up str, doesn't matter for this program; fflush(fpres); if(chr!=NULL) break; dalloc(pc); } fclose(fpres); fprintf(stderr,"\tDumping file: \"%s\"\n",resname); return 0; }
int fst_print(int argc,char **argv){ char *bname = *argv; fprintf(stderr,"\t-> Assuming idxname:%s\n",bname); perfst *pf = perfst_init(bname); writefst_header(stderr,pf); args *pars = getArgs(--argc,++argv); int *ppos = NULL; fprintf(stderr,"choose:%d \n",choose(pf->names.size(),2)); double **ares = new double*[choose(pf->names.size(),2)]; double **bres = new double*[choose(pf->names.size(),2)]; for(myFstMap::iterator it=pf->mm.begin();it!=pf->mm.end();++it){ if(pars->chooseChr!=NULL){ it = pf->mm.find(pars->chooseChr); if(it==pf->mm.end()){ fprintf(stderr,"Problem finding chr: %s\n",pars->chooseChr); break; } } if(it->second.nSites==0) continue; bgzf_seek(pf->fp,it->second.off,SEEK_SET); ppos = new int[it->second.nSites]; bgzf_read(pf->fp,ppos,sizeof(int)*it->second.nSites); for(int i=0;i<choose(pf->names.size(),2);i++){ ares[i] = new double[it->second.nSites]; bres[i] = new double[it->second.nSites]; bgzf_read(pf->fp,ares[i],sizeof(double)*it->second.nSites); bgzf_read(pf->fp,bres[i],sizeof(double)*it->second.nSites); } int first=0; if(pars->start!=-1) while(ppos[first]<pars->start) first++; int last=it->second.nSites; if(pars->stop!=-1&&pars->stop<=ppos[last-1]){ last=first; while(ppos[last]<pars->stop) last++; } fprintf(stderr,"pars->stop:%d ppos:%d first:%d last:%d\n",pars->stop,ppos[last-1],first,last); for(int s=first;s<last;s++){ fprintf(stdout,"%s\t%d",it->first,ppos[s]+1); for(int i=0;i<choose(pf->names.size(),2);i++) fprintf(stdout,"\t%f\t%f",ares[i][s],bres[i][s]); fprintf(stdout,"\n"); } for(int i=0;i<choose(pf->names.size(),2);i++){ delete [] ares[i]; delete [] bres[i]; } delete [] ppos; if(pars->chooseChr!=NULL) break; } delete [] ares; delete [] bres; destroy_args(pars); perfst_destroy(pf); return 0; }
int fst_stat(int argc,char **argv){ char *bname = *argv; fprintf(stderr,"\t-> Assuming idxname:%s\n",bname); perfst *pf = perfst_init(bname); args *pars = getArgs(--argc,++argv); int *ppos = NULL; int chs = choose(pf->names.size(),2); // fprintf(stderr,"choose:%d \n",chs); double **ares = new double*[chs]; double **bres = new double*[chs]; double unweight[chs]; double wa[chs]; double wb[chs]; size_t nObs[chs]; for(int i=0;i<chs;i++){ unweight[i] = wa[i] = wb[i] =0.0; nObs[i] = 0; } for(myFstMap::iterator it=pf->mm.begin();it!=pf->mm.end();++it){ if(pars->chooseChr!=NULL){ it = pf->mm.find(pars->chooseChr); if(it==pf->mm.end()){ fprintf(stderr,"Problem finding chr: %s\n",pars->chooseChr); break; } } if(it->second.nSites==0) continue; bgzf_seek(pf->fp,it->second.off,SEEK_SET); ppos = new int[it->second.nSites]; bgzf_read(pf->fp,ppos,sizeof(int)*it->second.nSites); for(int i=0;i<choose(pf->names.size(),2);i++){ ares[i] = new double[it->second.nSites]; bres[i] = new double[it->second.nSites]; bgzf_read(pf->fp,ares[i],sizeof(double)*it->second.nSites); bgzf_read(pf->fp,bres[i],sizeof(double)*it->second.nSites); } int first=0; if(pars->start!=-1) while(ppos[first]<pars->start) first++; int last=it->second.nSites; if(pars->stop!=-1&&pars->stop<=ppos[last-1]){ last=first; while(ppos[last]<pars->stop) last++; } // fprintf(stderr,"pars->stop:%d ppos:%d first:%d last:%d\n",pars->stop,ppos[last-1],first,last); for(int s=first;s<last;s++){ #if 0 fprintf(stdout,"%s\t%d",it->first,ppos[s]+1); for(int i=0;i<choose(pf->names.size(),2);i++) fprintf(stdout,"\t%f\t%f",ares[i][s],bres[i][s]); fprintf(stdout,"\n"); #endif for(int i=0;i<choose(pf->names.size(),2);i++){ if(bres[i][s]!=0){ unweight[i] += ares[i][s]/bres[i][s]; nObs[i]++; } wa[i] += ares[i][s]; wb[i] += bres[i][s]; } } for(int i=0;i<choose(pf->names.size(),2);i++){ delete [] ares[i]; delete [] bres[i]; } delete [] ppos; if(pars->chooseChr!=NULL) break; } double fstUW[chs]; double fstW[chs]; for(int i=0;i<chs;i++){ fstUW[i] 
= unweight[i]/(1.0*nObs[i]); fstW[i] = wa[i]/wb[i]; fprintf(stderr,"\t-> FST.Unweight[nObs:%lu]:%f Fst.Weight:%f\n",nObs[i],fstUW[i],fstW[i]); fprintf(stdout,"%f %f\n",fstUW[i],fstW[i]); } if(chs==3){ //if chr==3 then we have 3pops and we will also calculate pbs statistics calcpbs(fstW);//<- NOTE: the pbs values will replace the fstW values for(int i=0;i<3;i++) fprintf(stderr,"\t-> pbs.pop%d\t%f\n",i+1,fstW[i]); } delete [] ares; delete [] bres; destroy_args(pars); perfst_destroy(pf); return 0; }
// Sliding-window variant of the fst statistics: for every chromosome (or
// only pars->chooseChr) it loads positions and the two per-pair arrays, then
// prints one line per window with the index range, position range, window
// bounds, chromosome, window centre and the per-pair unweighted/weighted
// values (plus the calcpbs() output when there are exactly 3 pairs).
//
// argv[0] is the index name handed to perfst_init(); the remaining arguments
// are parsed by getArgs() (win, step, type, start, stop, chooseChr are used).
// Always returns 0.
//
// NOTE(review): the return value of perfst_init() is used without a NULL
// check.
int fst_stat2(int argc,char **argv){
  int pS,pE;//physical start,physical end
  int begI,endI;//position in array for pS, pE;
  char *bname = *argv;
  fprintf(stderr,"\t-> Assuming idxname:%s\n",bname);
  perfst *pf = perfst_init(bname);
  args *pars = getArgs(--argc,++argv);
  fprintf(stderr,"win:%d step:%d\n",pars->win,pars->step);
  int *ppos = NULL;
  int chs = choose(pf->names.size(),2);
  // fprintf(stderr,"choose:%d \n",chs);
  double **ares = new double*[chs];
  double **bres = new double*[chs];
  double unweight[chs];
  double wa[chs];
  double wb[chs];
  size_t nObs =0;
  for(myFstMap::iterator it=pf->mm.begin();it!=pf->mm.end();++it){
    if(pars->chooseChr!=NULL){
      it = pf->mm.find(pars->chooseChr);
      if(it==pf->mm.end()){
        fprintf(stderr,"Problem finding chr: %s\n",pars->chooseChr);
        break;
      }
    }
    fprintf(stderr,"nSites:%lu\n",it->second.nSites);
    // An empty chosen chromosome ends the loop; an empty chromosome during a
    // full scan is skipped.
    if(it->second.nSites==0&&pars->chooseChr!=NULL)
      break;
    else if(it->second.nSites==0&&pars->chooseChr==NULL)
      continue;
    bgzf_seek(pf->fp,it->second.off,SEEK_SET);
    ppos = new int[it->second.nSites];
    bgzf_read(pf->fp,ppos,sizeof(int)*it->second.nSites);
    // Shift every stored position by one before windowing.
    for(int i=0;i<it->second.nSites;i++)
      ppos[i]++;
    for(int i=0;i<choose(pf->names.size(),2);i++){
      ares[i] = new double[it->second.nSites];
      bres[i] = new double[it->second.nSites];
      bgzf_read(pf->fp,ares[i],sizeof(double)*it->second.nSites);
      bgzf_read(pf->fp,bres[i],sizeof(double)*it->second.nSites);
    }
    // Start of the first window: type 0 rounds the start down to a multiple
    // of step and adds one step, type 1 uses the start as is, type 2 uses 1.
    // NOTE(review): no branch assigns pS for any other pars->type value.
    if(pars->type==0)
      pS = ((pars->start!=-1?pars->start:ppos[0])/pars->step)*pars->step +pars->step;
    else if(pars->type==1)
      pS = (pars->start!=-1?pars->start:ppos[0]);
    else if(pars->type==2)
      pS = 1;
    pE = pS+pars->win;
    begI=endI=0;
    // fprintf(stderr,"ps:%d\n",pS);exit(0);
    if(pE>(pars->stop!=-1?pars->stop:ppos[it->second.nSites-1])){
      fprintf(stderr,"end of dataset is before end of window: end of window:%d last position in chr:%d\n",pE,ppos[it->second.nSites-1]);
      // return str;
    }
    // Advance begI/endI to the first positions >= pS / >= pE.
    // NOTE(review): neither scan checks the index against nSites, and
    // execution reaches them even after the warning above.
    while(ppos[begI]<pS) begI++;
    endI=begI;
    while(ppos[endI]<pE) endI++;
    //fprintf(stderr,"begI:%d endI:%d\n",begI,endI);
    while(1){
      // Reset the accumulators for this window.
      for(int i=0;i<chs;i++)
        unweight[i] = wa[i] = wb[i] =0.0;
      nObs=0;
      fprintf(stdout,"(%d,%d)(%d,%d)(%d,%d)\t%s\t%d",begI,endI-1,ppos[begI],ppos[endI-1],pS,pE,it->first,pS+(pE-pS)/2);
      for(int s=begI;s<endI;s++){
#if 0
        fprintf(stdout,"%s\t%d",it->first,ppos[s]+1);
        for(int i=0;i<choose(pf->names.size(),2);i++)
          fprintf(stdout,"\t%f\t%f",ares[i][s],bres[i][s]);
        fprintf(stdout,"\n");
#endif
        for(int i=0;i<choose(pf->names.size(),2);i++){
          unweight[i] += ares[i][s]/bres[i][s];
          wa[i] += ares[i][s];
          wb[i] += bres[i][s];
        }
        nObs++;
      }
      // fstW is only filled (and the pair columns only printed) when the
      // window contains at least one site.
      double fstW[chs];
      for(int i=0;nObs>0&&i<chs;i++){
        fstW[i] = wa[i]/wb[i];
        fprintf(stdout,"\t%f\t%f",unweight[i]/(1.0*nObs),fstW[i]);
      }
      // NOTE(review): calcpbs() is called regardless of nObs, i.e. also when
      // fstW was not assigned in this iteration.
      if(chs==3){
        //if chs==3 then we have 3pops and we will also calculate pbs statistics
        calcpbs(fstW);//<- NOTE: the pbs values will replace the fstW values
        for(int i=0;i<3;i++)
          fprintf(stdout,"\t%f",fstW[i]);
      }
      fprintf(stdout,"\n");
      // Slide the window by one step; stop once its end passes pars->stop
      // (or the last position when no stop was given).
      pS += pars->step;
      pE =pS+pars->win;
      if(pE>(pars->stop!=-1?pars->stop:ppos[it->second.nSites-1]))
        break;
      while(ppos[begI]<pS) begI++;
      while(ppos[endI]<pE) endI++;
    }
    for(int i=0;i<choose(pf->names.size(),2);i++){
      delete [] ares[i];
      delete [] bres[i];
    }
    delete [] ppos;
    if(pars->chooseChr!=NULL)
      break;
  }
  delete [] ares;
  delete [] bres;
  destroy_args(pars);
  perfst_destroy(pf);
  return 0;
}
int main(int argc, char *argv[]) { if (argc <= 1) { fprintf(stderr, "Usage: thrash_threads4 input.bam\n"); exit(1); } // Find a valid seek location ~64M into the file int i; ssize_t got; BGZF *fpin = bgzf_open(argv[1], "r"); uint64_t upos = 0, uend = 0; char buf[100000]; for (i = 0; i < 100; i++) { if ((got = bgzf_read(fpin, buf, 65536)) < 0) abort(); upos += got; } int64_t pos = bgzf_tell(fpin); while ((got = bgzf_read(fpin, buf, 65536)) > 0) { uend += got; } if (got < 0) abort(); int64_t end = bgzf_tell(fpin); bgzf_close(fpin); // Ensure input is big enough to avoid case 3,4 below going off the end // of the file if (uend < upos + 10000000) { fprintf(stderr, "Please supply a bigger input file\n"); exit(1); } #define N 1000 // Spam random seeks & reads for (i = 0; i < 1000; i++) { printf("i=%d\t", i); fpin = bgzf_open(argv[1], "r"); int j, eof = 0, mt = 0; for (j = 0; j < 80; j++) { int n = rand() % 7; putchar('0'+n); fflush(stdout); switch (n) { case 0: // start if (bgzf_seek(fpin, 0LL, SEEK_SET) < 0) puts("!");//abort(); eof = 0; break; case 1: // mid if (bgzf_seek(fpin, pos, SEEK_SET) < 0) puts("!");//abort(); eof = 0; break; case 2: // eof if (bgzf_seek(fpin, end, SEEK_SET) < 0) puts("!");//abort(); eof = 1; break; case 3: case 4: { int l = rand()%(n==3?100000:100); if (bgzf_read(fpin, buf, l) != l*(1-eof)) abort(); break; } case 5: usleep(N); break; case 6: if (!mt) bgzf_mt(fpin, 8, 256); mt = 1; break; } } printf("\n"); if (bgzf_close(fpin)) abort(); } return 0; }
int main(int argc, char **argv) { int c, compress, pstdout, is_forced; BGZF *fp; void *buffer; long start, end, size; compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0; while((c = getopt(argc, argv, "cdhfb:s:")) >= 0){ switch(c){ case 'h': return bgzip_main_usage(); case 'd': compress = 0; break; case 'c': pstdout = 1; break; case 'b': start = atol(optarg); break; case 's': size = atol(optarg); break; case 'f': is_forced = 1; break; } } if (size >= 0) end = start + size; if (end >= 0 && end < start) { fprintf(stderr, "[bgzip] Illegal region: [%ld, %ld]\n", start, end); return 1; } if (compress == 1) { struct stat sbuf; int f_src = fileno(stdin); int f_dst = fileno(stdout); if ( argc>optind ) { if ( stat(argv[optind],&sbuf)<0 ) { fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); return 1; } if ((f_src = open(argv[optind], O_RDONLY)) < 0) { fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); return 1; } if (pstdout) f_dst = fileno(stdout); else { char *name = malloc(strlen(argv[optind]) + 5); strcpy(name, argv[optind]); strcat(name, ".gz"); f_dst = write_open(name, is_forced); if (f_dst < 0) return 1; free(name); } } else if (!pstdout && isatty(fileno((FILE *)stdout)) ) return bgzip_main_usage(); fp = bgzf_fdopen(f_dst, "w"); buffer = malloc(WINDOW_SIZE); while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0) if (bgzf_write(fp, buffer, c) < 0) fail(fp); // f_dst will be closed here if (bgzf_close(fp) < 0) fail(fp); if (argc > optind && !pstdout) unlink(argv[optind]); free(buffer); close(f_src); return 0; } else { struct stat sbuf; int f_dst; if ( argc>optind ) { if ( stat(argv[optind],&sbuf)<0 ) { fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); return 1; } char *name; int len = strlen(argv[optind]); if ( strcmp(argv[optind]+len-3,".gz") ) { fprintf(stderr, "[bgzip] %s: unknown suffix -- ignored\n", argv[optind]); return 1; } fp = bgzf_open(argv[optind], "r"); if (fp == NULL) { fprintf(stderr, 
"[bgzip] Could not open file: %s\n", argv[optind]); return 1; } if (pstdout) { f_dst = fileno(stdout); } else { name = strdup(argv[optind]); name[strlen(name) - 3] = '\0'; f_dst = write_open(name, is_forced); free(name); } } else if (!pstdout && isatty(fileno((FILE *)stdin)) ) return bgzip_main_usage(); else { f_dst = fileno(stdout); fp = bgzf_fdopen(fileno(stdin), "r"); if (fp == NULL) { fprintf(stderr, "[bgzip] Could not read from stdin: %s\n", strerror(errno)); return 1; } } buffer = malloc(WINDOW_SIZE); if (bgzf_seek(fp, start, SEEK_SET) < 0) fail(fp); while (1) { if (end < 0) c = bgzf_read(fp, buffer, WINDOW_SIZE); else c = bgzf_read(fp, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start)); if (c == 0) break; if (c < 0) fail(fp); start += c; write(f_dst, buffer, c); if (end >= 0 && start >= end) break; } free(buffer); if (bgzf_close(fp) < 0) fail(fp); if (!pstdout) unlink(argv[optind]); return 0; } }
// Reads `n` items of `size` bytes into `dst`, terminating the process with a
// message when fewer items are available.
static void perfst_read_or_die(void *dst,size_t size,size_t n,FILE *fp,const char *fname,int line){
  if(fread(dst,size,n,fp)!=n){
    fprintf(stderr,"[%s:%d] Problem reading data: %s \n",__FILE__,line,fname);
    exit(0);
  }
}

// Loads the index file `fname` and opens the matching data file.
//
// The index holds 8 leading bytes, the list of names, and one
// (chromosome, nSites, offset) entry per chromosome. The data file name is
// derived by replacing the last three characters of `fname` with "gz"; it is
// opened with bgzf_open() and positioned at offset 8.
//
// Returns the populated perfst, or NULL when the versions reported by
// fstversion() for the two files differ. Unreadable or malformed input
// terminates the process with exit(0), as in the rest of this function.
//
// The reads are performed outside assert() so they still happen when the
// code is compiled with NDEBUG.
perfst * perfst_init(char *fname){
  perfst *ret = new perfst ;
  ret->nSites =0;
  size_t clen;
  if(!fexists(fname)){
    fprintf(stderr,"\t-> Problem opening file: \'%s\'\n",fname);
    exit(0);
  }
  FILE *fp = NULL;
  fp=fopen(fname,"r");
  if(fp==NULL){
    fprintf(stderr,"\t-> Problem opening file:%s\n",fname);
    exit(0);
  }
  char buf[8];
  perfst_read_or_die(buf,1,8,fp,fname,__LINE__);
  ret->version=fstversion(fname);

  //read names
  size_t nit=0;
  perfst_read_or_die(&nit,sizeof(size_t),1,fp,fname,__LINE__);
  //fprintf(stderr,"nit:%lu\n",nit);
  for(size_t i=0;i<nit;i++){
    size_t namLen;
    perfst_read_or_die(&namLen,sizeof(size_t),1,fp,fname,__LINE__);
    //fprintf(stderr,"clen:%lu\n",clen);
    char *nam =(char*) calloc(namLen+1,1);
    perfst_read_or_die(nam,sizeof(char),namLen,fp,fname,__LINE__);
    ret->names.push_back(nam);
  }

  // One entry per chromosome until the end of the file.
  while(fread(&clen,sizeof(size_t),1,fp)){
    char *chr = (char*)calloc(clen+1,1);
    perfst_read_or_die(chr,1,clen,fp,fname,__LINE__);

    dat d;
    if(1!=fread(&d.nSites,sizeof(size_t),1,fp)){
      fprintf(stderr,"[%s.%s():%d] Problem reading data: %s \n",__FILE__,__FUNCTION__,__LINE__,fname);
      exit(0);
    }
    ret->nSites += d.nSites;
    if(1!=fread(&d.off,sizeof(int64_t),1,fp)){
      fprintf(stderr,"[%s->%s():%d] Problem reading data: %s \n",__FILE__,__FUNCTION__,__LINE__,fname);
      exit(0);
    }
    myFstMap::iterator it = ret->mm.find(chr);
    if(it==ret->mm.end())
      ret->mm[chr] =d ;
    else{
      fprintf(stderr,"Problem with chr: %s, key already exists\n",chr);
      exit(0);
    }
  }
  fclose(fp);

  // Derive the data file name: drop the last three characters, append "gz".
  const size_t flen = strlen(fname);
  if(flen<3){
    fprintf(stderr,"\t-> Problem deriving .fst.gz name from: \'%s\'\n",fname);
    exit(0);
  }
  char *tmp =(char*)calloc(flen+100,1);//that should do it
  tmp=strncpy(tmp,fname,flen-3);
  // fprintf(stderr,"tmp:%s\n",tmp);
  char *tmp2 = (char*)calloc(flen+100,1);//that should do it
  snprintf(tmp2,flen+100,"%sgz",tmp);
  fprintf(stderr,"\t-> Assuming .fst.gz file: %s\n",tmp2);

  if(ret->version!=fstversion(tmp2)){
    fprintf(stderr,"\t-> Version mismatch: %d %d\n",ret->version,fstversion(tmp2));
    free(tmp);
    free(tmp2);
    return NULL;
  }
  ret->fp = bgzf_open(tmp2,"r");
  if(ret->fp==NULL){
    fprintf(stderr,"\t-> Problem opening file: \'%s\'\n",tmp2);
    exit(0);
  }
  bgzf_seek(ret->fp,8,SEEK_SET);
  free(tmp);
  free(tmp2);
  // writefst_header(stderr,ret);
  return ret;
}
/** * Create single chromosome index file * the file content is a 2-column matrix of int64_t type * line1: num_sample num_marker * line2: 0 bgzf_offset_for_#CHROM_line * line3: var_1_pos bgzf_offset_for_var_1 * ... */ int SingleChromosomeBCFIndex::createIndex() { // const char* fn = bcfFile_.c_str(); BGZF* fp = fBcfFile_; // bgzf_open(fn, "rb"); bgzf_seek(fp, 0, SEEK_SET); // check magic number char magic[5]; if (5 != bgzf_read(fp, magic, 5)) { return -1; // exit(1); } if (!(magic[0] == 'B' && magic[1] == 'C' && magic[2] == 'F' && magic[3] == 2 && (magic[4] == 1 || magic[4] == 2))) { return -1; // exit(1); } // read header uint32_t l_text; if (4 != bgzf_read(fp, &l_text, 4)) { return -1; // exit(1); } Rprintf("l_text = %d\n", l_text); std::string s; int64_t bgzf_offset_before_header = bgzf_tell(fp); // the beginning of header block s.resize(l_text); if (bgzf_read(fp, (void*)s.data(), l_text) != l_text) { REprintf( "Read failed!\n"); } BCFHeader bcfHeader; if (bcfHeader.parseHeader(s, &bcfHeader.header_contig_id, &bcfHeader.header_id, &bcfHeader.header_number, &bcfHeader.header_type, &bcfHeader.header_description)) { REprintf( "Parse header failed!\n"); return -1; // exit(1); } // locate #CHROM line int64_t bgzf_offset_after_header = bgzf_tell(fp); // the end of header block size_t ptr_chrom_line = s.find("#CHROM"); // the index of "#CHROM", also the size between beginning of header to '#CHROM' if (ptr_chrom_line == std::string::npos) { REprintf( "Cannot find the \"#CHROM\" line!\n"); return -1; // exit(1); } Rprintf("offset_header = %d\n", (int) ptr_chrom_line); bgzf_seek(fp, bgzf_offset_before_header, SEEK_SET); // rewind fp to the beginning of header s.resize(ptr_chrom_line); int64_t before_chrom_size = bgzf_read(fp, (void*) s.data(), ptr_chrom_line); int64_t bgzf_offset_before_chrom = bgzf_tell(fp); // the offset to #CHROM s.resize(l_text - before_chrom_size); int64_t after_chrom_size = bgzf_read(fp, (void*) s.data(), l_text - before_chrom_size); // load sample 
names while (s.back() == '\n' || s.back() == '\0') { s.resize(s.size() - 1); } stringTokenize(s, "\t", &bcfHeader.sample_names); const int64_t num_sample = (int)bcfHeader.sample_names.size() - 9; // vcf header has 9 columns CHROM...FORMAT before actual sample names Rprintf("sample size = %ld\n", num_sample); Rprintf("last character is s[after_chrom_size-1] = %d\n", s[after_chrom_size - 1]); // should be 0, the null terminator character // quality check if (bgzf_offset_after_header != bgzf_tell(fp)) { REprintf( "Messed up bgzf header\n"); return -1; // exit(1); } // create index file FILE* fIndex = fopen(indexFile_.c_str(), "wb"); int64_t num_marker = 0; int64_t pos = 0; fwrite(&num_sample, sizeof(int64_t), 1, fIndex); fwrite(&num_marker, sizeof(int64_t), 1, fIndex); fwrite(&pos, sizeof(int64_t), 1, fIndex); fwrite(&bgzf_offset_before_chrom, sizeof(int64_t), 1, fIndex); uint32_t l_shared; uint32_t l_indiv; std::vector<char> data; int64_t offset; do { offset = bgzf_tell(fp); if (4 != bgzf_read(fp, &l_shared, sizeof(uint32_t))) { break; // REprintf( "Wrong read!\n"); exit(1); } if (4 != bgzf_read(fp, &l_indiv, sizeof(uint32_t))) { break; // REprintf( "Wrong read!\n"); exit(1); } data.resize(l_shared + l_indiv); if (l_shared + l_indiv != bgzf_read(fp, data.data(), (l_shared+l_indiv) * sizeof(char))) { break; // REprintf( "Wrong read!\n"); exit(1); } memcpy(&pos, data.data() + 4, 4); fwrite(&pos, sizeof(int64_t), 1, fIndex); fwrite(&offset, sizeof(int64_t), 1, fIndex); num_marker++; if (num_marker % 10000 == 0) { Rprintf("\rprocessed %ld markers", num_marker); } } while (true); if (fseek(fIndex, 0, SEEK_SET)) { REprintf( "fseek failed\n!"); } fwrite(&num_sample, sizeof(int64_t), 1, fIndex); fwrite(&num_marker, sizeof(int64_t), 1, fIndex); fclose(fIndex); Rprintf("Indexing finished with %ld samples and %ld markers\n", num_sample, num_marker); return 0; }
/*
 * Rewind callback: treats `data` as a BGZF handle and seeks it back to
 * offset 0. The result of the seek is not checked.
 */
void key_fastq_rewind(void *data) {
    bgzf_seek( (BGZF *) data, 0, SEEK_SET );
}
int main(int argc, char** argv) { const char* fBG = argv[1]; const char* fIndex = argv[2]; int64_t pos = strtol(argv[3], NULL, 0); // const int Nrecord = 10; // read everything MmapFile mmapFile; mmapFile.open(fIndex); size_t Nrecord = mmapFile.getFileSize() / 16 - 1; Record* r = (Record*)mmapFile.data; // FILE* fp = fopen(fIndex, "rb"); // if (Nrecord != fread(r, sizeof(Record), Nrecord, fp)) { // fprintf(stderr, "Read error!\n"); // } // binary search for file position int64_t offset = -1; Record query; query.pos = pos; // Comparator comparator; Record* lb = std::lower_bound(r, r + Nrecord, query, comparator); // r[lb].pos >= query.pos Record* ub = std::upper_bound(lb, r + Nrecord, query, comparator); // r[ub].pos > query.pos for (Record* pi = lb; pi != ub; ++pi) { printf("%ld %ld\n", pi->pos, pi->offset); offset = pi->offset; // (TODO) only store one virtual offset for now. break; } // int64_t offset = -1; // for (int i = 0; i < Nrecord; ++i) { // if (r[i].pos == pos) { // offset = r[i].offset; // break; // } // } if (offset < 0) { fprintf(stderr, "Cannot find position!\n"); } else { printf("found: %ld %ld\n", pos, offset); } BGZF* fp2 = bgzf_open(fBG, "rb"); if (bgzf_seek(fp2, offset, SEEK_SET)) { fprintf(stderr, "seek error!\n"); } kstring_t* str; str = (kstring_t*)calloc(1, sizeof(kstring_t)); kstring_t& s = *str; int ret = bgzf_getline(fp2, '\n', &s); if (ret <= 0) { fprintf(stderr, "getline error, ret = %d!\n", ret); } for (size_t i = 0; i < s.l; ++i) { if (i >= 50) break; printf("%c", s.s[i]); } printf("\n"); free(str); bgzf_close(fp2); // fclose(fp); return 0; }
/*
 * OCaml stub: seeks the BGZF handle wrapped in `bgzf` to the 64-bit offset
 * held in `pos`. Raises Failure "BGZF.seek" when bgzf_seek() returns a
 * non-zero value; otherwise returns unit.
 */
value caml_bgzf_seek(value bgzf,value pos)
{
    CAMLparam2(bgzf,pos);
    BGZF *handle = BGZF_val(bgzf);

    if(bgzf_seek(handle,Int64_val(pos),SEEK_SET) != 0) {
        caml_failwith("BGZF.seek");
    }

    CAMLreturn(Val_unit);
}