int64_t writeAll(std::vector<the_t> &thetas, char *chr,BGZF *fp){ fprintf(stderr,"\tWriting: chr:%s with nSites:%zu\n",chr,thetas.size()); int64_t retVal =bgzf_tell(fp); size_t clen=strlen(chr); bgzf_write(fp,&clen,sizeof(size_t));//write len of chr bgzf_write(fp,chr,clen);//write chr size_t vLen = thetas.size(); bgzf_write(fp,&vLen,sizeof(size_t));//write len of positions; int *posi = new int[thetas.size()]; static float **the = new float*[5]; for(int i=0;i<5;i++) the[i] = new float[thetas.size()]; for(size_t i=0;i<thetas.size();i++){ posi[i] =thetas[i].posi; for(int j=0;j<5;j++) the[j][i] = thetas[i].vals[j]; delete [] thetas[i].vals; } bgzf_write(fp,posi,sizeof(int)*thetas.size()); for(int j=0;j<5;j++){ bgzf_write(fp,the[j],sizeof(float)*thetas.size()); delete [] the[j]; } delete [] posi; fprintf(stderr,"\tDone writing: %s\n",chr); return retVal; }
void abcSmartCounts::changeChr(int newRefId){ if(doSmartCounts==0) return; // fprintf(stderr,"cur:%d new:%d\n",curChr,newRefId); if(curChr!=-1){ int64_t retVal =bgzf_tell(fbin); int clen = strlen(header->name[curChr]); bgzf_write(fbin,&clen,sizeof(int)); bgzf_write(fbin,header->name[curChr],clen); bgzf_write(fbin,&len,sizeof(int)); for(int i=0;i<4;i++) bgzf_write(fbin,counts[i],len);//write len of chr //write index stuff fprintf(stderr,"Writing index for chr: %s\n",header->name[curChr]); fwrite(&clen,sizeof(int),1,fidx); fwrite(header->name[curChr] ,sizeof(char),clen,fidx); fwrite(&len,sizeof(int),1,fidx); fwrite(&retVal,sizeof(int64_t),1,fidx); } curChr = newRefId; len = header->l_ref[curChr]; for(int i=0;i<4;i++){ delete [] counts[i]; counts[i] = new unsigned char[len]; memset(counts[i],0,len); } }
/*
 * Scan a BGZF stream for '@' markers and, for each one, store the
 * bgzf_tell() offset taken right after the marker into
 * table[cmph_search(hash, accession)], where the accession is whatever
 * read_one_line() returns for the text following the '@'.
 *
 * The scan stops at end of input, on a read error, or as soon as
 * read_one_line() returns something other than 1.
 *
 * table      - output array indexed by the value cmph_search() returns
 * hash       - hash used to map an accession to its slot in table
 * fastq_file - BGZF handle positioned where scanning should start
 *
 * NOTE(review): if read_one_line() allocates *accession, that buffer is never
 * released here - confirm ownership against read_one_line().
 */
void populate_index(uint64_t *table, cmph_t *hash, BGZF *fastq_file)
{
    for( ;; ) {
        /* Find @.  bgzf_getc() reports end-of-file / errors as negative ints,
         * so the result must stay an int: squeezing it into a char either
         * loses the sign (unsigned char targets never see EOF) or makes
         * bytes >= 0x80 look like EOF (signed char targets). */
        int c;
        do {
            c = bgzf_getc( fastq_file );
        } while( c != '@' && c >= 0 );
        if( c < 0 ) {
            break;  /* no further '@' in the stream */
        }

        /* bgzf_tell() yields a 64-bit virtual offset; keep the full width. */
        int64_t pos = bgzf_tell( fastq_file );
        if( pos == -1 ) {
            break;
        }

        char *accession = NULL;
        cmph_uint32 accession_length;
        if( read_one_line( &accession, &accession_length, fastq_file ) != 1 ) {
            break;
        }

        /* Remember where the text after '@' begins for this accession */
        unsigned int id = cmph_search( hash, accession, accession_length );
        table[ id ] = (uint64_t) pos;
    }
}
/**
 * Flush the chromosome that is still buffered, then release everything.
 *
 * Does nothing when doSmartCounts is 0. Otherwise the pending record (name
 * length, name, len, four count arrays) is written to fbin and its index
 * entry (name length, name, len, starting bgzf offset) to fidx, after which
 * the count buffers are freed and both files are closed.
 */
abcSmartCounts::~abcSmartCounts(){
  if(doSmartCounts==0)
    return;

  // changeChr() treats curChr==-1 as "nothing buffered yet"; apply the same
  // guard here so header->name is never indexed with -1 when the object is
  // destroyed before any chromosome was seen.
  if(curChr!=-1){
    int64_t retVal =bgzf_tell(fbin);
    int clen = strlen(header->name[curChr]);

    // data record
    bgzf_write(fbin,&clen,sizeof(int));
    bgzf_write(fbin,header->name[curChr],clen);
    bgzf_write(fbin,&len,sizeof(int));
    for(int i=0;i<4;i++)
      bgzf_write(fbin,counts[i],len);//write the counts for this chr

    //write index stuff
    fwrite(&clen,sizeof(int),1,fidx);
    fwrite(header->name[curChr],sizeof(char),clen,fidx);
    fwrite(&len,sizeof(int),1,fidx);
    fwrite(&retVal,sizeof(int64_t),1,fidx);
  }

  for(int i=0;i<4;i++)
    delete [] counts[i];
  delete [] counts;

  fclose(fidx);
  bgzf_close(fbin);
}
BamFilePrivate(const std::string& fn) : filename_(fn) , firstAlignmentOffset_(-1) { // ensure we've updated htslib verbosity with requested verbosity here hts_verbose = ( PacBio::BAM::HtslibVerbosity == -1 ? 0 : PacBio::BAM::HtslibVerbosity); // attempt open auto f = RawOpen(); #if !defined (PBBAM_NO_CHECK_EOF) || PBBAM_AUTOVALIDATE // sanity check on file const int eofCheck = bgzf_check_EOF(f->fp.bgzf); if (eofCheck <= 0 ) { // 1: EOF present & correct // 2: not seekable (e.g. reading from stdin) // 0: EOF absent // -1: some other error std::stringstream e; if (eofCheck == 0) e << fn << " : is missing EOF block" << std::endl; else e << fn << " : unknown error while checking EOF block" << std::endl; throw std::runtime_error(e.str()); } #endif // attempt fetch header std::unique_ptr<bam_hdr_t, internal::HtslibHeaderDeleter> hdr(sam_hdr_read(f.get())); header_ = internal::BamHeaderMemory::FromRawData(hdr.get()); // cache first alignment offset firstAlignmentOffset_ = bgzf_tell(f->fp.bgzf); }
uint64_t read_chunk(double **chunk_data, params *pars, uint64_t chunk) { uint64_t total_elems_read = 0; if(chunk >= pars->n_chunks) error("invalid chunk number!"); // Define chunk start and end positions uint64_t start_pos = chunk * pars->max_chunk_size; uint64_t end_pos = start_pos + pars->max_chunk_size - 1; if(end_pos >= pars->n_sites) end_pos = pars->n_sites - 1; uint64_t chunk_size = end_pos - start_pos + 1; if( pars->verbose >= 6 ) printf("\tReading chunk %lu from position %lu to %lu (%lu)\n", chunk+1, start_pos, end_pos, chunk_size); // Search start position #ifdef _USE_BGZF if( bgzf_seek(pars->in_glf_fh, pars->chunks_voffset[chunk], SEEK_SET) < 0 ) error("cannot seek GLF file (BGZF)!"); #endif // Read data from file for(uint64_t c = 0; c < chunk_size; c++) { #ifdef _USE_BGZF int bytes_read = bgzf_read(pars->in_glf_fh, chunk_data[c], (int) pars->n_ind * 3 * sizeof(double)); if(pars->call_geno) call_geno(chunk_data[c], pars->n_ind, 3); uint64_t elems_read = (uint64_t) bytes_read / sizeof(double); #else chunk_data[c] = pars->data[start_pos+c]; uint64_t elems_read = pars->n_ind * 3; #endif if( elems_read != pars->n_ind * 3 ) error("cannot read GLF file!"); total_elems_read += elems_read; } #ifdef _USE_BGZF // Update index for next chunk if( chunk+1 != pars->n_chunks && pars->chunks_voffset[chunk+1] == 0 ) pars->chunks_voffset[chunk+1] = bgzf_tell(pars->in_glf_fh); #endif return( total_elems_read/(pars->n_ind * 3) ); }
/// Open `filename` through BGZF using `mode`.
///
/// A filename of "-" selects stdout when the mode starts with 'w'/'W' and
/// stdin when it starts with 'r'/'R'; anything else goes through bgzf_open().
/// For read modes, when ourRequireEofBlock is set and bgzf_check_EOF()
/// returns 0, a message is printed to std::cerr and the file is closed again.
/// Otherwise the position reported by bgzf_tell() is stored in myStartPos.
BgzfFileType::BgzfFileType(const char * filename, const char * mode)
{
    const char modeChar = mode[0];
    const bool forWrite = (modeChar == 'w') || (modeChar == 'W');
    const bool forRead  = (modeChar == 'r') || (modeChar == 'R');

    // The name is only inspected for read/write modes.
    const bool useStdStream =
        (forWrite || forRead) && (strcmp(filename, "-") == 0);

    if (useStdStream)
    {
        // '-' means stdout when writing, stdin when reading.
        bgzfHandle = bgzf_fdopen(fileno(forWrite ? stdout : stdin), mode);
    }
    else
    {
        bgzfHandle = bgzf_open(filename, mode);
    }

    myStartPos = 0;

    if (bgzfHandle != NULL)
    {
        // Only readers that require the EOF block get it checked.
        const bool eofBlockMissing = forRead && ourRequireEofBlock &&
                                     (bgzf_check_EOF(bgzfHandle) == 0);
        if (eofBlockMissing)
        {
            std::cerr << "BGZF EOF marker is missing in " << filename << std::endl;
            // The block was required but is absent: give the handle back.
            close();
        }
        else
        {
            // File is usable; remember where its data starts.
            myStartPos = bgzf_tell(bgzfHandle);
        }
    }

    myEOF = false;
}
//return zero if fine. int writeDat(char *last,mmap &mm,tary<char> *keep,tary<char> *major,tary<char> *minor,BGZF *BFP,FILE *fp,int doCompl){ assert(last!=NULL); if((major!=NULL) ^ (minor!=NULL)){ fprintf(stderr,"major and minor should be the same\n"); return 1; } int hasMajMin =0; if(major!=NULL) hasMajMin =1; fprintf(stderr,"\t-> Writing chr:\'%s\' \n",last); mmap::iterator it=mm.find(last); if(it!=mm.end()){ return 1; }else mm[strdup(last)]=1; //write data and index stuff int64_t retVal =bgzf_tell(BFP);//now contains the offset to which we should point. //write chrname int clen=strlen(last)+1; fwrite(&clen,1,sizeof(int),fp); fwrite(last,clen,sizeof(char),fp); fwrite(&retVal,1,sizeof(int64_t),fp); for(int i=0;doCompl&&i<keep->l;i++) if(keep->d[i]==0) keep->d[i]=1; else keep->d[i]=0; fwrite(&keep->l,sizeof(size_t),1,fp);//write len of chr fwrite(&hasMajMin,1,sizeof(int),fp); aio::bgzf_write(BFP,keep->d,keep->l);//write keep if(hasMajMin){ aio::bgzf_write(BFP,major->d,major->l);//write maj aio::bgzf_write(BFP,minor->d,minor->l);//write min } return 0; }
int fst_index(int argc,char **argv){ if(argc<1){ fprintf(stderr,"Must supply afile.saf.idx [chrname, write more info]\n"); return 0; } args *arg = getArgs(argc,argv); if(!arg->fstout){ fprintf(stderr,"\t-> Must supply -fstout for doing fstindex\n"); return 0; } std::vector<persaf *> &saf =arg->saf; //assert(saf.size()==2); size_t nSites = arg->nSites; if(nSites == 0){//if no -nSites is specified nSites = 100000;//<- set default to 100k sites, no need to load everything... // nSites=nsites(saf,arg); } fprintf(stderr,"\t-> nSites: %lu\n",nSites); std::vector<Matrix<float> *> gls; for(int i=0;i<saf.size();i++) gls.push_back(alloc<float>(nSites,saf[i]->nChr+1)); // int ndim= parspace(saf); if(arg->sfsfname.size()!=choose(saf.size(),2)){ fprintf(stderr,"\t-> You have supplied: %lu populations, that is %d pairs\n",saf.size(),choose(saf.size(),2)); fprintf(stderr,"\t-> You therefore need to supply %d 2dsfs priors instead of:%lu\n",choose(saf.size(),2),arg->sfsfname.size()); exit(0); } std::vector<double *> sfs; int inc =0; for(int i=0;i<saf.size();i++) for(int j=i+1;j<saf.size();j++){ size_t pairdim = (saf[i]->nChr+1)*(saf[j]->nChr+1); double *ddd=new double[pairdim]; readSFS(arg->sfsfname[inc],pairdim,ddd); normalize(ddd,pairdim); sfs.push_back(ddd); inc++; } double **a1,**b1; a1=new double*[choose(saf.size(),2)]; b1=new double*[choose(saf.size(),2)]; inc=0; for(int i=0;i<saf.size();i++) for(int j=i+1;j<saf.size();j++){ calcCoef((int)saf[i]->nChr,(int)saf[j]->nChr,&a1[inc],&b1[inc]); // fprintf(stderr,"a1[%d]:%p b1[%d]:%p\n",inc,&a1[inc][0],inc,&b1[inc][0]); inc++; } BGZF *fstbg = openFileBG(arg->fstout,".fst.gz"); FILE *fstfp = openFile(arg->fstout,".fst.idx"); char buf[8]="fstv1"; bgzf_write(fstbg,buf,8); fwrite(buf,1,8,fstfp); #if 0 for(int i=0;i<ndim;i++) fprintf(stdout,"%f %f\n",a1[i],b1[i]); exit(0); #endif #if 1 size_t nsafs=saf.size(); fwrite(&nsafs,sizeof(size_t),1,fstfp); for(int i=0;i<nsafs;i++){ size_t clen= strlen(saf[i]->fname); 
fwrite(&clen,sizeof(size_t),1,fstfp); fwrite(saf[i]->fname,1,clen,fstfp); } #endif int asdf = choose(saf.size(),2); std::vector<double> *ares = new std::vector<double> [choose(saf.size(),2)]; std::vector<double> *bres = new std::vector<double> [choose(saf.size(),2)]; // for(int i=0;i<3;i++) // fprintf(stderr,"ares.size():%lu bres.size():%lu sfs:%p\n",ares[i].size(),bres[i].size(),&sfs[i][0]); std::vector<int> posi; setGloc(saf,nSites); int *posiToPrint = new int[nSites]; for(myMap::iterator it = saf[0]->mm.begin();it!=saf[0]->mm.end();++it) { // fprintf(stderr,"doing chr:%s\n",it->first); if(arg->chooseChr!=NULL){ it = saf[0]->mm.find(arg->chooseChr); if(it==saf[0]->mm.end()){ fprintf(stderr,"Problem finding chr: %s\n",arg->chooseChr); break; } } for(int i=0;i<choose(saf.size(),2);i++){ ares[i].clear(); bres[i].clear(); } posi.clear(); while(1) { int ret=readdata(saf,gls,nSites,it->first,arg->start,arg->stop,posiToPrint,NULL);//read nsites from data // fprintf(stderr,"ret:%d glsx:%lu\n",ret,gls[0]->x); //if(gls[0]->x!=nSites&&arg->chooseChr==NULL&&ret!=-3){ //fprintf(stderr,"continue continue\n"); // continue; //} fprintf(stderr,"\t-> Will now do fst temp dump using a chunk of %lu\n",gls[0]->x); int inc=0; for(int i=0;i<saf.size();i++) for(int j=i+1;j<saf.size();j++){ // fprintf(stderr,"i:%d j:%d inc:%d gls[i]:%p gls[j]:%p sfs:%p a1:%p b1:%p\n",i,j,inc,gls[i],gls[j],sfs[i],&a1[inc][0],&a1[inc][0]); block_coef(gls[i],gls[j],sfs[inc],a1[inc],b1[inc],ares[inc],bres[inc]); inc++; } for(int i=0;i<gls[0]->x;i++) posi.push_back(posiToPrint[i]); for(int i=0;i<gls.size();i++) gls[i]->x =0; if(ret==-2)//no more data in files or in chr, eith way we break; break; } size_t clen = strlen(it->first); fwrite(&clen,sizeof(size_t),1,fstfp); fwrite(it->first,1,clen,fstfp); size_t nit=posi.size(); assert(1==fwrite(&nit,sizeof(size_t),1,fstfp)); int64_t tell = bgzf_tell(fstbg); fwrite(&tell,sizeof(int64_t),1,fstfp); bgzf_write(fstbg,&posi[0],posi.size()*sizeof(int)); int inc =0; 
for(int i=0;i<saf.size();i++) for(int j=i+1;j<saf.size();j++){ bgzf_write(fstbg,&(ares[inc][0]),ares[inc].size()*sizeof(double)); bgzf_write(fstbg,&(bres[inc][0]),bres[inc].size()*sizeof(double)); inc++; } if(arg->chooseChr!=NULL) break; } delGloc(saf,nSites); destroy(gls,nSites); destroy_args(arg); for(int i=0;i<sfs.size();i++) delete [] sfs[i]; #if 0 fprintf(stderr,"\n\t-> NB NB output is no longer log probs of the frequency spectrum!\n"); fprintf(stderr,"\t-> Output is now simply the expected values! \n"); fprintf(stderr,"\t-> You can convert to the old format simply with log(norm(x))\n"); #endif bgzf_close(fstbg); fclose(fstfp); fprintf(stderr,"\t-> fst index finished with no errors!\n"); return 0; }
int main(int argc, char *argv[]) { if (argc <= 1) { fprintf(stderr, "Usage: thrash_threads4 input.bam\n"); exit(1); } // Find a valid seek location ~64M into the file int i; ssize_t got; BGZF *fpin = bgzf_open(argv[1], "r"); uint64_t upos = 0, uend = 0; char buf[100000]; for (i = 0; i < 100; i++) { if ((got = bgzf_read(fpin, buf, 65536)) < 0) abort(); upos += got; } int64_t pos = bgzf_tell(fpin); while ((got = bgzf_read(fpin, buf, 65536)) > 0) { uend += got; } if (got < 0) abort(); int64_t end = bgzf_tell(fpin); bgzf_close(fpin); // Ensure input is big enough to avoid case 3,4 below going off the end // of the file if (uend < upos + 10000000) { fprintf(stderr, "Please supply a bigger input file\n"); exit(1); } #define N 1000 // Spam random seeks & reads for (i = 0; i < 1000; i++) { printf("i=%d\t", i); fpin = bgzf_open(argv[1], "r"); int j, eof = 0, mt = 0; for (j = 0; j < 80; j++) { int n = rand() % 7; putchar('0'+n); fflush(stdout); switch (n) { case 0: // start if (bgzf_seek(fpin, 0LL, SEEK_SET) < 0) puts("!");//abort(); eof = 0; break; case 1: // mid if (bgzf_seek(fpin, pos, SEEK_SET) < 0) puts("!");//abort(); eof = 0; break; case 2: // eof if (bgzf_seek(fpin, end, SEEK_SET) < 0) puts("!");//abort(); eof = 1; break; case 3: case 4: { int l = rand()%(n==3?100000:100); if (bgzf_read(fpin, buf, l) != l*(1-eof)) abort(); break; } case 5: usleep(N); break; case 6: if (!mt) bgzf_mt(fpin, 8, 256); mt = 1; break; } } printf("\n"); if (bgzf_close(fpin)) abort(); } return 0; }
/* OCaml stub: returns bgzf_tell() of the wrapped BGZF handle as a boxed int64. */
value caml_bgzf_tell(value bgzf)
{
  CAMLparam1(bgzf);
  /* No allocation happens between boxing the offset and returning it. */
  value boxed_offset = copy_int64(bgzf_tell(BGZF_val(bgzf)));
  CAMLreturn(boxed_offset);
}
/** * Create single chromosome index file * the file content is a 2-column matrix of int64_t type * line1: num_sample num_marker * line2: 0 bgzf_offset_for_#CHROM_line * line3: var_1_pos bgzf_offset_for_var_1 * ... */ int SingleChromosomeBCFIndex::createIndex() { // const char* fn = bcfFile_.c_str(); BGZF* fp = fBcfFile_; // bgzf_open(fn, "rb"); bgzf_seek(fp, 0, SEEK_SET); // check magic number char magic[5]; if (5 != bgzf_read(fp, magic, 5)) { return -1; // exit(1); } if (!(magic[0] == 'B' && magic[1] == 'C' && magic[2] == 'F' && magic[3] == 2 && (magic[4] == 1 || magic[4] == 2))) { return -1; // exit(1); } // read header uint32_t l_text; if (4 != bgzf_read(fp, &l_text, 4)) { return -1; // exit(1); } Rprintf("l_text = %d\n", l_text); std::string s; int64_t bgzf_offset_before_header = bgzf_tell(fp); // the beginning of header block s.resize(l_text); if (bgzf_read(fp, (void*)s.data(), l_text) != l_text) { REprintf( "Read failed!\n"); } BCFHeader bcfHeader; if (bcfHeader.parseHeader(s, &bcfHeader.header_contig_id, &bcfHeader.header_id, &bcfHeader.header_number, &bcfHeader.header_type, &bcfHeader.header_description)) { REprintf( "Parse header failed!\n"); return -1; // exit(1); } // locate #CHROM line int64_t bgzf_offset_after_header = bgzf_tell(fp); // the end of header block size_t ptr_chrom_line = s.find("#CHROM"); // the index of "#CHROM", also the size between beginning of header to '#CHROM' if (ptr_chrom_line == std::string::npos) { REprintf( "Cannot find the \"#CHROM\" line!\n"); return -1; // exit(1); } Rprintf("offset_header = %d\n", (int) ptr_chrom_line); bgzf_seek(fp, bgzf_offset_before_header, SEEK_SET); // rewind fp to the beginning of header s.resize(ptr_chrom_line); int64_t before_chrom_size = bgzf_read(fp, (void*) s.data(), ptr_chrom_line); int64_t bgzf_offset_before_chrom = bgzf_tell(fp); // the offset to #CHROM s.resize(l_text - before_chrom_size); int64_t after_chrom_size = bgzf_read(fp, (void*) s.data(), l_text - before_chrom_size); // load sample 
names while (s.back() == '\n' || s.back() == '\0') { s.resize(s.size() - 1); } stringTokenize(s, "\t", &bcfHeader.sample_names); const int64_t num_sample = (int)bcfHeader.sample_names.size() - 9; // vcf header has 9 columns CHROM...FORMAT before actual sample names Rprintf("sample size = %ld\n", num_sample); Rprintf("last character is s[after_chrom_size-1] = %d\n", s[after_chrom_size - 1]); // should be 0, the null terminator character // quality check if (bgzf_offset_after_header != bgzf_tell(fp)) { REprintf( "Messed up bgzf header\n"); return -1; // exit(1); } // create index file FILE* fIndex = fopen(indexFile_.c_str(), "wb"); int64_t num_marker = 0; int64_t pos = 0; fwrite(&num_sample, sizeof(int64_t), 1, fIndex); fwrite(&num_marker, sizeof(int64_t), 1, fIndex); fwrite(&pos, sizeof(int64_t), 1, fIndex); fwrite(&bgzf_offset_before_chrom, sizeof(int64_t), 1, fIndex); uint32_t l_shared; uint32_t l_indiv; std::vector<char> data; int64_t offset; do { offset = bgzf_tell(fp); if (4 != bgzf_read(fp, &l_shared, sizeof(uint32_t))) { break; // REprintf( "Wrong read!\n"); exit(1); } if (4 != bgzf_read(fp, &l_indiv, sizeof(uint32_t))) { break; // REprintf( "Wrong read!\n"); exit(1); } data.resize(l_shared + l_indiv); if (l_shared + l_indiv != bgzf_read(fp, data.data(), (l_shared+l_indiv) * sizeof(char))) { break; // REprintf( "Wrong read!\n"); exit(1); } memcpy(&pos, data.data() + 4, 4); fwrite(&pos, sizeof(int64_t), 1, fIndex); fwrite(&offset, sizeof(int64_t), 1, fIndex); num_marker++; if (num_marker % 10000 == 0) { Rprintf("\rprocessed %ld markers", num_marker); } } while (true); if (fseek(fIndex, 0, SEEK_SET)) { REprintf( "fseek failed\n!"); } fwrite(&num_sample, sizeof(int64_t), 1, fIndex); fwrite(&num_marker, sizeof(int64_t), 1, fIndex); fclose(fIndex); Rprintf("Indexing finished with %ld samples and %ld markers\n", num_sample, num_marker); return 0; }