/*
  Destructor: frees the temporary-directory string, tears down the
  likelihood model selected by GL, closes the output streams and releases
  the scratch buffers owned by this object.
*/
abcGL::~abcGL(){
  free(angsd_tmpdir);

  // With neither a GL model nor GLF output active the rest is skipped.
  if(GL==0&&doGlf==0)
    return;
  else if(GL==1)
    bam_likes_destroy();
  else if(GL==2)
    gatk_destroy();
  else if(GL==4)
    abcError::killGlobalErrorProbs(errorProbs);
  else if(GL==5)
    phys_destroy();

  if(doGlf)
    bgzf_close(gzoutfile);
  // The second stream must be guarded by its own pointer: testing gzoutfile
  // here would hand a possibly-NULL gzoutfile2 to bgzf_close.
  if(gzoutfile2!=NULL)
    bgzf_close(gzoutfile2);

  if(bufstr.s!=NULL)
    free(bufstr.s);

  if(errors){
    for(int i=0;i<4;i++)
      delete [] errors[i];
    delete [] errors;
  }
  delete [] logfactorial;
}
/*
  Destructor: closes the count streams, writes the quality-score and depth
  summaries that are produced at shutdown, and frees the buffers owned by
  this object.
*/
abcCounts::~abcCounts(){
  if(oFileCountsBin!=NULL)
    bgzf_close(oFileCountsBin);
  if(oFileCountsPos!=NULL)
    bgzf_close(oFileCountsPos);

  // Quality-score distribution: header line followed by printQs output.
  if(doQsDist){
    FILE *oFileQs = NULL;
    oFileQs = aio::openFile(oFiles,postfix3);
    fprintf(oFileQs,"qscore\tcounts\n");
    printQs(oFileQs,qsDist);
    if(oFileQs)
      fclose(oFileQs);
    delete[] qsDist;
  }

  // Depth tables: one row per individual, then a single global row.
  if(doDepth){
    FILE *oFileSamplDepth = aio::openFile(oFiles,postfix4);
    FILE *oFileGlobDepth = aio::openFile(oFiles,postfix5);
    for(int i=0;i<nInd;i++){
      for(int j=0;j<maxDepth+1;j++){
        fprintf(oFileSamplDepth,"%lu\t",depthCount[i][j]);
      }
      fprintf(oFileSamplDepth,"\n");
    }
    //thorfinn
    for(int j=0;j<maxDepth+1;j++)
      fprintf(oFileGlobDepth,"%lu\t",globCount[j]);
    fprintf(oFileGlobDepth,"\n");

    //clean depthCount
    for(int i=0;i<nInd;i++)
      delete[] depthCount[i];
    delete[] depthCount;

    if(oFileSamplDepth)
      fclose(oFileSamplDepth);
    // Each handle is guarded by its own pointer (the global-depth close used
    // to be conditioned on the per-sample handle).
    if(oFileGlobDepth)
      fclose(oFileGlobDepth);
  }

  if(minQfile!=NULL){
    //  angsd::printMatrix(minQmat,stderr);
    angsd::deleteMatrix(minQmat);
  }

  if(oFileIcounts!=NULL)
    bgzf_close(oFileIcounts);
  free(oFiles);
  free(bpos.s);
  free(bbin.s);
  free(bufstr.s);
  if(globCount)
    delete [] globCount;
}
// Releases everything reachable from a perpsmc handle, then the handle itself.
// pp must be a valid pointer; it is dereferenced unconditionally.
void perpsmc_destroy(perpsmc *pp){
  // Close both compressed input streams first.
  bgzf_close(pp->bgzf_gls);
  bgzf_close(pp->bgzf_pos);

  destroy(pp->mm);

  // delete[] accepts a null pointer, so no explicit guard is required.
  delete [] pp->pos;
  delete [] pp->gls;

  free(pp->fname);
  delete pp;
}
// Destructor: closes the compressed output stream, but only when haplotype
// calling was enabled and the stream was actually opened.
abcHaploCall::~abcHaploCall(){
  const bool enabled = (doHaploCall!=0);
  if(enabled && outfileZ!=NULL)
    bgzf_close(outfileZ);
}
/*
 * Copies the BAM stream `in` to file descriptor `fd`, writing `h` in place
 * of the header found in the input.
 *
 * in: input handle; must have been opened for reading.
 * h:  replacement header.
 * fd: descriptor the new BAM is written to.
 *
 * Returns 0 on success and -1 on failure (input not readable, allocation
 * failure, unreadable header, output cannot be opened, short write).
 */
int bam_reheader(BGZF *in, const bam_header_t *h, int fd)
{
	BGZF *fp;
	bam_header_t *old;
	int len, status = 0;
	uint8_t *buf;

	if (in->open_mode != 'r') return -1;

	buf = malloc(BUF_SIZE);
	if (buf == NULL) return -1;

	/* Read past the existing header; only what follows it is copied. */
	old = bam_header_read(in);
	if (old == NULL) {
		free(buf);
		return -1;
	}
	/* NOTE(review): `old` is never released in this function; it presumably
	 * needs the header destructor from the BAM library -- confirm and add. */

	fp = bgzf_dopen(fd, "w");
	if (fp == NULL) {
		free(buf);
		return -1;
	}

	bam_header_write(fp, h);
	/* Re-emit whatever is left of the block that was already decompressed
	 * while reading the header. */
	if (in->block_offset < in->block_length) {
		bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset);
		bgzf_flush(fp);
	}

	/* The remaining compressed blocks are copied byte-for-byte. */
#ifdef _USE_KNETFILE
	while ((len = knet_read(in->fp, buf, BUF_SIZE)) > 0) {
		if (fwrite(buf, 1, len, fp->fp) != (size_t)len) { status = -1; break; }
	}
#else
	while (!feof(in->fp) && (len = fread(buf, 1, BUF_SIZE, in->fp)) > 0) {
		if (fwrite(buf, 1, len, fp->fp) != (size_t)len) { status = -1; break; }
	}
#endif

	free(buf);
	/* Reset the offsets before closing, as the original code did. */
	fp->block_offset = in->block_offset = 0;
	bgzf_close(fp);
	return status;
}
// Tears down a perfst handle: closes its stream, destroys the associated
// map, frees every stored name and finally deletes the handle.
void perfst_destroy(perfst *pp){
  bgzf_close(pp->fp);
  destroy(pp->mm);

  // Names are released with free(), one entry at a time.
  size_t idx = 0;
  while(idx < pp->names.size()){
    free(pp->names[idx]);
    ++idx;
  }

  delete pp;
}
/*
 * Builds the index for the FASTA file `fn` and writes it to "<fn>.fai".
 * When the input is compressed, the block index is also dumped next to it
 * with a ".gzi" suffix.
 *
 * Returns 0 on success, -1 if the input cannot be opened, the index cannot
 * be built, memory runs out, or the output file cannot be written.
 */
int fai_build(const char *fn)
{
	char *str;
	BGZF *bgzf;
	FILE *fp;
	faidx_t *fai;

	/* Room for fn + ".fai" + NUL. */
	str = (char*)calloc(strlen(fn) + 5, 1);
	if ( !str ) {
		fprintf(stderr, "[fai_build] out of memory\n");
		return -1;
	}
	sprintf(str, "%s.fai", fn);

	bgzf = bgzf_open(fn, "r");
	if ( !bgzf ) {
		fprintf(stderr, "[fai_build] fail to open the FASTA file %s\n",fn);
		free(str);
		return -1;
	}
	if ( bgzf->is_compressed ) bgzf_index_build_init(bgzf);

	fai = fai_build_core(bgzf);
	/* Stop here on a null index: it would otherwise be passed on to
	 * fai_save and fai_destroy below. */
	if ( !fai ) {
		bgzf_close(bgzf);
		free(str);
		return -1;
	}

	if ( bgzf->is_compressed ) bgzf_index_dump(bgzf, fn, ".gzi");
	bgzf_close(bgzf);

	fp = fopen(str, "wb");
	if ( !fp ) {
		fprintf(stderr, "[fai_build] fail to write FASTA index %s\n",str);
		fai_destroy(fai);
		free(str);
		return -1;
	}
	fai_save(fai, fp);
	fclose(fp);
	free(str);
	fai_destroy(fai);
	return 0;
}
// Closes the BCF data file (if one is open) and then the index.
// The file pointer is reset so a repeated call does not close it twice.
void SingleChromosomeBCFIndex::close() {
  if (fBcfFile_ != NULL) {
    bgzf_close(fBcfFile_);
    fBcfFile_ = NULL;
  }

  closeIndex();
}
int main_reheader(int argc, char *argv[]) { bam_header_t *h; BGZF *in; if (argc != 3) { fprintf(stderr, "Usage: samtools reheader <in.header.sam> <in.bam>\n"); return 1; } { // read the header tamFile fph = sam_open(argv[1]); if (fph == 0) { fprintf(stderr, "[%s] fail to read the header from %s.\n", __func__, argv[1]); return 1; } h = sam_header_read(fph); sam_close(fph); } in = strcmp(argv[2], "-")? bam_open(argv[2], "r") : bam_dopen(fileno(stdin), "r"); if (in == 0) { fprintf(stderr, "[%s] fail to open file %s.\n", __func__, argv[2]); return 1; } bam_reheader(in, h, fileno(stdout)); bgzf_close(in); return 0; }
/*
 * Checks whether the file `fn` starts with the 10-byte magic sequence
 * compared below.
 *
 * Returns 1 when the first 10 bytes match, 0 when they do not, and -1 when
 * the file cannot be opened or fewer than 10 bytes could be read.
 */
int bgzf_check_bgzf(const char *fn)
{
	BGZF *fp;
	uint8_t buf[10];
	/* Let the compiler size the array: the literal carries a trailing NUL,
	 * which does not fit an explicit [10] when this is compiled as C++. */
	static const uint8_t magic[] = "\037\213\010\4\0\0\0\0\0\377";
	int n;

	if ((fp = bgzf_open(fn, "r")) == 0) {
		fprintf(stderr, "[bgzf_check_bgzf] failed to open the file: %s\n",fn);
		return -1;
	}

	/* Read the raw leading bytes directly from the underlying file. */
#ifdef _USE_KNETFILE
	n = knet_read(fp->x.fpr, buf, 10);
#else
	n = fread(buf, 1, 10, fp->file);
#endif
	bgzf_close(fp);

	if ( n!=10 ) return -1;
	if ( !memcmp(magic, buf, 10) ) return 1;
	return 0;
}
// Destructor: when smart counts are enabled, writes the record for the
// current chromosome to the binary stream, appends the matching entry to
// the index file, frees the count arrays and closes both files.
abcSmartCounts::~abcSmartCounts(){
  if(doSmartCounts==0)
    return;

  // Offset of the record about to be written; stored in the index below.
  int64_t recordOffset = bgzf_tell(fbin);
  int nameLen = strlen(header->name[curChr]);

  // Binary record: name length, name, len, then the four count arrays.
  bgzf_write(fbin,&nameLen,sizeof(int));
  bgzf_write(fbin,header->name[curChr],nameLen);
  bgzf_write(fbin,&len,sizeof(int));
  for(int b=0;b<4;b++)
    bgzf_write(fbin,counts[b],len);//write len of chr

  //write index stuff
  fwrite(&nameLen,sizeof(int),1,fidx);
  fwrite(header->name[curChr],sizeof(char),nameLen,fidx);
  fwrite(&len,sizeof(int),1,fidx);
  fwrite(&recordOffset,sizeof(int64_t),1,fidx);

  for(int b=0;b<4;b++)
    delete [] counts[b];
  delete [] counts;

  fclose(fidx);
  bgzf_close(fbin);
}
/*
 * Frees a FASTA index: every stored name, the name array, the hash table,
 * the attached stream (if any) and the index structure itself.
 * Passing NULL is allowed and does nothing.
 */
void fai_destroy(faidx_t *fai)
{
	int i;
	/* Tolerate a null index so error paths can call this unconditionally. */
	if (fai == NULL) return;
	for (i = 0; i < fai->n; ++i) free(fai->name[i]);
	free(fai->name);
	kh_destroy(s, fai->hash);
	if (fai->bgzf) bgzf_close(fai->bgzf);
	free(fai);
}
// Destructor: does nothing when HWE analysis is disabled; otherwise closes
// the output stream (for positive doHWE, if opened) and deletes chisq.
abcHWE::~abcHWE(){
  if(doHWE==0)
    return;

  if(doHWE>0 && outfileZ!=NULL)
    bgzf_close(outfileZ);

  delete chisq;
}
void BAMbinSortByCoordinate(uint32 iBin, uint binN, uint binS, uint nThreads, string dirBAMsort, Parameters *P) { if (binS==0) return; //nothing to do for empty bins //allocate arrays char *bamIn=new char[binS]; uint *startPos=new uint[binN*3]; uint bamInBytes=0; //load all aligns for (uint it=0; it<nThreads; it++) { string bamInFile=dirBAMsort+to_string(it)+"/"+to_string((uint) iBin); ifstream bamInStream (bamInFile.c_str()); bamInStream.read(bamIn+bamInBytes,binS);//read the whole file bamInBytes += bamInStream.gcount(); bamInStream.close(); remove(bamInFile.c_str()); }; if (bamInBytes!=binS) { ostringstream errOut; errOut << "EXITING because of FATAL ERROR: number of bytes expected from the BAM bin does not agree with the actual size on disk: "; errOut << binS <<" "<< bamInBytes <<" "<< iBin <<"\n"; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, 1, *P); }; //extract coordinates for (uint ib=0,ia=0;ia<binN;ia++) { uint32 *bamIn32=(uint32*) (bamIn+ib); startPos[ia*3] =( ((uint) bamIn32[1]) << 32) | ( (uint)bamIn32[2] ); startPos[ia*3+2]=ib; ib+=bamIn32[0]+sizeof(uint32);//note that size of the BAM record does not include the size record itself startPos[ia*3+1]=*( (uint*) (bamIn+ib) ); //read order ib+=sizeof(uint); }; //sort qsort((void*) startPos, binN, sizeof(uint)*3, funCompareUint2); BGZF *bgzfBin; bgzfBin=bgzf_open((dirBAMsort+"/b"+to_string((uint) iBin)).c_str(),("w"+to_string((long long) P->outBAMcompression)).c_str()); outBAMwriteHeader(bgzfBin,P->samHeaderSortedCoord,P->chrName,P->chrLength); //send ordered aligns to bgzf one-by-one for (uint ia=0;ia<binN;ia++) { char* ib=bamIn+startPos[ia*3+2]; bgzf_write(bgzfBin,ib, *((uint32*) ib)+sizeof(uint32) ); }; bgzf_flush(bgzfBin); bgzf_close(bgzfBin); //release memory delete [] bamIn; delete [] startPos; };
/*
 * Releases the resources held by an index handle: the hash, the mapped
 * lookup table, the hash file, the FASTQ stream and the lookup descriptor.
 * A NULL handle is ignored. The handle structure itself is not freed here.
 */
void ifq_destroy_index(ifq_index_t *index)
{
    if( index == NULL )
        return;

    cmph_destroy( index->hash );
    munmap( index->table, index->lookup_size );
    fclose( index->hash_file );
    bgzf_close( index->fastq_file );
    close( index->lookup_fd );
}
/*
 * Destroys a reader. When r->compress is 0 the BGZF handle is closed and a
 * close failure terminates the process; the reader is then freed.
 * A NULL reader is ignored.
 */
void reader_destroy(reader_t *r)
{
    if(NULL == r) return;

    if(0 == r->compress && bgzf_close(r->fp_bgzf) < 0) {
        fprintf(stderr, "reader bgzf_close: bug encountered\n");
        exit(1);
    }

    free(r);
}
// Frees a filt object: the keys stored in its offset map, both file
// handles, the three per-site arrays and finally the object itself.
void dalloc(filt *f){
  // Map keys are released with free() before the map is emptied.
  pMap::iterator cur = f->offs.begin();
  while(cur != f->offs.end()){
    free(cur->first);
    ++cur;
  }
  f->offs.clear();

  bgzf_close(f->bg);
  fclose(f->fp);

  free(f->keeps);
  free(f->major);
  free(f->minor);

  delete f;
}
// Writes the variant list as a BGZF-compressed VCF to descriptor `out`.
//
// headerPtr   - header used both for the optional header text and to
//               render each variant line
// printHeader - when true, the header text is written before the variants
// out         - file descriptor handed to bgzf_dopen
//
// NOTE(review): the result of bgzf_dopen is not checked here; confirm how
// callers expect an open failure to be reported.
void VariantList::printToCompressedVCF(IHeader::SharedPtr headerPtr, bool printHeader, int out)
{
	BGZF* fp = bgzf_dopen(out, "w");
	if (printHeader)
	{
		// Build the header text once and reuse it for pointer and length.
		const auto& headerText = headerPtr->getHeader();
		bgzf_write(fp, headerText.c_str(), headerText.size());
	}
	// Iterate by reference so no pointer copy is made per variant.
	for(const auto& variantPtr : this->m_variant_ptrs)
	{
		// Render each line once rather than once for the data and once for the size.
		const auto& line = variantPtr->getVariantLine(headerPtr);
		bgzf_write(fp, line.c_str(), line.size());
	}
	bgzf_close(fp);
}
/*
 * Stress test: opens the given file, enables multi-threaded BGZF and closes
 * it again, 10000 times in a row.
 * Exits with status 1 on a usage error or when the file cannot be opened,
 * and aborts when a close fails.
 */
int main(int argc, char *argv[]) {
    if (argc <= 1) {
        fprintf(stderr, "Usage: thrash_threads1 input.bam\n");
        exit(1);
    }

    int i;
    for (i = 0; i < 10000; i++) {
        printf("i=%d\n", i);
        BGZF *fpin  = bgzf_open(argv[1], "r");
        /* Do not pass a null handle on to bgzf_mt. */
        if (fpin == NULL) {
            fprintf(stderr, "Failed to open %s\n", argv[1]);
            exit(1);
        }
        bgzf_mt(fpin, 2, 256);
        if (bgzf_close(fpin) < 0) abort();
    }
    return 0;
}
// Destructor: optionally logs entry, then (when association testing is
// enabled) closes every open output stream and frees the matrices.
abcAsso::~abcAsso(){
  if(doPrint)
    fprintf(stderr,"staring [%s]\t[%s]\n",__FILE__,__FUNCTION__);

  if(doAsso==0)
    return;

  // One output stream per column of ymat; unopened slots are skipped.
  int outIdx=0;
  while(outIdx<ymat.y){
    if(multiOutfile[outIdx]!=NULL)
      bgzf_close(multiOutfile[outIdx]);
    outIdx++;
  }
  delete [] multiOutfile;

  // The covariate matrix is only released when a covariate file was given.
  if(covfile!=NULL)
    angsd::deleteMatrix(covmat);
  angsd::deleteMatrix(ymat);
}
/*
 * Reads a file and outputs a new BAM file to fd with 'h' replaced as
 * the header.    No checks are made to the validity.
 *
 * in:       input BAM handle; must not be open for writing.
 * h:        replacement header. When add_PG is set its text is replaced by
 *           a copy that carries an additional @PG line.
 * fd:       descriptor the output is written to.
 * arg_list: command line recorded in the @PG line, or NULL to omit it.
 * add_PG:   non-zero to add the @PG line.
 *
 * Returns 0 on success, -1 on failure. Every failure path releases the copy
 * buffer, the parsed header and the output handle.
 *
 * NOTE(review): the header returned by bam_hdr_read is discarded without
 * being released -- confirm whether it should be destroyed here.
 */
int bam_reheader(BGZF *in, bam_hdr_t *h, int fd,
                 const char *arg_list, int add_PG)
{
    BGZF *fp = NULL;
    SAM_hdr *sh = NULL;
    ssize_t len;
    uint8_t *buf;

    if (in->is_write) return -1;
    buf = malloc(BUF_SIZE);
    if (buf == NULL) {
        fprintf(stderr, "Out of memory\n");
        return -1;
    }
    if (bam_hdr_read(in) == NULL) {
        fprintf(stderr, "Couldn't read header\n");
        goto fail;
    }
    fp = bgzf_fdopen(fd, "w");
    if (fp == NULL) {
        fprintf(stderr, "Couldn't open output file\n");
        goto fail;
    }

    if (add_PG) {
        // Around the houses, but it'll do until we can manipulate bam_hdr_t natively.
        sh = sam_hdr_parse_(h->text, h->l_text);
        if (sh == NULL)
            goto fail;
        if (sam_hdr_add_PG(sh, "samtools",
                           "VN", samtools_version(),
                           arg_list ? "CL": NULL,
                           arg_list ? arg_list : NULL,
                           NULL) != 0)
            goto fail;

        free(h->text);
        h->text = strdup(sam_hdr_str(sh));
        h->l_text = sam_hdr_length(sh);
        if (!h->text)
            goto fail;
        sam_hdr_free(sh);
        sh = NULL; /* already released; keep the fail path from freeing it again */
    }

    bam_hdr_write(fp, h);
    /* Re-emit whatever is left of the block decompressed while reading the header. */
    if (in->block_offset < in->block_length) {
        bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset);
        bgzf_flush(fp);
    }
    /* The remaining compressed data is copied without recompression. */
    while ((len = bgzf_raw_read(in, buf, BUF_SIZE)) > 0) {
        if (bgzf_raw_write(fp, buf, len) < 0)
            goto fail;
    }
    if (len < 0)
        goto fail;

    free(buf);
    fp->block_offset = in->block_offset = 0;
    if (bgzf_close(fp) < 0)
        return -1;
    return 0;

 fail:
    if (sh) sam_hdr_free(sh);
    if (fp) bgzf_close(fp);
    free(buf);
    return -1;
}
int main_getalt(int argc, char *argv[]) { int c; char *fn; BGZF *fp; bcf1_t *b; bcf_hdr_t *h; kstring_t s = {0,0,0}; while ((c = getopt(argc, argv, "")) >= 0) { } if (argc - optind == 0) { fprintf(stderr, "Usage: bgt getalt <bgt-base>\n"); return 1; } fn = (char*)calloc(strlen(argv[optind]) + 5, 1); sprintf(fn, "%s.bcf", argv[optind]); fp = bgzf_open(fn, "r"); free(fn); assert(fp); h = bcf_hdr_read(fp); b = bcf_init1(); while (bcf_read1(fp, b) >= 0) { char *ref, *alt; int l_ref, l_alt, i, min_l; bcf_get_ref_alt1(b, &l_ref, &ref, &l_alt, &alt); min_l = l_ref < l_alt? l_ref : l_alt; for (i = 0; i < min_l && ref[i] == alt[i]; ++i); s.l = 0; kputs(h->id[BCF_DT_CTG][b->rid].key, &s); kputc(':', &s); kputw(b->pos + 1 + i, &s); kputc(':', &s); kputw(b->rlen - i, &s); kputc(':', &s); kputsn(alt + i, l_alt - i, &s); puts(s.s); } bcf_destroy1(b); bcf_hdr_destroy(h); bgzf_close(fp); free(s.s); return 0; }
/*
 * Destroys a writer: closes the plain file when w->compress is 0 and the
 * BGZF handle otherwise, terminating the process if the close fails, then
 * destroys the local block pool and frees the writer.
 * A NULL writer is ignored.
 */
void writer_destroy(writer_t *w)
{
    int close_status;

    if(NULL == w) return;

    /* Exactly one of the two handles is closed, depending on the mode. */
    if(0 == w->compress) {
        close_status = fclose(w->fp_file);
    }
    else {
        close_status = bgzf_close(w->fp_bgzf);
    }
    if(close_status < 0) {
        fprintf(stderr, "writer bzf_close: bug encountered\n");
        exit(1);
    }
    // TODO

    block_pool_destroy(w->pool_local);
    free(w);
}
/*
 * Copies the BAM stream `in` to file descriptor `fd`, writing `h` in place
 * of the header found in the input.
 *
 * in: input handle; must not be open for writing.
 * h:  replacement header.
 * fd: descriptor the new BAM is written to.
 *
 * Returns 0 on success and -1 on failure (input is a write handle,
 * allocation failure, unreadable header, output cannot be opened, raw
 * read or write error).
 */
int bam_reheader(BGZF *in, const bam_header_t *h, int fd)
{
	BGZF *fp;
	bam_header_t *old;
	ssize_t len;
	int status = 0;
	uint8_t *buf;

	if (in->is_write) return -1;

	buf = malloc(BUF_SIZE);
	if (buf == NULL) return -1;

	/* Read past the existing header; only what follows it is copied. */
	old = bam_header_read(in);
	if (old == NULL) {
		free(buf);
		return -1;
	}
	/* NOTE(review): `old` is never released in this function; it presumably
	 * needs the header destructor from the BAM library -- confirm and add. */

	fp = bgzf_fdopen(fd, "w");
	if (fp == NULL) {
		free(buf);
		return -1;
	}

	bam_header_write(fp, h);
	/* Re-emit whatever is left of the block that was already decompressed
	 * while reading the header. */
	if (in->block_offset < in->block_length) {
		bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset);
		bgzf_flush(fp);
	}

	/* The remaining compressed data is copied without recompression. */
	while ((len = bgzf_raw_read(in, buf, BUF_SIZE)) > 0) {
		if (bgzf_raw_write(fp, buf, len) < 0) { status = -1; break; }
	}
	if (len < 0) status = -1;

	free(buf);
	/* Reset the offsets before closing, as the original code did. */
	fp->block_offset = in->block_offset = 0;
	bgzf_close(fp);
	return status;
}
/*
 * Tests whether the file `fn` begins with the 10-byte magic sequence
 * compared below.
 *
 * Returns 1 on a match, 0 on a mismatch, and -1 if the file cannot be
 * opened or fewer than 10 bytes were read.
 */
int bgzf_check_bgzf(const char *fn)
{
    unsigned char magic[]="\037\213\010\4\0\0\0\0\0\377";
    unsigned char buf[10];
    int n;
    BGZF *fp = bgzf_open(fn, "r");

    if (fp == 0)
    {
        fprintf(stderr, "[bgzf_check_bgzf] failed to open the file: %s\n",fn);
        return -1;
    }

    /* Read the leading bytes straight from the underlying file. */
    n = fread(buf, 1, 10, fp->file);
    bgzf_close(fp);

    if (n != 10)
        return -1;

    return memcmp(magic, buf, 10) == 0 ? 1 : 0;
}
int main (int argc, char **argv) { ///////////////////// // Parse Arguments // ///////////////////// params *pars = new params; init_pars(pars); parse_cmd_args(argc, argv, pars); if( pars->version ) { printf("ngsF v%s\nCompiled on %s @ %s", version, __DATE__, __TIME__); #ifdef _USE_BGZF printf(" (BGZF library)\n"); #else printf(" (STD library)\n"); #endif exit(0); } if( pars->verbose >= 1 ) { printf("==> Input Arguments:\n"); printf("\tglf file: %s\n\tinit_values: %s\n\tfreq_fixed: %s\n\tout file: %s\n\tn_ind: %d\n\tn_sites: %lu\n\tchunk_size: %lu\n\tfast_lkl: %s\n\tapprox_EM: %s\n\tcall_geno: %s\n\tmax_iters: %d\n\tmin_epsilon: %.10f\n\tn_threads: %d\n\tseed: %lu\n\tquick: %s\n\tversion: %s\n\tverbose: %d\n\n", pars->in_glf, pars->init_values, pars->freq_fixed ? "true":"false", pars->out_file, pars->n_ind, pars->n_sites, pars->max_chunk_size, pars->fast_lkl ? "true":"false", pars->approx_EM ? "true":"false", pars->call_geno ? "true":"false", pars->max_iters, pars->min_epsilon, pars->n_threads, pars->seed, pars->quick ? "true":"false", version, pars->verbose); } if( pars->verbose > 4 ) printf("==> Verbose values greater than 4 for debugging purpose only. 
Expect large amounts of info on screen\n"); ///////////////////// // Check Arguments // ///////////////////// if(pars->in_glf == NULL) error(__FUNCTION__,"GL input file (-glf) missing!"); else if( strcmp(pars->in_glf, "-") == 0 ) { pars->in_glf_type = new char[6]; pars->in_glf_type = strcat(pars->in_glf_type, "STDIN"); } else { pars->in_glf_type = strrchr(pars->in_glf, '.'); if(pars->in_glf_type == NULL) error(__FUNCTION__,"invalid file type!"); } if(pars->out_file == NULL) error(__FUNCTION__,"output file (-out) missing!"); if(pars->n_ind == 0) error(__FUNCTION__,"number of individuals (-n_ind) missing!"); if(pars->n_sites == 0) error(__FUNCTION__,"number of sites (-n_sites) missing!"); /////////////////////// // Check input files // /////////////////////// // Get file total size struct stat st; stat(pars->in_glf, &st); if( strcmp(pars->in_glf_type, "STDIN") != 0 ) { if( pars->n_sites == st.st_size/sizeof(double)/pars->n_ind/3 && strcmp(pars->in_glf_type, ".glf") == 0 ) { if(pars->verbose >= 1) printf("==> UNCOMP input file (\"%s\"): number of sites (%lu) match expected file size\n", pars->in_glf_type, pars->n_sites); } else if( strcmp(pars->in_glf_type, ".glf") != 0 ) { if( pars->verbose >= 1) printf("==> COMPRESSED input file (\"%s\"): number of sites (%lu) do NOT match expected file size\n", pars->in_glf_type, pars->n_sites); } else error(__FUNCTION__,"wrong number of sites or invalid/corrupt file!"); } // Adjust max_chunk_size in case of fewer sites if(pars->max_chunk_size > pars->n_sites) { if( pars->verbose >= 1 ) printf("==> Fewer sites (%lu) than chunk_size (%lu). 
Reducing chunk size to match number of sites\n", pars->n_sites, pars->max_chunk_size); pars->max_chunk_size = pars->n_sites; } // Calculate total number of chunks pars->n_chunks = ceil( (double) pars->n_sites/ (double) pars->max_chunk_size ); if( pars->verbose >= 1 ) printf("==> Analysis will be run in %ld chunk(s)\n", pars->n_chunks); // Alocate memory for the chunk index pars->chunks_voffset = new int64_t[pars->n_chunks]; memset(pars->chunks_voffset, 0, pars->n_chunks*sizeof(int64_t)); // Adjust thread number to chunks if(pars->n_chunks < pars->n_threads) { if( pars->verbose >= 1 ) printf("==> Fewer chunks (%ld) than threads (%d). Reducing the number of threads to match number of chunks\n", pars->n_chunks, pars->n_threads); pars->n_threads = pars->n_chunks; } // Open input file #ifdef _USE_BGZF if( pars->verbose >= 1 ) printf("==> Using BGZF I/O library\n"); // Open BGZIP file if( strcmp(pars->in_glf_type, ".bgz") == 0 ) { if( (pars->in_glf_fh = bgzf_open(pars->in_glf, "rb")) < 0 ) error(__FUNCTION__,"Cannot open BGZIP file!"); } else error(__FUNCTION__,"BGZF library only supports BGZIP files!"); bgzf_set_cache_size(pars->in_glf_fh, CACHE_SIZE * 1024uL * 1024uL * 1024uL); #else if( pars->verbose >= 1 ) printf("==> Using native I/O library\n"); // Open GLF file if( strcmp(pars->in_glf_type, "STDIN") == 0 ) pars->in_glf_fh = stdin; else if( strcmp(pars->in_glf_type, ".glf") == 0 ) { if( (pars->in_glf_fh = fopen(pars->in_glf, "rb")) == NULL ) error(__FUNCTION__,"Cannot open GLF file!"); } else error(__FUNCTION__,"Standard library only supports UNCOMPRESSED GLF files!"); // Allocate memory and read from the file pars->data = new double* [pars->n_sites]; for(uint64_t s = 0; s < pars->n_sites; s++) { pars->data[s] = new double[pars->n_ind * 3]; if( fread (pars->data[s], sizeof(double), pars->n_ind * 3, pars->in_glf_fh) != pars->n_ind * 3) error(__FUNCTION__,"cannot read GLF file!"); if(pars->call_geno) call_geno(pars->data[s], pars->n_ind, 3); } #endif if( 
pars->in_glf_fh == NULL ) error(__FUNCTION__,"cannot open GLF file!"); /////////////////////////////////// // Declare variables for results // /////////////////////////////////// out_data *output = new out_data; output->site_freq = new double[pars->n_sites]; output->site_freq_num = new double[pars->n_sites]; output->site_freq_den = new double[pars->n_sites]; output->site_prob_var = new double[pars->n_sites]; output->site_tmpprob_var = new double[pars->n_sites]; output->indF = new double[pars->n_ind]; output->indF_num = new double[pars->n_ind]; output->indF_den = new double[pars->n_ind]; output->ind_lkl = new double[pars->n_ind]; // Initialize output init_output(pars, output); ////////////////// // Analyze Data // ////////////////// if( pars->verbose >= 1 && !pars->fast_lkl && strcmp("e", pars->init_values) != 0 ) { printf("==> Initial LogLkl: %.15f\n", full_HWE_like(pars, output->site_freq, output->indF, 0, pars->n_ind)); fflush(stdout); } do_EM(pars, output); if( pars->verbose >= 1 ) printf("\nFinal logLkl: %f\n", output->global_lkl); ////////////////// // Print Output // ////////////////// FILE *out_file; if( pars->verbose >= 1 ) printf("Printing Output...\n"); out_file = fopen(pars->out_file, "w"); if(out_file == NULL) error(__FUNCTION__,"Cannot open OUTPUT file!"); for(uint16_t i = 0; i < pars->n_ind; i++) fprintf(out_file,"%f\n", output->indF[i]); fclose(out_file); ////////////////////// // Close Input File // ////////////////////// if( pars->verbose >= 1 ) printf("Exiting...\n"); #ifdef _USE_BGZF bgzf_close(pars->in_glf_fh); #else for(uint64_t s = 0; s < pars->n_sites; s++) delete [] pars->data[s]; delete [] pars->data; fclose(pars->in_glf_fh); #endif ///////////////// // Free Memory // ///////////////// delete [] output->site_freq; delete [] output->site_freq_num; delete [] output->site_freq_den; delete [] output->site_prob_var; delete [] output->indF; delete [] output->indF_num; delete [] output->indF_den; delete [] output->ind_lkl; delete output; //if( 
strcmp("e", pars->init_values) == 0 ) //delete [] pars->init_values; delete [] pars->chunks_voffset; delete pars; return 0; }
/*
 * Concatenates the compressed BCF files listed in args->fnames into
 * args->output_fname without recompressing the bulk of the data.
 *
 * For every input the header block is decoded (and written only for the
 * first file), the records sharing that block are re-emitted, and all
 * remaining compressed bytes are streamed through unchanged. The last 28
 * bytes of each input are held back so a trailing BGZF EOF block can be
 * dropped instead of being copied into the middle of the output.
 *
 * Any failure is reported through error().
 */
static void naive_concat(args_t *args)
{
    // only compressed BCF atm
    BGZF *bgzf_out = bgzf_open(args->output_fname,"w");
    if ( !bgzf_out ) error("Failed to open: %s\n", args->output_fname);

    const size_t page_size = 32768;
    char *buf = (char*) malloc(page_size);
    if ( !buf ) error("Failed to allocate %d bytes\n", (int)page_size);
    kstring_t tmp = {0,0,0};
    int i;
    for (i=0; i<args->nfnames; i++)
    {
        htsFile *hts_fp = hts_open(args->fnames[i],"r");
        if ( !hts_fp ) error("Failed to open: %s\n", args->fnames[i]);
        htsFormat type = *hts_get_format(hts_fp);

        if ( type.format==vcf ) error("The --naive option currently works only for compressed BCFs, sorry :-/\n");
        if ( type.compression!=bgzf ) error("The --naive option currently works only for compressed BCFs, sorry :-/\n");

        BGZF *fp = hts_get_bgzfp(hts_fp);
        if ( !fp || bgzf_read_block(fp) != 0 || !fp->block_length )
            error("Failed to read %s: %s\n", args->fnames[i], strerror(errno));

        uint8_t magic[5];
        if ( bgzf_read(fp, magic, 5) != 5 ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
        if (strncmp((char*)magic, "BCF\2\2", 5) != 0) error("Invalid BCF magic string in %s\n", args->fnames[i]);

        if ( bgzf_read(fp, &tmp.l, 4) != 4 ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
        hts_expand(char,tmp.l,tmp.m,tmp.s);
        if ( bgzf_read(fp, tmp.s, tmp.l) != tmp.l ) error("Failed to read the BCF header in %s\n", args->fnames[i]);

        // write only the first header
        if ( i==0 )
        {
            if ( bgzf_write(bgzf_out, "BCF\2\2", 5) !=5 ) error("Failed to write %d bytes to %s\n", 5,args->output_fname);
            if ( bgzf_write(bgzf_out, &tmp.l, 4) !=4 ) error("Failed to write %d bytes to %s\n", 4,args->output_fname);
            // tmp.l is a size_t; cast it to match the %d conversion
            if ( bgzf_write(bgzf_out, tmp.s, tmp.l) != tmp.l) error("Failed to write %d bytes to %s\n", (int)tmp.l,args->output_fname);
        }

        // Output all non-header data that were read together with the header block
        int nskip = fp->block_offset;
        if ( fp->block_length - nskip > 0 )
        {
            // the failing handle is the output, so report its error code
            if ( bgzf_write(bgzf_out, fp->uncompressed_block+nskip, fp->block_length-nskip)<0 ) error("Error: %d\n",bgzf_out->errcode);
        }
        if ( bgzf_flush(bgzf_out)<0 ) error("Error: %d\n",bgzf_out->errcode);


        // Stream the rest of the file as it is, without recompressing, but remove BGZF EOF blocks
        ssize_t nread, ncached = 0, nwr;
        const int neof = 28;
        char cached[neof];
        while (1)
        {
            nread = bgzf_raw_read(fp, buf, page_size);

            // page_size boundary may occur in the middle of the EOF block, so we need to cache the blocks' ends
            if ( nread<=0 ) break;
            if ( nread<=neof )      // last block
            {
                if ( ncached )
                {
                    // flush the part of the cache that won't be needed
                    nwr = bgzf_raw_write(bgzf_out, cached, nread);
                    if (nwr != nread) error("Write failed, wrote %d instead of %d bytes.\n", (int)nwr,(int)nread);

                    // make space in the cache so that we can append to the end
                    if ( nread!=neof ) memmove(cached,cached+nread,neof-nread);
                }

                // fill the cache and check for eof outside this loop
                memcpy(cached+neof-nread,buf,nread);
                break;
            }

            // not the last block, flush the cache if full
            if ( ncached )
            {
                nwr = bgzf_raw_write(bgzf_out, cached, ncached);
                if (nwr != ncached) error("Write failed, wrote %d instead of %d bytes.\n", (int)nwr,(int)ncached);
                ncached = 0;
            }

            // fill the cache
            nread -= neof;
            memcpy(cached,buf+nread,neof);
            ncached = neof;

            nwr = bgzf_raw_write(bgzf_out, buf, nread);
            if (nwr != nread) error("Write failed, wrote %d instead of %d bytes.\n", (int)nwr,(int)nread);
        }
        // Emit the held-back tail unless it is exactly the BGZF EOF block.
        if ( ncached && memcmp(cached,"\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0",neof) )
        {
            nwr = bgzf_raw_write(bgzf_out, cached, neof);
            if (nwr != neof) error("Write failed, wrote %d instead of %d bytes.\n", (int)nwr,(int)neof);
        }
        if (hts_close(hts_fp)) error("Close failed: %s\n",args->fnames[i]);
    }
    free(buf);
    free(tmp.s);
    // The handle must not be dereferenced once bgzf_close has been called on
    // it, so the failure is reported by file name instead of by errcode.
    if (bgzf_close(bgzf_out) < 0) error("Close failed: %s\n",args->output_fname);
}
int reheader_file(const char *header, const char *file, int meta) { BGZF *fp = bgzf_open(file,"r"); if (bgzf_read_block(fp) != 0 || !fp->block_length) return -1; char *buffer = fp->uncompressed_block; int skip_until = 0; if ( buffer[0]==meta ) { skip_until = 1; // Skip the header while (1) { if ( buffer[skip_until]=='\n' ) { skip_until++; if ( skip_until>=fp->block_length ) { if (bgzf_read_block(fp) != 0 || !fp->block_length) error("no body?\n"); skip_until = 0; } // The header has finished if ( buffer[skip_until]!=meta ) break; } skip_until++; if ( skip_until>=fp->block_length ) { if (bgzf_read_block(fp) != 0 || !fp->block_length) error("no body?\n"); skip_until = 0; } } } FILE *fh = fopen(header,"r"); if ( !fh ) error("%s: %s", header,strerror(errno)); int page_size = getpagesize(); char *buf = valloc(page_size); BGZF *bgzf_out = bgzf_fdopen(fileno(stdout), "w"); ssize_t nread; while ( (nread=fread(buf,1,page_size-1,fh))>0 ) { if ( nread<page_size-1 && buf[nread-1]!='\n' ) buf[nread++] = '\n'; if (bgzf_write(bgzf_out, buf, nread) < 0) error("Error: %s\n",bgzf_out->error); } fclose(fh); if ( fp->block_length - skip_until > 0 ) { if (bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until) < 0) error("Error: %s\n",fp->error); } if (bgzf_flush(bgzf_out) < 0) error("Error: %s\n",bgzf_out->error); while (1) { #ifdef _USE_KNETFILE nread = knet_read(fp->x.fpr, buf, page_size); #else nread = fread(buf, 1, page_size, fp->file); #endif if ( nread<=0 ) break; #ifdef _USE_KNETFILE int count = fwrite(buf, 1, nread, bgzf_out->x.fpw); #else int count = fwrite(buf, 1, nread, bgzf_out->file); #endif if (count != nread) error("Write failed, wrote %d instead of %d bytes.\n", count,(int)nread); } if (bgzf_close(bgzf_out) < 0) error("Error: %s\n",bgzf_out->error); return 0; }
void ReadDB::import_reads(const std::string& input_filename, const std::string& out_fasta_filename) { // Open readers FILE* read_fp = fopen(input_filename.c_str(), "r"); if(read_fp == NULL) { fprintf(stderr, "error: could not open %s for read\n", input_filename.c_str()); exit(EXIT_FAILURE); } gzFile gz_read_fp = gzdopen(fileno(read_fp), "r"); if(gz_read_fp == NULL) { fprintf(stderr, "error: could not open %s using gzdopen\n", input_filename.c_str()); exit(EXIT_FAILURE); } // Open writers FILE* write_fp = fopen(out_fasta_filename.c_str(), "w"); if(write_fp == NULL) { fprintf(stderr, "error: could not open %s for write\n", out_fasta_filename.c_str()); exit(EXIT_FAILURE); } BGZF* bgzf_write_fp = bgzf_dopen(fileno(write_fp), "w"); if(bgzf_write_fp == NULL) { fprintf(stderr, "error: could not open %s for bgzipped write\n", out_fasta_filename.c_str()); exit(EXIT_FAILURE); } // read input sequences, add to DB and convert to fasta int ret = 0; kseq_t* seq = kseq_init(gz_read_fp); while((ret = kseq_read(seq)) >= 0) { // Check for a path to the fast5 file in the comment of the read std::string path = ""; if(seq->comment.l > 0) { // This splitting code implicitly handles both the 2 and 3 field // fasta format that poretools will output. The FAST5 path // is always the last field. std::vector<std::string> fields = split(seq->comment.s, ' '); path = fields.back(); // as a sanity check we require the path name to end in ".fast5" if(path.length() < 6 || path.substr(path.length() - 6) != ".fast5") { path = ""; } } // sanity check that the read does not exist in the database // JTS 04/2019: changed error to warning to account for duplicate reads coming out of // some versions of guppy. 
auto iter = m_data.find(seq->name.s); if(iter != m_data.end()) { fprintf(stderr, "Warning: duplicate read name %s found in fasta file\n", seq->name.s); continue; } // add path add_signal_path(seq->name.s, path); // write sequence in gzipped fasta for fai indexing later std::string out_record; out_record += ">"; out_record += seq->name.s; out_record += "\n"; out_record += seq->seq.s; out_record += "\n"; size_t write_length = bgzf_write(bgzf_write_fp, out_record.c_str(), out_record.length()); if(write_length != out_record.length()) { fprintf(stderr, "error in bgzf_write, aborting\n"); exit(EXIT_FAILURE); } } // check for abnormal exit conditions if(ret <= -2) { fprintf(stderr, "kseq_read returned %d indicating an error with the input file %s\n", ret, input_filename.c_str()); exit(EXIT_FAILURE); } // cleanup kseq_destroy(seq); gzclose(gz_read_fp); fclose(read_fp); bgzf_close(bgzf_write_fp); fclose(write_fp); }
void signalFromBAM(const string bamFileName, const string sigFileName, Parameters P) { bam1_t *bamA; bamA=bam_init1(); double nMult=0, nUniq=0; if (P.outWigFlags.norm==1) {//count reads in the BAM file BGZF *bamIn=bgzf_open(bamFileName.c_str(),"r"); bam_hdr_t *bamHeader=bam_hdr_read(bamIn); while ( true ) {//until the end of file int bamBytes1=bam_read1(bamIn, bamA); if (bamBytes1<0) break; //end of file if (bamA->core.tid<0) continue; //unmapped read // if ( !std::regex_match(chrName.at(bamA->core.tid),std::regex(P.outWigReferencesPrefix))) continue; //reference does not mathc required references if ( P.outWigReferencesPrefix!="-" && (P.outWigReferencesPrefix.compare(0,P.outWigReferencesPrefix.size(),bamHeader->target_name[bamA->core.tid],P.outWigReferencesPrefix.size())!=0) ) continue; //reference does not match required references uint8_t* aNHp=bam_aux_get(bamA,"NH"); if (aNHp!=NULL) { uint32_t aNH=bam_aux2i(aNHp); if (aNH==1) {//unique mappers ++nUniq; } else if (aNH>1) { nMult+=1.0/aNH; }; }; }; bgzf_close(bamIn); }; BGZF *bamIn=bgzf_open(bamFileName.c_str(),"r"); bam_hdr_t *bamHeader=bam_hdr_read(bamIn); int sigN=P.outWigFlags.strand ? 4 : 2; double *normFactor=new double[sigN]; ofstream **sigOutAll=new ofstream* [sigN]; string* sigOutFileName=new string[sigN]; sigOutFileName[0]=sigFileName+".Unique.str1.out"; sigOutFileName[1]=sigFileName+".UniqueMultiple.str1.out"; if (P.outWigFlags.strand) { sigOutFileName[2]=sigFileName+".Unique.str2.out"; sigOutFileName[3]=sigFileName+".UniqueMultiple.str2.out"; }; for (int ii=0; ii<sigN; ii++) { sigOutFileName[ii]+= (P.outWigFlags.format==0 ? 
".bg" : ".wig"); sigOutAll[ii]=new ofstream ( sigOutFileName[ii].c_str() ); }; if (P.outWigFlags.norm==0) {//raw counts normFactor[0]=1; normFactor[1]=1; } else if (P.outWigFlags.norm==1) {//normlaized normFactor[0]=1.0e6 / nUniq; normFactor[1]=1.0e6 / (nUniq+nMult); for (int is=0;is<sigN;is++) {//formatting double output *sigOutAll[is]<<setiosflags(ios::fixed) << setprecision(5); }; }; if (P.outWigFlags.strand) { normFactor[2]=normFactor[0]; normFactor[3]=normFactor[1]; }; int iChr=-999; double *sigAll=NULL; uint32_t chrLen=0; while ( true ) {//until the end of file int bamBytes1=bam_read1(bamIn, bamA); if (bamA->core.tid!=iChr || bamBytes1<0) { //output to file if (iChr!=-999) {//iChr=-999 marks chromosomes that are not output, including unmapped reads for (int is=0;is<sigN;is++) { if (P.outWigFlags.format==1) { *sigOutAll[is] <<"variableStep chrom="<<bamHeader->target_name[iChr] <<"\n"; }; double prevSig=0; for (uint32_t ig=0;ig<chrLen;ig++) { double newSig=sigAll[sigN*ig+is]; if (P.outWigFlags.format==0) {//bedGraph if (newSig!=prevSig) { if (prevSig!=0) {//finish previous record *sigOutAll[is] <<ig<<"\t"<<prevSig*normFactor[is] <<"\n"; //1-based end }; if (newSig!=0) { *sigOutAll[is] << bamHeader->target_name[iChr] <<"\t"<< ig <<"\t"; //0-based beginning }; prevSig=newSig; }; } else if (P.outWigFlags.format==1){//wiggle if (newSig!=0) { *sigOutAll[is] <<ig+1<<"\t"<<newSig*normFactor[is] <<"\n"; }; }; }; }; }; if (bamBytes1<0) {//no more reads break; }; iChr=bamA->core.tid; if ( iChr==-1 || (P.outWigReferencesPrefix!="-" && (P.outWigReferencesPrefix.compare(0,P.outWigReferencesPrefix.size(),bamHeader->target_name[bamA->core.tid],P.outWigReferencesPrefix.size())!=0) ) ) { iChr=-999; continue; //reference does not match required references }; chrLen=bamHeader->target_len[iChr]+1;//one extra base at the end which sohuld always be 0 delete [] sigAll; sigAll= new double[sigN*chrLen]; memset(sigAll, 0, sizeof(*sigAll)*sigN*chrLen); }; // uint32_t nCigar 
=(bamA->core.flag<<16)>>16; // uint32_t mapFlag=bamA->core.flag>>16; // uint32_t mapQ=(bamA->core.flag<<16)>>24; #define BAM_CIGAR_OperationShift 4 #define BAM_CIGAR_LengthBits 28 #define BAM_CIGAR_M 0 #define BAM_CIGAR_I 1 #define BAM_CIGAR_D 2 #define BAM_CIGAR_N 3 #define BAM_CIGAR_S 4 #define BAM_CIGAR_H 5 #define BAM_CIGAR_P 6 #define BAM_CIGAR_EQ 7 #define BAM_CIGAR_X 8 //by default, alignments marked as duplicate are not processed if ( (bamA->core.flag & 0x400) > 0 ) continue; //NH attribute uint8_t* aNHp=bam_aux_get(bamA,"NH"); uint32_t aNH; if (aNHp==NULL) { aNH=1; //no NH tag: assume NH=1 //continue; //do not process lines without NH field } else { aNH=bam_aux2i(bam_aux_get(bamA,"NH")); //write a safer function allowing for lacking NH tag }; if (aNH==0) continue; //do not process lines without NH=0 uint32_t aG=bamA->core.pos; uint32_t iStrand=0; if (P.outWigFlags.strand) {//strand for stranded data from SAM flag iStrand= ( (bamA->core.flag & 0x10) > 0 ) == ( (bamA->core.flag & 0x80) == 0 );//0/1 for +/- }; if (P.outWigFlags.type==1) {//5' of the1st read signal only, RAMPAGE/CAGE if ( (bamA->core.flag & 0x80)>0) continue; //skip if this the second mate if (iStrand==0) { if (aNH==1) {//unique mappers sigAll[aG*sigN+0+2*iStrand]++; }; sigAll[aG*sigN+1+2*iStrand]+=1.0/aNH;//U+M, normalized by the number of multi-mapping loci continue; //record only the first position }; }; uint32_t* cigar=(uint32_t*) (bamA->data+bamA->core.l_qname); for (uint32_t ic=0; ic<bamA->core.n_cigar; ic++) { uint32_t cigOp=(cigar[ic]<<BAM_CIGAR_LengthBits)>>BAM_CIGAR_LengthBits; uint32_t cigL=cigar[ic]>>BAM_CIGAR_OperationShift; switch (cigOp) { case(BAM_CIGAR_D): case(BAM_CIGAR_N): aG+=cigL; break; case(BAM_CIGAR_M): if (P.outWigFlags.type==0 || (P.outWigFlags.type==2 && (bamA->core.flag & 0x80)>0 )) {//full signal, or second mate onyl signal for (uint32_t ig=0;ig<cigL;ig++) { if (aG>=chrLen) { cerr << "BUG: alignment extends past chromosome in signalFromBAM.cpp\n"; exit(-1); }; if 
(aNH==1) {//unique mappers sigAll[aG*sigN+0+2*iStrand]++; }; sigAll[aG*sigN+1+2*iStrand]+=1.0/aNH;//U+M, normalized by the number of multi-mapping loci aG++; }; } else { aG+=cigL; }; }; }; if (P.outWigFlags.type==1) {//full signal --aG; if (aNH==1) {//unique mappers sigAll[aG*sigN+0+2*iStrand]++; }; sigAll[aG*sigN+1+2*iStrand]+=1.0/aNH;//U+M, normalized by the number of multi-mapping loci }; }; delete [] sigAll; for (int is=0; is<sigN; is++) {// flush/close all signal files sigOutAll[is]->flush(); sigOutAll[is]->close(); }; };