/*
 * Build a FASTA index for `fn` and write it to "<fn>.fai".
 *
 * The FASTA file is opened through BGZF.  When the handle reports
 * is_compressed, a block index is also initialised before indexing and
 * dumped afterwards with the ".gzi" suffix.
 *
 * Returns 0 on success and -1 on failure (out of memory, the FASTA file
 * cannot be opened, the index cannot be built, or the ".fai" file cannot
 * be opened for writing).
 */
int fai_build(const char *fn)
{
    char *str;
    BGZF *bgzf;
    FILE *fp;
    faidx_t *fai;

    /* Room for fn + ".fai" + terminating NUL. */
    str = (char*)calloc(strlen(fn) + 5, 1);
    if ( !str ) {
        fprintf(stderr, "[fai_build] out of memory\n");
        return -1;
    }
    sprintf(str, "%s.fai", fn);

    bgzf = bgzf_open(fn, "r");
    if ( !bgzf ) {
        fprintf(stderr, "[fai_build] fail to open the FASTA file %s\n",fn);
        free(str);
        return -1;
    }
    if ( bgzf->is_compressed ) bgzf_index_build_init(bgzf);

    fai = fai_build_core(bgzf);
    if ( !fai ) {
        /* fai_build_core() is assumed to report failure with a null
         * pointer; without this guard the null index would be handed to
         * fai_save() and fai_destroy() below. */
        fprintf(stderr, "[fai_build] fail to build index for the FASTA file %s\n", fn);
        bgzf_close(bgzf);
        free(str);
        return -1;
    }

    if ( bgzf->is_compressed ) bgzf_index_dump(bgzf, fn, ".gzi");
    bgzf_close(bgzf);

    fp = fopen(str, "wb");
    if ( !fp ) {
        fprintf(stderr, "[fai_build] fail to write FASTA index %s\n",str);
        fai_destroy(fai);
        free(str);
        return -1;
    }
    fai_save(fai, fp);
    fclose(fp);

    free(str);
    fai_destroy(fai);
    return 0;
}
// Records the BCF and index file names and opens the BCF file for reading
// ("rb") through BGZF. No index data is loaded here: data_ starts as NULL.
// NOTE(review): the result of bgzf_open() is not checked in this constructor.
SingleChromosomeBCFIndex::SingleChromosomeBCFIndex(const std::string& bcfFile,
                                                   const std::string& indexFile)
    : bcfFile_(bcfFile),
      indexFile_(indexFile),
      fBcfFile_(bgzf_open(bcfFile.c_str(), "rb")),
      data_(NULL) {}
int main_reheader(int argc, char *argv[]) { bam_header_t *h; BGZF *in; if (argc != 3) { fprintf(stderr, "Usage: samtools reheader <in.header.sam> <in.bam>\n"); return 1; } { // read the header tamFile fph = sam_open(argv[1]); if (fph == 0) { fprintf(stderr, "[%s] fail to read the header from %s.\n", __func__, argv[1]); return 1; } h = sam_header_read(fph); sam_close(fph); } in = strcmp(argv[2], "-")? bgzf_open(argv[2], "r") : bgzf_fdopen(fileno(stdin), "r"); if (in == 0) { fprintf(stderr, "[%s] fail to open file %s.\n", __func__, argv[2]); return 1; } bam_reheader(in, h, fileno(stdout)); bgzf_close(in); return 0; }
int print(int argc, char**argv){ if(argc==0){ fprintf(stderr,"print FILE [-r chrName]\n"); exit(0); } char *base = *argv; char* outnames_bin = append(base,BIN); char* outnames_idx = append(base,IDX); fprintf(stderr,"Assuming binfile:%s and indexfile:%s\n",outnames_bin,outnames_idx); myMap mm = getMap(outnames_idx); writemap(stderr,mm); BGZF *fp = bgzf_open(outnames_bin,"r"); --argc;++argv; // fprintf(stderr,"argc=%d\n",argc); int argP =0; char *chr=NULL; while(argP<argc){ // fprintf(stderr,"args=%s\n",argv[argP]); if(argP==argc){ fprintf(stderr,"incomplete arguments list\n"); exit(0); } if(strcmp("-r",argv[argP])==0) chr = argv[argP+1]; else { fprintf(stderr,"Unknown argument:%s\n",argv[argP]); exit(0); } argP +=2; } if(chr!=NULL){ myMap::iterator it = mm.find(chr); if(it==mm.end()){ fprintf(stderr,"Problem finding chr: %s in index\n",chr); exit(0); } datum d = it->second; bgzf_seek(fp,d.fpos,SEEK_SET); } while(1){ perChr pc = getPerChr(fp); if(pc.nSites==0) break; fprintf(stderr,"pc.chr=%s pc.nSites=%zu firstpos=%d lastpos=%d\n",pc.chr,pc.nSites,pc.posi[0],pc.posi[pc.nSites-1]); print_main(pc,stdout); if(chr!=NULL) break; dalloc(pc); } return 0; }
// Opens `fn` for BGZF writing and stores the handle in this->fp.
//
// `append` is not honoured: when it is set a notice is printed to stderr and
// the file is still opened with mode "w".
//
// Returns 0 on success, -1 (after printing an error) when the open fails.
int BGZipFileWriter::open(const char* fn, bool append) {
  if (append) {
    fprintf(stderr, "Gzip does not support appending.\n");
  }

  this->fp = bgzf_open(fn, "w");
  if (this->fp) {
    return 0;
  }

  fprintf(stderr, "ERROR: Cannot open %s for write\n", fn);
  return -1;
}
void BAMbinSortByCoordinate(uint32 iBin, uint binN, uint binS, uint nThreads, string dirBAMsort, Parameters *P) { if (binS==0) return; //nothing to do for empty bins //allocate arrays char *bamIn=new char[binS]; uint *startPos=new uint[binN*3]; uint bamInBytes=0; //load all aligns for (uint it=0; it<nThreads; it++) { string bamInFile=dirBAMsort+to_string(it)+"/"+to_string((uint) iBin); ifstream bamInStream (bamInFile.c_str()); bamInStream.read(bamIn+bamInBytes,binS);//read the whole file bamInBytes += bamInStream.gcount(); bamInStream.close(); remove(bamInFile.c_str()); }; if (bamInBytes!=binS) { ostringstream errOut; errOut << "EXITING because of FATAL ERROR: number of bytes expected from the BAM bin does not agree with the actual size on disk: "; errOut << binS <<" "<< bamInBytes <<" "<< iBin <<"\n"; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, 1, *P); }; //extract coordinates for (uint ib=0,ia=0;ia<binN;ia++) { uint32 *bamIn32=(uint32*) (bamIn+ib); startPos[ia*3] =( ((uint) bamIn32[1]) << 32) | ( (uint)bamIn32[2] ); startPos[ia*3+2]=ib; ib+=bamIn32[0]+sizeof(uint32);//note that size of the BAM record does not include the size record itself startPos[ia*3+1]=*( (uint*) (bamIn+ib) ); //read order ib+=sizeof(uint); }; //sort qsort((void*) startPos, binN, sizeof(uint)*3, funCompareUint2); BGZF *bgzfBin; bgzfBin=bgzf_open((dirBAMsort+"/b"+to_string((uint) iBin)).c_str(),("w"+to_string((long long) P->outBAMcompression)).c_str()); outBAMwriteHeader(bgzfBin,P->samHeaderSortedCoord,P->chrName,P->chrLength); //send ordered aligns to bgzf one-by-one for (uint ia=0;ia<binN;ia++) { char* ib=bamIn+startPos[ia*3+2]; bgzf_write(bgzfBin,ib, *((uint32*) ib)+sizeof(uint32) ); }; bgzf_flush(bgzfBin); bgzf_close(bgzfBin); //release memory delete [] bamIn; delete [] startPos; };
faidx_t *fai_load(const char *fn) { char *str; FILE *fp; faidx_t *fai; str = (char*)calloc(strlen(fn) + 5, 1); sprintf(str, "%s.fai", fn); #ifdef _USE_KNETFILE if (strstr(fn, "ftp://") == fn || strstr(fn, "http://") == fn) { fp = download_and_open(str); if ( !fp ) { fprintf(stderr, "[fai_load] failed to open remote FASTA index %s\n", str); free(str); return 0; } } else #endif fp = fopen(str, "rb"); if (fp == 0) { fprintf(stderr, "[fai_load] build FASTA index.\n"); fai_build(fn); fp = fopen(str, "rb"); if (fp == 0) { fprintf(stderr, "[fai_load] fail to open FASTA index.\n"); free(str); return 0; } } fai = fai_read(fp); fclose(fp); fai->bgzf = bgzf_open(fn, "rb"); free(str); if (fai->bgzf == 0) { fprintf(stderr, "[fai_load] fail to open FASTA file.\n"); return 0; } if ( fai->bgzf->is_compressed==1 ) { if ( bgzf_index_load(fai->bgzf, fn, ".gzi") < 0 ) { fprintf(stderr, "[fai_load] failed to load .gzi index: %s[.gzi]\n", fn); fai_destroy(fai); return NULL; } } return fai; }
/*
 * Open an existing FASTQ index and fill in `index`.
 *
 * Steps, each attempted only if the previous one succeeded:
 *   1. open the FASTQ file through BGZF        -> IFQ_BAD_FASTQ on failure
 *   2. open "<index_prefix>.hsh" for reading   -> IFQ_BAD_PREFIX
 *   3. load the hash from that file            -> IFQ_BAD_HASH
 *   4. open "<index_prefix>.lup" (O_RDWR)      -> IFQ_BAD_PREFIX
 *   5. mmap the lookup file read-only          -> IFQ_BAD_INDEX
 *
 * Returns IFQ_OK when every step succeeds.  Whatever was opened before a
 * failing step is left in `index`; nothing is released here except the two
 * temporary path strings.
 */
ifq_codes_t ifq_open_index(char *fastq_path, char *index_prefix, ifq_index_t *index)
{
    char *hash_path   = concatenate( index_prefix, ".hsh" );
    char *lookup_path = concatenate( index_prefix, ".lup" );
    ifq_codes_t status = IFQ_OK;

    if( (index->fastq_file = bgzf_open( fastq_path, "r" )) == NULL )
    {
        status = IFQ_BAD_FASTQ;
    }
    else if( (index->hash_file = fopen( hash_path, "r" )) == NULL )
    {
        status = IFQ_BAD_PREFIX;
    }
    else if( (index->hash = cmph_load( index->hash_file )) == NULL )
    {
        status = IFQ_BAD_HASH;
    }
    else if( (index->lookup_fd = open( lookup_path, O_RDWR )) == -1 )
    {
        status = IFQ_BAD_PREFIX;
    }
    else
    {
        /* Map the whole lookup table; its size comes from the file itself. */
        struct stat info;
        fstat( index->lookup_fd, &info );
        index->lookup_size = info.st_size;

        index->table = (uint64_t *) mmap( NULL, index->lookup_size, PROT_READ,
                                          MAP_FILE | MAP_SHARED, index->lookup_fd, 0 );
        if( index->table == MAP_FAILED )
        {
            status = IFQ_BAD_INDEX;
        }
    }

    free( hash_path );
    free( lookup_path );
    return status;
}
int val_bed(int argc, char**argv){ if(argc!=1){ fprintf(stderr,"val_bed FILE.gz \n"); exit(0); } char *base = *argv; char* outnames_bin = append(base,BIN); char* outnames_gz = base; fprintf(stderr,"Assuming binfile:%s and gzfile:%s\n",outnames_bin,outnames_gz); BGZF *fp = bgzf_open(outnames_bin,"r"); gzFile gz =gzopen(outnames_gz,"r"); char buf[4096]; gzgets(gz,buf,4096); while(1){ perChr pc = getPerChr(fp); if(pc.nSites==0) break; fprintf(stderr,"pc.chr=%s pc.nSites=%zu firstpos=%d lastpos=%d\n",pc.chr,pc.nSites,pc.posi[0],pc.posi[pc.nSites-1]); for(size_t i=0;i<pc.nSites;i++){ gzgets(gz,buf,4096); char *chr = strtok(buf,"\n\t "); if(strcmp(chr,pc.chr)!=0){ fprintf(stderr,"Problem with nonmatching chromosome: \'%s\' vs \'%s\'\n",chr,pc.chr); exit(0); } int posi =atoi(strtok(NULL,"\t\n ")); if(posi!=pc.posi[i]){ fprintf(stderr,"Problem with nonmatching position\n"); exit(0); } float tW = atof(strtok(NULL,"\t\n ")); float tP = atof(strtok(NULL,"\t\n ")); float tF = atof(strtok(NULL,"\t\n ")); float tH = atof(strtok(NULL,"\t\n ")); float tL = atof(strtok(NULL,"\t\n ")); fun(tW,pc.tW[i]); fun(tP,pc.tP[i]); fun(tF,pc.tF[i]); fun(tH,pc.tH[i]); fun(tL,pc.tL[i]); } fprintf(stderr,"FILE: %s chr: %s OK\n",base,pc.chr); dalloc(pc); } fprintf(stderr,"ALL OK: %s\n",base); return 0; }
/*
 * Stress test: 10000 times, open the input with BGZF, switch the handle to
 * multi-threaded mode with bgzf_mt(fpin, 2, 256) and close it again without
 * reading anything.  Aborts if a close fails.
 */
int main(int argc, char *argv[]) {
    if (argc <= 1) {
        fprintf(stderr, "Usage: thrash_threads1 input.bam\n");
        exit(1);
    }

    int i;
    for (i = 0; i < 10000; i++) {
        printf("i=%d\n", i);
        BGZF *fpin = bgzf_open(argv[1], "r");
        if (!fpin) {
            /* Do not hand a null handle to bgzf_mt(). */
            fprintf(stderr, "Failed to open %s\n", argv[1]);
            exit(1);
        }
        bgzf_mt(fpin, 2, 256);
        if (bgzf_close(fpin) < 0) abort();
    }

    return 0;
}
int main_getalt(int argc, char *argv[]) { int c; char *fn; BGZF *fp; bcf1_t *b; bcf_hdr_t *h; kstring_t s = {0,0,0}; while ((c = getopt(argc, argv, "")) >= 0) { } if (argc - optind == 0) { fprintf(stderr, "Usage: bgt getalt <bgt-base>\n"); return 1; } fn = (char*)calloc(strlen(argv[optind]) + 5, 1); sprintf(fn, "%s.bcf", argv[optind]); fp = bgzf_open(fn, "r"); free(fn); assert(fp); h = bcf_hdr_read(fp); b = bcf_init1(); while (bcf_read1(fp, b) >= 0) { char *ref, *alt; int l_ref, l_alt, i, min_l; bcf_get_ref_alt1(b, &l_ref, &ref, &l_alt, &alt); min_l = l_ref < l_alt? l_ref : l_alt; for (i = 0; i < min_l && ref[i] == alt[i]; ++i); s.l = 0; kputs(h->id[BCF_DT_CTG][b->rid].key, &s); kputc(':', &s); kputw(b->pos + 1 + i, &s); kputc(':', &s); kputw(b->rlen - i, &s); kputc(':', &s); kputsn(alt + i, l_alt - i, &s); puts(s.s); } bcf_destroy1(b); bcf_hdr_destroy(h); bgzf_close(fp); free(s.s); return 0; }
// Opens `filename` with the given `mode` through BGZF.
//
// A filename of "-" selects stdout when the mode starts with 'w'/'W' and
// stdin when it starts with 'r'/'R'; anything else is opened by name.
// When a file opened for reading must carry a BGZF EOF block
// (ourRequireEofBlock) and bgzf_check_EOF() reports 0, a message is written
// to std::cerr and the handle is closed again. Otherwise the current position
// is remembered in myStartPos.
BgzfFileType::BgzfFileType(const char * filename, const char * mode)
{
    const bool forWrite  = (mode[0] == 'w') || (mode[0] == 'W');
    const bool forRead   = (mode[0] == 'r') || (mode[0] == 'R');
    const bool stdStream = (strcmp(filename, "-") == 0);

    if(stdStream && forWrite)
    {
        // Write to stdout.
        bgzfHandle = bgzf_fdopen(fileno(stdout), mode);
    }
    else if(stdStream && forRead)
    {
        // Read from stdin.
        bgzfHandle = bgzf_fdopen(fileno(stdin), mode);
    }
    else
    {
        bgzfHandle = bgzf_open(filename, mode);
    }

    myStartPos = 0;
    if (bgzfHandle != NULL)
    {
        // The EOF probe only runs for read mode with the requirement enabled.
        const bool eofBlockMissing = forRead && ourRequireEofBlock &&
                                     (bgzf_check_EOF(bgzfHandle) == 0);
        if (eofBlockMissing)
        {
            std::cerr << "BGZF EOF marker is missing in " << filename << std::endl;
            // The block is supposed to be there but is not: close the file.
            close();
        }
        else
        {
            // Properly formatted file: remember where the data starts.
            myStartPos = bgzf_tell(bgzfHandle);
        }
    }

    myEOF = false;
}
/*
 * Check whether the first 10 bytes of `fn` equal the expected 10-byte
 * header prefix below.
 *
 * Returns  1 if they match,
 *          0 if 10 bytes were read but differ,
 *         -1 if the file cannot be opened or is shorter than 10 bytes.
 */
int bgzf_check_bgzf(const char *fn)
{
    unsigned char expected[] = "\037\213\010\4\0\0\0\0\0\377";
    unsigned char head[10];
    BGZF *fp;
    int got;

    fp = bgzf_open(fn, "r");
    if (fp == 0)
    {
        fprintf(stderr, "[bgzf_check_bgzf] failed to open the file: %s\n",fn);
        return -1;
    }

    /* Read the raw bytes directly from the underlying file. */
    got = fread(head, 1, 10, fp->file);
    bgzf_close(fp);

    if (got != 10)
        return -1;
    return memcmp(expected, head, 10) == 0 ? 1 : 0;
}
int do_stat(int argc, char**argv){ if(argc==0){ fprintf(stderr,"do_stat FILE -win -step -nChr [-r chrName -type [0,1,2]]\n"); exit(0); } char *base = *argv; char* outnames_bin = append(base,BIN); char* outnames_idx = append(base,IDX); fprintf(stderr,"\tAssuming binfile:%s and indexfile:%s\n",outnames_bin,outnames_idx); myMap mm = getMap(outnames_idx); writemap(stderr,mm); BGZF *fp = bgzf_open(outnames_bin,"r"); --argc;++argv; // fprintf(stderr,"argc=%d\n",argc); int argP =0; char *chr=NULL; char *outnames = NULL; int nChr =0; int win =0; int step =0; int type =0; while(argP<argc){ // fprintf(stderr,"args=%s\n",argv[argP]); if(argP==argc){ fprintf(stderr,"incomplete arguments list\n"); exit(0); } if(strcmp("-r",argv[argP])==0) chr = argv[argP+1]; else if(strcmp("-outnames",argv[argP])==0) outnames = argv[argP+1]; else if(strcmp("-step",argv[argP])==0) step = atoi(argv[argP+1]); else if(strcmp("-win",argv[argP])==0) win = atoi(argv[argP+1]); else if(strcmp("-nChr",argv[argP])==0) nChr = atoi(argv[argP+1]); else if(strcmp("-type",argv[argP])==0) type = atoi(argv[argP+1]); else { fprintf(stderr,"Unknown argument:%s\n",argv[argP]); exit(0); } argP +=2; } fprintf(stderr,"\t -r=%s outnames=%s step: %d win: %d nChr:%d\n",chr,outnames,step,win,nChr); if(nChr==0){ fprintf(stderr,"nChr must be different from zero\n"); exit(0); } if(win==0||step==0){ fprintf(stderr,"\tWinsize equals zero or step size equals zero. 
Will use entire chromosome as window\n"); win=step=0; } if(chr!=NULL){ myMap::iterator it = mm.find(chr); if(it==mm.end()){ fprintf(stderr,"\tProblem finding chr: %s in index\n",chr); exit(0); } datum d = it->second; bgzf_seek(fp,d.fpos,SEEK_SET); } if(outnames==NULL) outnames = base; char *resname = append(outnames,RES); FILE *fpres = fopen(resname,"w"); //fprintf(fpres,"## thetaStat VERSION: %s build:(%s,%s)\n",VERSION,__DATE__,__TIME__); fprintf(fpres,"#(indexStart,indexStop)(firstPos_withData,lastPos_withData)(WinStart,WinStop)\t"); fprintf(fpres,"Chr\tWinCenter\t"); fprintf(fpres,"tW\ttP\ttF\ttH\ttL\t"); fprintf(fpres,"Tajima\tfuf\tfud\tfayh\tzeng\tnSites\n"); while(1){ perChr pc = getPerChr(fp); if(pc.nSites==0) break; fprintf(stderr,"\tpc.chr=%s pc.nSites=%zu firstpos=%d lastpos=%d\n",pc.chr,pc.nSites,pc.posi[0],pc.posi[pc.nSites-1]); kstring_t str = do_stat_main(pc,step,win,nChr,type); fwrite(str.s,1,str.l,fpres);//should clean up str, doesn't matter for this program; fflush(fpres); if(chr!=NULL) break; dalloc(pc); } fclose(fpres); fprintf(stderr,"\tDumping file: \"%s\"\n",resname); return 0; }
ifq_codes_t ifq_create_index(char *fastq_path, char *index_prefix) { char *hash_path = concatenate( index_prefix, ".hsh" ); char *seek_path = concatenate( index_prefix, ".lup" ); ifq_codes_t ret = IFQ_OK; /* Open output files */ BGZF *fastq_file = bgzf_open( fastq_path, "r" ); if( fastq_file == NULL ) { ret = IFQ_BAD_FASTQ; goto index_fastq_fail; } FILE *hash_file = fopen( hash_path, "w" ); if( hash_file == NULL ) { ret = IFQ_BAD_PREFIX; goto index_prefix_fail; } /* Create hash function */ cmph_io_adapter_t *source = cmph_io_fastq_adapter( fastq_file ); if( source == NULL ) { ret = IFQ_BAD_HASH; goto index_prefix_fail; } cmph_config_t *config = cmph_config_new( source ); cmph_config_set_algo( config, CMPH_CHD ); cmph_config_set_mphf_fd( config, hash_file ); cmph_t *hash = cmph_new( config ); if( hash == NULL ) { ret = IFQ_BAD_HASH; goto index_hash_fail; } /* Create the file index using the hash */ bgzf_seek( fastq_file, 0, SEEK_SET ); if( create_index( fastq_file, hash, seek_path ) != 1 ) { ret = IFQ_BAD_INDEX; goto index_create_fail; } index_fastq_fail: free( hash_path ); free( seek_path ); index_create_fail: cmph_config_destroy( config ); cmph_dump( hash, hash_file ); cmph_destroy( hash ); free( source ); index_hash_fail: fclose( hash_file ); index_prefix_fail: bgzf_close( fastq_file ); return ret; }
void make_bed(int argc,char **argv){ // fprintf(stderr,"[%s] \n",__FUNCTION__); if(argc==0){ fprintf(stderr,"make_bed FILE.theta.gz [OUTNAMES] (if OUTNAMES is supplied, this will be used as prefix \n"); exit(0); } if(!fexists(argv[0])){ fprintf(stderr,"Problem opening file: %s\n",argv[0]); exit(0); } char *base = argv[0]; if(argc==2) base = argv[1]; char* outnames_bin = append(base,BIN); char* outnames_idx = append(base,IDX); const char *delims = "\t \n"; gzFile gfp = gzopen(argv[0],"r"); char *buf = new char[LENS]; BGZF *cfpD = bgzf_open(outnames_bin,"w9"); FILE *fp =fopen(outnames_idx,"w"); std::vector<the_t> vec; char *lastChr = NULL; while(gzgets(gfp,buf,LENS)){ char *chr = strtok(buf,delims); if(chr[0]=='#') continue; int posi=atoi(strtok(NULL,delims)) ; if(lastChr==NULL){ lastChr = strdup(chr); }else if(strcmp(lastChr,chr)!=0){ int64_t id=writeAll(vec,lastChr,cfpD);//write data write_index(vec.size(),lastChr,fp,id);//write index; vec.clear(); free(lastChr); lastChr=strdup(chr); } the_t t; t.posi =posi; float *the =new float[5]; for(int i=0;i<5;i++) the[i] = atof(strtok(NULL,delims)) ; t.vals = the; vec.push_back(t); #if 0 fprintf(stderr,"%s %d ",chr,posi); for(int i=0;i<5;i++) fprintf(stderr," %f",the[i]); fprintf(stderr,"\n"); #endif } int64_t id=writeAll(vec,lastChr,cfpD);//write data write_index(vec.size(),lastChr,fp,id);//write index; vec.clear(); free(lastChr); fprintf(stderr,"\tHas dumped files:\n\t\t'%s\'\n\t\t\'%s\'\n",outnames_bin,outnames_idx); bgzf_close(cfpD); fclose(fp); gzclose(gfp); delete [] buf; delete [] outnames_bin; delete [] outnames_idx; }
int main(int argc, char *argv[]) { if (argc <= 1) { fprintf(stderr, "Usage: thrash_threads4 input.bam\n"); exit(1); } // Find a valid seek location ~64M into the file int i; ssize_t got; BGZF *fpin = bgzf_open(argv[1], "r"); uint64_t upos = 0, uend = 0; char buf[100000]; for (i = 0; i < 100; i++) { if ((got = bgzf_read(fpin, buf, 65536)) < 0) abort(); upos += got; } int64_t pos = bgzf_tell(fpin); while ((got = bgzf_read(fpin, buf, 65536)) > 0) { uend += got; } if (got < 0) abort(); int64_t end = bgzf_tell(fpin); bgzf_close(fpin); // Ensure input is big enough to avoid case 3,4 below going off the end // of the file if (uend < upos + 10000000) { fprintf(stderr, "Please supply a bigger input file\n"); exit(1); } #define N 1000 // Spam random seeks & reads for (i = 0; i < 1000; i++) { printf("i=%d\t", i); fpin = bgzf_open(argv[1], "r"); int j, eof = 0, mt = 0; for (j = 0; j < 80; j++) { int n = rand() % 7; putchar('0'+n); fflush(stdout); switch (n) { case 0: // start if (bgzf_seek(fpin, 0LL, SEEK_SET) < 0) puts("!");//abort(); eof = 0; break; case 1: // mid if (bgzf_seek(fpin, pos, SEEK_SET) < 0) puts("!");//abort(); eof = 0; break; case 2: // eof if (bgzf_seek(fpin, end, SEEK_SET) < 0) puts("!");//abort(); eof = 1; break; case 3: case 4: { int l = rand()%(n==3?100000:100); if (bgzf_read(fpin, buf, l) != l*(1-eof)) abort(); break; } case 5: usleep(N); break; case 6: if (!mt) bgzf_mt(fpin, 8, 256); mt = 1; break; } } printf("\n"); if (bgzf_close(fpin)) abort(); } return 0; }
void bcf_file::print_bcf(const parameters ¶ms) { LOG.printLOG("Outputting BCF file...\n"); BGZF * out; if(!params.stream_out) { string output_file = params.output_prefix + ".recode.bcf"; out = bgzf_open(output_file.c_str(), "w"); } else out = bgzf_dopen(1, "w"); string header_str; uint32_t len_text = 0; vector<char> header; char magic[5] = {'B','C','F','\2','\2'}; bgzf_write(out, magic, 5); for (unsigned int ui=0; ui<meta_data.lines.size(); ui++) { for (unsigned int uj=0; uj<meta_data.lines[ui].length(); uj++) header.push_back( meta_data.lines[ui][uj] ); header.push_back('\n'); } header_str = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"; if (meta_data.N_indv > 0) header_str += "\tFORMAT"; for (unsigned int ui=0; ui<meta_data.N_indv; ui++) if (include_indv[ui]) { header_str += "\t"; header_str += meta_data.indv[ui]; } header_str += "\n"; for (unsigned int ui=0; ui<header_str.length(); ui++) header.push_back( header_str[ui] ); header.push_back( '\0' ); len_text = header.size(); bgzf_write(out, (char *)&len_text, sizeof(len_text) ); bgzf_write(out, (char *)&header[0], len_text ); vector<char> variant_line; entry * e = new bcf_entry(meta_data, include_indv); while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true, true, true); e->parse_full_entry(true); e->parse_genotype_entries(true); e->print_bcf(out, params.recode_INFO_to_keep, params.recode_all_INFO); } delete e; bgzf_close(out); }
void vcf_file::print_bcf(const parameters ¶ms) { LOG.printLOG("Outputting BCF file...\n"); BGZF * out; if(!params.stream_out) { string output_file = params.output_prefix + ".recode.bcf"; out = bgzf_open(output_file.c_str(), "w"); } else out = bgzf_dopen(1, "w"); string header_str; uint32_t len_text = 0; vector<char> header; char magic[5] = {'B','C','F','\2', '\1'}; bgzf_write(out, magic, 5); if (meta_data.has_idx) { LOG.warning("VCF file contains IDX values in header. These are being removed for conversion to BCF."); meta_data.reprint(); meta_data.reparse(); } for (unsigned int ui=0; ui<meta_data.lines.size(); ui++) { for (unsigned int uj=0; uj<meta_data.lines[ui].length(); uj++) header.push_back( meta_data.lines[ui][uj] ); header.push_back('\n'); } if (meta_data.has_contigs == false) { vector<string> contig_vector; get_contigs(params.contigs_file, contig_vector); for(unsigned int ui=0; ui<contig_vector.size(); ui++) { meta_data.add_CONTIG_descriptor(contig_vector[ui].substr(10, contig_vector[ui].size()-8),int(ui)); for(unsigned int uj=0; uj<contig_vector[ui].size(); uj++) header.push_back(contig_vector[ui][uj]); header.push_back('\n'); } } header_str = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"; if (meta_data.N_indv > 0) header_str += "\tFORMAT"; for (unsigned int ui=0; ui<meta_data.N_indv; ui++) if (include_indv[ui]) { header_str += "\t"; header_str += meta_data.indv[ui]; } header_str += "\n"; for (unsigned int ui=0; ui<header_str.length(); ui++) header.push_back( header_str[ui] ); header.push_back( '\0' ); len_text = header.size(); bgzf_write(out, (char *)&len_text, sizeof(len_text) ); bgzf_write(out, (char *)&header[0], len_text ); vector<char> variant_line; entry * e = new vcf_entry(meta_data, include_indv); while(!eof()) { get_entry(variant_line); e->reset(variant_line); N_entries += e->apply_filters(params); if(!e->passed_filters) continue; N_kept_entries++; e->parse_basic_entry(true, true, true); e->parse_full_entry(true); 
e->parse_genotype_entries(true,true,true,true); e->print_bcf(out, params.recode_INFO_to_keep, params.recode_all_INFO); } delete e; bgzf_close(out); }
perpsmc * perpsmc_init(char *fname){ perpsmc *ret = new perpsmc ; ret->fname = strdup(fname); ret->gls =NULL; ret->pos = NULL; ret->bgzf_pos=ret->bgzf_gls=NULL; ret->pos = NULL; size_t clen; if(!fexists(fname)){ fprintf(stderr,"\t-> Problem opening file: \'%s\'\n",fname); exit(0); } FILE *fp = NULL; fp=fopen(fname,"r"); if(fp==NULL){ fprintf(stderr,"\t-> Problem opening file:%s\n",fname); exit(0); } char buf[8]; assert(fread(buf,1,8,fp)==8); ret->version = psmcversion(fname); fprintf(stderr,"\t-> Version of fname: \'%s\' is:%d\n",fname,ret->version); if(ret->version!=1){ fprintf(stderr,"\t-> Looks like you are trying to use a version of PSMC that does not exists\n"); exit(0); } ret->nSites =0; while(fread(&clen,sizeof(size_t),1,fp)){ char *chr = (char*)malloc(clen+1); assert(clen==fread(chr,1,clen,fp)); chr[clen] = '\0'; datum d; if(1!=fread(&d.nSites,sizeof(size_t),1,fp)){ fprintf(stderr,"[%s.%s():%d] Problem reading data: %s \n",__FILE__,__FUNCTION__,__LINE__,fname); exit(0); } ret->nSites += d.nSites; if(1!=fread(&d.pos,sizeof(int64_t),1,fp)){ fprintf(stderr,"[%s->%s():%d] Problem reading data: %s \n",__FILE__,__FUNCTION__,__LINE__,fname); exit(0); } if(1!=fread(&d.saf,sizeof(int64_t),1,fp)){ fprintf(stderr,"[%s->%s():%d] Problem reading data: %s \n",__FILE__,__FUNCTION__,__LINE__,fname); exit(0); } myMap::iterator it = ret->mm.find(chr); if(it==ret->mm.end()) ret->mm[chr] =d ; else{ fprintf(stderr,"Problem with chr: %s, key already exists, psmc file needs to be sorted. 
(sort your -rf that you used for input)\n",chr); exit(0); } } fclose(fp); char *tmp =(char*)calloc(strlen(fname)+100,1);//that should do it tmp=strncpy(tmp,fname,strlen(fname)-3); // fprintf(stderr,"tmp:%s\n",tmp); char *tmp2 = (char*)calloc(strlen(fname)+100,1);//that should do it snprintf(tmp2,strlen(fname)+100,"%sgz",tmp); fprintf(stderr,"\t-> Assuming .psmc.gz file: %s\n",tmp2); ret->bgzf_gls = bgzf_open(tmp2,"r"); if(ret->bgzf_gls) my_bgzf_seek(ret->bgzf_gls,8,SEEK_SET); if(ret->bgzf_gls && ret->version!=psmcversion(tmp2)){ fprintf(stderr,"\t-> Problem with mismatch of version of %s vs %s %d vs %d\n",fname,tmp2,ret->version,psmcversion(tmp2)); exit(0); } snprintf(tmp2,strlen(fname)+100,"%spos.gz",tmp); fprintf(stderr,"\t-> Assuming .psmc.pos.gz: %s\n",tmp2); ret->bgzf_pos = bgzf_open(tmp2,"r"); if(ret->pos) my_bgzf_seek(ret->bgzf_pos,8,SEEK_SET); if(ret->bgzf_pos&& ret->version!=psmcversion(tmp2)){ fprintf(stderr,"Problem with mismatch of version of %s vs %s\n",fname,tmp2); exit(0); } //assert(ret->pos!=NULL&&ret->saf!=NULL); free(tmp);free(tmp2); return ret; }
int main_tabix(int argc, char *argv[]) { int c, min_shift = -1, is_force = 0, is_all = 0; tbx_conf_t conf = tbx_conf_gff, *conf_ptr = NULL; while ((c = getopt(argc, argv, "0fap:s:b:e:S:c:m:")) >= 0) if (c == '0') conf.preset |= TBX_UCSC; else if (c == 'f') is_force = 1; else if (c == 'a') is_all = 1; else if (c == 'm') min_shift = atoi(optarg); else if (c == 's') conf.sc = atoi(optarg); else if (c == 'b') conf.bc = atoi(optarg); else if (c == 'e') conf.ec = atoi(optarg); else if (c == 'c') conf.meta_char = *optarg; else if (c == 'S') conf.line_skip = atoi(optarg); else if (c == 'p') { if (strcmp(optarg, "gff") == 0) conf_ptr = &tbx_conf_gff; else if (strcmp(optarg, "bed") == 0) conf_ptr = &tbx_conf_bed; else if (strcmp(optarg, "sam") == 0) conf_ptr = &tbx_conf_sam; else if (strcmp(optarg, "vcf") == 0) conf_ptr = &tbx_conf_vcf; else { fprintf(stderr, "The type '%s' not recognised\n", optarg); return 1; } } if (optind == argc) { fprintf(stderr, "\nUsage: bcftools tabix [options] <in.gz> [reg1 [...]]\n\n"); fprintf(stderr, "Options: -p STR preset: gff, bed, sam or vcf [gff]\n"); fprintf(stderr, " -s INT column number for sequence names (suppressed by -p) [1]\n"); fprintf(stderr, " -b INT column number for region start [4]\n"); fprintf(stderr, " -e INT column number for region end (if no end, set INT to -b) [5]\n"); fprintf(stderr, " -0 specify coordinates are zero-based\n"); fprintf(stderr, " -S INT skip first INT lines [0]\n"); fprintf(stderr, " -c CHAR skip lines starting with CHAR [null]\n"); fprintf(stderr, " -a print all records\n"); fprintf(stderr, " -f force to overwrite existing index\n"); fprintf(stderr, " -m INT set the minimal interval size to 1<<INT; 0 for the old tabix index [0]\n"); fprintf(stderr, "\n"); return 1; } if (is_all) { // read without random access kstring_t s; BGZF *fp; s.l = s.m = 0; s.s = 0; fp = bgzf_open(argv[optind], "r"); while (bgzf_getline(fp, '\n', &s) >= 0) puts(s.s); bgzf_close(fp); free(s.s); } else if (optind + 2 > argc) { // 
create index if ( !conf_ptr ) { // auto-detect file type by file name int l = strlen(argv[optind]); int strcasecmp(const char *s1, const char *s2); if (l>=7 && strcasecmp(argv[optind]+l-7, ".gff.gz") == 0) conf_ptr = &tbx_conf_gff; else if (l>=7 && strcasecmp(argv[optind]+l-7, ".bed.gz") == 0) conf_ptr = &tbx_conf_bed; else if (l>=7 && strcasecmp(argv[optind]+l-7, ".sam.gz") == 0) conf_ptr = &tbx_conf_sam; else if (l>=7 && strcasecmp(argv[optind]+l-7, ".vcf.gz") == 0) conf_ptr = &tbx_conf_vcf; } if ( conf_ptr ) conf = *conf_ptr; if (!is_force) { char *fn; FILE *fp; fn = (char*)alloca(strlen(argv[optind]) + 5); strcat(strcpy(fn, argv[optind]), min_shift <= 0? ".tbi" : ".csi"); if ((fp = fopen(fn, "rb")) != 0) { fclose(fp); fprintf(stderr, "[E::%s] the index file exists; use option '-f' to overwrite\n", __func__); return 1; } } if ( tbx_index_build(argv[optind], min_shift, &conf) ) { fprintf(stderr,"tbx_index_build failed: Is the file bgzip-compressed? Was wrong -p [type] option used?\n"); return 1; } } else { // read with random access tbx_t *tbx; BGZF *fp; kstring_t s; int i; if ((tbx = tbx_index_load(argv[optind])) == 0) return 1; if ((fp = bgzf_open(argv[optind], "r")) == 0) return 1; s.s = 0; s.l = s.m = 0; for (i = optind + 1; i < argc; ++i) { hts_itr_t *itr; if ((itr = tbx_itr_querys(tbx, argv[i])) == 0) continue; while (tbx_bgzf_itr_next(fp, tbx, itr, &s) >= 0) puts(s.s); tbx_itr_destroy(itr); } free(s.s); bgzf_close(fp); tbx_destroy(tbx); } return 0; }
/*
 * Concatenate BGZF-compressed BCF files without recompressing them.
 *
 * For each input: the first BGZF block is decompressed to read the BCF
 * magic and header.  The header is written for the first file only; any
 * record data sharing that first block is re-written through bgzf_write().
 * The rest of the file is copied as raw compressed bytes.  The last 28
 * bytes are held back in `cached` so that a trailing BGZF EOF block can be
 * dropped rather than copied into the middle of the output.
 *
 * Any failure is reported through error().
 */
static void naive_concat(args_t *args)
{
    // only compressed BCF atm
    BGZF *bgzf_out = bgzf_open(args->output_fname,"w");
    if ( !bgzf_out ) error("Failed to open %s: %s\n", args->output_fname, strerror(errno));

    const size_t page_size = 32768;
    char *buf = (char*) malloc(page_size);
    if ( !buf ) error("Failed to allocate %d bytes\n", (int)page_size);
    kstring_t tmp = {0,0,0};
    int i;
    for (i=0; i<args->nfnames; i++)
    {
        htsFile *hts_fp = hts_open(args->fnames[i],"r");
        if ( !hts_fp ) error("Failed to open: %s\n", args->fnames[i]);
        htsFormat type = *hts_get_format(hts_fp);

        if ( type.format==vcf ) error("The --naive option currently works only for compressed BCFs, sorry :-/\n");
        if ( type.compression!=bgzf ) error("The --naive option currently works only for compressed BCFs, sorry :-/\n");

        BGZF *fp = hts_get_bgzfp(hts_fp);
        if ( !fp || bgzf_read_block(fp) != 0 || !fp->block_length )
            error("Failed to read %s: %s\n", args->fnames[i], strerror(errno));

        uint8_t magic[5];
        if ( bgzf_read(fp, magic, 5) != 5 ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
        if (strncmp((char*)magic, "BCF\2\2", 5) != 0) error("Invalid BCF magic string in %s\n", args->fnames[i]);

        // The 4-byte header length is read straight into tmp.l.
        if ( bgzf_read(fp, &tmp.l, 4) != 4 ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
        hts_expand(char,tmp.l,tmp.m,tmp.s);
        if ( bgzf_read(fp, tmp.s, tmp.l) != tmp.l ) error("Failed to read the BCF header in %s\n", args->fnames[i]);

        // write only the first header
        if ( i==0 )
        {
            if ( bgzf_write(bgzf_out, "BCF\2\2", 5) !=5 ) error("Failed to write %d bytes to %s\n", 5,args->output_fname);
            if ( bgzf_write(bgzf_out, &tmp.l, 4) !=4 ) error("Failed to write %d bytes to %s\n", 4,args->output_fname);
            // tmp.l is a size_t; cast it to match the %d conversion.
            if ( bgzf_write(bgzf_out, tmp.s, tmp.l) != tmp.l) error("Failed to write %d bytes to %s\n", (int)tmp.l,args->output_fname);
        }

        // Output all non-header data that were read together with the header block
        int nskip = fp->block_offset;
        if ( fp->block_length - nskip > 0 )
        {
            if ( bgzf_write(bgzf_out, fp->uncompressed_block+nskip, fp->block_length-nskip)<0 ) error("Error: %d\n",fp->errcode);
        }
        if ( bgzf_flush(bgzf_out)<0 ) error("Error: %d\n",bgzf_out->errcode);

        // Stream the rest of the file as it is, without recompressing, but remove BGZF EOF blocks
        ssize_t nread, ncached = 0, nwr;
        const int neof = 28;
        char cached[neof];
        while (1)
        {
            nread = bgzf_raw_read(fp, buf, page_size);

            // page_size boundary may occur in the middle of the EOF block, so we need to cache the blocks' ends
            if ( nread<=0 ) break;
            if ( nread<=neof ) // last block
            {
                if ( ncached )
                {
                    // flush the part of the cache that won't be needed
                    nwr = bgzf_raw_write(bgzf_out, cached, nread);
                    if (nwr != nread) error("Write failed, wrote %d instead of %d bytes.\n", (int)nwr,(int)nread);

                    // make space in the cache so that we can append to the end
                    if ( nread!=neof ) memmove(cached,cached+nread,neof-nread);
                }

                // fill the cache and check for eof outside this loop
                memcpy(cached+neof-nread,buf,nread);
                break;
            }

            // not the last block, flush the cache if full
            if ( ncached )
            {
                nwr = bgzf_raw_write(bgzf_out, cached, ncached);
                if (nwr != ncached) error("Write failed, wrote %d instead of %d bytes.\n", (int)nwr,(int)ncached);
                ncached = 0;
            }

            // fill the cache
            nread -= neof;
            memcpy(cached,buf+nread,neof);
            ncached = neof;

            nwr = bgzf_raw_write(bgzf_out, buf, nread);
            if (nwr != nread) error("Write failed, wrote %d instead of %d bytes.\n", (int)nwr,(int)nread);
        }
        // Copy the held-back tail unless it is exactly the BGZF EOF block.
        if ( ncached && memcmp(cached,"\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0",neof) )
        {
            nwr = bgzf_raw_write(bgzf_out, cached, neof);
            if (nwr != neof) error("Write failed, wrote %d instead of %d bytes.\n", (int)nwr,(int)neof);
        }
        if (hts_close(hts_fp)) error("Close failed: %s\n",args->fnames[i]);
    }
    free(buf);
    free(tmp.s);
    // The handle must not be touched once bgzf_close() has been called on
    // it, so the failure is reported by file name instead of errcode.
    if (bgzf_close(bgzf_out) < 0) error("Error: failed to close %s\n", args->output_fname);
}
void signalFromBAM(const string bamFileName, const string sigFileName, Parameters P) { bam1_t *bamA; bamA=bam_init1(); double nMult=0, nUniq=0; if (P.outWigFlags.norm==1) {//count reads in the BAM file BGZF *bamIn=bgzf_open(bamFileName.c_str(),"r"); bam_hdr_t *bamHeader=bam_hdr_read(bamIn); while ( true ) {//until the end of file int bamBytes1=bam_read1(bamIn, bamA); if (bamBytes1<0) break; //end of file if (bamA->core.tid<0) continue; //unmapped read // if ( !std::regex_match(chrName.at(bamA->core.tid),std::regex(P.outWigReferencesPrefix))) continue; //reference does not mathc required references if ( P.outWigReferencesPrefix!="-" && (P.outWigReferencesPrefix.compare(0,P.outWigReferencesPrefix.size(),bamHeader->target_name[bamA->core.tid],P.outWigReferencesPrefix.size())!=0) ) continue; //reference does not match required references uint8_t* aNHp=bam_aux_get(bamA,"NH"); if (aNHp!=NULL) { uint32_t aNH=bam_aux2i(aNHp); if (aNH==1) {//unique mappers ++nUniq; } else if (aNH>1) { nMult+=1.0/aNH; }; }; }; bgzf_close(bamIn); }; BGZF *bamIn=bgzf_open(bamFileName.c_str(),"r"); bam_hdr_t *bamHeader=bam_hdr_read(bamIn); int sigN=P.outWigFlags.strand ? 4 : 2; double *normFactor=new double[sigN]; ofstream **sigOutAll=new ofstream* [sigN]; string* sigOutFileName=new string[sigN]; sigOutFileName[0]=sigFileName+".Unique.str1.out"; sigOutFileName[1]=sigFileName+".UniqueMultiple.str1.out"; if (P.outWigFlags.strand) { sigOutFileName[2]=sigFileName+".Unique.str2.out"; sigOutFileName[3]=sigFileName+".UniqueMultiple.str2.out"; }; for (int ii=0; ii<sigN; ii++) { sigOutFileName[ii]+= (P.outWigFlags.format==0 ? 
".bg" : ".wig"); sigOutAll[ii]=new ofstream ( sigOutFileName[ii].c_str() ); }; if (P.outWigFlags.norm==0) {//raw counts normFactor[0]=1; normFactor[1]=1; } else if (P.outWigFlags.norm==1) {//normlaized normFactor[0]=1.0e6 / nUniq; normFactor[1]=1.0e6 / (nUniq+nMult); for (int is=0;is<sigN;is++) {//formatting double output *sigOutAll[is]<<setiosflags(ios::fixed) << setprecision(5); }; }; if (P.outWigFlags.strand) { normFactor[2]=normFactor[0]; normFactor[3]=normFactor[1]; }; int iChr=-999; double *sigAll=NULL; uint32_t chrLen=0; while ( true ) {//until the end of file int bamBytes1=bam_read1(bamIn, bamA); if (bamA->core.tid!=iChr || bamBytes1<0) { //output to file if (iChr!=-999) {//iChr=-999 marks chromosomes that are not output, including unmapped reads for (int is=0;is<sigN;is++) { if (P.outWigFlags.format==1) { *sigOutAll[is] <<"variableStep chrom="<<bamHeader->target_name[iChr] <<"\n"; }; double prevSig=0; for (uint32_t ig=0;ig<chrLen;ig++) { double newSig=sigAll[sigN*ig+is]; if (P.outWigFlags.format==0) {//bedGraph if (newSig!=prevSig) { if (prevSig!=0) {//finish previous record *sigOutAll[is] <<ig<<"\t"<<prevSig*normFactor[is] <<"\n"; //1-based end }; if (newSig!=0) { *sigOutAll[is] << bamHeader->target_name[iChr] <<"\t"<< ig <<"\t"; //0-based beginning }; prevSig=newSig; }; } else if (P.outWigFlags.format==1){//wiggle if (newSig!=0) { *sigOutAll[is] <<ig+1<<"\t"<<newSig*normFactor[is] <<"\n"; }; }; }; }; }; if (bamBytes1<0) {//no more reads break; }; iChr=bamA->core.tid; if ( iChr==-1 || (P.outWigReferencesPrefix!="-" && (P.outWigReferencesPrefix.compare(0,P.outWigReferencesPrefix.size(),bamHeader->target_name[bamA->core.tid],P.outWigReferencesPrefix.size())!=0) ) ) { iChr=-999; continue; //reference does not match required references }; chrLen=bamHeader->target_len[iChr]+1;//one extra base at the end which sohuld always be 0 delete [] sigAll; sigAll= new double[sigN*chrLen]; memset(sigAll, 0, sizeof(*sigAll)*sigN*chrLen); }; // uint32_t nCigar 
=(bamA->core.flag<<16)>>16; // uint32_t mapFlag=bamA->core.flag>>16; // uint32_t mapQ=(bamA->core.flag<<16)>>24; #define BAM_CIGAR_OperationShift 4 #define BAM_CIGAR_LengthBits 28 #define BAM_CIGAR_M 0 #define BAM_CIGAR_I 1 #define BAM_CIGAR_D 2 #define BAM_CIGAR_N 3 #define BAM_CIGAR_S 4 #define BAM_CIGAR_H 5 #define BAM_CIGAR_P 6 #define BAM_CIGAR_EQ 7 #define BAM_CIGAR_X 8 //by default, alignments marked as duplicate are not processed if ( (bamA->core.flag & 0x400) > 0 ) continue; //NH attribute uint8_t* aNHp=bam_aux_get(bamA,"NH"); uint32_t aNH; if (aNHp==NULL) { aNH=1; //no NH tag: assume NH=1 //continue; //do not process lines without NH field } else { aNH=bam_aux2i(bam_aux_get(bamA,"NH")); //write a safer function allowing for lacking NH tag }; if (aNH==0) continue; //do not process lines without NH=0 uint32_t aG=bamA->core.pos; uint32_t iStrand=0; if (P.outWigFlags.strand) {//strand for stranded data from SAM flag iStrand= ( (bamA->core.flag & 0x10) > 0 ) == ( (bamA->core.flag & 0x80) == 0 );//0/1 for +/- }; if (P.outWigFlags.type==1) {//5' of the1st read signal only, RAMPAGE/CAGE if ( (bamA->core.flag & 0x80)>0) continue; //skip if this the second mate if (iStrand==0) { if (aNH==1) {//unique mappers sigAll[aG*sigN+0+2*iStrand]++; }; sigAll[aG*sigN+1+2*iStrand]+=1.0/aNH;//U+M, normalized by the number of multi-mapping loci continue; //record only the first position }; }; uint32_t* cigar=(uint32_t*) (bamA->data+bamA->core.l_qname); for (uint32_t ic=0; ic<bamA->core.n_cigar; ic++) { uint32_t cigOp=(cigar[ic]<<BAM_CIGAR_LengthBits)>>BAM_CIGAR_LengthBits; uint32_t cigL=cigar[ic]>>BAM_CIGAR_OperationShift; switch (cigOp) { case(BAM_CIGAR_D): case(BAM_CIGAR_N): aG+=cigL; break; case(BAM_CIGAR_M): if (P.outWigFlags.type==0 || (P.outWigFlags.type==2 && (bamA->core.flag & 0x80)>0 )) {//full signal, or second mate onyl signal for (uint32_t ig=0;ig<cigL;ig++) { if (aG>=chrLen) { cerr << "BUG: alignment extends past chromosome in signalFromBAM.cpp\n"; exit(-1); }; if 
(aNH==1) {//unique mappers sigAll[aG*sigN+0+2*iStrand]++; }; sigAll[aG*sigN+1+2*iStrand]+=1.0/aNH;//U+M, normalized by the number of multi-mapping loci aG++; }; } else { aG+=cigL; }; }; }; if (P.outWigFlags.type==1) {//full signal --aG; if (aNH==1) {//unique mappers sigAll[aG*sigN+0+2*iStrand]++; }; sigAll[aG*sigN+1+2*iStrand]+=1.0/aNH;//U+M, normalized by the number of multi-mapping loci }; }; delete [] sigAll; for (int is=0; is<sigN; is++) {// flush/close all signal files sigOutAll[is]->flush(); sigOutAll[is]->close(); }; };
int bam_cat(int nfn, char * const *fn, const bam_header_t *h, const char* outbam) { BGZF *fp; FILE* fp_file; uint8_t *buf; uint8_t ebuf[BGZF_EMPTY_BLOCK_SIZE]; const int es=BGZF_EMPTY_BLOCK_SIZE; int i; fp = strcmp(outbam, "-")? bgzf_open(outbam, "w") : bgzf_fdopen(_fileno(stdout), "w"); if (fp == 0) { fprintf(stderr, "[%s] ERROR: fail to open output file '%s'.\n", __FUNCTION__, outbam); return 1; } if (h) bam_header_write(fp, h); buf = (uint8_t*) malloc(BUF_SIZE); for(i = 0; i < nfn; ++i){ BGZF *in; bam_header_t *old; int len,j; in = strcmp(fn[i], "-")? bam_open(fn[i], "r") : bam_dopen(_fileno(stdin), "r"); if (in == 0) { fprintf(stderr, "[%s] ERROR: fail to open file '%s'.\n", __FUNCTION__, fn[i]); return -1; } if (in->open_mode != 'r') return -1; old = bam_header_read(in); if (h == 0 && i == 0) bam_header_write(fp, old); if (in->block_offset < in->block_length) { bgzf_write(fp, (uint8_t*)in->uncompressed_block + in->block_offset, in->block_length - in->block_offset); bgzf_flush(fp); } j=0; #ifdef _USE_KNETFILE fp_file=fp->x.fpw; while ((len = knet_read(in->x.fpr, buf, BUF_SIZE)) > 0) { #else fp_file=fp->file; while (!feof(in->file) && (len = fread(buf, 1, BUF_SIZE, in->file)) > 0) { #endif if(len<es){ int diff=es-len; if(j==0) { fprintf(stderr, "[%s] ERROR: truncated file?: '%s'.\n", __FUNCTION__, fn[i]); return -1; } fwrite(ebuf, 1, len, fp_file); memcpy(ebuf,ebuf+len,diff); memcpy(ebuf+diff,buf,len); } else { if(j!=0) fwrite(ebuf, 1, es, fp_file); len-= es; memcpy(ebuf,buf+len,es); fwrite(buf, 1, len, fp_file); } j=1; } /* check final gzip block */ { const uint8_t gzip1=ebuf[0]; const uint8_t gzip2=ebuf[1]; const uint32_t isize=*((uint32_t*)(ebuf+es-4)); if(((gzip1!=GZIPID1) || (gzip2!=GZIPID2)) || (isize!=0)) { fprintf(stderr, "[%s] WARNING: Unexpected block structure in file '%s'.", __FUNCTION__, fn[i]); fprintf(stderr, " Possible output corruption.\n"); fwrite(ebuf, 1, es, fp_file); } } bam_header_destroy(old); bgzf_close(in); } free(buf); bgzf_close(fp); 
return 0; } int main_cat(int argc, char *argv[]) { bam_header_t *h = 0; char *outfn = 0; int c, ret; while ((c = getopt(argc, argv, "h:o:")) >= 0) { switch (c) { case 'h': { tamFile fph = sam_open(optarg); if (fph == 0) { fprintf(stderr, "[%s] ERROR: fail to read the header from '%s'.\n", __FUNCTION__, argv[1]); return 1; } h = sam_header_read(fph); sam_close(fph); break; } case 'o': outfn = strdup(optarg); break; } } if (argc - optind < 2) { fprintf(stderr, "Usage: samtools cat [-h header.sam] [-o out.bam] <in1.bam> <in2.bam> [...]\n"); return 1; } ret = bam_cat(argc - optind, argv + optind, h, outfn? outfn : "-"); free(outfn); return ret; }
/*
 * Replace the header of a BGZF-compressed text file and write the result to
 * standard output (bgzf_open("-","w")).
 *
 * The old header is every leading line of `fname` that starts with
 * conf->meta_char. The new header is read from the plain-text file `header`;
 * a newline is appended if it does not end with one. Data sharing a BGZF block
 * with the old header is recompressed, the remaining blocks are copied raw.
 *
 * Only text input (ftype has IS_TXT set, or ftype==0) is handled.
 * Returns 0 on success and -1 if `fname` cannot be opened or its first block
 * cannot be read; other failures go through error(), which is presumably fatal
 * (defined elsewhere -- TODO confirm).
 */
int reheader_file(const char *fname, const char *header, int ftype, tbx_conf_t *conf)
{
    if ( ftype & IS_TXT || !ftype )
    {
        BGZF *fp = bgzf_open(fname,"r");
        if ( !fp ) return -1;
        if ( bgzf_read_block(fp) != 0 || !fp->block_length )
        {
            bgzf_close(fp);     // do not leak the handle on the early return
            return -1;
        }

        char *buffer = fp->uncompressed_block;
        int skip_until = 0;

        // Skip the header: find out the position of the data block
        if ( buffer[0]==conf->meta_char )
        {
            skip_until = 1;
            while (1)
            {
                if ( buffer[skip_until]=='\n' )
                {
                    skip_until++;
                    if ( skip_until>=fp->block_length )
                    {
                        if ( bgzf_read_block(fp) != 0 || !fp->block_length )
                            error("FIXME: No body in the file: %s\n", fname);
                        skip_until = 0;
                    }
                    // The header has finished
                    if ( buffer[skip_until]!=conf->meta_char ) break;
                }
                skip_until++;
                if ( skip_until>=fp->block_length )
                {
                    if (bgzf_read_block(fp) != 0 || !fp->block_length)
                        error("FIXME: No body in the file: %s\n", fname);
                    skip_until = 0;
                }
            }
        }

        // Output the new header
        FILE *hdr = fopen(header,"r");
        if ( !hdr ) error("%s: %s", header,strerror(errno));
        const size_t page_size = 32768;
        char *buf = malloc(page_size);
        if ( !buf ) error("Failed to allocate %d bytes\n", (int)page_size);
        BGZF *bgzf_out = bgzf_open("-", "w");
        if ( !bgzf_out ) error("Failed to open the standard output for writing\n");

        // Remember the last byte written so the missing-newline fix-up works
        // for any header size, including exact multiples of the chunk size.
        size_t nhdr;
        char last_char = '\n';
        while ( (nhdr=fread(buf,1,page_size,hdr))>0 )
        {
            last_char = buf[nhdr-1];
            if (bgzf_write(bgzf_out, buf, nhdr) < 0) error("Error: %d\n",bgzf_out->errcode);
        }
        if ( ferror(hdr) ) error("read failed: %s\n", header);
        if ( last_char!='\n' && bgzf_write(bgzf_out, "\n", 1) < 0 ) error("Error: %d\n",bgzf_out->errcode);
        if ( fclose(hdr) ) error("close failed: %s\n", header);

        // Output all remaining data read with the header block
        if ( fp->block_length - skip_until > 0 )
        {
            // the write goes to bgzf_out, so that is the handle holding the error code
            if (bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until) < 0) error("Error: %d\n",bgzf_out->errcode);
        }
        if (bgzf_flush(bgzf_out) < 0) error("Error: %d\n",bgzf_out->errcode);

        // Copy the rest of the file block by block without recompressing
        while (1)
        {
            ssize_t nread = bgzf_raw_read(fp, buf, page_size);
            if ( nread<0 ) error("Failed to read %s\n", fname);
            if ( nread==0 ) break;

            ssize_t count = bgzf_raw_write(bgzf_out, buf, nread);
            if (count != nread) error("Write failed, wrote %d instead of %d bytes.\n", (int)count,(int)nread);
        }

        // bgzf_close() releases the handle: it must not be dereferenced afterwards
        if (bgzf_close(bgzf_out) < 0) error("Error: failed to close the output stream\n");
        if (bgzf_close(fp) < 0) error("Error: failed to close %s\n", fname);
        free(buf);
    }
    else
        error("todo: reheader BCF, BAM\n");  // BCF is difficult, records contain pointers to the header.
    return 0;
}
int reheader_file(const char *header, const char *file, int meta) { BGZF *fp = bgzf_open(file,"r"); if (bgzf_read_block(fp) != 0 || !fp->block_length) return -1; char *buffer = fp->uncompressed_block; int skip_until = 0; if ( buffer[0]==meta ) { skip_until = 1; // Skip the header while (1) { if ( buffer[skip_until]=='\n' ) { skip_until++; if ( skip_until>=fp->block_length ) { if (bgzf_read_block(fp) != 0 || !fp->block_length) error("no body?\n"); skip_until = 0; } // The header has finished if ( buffer[skip_until]!=meta ) break; } skip_until++; if ( skip_until>=fp->block_length ) { if (bgzf_read_block(fp) != 0 || !fp->block_length) error("no body?\n"); skip_until = 0; } } } FILE *fh = fopen(header,"r"); if ( !fh ) error("%s: %s", header,strerror(errno)); int page_size = getpagesize(); char *buf = valloc(page_size); BGZF *bgzf_out = bgzf_fdopen(fileno(stdout), "w"); ssize_t nread; while ( (nread=fread(buf,1,page_size-1,fh))>0 ) { if ( nread<page_size-1 && buf[nread-1]!='\n' ) buf[nread++] = '\n'; if (bgzf_write(bgzf_out, buf, nread) < 0) error("Error: %s\n",bgzf_out->error); } fclose(fh); if ( fp->block_length - skip_until > 0 ) { if (bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until) < 0) error("Error: %s\n",fp->error); } if (bgzf_flush(bgzf_out) < 0) error("Error: %s\n",bgzf_out->error); while (1) { #ifdef _USE_KNETFILE nread = knet_read(fp->x.fpr, buf, page_size); #else nread = fread(buf, 1, page_size, fp->file); #endif if ( nread<=0 ) break; #ifdef _USE_KNETFILE int count = fwrite(buf, 1, nread, bgzf_out->x.fpw); #else int count = fwrite(buf, 1, nread, bgzf_out->file); #endif if (count != nread) error("Write failed, wrote %d instead of %d bytes.\n", count,(int)nread); } if (bgzf_close(bgzf_out) < 0) error("Error: %s\n",bgzf_out->error); return 0; }
/*
 * Load a list of regions from a (possibly BGZF-compressed) text file.
 *
 * Each non-comment line is "<seq> <from> [<to>]" separated by whitespace;
 * lines starting with '#' and empty lines are skipped. When <to> is missing
 * it is set to <from>. Lines of one sequence must be adjacent and sorted by
 * (from, to). On return reg->seq_names, reg->npos and reg->pos hold reg->nseqs
 * entries.
 *
 * Returns 1 on success and 0 on any failure (unreadable or unparsable file,
 * unsorted input, or a file from which nothing was read). On failure `reg`
 * may be partially filled; releasing it is left to the caller.
 */
int init_regions(const char *fname, regions_t *reg)
{
    int bgzf_getline(BGZF *fp, int delim, kstring_t *str);

    int i, j, ret = 0, mseqs = 10, mpos = 0;
    kstring_t str = {0,0,0};
    ssize_t nread;

    BGZF *zfp = bgzf_open(fname, "r");
    if ( !zfp )
    {
        fprintf(stderr,"%s: %s\n",fname,strerror(errno));
        return 0;
    }

    reg->nseqs = 0;
    reg->pos = (pos_t **)calloc(mseqs,sizeof(pos_t*));
    reg->npos = (int*) calloc(mseqs,sizeof(int));
    reg->seq_names = (char **) calloc(mseqs,sizeof(char*));

    // bgzf_getline returns the line length (0 for an empty line) and a negative
    // value at EOF or on error, so an empty line must not end the loop
    while ((nread = bgzf_getline(zfp, '\n', &str)) >= 0)
    {
        char *line = str.s;
        int k = 0;

        if ( nread==0 || line[0] == '#' ) continue;

        // isolate the sequence name
        while ( k<nread && !isspace((unsigned char)line[k]) ) k++;
        if ( k>=nread )
        {
            fprintf(stderr,"Could not parse the file: %s [%s]\n", fname,line);
            goto done;
        }
        line[k] = 0;

        if ( reg->nseqs==0 || strcmp(line,reg->seq_names[reg->nseqs-1]) )
        {
            // New sequence
            reg->nseqs++;
            if ( reg->nseqs >= mseqs )
            {
                mseqs++;
                reg->pos = (pos_t **) realloc(reg->pos,sizeof(pos_t*)*mseqs); reg->pos[mseqs-1] = NULL;
                reg->npos = (int *) realloc(reg->npos,sizeof(int)*mseqs); reg->npos[mseqs-1] = 0;
                reg->seq_names = (char**) realloc(reg->seq_names,sizeof(char*)*mseqs);
            }
            reg->seq_names[reg->nseqs-1] = strdup(line);
            mpos = 0;
        }

        int iseq = reg->nseqs-1;
        if ( reg->npos[iseq] >= mpos )
        {
            mpos += 100;
            reg->pos[iseq] = (pos_t*) realloc(reg->pos[iseq],sizeof(pos_t)*mpos);
        }
        int ipos = reg->npos[iseq];
        pos_t *pos = reg->pos[iseq];
        reg->npos[iseq]++;
        if ( (sscanf(line+k+1,"%d %d",&pos[ipos].from,&pos[ipos].to))!=2 )
        {
            if ( (sscanf(line+k+1,"%d",&pos[ipos].from))!=1 )
            {
                fprintf(stderr,"Could not parse the region [%s]\n",line+k+1);
                goto done;
            }
            pos[ipos].to = pos[ipos].from;
        }

        // Check that the file is sorted
        if ( ipos>0 && (pos[ipos].from < pos[ipos-1].from || (pos[ipos].from==pos[ipos-1].from && pos[ipos].to<pos[ipos-1].to)) )
        {
            fprintf(stderr,"The file is not sorted: %s\n", fname);
            goto done;
        }
    }
    if ( nread < -1 )   // -1 is a regular EOF, anything lower a read error
    {
        fprintf(stderr,"Failed to read the file: %s\n", fname);
        goto done;
    }

    // Check that chromosomes come in blocks
    for (i=0; i<reg->nseqs; i++)
    {
        for (j=0; j<i; j++)
        {
            if ( !strcmp(reg->seq_names[i],reg->seq_names[j]) )
            {
                fprintf(stderr,"The file is not sorted: %s\n", fname);
                goto done;
            }
        }
    }

    // No line buffer was ever allocated: nothing was read, report failure
    if ( !str.m ) goto done;
    ret = 1;

done:
    // single exit point so the line buffer and the file handle are always released
    free(str.s);
    bgzf_close(zfp);
    return ret;
}
int main (int argc, char **argv) { ///////////////////// // Parse Arguments // ///////////////////// params *pars = new params; init_pars(pars); parse_cmd_args(argc, argv, pars); if( pars->version ) { printf("ngsF v%s\nCompiled on %s @ %s", version, __DATE__, __TIME__); #ifdef _USE_BGZF printf(" (BGZF library)\n"); #else printf(" (STD library)\n"); #endif exit(0); } if( pars->verbose >= 1 ) { printf("==> Input Arguments:\n"); printf("\tglf file: %s\n\tinit_values: %s\n\tfreq_fixed: %s\n\tout file: %s\n\tn_ind: %d\n\tn_sites: %lu\n\tchunk_size: %lu\n\tfast_lkl: %s\n\tapprox_EM: %s\n\tcall_geno: %s\n\tmax_iters: %d\n\tmin_epsilon: %.10f\n\tn_threads: %d\n\tseed: %lu\n\tquick: %s\n\tversion: %s\n\tverbose: %d\n\n", pars->in_glf, pars->init_values, pars->freq_fixed ? "true":"false", pars->out_file, pars->n_ind, pars->n_sites, pars->max_chunk_size, pars->fast_lkl ? "true":"false", pars->approx_EM ? "true":"false", pars->call_geno ? "true":"false", pars->max_iters, pars->min_epsilon, pars->n_threads, pars->seed, pars->quick ? "true":"false", version, pars->verbose); } if( pars->verbose > 4 ) printf("==> Verbose values greater than 4 for debugging purpose only. 
Expect large amounts of info on screen\n"); ///////////////////// // Check Arguments // ///////////////////// if(pars->in_glf == NULL) error(__FUNCTION__,"GL input file (-glf) missing!"); else if( strcmp(pars->in_glf, "-") == 0 ) { pars->in_glf_type = new char[6]; pars->in_glf_type = strcat(pars->in_glf_type, "STDIN"); } else { pars->in_glf_type = strrchr(pars->in_glf, '.'); if(pars->in_glf_type == NULL) error(__FUNCTION__,"invalid file type!"); } if(pars->out_file == NULL) error(__FUNCTION__,"output file (-out) missing!"); if(pars->n_ind == 0) error(__FUNCTION__,"number of individuals (-n_ind) missing!"); if(pars->n_sites == 0) error(__FUNCTION__,"number of sites (-n_sites) missing!"); /////////////////////// // Check input files // /////////////////////// // Get file total size struct stat st; stat(pars->in_glf, &st); if( strcmp(pars->in_glf_type, "STDIN") != 0 ) { if( pars->n_sites == st.st_size/sizeof(double)/pars->n_ind/3 && strcmp(pars->in_glf_type, ".glf") == 0 ) { if(pars->verbose >= 1) printf("==> UNCOMP input file (\"%s\"): number of sites (%lu) match expected file size\n", pars->in_glf_type, pars->n_sites); } else if( strcmp(pars->in_glf_type, ".glf") != 0 ) { if( pars->verbose >= 1) printf("==> COMPRESSED input file (\"%s\"): number of sites (%lu) do NOT match expected file size\n", pars->in_glf_type, pars->n_sites); } else error(__FUNCTION__,"wrong number of sites or invalid/corrupt file!"); } // Adjust max_chunk_size in case of fewer sites if(pars->max_chunk_size > pars->n_sites) { if( pars->verbose >= 1 ) printf("==> Fewer sites (%lu) than chunk_size (%lu). 
Reducing chunk size to match number of sites\n", pars->n_sites, pars->max_chunk_size); pars->max_chunk_size = pars->n_sites; } // Calculate total number of chunks pars->n_chunks = ceil( (double) pars->n_sites/ (double) pars->max_chunk_size ); if( pars->verbose >= 1 ) printf("==> Analysis will be run in %ld chunk(s)\n", pars->n_chunks); // Alocate memory for the chunk index pars->chunks_voffset = new int64_t[pars->n_chunks]; memset(pars->chunks_voffset, 0, pars->n_chunks*sizeof(int64_t)); // Adjust thread number to chunks if(pars->n_chunks < pars->n_threads) { if( pars->verbose >= 1 ) printf("==> Fewer chunks (%ld) than threads (%d). Reducing the number of threads to match number of chunks\n", pars->n_chunks, pars->n_threads); pars->n_threads = pars->n_chunks; } // Open input file #ifdef _USE_BGZF if( pars->verbose >= 1 ) printf("==> Using BGZF I/O library\n"); // Open BGZIP file if( strcmp(pars->in_glf_type, ".bgz") == 0 ) { if( (pars->in_glf_fh = bgzf_open(pars->in_glf, "rb")) < 0 ) error(__FUNCTION__,"Cannot open BGZIP file!"); } else error(__FUNCTION__,"BGZF library only supports BGZIP files!"); bgzf_set_cache_size(pars->in_glf_fh, CACHE_SIZE * 1024uL * 1024uL * 1024uL); #else if( pars->verbose >= 1 ) printf("==> Using native I/O library\n"); // Open GLF file if( strcmp(pars->in_glf_type, "STDIN") == 0 ) pars->in_glf_fh = stdin; else if( strcmp(pars->in_glf_type, ".glf") == 0 ) { if( (pars->in_glf_fh = fopen(pars->in_glf, "rb")) == NULL ) error(__FUNCTION__,"Cannot open GLF file!"); } else error(__FUNCTION__,"Standard library only supports UNCOMPRESSED GLF files!"); // Allocate memory and read from the file pars->data = new double* [pars->n_sites]; for(uint64_t s = 0; s < pars->n_sites; s++) { pars->data[s] = new double[pars->n_ind * 3]; if( fread (pars->data[s], sizeof(double), pars->n_ind * 3, pars->in_glf_fh) != pars->n_ind * 3) error(__FUNCTION__,"cannot read GLF file!"); if(pars->call_geno) call_geno(pars->data[s], pars->n_ind, 3); } #endif if( 
pars->in_glf_fh == NULL ) error(__FUNCTION__,"cannot open GLF file!"); /////////////////////////////////// // Declare variables for results // /////////////////////////////////// out_data *output = new out_data; output->site_freq = new double[pars->n_sites]; output->site_freq_num = new double[pars->n_sites]; output->site_freq_den = new double[pars->n_sites]; output->site_prob_var = new double[pars->n_sites]; output->site_tmpprob_var = new double[pars->n_sites]; output->indF = new double[pars->n_ind]; output->indF_num = new double[pars->n_ind]; output->indF_den = new double[pars->n_ind]; output->ind_lkl = new double[pars->n_ind]; // Initialize output init_output(pars, output); ////////////////// // Analyze Data // ////////////////// if( pars->verbose >= 1 && !pars->fast_lkl && strcmp("e", pars->init_values) != 0 ) { printf("==> Initial LogLkl: %.15f\n", full_HWE_like(pars, output->site_freq, output->indF, 0, pars->n_ind)); fflush(stdout); } do_EM(pars, output); if( pars->verbose >= 1 ) printf("\nFinal logLkl: %f\n", output->global_lkl); ////////////////// // Print Output // ////////////////// FILE *out_file; if( pars->verbose >= 1 ) printf("Printing Output...\n"); out_file = fopen(pars->out_file, "w"); if(out_file == NULL) error(__FUNCTION__,"Cannot open OUTPUT file!"); for(uint16_t i = 0; i < pars->n_ind; i++) fprintf(out_file,"%f\n", output->indF[i]); fclose(out_file); ////////////////////// // Close Input File // ////////////////////// if( pars->verbose >= 1 ) printf("Exiting...\n"); #ifdef _USE_BGZF bgzf_close(pars->in_glf_fh); #else for(uint64_t s = 0; s < pars->n_sites; s++) delete [] pars->data[s]; delete [] pars->data; fclose(pars->in_glf_fh); #endif ///////////////// // Free Memory // ///////////////// delete [] output->site_freq; delete [] output->site_freq_num; delete [] output->site_freq_den; delete [] output->site_prob_var; delete [] output->indF; delete [] output->indF_num; delete [] output->indF_den; delete [] output->ind_lkl; delete output; //if( 
strcmp("e", pars->init_values) == 0 ) //delete [] pars->init_values; delete [] pars->chunks_voffset; delete pars; return 0; }
void filterReads(char * inBamFile, char * outBamFile, int minMapQual, int minLen, int maxMisMatches, float minPcId, float minPcAln, int ignoreSuppAlignments, int ignoreSecondaryAlignments) { // int result = -1; int outResult = -1; int supp_check = 0x0; if (ignoreSuppAlignments) { supp_check |= BAM_FSUPPLEMENTARY; } if (ignoreSecondaryAlignments) { supp_check |= BAM_FSECONDARY; } // helper variables BGZF* in = 0; BGZF* out = 0; bam1_t *b = bam_init1(); bam_hdr_t *h; // open bam if ((in = bgzf_open(inBamFile, "r")) == 0) { fprintf(stderr, "ERROR: Failed to open \"%s\" for reading.\n", inBamFile); } else if ((h = bam_hdr_read(in)) == 0) { // read header fprintf(stderr, "ERROR: Failed to read BAM header of file \"%s\".\n", inBamFile); } else if ((out = bgzf_open(outBamFile, "w")) == 0) { fprintf(stderr, "ERROR: Failed to open \"%s\" for writing.\n", outBamFile); } else { // write and destroy header bam_hdr_write(out, h); bam_hdr_destroy(h); int line = 0; int matches, mismatches, qLen; float pcAln, pcId; int showStats = 0; // fetch alignments while ((result = bam_read1(in, b)) >= 0) { line += 1; // only primary mappings if ((b->core.flag & supp_check) != 0) { if (showStats) fprintf(stdout, "Rejected %d, non-primary\n", line); continue; } // only high quality if (b->core.qual < minMapQual) { if (showStats) fprintf(stdout, "Rejected %d, quality: %d\n", line, b->core.qual); continue; } // not too many absolute mismatches mismatches = bam_aux2i(bam_aux_get(b, "NM")); if (mismatches > maxMisMatches) { if (showStats) fprintf(stdout, "Rejected %d, mismatches: %d\n", line, mismatches); continue; } // not too short qLen = bam_cigar2qlen((&b->core)->n_cigar, bam_get_cigar(b)); if (qLen < minLen) { if (showStats) fprintf(stdout, "Rejected %d, length: %d\n", line, qLen); continue; } // only high percent identity matches = bam_cigar2matches((&b->core)->n_cigar, bam_get_cigar(b)); pcId = (matches - mismatches) / (float)matches; // percentage as float between 0 to 1 if (pcId < 
minPcId) { if (showStats) fprintf(stdout, "Rejected %d, identity pc: %.4f\n", line, pcId); continue; } // only high percent alignment pcAln = matches / (float)qLen; // percentage as float between 0 to 1 if (pcAln < minPcAln) { if (showStats) fprintf(stdout, "Rejected %d, alignment pc: %.4f\n", line, pcAln); continue; } if ((outResult = bam_write1(out, b)) < -1) { fprintf(stderr, "ERROR: Attempt to write read no. %d to file \"%s\" failed with code %d.\n", line, outBamFile, outResult); } } if (result < -1) { fprintf(stderr, "ERROR: retrieval of read no. %d from file \"%s\" failed with code %d.\n", line, inBamFile, result); } } if (in) bgzf_close(in); if (out) bgzf_close(out); bam_destroy1(b); }