/*
 * Copy the BAM stream `in` to file descriptor `fd`, replacing its header
 * with `h`.
 *
 * The original header is read from `in` only to skip past it.  Whatever
 * alignment data remains in the already-decompressed block is re-written
 * through the BGZF writer; the rest of the file is copied as raw bytes.
 *
 * Returns 0 on success and -1 when `in` is not open for reading, a buffer or
 * the output handle cannot be created, the old header cannot be read, or a
 * raw write comes up short.
 */
int bam_reheader(BGZF *in, const bam_header_t *h, int fd)
{
	BGZF *fp;
	bam_header_t *old;
	int len;
	int ret = 0;
	uint8_t *buf;

	if (in->open_mode != 'r') return -1;
	buf = malloc(BUF_SIZE);
	if (buf == NULL) return -1;

	/* The old header is only needed to advance the stream; release it. */
	old = bam_header_read(in);
	if (old == NULL) {
		free(buf);
		return -1;
	}
	bam_header_destroy(old);

	fp = bgzf_dopen(fd, "w");
	if (fp == NULL) {
		free(buf);
		return -1;
	}
	bam_header_write(fp, h);

	/* Records that share the header's block were already decompressed. */
	if (in->block_offset < in->block_length) {
		bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset);
		bgzf_flush(fp);
	}

	/* Copy the remaining compressed blocks verbatim. */
#ifdef _USE_KNETFILE
	while ((len = knet_read(in->fp, buf, BUF_SIZE)) > 0) {
#else
	while (!feof(in->fp) && (len = fread(buf, 1, BUF_SIZE, in->fp)) > 0) {
#endif
		if (fwrite(buf, 1, len, fp->fp) != (size_t)len) {
			ret = -1;
			break;
		}
	}

	free(buf);
	fp->block_offset = in->block_offset = 0;
	bgzf_close(fp);
	return ret;
}
void BAMbinSortByCoordinate(uint32 iBin, uint binN, uint binS, uint nThreads, string dirBAMsort, Parameters *P) { if (binS==0) return; //nothing to do for empty bins //allocate arrays char *bamIn=new char[binS]; uint *startPos=new uint[binN*3]; uint bamInBytes=0; //load all aligns for (uint it=0; it<nThreads; it++) { string bamInFile=dirBAMsort+to_string(it)+"/"+to_string((uint) iBin); ifstream bamInStream (bamInFile.c_str()); bamInStream.read(bamIn+bamInBytes,binS);//read the whole file bamInBytes += bamInStream.gcount(); bamInStream.close(); remove(bamInFile.c_str()); }; if (bamInBytes!=binS) { ostringstream errOut; errOut << "EXITING because of FATAL ERROR: number of bytes expected from the BAM bin does not agree with the actual size on disk: "; errOut << binS <<" "<< bamInBytes <<" "<< iBin <<"\n"; exitWithError(errOut.str(),std::cerr, P->inOut->logMain, 1, *P); }; //extract coordinates for (uint ib=0,ia=0;ia<binN;ia++) { uint32 *bamIn32=(uint32*) (bamIn+ib); startPos[ia*3] =( ((uint) bamIn32[1]) << 32) | ( (uint)bamIn32[2] ); startPos[ia*3+2]=ib; ib+=bamIn32[0]+sizeof(uint32);//note that size of the BAM record does not include the size record itself startPos[ia*3+1]=*( (uint*) (bamIn+ib) ); //read order ib+=sizeof(uint); }; //sort qsort((void*) startPos, binN, sizeof(uint)*3, funCompareUint2); BGZF *bgzfBin; bgzfBin=bgzf_open((dirBAMsort+"/b"+to_string((uint) iBin)).c_str(),("w"+to_string((long long) P->outBAMcompression)).c_str()); outBAMwriteHeader(bgzfBin,P->samHeaderSortedCoord,P->chrName,P->chrLength); //send ordered aligns to bgzf one-by-one for (uint ia=0;ia<binN;ia++) { char* ib=bamIn+startPos[ia*3+2]; bgzf_write(bgzfBin,ib, *((uint32*) ib)+sizeof(uint32) ); }; bgzf_flush(bgzfBin); bgzf_close(bgzfBin); //release memory delete [] bamIn; delete [] startPos; };
// Writes a BAM header to fp and flushes it: the "BAM\1" magic, the SAM header
// text samh preceded by its length, the number of references, and for each
// reference its name (with the terminating \0) and its length.
//   chrn - reference names; chrl - reference lengths, indexed like chrn
void outBAMwriteHeader (BGZF* fp, const string &samh, const vector <string> &chrn, const vector <uint> &chrl) {

    //magic string
    bgzf_write(fp,"BAM\001",4);

    //header text, length first
    int32 textLen=samh.size();
    bgzf_write(fp,(char*) &textLen,sizeof(textLen));
    bgzf_write(fp,samh.c_str(),textLen);

    //reference dictionary
    int32 refCount=(int32) chrn.size();
    bgzf_write(fp,(char*) &refCount,sizeof(refCount));

    int32 iref=0;
    while (iref<refCount) {
        const string &refName=chrn.at(iref);
        int32 nameLen = (int32) (refName.size()+1); //+1 for the \0 written after the name
        int32 refLen = (int32) chrl[iref];

        bgzf_write(fp,(char*) &nameLen,sizeof(nameLen));
        bgzf_write(fp,refName.data(),nameLen);
        bgzf_write(fp,(char*) &refLen,sizeof(refLen));

        iref++;
    };

    bgzf_flush(fp);
};
/*
 * Reads a file and outputs a new BAM file to fd with 'h' replaced as
 * the header.  No checks are made to the validity.
 *
 * When add_PG is non-zero a @PG line for "samtools" (with arg_list as its
 * CL field, if arg_list is not NULL) is added to the text of 'h' before it
 * is written.  'h' is only modified once the new text has been built, so it
 * is left untouched if that step fails.
 *
 * Returns 0 on success, -1 on failure.  The output handle is opened only
 * after the header has been prepared, so a header failure writes nothing.
 */
int bam_reheader(BGZF *in, bam_hdr_t *h, int fd, const char *arg_list, int add_PG)
{
    BGZF *fp;
    bam_hdr_t *old;
    ssize_t len;
    uint8_t *buf;
    int ret = 0;

    if (in->is_write) return -1;
    buf = malloc(BUF_SIZE);
    if (buf == NULL) return -1;

    /* The old header is read only to skip past it; release it right away. */
    old = bam_hdr_read(in);
    if (old == NULL) {
        fprintf(stderr, "Couldn't read header\n");
        free(buf);
        return -1;
    }
    bam_hdr_destroy(old);

    if (add_PG) {
        // Around the houses, but it'll do until we can manipulate bam_hdr_t natively.
        char *text;
        SAM_hdr *sh = sam_hdr_parse_(h->text, h->l_text);
        if (sh == NULL) {
            free(buf);
            return -1;
        }
        if (sam_hdr_add_PG(sh, "samtools",
                           "VN", samtools_version(),
                           arg_list ? "CL": NULL,
                           arg_list ? arg_list : NULL,
                           NULL) != 0) {
            sam_hdr_free(sh);
            free(buf);
            return -1;
        }
        text = strdup(sam_hdr_str(sh));
        if (text == NULL) {
            sam_hdr_free(sh);
            free(buf);
            return -1;
        }
        free(h->text);
        h->text = text;
        h->l_text = sam_hdr_length(sh);
        sam_hdr_free(sh);
    }

    fp = bgzf_fdopen(fd, "w");
    if (fp == NULL) {
        free(buf);
        return -1;
    }

    bam_hdr_write(fp, h);

    /* Records that share the header's block were already decompressed. */
    if (in->block_offset < in->block_length) {
        bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset);
        bgzf_flush(fp);
    }

    /* Copy the remaining compressed blocks verbatim. */
    while ((len = bgzf_raw_read(in, buf, BUF_SIZE)) > 0) {
        if (bgzf_raw_write(fp, buf, len) != len) {
            ret = -1;
            break;
        }
    }

    free(buf);
    fp->block_offset = in->block_offset = 0;
    bgzf_close(fp);
    return ret;
}
/*
 * Copy the BAM stream `in` to file descriptor `fd`, replacing its header
 * with `h`.
 *
 * The original header is read from `in` only to skip past it.  Whatever
 * alignment data remains in the already-decompressed block is re-written
 * through the BGZF writer; the rest of the file is copied as raw bytes.
 *
 * Returns 0 on success and -1 when `in` is open for writing, a buffer or the
 * output handle cannot be created, the old header cannot be read, or a raw
 * write comes up short.
 */
int bam_reheader(BGZF *in, const bam_header_t *h, int fd)
{
	BGZF *fp;
	bam_header_t *old;
	ssize_t len;
	int ret = 0;
	uint8_t *buf;

	if (in->is_write) return -1;
	buf = malloc(BUF_SIZE);
	if (buf == NULL) return -1;

	/* The old header is only needed to advance the stream; release it. */
	old = bam_header_read(in);
	if (old == NULL) {
		free(buf);
		return -1;
	}
	bam_header_destroy(old);

	fp = bgzf_fdopen(fd, "w");
	if (fp == NULL) {
		free(buf);
		return -1;
	}
	bam_header_write(fp, h);

	/* Records that share the header's block were already decompressed. */
	if (in->block_offset < in->block_length) {
		bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset);
		bgzf_flush(fp);
	}

	/* Copy the remaining compressed blocks verbatim. */
	while ((len = bgzf_raw_read(in, buf, BUF_SIZE)) > 0) {
		if (bgzf_raw_write(fp, buf, len) != len) {
			ret = -1;
			break;
		}
	}

	free(buf);
	fp->block_offset = in->block_offset = 0;
	bgzf_close(fp);
	return ret;
}
/* Write one 32-bit value to fp, byte-swapping it first when bam_is_be is set. */
static void bam_header_put_u32(bamFile fp, uint32_t value)
{
	if (bam_is_be) value = bam_swap_endian_4(value);
	bam_write(fp, &value, 4);
}

/*
 * Write `header` to `fp` and flush: the "BAM\1" magic, the plain header text
 * preceded by its length, the number of reference sequences, and for every
 * reference its name length, its NUL-terminated name and its length.
 * Always returns 0.
 */
int bam_header_write(bamFile fp, const bam_header_t *header)
{
	char magic[4];
	int32_t idx;

	/* write "BAM1" */
	memcpy(magic, "BAM\001", 4);
	bam_write(fp, magic, 4);

	/* write plain text and the number of reference sequences */
	bam_header_put_u32(fp, header->l_text);
	if (header->l_text) bam_write(fp, header->text, header->l_text);
	bam_header_put_u32(fp, header->n_targets);

	/* write sequence names and lengths */
	idx = 0;
	while (idx != header->n_targets) {
		char *name = header->target_name[idx];
		int32_t name_len = strlen(name) + 1;	/* the terminating NUL is written too */

		bam_header_put_u32(fp, name_len);
		bam_write(fp, name, name_len);
		bam_header_put_u32(fp, header->target_len[idx]);
		++idx;
	}

	bgzf_flush(fp);
	return 0;
}
/*
 * Concatenate the compressed BCF files args->fnames[0..nfnames-1] into
 * args->output_fname without recompressing the record blocks.
 *
 * For every input the header is read from the first BGZF block.  Only the
 * header of the first file is written out; records that share the header's
 * block are re-written through the BGZF writer; the remaining raw bytes are
 * copied verbatim, except that a trailing 28-byte BGZF EOF block is dropped
 * so that it does not end up in the middle of the output.
 *
 * All failures are reported through error().
 */
static void naive_concat(args_t *args)
{
    // only compressed BCF atm
    BGZF *bgzf_out = bgzf_open(args->output_fname,"w");
    if ( !bgzf_out ) error("Failed to open: %s\n", args->output_fname);

    const size_t page_size = 32768;
    char *buf = (char*) malloc(page_size);
    if ( !buf ) error("Failed to allocate %d bytes\n", (int)page_size);
    kstring_t tmp = {0,0,0};
    int i;
    for (i=0; i<args->nfnames; i++)
    {
        htsFile *hts_fp = hts_open(args->fnames[i],"r");
        if ( !hts_fp ) error("Failed to open: %s\n", args->fnames[i]);
        htsFormat type = *hts_get_format(hts_fp);

        if ( type.format==vcf ) error("The --naive option currently works only for compressed BCFs, sorry :-/\n");
        if ( type.compression!=bgzf ) error("The --naive option currently works only for compressed BCFs, sorry :-/\n");

        BGZF *fp = hts_get_bgzfp(hts_fp);
        if ( !fp || bgzf_read_block(fp) != 0 || !fp->block_length )
            error("Failed to read %s: %s\n", args->fnames[i], strerror(errno));

        uint8_t magic[5];
        if ( bgzf_read(fp, magic, 5) != 5 ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
        if (strncmp((char*)magic, "BCF\2\2", 5) != 0) error("Invalid BCF magic string in %s\n", args->fnames[i]);

        // The header length is a 4-byte field: read it into a 4-byte variable
        // rather than into the (wider) size_t tmp.l.
        // NOTE(review): the value is used as read, i.e. host byte order is assumed.
        uint32_t hlen = 0;
        if ( bgzf_read(fp, &hlen, 4) != 4 ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
        tmp.l = hlen;
        hts_expand(char,tmp.l,tmp.m,tmp.s);
        if ( bgzf_read(fp, tmp.s, tmp.l) != (ssize_t)tmp.l ) error("Failed to read the BCF header in %s\n", args->fnames[i]);

        // write only the first header
        if ( i==0 )
        {
            if ( bgzf_write(bgzf_out, "BCF\2\2", 5) !=5 ) error("Failed to write %d bytes to %s\n", 5,args->output_fname);
            if ( bgzf_write(bgzf_out, &hlen, 4) !=4 ) error("Failed to write %d bytes to %s\n", 4,args->output_fname);
            if ( bgzf_write(bgzf_out, tmp.s, tmp.l) != (ssize_t)tmp.l ) error("Failed to write %d bytes to %s\n", (int)tmp.l,args->output_fname);
        }

        // Output all non-header data that were read together with the header block
        int nskip = fp->block_offset;
        if ( fp->block_length - nskip > 0 )
        {
            if ( bgzf_write(bgzf_out, fp->uncompressed_block+nskip, fp->block_length-nskip)<0 ) error("Error: %d\n",fp->errcode);
        }
        if ( bgzf_flush(bgzf_out)<0 ) error("Error: %d\n",bgzf_out->errcode);

        // Stream the rest of the file as it is, without recompressing, but remove BGZF EOF blocks
        ssize_t nread, ncached = 0, nwr;
        const int neof = 28;
        char cached[neof];
        while (1)
        {
            nread = bgzf_raw_read(fp, buf, page_size);

            // page_size boundary may occur in the middle of the EOF block, so we need to cache the blocks' ends
            if ( nread<=0 ) break;
            if ( nread<=neof )      // last block
            {
                if ( ncached )
                {
                    // flush the part of the cache that won't be needed
                    nwr = bgzf_raw_write(bgzf_out, cached, nread);
                    if (nwr != nread) error("Write failed, wrote %d instead of %d bytes.\n", (int)nwr,(int)nread);

                    // make space in the cache so that we can append to the end
                    if ( nread!=neof ) memmove(cached,cached+nread,neof-nread);

                    // fill the cache and check for eof outside this loop
                    memcpy(cached+neof-nread,buf,nread);
                }
                else if ( nread==neof )
                {
                    // nothing cached yet: these bytes are the only EOF candidate,
                    // so mark the cache as filled to have them checked below
                    memcpy(cached,buf,neof);
                    ncached = neof;
                }
                else
                {
                    // fewer than neof bytes in total cannot be an EOF block;
                    // copy them through instead of dropping them
                    nwr = bgzf_raw_write(bgzf_out, buf, nread);
                    if (nwr != nread) error("Write failed, wrote %d instead of %d bytes.\n", (int)nwr,(int)nread);
                }
                break;
            }

            // not the last block, flush the cache if full
            if ( ncached )
            {
                nwr = bgzf_raw_write(bgzf_out, cached, ncached);
                if (nwr != ncached) error("Write failed, wrote %d instead of %d bytes.\n", (int)nwr,(int)ncached);
                ncached = 0;
            }

            // fill the cache
            nread -= neof;
            memcpy(cached,buf+nread,neof);
            ncached = neof;

            nwr = bgzf_raw_write(bgzf_out, buf, nread);
            if (nwr != nread) error("Write failed, wrote %d instead of %d bytes.\n", (int)nwr,(int)nread);
        }

        // the cached tail is written only if it is not the BGZF EOF block
        if ( ncached && memcmp(cached,"\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0",neof) )
        {
            nwr = bgzf_raw_write(bgzf_out, cached, neof);
            if (nwr != neof) error("Write failed, wrote %d instead of %d bytes.\n", (int)nwr,(int)neof);
        }
        if (hts_close(hts_fp)) error("Close failed: %s\n",args->fnames[i]);
    }
    free(buf);
    free(tmp.s);
    if (bgzf_close(bgzf_out) < 0) error("Error: %d\n",bgzf_out->errcode);
}
int reheader_file(const char *header, const char *file, int meta) { BGZF *fp = bgzf_open(file,"r"); if (bgzf_read_block(fp) != 0 || !fp->block_length) return -1; char *buffer = fp->uncompressed_block; int skip_until = 0; if ( buffer[0]==meta ) { skip_until = 1; // Skip the header while (1) { if ( buffer[skip_until]=='\n' ) { skip_until++; if ( skip_until>=fp->block_length ) { if (bgzf_read_block(fp) != 0 || !fp->block_length) error("no body?\n"); skip_until = 0; } // The header has finished if ( buffer[skip_until]!=meta ) break; } skip_until++; if ( skip_until>=fp->block_length ) { if (bgzf_read_block(fp) != 0 || !fp->block_length) error("no body?\n"); skip_until = 0; } } } FILE *fh = fopen(header,"r"); if ( !fh ) error("%s: %s", header,strerror(errno)); int page_size = getpagesize(); char *buf = valloc(page_size); BGZF *bgzf_out = bgzf_fdopen(fileno(stdout), "w"); ssize_t nread; while ( (nread=fread(buf,1,page_size-1,fh))>0 ) { if ( nread<page_size-1 && buf[nread-1]!='\n' ) buf[nread++] = '\n'; if (bgzf_write(bgzf_out, buf, nread) < 0) error("Error: %s\n",bgzf_out->error); } fclose(fh); if ( fp->block_length - skip_until > 0 ) { if (bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until) < 0) error("Error: %s\n",fp->error); } if (bgzf_flush(bgzf_out) < 0) error("Error: %s\n",bgzf_out->error); while (1) { #ifdef _USE_KNETFILE nread = knet_read(fp->x.fpr, buf, page_size); #else nread = fread(buf, 1, page_size, fp->file); #endif if ( nread<=0 ) break; #ifdef _USE_KNETFILE int count = fwrite(buf, 1, nread, bgzf_out->x.fpw); #else int count = fwrite(buf, 1, nread, bgzf_out->file); #endif if (count != nread) error("Write failed, wrote %d instead of %d bytes.\n", count,(int)nread); } if (bgzf_close(bgzf_out) < 0) error("Error: %s\n",bgzf_out->error); return 0; }
int bam_cat(int nfn, char * const *fn, const bam_header_t *h, const char* outbam) { BGZF *fp; FILE* fp_file; uint8_t *buf; uint8_t ebuf[BGZF_EMPTY_BLOCK_SIZE]; const int es=BGZF_EMPTY_BLOCK_SIZE; int i; fp = strcmp(outbam, "-")? bgzf_open(outbam, "w") : bgzf_fdopen(_fileno(stdout), "w"); if (fp == 0) { fprintf(stderr, "[%s] ERROR: fail to open output file '%s'.\n", __FUNCTION__, outbam); return 1; } if (h) bam_header_write(fp, h); buf = (uint8_t*) malloc(BUF_SIZE); for(i = 0; i < nfn; ++i){ BGZF *in; bam_header_t *old; int len,j; in = strcmp(fn[i], "-")? bam_open(fn[i], "r") : bam_dopen(_fileno(stdin), "r"); if (in == 0) { fprintf(stderr, "[%s] ERROR: fail to open file '%s'.\n", __FUNCTION__, fn[i]); return -1; } if (in->open_mode != 'r') return -1; old = bam_header_read(in); if (h == 0 && i == 0) bam_header_write(fp, old); if (in->block_offset < in->block_length) { bgzf_write(fp, (uint8_t*)in->uncompressed_block + in->block_offset, in->block_length - in->block_offset); bgzf_flush(fp); } j=0; #ifdef _USE_KNETFILE fp_file=fp->x.fpw; while ((len = knet_read(in->x.fpr, buf, BUF_SIZE)) > 0) { #else fp_file=fp->file; while (!feof(in->file) && (len = fread(buf, 1, BUF_SIZE, in->file)) > 0) { #endif if(len<es){ int diff=es-len; if(j==0) { fprintf(stderr, "[%s] ERROR: truncated file?: '%s'.\n", __FUNCTION__, fn[i]); return -1; } fwrite(ebuf, 1, len, fp_file); memcpy(ebuf,ebuf+len,diff); memcpy(ebuf+diff,buf,len); } else { if(j!=0) fwrite(ebuf, 1, es, fp_file); len-= es; memcpy(ebuf,buf+len,es); fwrite(buf, 1, len, fp_file); } j=1; } /* check final gzip block */ { const uint8_t gzip1=ebuf[0]; const uint8_t gzip2=ebuf[1]; const uint32_t isize=*((uint32_t*)(ebuf+es-4)); if(((gzip1!=GZIPID1) || (gzip2!=GZIPID2)) || (isize!=0)) { fprintf(stderr, "[%s] WARNING: Unexpected block structure in file '%s'.", __FUNCTION__, fn[i]); fprintf(stderr, " Possible output corruption.\n"); fwrite(ebuf, 1, es, fp_file); } } bam_header_destroy(old); bgzf_close(in); } free(buf); bgzf_close(fp); 
return 0; } int main_cat(int argc, char *argv[]) { bam_header_t *h = 0; char *outfn = 0; int c, ret; while ((c = getopt(argc, argv, "h:o:")) >= 0) { switch (c) { case 'h': { tamFile fph = sam_open(optarg); if (fph == 0) { fprintf(stderr, "[%s] ERROR: fail to read the header from '%s'.\n", __FUNCTION__, argv[1]); return 1; } h = sam_header_read(fph); sam_close(fph); break; } case 'o': outfn = strdup(optarg); break; } } if (argc - optind < 2) { fprintf(stderr, "Usage: samtools cat [-h header.sam] [-o out.bam] <in1.bam> <in2.bam> [...]\n"); return 1; } ret = bam_cat(argc - optind, argv + optind, h, outfn? outfn : "-"); free(outfn); return ret; }
int main(int argInN, char* argIn[]) { time(&g_statsAll.timeStart); Parameters *P = new Parameters; //all parameters P->inputParameters(argInN, argIn); *(P->inOut->logStdOut) << timeMonthDayTime(g_statsAll.timeStart) << " ..... Started STAR run\n" <<flush; //generate genome if (P->runMode=="genomeGenerate") { genomeGenerate(P); (void) sysRemoveDir (P->outFileTmp); P->inOut->logMain << "DONE: Genome generation, EXITING\n" << flush; exit(0); } else if (P->runMode!="alignReads") { P->inOut->logMain << "EXITING because of INPUT ERROR: unknown value of input parameter runMode=" <<P->runMode<<endl<<flush; exit(1); }; Genome mainGenome (P); mainGenome.genomeLoad(); if (P->genomeLoad=="LoadAndExit" || P->genomeLoad=="Remove") { return 0; }; P->twoPass.pass2=false; //this is the 1st pass SjdbClass sjdbLoci; if (P->sjdbInsert.pass1) { Parameters *P1=new Parameters; *P1=*P; sjdbInsertJunctions(P, P1, mainGenome, sjdbLoci); }; //calculate genome-related parameters Transcriptome *mainTranscriptome=NULL; /////////////////////////////////////////////////////////////////////////////////////////////////START if (P->runThreadN>1) { g_threadChunks.threadArray=new pthread_t[P->runThreadN]; pthread_mutex_init(&g_threadChunks.mutexInRead, NULL); pthread_mutex_init(&g_threadChunks.mutexOutSAM, NULL); pthread_mutex_init(&g_threadChunks.mutexOutBAM1, NULL); pthread_mutex_init(&g_threadChunks.mutexOutUnmappedFastx, NULL); pthread_mutex_init(&g_threadChunks.mutexOutFilterBySJout, NULL); pthread_mutex_init(&g_threadChunks.mutexStats, NULL); pthread_mutex_init(&g_threadChunks.mutexBAMsortBins, NULL); }; g_statsAll.progressReportHeader(P->inOut->logProgress); if (P->twoPass.yes) {//2-pass //re-define P for the pass1 Parameters *P1=new Parameters; *P1=*P; //turn off unnecessary calculations P1->outSAMtype[0]="None"; P1->outSAMbool=false; P1->outBAMunsorted=false; P1->outBAMcoord=false; P1->chimSegmentMin=0; P1->quant.yes=false; P1->quant.trSAM.yes=false; P1->quant.geCount.yes=false; 
P1->outFilterBySJoutStage=0; P1->outReadsUnmapped="None"; P1->outFileNamePrefix=P->twoPass.dir; P1->readMapNumber=P->twoPass.pass1readsN; // P1->inOut->logMain.open((P1->outFileNamePrefix + "Log.out").c_str()); g_statsAll.resetN(); time(&g_statsAll.timeStartMap); P->inOut->logProgress << timeMonthDayTime(g_statsAll.timeStartMap) <<"\tStarted 1st pass mapping\n" <<flush; *P->inOut->logStdOut << timeMonthDayTime(g_statsAll.timeStartMap) << " ..... Started 1st pass mapping\n" <<flush; //run mapping for Pass1 ReadAlignChunk *RAchunk1[P->runThreadN]; for (int ii=0;ii<P1->runThreadN;ii++) { RAchunk1[ii]=new ReadAlignChunk(P1, mainGenome, mainTranscriptome, ii); }; mapThreadsSpawn(P1, RAchunk1); outputSJ(RAchunk1,P1); //collapse and output junctions // for (int ii=0;ii<P1->runThreadN;ii++) { // delete [] RAchunk[ii]; // }; time_t rawtime; time (&rawtime); P->inOut->logProgress << timeMonthDayTime(rawtime) <<"\tFinished 1st pass mapping\n"; *P->inOut->logStdOut << timeMonthDayTime(rawtime) << " ..... Finished 1st pass mapping\n" <<flush; ofstream logFinal1 ( (P->twoPass.dir + "/Log.final.out").c_str()); g_statsAll.reportFinal(logFinal1,P1); P->twoPass.pass2=true;//starting the 2nd pass P->twoPass.pass1sjFile=P->twoPass.dir+"/SJ.out.tab"; sjdbInsertJunctions(P, P1, mainGenome, sjdbLoci); //reopen reads files P->closeReadsFiles(); P->openReadsFiles(); } else {//not 2-pass //nothing for now }; if ( P->quant.yes ) {//load transcriptome mainTranscriptome=new Transcriptome(P); }; //initialize Stats g_statsAll.resetN(); time(&g_statsAll.timeStartMap); *P->inOut->logStdOut << timeMonthDayTime(g_statsAll.timeStartMap) << " ..... 
Started mapping\n" <<flush; g_statsAll.timeLastReport=g_statsAll.timeStartMap; //open SAM/BAM files for output if (P->outSAMmode != "None") {//open SAM file and write header ostringstream samHeaderStream; for (uint ii=0;ii<P->nChrReal;ii++) { samHeaderStream << "@SQ\tSN:"<< P->chrName.at(ii) <<"\tLN:"<<P->chrLength[ii]<<"\n"; }; if (P->outSAMheaderPG.at(0)!="-") { samHeaderStream << P->outSAMheaderPG.at(0); for (uint ii=1;ii<P->outSAMheaderPG.size(); ii++) { samHeaderStream << "\t" << P->outSAMheaderPG.at(ii); }; samHeaderStream << "\n"; }; samHeaderStream << "@PG\tID:STAR\tPN:STAR\tVN:" << STAR_VERSION <<"\tCL:" << P->commandLineFull <<"\n"; if (P->outSAMheaderCommentFile!="-") { ifstream comstream (P->outSAMheaderCommentFile); while (comstream.good()) { string line1; getline(comstream,line1); if (line1.find_first_not_of(" \t\n\v\f\r")!=std::string::npos) {//skip blank lines samHeaderStream << line1 <<"\n"; }; }; }; for (uint32 ii=0;ii<P->outSAMattrRGlineSplit.size();ii++) {//@RG lines samHeaderStream << "@RG\t" << P->outSAMattrRGlineSplit.at(ii) <<"\n"; }; samHeaderStream << "@CO\t" <<"user command line: " << P->commandLine <<"\n"; if (P->outSAMheaderHD.at(0)!="-") { P->samHeaderHD = P->outSAMheaderHD.at(0); for (uint ii=1;ii<P->outSAMheaderHD.size(); ii++) { P->samHeaderHD +="\t" + P->outSAMheaderHD.at(ii); }; } else { P->samHeaderHD = "@HD\tVN:1.4"; }; P->samHeader=P->samHeaderHD+"\n"+samHeaderStream.str(); //for the sorted BAM, need to add SO:cooridnate to the header line P->samHeaderSortedCoord=P->samHeaderHD + (P->outSAMheaderHD.size()==0 ? 
"" : "\tSO:coordinate") + "\n" + samHeaderStream.str(); if (P->outSAMbool) {// *P->inOut->outSAM << P->samHeader; }; if (P->outBAMunsorted){ outBAMwriteHeader(P->inOut->outBAMfileUnsorted,P->samHeader,P->chrName,P->chrLength); }; // if (P->outBAMcoord){ // outBAMwriteHeader(P->inOut->outBAMfileCoord,P->samHeader,P->chrName,P->chrLength); // }; if ( P->quant.trSAM.yes ) { samHeaderStream.str(""); vector <uint> trlength; for (uint32 ii=0;ii<mainTranscriptome->trID.size();ii++) { uint32 iex1=mainTranscriptome->trExI[ii]+mainTranscriptome->trExN[ii]-1; //last exon of the transcript trlength.push_back(mainTranscriptome->exLenCum[iex1]+mainTranscriptome->exSE[2*iex1+1]-mainTranscriptome->exSE[2*iex1]+1); samHeaderStream << "@SQ\tSN:"<< mainTranscriptome->trID.at(ii) <<"\tLN:"<<trlength.back()<<"\n"; }; for (uint32 ii=0;ii<P->outSAMattrRGlineSplit.size();ii++) {//@RG lines samHeaderStream << "@RG\t" << P->outSAMattrRGlineSplit.at(ii) <<"\n"; }; outBAMwriteHeader(P->inOut->outQuantBAMfile,samHeaderStream.str(),mainTranscriptome->trID,trlength); }; }; if (P->chimSegmentMin>0) { P->inOut->outChimJunction.open((P->outFileNamePrefix + "Chimeric.out.junction").c_str()); P->inOut->outChimSAM.open((P->outFileNamePrefix + "Chimeric.out.sam").c_str()); P->inOut->outChimSAM << P->samHeader; pthread_mutex_init(&g_threadChunks.mutexOutChimSAM, NULL); pthread_mutex_init(&g_threadChunks.mutexOutChimJunction, NULL); }; // P->inOut->logMain << "mlock value="<<mlockall(MCL_CURRENT|MCL_FUTURE) <<"\n"<<flush; // prepare chunks and spawn mapping threads ReadAlignChunk *RAchunk[P->runThreadN]; for (int ii=0;ii<P->runThreadN;ii++) { RAchunk[ii]=new ReadAlignChunk(P, mainGenome, mainTranscriptome, ii); }; mapThreadsSpawn(P, RAchunk); if (P->outFilterBySJoutStage==1) {//completed stage 1, go to stage 2 P->inOut->logMain << "Completed stage 1 mapping of outFilterBySJout mapping\n"<<flush; outputSJ(RAchunk,P);//collapse novel junctions P->readFilesIndex=-1; P->outFilterBySJoutStage=2; if 
(P->outBAMcoord) { for (int it=0; it<P->runThreadN; it++) {//prepare the unmapped bin RAchunk[it]->chunkOutBAMcoord->coordUnmappedPrepareBySJout(); }; }; mapThreadsSpawn(P, RAchunk); }; //close some BAM files if (P->inOut->outBAMfileUnsorted!=NULL) { bgzf_flush(P->inOut->outBAMfileUnsorted); bgzf_close(P->inOut->outBAMfileUnsorted); }; if (P->inOut->outQuantBAMfile!=NULL) { bgzf_flush(P->inOut->outQuantBAMfile); bgzf_close(P->inOut->outQuantBAMfile); }; if (P->outBAMcoord && P->limitBAMsortRAM==0) {//make it equal ot the genome size P->limitBAMsortRAM=P->nGenome+mainGenome.SA.lengthByte+mainGenome.SAi.lengthByte; }; //no need for genome anymore, free the memory mainGenome.freeMemory(); if ( P->quant.geCount.yes ) {//output gene quantifications for (int ichunk=1; ichunk<P->runThreadN; ichunk++) {//sum counts from all chunks into 0th chunk RAchunk[0]->chunkTr->quants->addQuants(*(RAchunk[ichunk]->chunkTr->quants)); }; RAchunk[0]->chunkTr->quantsOutput(); }; if (P->runThreadN>1 && P->outSAMorder=="PairedKeepInputOrder") {//concatenate Aligned.* files RAchunk[0]->chunkFilesCat(P->inOut->outSAM, P->outFileTmp + "/Aligned.out.sam.chunk", g_threadChunks.chunkOutN); }; if (P->outBAMcoord) {//sort BAM if needed *P->inOut->logStdOut << timeMonthDayTime() << " ..... Started sorting BAM\n" <<flush; P->inOut->logMain << timeMonthDayTime() << " ..... 
Started sorting BAM\n" <<flush; uint32 nBins=P->outBAMcoordNbins; //check max size needed for sorting uint maxMem=0; for (uint32 ibin=0; ibin<nBins-1; ibin++) {//check akk bins uint binS=0; for (int it=0; it<P->runThreadN; it++) {//collect sizes from threads binS += RAchunk[it]->chunkOutBAMcoord->binTotalBytes[ibin]+24*RAchunk[it]->chunkOutBAMcoord->binTotalN[ibin]; }; if (binS>maxMem) maxMem=binS; }; P->inOut->logMain << "Max memory needed for sorting = "<<maxMem<<endl; if (maxMem>P->limitBAMsortRAM) { ostringstream errOut; errOut <<"EXITING because of fatal ERROR: not enough memory for BAM sorting: \n"; errOut <<"SOLUTION: re-run STAR with at least --limitBAMsortRAM " <<maxMem+1000000000; exitWithError(errOut.str(), std::cerr, P->inOut->logMain, EXIT_CODE_PARAMETER, *P); }; uint totalMem=0; // P->inOut->logMain << "Started sorting BAM ..." <<endl; #pragma omp parallel num_threads(P->outBAMsortingThreadNactual) #pragma omp for schedule (dynamic,1) for (uint32 ibin1=0; ibin1<nBins; ibin1++) { uint32 ibin=nBins-1-ibin1;//reverse order to start with the last bin - unmapped reads uint binN=0, binS=0; for (int it=0; it<P->runThreadN; it++) {//collect sizes from threads binN += RAchunk[it]->chunkOutBAMcoord->binTotalN[ibin]; binS += RAchunk[it]->chunkOutBAMcoord->binTotalBytes[ibin]; }; if (binS==0) continue; //empty bin if (ibin == nBins-1) {//last bin for unmapped reads BAMbinSortUnmapped(ibin,P->runThreadN,P->outBAMsortTmpDir,P->inOut->outBAMfileCoord, P); } else { uint newMem=binS+binN*24; bool boolWait=true; while (boolWait) { #pragma omp critical if (totalMem+newMem < P->limitBAMsortRAM) { boolWait=false; totalMem+=newMem; }; sleep(0.1); }; BAMbinSortByCoordinate(ibin,binN,binS,P->runThreadN,P->outBAMsortTmpDir,P->inOut->outBAMfileCoord, P); #pragma omp critical totalMem-=newMem;//"release" RAM }; }; //concatenate all BAM files, using bam_cat char **bamBinNames = new char* [nBins]; vector <string> bamBinNamesV; for (uint32 ibin=0; ibin<nBins; ibin++) { 
bamBinNamesV.push_back(P->outBAMsortTmpDir+"/b"+to_string((uint) ibin)); struct stat buffer; if (stat (bamBinNamesV.back().c_str(), &buffer) != 0) {//check if file exists bamBinNamesV.pop_back(); }; }; for (uint32 ibin=0; ibin<bamBinNamesV.size(); ibin++) { bamBinNames[ibin] = (char*) bamBinNamesV.at(ibin).c_str(); }; bam_cat(bamBinNamesV.size(), bamBinNames, 0, P->outBAMfileCoordName.c_str()); }; //wiggle output if (P->outWigFlags.yes) { *P->inOut->logStdOut << timeMonthDayTime() << " ..... Started wiggle output\n" <<flush; P->inOut->logMain << timeMonthDayTime() << " ..... Started wiggle output\n" <<flush; string wigOutFileNamePrefix=P->outFileNamePrefix + "Signal"; signalFromBAM(P->outBAMfileCoordName, wigOutFileNamePrefix, *P); }; //aggregate output junctions //collapse splice junctions from different threads/chunks, and output them outputSJ(RAchunk,P); g_statsAll.progressReport(P->inOut->logProgress); P->inOut->logProgress << "ALL DONE!\n"<<flush; P->inOut->logFinal.open((P->outFileNamePrefix + "Log.final.out").c_str()); g_statsAll.reportFinal(P->inOut->logFinal,P); *P->inOut->logStdOut << timeMonthDayTime(g_statsAll.timeFinish) << " ..... Finished successfully\n" <<flush; P->inOut->logMain << "ALL DONE!\n"<<flush; sysRemoveDir (P->outFileTmp); P->closeReadsFiles();//this will kill the readFilesCommand processes if necessary mainGenome.~Genome(); //need explicit call because of the 'delete P->inOut' below, which will destroy P->inOut->logStdOut delete P->inOut; //to close files delete P; return 0; };
/*
 * Write a copy of the BGZF-compressed text file `fname` to stdout with its
 * header replaced by the contents of the plain-text file `header`.
 *
 * Only text formats are handled (ftype has IS_TXT set, or ftype is 0); any
 * other type ends in error().  Header lines are those starting with
 * conf->meta_char at the top of the file.  Body data that shares a block
 * with the old header is re-compressed; later blocks are copied as raw bytes.
 *
 * Returns 0 on success and -1 if `fname` cannot be opened or its first block
 * cannot be read; other failures are reported through error().
 */
int reheader_file(const char *fname, const char *header, int ftype, tbx_conf_t *conf)
{
    if ( ftype & IS_TXT || !ftype )
    {
        BGZF *fp = bgzf_open(fname,"r");
        if ( !fp ) return -1;
        if ( bgzf_read_block(fp) != 0 || !fp->block_length )
        {
            bgzf_close(fp);     // do not leak the handle on the early return
            return -1;
        }

        char *buffer = fp->uncompressed_block;
        int skip_until = 0;

        // Skip the header: find out the position of the data block
        if ( buffer[0]==conf->meta_char )
        {
            skip_until = 1;
            while (1)
            {
                if ( buffer[skip_until]=='\n' )
                {
                    skip_until++;
                    if ( skip_until>=fp->block_length )
                    {
                        if ( bgzf_read_block(fp) != 0 || !fp->block_length ) error("FIXME: No body in the file: %s\n", fname);
                        skip_until = 0;
                    }
                    // The header has finished
                    if ( buffer[skip_until]!=conf->meta_char ) break;
                }
                skip_until++;
                if ( skip_until>=fp->block_length )
                {
                    if (bgzf_read_block(fp) != 0 || !fp->block_length) error("FIXME: No body in the file: %s\n", fname);
                    skip_until = 0;
                }
            }
        }

        // Output the new header
        FILE *hdr  = fopen(header,"r");
        if ( !hdr ) error("%s: %s", header,strerror(errno));
        const size_t page_size = 32768;
        char *buf = malloc(page_size);
        if ( !buf ) error("Could not allocate %d bytes\n", (int)page_size);
        BGZF *bgzf_out = bgzf_open("-", "w");
        if ( !bgzf_out ) error("Could not open the standard output for writing\n");
        ssize_t nread;
        // one byte of buf is kept free for a missing final newline
        while ( (nread=fread(buf,1,page_size-1,hdr))>0 )
        {
            if ( nread<page_size-1 && buf[nread-1]!='\n' ) buf[nread++] = '\n';
            if (bgzf_write(bgzf_out, buf, nread) < 0) error("Error: %d\n",bgzf_out->errcode);
        }
        if ( fclose(hdr) ) error("close failed: %s\n", header);

        // Output all remaining data read with the header block
        if ( fp->block_length - skip_until > 0 )
        {
            // the failure is on the output handle, so report its errcode
            if (bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until) < 0) error("Error: %d\n",bgzf_out->errcode);
        }
        if (bgzf_flush(bgzf_out) < 0) error("Error: %d\n",bgzf_out->errcode);

        // Copy the remaining compressed blocks verbatim
        while (1)
        {
            nread = bgzf_raw_read(fp, buf, page_size);
            if ( nread<=0 ) break;

            int count = bgzf_raw_write(bgzf_out, buf, nread);
            if (count != nread) error("Write failed, wrote %d instead of %d bytes.\n", count,(int)nread);
        }
        if (bgzf_close(bgzf_out) < 0) error("Error: %d\n",bgzf_out->errcode);
        if (bgzf_close(fp) < 0) error("Error: %d\n",fp->errcode);
        free(buf);
    }
    else
        error("todo: reheader BCF, BAM\n");  // BCF is difficult, records contain pointers to the header.
    return 0;
}
/*
 * Write a copy of the BGZF-compressed VCF args->fname to stdout with a
 * modified header.
 *
 * The existing header (the leading lines starting with '#') is collected
 * from the input.  It is replaced by the contents of args->header_fname when
 * that is set, and its sample names are replaced by those listed in
 * args->samples_fname when that is set.  Body data that shares a block with
 * the old header is re-compressed; later blocks are copied as raw bytes.
 *
 * All failures are reported through error().
 */
static void reheader_vcf_gz(args_t *args)
{
    BGZF *fp = bgzf_open(args->fname,"r");
    if ( !fp || bgzf_read_block(fp) != 0 || !fp->block_length )
        error("Failed to read %s: %s\n", args->fname, strerror(errno));

    kstring_t hdr = {0,0,0};
    char *buffer = (char*) fp->uncompressed_block;

    // Read the header and find the position of the data block
    if ( buffer[0]!='#' ) error("Could not parse the header, expected '#', found '%c'\n", buffer[0]);

    int skip_until = 1;     // end of the header in the current uncompressed block
    while (1)
    {
        if ( buffer[skip_until]=='\n' )
        {
            skip_until++;
            if ( skip_until>=fp->block_length )
            {
                kputsn(buffer,skip_until,&hdr);
                if ( bgzf_read_block(fp) != 0 || !fp->block_length ) error("FIXME: No body in the file: %s\n", args->fname);
                skip_until = 0;
            }
            // The header has finished
            if ( buffer[skip_until]!='#' )
            {
                kputsn(buffer,skip_until,&hdr);
                break;
            }
        }
        skip_until++;
        if ( skip_until>=fp->block_length )
        {
            kputsn(buffer,fp->block_length,&hdr);
            if (bgzf_read_block(fp) != 0 || !fp->block_length) error("FIXME: No body in the file: %s\n", args->fname);
            skip_until = 0;
        }
    }

    int nsamples = 0;
    char **samples = NULL;
    if ( args->samples_fname )
        samples = hts_readlines(args->samples_fname, &nsamples);
    if ( args->header_fname )
    {
        free(hdr.s); hdr.s = NULL; hdr.l = hdr.m = 0;
        read_header_file(args->header_fname, &hdr);
    }
    if ( samples )
    {
        set_samples(samples, nsamples, &hdr);
        int i;
        for (i=0; i<nsamples; i++) free(samples[i]);
        free(samples);
    }

    // Output the modified header
    BGZF *bgzf_out = bgzf_dopen(fileno(stdout), "w");
    if ( !bgzf_out ) error("Could not open the standard output for writing\n");
    if ( bgzf_write(bgzf_out, hdr.s, hdr.l)<0 ) error("Error: %d\n",bgzf_out->errcode);
    free(hdr.s);

    // Output all remaining data read with the header block
    if ( fp->block_length - skip_until > 0 )
    {
        // the failure is on the output handle, so report its errcode
        if ( bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until)<0 ) error("Error: %d\n",bgzf_out->errcode);
    }
    if ( bgzf_flush(bgzf_out)<0 ) error("Error: %d\n",bgzf_out->errcode);

    // Stream the rest of the file as it is, without decompressing
    ssize_t nread;
    int page_size = getpagesize();
    char *buf = (char*) valloc(page_size);
    if ( !buf ) error("Could not allocate %d bytes\n", page_size);
    while (1)
    {
        nread = bgzf_raw_read(fp, buf, page_size);
        if ( nread<=0 ) break;

        int count = bgzf_raw_write(bgzf_out, buf, nread);
        if (count != nread) error("Write failed, wrote %d instead of %d bytes.\n", count,(int)nread);
    }
    if (bgzf_close(bgzf_out) < 0) error("Error: %d\n",bgzf_out->errcode);
    if (bgzf_close(fp) < 0) error("Error: %d\n",fp->errcode);
    free(buf);
}
int bam_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outbam) { BGZF *fp; uint8_t *buf; uint8_t ebuf[BGZF_EMPTY_BLOCK_SIZE]; const int es=BGZF_EMPTY_BLOCK_SIZE; int i; fp = strcmp(outbam, "-")? bgzf_open(outbam, "w") : bgzf_fdopen(fileno(stdout), "w"); if (fp == 0) { fprintf(stderr, "[%s] ERROR: fail to open output file '%s'.\n", __func__, outbam); return 1; } if (h) bam_hdr_write(fp, h); buf = (uint8_t*) malloc(BUF_SIZE); for(i = 0; i < nfn; ++i){ BGZF *in; bam_hdr_t *old; int len,j; in = strcmp(fn[i], "-")? bgzf_open(fn[i], "r") : bgzf_fdopen(fileno(stdin), "r"); if (in == 0) { fprintf(stderr, "[%s] ERROR: fail to open file '%s'.\n", __func__, fn[i]); return -1; } if (in->is_write) return -1; old = bam_hdr_read(in); if (old == NULL) { fprintf(stderr, "[%s] ERROR: couldn't read header for '%s'.\n", __func__, fn[i]); bgzf_close(in); return -1; } if (h == 0 && i == 0) bam_hdr_write(fp, old); if (in->block_offset < in->block_length) { bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset); bgzf_flush(fp); } j=0; while ((len = bgzf_raw_read(in, buf, BUF_SIZE)) > 0) { if(len<es){ int diff=es-len; if(j==0) { fprintf(stderr, "[%s] ERROR: truncated file?: '%s'.\n", __func__, fn[i]); return -1; } bgzf_raw_write(fp, ebuf, len); memcpy(ebuf,ebuf+len,diff); memcpy(ebuf+diff,buf,len); } else { if(j!=0) bgzf_raw_write(fp, ebuf, es); len-= es; memcpy(ebuf,buf+len,es); bgzf_raw_write(fp, buf, len); } j=1; } /* check final gzip block */ { const uint8_t gzip1=ebuf[0]; const uint8_t gzip2=ebuf[1]; const uint32_t isize=*((uint32_t*)(ebuf+es-4)); if(((gzip1!=GZIPID1) || (gzip2!=GZIPID2)) || (isize!=0)) { fprintf(stderr, "[%s] WARNING: Unexpected block structure in file '%s'.", __func__, fn[i]); fprintf(stderr, " Possible output corruption.\n"); bgzf_raw_write(fp, ebuf, es); } } bam_hdr_destroy(old); bgzf_close(in); } free(buf); bgzf_close(fp); return 0; }