Esempio n. 1
0
/*
 * Copy the BAM stream `in` to the file descriptor `fd`, writing `h` in place
 * of the header found in `in`.
 *
 * The old header is read (and thereby skipped), `h` is written through a new
 * BGZF writer, the uncompressed remainder of the block currently held by `in`
 * is re-compressed, and everything after that block is copied byte-for-byte
 * between the underlying file handles.
 *
 * Returns 0 on success, -1 if `in` is not opened for reading, if an
 * allocation / open / header read fails, or if any write comes up short.
 *
 * NOTE(review): the header object returned by bam_header_read() is not
 * released in this function -- confirm who owns it and free it if nobody does.
 */
int bam_reheader(BGZF *in, const bam_header_t *h, int fd)
{
	BGZF *fp;
	bam_header_t *old;
	int len;
	int ret = 0;
	uint8_t *buf;
	if (in->open_mode != 'r') return -1;
	buf = malloc(BUF_SIZE);
	if (buf == NULL) return -1;
	/* Reading the old header moves `in` past it. */
	old = bam_header_read(in);
	if (old == NULL) {
		free(buf);
		return -1;
	}
	fp = bgzf_dopen(fd, "w");
	if (fp == NULL) {
		free(buf);
		return -1;
	}
	bam_header_write(fp, h);
	/* Records that shared a BGZF block with the old header must be re-compressed. */
	if (in->block_offset < in->block_length) {
		if (bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0
			|| bgzf_flush(fp) < 0)
			ret = -1;
	}
	/* The remaining blocks are copied in compressed form, bypassing BGZF. */
#ifdef _USE_KNETFILE
	while (ret == 0 && (len = knet_read(in->fp, buf, BUF_SIZE)) > 0)
#else
	while (ret == 0 && !feof(in->fp) && (len = fread(buf, 1, BUF_SIZE, in->fp)) > 0)
#endif
		if (fwrite(buf, 1, len, fp->fp) != (size_t)len) ret = -1;
	free(buf);
	/* Reset the offsets so that bgzf_close() does not see stale buffered data. */
	fp->block_offset = in->block_offset = 0;
	bgzf_close(fp);
	return ret;
}
Esempio n. 2
0
// Sorts one coordinate bin of BAM records and writes it as <dirBAMsort>/b<iBin>.
//   iBin       - index of the bin; also the per-thread input file name
//   binN       - number of records expected in the bin
//   binS       - total number of bytes expected in the bin (all threads together)
//   nThreads   - number of per-thread directories <dirBAMsort><it>/ to read from
//   dirBAMsort - directory prefix of the temporary sorting files
//   P          - run parameters (header text, chromosome tables, compression level, log)
// Each input file is deleted after it has been read. A mismatch between binS and
// the bytes found on disk is reported through exitWithError.
void BAMbinSortByCoordinate(uint32 iBin, uint binN, uint binS, uint nThreads, string dirBAMsort, Parameters *P) {

    if (binS==0) return; //nothing to do for empty bins
    //allocate arrays
    char *bamIn=new char[binS];
    uint *startPos=new uint[binN*3];

    uint bamInBytes=0;
    //load all aligns
    for (uint it=0; it<nThreads; it++) {
        string bamInFile=dirBAMsort+to_string(it)+"/"+to_string((uint) iBin);
        ifstream bamInStream (bamInFile.c_str());
        //never read past the end of bamIn: only the space still free in the buffer is requested
        bamInStream.read(bamIn+bamInBytes, bamInBytes<binS ? binS-bamInBytes : 0);
        bamInBytes += bamInStream.gcount();
        //bytes beyond the expected size are counted but not stored, so that the size check below still fires
        char bamInExtra[4096];
        while (bamInStream.read(bamInExtra,sizeof(bamInExtra)), bamInStream.gcount()>0) {
            bamInBytes += bamInStream.gcount();
        };
        bamInStream.close();
        remove(bamInFile.c_str());
    };
    if (bamInBytes!=binS) {
        ostringstream errOut;
        errOut << "EXITING because of FATAL ERROR: number of bytes expected from the BAM bin does not agree with the actual size on disk: ";
        errOut << binS <<"   "<< bamInBytes <<"   "<< iBin <<"\n";
        exitWithError(errOut.str(),std::cerr, P->inOut->logMain, 1, *P);
    };

    //extract coordinates
    //startPos holds 3 values per record: [0] sort key built from the 2nd and 3rd 32-bit words of the record,
    //[1] the value stored right after the record (read order), [2] byte offset of the record in bamIn
    for (uint ib=0,ia=0;ia<binN;ia++) {
        uint32 *bamIn32=(uint32*) (bamIn+ib);
        startPos[ia*3]  =( ((uint) bamIn32[1]) << 32) | ( (uint)bamIn32[2] );
        startPos[ia*3+2]=ib;
        ib+=bamIn32[0]+sizeof(uint32);//note that size of the BAM record does not include the size record itself
        startPos[ia*3+1]=*( (uint*) (bamIn+ib) ); //read order
        ib+=sizeof(uint);
    };

    //sort
    qsort((void*) startPos, binN, sizeof(uint)*3, funCompareUint2);

    BGZF *bgzfBin;
    bgzfBin=bgzf_open((dirBAMsort+"/b"+to_string((uint) iBin)).c_str(),("w"+to_string((long long) P->outBAMcompression)).c_str());
    outBAMwriteHeader(bgzfBin,P->samHeaderSortedCoord,P->chrName,P->chrLength);
    //send ordered aligns to bgzf one-by-one
    for (uint ia=0;ia<binN;ia++) {
        char* ib=bamIn+startPos[ia*3+2];
        bgzf_write(bgzfBin,ib, *((uint32*) ib)+sizeof(uint32) ); //record plus its 4-byte size prefix; the trailing read-order value is not written
    };

    bgzf_flush(bgzfBin);
    bgzf_close(bgzfBin);
    //release memory
    delete [] bamIn;
    delete [] startPos;
};
Esempio n. 3
0
// Writes a binary BAM header to fp and flushes it.
//   samh - SAM text header, written with a 32-bit length prefix and without a terminating \0
//   chrn - reference names, each written with a 32-bit length prefix and a terminating \0
//   chrl - reference lengths, indexed like chrn, each written as a 32-bit integer
void outBAMwriteHeader (BGZF* fp, const string &samh, const vector <string> &chrn, const vector <uint> &chrl) {
    //magic string
    bgzf_write(fp,"BAM\001",4);

    //text header
    int32 textLen=samh.size();
    bgzf_write(fp,(char*) &textLen,sizeof(textLen));
    bgzf_write(fp,samh.c_str(),textLen);

    //reference dictionary
    int32 refCount=(int32) chrn.size();
    bgzf_write(fp,(char*) &refCount,sizeof(refCount));
    for (int32 iref=0; iref<refCount; ++iref) {
        const string &refName=chrn.at(iref);
        int32 nameLen=(int32) (refName.size()+1); //+1 for the terminating \0
        int32 refLen=(int32) chrl[iref];
        bgzf_write(fp,(char*) &nameLen,sizeof(nameLen));
        bgzf_write(fp,refName.data(),nameLen); //nameLen bytes, i.e. the name and its \0
        bgzf_write(fp,(char*) &refLen,sizeof(refLen));
    };

    bgzf_flush(fp);
};
Esempio n. 4
0
/*
 * Reads a file and outputs a new BAM file to fd with 'h' replaced as
 * the header.    No checks are made to the validity.
 *
 * in        BGZF handle opened for reading; its header is read and skipped.
 * h         replacement header.  When add_PG is non-zero, h->text and
 *           h->l_text are replaced by a copy that has a "samtools" @PG
 *           line appended.
 * fd        file descriptor the new BAM is written to.
 * arg_list  command line recorded as CL in the @PG line; may be NULL.
 * add_PG    non-zero to add the @PG line.
 *
 * Returns 0 on success and -1 on failure.  The temporary buffer, the parsed
 * header and the output handle are released on every path once they exist.
 *
 * NOTE(review): the header object returned by bam_hdr_read() is not released
 * here -- confirm ownership.
 */
int bam_reheader(BGZF *in, bam_hdr_t *h, int fd,
                 const char *arg_list, int add_PG)
{
    BGZF *fp = NULL;
    SAM_hdr *sh = NULL;
    ssize_t len;
    uint8_t *buf;
    if (in->is_write) return -1;
    buf = malloc(BUF_SIZE);
    if (buf == NULL) {
        fprintf(stderr, "Out of memory\n");
        return -1;
    }
    if (bam_hdr_read(in) == NULL) {
        fprintf(stderr, "Couldn't read header\n");
        goto fail;
    }
    fp = bgzf_fdopen(fd, "w");
    if (fp == NULL) {
        fprintf(stderr, "Couldn't open output\n");
        goto fail;
    }

    if (add_PG) {
        // Around the houses, but it'll do until we can manipulate bam_hdr_t natively.
        sh = sam_hdr_parse_(h->text, h->l_text);
        if (sh == NULL)
            goto fail;
        if (sam_hdr_add_PG(sh, "samtools",
                           "VN", samtools_version(),
                           arg_list ? "CL": NULL,
                           arg_list ? arg_list : NULL,
                           NULL) != 0)
            goto fail;

        free(h->text);
        h->text = strdup(sam_hdr_str(sh));
        if (!h->text) {
            // Keep the header self-consistent when the copy could not be made.
            h->l_text = 0;
            goto fail;
        }
        h->l_text = sam_hdr_length(sh);
        sam_hdr_free(sh);
        sh = NULL;
    }

    bam_hdr_write(fp, h);
    // Records that shared a BGZF block with the old header must be re-compressed.
    if (in->block_offset < in->block_length) {
        if (bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0)
            goto fail;
        if (bgzf_flush(fp) < 0)
            goto fail;
    }
    // Everything else is copied in compressed form.
    while ((len = bgzf_raw_read(in, buf, BUF_SIZE)) > 0) {
        if (bgzf_raw_write(fp, buf, len) != len)
            goto fail;
    }
    if (len < 0)
        goto fail;
    free(buf);
    // Reset the offsets so that bgzf_close() does not see stale buffered data.
    fp->block_offset = in->block_offset = 0;
    bgzf_close(fp);
    return 0;

fail:
    if (sh) sam_hdr_free(sh);
    if (fp) bgzf_close(fp);
    free(buf);
    return -1;
}
Esempio n. 5
0
/*
 * Copy the BAM stream `in` to the file descriptor `fd`, writing `h` in place
 * of the header found in `in`.
 *
 * The old header is read (and thereby skipped), `h` is written through a new
 * BGZF writer, the uncompressed remainder of the block currently held by `in`
 * is re-compressed, and the rest of the input is copied with the raw
 * (compressed) BGZF read/write calls.
 *
 * Returns 0 on success, -1 if `in` is opened for writing, if an allocation /
 * open / header read fails, or if a read or write fails.
 *
 * NOTE(review): the header object returned by bam_header_read() is not
 * released in this function -- confirm who owns it and free it if nobody does.
 */
int bam_reheader(BGZF *in, const bam_header_t *h, int fd)
{
    BGZF *fp;
    bam_header_t *old;
    ssize_t len;
    int ret = 0;
    uint8_t *buf;
    if (in->is_write) return -1;
    buf = malloc(BUF_SIZE);
    if (buf == NULL) return -1;
    /* Reading the old header moves `in` past it. */
    old = bam_header_read(in);
    if (old == NULL) {
        free(buf);
        return -1;
    }
    fp = bgzf_fdopen(fd, "w");
    if (fp == NULL) {
        free(buf);
        return -1;
    }
    bam_header_write(fp, h);
    /* Records that shared a BGZF block with the old header must be re-compressed. */
    if (in->block_offset < in->block_length) {
        if (bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0
            || bgzf_flush(fp) < 0)
            ret = -1;
    }
    if (ret == 0) {
        while ((len = bgzf_raw_read(in, buf, BUF_SIZE)) > 0) {
            if (bgzf_raw_write(fp, buf, len) != len) {
                ret = -1;
                break;
            }
        }
        if (len < 0) ret = -1;  /* read error */
    }
    free(buf);
    /* Reset the offsets so that bgzf_close() does not see stale buffered data. */
    fp->block_offset = in->block_offset = 0;
    bgzf_close(fp);
    return ret;
}
Esempio n. 6
0
/*
 * Serialise `header` to `fp` in binary BAM layout and flush the stream:
 *   4 bytes  "BAM\1"
 *   4 bytes  l_text, followed by l_text bytes of text (if any)
 *   4 bytes  n_targets
 *   per target: 4-byte name length (including the NUL), the name with its
 *               NUL, 4-byte target length
 * Every 4-byte integer is byte-swapped first when bam_is_be is set.
 * Always returns 0.
 */
int bam_header_write(bamFile fp, const bam_header_t *header)
{
	char magic[4] = { 'B', 'A', 'M', '\001' };
	uint32_t word;
	int32_t idx;

	bam_write(fp, magic, 4);

	/* plain text and the number of reference sequences */
	word = bam_is_be ? bam_swap_endian_4(header->l_text) : (uint32_t)header->l_text;
	bam_write(fp, &word, 4);
	if (header->l_text) bam_write(fp, header->text, header->l_text);
	word = bam_is_be ? bam_swap_endian_4(header->n_targets) : (uint32_t)header->n_targets;
	bam_write(fp, &word, 4);

	/* sequence names and lengths */
	for (idx = 0; idx != header->n_targets; ++idx) {
		char *name = header->target_name[idx];
		int32_t name_len = strlen(name) + 1;

		word = bam_is_be ? bam_swap_endian_4(name_len) : (uint32_t)name_len;
		bam_write(fp, &word, 4);
		bam_write(fp, name, name_len);

		word = bam_is_be ? bam_swap_endian_4(header->target_len[idx]) : (uint32_t)header->target_len[idx];
		bam_write(fp, &word, 4);
	}
	bgzf_flush(fp);
	return 0;
}
Esempio n. 7
0
/*
 * Concatenate the BGZF-compressed BCF files args->fnames[0..nfnames-1] into
 * args->output_fname without recompressing the record blocks.
 *
 * For every input the magic string and header text are read; only the header
 * of the first file is written.  Data that shared a BGZF block with the
 * header is re-compressed, the remainder is copied raw.  The last 28 bytes of
 * each input are held back and dropped when they equal the BGZF EOF marker.
 * All failures are reported through error().
 */
static void naive_concat(args_t *args)
{
    // only compressed BCF atm
    BGZF *bgzf_out = bgzf_open(args->output_fname,"w");
    if ( !bgzf_out ) error("Failed to open: %s\n", args->output_fname);

    const size_t page_size = 32768;
    char *buf = (char*) malloc(page_size);
    if ( !buf ) error("Failed to allocate %d bytes\n", (int)page_size);
    kstring_t tmp = {0,0,0};
    int i;
    for (i=0; i<args->nfnames; i++)
    {
        htsFile *hts_fp = hts_open(args->fnames[i],"r");
        if ( !hts_fp ) error("Failed to open: %s\n", args->fnames[i]);
        htsFormat type = *hts_get_format(hts_fp);

        if ( type.format==vcf ) error("The --naive option currently works only for compressed BCFs, sorry :-/\n");
        if ( type.compression!=bgzf ) error("The --naive option currently works only for compressed BCFs, sorry :-/\n");

        BGZF *fp = hts_get_bgzfp(hts_fp);
        if ( !fp || bgzf_read_block(fp) != 0 || !fp->block_length )
            error("Failed to read %s: %s\n", args->fnames[i], strerror(errno));

        uint8_t magic[5];
        if ( bgzf_read(fp, magic, 5) != 5 ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
        if (strncmp((char*)magic, "BCF\2\2", 5) != 0) error("Invalid BCF magic string in %s\n", args->fnames[i]);

        // The header length is a 4-byte field: read it into a 4-byte object
        // instead of the low bytes of the size_t tmp.l.
        uint32_t hlen;
        if ( bgzf_read(fp, &hlen, 4) != 4 ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
        tmp.l = hlen;
        hts_expand(char,tmp.l,tmp.m,tmp.s);
        if ( bgzf_read(fp, tmp.s, tmp.l) != (ssize_t)tmp.l ) error("Failed to read the BCF header in %s\n", args->fnames[i]);

        // write only the first header
        if ( i==0 )
        {
            if ( bgzf_write(bgzf_out, "BCF\2\2", 5) !=5 ) error("Failed to write %d bytes to %s\n", 5,args->output_fname);
            if ( bgzf_write(bgzf_out, &hlen, 4) !=4 ) error("Failed to write %d bytes to %s\n", 4,args->output_fname);
            if ( bgzf_write(bgzf_out, tmp.s, tmp.l) != (ssize_t)tmp.l) error("Failed to write %d bytes to %s\n", (int)tmp.l,args->output_fname);
        }

        // Output all non-header data that were read together with the header block
        int nskip = fp->block_offset;
        if ( fp->block_length - nskip > 0 )
        {
            if ( bgzf_write(bgzf_out, fp->uncompressed_block+nskip, fp->block_length-nskip)<0 ) error("Error: %d\n",fp->errcode);
        }
        if ( bgzf_flush(bgzf_out)<0 ) error("Error: %d\n",bgzf_out->errcode);


        // Stream the rest of the file as it is, without recompressing, but remove BGZF EOF blocks
        ssize_t nread, ncached = 0, nwr;
        enum { neof = 28 };     // size of the BGZF EOF marker; a constant, so `cached` is not a VLA
        char cached[neof];
        while (1)
        {
            nread = bgzf_raw_read(fp, buf, page_size);

            // page_size boundary may occur in the middle of the EOF block, so we need to cache the blocks' ends
            if ( nread<=0 ) break;
            if ( nread<=neof )      // last block
            {
                if ( ncached )
                {
                    // flush the part of the cache that won't be needed
                    nwr = bgzf_raw_write(bgzf_out, cached, nread);
                    if (nwr != nread) error("Write failed, wrote %d instead of %d bytes.\n", (int)nwr,(int)nread);

                    // make space in the cache so that we can append to the end
                    if ( nread!=neof ) memmove(cached,cached+nread,neof-nread);
                }

                // fill the cache and check for eof outside this loop
                memcpy(cached+neof-nread,buf,nread);
                break;
            }

            // not the last block, flush the cache if full
            if ( ncached )
            {
                nwr = bgzf_raw_write(bgzf_out, cached, ncached);
                if (nwr != ncached) error("Write failed, wrote %d instead of %d bytes.\n", (int)nwr,(int)ncached);
                ncached = 0;
            }

            // fill the cache
            nread -= neof;
            memcpy(cached,buf+nread,neof);
            ncached = neof;

            nwr = bgzf_raw_write(bgzf_out, buf, nread);
            if (nwr != nread) error("Write failed, wrote %d instead of %d bytes.\n", (int)nwr,(int)nread);
        }
        // The held-back tail is written only when it is not the EOF marker.
        if ( ncached && memcmp(cached,"\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0",neof) )
        {
            nwr = bgzf_raw_write(bgzf_out, cached, neof);
            if (nwr != neof) error("Write failed, wrote %d instead of %d bytes.\n", (int)nwr,(int)neof);
        }
        if (hts_close(hts_fp)) error("Close failed: %s\n",args->fnames[i]);
    }
    free(buf);
    free(tmp.s);
    if (bgzf_close(bgzf_out) < 0) error("Error: %d\n",bgzf_out->errcode);
}
Esempio n. 8
0
File: main.c Progetto: Brainiarc7/TS
/*
 * Write to stdout a BGZF stream consisting of the text file `header`
 * followed by the body of the BGZF file `file`.
 *
 * The existing header of `file` is the leading run of lines whose first
 * character equals `meta`; it is skipped.  The new header gets a trailing
 * newline when its final short read does not end with one.  Body data
 * sharing a block with the old header is re-compressed; the rest is copied
 * byte-for-byte between the underlying file handles.
 *
 * Returns 0 on success, -1 if `file` cannot be opened or its first block
 * cannot be read; other failures go through error().
 */
int reheader_file(const char *header, const char *file, int meta)
{
    BGZF *fp = bgzf_open(file,"r");
    if ( !fp )
        return -1;
    if (bgzf_read_block(fp) != 0 || !fp->block_length)
    {
        bgzf_close(fp);
        return -1;
    }

    char *buffer = fp->uncompressed_block;
    int skip_until = 0;

    if ( buffer[0]==meta )
    {
        skip_until = 1;

        // Skip the header
        while (1)
        {
            if ( buffer[skip_until]=='\n' )
            {
                skip_until++;
                if ( skip_until>=fp->block_length )
                {
                    if (bgzf_read_block(fp) != 0 || !fp->block_length)
                        error("no body?\n");
                    skip_until = 0;
                }
                // The header has finished
                if ( buffer[skip_until]!=meta ) break;
            }
            skip_until++;
            if ( skip_until>=fp->block_length )
            {
                if (bgzf_read_block(fp) != 0 || !fp->block_length)
                    error("no body?\n");
                skip_until = 0;
            }
        }
    }

    FILE *fh = fopen(header,"r");
    if ( !fh )
        error("%s: %s", header,strerror(errno));
    int page_size = getpagesize();
    char *buf = valloc(page_size);
    if ( !buf )
        error("%s: %s", header,strerror(errno));
    BGZF *bgzf_out = bgzf_fdopen(fileno(stdout), "w");
    if ( !bgzf_out )
        error("%s: %s", "stdout",strerror(errno));
    ssize_t nread;
    // Read one byte less than the buffer holds so a newline can always be appended.
    while ( (nread=fread(buf,1,page_size-1,fh))>0 )
    {
        if ( nread<page_size-1 && buf[nread-1]!='\n' )
            buf[nread++] = '\n';
        if (bgzf_write(bgzf_out, buf, nread) < 0) error("Error: %s\n",bgzf_out->error);
    }
    fclose(fh);

    // Body data that was decompressed together with the old header
    if ( fp->block_length - skip_until > 0 )
    {
        // the failing handle is the writer, so report its error, not the reader's
        if (bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until) < 0)
            error("Error: %s\n",bgzf_out->error);
    }
    if (bgzf_flush(bgzf_out) < 0)
        error("Error: %s\n",bgzf_out->error);

    // Copy the remaining compressed blocks untouched
    while (1)
    {
#ifdef _USE_KNETFILE
        nread = knet_read(fp->x.fpr, buf, page_size);
#else
        nread = fread(buf, 1, page_size, fp->file);
#endif
        if ( nread<=0 )
            break;

#ifdef _USE_KNETFILE
        int count = fwrite(buf, 1, nread, bgzf_out->x.fpw);
#else
        int count = fwrite(buf, 1, nread, bgzf_out->file);
#endif
        if (count != nread)
            error("Write failed, wrote %d instead of %d bytes.\n", count,(int)nread);
    }

    if (bgzf_close(bgzf_out) < 0)
        error("Error: %s\n",bgzf_out->error);

    // Release the copy buffer and the input handle
    free(buf);
    bgzf_close(fp);

    return 0;
}
Esempio n. 9
0
/*
 * Concatenate the BAM files fn[0..nfn-1] into `outbam` ("-" means stdout)
 * without recompressing the record blocks.
 *
 * If `h` is non-NULL it is written as the output header, otherwise the
 * header of the first input is used.  For every input the header is read and
 * skipped, data sharing a BGZF block with it is re-compressed, and the rest
 * is copied raw.  The last BGZF_EMPTY_BLOCK_SIZE bytes of each input are held
 * back in `ebuf` and dropped when they look like an empty gzip block;
 * otherwise a warning is printed and they are written.
 *
 * Returns 0 on success, 1 if the output cannot be opened, -1 on any other
 * failure.  The buffer and the open handles are released on error paths.
 */
int bam_cat(int nfn, char * const *fn, const bam_header_t *h, const char* outbam)
{
    BGZF *fp;
    FILE* fp_file;
    uint8_t *buf;
    uint8_t ebuf[BGZF_EMPTY_BLOCK_SIZE];
    const int es=BGZF_EMPTY_BLOCK_SIZE;
    int i;

    fp = strcmp(outbam, "-")? bgzf_open(outbam, "w") : bgzf_fdopen(_fileno(stdout), "w");
    if (fp == 0) {
        fprintf(stderr, "[%s] ERROR: fail to open output file '%s'.\n", __FUNCTION__, outbam);
        return 1;
    }
    if (h) bam_header_write(fp, h);

    buf = (uint8_t*) malloc(BUF_SIZE);
    if (buf == 0) {
        fprintf(stderr, "[%s] ERROR: out of memory.\n", __FUNCTION__);
        bgzf_close(fp);
        return -1;
    }
    for(i = 0; i < nfn; ++i){
        BGZF *in;
        bam_header_t *old;
        int len,j;

        in = strcmp(fn[i], "-")? bam_open(fn[i], "r") : bam_dopen(_fileno(stdin), "r");
        if (in == 0) {
            fprintf(stderr, "[%s] ERROR: fail to open file '%s'.\n", __FUNCTION__, fn[i]);
            free(buf);
            bgzf_close(fp);
            return -1;
        }
        if (in->open_mode != 'r') {
            bgzf_close(in);
            free(buf);
            bgzf_close(fp);
            return -1;
        }

        old = bam_header_read(in);
        if (old == 0) {
            fprintf(stderr, "[%s] ERROR: fail to read the header from '%s'.\n", __FUNCTION__, fn[i]);
            bgzf_close(in);
            free(buf);
            bgzf_close(fp);
            return -1;
        }
        if (h == 0 && i == 0) bam_header_write(fp, old);

        /* records that shared a block with the header must be re-compressed */
        if (in->block_offset < in->block_length) {
            bgzf_write(fp, (uint8_t*)in->uncompressed_block + in->block_offset, in->block_length - in->block_offset);
            bgzf_flush(fp);
        }

        /* j becomes 1 once ebuf holds the trailing `es` bytes seen so far */
        j=0;
#ifdef _USE_KNETFILE
        fp_file=fp->x.fpw;
        while ((len = knet_read(in->x.fpr, buf, BUF_SIZE)) > 0) {
#else
        fp_file=fp->file;
        while (!feof(in->file) && (len = fread(buf, 1, BUF_SIZE, in->file)) > 0) {
#endif
            if(len<es){
                int diff=es-len;
                if(j==0) {
                    fprintf(stderr, "[%s] ERROR: truncated file?: '%s'.\n", __FUNCTION__, fn[i]);
                    bam_header_destroy(old);
                    bgzf_close(in);
                    free(buf);
                    bgzf_close(fp);
                    return -1;
                }
                fwrite(ebuf, 1, len, fp_file);
                /* source and destination overlap whenever diff > len */
                memmove(ebuf,ebuf+len,diff);
                memcpy(ebuf+diff,buf,len);
            } else {
                if(j!=0) fwrite(ebuf, 1, es, fp_file);
                len-= es;
                memcpy(ebuf,buf+len,es);
                fwrite(buf, 1, len, fp_file);
            }
            j=1;
        }

        /* nothing followed the header: ebuf was never filled, so do not inspect it */
        if (j == 0) {
            fprintf(stderr, "[%s] ERROR: truncated file?: '%s'.\n", __FUNCTION__, fn[i]);
            bam_header_destroy(old);
            bgzf_close(in);
            free(buf);
            bgzf_close(fp);
            return -1;
        }

        /* check final gzip block */
        {
            const uint8_t gzip1=ebuf[0];
            const uint8_t gzip2=ebuf[1];
            uint32_t isize;
            /* memcpy avoids an unaligned, type-punned load from the byte buffer */
            memcpy(&isize, ebuf+es-4, sizeof(isize));
            if(((gzip1!=GZIPID1) || (gzip2!=GZIPID2)) || (isize!=0)) {
                fprintf(stderr, "[%s] WARNING: Unexpected block structure in file '%s'.", __FUNCTION__, fn[i]);
                fprintf(stderr, " Possible output corruption.\n");
                fwrite(ebuf, 1, es, fp_file);
            }
        }
        bam_header_destroy(old);
        bgzf_close(in);
    }
    free(buf);
    bgzf_close(fp);
    return 0;
}



/*
 * Command-line entry point for "samtools cat".
 *
 *   -h FILE  read the output header from the SAM file FILE
 *   -o FILE  write to FILE instead of stdout
 *
 * At least two input BAM files are required.  Returns 1 on a usage error or
 * when the header file cannot be opened, otherwise the result of bam_cat().
 *
 * NOTE(review): the header read for -h is never released in this function --
 * confirm whether it should be destroyed after bam_cat() returns.
 */
int main_cat(int argc, char *argv[])
{
    bam_header_t *h = 0;
    char *outfn = 0;
    int c, ret;
    while ((c = getopt(argc, argv, "h:o:")) >= 0) {
        switch (c) {
            case 'h': {
                tamFile fph = sam_open(optarg);
                if (fph == 0) {
                    /* report the file that actually failed, not whatever argv[1] happens to be */
                    fprintf(stderr, "[%s] ERROR: fail to read the header from '%s'.\n", __FUNCTION__, optarg);
                    free(outfn);
                    return 1;
                }
                h = sam_header_read(fph);
                sam_close(fph);
                break;
            }
            case 'o':
                free(outfn);    /* a repeated -o replaces the previous name */
                outfn = strdup(optarg);
                break;
        }
    }
    if (argc - optind < 2) {
        fprintf(stderr, "Usage: samtools cat [-h header.sam] [-o out.bam] <in1.bam> <in2.bam> [...]\n");
        free(outfn);
        return 1;
    }
    ret = bam_cat(argc - optind, argv + optind, h, outfn? outfn : "-");
    free(outfn);
    return ret;
}
Esempio n. 10
0
// Program entry point. In order, as far as this function shows:
//   1. read the parameters; in genomeGenerate mode build the genome and exit
//   2. load the genome (and return early for the LoadAndExit / Remove modes)
//   3. optionally insert junctions, then run the optional 1st pass of 2-pass mapping
//   4. build the SAM/BAM headers and open the output streams
//   5. spawn the mapping threads (twice when outFilterBySJoutStage==1)
//   6. quantification output, chunk concatenation, coordinate sorting of the BAM bins
//   7. wiggle output, junction output, final logs and clean-up
// Returns 0 on success; fatal conditions leave through exit() or exitWithError().
int main(int argInN, char* argIn[]) {

    time(&g_statsAll.timeStart);

    Parameters *P = new Parameters; //all parameters

    P->inputParameters(argInN, argIn);

    *(P->inOut->logStdOut) << timeMonthDayTime(g_statsAll.timeStart) << " ..... Started STAR run\n" <<flush;

    //generate genome
    if (P->runMode=="genomeGenerate") {
        genomeGenerate(P);
        (void) sysRemoveDir (P->outFileTmp);
        P->inOut->logMain << "DONE: Genome generation, EXITING\n" << flush;
        exit(0);
    } else if (P->runMode!="alignReads") {
        P->inOut->logMain << "EXITING because of INPUT ERROR: unknown value of input parameter runMode=" <<P->runMode<<endl<<flush;
        exit(1);
    };

    Genome mainGenome (P);
    mainGenome.genomeLoad();
    if (P->genomeLoad=="LoadAndExit" || P->genomeLoad=="Remove")
    {
        return 0;
    };

    P->twoPass.pass2=false; //this is the 1st pass

    SjdbClass sjdbLoci;

    if (P->sjdbInsert.pass1)
    {
        Parameters *P1=new Parameters;
        *P1=*P;
        sjdbInsertJunctions(P, P1, mainGenome, sjdbLoci);
    };

    //calculate genome-related parameters
    Transcriptome *mainTranscriptome=NULL;


/////////////////////////////////////////////////////////////////////////////////////////////////START
    if (P->runThreadN>1) {
        g_threadChunks.threadArray=new pthread_t[P->runThreadN];
        pthread_mutex_init(&g_threadChunks.mutexInRead, NULL);
        pthread_mutex_init(&g_threadChunks.mutexOutSAM, NULL);
        pthread_mutex_init(&g_threadChunks.mutexOutBAM1, NULL);
        pthread_mutex_init(&g_threadChunks.mutexOutUnmappedFastx, NULL);
        pthread_mutex_init(&g_threadChunks.mutexOutFilterBySJout, NULL);
        pthread_mutex_init(&g_threadChunks.mutexStats, NULL);
        pthread_mutex_init(&g_threadChunks.mutexBAMsortBins, NULL);
    };

    g_statsAll.progressReportHeader(P->inOut->logProgress);

    if (P->twoPass.yes) {//2-pass
        //re-define P for the pass1

        Parameters *P1=new Parameters;
        *P1=*P;
        //turn off unnecessary calculations
        P1->outSAMtype[0]="None";
        P1->outSAMbool=false;
        P1->outBAMunsorted=false;
        P1->outBAMcoord=false;

        P1->chimSegmentMin=0;

        P1->quant.yes=false;
        P1->quant.trSAM.yes=false;
        P1->quant.geCount.yes=false;

        P1->outFilterBySJoutStage=0;

        P1->outReadsUnmapped="None";

        P1->outFileNamePrefix=P->twoPass.dir;

        P1->readMapNumber=P->twoPass.pass1readsN;
//         P1->inOut->logMain.open((P1->outFileNamePrefix + "Log.out").c_str());

        g_statsAll.resetN();
        time(&g_statsAll.timeStartMap);
        P->inOut->logProgress << timeMonthDayTime(g_statsAll.timeStartMap) <<"\tStarted 1st pass mapping\n" <<flush;
        *P->inOut->logStdOut << timeMonthDayTime(g_statsAll.timeStartMap) << " ..... Started 1st pass mapping\n" <<flush;

        //run mapping for Pass1
        ReadAlignChunk *RAchunk1[P->runThreadN];
        for (int ii=0;ii<P1->runThreadN;ii++) {
            RAchunk1[ii]=new ReadAlignChunk(P1, mainGenome, mainTranscriptome, ii);
        };
        mapThreadsSpawn(P1, RAchunk1);
        outputSJ(RAchunk1,P1); //collapse and output junctions
//         for (int ii=0;ii<P1->runThreadN;ii++) {
//             delete [] RAchunk[ii];
//         };

        time_t rawtime; time (&rawtime);
        P->inOut->logProgress << timeMonthDayTime(rawtime) <<"\tFinished 1st pass mapping\n";
        *P->inOut->logStdOut << timeMonthDayTime(rawtime) << " ..... Finished 1st pass mapping\n" <<flush;
        ofstream logFinal1 ( (P->twoPass.dir + "/Log.final.out").c_str());
        g_statsAll.reportFinal(logFinal1,P1);

        P->twoPass.pass2=true;//starting the 2nd pass
        P->twoPass.pass1sjFile=P->twoPass.dir+"/SJ.out.tab";

        sjdbInsertJunctions(P, P1, mainGenome, sjdbLoci);

        //reopen reads files
        P->closeReadsFiles();
        P->openReadsFiles();
    } else {//not 2-pass
        //nothing for now
    };

    if ( P->quant.yes ) {//load transcriptome
        mainTranscriptome=new Transcriptome(P);
    };

    //initialize Stats
    g_statsAll.resetN();
    time(&g_statsAll.timeStartMap);
    *P->inOut->logStdOut << timeMonthDayTime(g_statsAll.timeStartMap) << " ..... Started mapping\n" <<flush;

    g_statsAll.timeLastReport=g_statsAll.timeStartMap;

    //open SAM/BAM files for output
    if (P->outSAMmode != "None") {//open SAM file and write header
        ostringstream samHeaderStream;

        for (uint ii=0;ii<P->nChrReal;ii++) {
            samHeaderStream << "@SQ\tSN:"<< P->chrName.at(ii) <<"\tLN:"<<P->chrLength[ii]<<"\n";
        };

        if (P->outSAMheaderPG.at(0)!="-") {
            samHeaderStream << P->outSAMheaderPG.at(0);
            for (uint ii=1;ii<P->outSAMheaderPG.size(); ii++) {
                samHeaderStream << "\t" << P->outSAMheaderPG.at(ii);
            };
            samHeaderStream << "\n";
        };

        samHeaderStream << "@PG\tID:STAR\tPN:STAR\tVN:" << STAR_VERSION <<"\tCL:" << P->commandLineFull <<"\n";

        if (P->outSAMheaderCommentFile!="-") {
            ifstream comstream (P->outSAMheaderCommentFile);
            while (comstream.good()) {
                string line1;
                getline(comstream,line1);
                if (line1.find_first_not_of(" \t\n\v\f\r")!=std::string::npos) {//skip blank lines
                    samHeaderStream << line1 <<"\n";
                };
            };
        };


        for (uint32 ii=0;ii<P->outSAMattrRGlineSplit.size();ii++) {//@RG lines
            samHeaderStream << "@RG\t" << P->outSAMattrRGlineSplit.at(ii) <<"\n";
        };

        samHeaderStream <<  "@CO\t" <<"user command line: " << P->commandLine <<"\n";

        if (P->outSAMheaderHD.at(0)!="-") {
            P->samHeaderHD = P->outSAMheaderHD.at(0);
            for (uint ii=1;ii<P->outSAMheaderHD.size(); ii++) {
                P->samHeaderHD +="\t" + P->outSAMheaderHD.at(ii);
            };
        } else {
            P->samHeaderHD = "@HD\tVN:1.4";
        };


        P->samHeader=P->samHeaderHD+"\n"+samHeaderStream.str();
        //for the sorted BAM, need to add SO:coordinate to the header line
        P->samHeaderSortedCoord=P->samHeaderHD + (P->outSAMheaderHD.size()==0 ? "" : "\tSO:coordinate") + "\n" + samHeaderStream.str();

        if (P->outSAMbool) {//
            *P->inOut->outSAM << P->samHeader;
        };
        if (P->outBAMunsorted){
            outBAMwriteHeader(P->inOut->outBAMfileUnsorted,P->samHeader,P->chrName,P->chrLength);
        };
//             if (P->outBAMcoord){
//                 outBAMwriteHeader(P->inOut->outBAMfileCoord,P->samHeader,P->chrName,P->chrLength);
//             };

        if ( P->quant.trSAM.yes ) {
            samHeaderStream.str("");
            vector <uint> trlength;
            for (uint32 ii=0;ii<mainTranscriptome->trID.size();ii++) {
                uint32 iex1=mainTranscriptome->trExI[ii]+mainTranscriptome->trExN[ii]-1; //last exon of the transcript
                trlength.push_back(mainTranscriptome->exLenCum[iex1]+mainTranscriptome->exSE[2*iex1+1]-mainTranscriptome->exSE[2*iex1]+1);
                samHeaderStream << "@SQ\tSN:"<< mainTranscriptome->trID.at(ii) <<"\tLN:"<<trlength.back()<<"\n";
            };
            for (uint32 ii=0;ii<P->outSAMattrRGlineSplit.size();ii++) {//@RG lines
                samHeaderStream << "@RG\t" << P->outSAMattrRGlineSplit.at(ii) <<"\n";
            };
            outBAMwriteHeader(P->inOut->outQuantBAMfile,samHeaderStream.str(),mainTranscriptome->trID,trlength);
        };

    };

    if (P->chimSegmentMin>0) {
        P->inOut->outChimJunction.open((P->outFileNamePrefix + "Chimeric.out.junction").c_str());
        P->inOut->outChimSAM.open((P->outFileNamePrefix + "Chimeric.out.sam").c_str());
        P->inOut->outChimSAM << P->samHeader;
        pthread_mutex_init(&g_threadChunks.mutexOutChimSAM, NULL);
        pthread_mutex_init(&g_threadChunks.mutexOutChimJunction, NULL);
    };

    // P->inOut->logMain << "mlock value="<<mlockall(MCL_CURRENT|MCL_FUTURE) <<"\n"<<flush;

    // prepare chunks and spawn mapping threads
    ReadAlignChunk *RAchunk[P->runThreadN];
    for (int ii=0;ii<P->runThreadN;ii++) {
        RAchunk[ii]=new ReadAlignChunk(P, mainGenome, mainTranscriptome, ii);
    };

    mapThreadsSpawn(P, RAchunk);

    if (P->outFilterBySJoutStage==1) {//completed stage 1, go to stage 2
        P->inOut->logMain << "Completed stage 1 mapping of outFilterBySJout mapping\n"<<flush;
        outputSJ(RAchunk,P);//collapse novel junctions
        P->readFilesIndex=-1;


        P->outFilterBySJoutStage=2;
        if (P->outBAMcoord) {
            for (int it=0; it<P->runThreadN; it++) {//prepare the unmapped bin
                RAchunk[it]->chunkOutBAMcoord->coordUnmappedPrepareBySJout();
            };
        };

        mapThreadsSpawn(P, RAchunk);
    };

    //close some BAM files
    if (P->inOut->outBAMfileUnsorted!=NULL) {
        bgzf_flush(P->inOut->outBAMfileUnsorted);
        bgzf_close(P->inOut->outBAMfileUnsorted);
    };
    if (P->inOut->outQuantBAMfile!=NULL) {
        bgzf_flush(P->inOut->outQuantBAMfile);
        bgzf_close(P->inOut->outQuantBAMfile);
    };

    if (P->outBAMcoord && P->limitBAMsortRAM==0) {//make it equal to the genome size
        P->limitBAMsortRAM=P->nGenome+mainGenome.SA.lengthByte+mainGenome.SAi.lengthByte;
    };

    //no need for genome anymore, free the memory
    mainGenome.freeMemory();

    if ( P->quant.geCount.yes )
    {//output gene quantifications
        for (int ichunk=1; ichunk<P->runThreadN; ichunk++)
        {//sum counts from all chunks into 0th chunk
            RAchunk[0]->chunkTr->quants->addQuants(*(RAchunk[ichunk]->chunkTr->quants));
        };
        RAchunk[0]->chunkTr->quantsOutput();
    };

    if (P->runThreadN>1 && P->outSAMorder=="PairedKeepInputOrder") {//concatenate Aligned.* files
        RAchunk[0]->chunkFilesCat(P->inOut->outSAM, P->outFileTmp + "/Aligned.out.sam.chunk", g_threadChunks.chunkOutN);
    };


    if (P->outBAMcoord) {//sort BAM if needed
        *P->inOut->logStdOut << timeMonthDayTime() << " ..... Started sorting BAM\n" <<flush;
        P->inOut->logMain << timeMonthDayTime() << " ..... Started sorting BAM\n" <<flush;
        uint32 nBins=P->outBAMcoordNbins;

        //check max size needed for sorting
        uint maxMem=0;
        for (uint32 ibin=0; ibin<nBins-1; ibin++) {//check all bins except the last one
            uint binS=0;
            for (int it=0; it<P->runThreadN; it++) {//collect sizes from threads
                binS += RAchunk[it]->chunkOutBAMcoord->binTotalBytes[ibin]+24*RAchunk[it]->chunkOutBAMcoord->binTotalN[ibin];
            };
            if (binS>maxMem) maxMem=binS;
        };
        P->inOut->logMain << "Max memory needed for sorting = "<<maxMem<<endl;
        if (maxMem>P->limitBAMsortRAM) {
            ostringstream errOut;
            errOut <<"EXITING because of fatal ERROR: not enough memory for BAM sorting: \n";
            errOut <<"SOLUTION: re-run STAR with at least --limitBAMsortRAM " <<maxMem+1000000000;
            exitWithError(errOut.str(), std::cerr, P->inOut->logMain, EXIT_CODE_PARAMETER, *P);
        };


        uint totalMem=0;
//         P->inOut->logMain << "Started sorting BAM ..." <<endl;
        #pragma omp parallel num_threads(P->outBAMsortingThreadNactual)
        #pragma omp for schedule (dynamic,1)
        for (uint32 ibin1=0; ibin1<nBins; ibin1++) {
            uint32 ibin=nBins-1-ibin1;//reverse order to start with the last bin - unmapped reads

            uint binN=0, binS=0;
            for (int it=0; it<P->runThreadN; it++) {//collect sizes from threads
                binN += RAchunk[it]->chunkOutBAMcoord->binTotalN[ibin];
                binS += RAchunk[it]->chunkOutBAMcoord->binTotalBytes[ibin];
            };

            if (binS==0) continue; //empty bin

            if (ibin == nBins-1) {//last bin for unmapped reads
                BAMbinSortUnmapped(ibin,P->runThreadN,P->outBAMsortTmpDir,P->inOut->outBAMfileCoord, P);
            } else {
                uint newMem=binS+binN*24;
                bool boolWait=true;
                while (boolWait) {//wait until this bin fits under the RAM limit
                    #pragma omp critical
                    if (totalMem+newMem < P->limitBAMsortRAM) {
                        boolWait=false;
                        totalMem+=newMem;
                    };
                    //sleep() takes whole seconds, so sleep(0.1) was sleep(0), i.e. a busy spin;
                    //pause 0.1 s, and only while the bin still has to wait
                    if (boolWait) usleep(100000);
                };
                BAMbinSortByCoordinate(ibin,binN,binS,P->runThreadN,P->outBAMsortTmpDir,P->inOut->outBAMfileCoord, P);
                #pragma omp critical
                totalMem-=newMem;//"release" RAM
            };
        };
        //concatenate all BAM files, using bam_cat
        char **bamBinNames = new char* [nBins];
        vector <string> bamBinNamesV;
        for (uint32 ibin=0; ibin<nBins; ibin++) {

            bamBinNamesV.push_back(P->outBAMsortTmpDir+"/b"+to_string((uint) ibin));
            struct stat buffer;
            if (stat (bamBinNamesV.back().c_str(), &buffer) != 0) {//check if file exists
                bamBinNamesV.pop_back();
            };
        };
        for (uint32 ibin=0; ibin<bamBinNamesV.size(); ibin++) {
                bamBinNames[ibin] = (char*) bamBinNamesV.at(ibin).c_str();
        };
        bam_cat(bamBinNamesV.size(), bamBinNames, 0, P->outBAMfileCoordName.c_str());
    };
    //wiggle output
    if (P->outWigFlags.yes) {
        *P->inOut->logStdOut << timeMonthDayTime() << " ..... Started wiggle output\n" <<flush;
        P->inOut->logMain << timeMonthDayTime() << " ..... Started wiggle output\n" <<flush;
        string wigOutFileNamePrefix=P->outFileNamePrefix + "Signal";
        signalFromBAM(P->outBAMfileCoordName, wigOutFileNamePrefix, *P);
    };

    //aggregate output junctions
    //collapse splice junctions from different threads/chunks, and output them
    outputSJ(RAchunk,P);

    g_statsAll.progressReport(P->inOut->logProgress);
    P->inOut->logProgress  << "ALL DONE!\n"<<flush;
    P->inOut->logFinal.open((P->outFileNamePrefix + "Log.final.out").c_str());
    g_statsAll.reportFinal(P->inOut->logFinal,P);
    *P->inOut->logStdOut << timeMonthDayTime(g_statsAll.timeFinish) << " ..... Finished successfully\n" <<flush;

    P->inOut->logMain  << "ALL DONE!\n"<<flush;
    sysRemoveDir (P->outFileTmp);

    P->closeReadsFiles();//this will kill the readFilesCommand processes if necessary
    //NOTE(review): mainGenome is an automatic object, so its destructor also runs again at the return below - confirm ~Genome tolerates that
    mainGenome.~Genome(); //need explicit call because of the 'delete P->inOut' below, which will destroy P->inOut->logStdOut

    delete P->inOut; //to close files
    delete P;

    return 0;
};
Esempio n. 11
0
File: tabix.c Progetto: Illumina/akt
/**
 * Replace the leading header lines of a BGZF-compressed text file with the
 * contents of another file, writing the result through bgzf_open("-", "w").
 *
 * Header lines are the leading lines that start with conf->meta_char. The
 * remainder of the block in which the header ends is re-compressed; every
 * following block is copied through bgzf_raw_read()/bgzf_raw_write() without
 * being decompressed.
 *
 * @param fname   path of the BGZF-compressed input file
 * @param header  path of the plain-text file holding the replacement header
 * @param ftype   file type flags; only IS_TXT (or 0) is handled
 * @param conf    supplies meta_char, the character that starts a header line
 * @return 0 on success, -1 if the input cannot be opened or its first block
 *         cannot be read. Other failures are reported through error().
 *
 * NOTE(review): error() is assumed not to return (the code after each call
 * relies on that) -- confirm against its definition.
 */
int reheader_file(const char *fname, const char *header, int ftype, tbx_conf_t *conf)
{
    if ( (ftype & IS_TXT) || !ftype )
    {
        BGZF *fp = bgzf_open(fname,"r");
        if ( !fp ) return -1;
        if ( bgzf_read_block(fp) != 0 || !fp->block_length )
        {
            // The caller only receives -1, so the handle must be released here.
            bgzf_close(fp);
            return -1;
        }

        char *buffer = fp->uncompressed_block;
        int skip_until = 0;

        // Skip the header: find out the position of the data block
        if ( buffer[0]==conf->meta_char )
        {
            skip_until = 1;
            while (1)
            {
                if ( buffer[skip_until]=='\n' )
                {
                    skip_until++;
                    if ( skip_until>=fp->block_length )
                    {
                        // The line ended exactly at the block boundary: continue in the next block
                        if ( bgzf_read_block(fp) != 0 || !fp->block_length ) error("FIXME: No body in the file: %s\n", fname);
                        skip_until = 0;
                    }
                    // The header has finished
                    if ( buffer[skip_until]!=conf->meta_char ) break;
                }
                skip_until++;
                if ( skip_until>=fp->block_length )
                {
                    if (bgzf_read_block(fp) != 0 || !fp->block_length) error("FIXME: No body in the file: %s\n", fname);
                    skip_until = 0;
                }
            }
        }

        // Output the new header
        FILE *hdr  = fopen(header,"r");
        if ( !hdr ) error("%s: %s", header,strerror(errno));
        const size_t page_size = 32768;
        char *buf = malloc(page_size);
        if ( !buf ) error("Failed to allocate %d bytes\n", (int)page_size);
        BGZF *bgzf_out = bgzf_open("-", "w");
        if ( !bgzf_out ) error("Failed to open the output stream for writing\n");

        // Remember the last byte written so that a missing final newline can be
        // appended regardless of how the header length relates to the chunk size.
        char last_char = '\n';
        size_t nhdr;
        while ( (nhdr=fread(buf,1,page_size,hdr))>0 )
        {
            if (bgzf_write(bgzf_out, buf, nhdr) < 0) error("Error: %d\n",bgzf_out->errcode);
            last_char = buf[nhdr-1];
        }
        if ( last_char!='\n' && bgzf_write(bgzf_out, "\n", 1) < 0 ) error("Error: %d\n",bgzf_out->errcode);
        if ( fclose(hdr) ) error("close failed: %s\n", header);

        // Output all remaining data read with the header block
        if ( fp->block_length - skip_until > 0 )
        {
            // The write goes to bgzf_out, so that is the handle whose errcode is reported
            if (bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until) < 0) error("Error: %d\n",bgzf_out->errcode);
        }
        if (bgzf_flush(bgzf_out) < 0) error("Error: %d\n",bgzf_out->errcode);

        // Copy the remaining compressed blocks verbatim
        ssize_t nread;
        while (1)
        {
            nread = bgzf_raw_read(fp, buf, page_size);
            if ( nread<=0 ) break;

            int count = bgzf_raw_write(bgzf_out, buf, nread);
            if (count != nread) error("Write failed, wrote %d instead of %d bytes.\n", count,(int)nread);
        }
        if (bgzf_close(bgzf_out) < 0) error("Error: %d\n",bgzf_out->errcode);
        if (bgzf_close(fp) < 0) error("Error: %d\n",fp->errcode);
        free(buf);
    }
    else
        error("todo: reheader BCF, BAM\n");  // BCF is difficult, records contain pointers to the header.
    return 0;
}
Esempio n. 12
0
/**
 * Rewrite the header of the BGZF-compressed VCF named by args->fname and
 * write the result to standard output.
 *
 * The existing header (the leading lines starting with '#') is collected into
 * a kstring. It is replaced by the contents of args->header_fname when that is
 * set, and passed through set_samples() when args->samples_fname is set. The
 * remainder of the block in which the header ends is re-compressed; all later
 * blocks are copied with bgzf_raw_read()/bgzf_raw_write() without being
 * decompressed.
 *
 * All failures are reported through error().
 * NOTE(review): error() is assumed not to return -- confirm against its definition.
 */
static void reheader_vcf_gz(args_t *args)
{
    BGZF *fp = bgzf_open(args->fname,"r");
    if ( !fp || bgzf_read_block(fp) != 0 || !fp->block_length )
        error("Failed to read %s: %s\n", args->fname, strerror(errno));

    kstring_t hdr = {0,0,0};
    char *buffer = (char*) fp->uncompressed_block;

    // Read the header and find the position of the data block
    if ( buffer[0]!='#' ) error("Could not parse the header, expected '#', found '%c'\n", buffer[0]);

    int skip_until = 1;     // end of the header in the current uncompressed block
    while (1)
    {
        if ( buffer[skip_until]=='\n' )
        {
            skip_until++;
            if ( skip_until>=fp->block_length )
            {
                // The line ended at the block boundary: keep this block and load the next
                kputsn(buffer,skip_until,&hdr);
                if ( bgzf_read_block(fp) != 0 || !fp->block_length ) error("FIXME: No body in the file: %s\n", args->fname);
                skip_until = 0;
            }
            // The header has finished
            if ( buffer[skip_until]!='#' )
            {
                kputsn(buffer,skip_until,&hdr);
                break;
            }
        }
        skip_until++;
        if ( skip_until>=fp->block_length )
        {
            // The header continues past this block: keep all of it and load the next
            kputsn(buffer,fp->block_length,&hdr);
            if (bgzf_read_block(fp) != 0 || !fp->block_length) error("FIXME: No body in the file: %s\n", args->fname);
            skip_until = 0;
        }
    }

    int nsamples = 0;
    char **samples = NULL;
    if ( args->samples_fname )
        samples = hts_readlines(args->samples_fname, &nsamples);
    if ( args->header_fname )
    {
        // Discard the header read from the input and use the supplied one instead
        free(hdr.s); hdr.s = NULL; hdr.l = hdr.m = 0;
        read_header_file(args->header_fname, &hdr);
    }
    if ( samples )
    {
        set_samples(samples, nsamples, &hdr);
        int i;
        for (i=0; i<nsamples; i++) free(samples[i]);
        free(samples);
    }

    // Output the modified header
    BGZF *bgzf_out = bgzf_dopen(fileno(stdout), "w");
    if ( !bgzf_out ) error("Failed to open stdout for writing\n");
    if ( bgzf_write(bgzf_out, hdr.s, hdr.l)<0 ) error("Error: %d\n",bgzf_out->errcode);
    free(hdr.s);

    // Output all remaining data read with the header block
    if ( fp->block_length - skip_until > 0 )
    {
        // The write goes to bgzf_out, so that is the handle whose errcode is reported
        if ( bgzf_write(bgzf_out, buffer+skip_until, fp->block_length-skip_until)<0 ) error("Error: %d\n",bgzf_out->errcode);
    }
    if ( bgzf_flush(bgzf_out)<0 ) error("Error: %d\n",bgzf_out->errcode);

    // Stream the rest of the file as it is, without decompressing
    ssize_t nread;
    int page_size = getpagesize();
    char *buf = (char*) valloc(page_size);
    if ( !buf ) error("Failed to allocate %d bytes\n", page_size);
    while (1)
    {
        nread = bgzf_raw_read(fp, buf, page_size);
        if ( nread<=0 ) break;

        int count = bgzf_raw_write(bgzf_out, buf, nread);
        if (count != nread) error("Write failed, wrote %d instead of %d bytes.\n", count,(int)nread);
    }
    if (bgzf_close(bgzf_out) < 0) error("Error: %d\n",bgzf_out->errcode);
    if (bgzf_close(fp) < 0) error("Error: %d\n",fp->errcode);
    free(buf);
}
Esempio n. 13
0
/**
 * Concatenate BAM files at the BGZF block level.
 *
 * For each input, the header is read and the rest of the block that held it
 * is re-compressed into the output; all following compressed bytes are copied
 * with bgzf_raw_read()/bgzf_raw_write(). The last BGZF_EMPTY_BLOCK_SIZE bytes
 * of every input are held back in ebuf and dropped when they look like an
 * empty gzip block (GZIPID1/GZIPID2 magic, ISIZE of 0); otherwise they are
 * written with a warning.
 *
 * @param nfn     number of input files
 * @param fn      input file names; "-" selects stdin
 * @param h       header to write to the output, or NULL to reuse the header
 *                of the first input
 * @param outbam  output file name; "-" selects stdout
 * @return 0 on success, 1 if the output cannot be opened, -1 on any other
 *         failure. All handles and buffers are released on every path.
 */
int bam_cat(int nfn, char * const *fn, const bam_hdr_t *h, const char* outbam)
{
    BGZF *fp;
    BGZF *in = NULL;
    bam_hdr_t *old = NULL;
    uint8_t *buf = NULL;
    uint8_t ebuf[BGZF_EMPTY_BLOCK_SIZE];
    const int es=BGZF_EMPTY_BLOCK_SIZE;
    int i;
    int ret = -1;

    fp = strcmp(outbam, "-")? bgzf_open(outbam, "w") : bgzf_fdopen(fileno(stdout), "w");
    if (fp == 0) {
        fprintf(stderr, "[%s] ERROR: fail to open output file '%s'.\n", __func__, outbam);
        return 1;
    }
    if (h) bam_hdr_write(fp, h);

    buf = (uint8_t*) malloc(BUF_SIZE);
    if (buf == NULL) {
        fprintf(stderr, "[%s] ERROR: fail to allocate the copy buffer.\n", __func__);
        goto done;
    }
    for(i = 0; i < nfn; ++i){
        int len,j;

        in = strcmp(fn[i], "-")? bgzf_open(fn[i], "r") : bgzf_fdopen(fileno(stdin), "r");
        if (in == 0) {
            fprintf(stderr, "[%s] ERROR: fail to open file '%s'.\n", __func__, fn[i]);
            goto done;
        }
        if (in->is_write) goto done;

        old = bam_hdr_read(in);
        if (old == NULL) {
            fprintf(stderr, "[%s] ERROR: couldn't read header for '%s'.\n",
                    __func__, fn[i]);
            goto done;
        }
        if (h == 0 && i == 0) bam_hdr_write(fp, old);

        /* Re-compress whatever follows the header in the already-decoded block. */
        if (in->block_offset < in->block_length) {
            if (bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset) < 0
                || bgzf_flush(fp) < 0) goto write_fail;
        }

        /* j records whether ebuf currently holds held-back bytes of this file. */
        j=0;
        while ((len = bgzf_raw_read(in, buf, BUF_SIZE)) > 0) {
            if(len<es){
                int diff=es-len;
                if(j==0) {
                    fprintf(stderr, "[%s] ERROR: truncated file?: '%s'.\n", __func__, fn[i]);
                    goto done;
                }
                /* Release the oldest len bytes and slide the remainder down.
                   Source and destination overlap when len < diff, so memmove
                   is required here. */
                if (bgzf_raw_write(fp, ebuf, len) != len) goto write_fail;
                memmove(ebuf,ebuf+len,diff);
                memcpy(ebuf+diff,buf,len);
            } else {
                if(j!=0 && bgzf_raw_write(fp, ebuf, es) != es) goto write_fail;
                len-= es;
                memcpy(ebuf,buf+len,es);
                if (bgzf_raw_write(fp, buf, len) != len) goto write_fail;
            }
            j=1;
        }

        /* check final gzip block; when nothing was read, ebuf holds no bytes of
           this file (it is uninitialised or left over from the previous input) */
        if (j != 0) {
            const uint8_t gzip1=ebuf[0];
            const uint8_t gzip2=ebuf[1];
            uint32_t isize;
            /* ebuf has no uint32_t alignment guarantee: copy instead of casting.
               Only a comparison with 0 follows, so byte order does not matter. */
            memcpy(&isize, ebuf+es-4, sizeof isize);
            if(((gzip1!=GZIPID1) || (gzip2!=GZIPID2)) || (isize!=0)) {
                fprintf(stderr, "[%s] WARNING: Unexpected block structure in file '%s'.", __func__, fn[i]);
                fprintf(stderr, " Possible output corruption.\n");
                if (bgzf_raw_write(fp, ebuf, es) != es) goto write_fail;
            }
        }
        bam_hdr_destroy(old);
        old = NULL;
        bgzf_close(in);
        in = NULL;
    }
    ret = 0;
    goto done;

write_fail:
    fprintf(stderr, "[%s] ERROR: fail to write to output file '%s'.\n", __func__, outbam);

done:
    /* Single exit path: release whatever is still held. */
    if (old) bam_hdr_destroy(old);
    if (in) bgzf_close(in);
    free(buf);
    if (bgzf_close(fp) < 0) {
        if (ret == 0)
            fprintf(stderr, "[%s] ERROR: fail to close output file '%s'.\n", __func__, outbam);
        ret = -1;
    }
    return ret;
}