Exemplo n.º 1
0
/**
 * Read the next record from fBcfFile_ at the current file position.
 *
 * The record is expected to start with two 32-bit length fields, which are
 * stored into *l_shared and *l_indiv, followed by (*l_shared + *l_indiv)
 * payload bytes, which are stored into *line (resized to fit).
 *
 * @param l_shared  receives the first length field
 * @param l_indiv   receives the second length field
 * @param line      receives the payload bytes
 * @return number of payload bytes read, or -1 if any read was short/failed
 *         or the combined length does not fit into the int return type.
 */
int SingleChromosomeBCFIndex::nextLine(uint32_t* l_shared,
                                       uint32_t* l_indiv,
                                       std::vector<char>* line) {
  const int kLenBytes = sizeof(uint32_t);
  if (kLenBytes != bgzf_read(fBcfFile_, l_shared, sizeof(uint32_t)) ||
      kLenBytes != bgzf_read(fBcfFile_, l_indiv, sizeof(uint32_t))) {
    REprintf("readLine error!\n");
    // The length fields are not trustworthy; do not size a buffer from them.
    return -1;
  }

  // Sum in 64 bits so that two large 32-bit lengths cannot wrap around, and
  // reject anything that cannot be represented by the int return value.
  const uint64_t wideLen =
      static_cast<uint64_t>(*l_shared) + static_cast<uint64_t>(*l_indiv);
  if (wideLen > 0x7fffffffULL) {
    REprintf("readLine error!\n");
    return -1;
  }
  const uint32_t totalLen = static_cast<uint32_t>(wideLen);

  line->resize(totalLen);
  if (static_cast<ssize_t>(totalLen) !=
      bgzf_read(fBcfFile_, line->data(), totalLen)) {
    REprintf("readLine bgzf_read error!\n");
    return -1;
  }

  return static_cast<int>(totalLen);
}
Exemplo n.º 2
0
/*
 * Scan the whole stream and count the '@' bytes in it.
 *
 * The file is rewound before the scan and again afterwards, so the caller
 * gets the handle back positioned at offset 0. Every '@' byte is counted,
 * wherever it occurs in the data.
 */
size_t count_fastq_sequences(BGZF *fastq_file)
{
    char chunk[ BUFSIZ ];
    cmph_uint32 n_markers = 0;
    ssize_t got;

    bgzf_seek( fastq_file, 0, SEEK_SET );

    /* Pull the file through in BUFSIZ-sized pieces until EOF or an error. */
    for( got = bgzf_read( fastq_file, chunk, BUFSIZ );
         got > 0;
         got = bgzf_read( fastq_file, chunk, BUFSIZ ) )
    {
        const char *cursor = chunk;
        const char *stop = chunk + got;
        for( ; cursor != stop; ++cursor )
        {
            if( *cursor == '@' )
            {
                n_markers++;
            }
        }
    }

    bgzf_seek( fastq_file, 0, SEEK_SET );
    return n_markers;
}
Exemplo n.º 3
0
// Print the alignments of the first reader (rd[0]) as SAM text on stdout.
//
// rd       array of readers; only rd[0] is used here
// nFiles   not used by this function
// regions  if empty, every record of the file is printed; otherwise only the
//          records returned by the iterator for each region, in order
//
// Always returns 0. Stops early when SIG_COND becomes false.
int motherView(bufReader *rd,int nFiles,std::vector<regs>regions) {
  aRead b;
  b.vDat=new uint8_t[RLEN];
  kstring_t str;  str.s=NULL; str.l=str.m=0;
  
  if(regions.size()==0) {//print all
    int block_size;
    // Require a complete length field: bgzf_read yields -1 on error, which a
    // plain truth test would treat as "keep going" with a garbage block_size.
    while(SIG_COND && bgzf_read(rd[0].fp,&block_size,sizeof(int))==(ssize_t)sizeof(int)){
      getAlign(rd[0].fp,block_size,b);
      printReadBuffered(b,rd[0].hd,str);
      fprintf(stdout,"%s",str.s);
    }
  }else {
    for(int i=0;i<(int)regions.size();i++){
      int tmpRef = regions[i].refID;
      int tmpStart = regions[i].start;
      int tmpStop = regions[i].stop;
      
      getOffsets(rd[0].bamfname,rd[0].hd,rd[0].it,tmpRef,tmpStart,tmpStop);
      int ret =0;
      while(SIG_COND){
	ret = bam_iter_read(rd[0].fp, &rd[0].it, b);
	if(ret<0)
	  break;
	printReadBuffered(b,rd[0].hd,str);
	fprintf(stdout,"%s",str.s);
      }
      free(rd[0].it.off);//the offsets
    }
  }
  // Release the scratch buffers on every path (previously they were only
  // released in the region branch and leaked when printing the whole file).
  free(str.s);
  delete [] b.vDat;
  return 0;
}
Exemplo n.º 4
0
Arquivo: vcf.c Projeto: goshng/cocoa
/*
 * Read a BCF header from fp: a 5-byte magic string ("BCF\2\1"), a 4-byte
 * text length, and that many bytes of header text, which is then handed to
 * bcf_hdr_parse().
 *
 * Returns the newly allocated header, or 0 if the magic string is wrong, the
 * stream is truncated, or memory cannot be allocated. On failure the
 * partially built header is destroyed.
 */
bcf_hdr_t *bcf_hdr_read(BGZF *fp)
{
	uint8_t magic[5];
	bcf_hdr_t *h;
	h = bcf_hdr_init();
	if (h == NULL) return 0;
	/* A short read leaves magic[] partly uninitialised, so test the length first. */
	if (bgzf_read(fp, magic, 5) != 5 || strncmp((char*)magic, "BCF\2\1", 5) != 0) {
		if (hts_verbose >= 2)
			fprintf(stderr, "[E::%s] invalid BCF2 magic string\n", __func__);
		bcf_hdr_destroy(h);
		return 0;
	}
	if (bgzf_read(fp, &h->l_text, 4) != 4) goto truncated;
	h->text = (char*)malloc(h->l_text);
	if (h->text == NULL) {
		if (hts_verbose >= 2)
			fprintf(stderr, "[E::%s] failed to allocate memory for the header text\n", __func__);
		bcf_hdr_destroy(h);
		return 0;
	}
	if (bgzf_read(fp, h->text, h->l_text) != (ssize_t)h->l_text) goto truncated;
	bcf_hdr_parse(h);
	return h;

truncated:
	if (hts_verbose >= 2)
		fprintf(stderr, "[E::%s] truncated BCF2 header\n", __func__);
	bcf_hdr_destroy(h);
	return 0;
}
Exemplo n.º 5
0
Arquivo: vcf.c Projeto: goshng/cocoa
/*
 * Read one raw BCF record from fp into v without unpacking it.
 *
 * The record starts with eight 32-bit words: x[0] and x[1] are the lengths
 * of the shared and individual blocks (x[0] includes the 24 bytes of fixed
 * fields x[2..7]); x[2..5] are copied verbatim over the first 16 bytes of
 * *v; x[6] and x[7] pack n_allele/n_info and n_fmt/n_sample.
 *
 * Returns 0 on success, -1 on a clean end of file (nothing read), and -2 on
 * a truncated or malformed record.
 */
static inline int bcf_read1_core(BGZF *fp, bcf1_t *v)
{
	uint32_t x[8];
	int ret;
	if ((ret = bgzf_read(fp, x, 32)) != 32) {
		if (ret == 0) return -1;
		return -2;
	}
	/* A shared length below 24 would wrap around in the subtraction below. */
	if (x[0] < 24) return -2;
	x[0] -= 24; // to exclude six 32-bit integers
	ks_resize(&v->shared, x[0]);
	ks_resize(&v->indiv, x[1]);
	memcpy(v, x + 2, 16);
	v->n_allele = x[6]>>16; v->n_info = x[6]&0xffff;
	v->n_fmt = x[7]>>24; v->n_sample = x[7]&0xffffff;
	v->shared.l = x[0], v->indiv.l = x[1];
	v->unpacked = 0;
	v->unpack_ptr = NULL;
	/* Report truncated payloads instead of returning a half-filled record. */
	if (bgzf_read(fp, v->shared.s, v->shared.l) != (ssize_t)v->shared.l) return -2;
	if (bgzf_read(fp, v->indiv.s, v->indiv.l) != (ssize_t)v->indiv.l) return -2;
	return 0;
}
Exemplo n.º 6
0
// Load the per-site filter arrays for chromosome 'chr' into 'fl'.
// 'hint' is the caller's suggested buffer size; the buffers end up at least
// as large as max(current size, hint, stored length for this chromosome).
// If 'chr' has no entry in fl->offs, a warning is printed, the buffers are
// released and fl->curLen is reset to 0.
void filt_readSites(filt*fl,char *chr,size_t hint) {
  assert(fl!=NULL);

  std::map<char*,asdf_dats,ltstr> ::iterator entry = fl->offs.find(chr);

  // Unknown chromosome: warn, drop the buffers and bail out.
  if(entry==fl->offs.end()){
    fprintf(stderr,"\n\t-> Potential problem: The filereading has reached a chromsome: \'%s\', which is not included in your \'-sites\' file.\n\t-> Please consider limiting your analysis to the chromsomes of interest \n",chr);
    fprintf(stderr,"\t-> see \'http://www.popgen.dk/angsd/index.php/Sites\' for more information\n");
    fprintf(stderr,"\t-> Program will continue reading this chromosome... \n");
    free(fl->keeps);
    free(fl->minor);
    free(fl->major);
    fl->major=NULL;
    fl->minor=NULL;
    fl->keeps=NULL;
    fl->curLen =0;
    return;
  }

  // Position the file at the stored offset for this chromosome.
  bgzf_seek(fl->bg,entry->second.offs,SEEK_SET);

  // Buffers never shrink: take the largest of the three candidate sizes.
  size_t wanted = std::max(std::max(fl->curLen,hint),entry->second.len);
  const bool mustGrow = wanted>fl->curLen;

  if(mustGrow)
    fl->keeps=(char*) realloc(fl->keeps,wanted);
  memset(fl->keeps,0,wanted);
  bgzf_read(fl->bg,fl->keeps,entry->second.len);

  if(fl->hasMajMin==1){
    // major/minor are only cleared when they are (re)allocated.
    if(mustGrow) {
      fl->major = (char*) realloc(fl->major,wanted);
      fl->minor = (char*) realloc(fl->minor,wanted);
      memset(fl->major,0,wanted);
      memset(fl->minor,0,wanted);
    }
    bgzf_read(fl->bg,fl->major,entry->second.len);
    bgzf_read(fl->bg,fl->minor,entry->second.len);
  }

  fl->curNam=chr;
  fl->curLen = wanted;
}
Exemplo n.º 7
0
perChr getPerChr(BGZF *fp){
  perChr ret;
  ret.nSites =0;
  ret.posi=NULL;
  ret.tW=ret.tP=ret.tF=ret.tH=ret.tL=NULL;
  size_t clen;
  
  if(bgzf_read(fp,&clen,sizeof(size_t))==0)
    return ret;
  ret.chr = new char[clen+1];
  bgzf_read(fp,ret.chr,clen);
  ret.chr[clen] = '\0';
  bgzf_read(fp,&ret.nSites,sizeof(size_t));
  ret.posi = new int[ret.nSites];
  ret.tW = new float[ret.nSites];
  ret.tP = new float[ret.nSites];
  ret.tF = new float[ret.nSites];
  ret.tH = new float[ret.nSites];
  ret.tL = new float[ret.nSites];
  
  //read positions and thetas
  bgzf_read(fp,ret.posi,ret.nSites*sizeof(int));
  bgzf_read(fp,ret.tW,ret.nSites*sizeof(float));
  bgzf_read(fp,ret.tP,ret.nSites*sizeof(float));
  bgzf_read(fp,ret.tF,ret.nSites*sizeof(float));
  bgzf_read(fp,ret.tH,ret.nSites*sizeof(float));
  bgzf_read(fp,ret.tL,ret.nSites*sizeof(float));
  
  //make thetas into normal space
  for(size_t i=0;i<ret.nSites;i++){
    ret.tW[i] = exp(ret.tW[i]);
    ret.tP[i] = exp(ret.tP[i]);
    ret.tF[i] = exp(ret.tF[i]);
    ret.tH[i] = exp(ret.tH[i]);
    ret.tL[i] = exp(ret.tL[i]);
  }


  return ret;
}
Exemplo n.º 8
0
// Load chunk number 'chunk' (0-based) of the GLF data into chunk_data.
//
// A chunk covers up to pars->max_chunk_size consecutive sites starting at
// chunk * max_chunk_size, clipped to pars->n_sites. Each site holds
// n_ind * 3 doubles.
//  - With _USE_BGZF: seeks to pars->chunks_voffset[chunk], reads every site
//    into chunk_data[c], optionally runs call_geno() on it, and records the
//    virtual offset of the following chunk if it is not known yet.
//  - Without _USE_BGZF: chunk_data[c] is pointed at pars->data.
//
// Returns the number of sites read. Failures are reported through error().
uint64_t read_chunk(double **chunk_data, params *pars, uint64_t chunk) {
	uint64_t total_elems_read = 0;

	if(chunk >= pars->n_chunks)
		error("invalid chunk number!");

	// Define chunk start and end positions
	uint64_t start_pos = chunk * pars->max_chunk_size;
	uint64_t end_pos = start_pos + pars->max_chunk_size - 1;
	if(end_pos >= pars->n_sites)	end_pos = pars->n_sites - 1;
	uint64_t chunk_size = end_pos - start_pos + 1;
	if( pars->verbose >= 6 ) printf("\tReading chunk %lu from position %lu to %lu (%lu)\n", chunk+1, start_pos, end_pos, chunk_size);

	// Search start position
#ifdef _USE_BGZF
	if( bgzf_seek(pars->in_glf_fh, pars->chunks_voffset[chunk], SEEK_SET) < 0 )
		error("cannot seek GLF file (BGZF)!");
#endif

	// Read data from file
	for(uint64_t c = 0; c < chunk_size; c++) {
#ifdef _USE_BGZF
		int bytes_read = bgzf_read(pars->in_glf_fh, chunk_data[c], (int) pars->n_ind * 3 * sizeof(double));
		// A negative count signals an I/O error; report it before it is
		// reinterpreted as a huge unsigned element count.
		if( bytes_read < 0 )
			error("cannot read GLF file!");
		uint64_t elems_read = (uint64_t) bytes_read / sizeof(double);
#else
		chunk_data[c] = pars->data[start_pos+c];
		uint64_t elems_read = pars->n_ind * 3;
#endif
		if( elems_read != pars->n_ind * 3 )
			error("cannot read GLF file!");
#ifdef _USE_BGZF
		// Only post-process the site once the read is known to be complete.
		if(pars->call_geno)
			call_geno(chunk_data[c], pars->n_ind, 3);
#endif
		total_elems_read += elems_read;
	}

#ifdef _USE_BGZF
	// Update index for next chunk
	if( chunk+1 != pars->n_chunks && pars->chunks_voffset[chunk+1] == 0 )
		pars->chunks_voffset[chunk+1] = bgzf_tell(pars->in_glf_fh);
#endif

	return( total_elems_read/(pars->n_ind * 3) );
}
Exemplo n.º 9
0
Arquivo: bgzip.c Projeto: Abdul59/STAR
int main(int argc, char **argv)
{
	int c, compress, pstdout, is_forced, index = 0, reindex = 0;
	BGZF *fp;
	void *buffer;
	long start, end, size;
    char *index_fname = NULL;

    static struct option loptions[] = 
    {
        {"help",0,0,'h'},
        {"offset",1,0,'b'},
        {"stdout",0,0,'c'},
        {"decompress",0,0,'d'},
        {"force",0,0,'f'},
        {"index",0,0,'i'},
        {"index-name",1,0,'I'},
        {"reindex",0,0,'r'},
        {"size",1,0,'s'},
        {0,0,0,0}
    };

	compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0;
	while((c  = getopt_long(argc, argv, "cdh?fb:s:iI:r",loptions,NULL)) >= 0){
		switch(c){
		case 'd': compress = 0; break;
		case 'c': pstdout = 1; break;
		case 'b': start = atol(optarg); compress = 0; pstdout = 1; break;
		case 's': size = atol(optarg); pstdout = 1; break;
		case 'f': is_forced = 1; break;
        case 'i': index = 1; break;
        case 'I': index_fname = optarg; break;
        case 'r': reindex = 1; compress = 0; break;
		case 'h': 
        case '?': return bgzip_main_usage();
		}
	}
	if (size >= 0) end = start + size;
	if (end >= 0 && end < start) {
		fprintf(stderr, "[bgzip] Illegal region: [%ld, %ld]\n", start, end);
		return 1;
	}
	if (compress == 1) {
		struct stat sbuf;
		int f_src = fileno(stdin);
		int f_dst = fileno(stdout);

		if ( argc>optind )
		{
			if ( stat(argv[optind],&sbuf)<0 ) 
			{ 
				fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
				return 1; 
			}

			if ((f_src = open(argv[optind], O_RDONLY)) < 0) {
				fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
				return 1;
			}

			if (pstdout)
				f_dst = fileno(stdout);
			else
			{
				char *name = malloc(strlen(argv[optind]) + 5);
				strcpy(name, argv[optind]);
				strcat(name, ".gz");
				f_dst = write_open(name, is_forced);
				if (f_dst < 0) return 1;
				free(name);
			}
		}
		else if (!pstdout && isatty(fileno((FILE *)stdout)) )
			return bgzip_main_usage();
        else if ( index && !index_fname )
        {
            fprintf(stderr, "[bgzip] Index file name expected when writing to stdout\n");
            return 1;
        }

		fp = bgzf_fdopen(f_dst, "w");
        if ( index ) bgzf_index_build_init(fp);
		buffer = malloc(WINDOW_SIZE);
		while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0)
			if (bgzf_write(fp, buffer, c) < 0) error("Could not write %d bytes: Error %d\n", c, fp->errcode);
		// f_dst will be closed here
        if ( index ) 
        {
            if ( index_fname ) bgzf_index_dump(fp, index_fname, NULL);
            else bgzf_index_dump(fp, argv[optind], ".gz.gzi");
        }
		if (bgzf_close(fp) < 0) error("Close failed: Error %d", fp->errcode);
		if (argc > optind && !pstdout) unlink(argv[optind]);
		free(buffer);
		close(f_src);
		return 0;
	}
    else if ( reindex )
    {
        if ( argc>optind )
        {
			fp = bgzf_open(argv[optind], "r");
            if ( !fp ) error("[bgzip] Could not open file: %s\n", argv[optind]);
        }
        else
        {
            if ( !index_fname ) error("[bgzip] Index file name expected when reading from stdin\n");
            fp = bgzf_fdopen(fileno(stdin), "r");
        	if ( !fp ) error("[bgzip] Could not read from stdin: %s\n", strerror(errno));
        }

        buffer = malloc(BGZF_BLOCK_SIZE);
        bgzf_index_build_init(fp);
        int ret;
        while ( (ret=bgzf_read(fp, buffer, BGZF_BLOCK_SIZE))>0 ) ;
        free(buffer);
        if ( ret<0 ) error("Is the file gzipped or bgzipped? The latter is required for indexing.\n");
 
        if ( index_fname )
            bgzf_index_dump(fp, index_fname, NULL);
        else 
            bgzf_index_dump(fp, argv[optind], ".gzi");

        if ( bgzf_close(fp)<0 ) error("Close failed: Error %d\n",fp->errcode);
        return 0;
    }
    else
    {
		struct stat sbuf;
		int f_dst;

		if ( argc>optind )
		{
			if ( stat(argv[optind],&sbuf)<0 )
			{
				fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
				return 1;
			}
			char *name;
			int len = strlen(argv[optind]);
			if ( strcmp(argv[optind]+len-3,".gz") )
			{
				fprintf(stderr, "[bgzip] %s: unknown suffix -- ignored\n", argv[optind]);
				return 1;
			}
			fp = bgzf_open(argv[optind], "r");
			if (fp == NULL) {
				fprintf(stderr, "[bgzip] Could not open file: %s\n", argv[optind]);
				return 1;
			}

			if (pstdout) {
				f_dst = fileno(stdout);
			}
			else {
				name = strdup(argv[optind]);
				name[strlen(name) - 3] = '\0';
				f_dst = write_open(name, is_forced);
				free(name);
			}
		}
		else if (!pstdout && isatty(fileno((FILE *)stdin)) )
			return bgzip_main_usage();
		else
		{
			f_dst = fileno(stdout);
			fp = bgzf_fdopen(fileno(stdin), "r");
			if (fp == NULL) {
				fprintf(stderr, "[bgzip] Could not read from stdin: %s\n", strerror(errno));
				return 1;
			}
		}
        buffer = malloc(WINDOW_SIZE);
        if ( start>0 )
        {
            if ( bgzf_index_load(fp, argv[optind], ".gzi") < 0 ) error("Could not load index: %s.gzi\n", argv[optind]);
            if ( bgzf_useek(fp, start, SEEK_SET) < 0 ) error("Could not seek to %d-th (uncompressd) byte\n", start);
        }
        while (1) {
            if (end < 0) c = bgzf_read(fp, buffer, WINDOW_SIZE);
            else c = bgzf_read(fp, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start));
            if (c == 0) break;
            if (c < 0) error("Could not read %d bytes: Error %d\n", (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start), fp->errcode);
            start += c;
            if ( write(f_dst, buffer, c) != c ) error("Could not write %d bytes\n", c);
            if (end >= 0 && start >= end) break;
        }
        free(buffer);
        if (bgzf_close(fp) < 0) error("Close failed: Error %d\n",fp->errcode);
        if (!pstdout) unlink(argv[optind]);
        return 0;
	}
    return 0;
}
Exemplo n.º 10
0
/*
 * Concatenate BGZF-compressed BCF files without recompressing them.
 *
 * For every input: the first BGZF block is decompressed, the BCF magic and
 * header are read from it (only the first file's header is written to the
 * output), any record data left in that block is written through the normal
 * compressed writer, and the remaining compressed blocks are copied verbatim.
 * The trailing 28 bytes of each input are held back so that an empty BGZF
 * EOF block can be dropped instead of being copied into the middle of the
 * output.
 *
 * All failures are reported through error().
 */
static void naive_concat(args_t *args)
{
    // only compressed BCF atm
    BGZF *bgzf_out = bgzf_open(args->output_fname,"w");
    if ( !bgzf_out ) error("Failed to open %s: %s\n", args->output_fname, strerror(errno));

    const size_t page_size = 32768;
    char *buf = (char*) malloc(page_size);
    if ( !buf ) error("Failed to allocate %d bytes\n", (int)page_size);
    kstring_t tmp = {0,0,0};
    int i;
    for (i=0; i<args->nfnames; i++)
    {
        htsFile *hts_fp = hts_open(args->fnames[i],"r");
        if ( !hts_fp ) error("Failed to open: %s\n", args->fnames[i]);
        htsFormat type = *hts_get_format(hts_fp);

        if ( type.format==vcf ) error("The --naive option currently works only for compressed BCFs, sorry :-/\n");
        if ( type.compression!=bgzf ) error("The --naive option currently works only for compressed BCFs, sorry :-/\n");

        BGZF *fp = hts_get_bgzfp(hts_fp);
        if ( !fp || bgzf_read_block(fp) != 0 || !fp->block_length )
            error("Failed to read %s: %s\n", args->fnames[i], strerror(errno));

        uint8_t magic[5];
        if ( bgzf_read(fp, magic, 5) != 5 ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
        if (strncmp((char*)magic, "BCF\2\2", 5) != 0) error("Invalid BCF magic string in %s\n", args->fnames[i]);

        // The header length is stored as 4 bytes; decode them explicitly
        // (least significant byte first) instead of overlaying them onto a
        // wider size_t, so the result does not depend on host byte order.
        uint8_t len_bytes[4];
        if ( bgzf_read(fp, len_bytes, 4) != 4 ) error("Failed to read the BCF header in %s\n", args->fnames[i]);
        tmp.l = (size_t)len_bytes[0] | ((size_t)len_bytes[1]<<8) | ((size_t)len_bytes[2]<<16) | ((size_t)len_bytes[3]<<24);
        hts_expand(char,tmp.l,tmp.m,tmp.s);
        if ( bgzf_read(fp, tmp.s, tmp.l) != (ssize_t)tmp.l ) error("Failed to read the BCF header in %s\n", args->fnames[i]);

        // write only the first header
        if ( i==0 )
        {
            if ( bgzf_write(bgzf_out, "BCF\2\2", 5) !=5 ) error("Failed to write %d bytes to %s\n", 5,args->output_fname);
            if ( bgzf_write(bgzf_out, len_bytes, 4) !=4 ) error("Failed to write %d bytes to %s\n", 4,args->output_fname);
            if ( bgzf_write(bgzf_out, tmp.s, tmp.l) != (ssize_t)tmp.l) error("Failed to write %d bytes to %s\n", (int)tmp.l,args->output_fname);
        }

        // Output all non-header data that were read together with the header block
        int nskip = fp->block_offset;
        if ( fp->block_length - nskip > 0 )
        {
            if ( bgzf_write(bgzf_out, fp->uncompressed_block+nskip, fp->block_length-nskip)<0 ) error("Error: %d\n",fp->errcode);
        }
        if ( bgzf_flush(bgzf_out)<0 ) error("Error: %d\n",bgzf_out->errcode);


        // Stream the rest of the file as it is, without recompressing, but remove BGZF EOF blocks
        ssize_t nread, ncached = 0, nwr;
        const int neof = 28;
        char cached[neof];
        while (1)
        {
            nread = bgzf_raw_read(fp, buf, page_size);

            // page_size boundary may occur in the middle of the EOF block, so we need to cache the blocks' ends
            if ( nread<=0 ) break;
            if ( nread<=neof )      // last block
            {
                if ( ncached )
                {
                    // flush the part of the cache that won't be needed
                    nwr = bgzf_raw_write(bgzf_out, cached, nread);
                    if (nwr != nread) error("Write failed, wrote %d instead of %d bytes.\n", (int)nwr,(int)nread);

                    // make space in the cache so that we can append to the end
                    if ( nread!=neof ) memmove(cached,cached+nread,neof-nread);
                }

                // fill the cache and check for eof outside this loop
                memcpy(cached+neof-nread,buf,nread);
                break;
            }

            // not the last block, flush the cache if full
            if ( ncached )
            {
                nwr = bgzf_raw_write(bgzf_out, cached, ncached);
                if (nwr != ncached) error("Write failed, wrote %d instead of %d bytes.\n", (int)nwr,(int)ncached);
                ncached = 0;
            }

            // fill the cache
            nread -= neof;
            memcpy(cached,buf+nread,neof);
            ncached = neof;

            nwr = bgzf_raw_write(bgzf_out, buf, nread);
            if (nwr != nread) error("Write failed, wrote %d instead of %d bytes.\n", (int)nwr,(int)nread);
        }
        // Emit the held-back tail unless it is exactly the empty BGZF EOF block
        if ( ncached && memcmp(cached,"\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0",neof) )
        {
            nwr = bgzf_raw_write(bgzf_out, cached, neof);
            if (nwr != neof) error("Write failed, wrote %d instead of %d bytes.\n", (int)nwr,(int)neof);
        }
        if (hts_close(hts_fp)) error("Close failed: %s\n",args->fnames[i]);
    }
    free(buf);
    free(tmp.s);
    // bgzf_out must not be dereferenced after bgzf_close(); report the file name instead
    if (bgzf_close(bgzf_out) < 0) error("Close failed: %s\n", args->output_fname);
}
Exemplo n.º 11
0
Arquivo: bgzip.c Projeto: Illumina/akt
int main(int argc, char **argv)
{
    int c, compress, pstdout, is_forced, index = 0, rebgzip = 0, reindex = 0;
    BGZF *fp;
    void *buffer;
    long start, end, size;
    char *index_fname = NULL;
    int threads = 1;

    static const struct option loptions[] =
    {
        {"help", no_argument, NULL, 'h'},
        {"offset", required_argument, NULL, 'b'},
        {"stdout", no_argument, NULL, 'c'},
        {"decompress", no_argument, NULL, 'd'},
        {"force", no_argument, NULL, 'f'},
        {"index", no_argument, NULL, 'i'},
        {"index-name", required_argument, NULL, 'I'},
        {"reindex", no_argument, NULL, 'r'},
        {"rebgzip",no_argument,NULL,'g'},
        {"size", required_argument, NULL, 's'},
        {"threads", required_argument, NULL, '@'},
        {"version", no_argument, NULL, 1},
        {NULL, 0, NULL, 0}
    };

    compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0;
    while((c  = getopt_long(argc, argv, "cdh?fb:@:s:iI:gr",loptions,NULL)) >= 0){
        switch(c){
        case 'd': compress = 0; break;
        case 'c': pstdout = 1; break;
        case 'b': start = atol(optarg); compress = 0; pstdout = 1; break;
        case 's': size = atol(optarg); pstdout = 1; break;
        case 'f': is_forced = 1; break;
        case 'i': index = 1; break;
        case 'I': index_fname = optarg; break;
        case 'g': rebgzip = 1; break;
        case 'r': reindex = 1; compress = 0; break;
        case '@': threads = atoi(optarg); break;
        case 1:
            printf(
"bgzip (htslib) %s\n"
"Copyright (C) 2017 Genome Research Ltd.\n", hts_version());
            return EXIT_SUCCESS;
        case 'h':
        case '?': return bgzip_main_usage();
        }
    }
    if (size >= 0) end = start + size;
    if (end >= 0 && end < start) {
        fprintf(stderr, "[bgzip] Illegal region: [%ld, %ld]\n", start, end);
        return 1;
    }
    if (compress == 1) {
        struct stat sbuf;
        int f_src = fileno(stdin);

        if ( argc>optind )
        {
            if ( stat(argv[optind],&sbuf)<0 )
            {
                fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
                return 1;
            }

            if ((f_src = open(argv[optind], O_RDONLY)) < 0) {
                fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
                return 1;
            }

            if (pstdout)
                fp = bgzf_open("-", "w");
            else
            {
                char *name = malloc(strlen(argv[optind]) + 5);
                strcpy(name, argv[optind]);
                strcat(name, ".gz");
                fp = bgzf_open(name, is_forced? "w" : "wx");
                if (fp == NULL && errno == EEXIST && confirm_overwrite(name))
                    fp = bgzf_open(name, "w");
                if (fp == NULL) {
                    fprintf(stderr, "[bgzip] can't create %s: %s\n", name, strerror(errno));
                    free(name);
                    return 1;
                }
                free(name);
            }
        }
        else if (!pstdout && isatty(fileno((FILE *)stdout)) )
            return bgzip_main_usage();
        else if ( index && !index_fname )
        {
            fprintf(stderr, "[bgzip] Index file name expected when writing to stdout\n");
            return 1;
        }
        else
            fp = bgzf_open("-", "w");

        if ( index && rebgzip )
        {
            fprintf(stderr, "[bgzip] Can't produce a index and rebgzip simultaneously\n");
            return 1;
        }

        if ( rebgzip && !index_fname )
        {
            fprintf(stderr, "[bgzip] Index file name expected when writing to stdout\n");
            return 1;
        }

        if (threads > 1)
            bgzf_mt(fp, threads, 256);

        if ( index ) bgzf_index_build_init(fp);
        buffer = malloc(WINDOW_SIZE);
#ifdef _WIN32
        _setmode(f_src, O_BINARY);
#endif
        if (rebgzip){
            if ( bgzf_index_load(fp, index_fname, NULL) < 0 ) error("Could not load index: %s.gzi\n", argv[optind]);

            while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0)
                if (bgzf_block_write(fp, buffer, c) < 0) error("Could not write %d bytes: Error %d\n", c, fp->errcode);
        }
        else {
            while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0)
                if (bgzf_write(fp, buffer, c) < 0) error("Could not write %d bytes: Error %d\n", c, fp->errcode);
        }
        if ( index )
        {
            if (index_fname) {
                if (bgzf_index_dump(fp, index_fname, NULL) < 0)
                    error("Could not write index to '%s'\n", index_fname);
            } else {
                if (bgzf_index_dump(fp, argv[optind], ".gz.gzi") < 0)
                    error("Could not write index to '%s.gz.gzi'", argv[optind]);
            }
        }
        if (bgzf_close(fp) < 0) error("Close failed: Error %d", fp->errcode);
        if (argc > optind && !pstdout) unlink(argv[optind]);
        free(buffer);
        close(f_src);
        return 0;
    }
    else if ( reindex )
    {
        if ( argc>optind )
        {
            fp = bgzf_open(argv[optind], "r");
            if ( !fp ) error("[bgzip] Could not open file: %s\n", argv[optind]);
        }
        else
        {
            if ( !index_fname ) error("[bgzip] Index file name expected when reading from stdin\n");
            fp = bgzf_open("-", "r");
            if ( !fp ) error("[bgzip] Could not read from stdin: %s\n", strerror(errno));
        }

        buffer = malloc(BGZF_BLOCK_SIZE);
        bgzf_index_build_init(fp);
        int ret;
        while ( (ret=bgzf_read(fp, buffer, BGZF_BLOCK_SIZE))>0 ) ;
        free(buffer);
        if ( ret<0 ) error("Is the file gzipped or bgzipped? The latter is required for indexing.\n");

        if ( index_fname ) {
            if (bgzf_index_dump(fp, index_fname, NULL) < 0)
                error("Could not write index to '%s'\n", index_fname);
        } else {
            if (bgzf_index_dump(fp, argv[optind], ".gzi") < 0)
                error("Could not write index to '%s.gzi'\n", argv[optind]);
        }

        if ( bgzf_close(fp)<0 ) error("Close failed: Error %d\n",fp->errcode);
        return 0;
    }
    else
    {
        struct stat sbuf;
        int f_dst;

        if ( argc>optind )
        {
            if ( stat(argv[optind],&sbuf)<0 )
            {
                fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
                return 1;
            }
            char *name;
            int len = strlen(argv[optind]);
            if ( strcmp(argv[optind]+len-3,".gz") )
            {
                fprintf(stderr, "[bgzip] %s: unknown suffix -- ignored\n", argv[optind]);
                return 1;
            }
            fp = bgzf_open(argv[optind], "r");
            if (fp == NULL) {
                fprintf(stderr, "[bgzip] Could not open file: %s\n", argv[optind]);
                return 1;
            }

            if (pstdout) {
                f_dst = fileno(stdout);
            }
            else {
                const int wrflags = O_WRONLY | O_CREAT | O_TRUNC;
                name = strdup(argv[optind]);
                name[strlen(name) - 3] = '\0';
                f_dst = open(name, is_forced? wrflags : wrflags|O_EXCL, 0666);
                if (f_dst < 0 && errno == EEXIST && confirm_overwrite(name))
                    f_dst = open(name, wrflags, 0666);
                if (f_dst < 0) {
                    fprintf(stderr, "[bgzip] can't create %s: %s\n", name, strerror(errno));
                    free(name);
                    return 1;
                }
                free(name);
            }
        }
        else if (!pstdout && isatty(fileno((FILE *)stdin)) )
            return bgzip_main_usage();
        else
        {
            f_dst = fileno(stdout);
            fp = bgzf_open("-", "r");
            if (fp == NULL) {
                fprintf(stderr, "[bgzip] Could not read from stdin: %s\n", strerror(errno));
                return 1;
            }
        }
        if (threads > 1)
            bgzf_mt(fp, threads, 256);

        buffer = malloc(WINDOW_SIZE);
        if ( start>0 )
        {
            if ( bgzf_index_load(fp, argv[optind], ".gzi") < 0 ) error("Could not load index: %s.gzi\n", argv[optind]);
            if ( bgzf_useek(fp, start, SEEK_SET) < 0 ) error("Could not seek to %d-th (uncompressd) byte\n", start);
        }
#ifdef _WIN32
        _setmode(f_dst, O_BINARY);
#endif
        while (1) {
            if (end < 0) c = bgzf_read(fp, buffer, WINDOW_SIZE);
            else c = bgzf_read(fp, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start));
            if (c == 0) break;
            if (c < 0) error("Could not read %d bytes: Error %d\n", (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start), fp->errcode);
            start += c;
            if ( write(f_dst, buffer, c) != c ) {
#ifdef _WIN32
                if (GetLastError() != ERROR_NO_DATA)
#endif
                error("Could not write %d bytes\n", c);
            }
            if (end >= 0 && start >= end) break;
        }
        free(buffer);
        if (bgzf_close(fp) < 0) error("Close failed: Error %d\n",fp->errcode);
        if (!pstdout) unlink(argv[optind]);
        return 0;
    }
}
Exemplo n.º 12
0
/*
 * bgzip entry point (getopt-only variant).
 *   default : compress the named file (to <name>.gz, or stdout with -c) or stdin to BGZF
 *   -d      : decompress a .gz file or stdin; -b/-s select a start offset and byte count
 *   -f      : passed to write_open() as the force flag
 * When a named file is converted and output does not go to stdout, the source
 * file is removed after a successful run.
 *
 * Returns 0 on success and 1 on usage, open, read, write or close problems.
 */
int main(int argc, char **argv)
{
	int c, compress, pstdout, is_forced;
	BGZF *fp;
	void *buffer;
	long start, end, size;

	compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0;
	while((c  = getopt(argc, argv, "cdhfb:s:")) >= 0){
		switch(c){
		case 'h': return bgzip_main_usage();
		case 'd': compress = 0; break;
		case 'c': pstdout = 1; break;
		case 'b': start = atol(optarg); break;
		case 's': size = atol(optarg); break;
		case 'f': is_forced = 1; break;
		}
	}
	if (size >= 0) end = start + size;
	if (end >= 0 && end < start) {
		fprintf(stderr, "[bgzip] Illegal region: [%ld, %ld]\n", start, end);
		return 1;
	}
	if (compress == 1) {
		struct stat sbuf;
		int f_src = fileno(stdin);
		int f_dst = fileno(stdout);

		if ( argc>optind )
		{
			if ( stat(argv[optind],&sbuf)<0 )
			{
				fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
				return 1;
			}

			if ((f_src = open(argv[optind], O_RDONLY)) < 0) {
				fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
				return 1;
			}

			if (pstdout)
				f_dst = fileno(stdout);
			else
			{
				char *name = malloc(strlen(argv[optind]) + 5);
				if (name == NULL) {
					fprintf(stderr, "[bgzip] Out of memory\n");
					return 1;
				}
				strcpy(name, argv[optind]);
				strcat(name, ".gz");
				f_dst = write_open(name, is_forced);
				free(name);	/* release the name on the failure path too */
				if (f_dst < 0) return 1;
			}
		}
		else if (!pstdout && isatty(fileno((FILE *)stdout)) )
			return bgzip_main_usage();

		fp = bgzf_fdopen(f_dst, "w");
		if (fp == NULL) {
			fprintf(stderr, "[bgzip] Could not open the output stream: %s\n", strerror(errno));
			return 1;
		}
		buffer = malloc(WINDOW_SIZE);
		if (buffer == NULL) {
			fprintf(stderr, "[bgzip] Out of memory\n");
			return 1;
		}
		while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0)
			if (bgzf_write(fp, buffer, c) < 0) fail(fp);
		/* A failed read must abort here: otherwise a truncated archive would
		 * be finalised and the source file removed below. */
		if (c < 0) {
			fprintf(stderr, "[bgzip] Could not read the input: %s\n", strerror(errno));
			return 1;
		}
		// f_dst will be closed here
		/* fp must not be passed on once bgzf_close() has been called on it. */
		if (bgzf_close(fp) < 0) {
			fprintf(stderr, "[bgzip] Close failed\n");
			return 1;
		}
		if (argc > optind && !pstdout) unlink(argv[optind]);
		free(buffer);
		close(f_src);
		return 0;
	} else {
		struct stat sbuf;
		int f_dst;

		if ( argc>optind )
		{
			if ( stat(argv[optind],&sbuf)<0 )
			{
				fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]);
				return 1;
			}
			char *name;
			size_t len = strlen(argv[optind]);
			/* Names shorter than the suffix cannot end in ".gz"; checking the
			 * length first avoids pointing before the start of the string. */
			if ( len < 3 || strcmp(argv[optind]+len-3,".gz") )
			{
				fprintf(stderr, "[bgzip] %s: unknown suffix -- ignored\n", argv[optind]);
				return 1;
			}
			fp = bgzf_open(argv[optind], "r");
			if (fp == NULL) {
				fprintf(stderr, "[bgzip] Could not open file: %s\n", argv[optind]);
				return 1;
			}

			if (pstdout) {
				f_dst = fileno(stdout);
			}
			else {
				name = strdup(argv[optind]);
				if (name == NULL) {
					fprintf(stderr, "[bgzip] Out of memory\n");
					return 1;
				}
				name[strlen(name) - 3] = '\0';
				f_dst = write_open(name, is_forced);
				free(name);
				if (f_dst < 0) {
					bgzf_close(fp);
					return 1;
				}
			}
		}
		else if (!pstdout && isatty(fileno((FILE *)stdin)) )
			return bgzip_main_usage();
		else
		{
			f_dst = fileno(stdout);
			fp = bgzf_fdopen(fileno(stdin), "r");
			if (fp == NULL) {
				fprintf(stderr, "[bgzip] Could not read from stdin: %s\n", strerror(errno));
				return 1;
			}
		}
		buffer = malloc(WINDOW_SIZE);
		if (buffer == NULL) {
			fprintf(stderr, "[bgzip] Out of memory\n");
			return 1;
		}
		if (bgzf_seek(fp, start, SEEK_SET) < 0) fail(fp);
		while (1) {
			if (end < 0) c = bgzf_read(fp, buffer, WINDOW_SIZE);
			else c = bgzf_read(fp, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start));
			if (c == 0) break;
			if (c < 0) fail(fp);
			start += c;
			/* A short or failed write would otherwise go unnoticed and the
			 * compressed source would still be removed below. */
			if (write(f_dst, buffer, c) != c) {
				fprintf(stderr, "[bgzip] Could not write %d bytes\n", c);
				return 1;
			}
			if (end >= 0 && start >= end) break;
		}
		free(buffer);
		if (bgzf_close(fp) < 0) {
			fprintf(stderr, "[bgzip] Close failed\n");
			return 1;
		}
		/* Only a named input file can be removed; stdin has no argv entry. */
		if (argc > optind && !pstdout) unlink(argv[optind]);
		return 0;
	}
}
Exemplo n.º 13
0
// Summarise an fst index file (argv[0]) over all chromosomes, or over the one
// chromosome / position window selected through the remaining arguments.
//
// For every pair of populations (choose(#names,2) pairs) the per-site values
// a and b are accumulated into
//   unweighted: mean of a/b over the sites where b != 0
//   weighted:   sum(a) / sum(b)
// One "unweighted weighted" line per pair goes to stdout, with a more verbose
// line on stderr. With exactly three pairs, calcpbs() is additionally applied
// to the weighted values and the results are printed to stderr.
//
// Always returns 0.
int fst_stat(int argc,char **argv){
  
  char *bname = *argv;
  fprintf(stderr,"\t-> Assuming idxname:%s\n",bname);
  perfst *pf = perfst_init(bname);
  args *pars = getArgs(--argc,++argv);  
  int *ppos = NULL;
  int chs = choose(pf->names.size(),2);
  // fprintf(stderr,"choose:%d \n",chs);
  double **ares = new double*[chs];
  double **bres = new double*[chs];
  double unweight[chs];
  double wa[chs];
  double wb[chs];
  size_t nObs[chs];
  for(int i=0;i<chs;i++){
    unweight[i] = wa[i] = wb[i] =0.0;
    nObs[i] = 0;
  }
  for(myFstMap::iterator it=pf->mm.begin();it!=pf->mm.end();++it){
    if(pars->chooseChr!=NULL){
      it = pf->mm.find(pars->chooseChr);
      if(it==pf->mm.end()){
	fprintf(stderr,"Problem finding chr: %s\n",pars->chooseChr);
	break;
      }
    }
    if(it->second.nSites==0){
      // With a chosen chromosome the iterator is re-seated by find() on every
      // pass, so 'continue' would revisit the same empty entry over and over.
      if(pars->chooseChr!=NULL)
	break;
      continue;
    }
    const int nSites = it->second.nSites;
    bgzf_seek(pf->fp,it->second.off,SEEK_SET);
    ppos = new int[it->second.nSites];
    
    bgzf_read(pf->fp,ppos,sizeof(int)*it->second.nSites);
    for(int i=0;i<chs;i++){
      ares[i] = new double[it->second.nSites];
      bres[i] = new double[it->second.nSites];
      bgzf_read(pf->fp,ares[i],sizeof(double)*it->second.nSites);
      bgzf_read(pf->fp,bres[i],sizeof(double)*it->second.nSites);
    }

    // First site at or after the requested start; bounded so that a start
    // beyond the last position cannot run off the end of ppos.
    int first=0;
    if(pars->start!=-1)
      while(first<nSites&&ppos[first]<pars->start) 
	first++;
    
    int last=nSites;

    // One past the last site before the requested stop, bounded the same way.
    if(pars->stop!=-1&&pars->stop<=ppos[last-1]){
      last=first;
      
      while(last<nSites&&ppos[last]<pars->stop) 
	last++;
    }

    //  fprintf(stderr,"pars->stop:%d ppos:%d first:%d last:%d\n",pars->stop,ppos[last-1],first,last);

    for(int s=first;s<last;s++){
#if 0
      fprintf(stdout,"%s\t%d",it->first,ppos[s]+1);
      for(int i=0;i<chs;i++)
	fprintf(stdout,"\t%f\t%f",ares[i][s],bres[i][s]);
      fprintf(stdout,"\n");
#endif
      for(int i=0;i<chs;i++){
	if(bres[i][s]!=0){
	  unweight[i] += ares[i][s]/bres[i][s];
	  nObs[i]++;
	}
	wa[i] += ares[i][s];
	wb[i] += bres[i][s];
      }
    }
    for(int i=0;i<chs;i++){
      delete [] ares[i];
      delete [] bres[i];
    }
    
    delete [] ppos;
    
    if(pars->chooseChr!=NULL)
      break;
  }
  double fstUW[chs];
  double fstW[chs];
  for(int i=0;i<chs;i++){
    fstUW[i] = unweight[i]/(1.0*nObs[i]);
    fstW[i] = wa[i]/wb[i];
    fprintf(stderr,"\t-> FST.Unweight[nObs:%lu]:%f Fst.Weight:%f\n",nObs[i],fstUW[i],fstW[i]);
    fprintf(stdout,"%f %f\n",fstUW[i],fstW[i]);
  }
  if(chs==3){
    //if chs==3 then we have 3pops and we will also calculate pbs statistics
    calcpbs(fstW);//<- NOTE: the pbs values will replace the fstW values
    for(int i=0;i<3;i++)
      fprintf(stderr,"\t-> pbs.pop%d\t%f\n",i+1,fstW[i]);
  }
  delete [] ares;
  delete [] bres;
  destroy_args(pars);
  perfst_destroy(pf);
  return 0;
}
Exemplo n.º 14
0
/*
  fst_print: dump the per-site contents of a fst index as text.

  argv[0] is used as the index name (handed to perfst_init); the remaining
  arguments are parsed by getArgs(). The index header is written to stderr via
  writefst_header(). For every chromosome in pf->mm (or only pars->chooseChr
  when set), and for the sites selected by pars->start / pars->stop (-1 means
  "not set"), one line is written to stdout:

      chr <TAB> position+1 <TAB> a_pair0 <TAB> b_pair0 <TAB> a_pair1 ...

  with one (a,b) column pair for each of the choose(#names,2) population pairs.

  Always returns 0.
*/
int fst_print(int argc,char **argv){

  char *bname = *argv;
  fprintf(stderr,"\t-> Assuming idxname:%s\n",bname);
  perfst *pf = perfst_init(bname);
  writefst_header(stderr,pf);
  args *pars = getArgs(--argc,++argv);
  int *ppos = NULL;
  const int chs = choose(pf->names.size(),2);
  fprintf(stderr,"choose:%d \n",chs);
  double **ares = new double*[chs];
  double **bres = new double*[chs];
  for(myFstMap::iterator it=pf->mm.begin();it!=pf->mm.end();++it){
    if(pars->chooseChr!=NULL){
      it = pf->mm.find(pars->chooseChr);
      if(it==pf->mm.end()){
        fprintf(stderr,"Problem finding chr: %s\n",pars->chooseChr);
        break;
      }
    }
    if(it->second.nSites==0){
      // With a chosen chromosome 'it' is re-seeked by find() on every pass, so
      // a plain 'continue' would spin on the same empty entry; stop instead.
      if(pars->chooseChr!=NULL)
        break;
      continue;
    }
    const int nSites = it->second.nSites;
    bgzf_seek(pf->fp,it->second.off,SEEK_SET);
    ppos = new int[it->second.nSites];

    // On-disk layout as read here: positions first, then for each pair the
    // 'a' array followed by the 'b' array.
    bgzf_read(pf->fp,ppos,sizeof(int)*it->second.nSites);
    for(int i=0;i<chs;i++){
      ares[i] = new double[it->second.nSites];
      bres[i] = new double[it->second.nSites];
      bgzf_read(pf->fp,ares[i],sizeof(double)*it->second.nSites);
      bgzf_read(pf->fp,bres[i],sizeof(double)*it->second.nSites);
    }

    // first: index of the first site with position >= start (bounded so that a
    // start beyond the last site gives an empty range, not an overrun).
    int first=0;
    if(pars->start!=-1)
      while(first<nSites&&ppos[first]<pars->start)
        first++;

    // last: one past the final site to print (sites with position < stop).
    int last=nSites;

    if(pars->stop!=-1&&pars->stop<=ppos[last-1]){
      last=first;
      while(last<nSites&&ppos[last]<pars->stop)
        last++;
    }

    // ppos[last-1] only exists when last>0; report -1 for an empty prefix.
    fprintf(stderr,"pars->stop:%d ppos:%d first:%d last:%d\n",pars->stop,last>0?ppos[last-1]:-1,first,last);

    for(int s=first;s<last;s++){
      fprintf(stdout,"%s\t%d",it->first,ppos[s]+1);
      for(int i=0;i<chs;i++)
        fprintf(stdout,"\t%f\t%f",ares[i][s],bres[i][s]);
      fprintf(stdout,"\n");
    }
    for(int i=0;i<chs;i++){
      delete [] ares[i];
      delete [] bres[i];
    }

    delete [] ppos;

    if(pars->chooseChr!=NULL)
      break;
  }
  delete [] ares;
  delete [] bres;
  destroy_args(pars);
  perfst_destroy(pf);
  return 0;
}
Exemplo n.º 15
0
/*
  fst_stat2: sliding-window Fst summaries for a fst index.

  argv[0] is used as the index name (handed to perfst_init); the remaining
  arguments are parsed by getArgs(). For every chromosome in pf->mm (or only
  pars->chooseChr when set) windows [pS,pE) of width pars->win are moved along
  the positions in steps of pars->step until the window end passes pars->stop
  (or the last position when stop is -1).

  Where the first window starts depends on pars->type:
    0: (start or first position) rounded down to a multiple of step, plus step
    1: start, or the first position when start is -1
    2: position 1

  One line per window goes to stdout:
    (begI,endI-1)(pos[begI],pos[endI-1])(pS,pE) <TAB> chr <TAB> midpoint
  followed, when the window holds at least one site, by "unweighted weighted"
  for each population pair and, with exactly 3 pairs, the 3 values produced by
  calcpbs(). Array positions that do not exist (empty window at either end of
  the chromosome) are printed as -1.

  Returns 0 on success, 1 when type/step are unusable.
*/
int fst_stat2(int argc,char **argv){
  int pS,pE;//physical start,physical end
  int begI,endI;//position in array for pS, pE;

  char *bname = *argv;
  fprintf(stderr,"\t-> Assuming idxname:%s\n",bname);
  perfst *pf = perfst_init(bname);
  args *pars = getArgs(--argc,++argv);
  fprintf(stderr,"win:%d step:%d\n",pars->win,pars->step);
  // An unknown type would leave pS uninitialised, and step<=0 either divides
  // by zero (type 0) or never advances the window; reject both up front.
  if(pars->step<=0||pars->type<0||pars->type>2){
    fprintf(stderr,"\t-> Invalid window arguments type:%d step:%d (need type in 0..2 and step>0)\n",(int)pars->type,pars->step);
    destroy_args(pars);
    perfst_destroy(pf);
    return 1;
  }
  int *ppos = NULL;
  int chs = choose(pf->names.size(),2);
  // fprintf(stderr,"choose:%d \n",chs);
  double **ares = new double*[chs];
  double **bres = new double*[chs];
  double unweight[chs];
  double wa[chs];
  double wb[chs];
  size_t nObs =0;

  for(myFstMap::iterator it=pf->mm.begin();it!=pf->mm.end();++it){
    if(pars->chooseChr!=NULL){
      it = pf->mm.find(pars->chooseChr);
      if(it==pf->mm.end()){
        fprintf(stderr,"Problem finding chr: %s\n",pars->chooseChr);
        break;
      }
    }
    fprintf(stderr,"nSites:%lu\n",it->second.nSites);
    if(it->second.nSites==0){
      // A chosen-but-empty chromosome ends the run; otherwise try the next one.
      if(pars->chooseChr!=NULL)
        break;
      continue;
    }
    const int nSites = it->second.nSites;
    bgzf_seek(pf->fp,it->second.off,SEEK_SET);
    ppos = new int[it->second.nSites];

    bgzf_read(pf->fp,ppos,sizeof(int)*it->second.nSites);
    // Stored positions are shifted by one here (presumably 0-based on disk,
    // 1-based for windowing/reporting -- confirm against the index writer).
    for(int i=0;i<nSites;i++)
      ppos[i]++;
    for(int i=0;i<chs;i++){
      ares[i] = new double[it->second.nSites];
      bres[i] = new double[it->second.nSites];
      bgzf_read(pf->fp,ares[i],sizeof(double)*it->second.nSites);
      bgzf_read(pf->fp,bres[i],sizeof(double)*it->second.nSites);
    }

    if(pars->type==0)
      pS = ((pars->start!=-1?pars->start:ppos[0])/pars->step)*pars->step +pars->step;
    else if(pars->type==1)
      pS = (pars->start!=-1?pars->start:ppos[0]);
    else
      pS = 1;
    pE = pS+pars->win;
    begI=endI=0;

    // Largest allowed window end for this chromosome.
    const int limit = (pars->stop!=-1?pars->stop:ppos[nSites-1]);

    //    fprintf(stderr,"ps:%d\n",pS);exit(0);
    if(pE>limit){
      fprintf(stderr,"end of dataset is before end of window: end of window:%d last position in chr:%d\n",pE,ppos[nSites-1]);
      //    return str;
    }

    // begI: first site with position >= pS; endI: first site with position >= pE.
    // Both scans stop at nSites so a window reaching past the data cannot run
    // off the end of ppos.
    while(begI<nSites&&ppos[begI]<pS) begI++;

    endI=begI;
    while(endI<nSites&&ppos[endI]<pE) endI++;

    //fprintf(stderr,"begI:%d endI:%d\n",begI,endI);

    while(1){
      for(int i=0;i<chs;i++)
        unweight[i] = wa[i] = wb[i] =0.0;
      nObs=0;
      // Guard the two position lookups: begI may equal nSites and endI may be 0.
      fprintf(stdout,"(%d,%d)(%d,%d)(%d,%d)\t%s\t%d",begI,endI-1,begI<nSites?ppos[begI]:-1,endI>0?ppos[endI-1]:-1,pS,pE,it->first,pS+(pE-pS)/2);
      for(int s=begI;s<endI;s++){
#if 0
        fprintf(stdout,"%s\t%d",it->first,ppos[s]+1);
        for(int i=0;i<choose(pf->names.size(),2);i++)
          fprintf(stdout,"\t%f\t%f",ares[i][s],bres[i][s]);
        fprintf(stdout,"\n");
#endif
        for(int i=0;i<chs;i++){
          unweight[i] += ares[i][s]/bres[i][s];
          wa[i] += ares[i][s];
          wb[i] += bres[i][s];
        }
        nObs++;
      }
      double fstW[chs];
      for(int i=0;nObs>0&&i<chs;i++){
        fstW[i] = wa[i]/wb[i];
        fprintf(stdout,"\t%f\t%f",unweight[i]/(1.0*nObs),fstW[i]);
      }
      // fstW is only filled when the window has sites, so pbs is only
      // computed (and printed) in that case.
      if(chs==3&&nObs>0){
        //if chs==3 then we have 3pops and we will also calculate pbs statistics
        calcpbs(fstW);//<- NOTE: the pbs values will replace the fstW values
        for(int i=0;i<3;i++)
          fprintf(stdout,"\t%f",fstW[i]);
      }
      fprintf(stdout,"\n");

      pS += pars->step;
      pE =pS+pars->win;
      if(pE>limit)
        break;

      while(begI<nSites&&ppos[begI]<pS) begI++;
      while(endI<nSites&&ppos[endI]<pE) endI++;
    }
    for(int i=0;i<chs;i++){
      delete [] ares[i];
      delete [] bres[i];
    }

    delete [] ppos;

    if(pars->chooseChr!=NULL)
      break;
  }

  delete [] ares;
  delete [] bres;
  destroy_args(pars);
  perfst_destroy(pf);
  return 0;
}
Exemplo n.º 16
0
/*
 * Stress test: repeatedly opens the BGZF file named by argv[1] and performs a
 * random mix of seeks (start / middle / end), reads, short sleeps and a
 * one-off switch to multi-threaded decoding, aborting on any unexpected
 * result. Exits with status 1 on usage errors or when the input cannot be
 * opened or is too small; returns 0 when all iterations complete.
 */
int main(int argc, char *argv[]) {
    if (argc <= 1) {
        fprintf(stderr, "Usage: thrash_threads4 input.bam\n");
        exit(1);
    }

    // Find a valid seek location after 100 reads of up to 64KiB each
    // (about 6.5MB of uncompressed data) into the file
    int i;
    ssize_t got;
    BGZF *fpin  = bgzf_open(argv[1], "r");
    if (!fpin) {
        fprintf(stderr, "Failed to open \"%s\"\n", argv[1]);
        exit(1);
    }
    uint64_t upos = 0, uend = 0;
    char buf[100000];
    for (i = 0; i < 100; i++) {
        if ((got = bgzf_read(fpin, buf, 65536)) < 0)
            abort();
        upos += got;
    }
    int64_t pos = bgzf_tell(fpin);
    // Drain the rest of the file to learn the offset of EOF and how much
    // uncompressed data follows the mid-point.
    while ((got = bgzf_read(fpin, buf, 65536)) > 0) {
        uend += got;
    }
    if (got < 0) abort();
    int64_t end = bgzf_tell(fpin);
    bgzf_close(fpin);

    // Ensure input is big enough to avoid case 3,4 below going off the end
    // of the file
    if (uend < upos + 10000000) {
        fprintf(stderr, "Please supply a bigger input file\n");
        exit(1);
    }

#define N 1000

    // Spam random seeks & reads
    for (i = 0; i < 1000; i++) {
        printf("i=%d\t", i);
        fpin  = bgzf_open(argv[1], "r");
        if (!fpin) {
            // The file opened fine above, so failing now is itself a bug.
            fprintf(stderr, "Failed to reopen \"%s\"\n", argv[1]);
            abort();
        }
        int j, eof = 0, mt = 0;
        for (j = 0; j < 80; j++) {
            int n = rand() % 7;
            putchar('0'+n); fflush(stdout);
            switch (n) {
            case 0: // start
                if (bgzf_seek(fpin, 0LL, SEEK_SET) < 0) puts("!");//abort();
                eof = 0;
                break;
            case 1: // mid
                if (bgzf_seek(fpin, pos, SEEK_SET) < 0) puts("!");//abort();
                eof = 0;
                break;
            case 2: // eof
                if (bgzf_seek(fpin, end, SEEK_SET) < 0) puts("!");//abort();
                eof = 1;
                break;
            case 3: case 4: { // long (n==3) or short (n==4) read
                int l = rand()%(n==3?100000:100);
                // After a seek to EOF the read must return 0, otherwise
                // exactly l bytes.
                if (bgzf_read(fpin, buf, l) != l*(1-eof)) abort();
                break;
            }
            case 5: // brief pause
                usleep(N);
                break;
            case 6: // enable threading, at most once per open handle
                if (!mt)
                    bgzf_mt(fpin, 8, 256);
                mt = 1;
                break;
            }
        }
        printf("\n");
        if (bgzf_close(fpin))
            abort();
    }

    return 0;
}
Exemplo n.º 17
0
/* OCaml stub around bgzf_read(): reads up to Int_val(len) bytes from the
 * handle extracted by BGZF_val(bgzf) into buf, starting at byte offset
 * Long_val(ofs), and returns bgzf_read's result as an OCaml integer.
 * No bounds check against the size of buf is done here. */
value caml_bgzf_input(value bgzf, value buf, value ofs, value len) {
	CAMLparam4(bgzf,buf,ofs,len);
	const intnat start = Long_val(ofs);
	const int wanted = Int_val(len);
	const ssize_t nread = bgzf_read(BGZF_val(bgzf),&Byte_u(buf,start),wanted);
	CAMLreturn(Val_long(nread));
}
Exemplo n.º 18
0
/**
 * Create single chromosome index file
 * the file content is a 2-column matrix of int64_t type
 * line1:  num_sample  num_marker
 * line2:  0           bgzf_offset_for_#CHROM_line
 * line3:  var_1_pos   bgzf_offset_for_var_1
 * ...
 *
 * Reads the BCF from fBcfFile_ (rewound to the start first) and writes the
 * index to indexFile_. Record scanning stops silently at the first record
 * that cannot be read completely.
 *
 * @return 0 on success; -1 when the magic/header cannot be read or parsed,
 *         the "#CHROM" line is missing, or the index file cannot be written.
 */
int SingleChromosomeBCFIndex::createIndex() {
  // const char* fn = bcfFile_.c_str();
  BGZF* fp = fBcfFile_;  // bgzf_open(fn, "rb");
  bgzf_seek(fp, 0, SEEK_SET);

  // check magic number: "BCF" followed by bytes 2 and (1 or 2)
  char magic[5];
  if (5 != bgzf_read(fp, magic, 5)) {
    return -1; // exit(1);
  }
  if (!(magic[0] == 'B' && magic[1] == 'C' && magic[2] == 'F' &&
        magic[3] == 2 && (magic[4] == 1 || magic[4] == 2))) {
    return -1; // exit(1);
  }

  // read header
  uint32_t l_text;
  if (4 != bgzf_read(fp, &l_text, 4)) {
    return -1; // exit(1);
  }
  Rprintf("l_text = %u\n", l_text);

  std::string s;
  int64_t bgzf_offset_before_header = bgzf_tell(fp); // the beginning of header block
  s.resize(l_text);
  if (bgzf_read(fp, &s[0], l_text) != static_cast<int64_t>(l_text)) {
    // A truncated header cannot be parsed reliably; give up here.
    REprintf( "Read failed!\n");
    return -1;
  }
  BCFHeader bcfHeader;
  if (bcfHeader.parseHeader(s,
                  &bcfHeader.header_contig_id,
                  &bcfHeader.header_id,
                  &bcfHeader.header_number,
                  &bcfHeader.header_type,
                  &bcfHeader.header_description)) {
    REprintf( "Parse header failed!\n");
    return -1; // exit(1);
  }

  // locate #CHROM line
  int64_t bgzf_offset_after_header = bgzf_tell(fp); // the end of header block
  size_t ptr_chrom_line = s.find("#CHROM"); // the index of "#CHROM", also the size between beginning of header to '#CHROM'
  if (ptr_chrom_line == std::string::npos) {
    REprintf( "Cannot find the \"#CHROM\" line!\n");
    return -1; // exit(1);
  }
  Rprintf("offset_header = %d\n", (int) ptr_chrom_line);

  // Re-read the header in two pieces so that bgzf_tell() can report the
  // virtual offset at which the "#CHROM" line starts.
  bgzf_seek(fp, bgzf_offset_before_header, SEEK_SET); // rewind fp to the beginning of header
  s.resize(ptr_chrom_line);
  int64_t before_chrom_size = bgzf_read(fp, &s[0], ptr_chrom_line);
  if (before_chrom_size != static_cast<int64_t>(ptr_chrom_line)) {
    REprintf( "Read failed!\n");
    return -1;
  }
  int64_t bgzf_offset_before_chrom = bgzf_tell(fp); // the offset to #CHROM
  const int64_t chrom_line_size = static_cast<int64_t>(l_text) - before_chrom_size;
  s.resize(chrom_line_size);
  int64_t after_chrom_size = bgzf_read(fp, &s[0], chrom_line_size);
  if (after_chrom_size != chrom_line_size) {
    REprintf( "Read failed!\n");
    return -1;
  }
  // Remember the final header byte before trimming; indexing the trimmed
  // string at its old length would be out of range.
  // (s holds at least "#CHROM" here, so it is non-empty.)
  const int last_header_char = s[after_chrom_size - 1];
  // load sample names
  while (!s.empty() && (s.back() == '\n' || s.back() == '\0')) {
    s.pop_back();
  }
  stringTokenize(s, "\t", &bcfHeader.sample_names);
  const int64_t num_sample = (int)bcfHeader.sample_names.size() - 9; // vcf header has 9 columns CHROM...FORMAT before actual sample names
  Rprintf("sample size = %ld\n", (long) num_sample);
  Rprintf("last character is s[after_chrom_size-1] = %d\n", last_header_char); // should be 0, the null terminator character
  // quality check
  if (bgzf_offset_after_header != bgzf_tell(fp)) {
    REprintf( "Messed up bgzf header\n");
    return -1; // exit(1);
  }

  // create index file
  FILE* fIndex = fopen(indexFile_.c_str(), "wb");
  if (fIndex == NULL) {
    REprintf( "Cannot create index file [ %s ]\n", indexFile_.c_str());
    return -1;
  }
  int64_t num_marker = 0;
  int64_t pos = 0;
  // The first row is a placeholder: num_marker is rewritten once it is known.
  fwrite(&num_sample, sizeof(int64_t), 1, fIndex);
  fwrite(&num_marker, sizeof(int64_t), 1, fIndex);
  fwrite(&pos, sizeof(int64_t), 1, fIndex);
  fwrite(&bgzf_offset_before_chrom, sizeof(int64_t), 1, fIndex);

  uint32_t l_shared;
  uint32_t l_indiv;
  std::vector<char> data;
  int64_t offset;
  while (true) {
    offset = bgzf_tell(fp);
    if (4 != bgzf_read(fp, &l_shared, sizeof(uint32_t))) {
      break; // REprintf( "Wrong read!\n"); exit(1);
    }
    if (4 != bgzf_read(fp, &l_indiv, sizeof(uint32_t))) {
      break; // REprintf( "Wrong read!\n"); exit(1);
    }
    // Sum in 64 bits so two large lengths cannot wrap around.
    const uint64_t record_len = static_cast<uint64_t>(l_shared) + l_indiv;
    if (record_len < 8) {
      break; // too short to contain the 4-byte field at offset 4 read below
    }
    data.resize(static_cast<size_t>(record_len));
    if (static_cast<int64_t>(record_len) != bgzf_read(fp, data.data(), static_cast<size_t>(record_len))) {
      break; // REprintf( "Wrong read!\n"); exit(1);
    }
    // The variant position is the 32-bit value stored at byte offset 4 of the
    // record; widen it through a temporary before storing it as int64_t.
    uint32_t pos32;
    memcpy(&pos32, data.data() + 4, sizeof(pos32));
    pos = pos32;
    fwrite(&pos, sizeof(int64_t), 1, fIndex);
    fwrite(&offset, sizeof(int64_t), 1, fIndex);

    num_marker++;
    if (num_marker % 10000 == 0) {
      Rprintf("\rprocessed %ld markers", (long) num_marker);
    }
  }

  // Go back and fill in the real counts in the first row.
  if (fseek(fIndex, 0, SEEK_SET)) {
    // Writing the counts anywhere else would corrupt the index.
    REprintf( "fseek failed!\n");
    fclose(fIndex);
    return -1;
  }
  fwrite(&num_sample, sizeof(int64_t), 1, fIndex);
  fwrite(&num_marker, sizeof(int64_t), 1, fIndex);
  fclose(fIndex);
  Rprintf("Indexing finished with %ld samples and %ld markers\n", (long) num_sample, (long) num_marker);

  return 0;
}
Exemplo n.º 19
0
// ClassifyFileType
// Attempt to classify the alignment file as one of CSV, BED or SAM from its initial contents
// (up to cFileClassifyBuffLen-1 chars are examined)
// Currently processes CSV, BED and SAM format file types
// Files named "*.bam" are opened through BGZF and reported as SAM if the decompressed data starts with the BAM magic
// Files named "*.gz" are read through gzopen/gzread, all others with plain open/read
// Assumes must be SAM if initial lines have at least one prefixed by a '@' followed by a 2 letter record type code
//
// Returns eCFTopenerr if the file can't be opened (or is not a readable BAM), eCFTlenerr if fewer than
// cMinFileClassifyLen chars could be read, eCFTunknown if no format scored highly enough,
// otherwise eCFTCSV, eCFTBED or eCFTSAM
//
etClassifyFileType
CUtility::ClassifyFileType(char *pszFileName)
{
int hFile;
gzFile gz;
BGZF* pInBGZF;
int BuffLen;
int BuffIdx;
UINT8 Buffer[cFileClassifyBuffLen];
UINT8 *pBuff;
bool bStartNL;
bool bSkipEOL;
UINT8 Chr;
int NumLines;
int FieldCnt;
int TabCnt;
int CommaCnt;
int FldLen;
bool bInQuotes;
int LikelyCSV;
int LikelyBED;
int LikelySAM;
int LikelyNonCSVSAMBED;
bool bSeenSAMHdrs;
int FileNameLen;
bool bGZd;

FileNameLen = (int)strlen(pszFileName);
bGZd = false;
if(FileNameLen >= 4)
	{
	if(!stricmp(&pszFileName[FileNameLen-3],".gz"))
		bGZd = true;
	else
		{
		if(FileNameLen >= 5 && !stricmp(&pszFileName[FileNameLen-4],".bam"))
			{
			hFile = open(pszFileName,O_READSEQ);
			if(hFile == -1)
				return(eCFTopenerr);
			// BAM will using BGZF compression ..
			if((pInBGZF = bgzf_dopen(hFile, "r"))==NULL)
				{
				gDiagnostics.DiagOut(eDLFatal,gszProcName,"ClassifyFileType: unable to initialise for BGZF processing on file '%s'",pszFileName);
				close(hFile);
				return(eCFTopenerr);
				}
			hFile = -1;

			// try reading the start of the header; bgzf_read only decompresses, so the "BAM\1" magic is checked explicitly
			if((BuffLen = (int)bgzf_read(pInBGZF,Buffer,100)) < 100 ||		// will be < 100 if errors ...
				memcmp(Buffer,"BAM\1",4) != 0)
				{
				gDiagnostics.DiagOut(eDLFatal,gszProcName,"ClassifyFileType: Not a BAM format file '%s'",pszFileName);
				bgzf_close(pInBGZF);
				return(eCFTopenerr);
				}
			bgzf_close(pInBGZF);
			return(eCFTSAM);
			}
		}
	}

// now can try to actually open file and read in first cFileClassifyBuffLen chars
if(bGZd)
	{
	gz = gzopen(pszFileName,"rb");
	if(gz == NULL)
		{
		gDiagnostics.DiagOut(eDLFatal,gszProcName,"Open: unable to open for reading gzip'd file '%s'",pszFileName);
		return(eCFTopenerr);
		}
	BuffLen = gzread(gz,Buffer,sizeof(Buffer)-1);
	gzclose(gz);
	}
else
	{
	hFile = open(pszFileName,O_READSEQ);
	if(hFile == -1)
		return(eCFTopenerr);
	// read the 1st cFileClassifyBuffLen-1 chars into buffer, leaving room for the terminator
	BuffLen = read(hFile,Buffer,sizeof(Buffer)-1);
	close(hFile);
	}

if(BuffLen < cMinFileClassifyLen)	// an arbitrary lower limit!
	return(eCFTlenerr);

// scanning below stops at the first '\0', so terminate what was read
Buffer[BuffLen] = '\0';
pBuff = Buffer;
NumLines = 0;
LikelyCSV = 0;
LikelyBED = 0;
LikelySAM = 0;
LikelyNonCSVSAMBED = 0;
BuffIdx = 0;
bStartNL = true;
bSeenSAMHdrs = false;
while((Chr = *pBuff++) != '\0')
	{
	BuffIdx += 1;
	if(bStartNL)					// first char of a new line: reset the per line state
		{
		FieldCnt = 0;
		TabCnt = 0;
		CommaCnt = 0;
		FldLen = 0;
		bInQuotes = false;
		bStartNL = false;
		bSkipEOL = false;
		NumLines += 1;
		}
	if(Chr == '\n' || Chr == '\r')			// if at end of line
		{
		bStartNL = true;
		bSkipEOL = false;
		if(FieldCnt < 3)					// BED can have down to 3 fields, CSV alignment and SAM should have more
			continue;

		if(!bInQuotes)
			{
			if(CommaCnt >= 3 && CommaCnt > TabCnt)		// if more commas than tabs as assumed field separators then most likely a CSV file
				LikelyCSV += 10;
			else							// otherwise could be either BED or SAM
				{
				if(bSeenSAMHdrs)
					{
					LikelyBED += 5;
					LikelySAM += 10;			// SAM would be distinguished by its header lines starting with '@'
					}
				else
					{
					LikelyBED += 20;
					LikelySAM += 5;
					}
				}
			}
		continue;
		}
	
	if(bSkipEOL)						// remainder of an already classified line is ignored
		continue;

	if(!FieldCnt && !FldLen && (Chr == ' ' || Chr == '\t'))		// simply slough all leading whitespaces before initial field starts
		continue;

	// nested quotes are potentially a problem; currently quotes are simply sloughed
	if(Chr == '\'' || Chr == '"')
		{
		bInQuotes = !bInQuotes;
		continue;
		}


	// '@' and '>' are only format hints whilst still in the first field and not within quotes;
	// the parentheses matter - without them any '>' anywhere on a line (e.g. within SAM quality scores)
	// would be scored as a fasta descriptor
	if(!FieldCnt && !bInQuotes && (Chr == '@' || Chr == '>'))
		{
		if(Chr == '@')				// if SAM then header line(s) should be present and can be expected to start with "@HD", "@SQ", "@RG", "@PG", "@CO" 
			{		
			if(BuffIdx <  (BuffLen - 3))	// ensure the 3 chars following the '@' are within the buffer
				{
				if(((*pBuff == 'H' && pBuff[1] == 'D') ||
					(*pBuff == 'S' && pBuff[1] == 'Q') ||
					(*pBuff == 'R' && pBuff[1] == 'G') ||
					(*pBuff == 'P' && pBuff[1] == 'G') ||
					(*pBuff == 'C' && pBuff[1] == 'O')) &&
					(pBuff[2] == ' ' || pBuff[2] == '\t' ))
					{
					bSeenSAMHdrs = true;
					LikelyNonCSVSAMBED = 0;
					LikelySAM += 10000;
					bSkipEOL = true;
					continue;
					}
				else
					{
					if(!bSeenSAMHdrs)					// if no SAM headers parsed then could easily be a fastq...
						{
						LikelyNonCSVSAMBED += 50;
						bSkipEOL = true;
						continue;
						}
					}
				}
			}
		if(Chr == '>')									// if at start of line then could easily be fasta...
			LikelyNonCSVSAMBED += 50;
		}

	switch(Chr) {
		case ' ':			// simply slough spaces
			continue;

		case ',':			// if comma then likely is a csv, but could still be BED if in optional fields 9 (itemRgb) onwards 
			if(TabCnt < 8 && FieldCnt >= TabCnt)
				{
				FieldCnt += 1;	
				CommaCnt += 1;
				FldLen = 0;
				}
			break;

		case '\t':			// tabs are in BED and SAM as field separators, but could also be present in CSV as spacers
			if(CommaCnt < 3 && FieldCnt >= CommaCnt)
				{
				FieldCnt += 1;	
				TabCnt += 1;
				FldLen = 0;
				}
			break;

		default:			// any other char is assumed to be part of an actual field value
			FldLen += 1;
			break;
		}
	}

// too many fasta/fastq like lines, or no format scored enough, then can't classify
if(LikelyNonCSVSAMBED >= 250 || (LikelyCSV < 10 && LikelyBED < 10 && LikelySAM < 500))
	return(eCFTunknown);

// highest score wins; ties resolve as CSV, then BED, then SAM
if(LikelyCSV >= LikelyBED && LikelyCSV >= LikelySAM)
	return(eCFTCSV);
if(LikelyBED >= LikelySAM)
	return(eCFTBED);
return(eCFTSAM);	
}