// File name or a list of genomic locations. If file name, NULL is returned. static bcf_sr_regions_t *_regions_init_string(const char *str) { bcf_sr_regions_t *reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t)); reg->start = reg->end = -1; reg->prev_start = reg->prev_seq = -1; kstring_t tmp = {0,0,0}; const char *sp = str, *ep = str; int from, to; while ( 1 ) { while ( *ep && *ep!=',' && *ep!=':' ) ep++; tmp.l = 0; kputsn(sp,ep-sp,&tmp); if ( *ep==':' ) { sp = ep+1; from = hts_parse_decimal(sp,(char**)&ep); if ( sp==ep ) { fprintf(stderr,"[%s:%d %s] Could not parse the region(s): %s\n", __FILE__,__LINE__,__FUNCTION__,str); free(reg); free(tmp.s); return NULL; } if ( !*ep || *ep==',' ) { _regions_add(reg, tmp.s, from, from); sp = ep; continue; } if ( *ep!='-' ) { fprintf(stderr,"[%s:%d %s] Could not parse the region(s): %s\n", __FILE__,__LINE__,__FUNCTION__,str); free(reg); free(tmp.s); return NULL; } ep++; sp = ep; to = hts_parse_decimal(sp,(char**)&ep); if ( *ep && *ep!=',' ) { fprintf(stderr,"[%s:%d %s] Could not parse the region(s): %s\n", __FILE__,__LINE__,__FUNCTION__,str); free(reg); free(tmp.s); return NULL; } if ( sp==ep ) to = MAX_CSI_COOR-1; _regions_add(reg, tmp.s, from, to); if ( !*ep ) break; sp = ep; } else { if ( tmp.l ) _regions_add(reg, tmp.s, -1, -1); if ( !*ep ) break; sp = ++ep; } } free(tmp.s); return reg; }
bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr, int ifrom, int ito) { bcf_sr_regions_t *reg; if ( !is_file ) return _regions_init_string(regions); reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t)); reg->start = reg->end = -1; reg->prev_start = reg->prev_seq = -1; reg->file = hts_open(regions, "rb"); if ( !reg->file ) { fprintf(stderr,"[%s:%d %s] Could not open file: %s\n", __FILE__,__LINE__,__FUNCTION__,regions); free(reg); return NULL; } reg->tbx = tbx_index_load(regions); if ( !reg->tbx ) { int len = strlen(regions); int is_bed = strcasecmp(".bed",regions+len-4) ? 0 : 1; if ( !is_bed && !strcasecmp(".bed.gz",regions+len-7) ) is_bed = 1; if ( reg->file->format.format==vcf ) ito = 1; // read the whole file, tabix index is not present while ( hts_getline(reg->file, KS_SEP_LINE, ®->line) > 0 ) { char *chr, *chr_end; int from, to, ret; ret = _regions_parse_line(reg->line.s, ichr,ifrom,abs(ito), &chr,&chr_end,&from,&to); if ( ret < 0 ) { if ( ito<0 ) ret = _regions_parse_line(reg->line.s, ichr,ifrom,ifrom, &chr,&chr_end,&from,&to); if ( ret<0 ) { fprintf(stderr,"[%s:%d] Could not parse the file %s, using the columns %d,%d[,%d]\n", __FILE__,__LINE__,regions,ichr+1,ifrom+1,ito+1); hts_close(reg->file); reg->file = NULL; free(reg); return NULL; } } if ( !ret ) continue; if ( is_bed ) from++; *chr_end = 0; _regions_add(reg, chr, from, to); *chr_end = '\t'; } hts_close(reg->file); reg->file = NULL; if ( !reg->nseqs ) { free(reg); return NULL; } return reg; } reg->seq_names = (char**) tbx_seqnames(reg->tbx, ®->nseqs); if ( !reg->seq_hash ) reg->seq_hash = khash_str2int_init(); int i; for (i=0; i<reg->nseqs; i++) { khash_str2int_set(reg->seq_hash,reg->seq_names[i],i); } reg->fname = strdup(regions); reg->is_bin = 1; return reg; }
/*
 *  Open fname and register it as an additional reader in files.
 *
 *  With files->require_index set, the file must be a bgzf-compressed VCF
 *  or BCF and its index must load. Otherwise the file is read in streaming
 *  mode, which permits a single reader only and no regions.
 *
 *  fname is stored by pointer, not copied, so it must stay valid for as
 *  long as the reader is in use.
 *
 *  Returns 1 on success, 0 on failure with files->errnum set.
 */
int bcf_sr_add_reader(bcf_srs_t *files, const char *fname)
{
    // Grow both per-reader arrays through temporaries, so that a failed
    // realloc() neither loses the old array nor leaves NULL behind. This is
    // done before opening the file so no handle has to be released on this
    // path; nreaders is untouched until the file has been opened.
    int *has_line = (int*) realloc(files->has_line, sizeof(int)*(files->nreaders+1));
    if ( has_line ) files->has_line = has_line;
    bcf_sr_t *readers = has_line ? (bcf_sr_t*) realloc(files->readers, sizeof(bcf_sr_t)*(files->nreaders+1)) : NULL;
    if ( !readers )
    {
        // NOTE(review): no allocation-specific error code is visible here;
        // open_failed is used as the nearest "reader could not be added"
        // code. Replace it if the enum offers a better fit.
        files->errnum = open_failed;
        fprintf(stderr,"[%s:%d %s] Error: could not allocate memory for reader %s\n", __FILE__,__LINE__,__FUNCTION__,fname);
        return 0;
    }
    files->readers = readers;

    htsFile* file_ptr = hts_open(fname, "r");
    if ( ! file_ptr )
    {
        files->errnum = open_failed;
        return 0;
    }

    files->has_line[files->nreaders] = 0;
    bcf_sr_t *reader = &files->readers[files->nreaders++];
    memset(reader,0,sizeof(bcf_sr_t));

    reader->file = file_ptr;

    files->errnum = 0;

    if ( files->require_index )
    {
        if ( reader->file->format.format==vcf )
        {
            if ( reader->file->format.compression!=bgzf )
            {
                files->errnum = not_bgzf;
                return 0;
            }

            reader->tbx_idx = tbx_index_load(fname);
            if ( !reader->tbx_idx )
            {
                files->errnum = idx_load_failed;
                return 0;
            }

            reader->header = bcf_hdr_read(reader->file);
        }
        else if ( reader->file->format.format==bcf )
        {
            if ( reader->file->format.compression!=bgzf )
            {
                files->errnum = not_bgzf;
                return 0;
            }

            reader->header = bcf_hdr_read(reader->file);

            reader->bcf_idx = bcf_index_load(fname);
            if ( !reader->bcf_idx )
            {
                files->errnum = idx_load_failed;
                return 0;
            }
        }
        else
        {
            files->errnum = file_type_error;
            return 0;
        }
    }
    else
    {
        if ( reader->file->format.format==bcf || reader->file->format.format==vcf )
        {
            reader->header = bcf_hdr_read(reader->file);
        }
        else
        {
            files->errnum = file_type_error;
            return 0;
        }
        files->streaming = 1;
    }

    // Streaming mode cannot synchronise several readers
    if ( files->streaming && files->nreaders>1 )
    {
        files->errnum = api_usage_error;
        fprintf(stderr,"[%s:%d %s] Error: %d readers, yet require_index not set\n", __FILE__,__LINE__,__FUNCTION__,files->nreaders);
        return 0;
    }
    // ... nor can it jump to regions
    if ( files->streaming && files->regions )
    {
        files->errnum = api_usage_error;
        fprintf(stderr,"[%s:%d %s] Error: cannot tabix-jump in streaming mode\n", __FILE__,__LINE__,__FUNCTION__);
        return 0;
    }
    if ( !reader->header )
    {
        files->errnum = header_error;
        return 0;
    }

    reader->fname = fname;
    if ( files->apply_filters )
        reader->filter_ids = init_filters(reader->header, files->apply_filters, &reader->nfilter_ids);

    // Update list of chromosomes
    if ( !files->explicit_regs && !files->streaming )
    {
        int n,i;
        const char **names = reader->tbx_idx ? tbx_seqnames(reader->tbx_idx, &n) : bcf_hdr_seqnames(reader->header, &n);
        for (i=0; i<n; i++)
        {
            // The first name creates the region list, the rest are appended
            if ( !files->regions )
                files->regions = _regions_init_string(names[i]);
            else
                _regions_add(files->regions, names[i], -1, -1);
        }
        free(names);    // only the array is freed here, not the names
    }

    return 1;
}
/*
 *  Open fname and register it as an additional reader in files.
 *
 *  With files->require_index set, the file must be of type FT_VCF_GZ or
 *  FT_BCF_GZ and its index must load. Otherwise the file is read in
 *  streaming mode, which permits a single reader only and no regions.
 *
 *  fname is stored by pointer, not copied, so it must stay valid for as
 *  long as the reader is in use.
 *
 *  Returns 1 on success, 0 on failure.
 */
int bcf_sr_add_reader(bcf_srs_t *files, const char *fname)
{
    // Grow both per-reader arrays through temporaries, so that a failed
    // realloc() neither loses the old array nor leaves NULL behind.
    int *has_line = (int*) realloc(files->has_line, sizeof(int)*(files->nreaders+1));
    if ( has_line ) files->has_line = has_line;
    bcf_sr_t *readers = has_line ? (bcf_sr_t*) realloc(files->readers, sizeof(bcf_sr_t)*(files->nreaders+1)) : NULL;
    if ( !readers )
    {
        fprintf(stderr,"[add_reader] Could not allocate memory for %s\n", fname);
        return 0;
    }
    files->readers = readers;

    files->has_line[files->nreaders] = 0;
    bcf_sr_t *reader = &files->readers[files->nreaders];
    memset(reader,0,sizeof(bcf_sr_t));

    reader->file = hts_open(fname, "r");
    if ( !reader->file ) return 0;

    // Count the reader only once its file is open, so that a failed open
    // does not leave a registered reader with a NULL file handle.
    files->nreaders++;

    reader->type = reader->file->is_bin? FT_BCF : FT_VCF;
    if (reader->file->is_compressed) reader->type |= FT_GZ;

    if ( files->require_index )
    {
        if ( reader->type==FT_VCF_GZ )
        {
            reader->tbx_idx = tbx_index_load(fname);
            if ( !reader->tbx_idx )
            {
                fprintf(stderr,"[add_reader] Could not load the index of %s\n", fname);
                return 0;
            }

            reader->header = bcf_hdr_read(reader->file);
        }
        else if ( reader->type==FT_BCF_GZ )
        {
            reader->header = bcf_hdr_read(reader->file);

            reader->bcf_idx = bcf_index_load(fname);
            if ( !reader->bcf_idx )
            {
                fprintf(stderr,"[add_reader] Could not load the index of %s\n", fname);
                return 0;   // not indexed..?
            }
        }
        else
        {
            fprintf(stderr,"Index required, expected .vcf.gz or .bcf file: %s\n", fname);
            return 0;
        }
    }
    else
    {
        // BCF and VCF headers are read by the same call
        if ( reader->type & (FT_BCF|FT_VCF) )
        {
            reader->header = bcf_hdr_read(reader->file);
        }
        else
        {
            fprintf(stderr,"File type not recognised: %s\n", fname);
            return 0;
        }
        files->streaming = 1;
    }

    // Streaming mode cannot synchronise several readers
    if ( files->streaming && files->nreaders>1 )
    {
        fprintf(stderr,"[%s:%d %s] Error: %d readers, yet require_index not set\n", __FILE__,__LINE__,__FUNCTION__,files->nreaders);
        return 0;
    }
    // ... nor can it jump to regions
    if ( files->streaming && files->regions )
    {
        fprintf(stderr,"[%s:%d %s] Error: cannot tabix-jump in streaming mode\n", __FILE__,__LINE__,__FUNCTION__);
        return 0;
    }
    if ( !reader->header ) return 0;

    reader->fname = fname;
    if ( files->apply_filters )
        reader->filter_ids = init_filters(reader->header, files->apply_filters, &reader->nfilter_ids);

    // Update list of chromosomes
    if ( !files->explicit_regs && !files->streaming )
    {
        int n,i;
        const char **names = reader->tbx_idx ? tbx_seqnames(reader->tbx_idx, &n) : bcf_hdr_seqnames(reader->header, &n);
        for (i=0; i<n; i++)
        {
            // The first name creates the region list, the rest are appended
            if ( !files->regions )
                files->regions = _regions_init_string(names[i]);
            else
                _regions_add(files->regions, names[i], -1, -1);
        }
        free(names);    // only the array is freed here, not the names
    }

    return 1;
}