// File name or a list of genomic locations. If file name, NULL is returned.
static bcf_sr_regions_t *_regions_init_string(const char *str)
{
    bcf_sr_regions_t *reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t));
    reg->start = reg->end = -1;
    reg->prev_start = reg->prev_seq = -1;

    kstring_t tmp = {0,0,0};
    const char *sp = str, *ep = str;
    int from, to;
    while ( 1 )
    {
        while ( *ep && *ep!=',' && *ep!=':' ) ep++;
        tmp.l = 0;
        kputsn(sp,ep-sp,&tmp);
        if ( *ep==':' )
        {
            sp = ep+1;
            from = hts_parse_decimal(sp,(char**)&ep);
            if ( sp==ep )
            {
                fprintf(stderr,"[%s:%d %s] Could not parse the region(s): %s\n", __FILE__,__LINE__,__FUNCTION__,str);
                free(reg); free(tmp.s); return NULL;
            }
            if ( !*ep || *ep==',' )
            {
                _regions_add(reg, tmp.s, from, from);
                sp = ep;
                continue;
            }
            if ( *ep!='-' )
            {
                fprintf(stderr,"[%s:%d %s] Could not parse the region(s): %s\n", __FILE__,__LINE__,__FUNCTION__,str);
                free(reg); free(tmp.s); return NULL;
            }
            ep++;
            sp = ep;
            to = hts_parse_decimal(sp,(char**)&ep);
            if ( *ep && *ep!=',' )
            {
                fprintf(stderr,"[%s:%d %s] Could not parse the region(s): %s\n", __FILE__,__LINE__,__FUNCTION__,str);
                free(reg); free(tmp.s); return NULL;
            }
            if ( sp==ep ) to = MAX_CSI_COOR-1;
            _regions_add(reg, tmp.s, from, to);
            if ( !*ep ) break;
            sp = ep;
        }
        else
        {
            if ( tmp.l ) _regions_add(reg, tmp.s, -1, -1);
            if ( !*ep ) break;
            sp = ++ep;
        }
    }
    free(tmp.s);
    return reg;
}
bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr, int ifrom, int ito)
{
    bcf_sr_regions_t *reg;
    if ( !is_file ) return _regions_init_string(regions);

    reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t));
    reg->start = reg->end = -1;
    reg->prev_start = reg->prev_seq = -1;

    reg->file = hts_open(regions, "rb");
    if ( !reg->file )
    {
        fprintf(stderr,"[%s:%d %s] Could not open file: %s\n", __FILE__,__LINE__,__FUNCTION__,regions);
        free(reg);
        return NULL;
    }

    reg->tbx = tbx_index_load(regions);
    if ( !reg->tbx )
    {
        int len = strlen(regions);
        int is_bed  = strcasecmp(".bed",regions+len-4) ? 0 : 1;
        if ( !is_bed && !strcasecmp(".bed.gz",regions+len-7) ) is_bed = 1;

        if ( reg->file->format.format==vcf ) ito = 1;

        // read the whole file, tabix index is not present
        while ( hts_getline(reg->file, KS_SEP_LINE, &reg->line) > 0 )
        {
            char *chr, *chr_end;
            int from, to, ret;
            ret = _regions_parse_line(reg->line.s, ichr,ifrom,abs(ito), &chr,&chr_end,&from,&to);
            if ( ret < 0 )
            {
                if ( ito<0 )
                    ret = _regions_parse_line(reg->line.s, ichr,ifrom,ifrom, &chr,&chr_end,&from,&to);
                if ( ret<0 )
                {
                    fprintf(stderr,"[%s:%d] Could not parse the file %s, using the columns %d,%d[,%d]\n", __FILE__,__LINE__,regions,ichr+1,ifrom+1,ito+1);
                    hts_close(reg->file); reg->file = NULL; free(reg);
                    return NULL;
                }
            }
            if ( !ret ) continue;
            if ( is_bed ) from++;
            *chr_end = 0;
            _regions_add(reg, chr, from, to);
            *chr_end = '\t';
        }
        hts_close(reg->file); reg->file = NULL;
        if ( !reg->nseqs ) { free(reg); return NULL; }
        return reg;
    }

    reg->seq_names = (char**) tbx_seqnames(reg->tbx, &reg->nseqs);
    if ( !reg->seq_hash )
        reg->seq_hash = khash_str2int_init();
    int i;
    for (i=0; i<reg->nseqs; i++)
    {
        khash_str2int_set(reg->seq_hash,reg->seq_names[i],i);
    }
    reg->fname  = strdup(regions);
    reg->is_bin = 1;
    return reg;
}
int bcf_sr_add_reader(bcf_srs_t *files, const char *fname)
{
    htsFile* file_ptr = hts_open(fname, "r");
    if ( ! file_ptr ) {
        files->errnum = open_failed;
        return 0;
    }

    files->has_line = (int*) realloc(files->has_line, sizeof(int)*(files->nreaders+1));
    files->has_line[files->nreaders] = 0;
    files->readers  = (bcf_sr_t*) realloc(files->readers, sizeof(bcf_sr_t)*(files->nreaders+1));
    bcf_sr_t *reader = &files->readers[files->nreaders++];
    memset(reader,0,sizeof(bcf_sr_t));

    reader->file = file_ptr;

    files->errnum = 0;

    if ( files->require_index )
    {
        if ( reader->file->format.format==vcf )
        {
            if ( reader->file->format.compression!=bgzf )
            {
                files->errnum = not_bgzf;
                return 0;
            }

            reader->tbx_idx = tbx_index_load(fname);
            if ( !reader->tbx_idx )
            {
                files->errnum = idx_load_failed;
                return 0;
            }

            reader->header = bcf_hdr_read(reader->file);
        }
        else if ( reader->file->format.format==bcf )
        {
            if ( reader->file->format.compression!=bgzf )
            {
                files->errnum = not_bgzf;
                return 0;
            }

            reader->header = bcf_hdr_read(reader->file);

            reader->bcf_idx = bcf_index_load(fname);
            if ( !reader->bcf_idx )
            {
                files->errnum = idx_load_failed;
                return 0;
            }
        }
        else
        {
            files->errnum = file_type_error;
            return 0;
        }
    }
    else
    {
        if ( reader->file->format.format==bcf || reader->file->format.format==vcf )
        {
            reader->header = bcf_hdr_read(reader->file);
        }
        else
        {
            files->errnum = file_type_error;
            return 0;
        }
        files->streaming = 1;
    }
    if ( files->streaming && files->nreaders>1 )
    {
        files->errnum = api_usage_error;
        fprintf(stderr,"[%s:%d %s] Error: %d readers, yet require_index not set\n", __FILE__,__LINE__,__FUNCTION__,files->nreaders);
        return 0;
    }
    if ( files->streaming && files->regions )
    {
        files->errnum = api_usage_error;
        fprintf(stderr,"[%s:%d %s] Error: cannot tabix-jump in streaming mode\n", __FILE__,__LINE__,__FUNCTION__);
        return 0;
    }
    if ( !reader->header )
    {
        files->errnum = header_error;
        return 0;
    }

    reader->fname = fname;
    if ( files->apply_filters )
        reader->filter_ids = init_filters(reader->header, files->apply_filters, &reader->nfilter_ids);

    // Update list of chromosomes
    if ( !files->explicit_regs && !files->streaming )
    {
        int n,i;
        const char **names = reader->tbx_idx ? tbx_seqnames(reader->tbx_idx, &n) : bcf_hdr_seqnames(reader->header, &n);
        for (i=0; i<n; i++)
        {
            if ( !files->regions )
                files->regions = _regions_init_string(names[i]);
            else
                _regions_add(files->regions, names[i], -1, -1);
        }
        free(names);
    }

    return 1;
}
示例#4
0
int bcf_sr_add_reader(bcf_srs_t *files, const char *fname)
{
    files->has_line = (int*) realloc(files->has_line, sizeof(int)*(files->nreaders+1));
    files->has_line[files->nreaders] = 0;
    files->readers  = (bcf_sr_t*) realloc(files->readers, sizeof(bcf_sr_t)*(files->nreaders+1));
    bcf_sr_t *reader = &files->readers[files->nreaders++];
    memset(reader,0,sizeof(bcf_sr_t));

    reader->file = hts_open(fname, "r");
    if ( !reader->file ) return 0;

    reader->type = reader->file->is_bin? FT_BCF : FT_VCF;
    if (reader->file->is_compressed) reader->type |= FT_GZ;

    if ( files->require_index )
    {
        if ( reader->type==FT_VCF_GZ ) 
        {
            reader->tbx_idx = tbx_index_load(fname);
            if ( !reader->tbx_idx )
            {
                fprintf(stderr,"[add_reader] Could not load the index of %s\n", fname);
                return 0;
            }

            reader->header = bcf_hdr_read(reader->file);
        }
        else if ( reader->type==FT_BCF_GZ ) 
        {
            reader->header = bcf_hdr_read(reader->file);

            reader->bcf_idx = bcf_index_load(fname);
            if ( !reader->bcf_idx ) 
            {
                fprintf(stderr,"[add_reader] Could not load the index of %s\n", fname);
                return 0;   // not indexed..?
            }
        }
        else
        {
            fprintf(stderr,"Index required, expected .vcf.gz or .bcf file: %s\n", fname);
            return 0;
        }
    }
    else 
    {
        if ( reader->type & FT_BCF )
        {
            reader->header = bcf_hdr_read(reader->file);
        }
        else if ( reader->type & FT_VCF )
        {
            reader->header = bcf_hdr_read(reader->file);
        }
        else
        {
            fprintf(stderr,"File type not recognised: %s\n", fname);
            return 0;
        }
        files->streaming = 1;
    }
    if ( files->streaming && files->nreaders>1 )
    {
        fprintf(stderr,"[%s:%d %s] Error: %d readers, yet require_index not set\n", __FILE__,__LINE__,__FUNCTION__,files->nreaders);
        return 0;
    }
    if ( files->streaming && files->regions )
    {
        fprintf(stderr,"[%s:%d %s] Error: cannot tabix-jump in streaming mode\n", __FILE__,__LINE__,__FUNCTION__);
        return 0;
    }
    if ( !reader->header ) return 0;

    reader->fname = fname;
    if ( files->apply_filters )
        reader->filter_ids = init_filters(reader->header, files->apply_filters, &reader->nfilter_ids);

    // Update list of chromosomes
    if ( !files->explicit_regs && !files->streaming )
    {
        int n,i;
        const char **names = reader->tbx_idx ? tbx_seqnames(reader->tbx_idx, &n) : bcf_hdr_seqnames(reader->header, &n);
        for (i=0; i<n; i++)
        {
            if ( !files->regions )
                files->regions = _regions_init_string(names[i]);
            else
                _regions_add(files->regions, names[i], -1, -1);
        }
        free(names);
    }

    return 1;
}