Esempio n. 1
0
inline int regidx_push(regidx_t *idx, char *chr_beg, char *chr_end, uint32_t beg, uint32_t end, void *payload)
{
    if ( beg > MAX_COOR_0 ) beg = MAX_COOR_0;
    if ( end > MAX_COOR_0 ) end = MAX_COOR_0;

    int rid;
    idx->str.l = 0;
    kputsn(chr_beg, chr_end-chr_beg+1, &idx->str);
    if ( khash_str2int_get(idx->seq2regs, idx->str.s, &rid)!=0 )
    {
        // new chromosome
        idx->nseq++;
        int m_prev = idx->mseq;
        hts_expand0(reglist_t,idx->nseq,idx->mseq,idx->seq);
        hts_expand0(char*,idx->nseq,m_prev,idx->seq_names);
        idx->seq_names[idx->nseq-1] = strdup(idx->str.s);
        rid = khash_str2int_inc(idx->seq2regs, idx->seq_names[idx->nseq-1]);
    }
Esempio n. 2
0
int regidx_insert(regidx_t *idx, char *line)
{
    if ( !line )
        return _regidx_build_index(idx);

    char *chr_from, *chr_to;
    reg_t reg;
    int ret = idx->parse(line,&chr_from,&chr_to,&reg,idx->payload,idx->usr);
    if ( ret==-2 ) return -1;   // error
    if ( ret==-1 ) return 0;    // skip the line

    int rid;
    idx->str.l = 0;
    kputsn(chr_from, chr_to-chr_from+1, &idx->str);
    if ( khash_str2int_get(idx->seq2regs, idx->str.s, &rid)!=0 )
    {
        idx->nseq++;
        int m_prev = idx->mseq;
        hts_expand0(reglist_t,idx->nseq,idx->mseq,idx->seq);
        hts_expand0(char*,idx->nseq,m_prev,idx->seq_names);
        idx->seq_names[idx->nseq-1] = strdup(idx->str.s);
        rid = khash_str2int_inc(idx->seq2regs, idx->seq_names[idx->nseq-1]);
    }
Esempio n. 3
0
static void sw_fill_buffer(bcf_sweep_t *sw)
{
    if ( !sw->iidx ) return;
    sw->iidx--;

    int ret = hts_useek(sw->file, sw->idx[sw->iidx], 0);
    assert( ret==0 );

    sw->nrec = 0;
    bcf1_t *rec = &sw->rec[sw->nrec];
    while ( (ret=bcf_read1(sw->file, sw->hdr, rec))==0 )
    {
        bcf_unpack(rec, BCF_UN_STR);

        // if not in the last block, stop at the saved record
        if ( sw->iidx+1 < sw->nidx && sw_rec_equal(sw,rec) ) break;

        sw->nrec++;
        hts_expand0(bcf1_t, sw->nrec+1, sw->mrec, sw->rec);
        rec = &sw->rec[sw->nrec];
    }
    sw_rec_save(sw, &sw->rec[0]);
}
Esempio n. 4
0
static void init_data(args_t *args)
{
    args->sr = bcf_sr_init();
    if ( args->region )
    {
        args->sr->require_index = 1;
        if ( bcf_sr_set_regions(args->sr, args->region, args->region_is_file)<0 ) error("Failed to read the regions: %s\n",args->region);
    }
    if ( args->target && bcf_sr_set_targets(args->sr, args->target, args->target_is_file, 0)<0 ) error("Failed to read the targets: %s\n",args->target);
    if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum));
    args->hdr_in  = bcf_sr_get_header(args->sr,0);
    args->hdr_out = bcf_hdr_dup(args->hdr_in);

    if ( args->filter_str )
        args->filter = filter_init(args->hdr_in, args->filter_str);

    mkdir_p("%s/",args->output_dir);

    int i, nsmpl = bcf_hdr_nsamples(args->hdr_in);
    if ( !nsmpl ) error("No samples to split: %s\n", args->fname);
    args->fh = (htsFile**)calloc(nsmpl,sizeof(*args->fh));
    args->bnames = set_file_base_names(args);
    kstring_t str = {0,0,0};
    for (i=0; i<nsmpl; i++)
    {
        if ( !args->bnames[i] ) continue;
        str.l = 0;
        kputs(args->output_dir, &str);
        if ( str.s[str.l-1] != '/' ) kputc('/', &str);
        int k, l = str.l;
        kputs(args->bnames[i], &str);
        for (k=l; k<str.l; k++) if ( isspace(str.s[k]) ) str.s[k] = '_';
        if ( args->output_type & FT_BCF ) kputs(".bcf", &str);
        else if ( args->output_type & FT_GZ ) kputs(".vcf.gz", &str);
        else kputs(".vcf", &str);
        args->fh[i] = hts_open(str.s, hts_bcf_wmode(args->output_type));
        if ( args->fh[i] == NULL ) error("Can't write to \"%s\": %s\n", str.s, strerror(errno));
        bcf_hdr_nsamples(args->hdr_out) = 1;
        args->hdr_out->samples[0] = args->bnames[i];
        bcf_hdr_write(args->fh[i], args->hdr_out);
    }
    free(str.s);

    // parse tags
    int is_info = 0, is_fmt = 0;
    char *beg = args->keep_tags;
    while ( beg && *beg )
    {
        if ( !strncasecmp("INFO/",beg,5) ) { is_info = 1; is_fmt = 0; beg += 5; }
        else if ( !strcasecmp("INFO",beg) ) { args->keep_info = 1; break; }
        else if ( !strncasecmp("INFO,",beg,5) ) { args->keep_info = 1; beg += 5; continue; }
        else if ( !strncasecmp("FMT/",beg,4) ) { is_info = 0; is_fmt = 1; beg += 4; }
        else if ( !strncasecmp("FORMAT/",beg,7) ) { is_info = 0; is_fmt = 1; beg += 7; }
        else if ( !strcasecmp("FMT",beg) ) { args->keep_fmt = 1; break; }
        else if ( !strcasecmp("FORMAT",beg) ) { args->keep_fmt = 1; break; }
        else if ( !strncasecmp("FMT,",beg,4) ) { args->keep_fmt = 1; beg += 4; continue; }
        else if ( !strncasecmp("FORMAT,",beg,7) ) { args->keep_fmt = 1; beg += 7; continue; }
        char *end = beg;
        while ( *end && *end!=',' ) end++;
        char tmp = *end; *end = 0;
        int id = bcf_hdr_id2int(args->hdr_in, BCF_DT_ID, beg);
        beg = tmp ? end + 1 : end;
        if ( is_info && bcf_hdr_idinfo_exists(args->hdr_in,BCF_HL_INFO,id) )
        {
            if ( id >= args->ninfo_tags ) args->ninfo_tags = id + 1;
            hts_expand0(uint8_t, args->ninfo_tags, args->minfo_tags, args->info_tags);
            args->info_tags[id] = 1;
        }
        if ( is_fmt && bcf_hdr_idinfo_exists(args->hdr_in,BCF_HL_FMT,id) )
        {
            if ( id >= args->nfmt_tags ) args->nfmt_tags = id + 1;
            hts_expand0(uint8_t, args->nfmt_tags, args->mfmt_tags, args->fmt_tags);
            args->fmt_tags[id] = 1;
        }
    }
    if ( !args->keep_info && !args->keep_fmt && !args->ninfo_tags && !args->nfmt_tags )
    {
        args->keep_info = args->keep_fmt = 1;
    }
}
Esempio n. 5
0
// Parse filter expression and convert to reverse polish notation. Dijkstra's shunting-yard algorithm
filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
{
    filter_t *filter = (filter_t *) calloc(1,sizeof(filter_t));
    filter->str = strdup(str);
    filter->hdr = hdr;

    int nops = 0, mops = 0, *ops = NULL;    // operators stack
    int nout = 0, mout = 0;                 // filter tokens, RPN
    token_t *out = NULL;
    char *tmp = filter->str;
    int last_op = -1;
    while ( *tmp )
    {
        int len, ret;
        ret = filters_next_token(&tmp, &len);
        if ( ret==-1 ) error("Missing quotes in: %s\n", str);

        // fprintf(stderr,"token=[%c] .. [%s] %d\n", "x()[<=>]!|&+-*/Mm"[ret], tmp, len);
        // int i; for (i=0; i<nops; i++) fprintf(stderr," .%c.", "x()[<=>]!|&+-*/Mm"[ops[i]]); fprintf(stderr,"\n");

        if ( ret==TOK_MAX || ret==TOK_MIN || ret==TOK_AVG )
        {
            nout++;
            hts_expand0(token_t, nout, mout, out);
            filters_init_func(filter, ret, &tmp, &out[nout-1]);
        }
        else if ( ret==TOK_LFT )         // left bracket
        {
            nops++;
            hts_expand(int, nops, mops, ops);
            ops[nops-1] = ret;
        }
        else if ( ret==TOK_RGT )    // right bracket
        {
            while ( nops>0 && ops[nops-1]!=TOK_LFT )
            {
                nout++;
                hts_expand0(token_t, nout, mout, out);
                out[nout-1].tok_type = ops[nops-1];
                nops--;
            }
            if ( nops<=0 ) error("Could not parse: %s\n", str);
            nops--;
        }
        else if ( ret!=TOK_VAL )    // one of the operators
        {
            // detect unary minus: replace -value with -1*(value)
            if ( ret==TOK_SUB && last_op!=TOK_VAL && last_op!=TOK_RGT )
            {
                nout++;
                hts_expand0(token_t, nout, mout, out);
                token_t *tok = &out[nout-1];
                tok->tok_type  = TOK_VAL;
                tok->hdr_id    = -1;
                tok->pass      = -1;
                tok->threshold = -1.0;
                ret = TOK_MULT;
            }
            else
            {
                while ( nops>0 && op_prec[ret] < op_prec[ops[nops-1]] )
                {
                    nout++;
                    hts_expand0(token_t, nout, mout, out);
                    out[nout-1].tok_type = ops[nops-1];
                    nops--;
                }
            }
            nops++;
            hts_expand(int, nops, mops, ops);
            ops[nops-1] = ret;
        }
        else if ( !len ) 
        {
            if ( *tmp && !isspace(*tmp) ) error("Could not parse the expression: [%s]\n", str);
            break;     // all tokens read
        }
        else           // annotation name or filtering value
        {
            nout++;
            hts_expand0(token_t, nout, mout, out);
            filters_init1(filter, tmp, len, &out[nout-1]);
            tmp += len;
        }
        last_op = ret;
    }
    while ( nops>0 )
    {
        if ( ops[nops-1]==TOK_LFT || ops[nops-1]==TOK_RGT ) error("Could not parse the expression: [%s]\n", filter->str);
        nout++;
        hts_expand0(token_t, nout, mout, out);
        out[nout-1].tok_type = ops[nops-1];
        nops--;
    }

    // In the special cases of %TYPE and %FILTER the BCF header IDs are yet unknown. Walk through the
    // list of operators and convert the strings (e.g. "PASS") to BCF ids. The string value token must be
    // just before or after the %FILTER token and they must be followed with a comparison operator.
    // This code is fragile: improve me.
    int i;
    for (i=0; i<nout; i++)
    {
        if ( out[i].tok_type!=TOK_VAL ) continue;
        if ( !out[i].tag ) continue;
        if ( !strcmp(out[i].tag,"%TYPE") )
        {
            if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str);
            int j = i+1;
            if ( out[j].tok_type==TOK_EQ || out[j].tok_type==TOK_NE ) j = i - 1;
            if ( out[j].tok_type!=TOK_VAL || !out[j].key ) error("[%s:%d %s] Could not parse the expression: %s\n",  __FILE__,__LINE__,__FUNCTION__, filter->str);
            if ( !strcasecmp(out[j].key,"snp") || !strcasecmp(out[j].key,"snps") ) out[j].threshold = VCF_SNP;
            else if ( !strcasecmp(out[j].key,"indel") || !strcasecmp(out[j].key,"indels") ) out[j].threshold = VCF_INDEL;
            else if ( !strcasecmp(out[j].key,"mnp") || !strcasecmp(out[j].key,"mnps") ) out[j].threshold = VCF_MNP;
            else if ( !strcasecmp(out[j].key,"other") ) out[j].threshold = VCF_OTHER;
            else error("The type \"%s\" not recognised: %s\n", out[j].key, filter->str);
            out[j].tag = out[j].key; out[j].key = NULL;
            i = j;
            continue;
        }
        if ( !strcmp(out[i].tag,"%FILTER") )
        {
            if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str);
            int j = i+1;
            if ( out[j].tok_type==TOK_EQ || out[j].tok_type==TOK_NE ) j = i - 1;
            if ( out[j].tok_type!=TOK_VAL || !out[j].key )
                error("[%s:%d %s] Could not parse the expression, an unquoted string value perhaps? %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str);
            if ( strcmp(".",out[j].key) )
            {
                out[j].hdr_id = bcf_hdr_id2int(filter->hdr, BCF_DT_ID, out[j].key);
                if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FLT,out[j].hdr_id) )
                    error("The filter \"%s\" not present in the VCF header\n", out[j].key);
            }
            else
                out[j].hdr_id = -1;
            out[j].tag = out[j].key; out[j].key = NULL;
            out[i].hdr_id = out[j].hdr_id;
            i = j;
            continue;
        }
    }

    // filter_debug_print(out, nout);

    if ( mops ) free(ops);
    filter->filters   = out;
    filter->nfilters  = nout;
    filter->flt_stack = (token_t **)malloc(sizeof(token_t*)*nout);
    return filter;
}
Esempio n. 6
0
static void bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, int min_pos)
{
    if ( !srt->grp_str2int )
    {
        // first time here, initialize
        if ( !srt->pair )
        {
            if ( readers->collapse==COLLAPSE_NONE ) readers->collapse = BCF_SR_PAIR_EXACT;
            bcf_sr_set_opt(readers, BCF_SR_PAIR_LOGIC, readers->collapse);
        }
        bcf_sr_init_scores(srt);
        srt->grp_str2int = khash_str2int_init();
        srt->var_str2int = khash_str2int_init();
    }
    int k;
    khash_t(str2int) *hash;
    hash = srt->grp_str2int;
    for (k=0; k < kh_end(hash); k++)
        if ( kh_exist(hash,k) ) free((char*)kh_key(hash,k));
    hash = srt->var_str2int;
    for (k=0; k < kh_end(hash); k++)
        if ( kh_exist(hash,k) ) free((char*)kh_key(hash,k));
    kh_clear(str2int, srt->grp_str2int);
    kh_clear(str2int, srt->var_str2int);
    srt->ngrp = srt->nvar = srt->nvset = 0;

    grp_t grp;
    memset(&grp,0,sizeof(grp_t));

    // group VCFs into groups, each with a unique combination of variants in the duplicate lines
    int ireader,ivar,irec,igrp,ivset,iact;
    for (ireader=0; ireader<readers->nreaders; ireader++) srt->vcf_buf[ireader].nrec = 0;
    for (iact=0; iact<srt->nactive; iact++)
    {
        ireader = srt->active[iact];
        bcf_sr_t *reader = &readers->readers[ireader];
        int rid   = bcf_hdr_name2id(reader->header, chr);
        grp.nvar  = 0;
        hts_expand(int,reader->nbuffer,srt->moff,srt->off);
        srt->noff  = 0;
        srt->str.l = 0;
        for (irec=1; irec<=reader->nbuffer; irec++)
        {
            bcf1_t *line = reader->buffer[irec];
            if ( line->rid!=rid || line->pos!=min_pos ) break;

            if ( srt->str.l ) kputc(';',&srt->str);
            srt->off[srt->noff++] = srt->str.l;
            size_t beg = srt->str.l;
            for (ivar=1; ivar<line->n_allele; ivar++)
            {
                if ( ivar>1 ) kputc(',',&srt->str);
                kputs(line->d.allele[0],&srt->str);
                kputc('>',&srt->str);
                kputs(line->d.allele[ivar],&srt->str);
            }
            if ( line->n_allele==1 )
            {
                kputs(line->d.allele[0],&srt->str);
                kputsn(">.",2,&srt->str);
            }

            // Create new variant or attach to existing one. But careful, there can be duplicate
            // records with the same POS,REF,ALT (e.g. in dbSNP-b142)
            char *var_str = beg + srt->str.s;
            int ret, var_idx = 0, var_end = srt->str.l;
            while ( 1 )
            {
                ret = khash_str2int_get(srt->var_str2int, var_str, &ivar);
                if ( ret==-1 ) break;

                var_t *var = &srt->var[ivar];
                if ( var->vcf[var->nvcf-1] != ireader ) break;

                srt->str.l = var_end;
                kputw(var_idx, &srt->str);
                var_str = beg + srt->str.s;
                var_idx++;
            }
            if ( ret==-1 )
            {
                ivar = srt->nvar++;
                hts_expand0(var_t,srt->nvar,srt->mvar,srt->var);
                srt->var[ivar].nvcf = 0;
                khash_str2int_set(srt->var_str2int, strdup(var_str), ivar);
                free(srt->var[ivar].str);   // possible left-over from the previous position
            }
            var_t *var = &srt->var[ivar];
            var->nalt = line->n_allele - 1;
            var->type = bcf_get_variant_types(line);
            srt->str.s[var_end] = 0;
            if ( ret==-1 )
                var->str = strdup(var_str);

            int mvcf = var->mvcf;
            var->nvcf++;
            hts_expand0(int*, var->nvcf, var->mvcf, var->vcf);
            if ( mvcf != var->mvcf ) var->rec = (bcf1_t **) realloc(var->rec,sizeof(bcf1_t*)*var->mvcf);
            var->vcf[var->nvcf-1] = ireader;
            var->rec[var->nvcf-1] = line;

            grp.nvar++;
            hts_expand(var_t,grp.nvar,grp.mvar,grp.var);
            grp.var[grp.nvar-1] = ivar;
        }
        char *grp_key = grp_create_key(srt);
        int ret = khash_str2int_get(srt->grp_str2int, grp_key, &igrp);
        if ( ret==-1 )
        {
            igrp = srt->ngrp++;
            hts_expand0(grp_t, srt->ngrp, srt->mgrp, srt->grp);
            free(srt->grp[igrp].var);
            srt->grp[igrp] = grp;
            srt->grp[igrp].key = grp_key;
            khash_str2int_set(srt->grp_str2int, grp_key, igrp);
            memset(&grp,0,sizeof(grp_t));
        }
        else
            free(grp_key);
        srt->grp[igrp].nvcf++;
    }
    free(grp.var);

    // initialize bitmask - which groups is the variant present in
    for (ivar=0; ivar<srt->nvar; ivar++)
    {
        srt->var[ivar].mask = kbs_resize(srt->var[ivar].mask, srt->ngrp);
        kbs_clear(srt->var[ivar].mask);
    }
    for (igrp=0; igrp<srt->ngrp; igrp++)
    {
        for (ivar=0; ivar<srt->grp[igrp].nvar; ivar++)
        {
            int i = srt->grp[igrp].var[ivar];
            kbs_insert(srt->var[i].mask, igrp);
        }
    }

    // create the initial list of variant sets
    for (ivar=0; ivar<srt->nvar; ivar++)
    {
        ivset = srt->nvset++;
        hts_expand0(varset_t, srt->nvset, srt->mvset, srt->vset);

        varset_t *vset = &srt->vset[ivset];
        vset->nvar = 1;
        hts_expand0(var_t, vset->nvar, vset->mvar, vset->var);
        vset->var[vset->nvar-1] = ivar;
        var_t *var  = &srt->var[ivar];
        vset->cnt   = var->nvcf;
        vset->mask  = kbs_resize(vset->mask, srt->ngrp);
        kbs_clear(vset->mask);
        kbs_bitwise_or(vset->mask, var->mask);

        int type = 0;
        if ( var->type==VCF_REF ) type |= SR_REF;
        else
        {
            if ( var->type & VCF_SNP ) type |= SR_SNP;
            if ( var->type & VCF_MNP ) type |= SR_SNP;
            if ( var->type & VCF_INDEL ) type |= SR_INDEL;
            if ( var->type & VCF_OTHER ) type |= SR_OTHER;
        }
        var->type = type;
    }
#if DEBUG_VSETS
    debug_vsets(srt);
#endif

    // initialize the pairing matrix
    hts_expand(int, srt->ngrp*srt->nvset, srt->mpmat, srt->pmat);
    hts_expand(int, srt->nvset, srt->mcnt, srt->cnt);
    memset(srt->pmat, 0, sizeof(*srt->pmat)*srt->ngrp*srt->nvset);
    for (ivset=0; ivset<srt->nvset; ivset++)
    {
        varset_t *vset = &srt->vset[ivset];
        for (igrp=0; igrp<srt->ngrp; igrp++) srt->pmat[ivset*srt->ngrp+igrp] = 0;
        srt->cnt[ivset] = vset->cnt;
    }

    // pair the lines
    while ( srt->nvset )
    {
#if DEBUG_VSETS
    fprintf(stderr,"\n");
    debug_vsets(srt);
#endif

        int imax = 0;
        for (ivset=1; ivset<srt->nvset; ivset++)
            if ( srt->cnt[imax] < srt->cnt[ivset] ) imax = ivset;

        int ipair = -1;
        uint32_t max_score = 0;
        for (ivset=0; ivset<srt->nvset; ivset++)
        {
            if ( kbs_logical_and(srt->vset[imax].mask,srt->vset[ivset].mask) ) continue;   // cannot be merged
            uint32_t score = pairing_score(srt, imax, ivset);
            // fprintf(stderr,"score: %d %d, logic=%d \t..\t %u\n", imax,ivset,srt->pair,score);
            if ( max_score < score ) { max_score = score; ipair = ivset; }
        }

        // merge rows creating a new variant set this way
        if ( ipair!=-1 && ipair!=imax )
        {
            imax = merge_vsets(srt, imax, ipair);
            continue;
        }

        push_vset(srt, imax);
    }

    srt->chr = chr;
    srt->pos = min_pos;
}