inline int regidx_push(regidx_t *idx, char *chr_beg, char *chr_end, uint32_t beg, uint32_t end, void *payload) { if ( beg > MAX_COOR_0 ) beg = MAX_COOR_0; if ( end > MAX_COOR_0 ) end = MAX_COOR_0; int rid; idx->str.l = 0; kputsn(chr_beg, chr_end-chr_beg+1, &idx->str); if ( khash_str2int_get(idx->seq2regs, idx->str.s, &rid)!=0 ) { // new chromosome idx->nseq++; int m_prev = idx->mseq; hts_expand0(reglist_t,idx->nseq,idx->mseq,idx->seq); hts_expand0(char*,idx->nseq,m_prev,idx->seq_names); idx->seq_names[idx->nseq-1] = strdup(idx->str.s); rid = khash_str2int_inc(idx->seq2regs, idx->seq_names[idx->nseq-1]); }
int regidx_insert(regidx_t *idx, char *line) { if ( !line ) return _regidx_build_index(idx); char *chr_from, *chr_to; reg_t reg; int ret = idx->parse(line,&chr_from,&chr_to,®,idx->payload,idx->usr); if ( ret==-2 ) return -1; // error if ( ret==-1 ) return 0; // skip the line int rid; idx->str.l = 0; kputsn(chr_from, chr_to-chr_from+1, &idx->str); if ( khash_str2int_get(idx->seq2regs, idx->str.s, &rid)!=0 ) { idx->nseq++; int m_prev = idx->mseq; hts_expand0(reglist_t,idx->nseq,idx->mseq,idx->seq); hts_expand0(char*,idx->nseq,m_prev,idx->seq_names); idx->seq_names[idx->nseq-1] = strdup(idx->str.s); rid = khash_str2int_inc(idx->seq2regs, idx->seq_names[idx->nseq-1]); }
/*
 * Load the previous indexed block of records into sw->rec.
 *
 * Does nothing when sw->iidx is already zero. Otherwise the block index is
 * decremented, the file is positioned at the offset stored in
 * sw->idx[sw->iidx] and records are read into consecutive slots of sw->rec
 * (grown on demand) until bcf_read1() returns non-zero or, when this is not
 * the last block, until the record matching the previously saved one is
 * encountered. On return sw->nrec holds the number of records kept and the
 * first record of the buffer has been passed to sw_rec_save().
 */
static void sw_fill_buffer(bcf_sweep_t *sw)
{
    if ( sw->iidx == 0 ) return;
    sw->iidx--;

    int ret = hts_useek(sw->file, sw->idx[sw->iidx], 0);
    assert( ret==0 );

    sw->nrec = 0;
    for (;;)
    {
        // The slot is looked up afresh on every pass because hts_expand0()
        // below may have moved the sw->rec array.
        bcf1_t *cur = &sw->rec[sw->nrec];

        ret = bcf_read1(sw->file, sw->hdr, cur);
        if ( ret != 0 ) break;

        bcf_unpack(cur, BCF_UN_STR);

        // if not in the last block, stop at the saved record
        if ( sw->iidx+1 < sw->nidx && sw_rec_equal(sw,cur) ) break;

        // keep the record and make sure one more free slot exists for the next read
        sw->nrec++;
        hts_expand0(bcf1_t, sw->nrec+1, sw->mrec, sw->rec);
    }

    sw_rec_save(sw, &sw->rec[0]);
}
/*
 * Initialise the reader, open one output file per sample and parse the list
 * of tags to keep.
 *
 * Reader setup: optional regions (these require an index) and targets are
 * registered before the input file args->fname is added; the input header is
 * duplicated into args->hdr_out and the optional filter expression compiled.
 *
 * Output files: for every sample with a non-NULL base name returned by
 * set_file_base_names(), the file "<output_dir>/<basename><suffix>" is opened
 * for writing (white space in the base name is replaced by '_', the suffix is
 * chosen from args->output_type) and a single-sample header is written to it.
 *
 * Tag list (args->keep_tags, comma-separated, prefixes matched
 * case-insensitively):
 *   INFO/TAG, FMT/TAG, FORMAT/TAG  select one tag; the INFO or FORMAT category
 *                                  stays in effect for following unprefixed items
 *   INFO, FMT, FORMAT              keep all tags of that category
 * Items for which bcf_hdr_idinfo_exists() is false in the input header are
 * skipped without a message. If nothing was selected at all, every INFO and
 * FORMAT tag is kept. Note that the list is tokenised in place: the commas in
 * args->keep_tags are overwritten with NUL bytes.
 *
 * Any failure is reported through error().
 */
static void init_data(args_t *args)
{
    args->sr = bcf_sr_init();
    if ( args->region )
    {
        args->sr->require_index = 1;
        if ( bcf_sr_set_regions(args->sr, args->region, args->region_is_file)<0 )
            error("Failed to read the regions: %s\n",args->region);
    }
    if ( args->target && bcf_sr_set_targets(args->sr, args->target, args->target_is_file, 0)<0 )
        error("Failed to read the targets: %s\n",args->target);
    if ( !bcf_sr_add_reader(args->sr,args->fname) )
        error("Error: %s\n", bcf_sr_strerror(args->sr->errnum));
    args->hdr_in  = bcf_sr_get_header(args->sr,0);
    args->hdr_out = bcf_hdr_dup(args->hdr_in);

    if ( args->filter_str )
        args->filter = filter_init(args->hdr_in, args->filter_str);

    mkdir_p("%s/",args->output_dir);

    int i, nsmpl = bcf_hdr_nsamples(args->hdr_in);
    if ( !nsmpl ) error("No samples to split: %s\n", args->fname);
    args->fh = (htsFile**)calloc(nsmpl,sizeof(*args->fh));
    args->bnames = set_file_base_names(args);
    kstring_t str = {0,0,0};
    for (i=0; i<nsmpl; i++)
    {
        if ( !args->bnames[i] ) continue;   // no output requested for this sample

        str.l = 0;
        kputs(args->output_dir, &str);
        // Check the length first: with an empty output_dir there is no last
        // character to inspect and str.s[str.l-1] would read before the buffer.
        if ( !str.l || str.s[str.l-1] != '/' ) kputc('/', &str);

        size_t k, name_beg = str.l;
        kputs(args->bnames[i], &str);
        // <ctype.h> functions require a value representable as unsigned char;
        // a plain (possibly negative) char is undefined behaviour.
        for (k=name_beg; k<str.l; k++)
            if ( isspace((unsigned char)str.s[k]) ) str.s[k] = '_';

        if ( args->output_type & FT_BCF ) kputs(".bcf", &str);
        else if ( args->output_type & FT_GZ ) kputs(".vcf.gz", &str);
        else kputs(".vcf", &str);

        args->fh[i] = hts_open(str.s, hts_bcf_wmode(args->output_type));
        if ( args->fh[i] == NULL ) error("Can't write to \"%s\": %s\n", str.s, strerror(errno));

        // The shared output header is re-targeted to the current sample before each write
        bcf_hdr_nsamples(args->hdr_out) = 1;
        args->hdr_out->samples[0] = args->bnames[i];
        if ( bcf_hdr_write(args->fh[i], args->hdr_out)!=0 )
            error("Failed to write the header to \"%s\"\n", str.s);
    }
    free(str.s);

    // parse tags
    int is_info = 0, is_fmt = 0;
    char *beg = args->keep_tags;
    while ( beg && *beg )
    {
        if ( !strncasecmp("INFO/",beg,5) ) { is_info = 1; is_fmt = 0; beg += 5; }
        else if ( !strcasecmp("INFO",beg) ) { args->keep_info = 1; break; }
        else if ( !strncasecmp("INFO,",beg,5) ) { args->keep_info = 1; beg += 5; continue; }
        else if ( !strncasecmp("FMT/",beg,4) ) { is_info = 0; is_fmt = 1; beg += 4; }
        else if ( !strncasecmp("FORMAT/",beg,7) ) { is_info = 0; is_fmt = 1; beg += 7; }
        else if ( !strcasecmp("FMT",beg) ) { args->keep_fmt = 1; break; }
        else if ( !strcasecmp("FORMAT",beg) ) { args->keep_fmt = 1; break; }
        else if ( !strncasecmp("FMT,",beg,4) ) { args->keep_fmt = 1; beg += 4; continue; }
        else if ( !strncasecmp("FORMAT,",beg,7) ) { args->keep_fmt = 1; beg += 7; continue; }

        // Isolate the tag name: terminate it at the next comma (or end of list)
        char *end = beg;
        while ( *end && *end!=',' ) end++;
        char tmp = *end; *end = 0;
        int id = bcf_hdr_id2int(args->hdr_in, BCF_DT_ID, beg);
        beg = tmp ? end + 1 : end;

        // info_tags / fmt_tags are byte arrays indexed by the header ID; 1 marks a tag to keep
        if ( is_info && bcf_hdr_idinfo_exists(args->hdr_in,BCF_HL_INFO,id) )
        {
            if ( id >= args->ninfo_tags ) args->ninfo_tags = id + 1;
            hts_expand0(uint8_t, args->ninfo_tags, args->minfo_tags, args->info_tags);
            args->info_tags[id] = 1;
        }
        if ( is_fmt && bcf_hdr_idinfo_exists(args->hdr_in,BCF_HL_FMT,id) )
        {
            if ( id >= args->nfmt_tags ) args->nfmt_tags = id + 1;
            hts_expand0(uint8_t, args->nfmt_tags, args->mfmt_tags, args->fmt_tags);
            args->fmt_tags[id] = 1;
        }
    }

    // Nothing was selected: keep everything
    if ( !args->keep_info && !args->keep_fmt && !args->ninfo_tags && !args->nfmt_tags )
    {
        args->keep_info = args->keep_fmt = 1;
    }
}
// Parse filter expression and convert to reverse polish notation. Dijkstra's shunting-yard algorithm filter_t *filter_init(bcf_hdr_t *hdr, const char *str) { filter_t *filter = (filter_t *) calloc(1,sizeof(filter_t)); filter->str = strdup(str); filter->hdr = hdr; int nops = 0, mops = 0, *ops = NULL; // operators stack int nout = 0, mout = 0; // filter tokens, RPN token_t *out = NULL; char *tmp = filter->str; int last_op = -1; while ( *tmp ) { int len, ret; ret = filters_next_token(&tmp, &len); if ( ret==-1 ) error("Missing quotes in: %s\n", str); // fprintf(stderr,"token=[%c] .. [%s] %d\n", "x()[<=>]!|&+-*/Mm"[ret], tmp, len); // int i; for (i=0; i<nops; i++) fprintf(stderr," .%c.", "x()[<=>]!|&+-*/Mm"[ops[i]]); fprintf(stderr,"\n"); if ( ret==TOK_MAX || ret==TOK_MIN || ret==TOK_AVG ) { nout++; hts_expand0(token_t, nout, mout, out); filters_init_func(filter, ret, &tmp, &out[nout-1]); } else if ( ret==TOK_LFT ) // left bracket { nops++; hts_expand(int, nops, mops, ops); ops[nops-1] = ret; } else if ( ret==TOK_RGT ) // right bracket { while ( nops>0 && ops[nops-1]!=TOK_LFT ) { nout++; hts_expand0(token_t, nout, mout, out); out[nout-1].tok_type = ops[nops-1]; nops--; } if ( nops<=0 ) error("Could not parse: %s\n", str); nops--; } else if ( ret!=TOK_VAL ) // one of the operators { // detect unary minus: replace -value with -1*(value) if ( ret==TOK_SUB && last_op!=TOK_VAL && last_op!=TOK_RGT ) { nout++; hts_expand0(token_t, nout, mout, out); token_t *tok = &out[nout-1]; tok->tok_type = TOK_VAL; tok->hdr_id = -1; tok->pass = -1; tok->threshold = -1.0; ret = TOK_MULT; } else { while ( nops>0 && op_prec[ret] < op_prec[ops[nops-1]] ) { nout++; hts_expand0(token_t, nout, mout, out); out[nout-1].tok_type = ops[nops-1]; nops--; } } nops++; hts_expand(int, nops, mops, ops); ops[nops-1] = ret; } else if ( !len ) { if ( *tmp && !isspace(*tmp) ) error("Could not parse the expression: [%s]\n", str); break; // all tokens read } else // annotation name or filtering value { nout++; 
hts_expand0(token_t, nout, mout, out); filters_init1(filter, tmp, len, &out[nout-1]); tmp += len; } last_op = ret; } while ( nops>0 ) { if ( ops[nops-1]==TOK_LFT || ops[nops-1]==TOK_RGT ) error("Could not parse the expression: [%s]\n", filter->str); nout++; hts_expand0(token_t, nout, mout, out); out[nout-1].tok_type = ops[nops-1]; nops--; } // In the special cases of %TYPE and %FILTER the BCF header IDs are yet unknown. Walk through the // list of operators and convert the strings (e.g. "PASS") to BCF ids. The string value token must be // just before or after the %FILTER token and they must be followed with a comparison operator. // This code is fragile: improve me. int i; for (i=0; i<nout; i++) { if ( out[i].tok_type!=TOK_VAL ) continue; if ( !out[i].tag ) continue; if ( !strcmp(out[i].tag,"%TYPE") ) { if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str); int j = i+1; if ( out[j].tok_type==TOK_EQ || out[j].tok_type==TOK_NE ) j = i - 1; if ( out[j].tok_type!=TOK_VAL || !out[j].key ) error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str); if ( !strcasecmp(out[j].key,"snp") || !strcasecmp(out[j].key,"snps") ) out[j].threshold = VCF_SNP; else if ( !strcasecmp(out[j].key,"indel") || !strcasecmp(out[j].key,"indels") ) out[j].threshold = VCF_INDEL; else if ( !strcasecmp(out[j].key,"mnp") || !strcasecmp(out[j].key,"mnps") ) out[j].threshold = VCF_MNP; else if ( !strcasecmp(out[j].key,"other") ) out[j].threshold = VCF_OTHER; else error("The type \"%s\" not recognised: %s\n", out[j].key, filter->str); out[j].tag = out[j].key; out[j].key = NULL; i = j; continue; } if ( !strcmp(out[i].tag,"%FILTER") ) { if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str); int j = i+1; if ( out[j].tok_type==TOK_EQ || out[j].tok_type==TOK_NE ) j = i - 1; if ( out[j].tok_type!=TOK_VAL || !out[j].key ) error("[%s:%d %s] Could not parse the expression, an unquoted string value perhaps? 
%s\n", __FILE__,__LINE__,__FUNCTION__, filter->str); if ( strcmp(".",out[j].key) ) { out[j].hdr_id = bcf_hdr_id2int(filter->hdr, BCF_DT_ID, out[j].key); if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FLT,out[j].hdr_id) ) error("The filter \"%s\" not present in the VCF header\n", out[j].key); } else out[j].hdr_id = -1; out[j].tag = out[j].key; out[j].key = NULL; out[i].hdr_id = out[j].hdr_id; i = j; continue; } } // filter_debug_print(out, nout); if ( mops ) free(ops); filter->filters = out; filter->nfilters = nout; filter->flt_stack = (token_t **)malloc(sizeof(token_t*)*nout); return filter; }
/*
 * Prepare the sorter state `srt` for the records that the active readers hold
 * at position (chr, min_pos).
 *
 * Steps, in order:
 *   1. On the first call, set the pairing logic, initialise the scores and
 *      create the two string->int hashes.
 *   2. Free the keys of both hashes, clear them and reset the group, variant
 *      and variant-set counters.
 *   3. For every reader listed in srt->active, turn each buffered record at
 *      (chr,min_pos) into a variant string, register it in srt->var and assign
 *      the reader to a group identified by the key from grp_create_key().
 *   4. Build the per-variant group bitmasks and one initial variant set per
 *      variant.
 *   5. Repeatedly take the variant set with the highest count and either merge
 *      it with its best-scoring partner or hand it to push_vset().
 *
 * readers  collection of readers; readers->collapse may be changed on the first call
 * srt      sorter state, updated in place
 * chr      sequence name; the pointer itself (not a copy) is stored in srt->chr
 * min_pos  position to process; stored in srt->pos
 */
static void bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, int min_pos)
{
    if ( !srt->grp_str2int )
    {
        // first time here, initialize
        if ( !srt->pair )
        {
            if ( readers->collapse==COLLAPSE_NONE ) readers->collapse = BCF_SR_PAIR_EXACT;
            bcf_sr_set_opt(readers, BCF_SR_PAIR_LOGIC, readers->collapse);
        }
        bcf_sr_init_scores(srt);
        srt->grp_str2int = khash_str2int_init();
        srt->var_str2int = khash_str2int_init();
    }

    // The keys stored in both hashes are heap-allocated strings inserted further
    // below; release them before the tables are cleared.
    int k;
    khash_t(str2int) *hash;
    hash = srt->grp_str2int;
    for (k=0; k < kh_end(hash); k++)
        if ( kh_exist(hash,k) ) free((char*)kh_key(hash,k));
    hash = srt->var_str2int;
    for (k=0; k < kh_end(hash); k++)
        if ( kh_exist(hash,k) ) free((char*)kh_key(hash,k));
    kh_clear(str2int, srt->grp_str2int);
    kh_clear(str2int, srt->var_str2int);
    srt->ngrp = srt->nvar = srt->nvset = 0;

    // Scratch group filled for each reader in turn; its `var` array is handed
    // over to srt->grp when the reader starts a new group.
    grp_t grp;
    memset(&grp,0,sizeof(grp_t));

    // group VCFs into groups, each with a unique combination of variants in the duplicate lines
    int ireader,ivar,irec,igrp,ivset,iact;
    for (ireader=0; ireader<readers->nreaders; ireader++) srt->vcf_buf[ireader].nrec = 0;
    for (iact=0; iact<srt->nactive; iact++)
    {
        ireader = srt->active[iact];
        bcf_sr_t *reader = &readers->readers[ireader];
        int rid   = bcf_hdr_name2id(reader->header, chr);
        grp.nvar  = 0;
        hts_expand(int,reader->nbuffer,srt->moff,srt->off);
        srt->noff  = 0;
        srt->str.l = 0;

        // Records are taken from buffer[1..nbuffer]; stop at the first one that
        // is not on this sequence and position.
        for (irec=1; irec<=reader->nbuffer; irec++)
        {
            bcf1_t *line = reader->buffer[irec];
            if ( line->rid!=rid || line->pos!=min_pos ) break;

            // Append this record's variant string to srt->str, separated from the
            // previous one by ';', and remember where it starts. The string is
            // "REF>ALT1,REF>ALT2,..." or "REF>." for a record with a single allele.
            if ( srt->str.l ) kputc(';',&srt->str);
            srt->off[srt->noff++] = srt->str.l;
            size_t beg = srt->str.l;
            for (ivar=1; ivar<line->n_allele; ivar++)
            {
                if ( ivar>1 ) kputc(',',&srt->str);
                kputs(line->d.allele[0],&srt->str);
                kputc('>',&srt->str);
                kputs(line->d.allele[ivar],&srt->str);
            }
            if ( line->n_allele==1 )
            {
                kputs(line->d.allele[0],&srt->str);
                kputsn(">.",2,&srt->str);
            }

            // Create new variant or attach to existing one. But careful, there can be duplicate
            // records with the same POS,REF,ALT (e.g. in dbSNP-b142)
            //
            // The lookup key is the variant string. If it is already registered and
            // the reader most recently added to that variant is this same reader,
            // the record is a duplicate within one file: a numeric suffix is
            // appended (0,1,2,..) and the lookup is repeated. var_str is re-derived
            // from srt->str.s after each append because the buffer may have moved.
            char *var_str = beg + srt->str.s;
            int ret, var_idx = 0, var_end = srt->str.l;
            while ( 1 )
            {
                ret = khash_str2int_get(srt->var_str2int, var_str, &ivar);
                if ( ret==-1 ) break;   // not registered yet: create a new variant

                var_t *var = &srt->var[ivar];
                if ( var->vcf[var->nvcf-1] != ireader ) break;  // registered by another reader: attach to it

                srt->str.l = var_end;
                kputw(var_idx, &srt->str);
                var_str = beg + srt->str.s;
                var_idx++;
            }
            if ( ret==-1 )
            {
                // The hash key is a copy of the string including any numeric suffix
                ivar = srt->nvar++;
                hts_expand0(var_t,srt->nvar,srt->mvar,srt->var);
                srt->var[ivar].nvcf = 0;
                khash_str2int_set(srt->var_str2int, strdup(var_str), ivar);
                free(srt->var[ivar].str);   // possible left-over from the previous position
            }
            var_t *var = &srt->var[ivar];
            var->nalt = line->n_allele - 1;
            var->type = bcf_get_variant_types(line);
            // Cut the string back to its length before any suffix was appended, so
            // that var->str holds the plain variant string.
            srt->str.s[var_end] = 0;
            if ( ret==-1 )
                var->str = strdup(var_str);

            // Record which reader and which record carry this variant. `rec` is
            // resized to the new capacity whenever hts_expand0 changed mvcf.
            int mvcf = var->mvcf;
            var->nvcf++;
            hts_expand0(int*, var->nvcf, var->mvcf, var->vcf);
            if ( mvcf != var->mvcf ) var->rec = (bcf1_t **) realloc(var->rec,sizeof(bcf1_t*)*var->mvcf);
            var->vcf[var->nvcf-1] = ireader;
            var->rec[var->nvcf-1] = line;

            // Add the variant index to the scratch group of this reader
            grp.nvar++;
            hts_expand(var_t,grp.nvar,grp.mvar,grp.var);
            grp.var[grp.nvar-1] = ivar;
        }

        // Look the reader's group up by its key. A new key creates a new group
        // which takes over the scratch group's contents and the key string (the
        // same pointer is also used as the hash key); the scratch group is then
        // zeroed so that the next reader starts with a fresh array. A known key
        // is freed and only the group's reader count is increased.
        char *grp_key = grp_create_key(srt);
        int ret = khash_str2int_get(srt->grp_str2int, grp_key, &igrp);
        if ( ret==-1 )
        {
            igrp = srt->ngrp++;
            hts_expand0(grp_t, srt->ngrp, srt->mgrp, srt->grp);
            free(srt->grp[igrp].var);   // array left in this slot by an earlier call, if any
            srt->grp[igrp] = grp;
            srt->grp[igrp].key = grp_key;
            khash_str2int_set(srt->grp_str2int, grp_key, igrp);
            memset(&grp,0,sizeof(grp_t));
        }
        else
            free(grp_key);
        srt->grp[igrp].nvcf++;
    }
    free(grp.var);

    // initialize bitmask - which groups is the variant present in
    for (ivar=0; ivar<srt->nvar; ivar++)
    {
        srt->var[ivar].mask = kbs_resize(srt->var[ivar].mask, srt->ngrp);
        kbs_clear(srt->var[ivar].mask);
    }
    for (igrp=0; igrp<srt->ngrp; igrp++)
    {
        for (ivar=0; ivar<srt->grp[igrp].nvar; ivar++)
        {
            int i = srt->grp[igrp].var[ivar];
            kbs_insert(srt->var[i].mask, igrp);
        }
    }

    // create the initial list of variant sets
    // Each variant starts in a set of its own: the set's count is the number of
    // readers carrying the variant and its mask is a copy of the variant's mask.
    for (ivar=0; ivar<srt->nvar; ivar++)
    {
        ivset = srt->nvset++;
        hts_expand0(varset_t, srt->nvset, srt->mvset, srt->vset);

        varset_t *vset = &srt->vset[ivset];
        vset->nvar = 1;
        hts_expand0(var_t, vset->nvar, vset->mvar, vset->var);
        vset->var[vset->nvar-1] = ivar;
        var_t *var  = &srt->var[ivar];
        vset->cnt   = var->nvcf;
        vset->mask  = kbs_resize(vset->mask, srt->ngrp);
        kbs_clear(vset->mask);
        kbs_bitwise_or(vset->mask, var->mask);

        // Replace the VCF_* type bits by SR_* bits. Note that both VCF_SNP and
        // VCF_MNP map to SR_SNP.
        int type = 0;
        if ( var->type==VCF_REF ) type |= SR_REF;
        else
        {
            if ( var->type & VCF_SNP ) type |= SR_SNP;
            if ( var->type & VCF_MNP ) type |= SR_SNP;
            if ( var->type & VCF_INDEL ) type |= SR_INDEL;
            if ( var->type & VCF_OTHER ) type |= SR_OTHER;
        }
        var->type = type;
    }
#if DEBUG_VSETS
    debug_vsets(srt);
#endif

    // initialize the pairing matrix
    // pmat is indexed as [ivset*ngrp + igrp]; cnt mirrors the per-set counts.
    // NOTE(review): the inner loop below zeroes entries the memset has already cleared.
    hts_expand(int, srt->ngrp*srt->nvset, srt->mpmat, srt->pmat);
    hts_expand(int, srt->nvset, srt->mcnt, srt->cnt);
    memset(srt->pmat, 0, sizeof(*srt->pmat)*srt->ngrp*srt->nvset);
    for (ivset=0; ivset<srt->nvset; ivset++)
    {
        varset_t *vset = &srt->vset[ivset];
        for (igrp=0; igrp<srt->ngrp; igrp++) srt->pmat[ivset*srt->ngrp+igrp] = 0;
        srt->cnt[ivset] = vset->cnt;
    }

    // pair the lines
    // NOTE(review): the loop ends only when srt->nvset reaches zero; presumably
    // merge_vsets() and push_vset() each reduce it - confirm in their definitions.
    while ( srt->nvset )
    {
#if DEBUG_VSETS
        fprintf(stderr,"\n");
        debug_vsets(srt);
#endif

        // the set with the highest count; the first one wins on ties
        int imax = 0;
        for (ivset=1; ivset<srt->nvset; ivset++)
            if ( srt->cnt[imax] < srt->cnt[ivset] ) imax = ivset;

        // the partner with the highest score; a zero score never selects a partner
        int ipair = -1;
        uint32_t max_score = 0;
        for (ivset=0; ivset<srt->nvset; ivset++)
        {
            if ( kbs_logical_and(srt->vset[imax].mask,srt->vset[ivset].mask) ) continue;   // cannot be merged
            uint32_t score = pairing_score(srt, imax, ivset);
            // fprintf(stderr,"score: %d %d, logic=%d \t..\t %u\n", imax,ivset,srt->pair,score);
            if ( max_score < score ) { max_score = score; ipair = ivset; }
        }

        // merge rows creating a new variant set this way
        if ( ipair!=-1 && ipair!=imax )
        {
            imax = merge_vsets(srt, imax, ipair);
            continue;
        }

        // no partner found: this set is final
        push_vset(srt, imax);
    }

    srt->chr = chr;
    srt->pos = min_pos;
}