/**
 *  vcfbuf_max_ld() - find the buffered site in strongest LD with @rec
 *  @buf:  buffer of previously seen sites
 *  @rec:  record to compare against every buffered site
 *  @ld:   output; the LD value of the returned site, or -1 when NULL is
 *         returned
 *
 *  Walks the ring buffer and evaluates _calc_ld() for each site. If
 *  buf->ld.max is non-zero, the first site whose value exceeds it is returned
 *  immediately; otherwise the site with the largest value is returned.
 *  When buf->ld.skip_filter is set, sites with a FILTER other than PASS are
 *  not considered.
 *
 *  Returns NULL when the buffer is empty, when the buffer holds sites from
 *  a different chromosome than @rec, or when no buffered site was eligible
 *  (all skipped, or none produced a value greater than -1).
 */
bcf1_t *vcfbuf_max_ld(vcfbuf_t *buf, bcf1_t *rec, double *ld)
{
    *ld = -1;
    if ( !buf->rbuf.n ) return NULL;

    int i = buf->rbuf.f;

    // Relying on vcfbuf being properly flushed - all sites in the buffer
    // must come from the same chromosome
    if ( buf->vcf[i].rec->rid != rec->rid ) return NULL;

    // imax stays negative until a site is actually selected. Defaulting to 0
    // would hand back buf->vcf[0], which need not be a live ring-buffer slot.
    int imax = -1;
    double max = -1;    // same "no value" sentinel as the initial *ld
    for (i=-1; rbuf_next(&buf->rbuf,&i); )
    {
        if ( buf->ld.skip_filter )
        {
            if ( buf->vcf[i].rec->d.n_flt > 1 ) continue;   // multiple filters are set
            if ( buf->vcf[i].rec->d.n_flt==1 && buf->vcf[i].rec->d.flt[0]!=0 ) continue;    // not PASS
        }

        double val = _calc_ld(buf, buf->vcf[i].rec, rec);

        // Threshold mode: the first site above the limit wins, no need to look further
        if ( buf->ld.max && buf->ld.max < val )
        {
            *ld = val;
            return buf->vcf[i].rec;
        }
        if ( val > max )
        {
            max  = val;
            imax = i;
        }
    }

    // Nothing eligible in the buffer: report it the same way as an empty buffer
    if ( imax < 0 ) return NULL;

    *ld = max;
    return buf->vcf[imax].rec;
}
/*
 *  _prune_sites() - remove surplus sites from the buffer, lowest AF first
 *  @buf:        buffer to prune
 *  @flush_all:  when non-zero all buffered sites are candidates; otherwise
 *               the last site in iteration order is excluded
 *
 *  Each candidate gets an allele frequency (cached in vcfrec_t.af): the first
 *  value of the INFO tag buf->prune.af_tag when a tag is configured, otherwise
 *  the ALT allele fraction derived from bcf_calc_ac(). Candidates are then
 *  sorted with cmpvrec and the first (nbuf - buf->prune.max_sites) of them
 *  are removed from the ring buffer. Nothing is removed when the number of
 *  candidates does not exceed buf->prune.max_sites.
 *
 *  NOTE(review): the realloc() results are used unchecked, as in the rest of
 *  this function's history; an allocation failure is not handled here.
 */
static void _prune_sites(vcfbuf_t *buf, int flush_all)
{
    int nbuf = flush_all ? buf->rbuf.n : buf->rbuf.n - 1;

    if ( nbuf > buf->prune.mvrec )
    {
        buf->prune.idx   = (int*) realloc(buf->prune.idx, nbuf*sizeof(int));
        buf->prune.vrec  = (vcfrec_t**) realloc(buf->prune.vrec, nbuf*sizeof(vcfrec_t*));
        buf->prune.mvrec = nbuf;
    }

    // set allele frequency and prepare buffer for sorting
    int i,k,irec = 0;
    for (i=-1; rbuf_next(&buf->rbuf,&i) && irec<nbuf; )
    {
        bcf1_t *line = buf->vcf[i].rec;
        if ( line->n_allele > buf->prune.mac )
        {
            buf->prune.ac = (int*) realloc(buf->prune.ac, line->n_allele*sizeof(*buf->prune.ac));
            buf->prune.mac = line->n_allele;
        }
        if ( !buf->vcf[i].af_set )
        {
            buf->vcf[i].af = 0;
            if ( buf->prune.af_tag )
            {
                if ( bcf_get_info_float(buf->hdr,line,buf->prune.af_tag,&buf->prune.farr, &buf->prune.mfarr) > 0 )
                    buf->vcf[i].af = buf->prune.farr[0];
            }
            else if ( bcf_calc_ac(buf->hdr, line, buf->prune.ac, BCF_UN_INFO|BCF_UN_FMT) )
            {
                // bcf_calc_ac() stores the REF count in ac[0] and the ALT counts
                // in ac[1..]; the denominator must be the sum of all of them,
                // otherwise a site with no REF alleles ends up with AF=0
                int nref = buf->prune.ac[0], nalt = 0;
                for (k=1; k<line->n_allele; k++) nalt += buf->prune.ac[k];
                int ntot = nref + nalt;
                buf->vcf[i].af = ntot ? (float)nalt/ntot : 0;
            }
            buf->vcf[i].af_set = 1;
        }
        // remember the position within the ring buffer for rbuf_remove_kth below
        buf->vcf[i].idx = irec;
        buf->prune.vrec[irec++] = &buf->vcf[i];
    }

    // sort by allele frequency, low AF will be removed preferentially
    qsort(buf->prune.vrec, nbuf, sizeof(*buf->prune.vrec), cmpvrec);

    // A non-positive count must never reach qsort(): it would be converted
    // to a huge size_t element count
    int nprune = nbuf - buf->prune.max_sites;
    if ( nprune <= 0 ) return;

    // sort the rbuf indexes to be pruned in descending order so that the j-th
    // rbuf index is removed before the i-th index if i<j
    for (i=0; i<nprune; i++)
        buf->prune.idx[i] = buf->prune.vrec[i]->idx;

    qsort(buf->prune.idx, nprune, sizeof(int), cmpint_desc);

    for (i=0; i<nprune; i++)
        rbuf_remove_kth(&buf->rbuf, vcfrec_t, buf->prune.idx[i], buf->vcf);
}
static void buffered_filters(args_t *args, bcf1_t *line) { /** * The logic of SnpGap=3. The SNPs at positions 1 and 7 are filtered, * positions 0 and 8 are not: * 0123456789 * ref .G.GT..G.. * del .A.G-..A.. * Here the positions 1 and 6 are filtered, 0 and 7 are not: * 0123-456789 * ref .G.G-..G.. * ins .A.GT..A.. * * The logic of IndelGap=2. The second indel is filtered: * 012345678901 * ref .GT.GT..GT.. * del .G-.G-..G-.. * And similarly here, the second is filtered: * 01 23 456 78 * ref .A-.A-..A-.. * ins .AT.AT..AT.. */ // To avoid additional data structure, we abuse bcf1_t's var and var_type records. const int SnpGap_set = VCF_OTHER<<1; const int IndelGap_set = VCF_OTHER<<2; const int IndelGap_flush = VCF_OTHER<<3; int var_type = 0, i; if ( line ) { // Still on the same chromosome? int ilast = rbuf_last(&args->rbuf); if ( ilast>=0 && line->rid != args->rbuf_lines[ilast]->rid ) flush_buffer(args, args->rbuf.n); // new chromosome, flush everything rbuf_expand0(&args->rbuf,bcf1_t*,args->rbuf.n,args->rbuf_lines); // Insert the new record in the buffer. The line would be overwritten in // the next bcf_sr_next_line call, therefore we need to swap it with an // unused one ilast = rbuf_append(&args->rbuf); if ( !args->rbuf_lines[ilast] ) args->rbuf_lines[ilast] = bcf_init1(); SWAP(bcf1_t*, args->files->readers[0].buffer[0], args->rbuf_lines[ilast]); var_type = bcf_get_variant_types(line); // Find out the size of an indel. The indel boundaries are based on REF // (POS+1,POS+rlen-1). This is not entirely correct: mpileup likes to // output REF=CAGAGAGAGA, ALT=CAGAGAGAGAGA where REF=C,ALT=CGA could be // used. This filter is therefore more strict and may remove some valid // SNPs. int len = 1; if ( var_type & VCF_INDEL ) { for (i=1; i<line->n_allele; i++) if ( len < 1-line->d.var[i].n ) len = 1-line->d.var[i].n; } // Set the REF allele's length to max deletion length or to 1 if a SNP or an insertion. 
line->d.var[0].n = len; } int k_flush = 1; if ( args->indel_gap ) { k_flush = 0; // Find indels which are too close to each other int last_to = -1; for (i=-1; rbuf_next(&args->rbuf,&i); ) { bcf1_t *rec = args->rbuf_lines[i]; int rec_from = rec->pos; if ( last_to!=-1 && last_to < rec_from ) break; k_flush++; if ( !(rec->d.var_type & VCF_INDEL) ) continue; rec->d.var_type |= IndelGap_set; last_to = args->indel_gap + rec->pos + rec->d.var[0].n - 1; } if ( i==args->rbuf.f && line && last_to!=-1 ) k_flush = 0; if ( k_flush || !line ) { // Select the best indel from the cluster of k_flush indels int k = 0, max_ac = -1, imax_ac = -1; for (i=-1; rbuf_next(&args->rbuf,&i) && k<k_flush; ) { k++; bcf1_t *rec = args->rbuf_lines[i]; if ( !(rec->d.var_type & IndelGap_set) ) continue; hts_expand(int, rec->n_allele, args->ntmpi, args->tmpi); int ret = bcf_calc_ac(args->hdr, rec, args->tmpi, BCF_UN_ALL); if ( imax_ac==-1 || (ret && max_ac < args->tmpi[1]) ) { max_ac = args->tmpi[1]; imax_ac = i; } } // Filter all but the best indel (with max AF or first if AF not available) k = 0; for (i=-1; rbuf_next(&args->rbuf,&i) && k<k_flush; ) { k++; bcf1_t *rec = args->rbuf_lines[i]; if ( !(rec->d.var_type & IndelGap_set) ) continue; rec->d.var_type |= IndelGap_flush; if ( i!=imax_ac ) bcf_add_filter(args->hdr, args->rbuf_lines[i], args->IndelGap_id); } }