Beispiel #1
0
/*
 *  Decide, from the most recently buffered record, whether the buffer may be
 *  flushed with respect to overlapping records.
 *
 *  Returns 1 when flush_all is set, or when the newest record starts past the
 *  end of the tracked overlap interval. Returns 0 when the newest record is
 *  the only one in the buffer or when it overlaps the tracked interval.
 *
 *  Side effects on buf->overlap:
 *    - rid is set to -1 on flush_all, and to the newest record's rid when that
 *      record is alone in the buffer;
 *    - end is reset to 0 when the newest record's rid differs from the tracked
 *      rid, set to the newest record's end when it is alone in the buffer, and
 *      extended when an overlapping record reaches further.
 */
static int _overlap_can_flush(vcfbuf_t *buf, int flush_all)
{
    if ( flush_all )
    {
        buf->overlap.rid = -1;
        return 1;
    }

    vcfrec_t *newest = &buf->vcf[ rbuf_last(&buf->rbuf) ];

    // The tracked interval belongs to a different rid: discard its end
    if ( newest->rec->rid != buf->overlap.rid ) buf->overlap.end = 0;

    int start = newest->rec->pos;
    int stop  = newest->rec->pos + newest->rec->rlen - 1;

    // Indels are assumed to be left-aligned. For a deletion the real variant
    // starts one base after POS; for an insertion the overlap with the
    // previous record has zero length. Count the leading bases shared
    // (case-insensitively) by REF and every non-symbolic allele. Allele 0 is
    // REF itself, so the count never exceeds the length of the REF string.
    int nshared = newest->rec->rlen;
    int ial;
    for (ial=0; ial<newest->rec->n_allele; ial++)
    {
        char *allele = newest->rec->d.allele[ial];
        if ( *allele == '<' ) continue;     // symbolic alleles are ignored

        char *ref = newest->rec->d.allele[0];
        while ( *ref && *allele && nt_to_upper(*ref)==nt_to_upper(*allele) )
        {
            ref++;
            allele++;
        }
        if ( ref - newest->rec->d.allele[0] < nshared ) nshared = ref - newest->rec->d.allele[0];
    }

    // The record begins inside the tracked interval: skip the shared prefix,
    // and make sure the end never precedes the shifted start
    if ( start <= buf->overlap.end )
    {
        start += nshared;
        if ( stop < start ) stop = start;
    }

    if ( buf->rbuf.n != 1 )
    {
        // No overlap with the tracked interval, flushing is possible
        if ( start > buf->overlap.end ) return 1;

        // Overlap: grow the tracked interval if this record reaches further
        if ( stop > buf->overlap.end ) buf->overlap.end = stop;
        return 0;
    }

    // A lone record starts a new tracked interval
    buf->overlap.rid = newest->rec->rid;
    buf->overlap.end = stop;
    return 0;
}
Beispiel #2
0
/*
 *  Remove the first record from the buffer and return it, or return NULL if
 *  the buffer is empty or not yet ready to release a record.
 *
 *  With flush_all set, a record is released whenever the buffer is non-empty.
 *  Otherwise a record is released when, checked in this order:
 *    - the first and the last buffered record have different rid;
 *    - overlap tracking is active and _overlap_can_flush() allows it;
 *    - buf->win > 0 and more than buf->win records are buffered;
 *    - buf->win < 0 and (first pos - last pos) <= buf->win, i.e. the last
 *      record lies at least -buf->win positions after the first.
 *  With buf->win == 0 and none of the first two conditions met, NULL is
 *  returned.
 *
 *  Before a record is released, _prune_sites() is called if prune.max_sites is
 *  set and the buffer holds more than that many records.
 */
bcf1_t *vcfbuf_flush(vcfbuf_t *buf, int flush_all)
{
    if ( !buf->rbuf.n ) return NULL;

    if ( !flush_all )
    {
        int ifirst = rbuf_kth(&buf->rbuf, 0);
        int ilast  = rbuf_last(&buf->rbuf);
        int ready;

        if ( buf->vcf[ifirst].rec->rid != buf->vcf[ilast].rec->rid )
        {
            ready = 1;
        }
        else if ( buf->overlap.active && _overlap_can_flush(buf, flush_all) )
        {
            // Note the call above updates buf->overlap even when it returns 0
            ready = 1;
        }
        else if ( buf->win > 0 )
        {
            ready = buf->rbuf.n > buf->win;
        }
        else if ( buf->win < 0 )
        {
            ready = buf->vcf[ifirst].rec->pos - buf->vcf[ilast].rec->pos <= buf->win;
        }
        else
        {
            ready = 0;
        }

        if ( !ready ) return NULL;
    }

    if ( buf->prune.max_sites && buf->prune.max_sites < buf->rbuf.n ) _prune_sites(buf, flush_all);

    int ishift = rbuf_shift(&buf->rbuf);
    return buf->vcf[ishift].rec;
}
Beispiel #3
0
static void buffered_filters(args_t *args, bcf1_t *line)
{
    /**
     *  The logic of SnpGap=3. The SNPs at positions 1 and 7 are filtered,
     *  positions 0 and 8 are not:
     *           0123456789
     *      ref  .G.GT..G..
     *      del  .A.G-..A..
     *  Here the positions 1 and 6 are filtered, 0 and 7 are not:
     *           0123-456789
     *      ref  .G.G-..G..
     *      ins  .A.GT..A..
     *
     *  The logic of IndelGap=2. The second indel is filtered:
     *           012345678901
     *      ref  .GT.GT..GT..
     *      del  .G-.G-..G-..
     *  And similarly here, the second is filtered:
     *           01 23 456 78
     *      ref  .A-.A-..A-..
     *      ins  .AT.AT..AT..
     */

    // To avoid additional data structure, we abuse bcf1_t's var and var_type records.
    const int SnpGap_set     = VCF_OTHER<<1;
    const int IndelGap_set   = VCF_OTHER<<2;
    const int IndelGap_flush = VCF_OTHER<<3;

    int var_type = 0, i;
    if ( line )
    {
        // Still on the same chromosome?
        int ilast = rbuf_last(&args->rbuf);
        if ( ilast>=0 && line->rid != args->rbuf_lines[ilast]->rid )
            flush_buffer(args, args->rbuf.n); // new chromosome, flush everything

        rbuf_expand0(&args->rbuf,bcf1_t*,args->rbuf.n,args->rbuf_lines);

        // Insert the new record in the buffer. The line would be overwritten in
        // the next bcf_sr_next_line call, therefore we need to swap it with an
        // unused one
        ilast = rbuf_append(&args->rbuf);
        if ( !args->rbuf_lines[ilast] ) args->rbuf_lines[ilast] = bcf_init1();
        SWAP(bcf1_t*, args->files->readers[0].buffer[0], args->rbuf_lines[ilast]);

        var_type = bcf_get_variant_types(line);

        // Find out the size of an indel. The indel boundaries are based on REF
        // (POS+1,POS+rlen-1). This is not entirely correct: mpileup likes to
        // output REF=CAGAGAGAGA, ALT=CAGAGAGAGAGA where REF=C,ALT=CGA could be
        // used. This filter is therefore more strict and may remove some valid
        // SNPs.
        int len = 1;
        if ( var_type & VCF_INDEL )
        {
            for (i=1; i<line->n_allele; i++)
                if ( len < 1-line->d.var[i].n ) len = 1-line->d.var[i].n;
        }

        // Set the REF allele's length to max deletion length or to 1 if a SNP or an insertion.
        line->d.var[0].n = len;
    }

    int k_flush = 1;
    if ( args->indel_gap )
    {
        k_flush = 0;
        // Find indels which are too close to each other
        int last_to = -1;
        for (i=-1; rbuf_next(&args->rbuf,&i); )
        {
            bcf1_t *rec  = args->rbuf_lines[i];
            int rec_from = rec->pos;
            if ( last_to!=-1 && last_to < rec_from ) break;

            k_flush++;
            if ( !(rec->d.var_type & VCF_INDEL) ) continue;

            rec->d.var_type |= IndelGap_set;
            last_to = args->indel_gap + rec->pos + rec->d.var[0].n - 1;
        }
        if ( i==args->rbuf.f && line && last_to!=-1 ) k_flush = 0;
        if ( k_flush || !line )
        {
            // Select the best indel from the cluster of k_flush indels
            int k = 0, max_ac = -1, imax_ac = -1;
            for (i=-1; rbuf_next(&args->rbuf,&i) && k<k_flush; )
            {
                k++;
                bcf1_t *rec  = args->rbuf_lines[i];
                if ( !(rec->d.var_type & IndelGap_set) ) continue;
                hts_expand(int, rec->n_allele, args->ntmpi, args->tmpi);
                int ret = bcf_calc_ac(args->hdr, rec, args->tmpi, BCF_UN_ALL);
                if ( imax_ac==-1 || (ret && max_ac < args->tmpi[1]) ) { max_ac = args->tmpi[1]; imax_ac = i; }
            }

            // Filter all but the best indel (with max AF or first if AF not available)
            k = 0;
            for (i=-1; rbuf_next(&args->rbuf,&i) && k<k_flush; )
            {
                k++;
                bcf1_t *rec = args->rbuf_lines[i];
                if ( !(rec->d.var_type & IndelGap_set) ) continue;
                rec->d.var_type |= IndelGap_flush;
                if ( i!=imax_ac ) bcf_add_filter(args->hdr, args->rbuf_lines[i], args->IndelGap_id);
            }
        }