Exemple #1
0
/*
 * fake_PLs() - synthesize PL values from GT when PL is not available.
 *
 * For every sample of @line, args->pl_arr receives
 * npl = n_allele*(n_allele+1)/2 values: 0 at the index of the called
 * genotype and fake_PL everywhere else (args->no_PLs when non-zero,
 * otherwise 99). Samples whose genotype cannot be mapped to a diploid
 * genotype index (a missing allele, a haploid call, or an allele index
 * outside the record's alleles) get -1 in every slot.
 *
 * Calls error() when GT is absent from the record. Returns npl.
 */
static int fake_PLs(args_t *args, bcf_hdr_t *hdr, bcf1_t *line)
{
    // PLs not present, use GTs instead.
    int fake_PL = args->no_PLs ? args->no_PLs : 99;    // with 1, discordance is the number of non-matching GTs
    int nsmpl = bcf_hdr_nsamples(hdr);
    int nsm_gt, i, j;
    if ( (nsm_gt=bcf_get_genotypes(hdr, line, &args->tmp_arr, &args->ntmp_arr)) <= 0 )
        error("GT not present at %s:%d?\n", hdr->id[BCF_DT_CTG][line->rid].key, line->pos+1);
    nsm_gt /= nsmpl;    // number of GT values stored per sample
    int npl = line->n_allele*(line->n_allele+1)/2;
    hts_expand(int,npl*nsmpl,args->npl_arr,args->pl_arr);
    for (i=0; i<nsmpl; i++)
    {
        int *gt_ptr = args->tmp_arr + i*nsm_gt;
        int *pl_ptr = args->pl_arr + i*npl;
        int idx = -1;   // genotype index, stays negative when the call is unusable

        // gt_ptr[1] exists only when at least two values are stored per sample,
        // and it holds bcf_int32_vector_end for haploid calls at mixed-ploidy sites
        if ( nsm_gt>=2 && gt_ptr[1]!=bcf_int32_vector_end
             && !bcf_gt_is_missing(gt_ptr[0]) && !bcf_gt_is_missing(gt_ptr[1]) )
        {
            int a = bcf_gt_allele(gt_ptr[0]);
            int b = bcf_gt_allele(gt_ptr[1]);
            // alleles outside of the record would index past the npl slots
            if ( a>=0 && b>=0 && a<line->n_allele && b<line->n_allele )
                idx = bcf_alleles2gt(a,b);
        }

        if ( idx<0 || idx>=npl )
        {
            for (j=0; j<npl; j++) pl_ptr[j] = -1;
            continue;
        }
        for (j=0; j<npl; j++) pl_ptr[j] = fake_PL;
        pl_ptr[idx] = 0;
    }
    return npl;
}
Exemple #2
0
/*
 * estimate_AF() - estimate the ALT allele frequency of a site from genotypes.
 *
 * Genotypes are read into args->itmp unless args->nitmp is already non-zero,
 * in which case the values already in args->itmp are reused with args->nitmp
 * as the per-sample stride. Samples with a missing allele or with a haploid
 * call (second value is bcf_int32_vector_end) are ignored. Any non-reference
 * allele counts as ALT.
 *
 * Returns 0 and stores the frequency in *alt_freq on success; returns -1 when
 * the genotypes are not diploid or when no usable genotype was found.
 */
int estimate_AF(args_t *args, bcf1_t *line, double *alt_freq)
{
    if ( !args->nitmp )
    {
        args->nitmp = bcf_get_genotypes(args->hdr, line, &args->itmp, &args->mitmp);
        // the <=0 test also protects the division below when there are no samples
        if ( args->nitmp <= 0 || args->nitmp != 2*args->nsmpl ) return -1;     // not diploid?
        args->nitmp /= args->nsmpl;
    }

    int i, nalt = 0, nref = 0;
    for (i=0; i<args->nsmpl; i++)
    {
        int32_t *gt = &args->itmp[i*args->nitmp];

        if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) continue;

        // a haploid call carries no second allele; without this test the
        // vector-end marker would be decoded as a non-reference allele
        if ( gt[1]==bcf_int32_vector_end ) continue;

        if ( bcf_gt_allele(gt[0]) ) nalt++;
        else nref++;

        if ( bcf_gt_allele(gt[1]) ) nalt++;
        else nref++;
    }
    if ( !nalt && !nref ) return -1;

    *alt_freq = (double)nalt / (nalt + nref);
    return 0;
}
Exemple #3
0
/*
 * process_GT() - pairwise genotype comparison of all samples at one site.
 *
 * Sample pairs (hi,lo) with lo<hi are enumerated in the order
 * (1,0), (2,0), (2,1), (3,0), ... and each pair owns one slot in ntot[] and
 * ndif[]. For every pair where both genotypes are complete diploid calls,
 * ntot is incremented, and ndif as well when the unordered allele sets differ.
 *
 * Returns 0 on success, 1 when GT is not present, 2 when the site is not
 * diploid.
 */
int process_GT(args_t *args, bcf1_t *line, uint32_t *ntot, uint32_t *ndif)
{
    int nvals = bcf_get_genotypes(args->sm_hdr, line, &args->tmp_arr, &args->ntmp_arr);
    if ( nvals<=0 ) return 1;
    if ( nvals!=args->nsmpl*2 ) return 2;

    const int stride = nvals / args->nsmpl;     // values per sample
    int hi, lo, pair = 0;                       // pair: running slot in ntot/ndif

    for (hi=1; hi<args->nsmpl; hi++)
    {
        int32_t *gt_hi = args->tmp_arr + hi*stride;
        if ( bcf_gt_is_missing(gt_hi[0]) || bcf_gt_is_missing(gt_hi[1]) || gt_hi[1]==bcf_int32_vector_end )
        {
            // unusable genotype: jump over all 'hi' pairs of this sample
            pair += hi;
            continue;
        }
        int mask_hi = (1<<bcf_gt_allele(gt_hi[0])) | (1<<bcf_gt_allele(gt_hi[1]));

        // the slot advances for every partner, compared or not
        for (lo=0; lo<hi; lo++, pair++)
        {
            int32_t *gt_lo = args->tmp_arr + lo*stride;
            if ( bcf_gt_is_missing(gt_lo[0]) || bcf_gt_is_missing(gt_lo[1]) || gt_lo[1]==bcf_int32_vector_end )
                continue;
            int mask_lo = (1<<bcf_gt_allele(gt_lo[0])) | (1<<bcf_gt_allele(gt_lo[1]));

            ntot[pair]++;
            if ( mask_hi!=mask_lo ) ndif[pair]++;
        }
    }
    return 0;
}
Exemple #4
0
/*
 * set_observed_prob_unrelated() - append one site's per-state probabilities
 * for the two-sample ("unrelated") model.
 *
 * The record is used only when it holds exactly four GT values and both
 * samples (args.isample, args.jsample) have complete genotypes with a phase
 * bit set on at least one allele. For an accepted record the position is
 * appended to args.sites and one row of args.nstates values is filled in
 * args.eprob, indexed by the UNRL_* constants.
 */
static void set_observed_prob_unrelated(bcf1_t *rec)
{
    const float af = 0.5;  // alternate allele frequency

    int ngt = bcf_get_genotypes(args.hdr, rec, &args.gt_arr, &args.ngt_arr);
    if ( ngt<0 || ngt!=4 ) return;   // GT absent, or not two diploid samples (chrX)

    // i0,i1: alleles of the first sample; j0,j1: alleles of the second sample
    int32_t i0 = args.gt_arr[2*args.isample];
    int32_t i1 = args.gt_arr[2*args.isample+1];
    int32_t j0 = args.gt_arr[2*args.jsample];
    int32_t j1 = args.gt_arr[2*args.jsample+1];

    if ( bcf_gt_is_missing(i0) || bcf_gt_is_missing(i1) || bcf_gt_is_missing(j0) || bcf_gt_is_missing(j1) )
        return;

    // only the second allele should be set when phased
    if ( !(bcf_gt_is_phased(i0) || bcf_gt_is_phased(i1)) ) return;
    if ( !(bcf_gt_is_phased(j0) || bcf_gt_is_phased(j1)) ) return;

    i0 = bcf_gt_allele(i0);
    i1 = bcf_gt_allele(i1);
    j0 = bcf_gt_allele(j0);
    j1 = bcf_gt_allele(j1);

    // grow args.sites; args.eprob follows whenever the capacity changed
    int prev_msites = args.msites;
    args.nsites++;
    hts_expand(uint32_t,args.nsites,args.msites,args.sites);
    if ( prev_msites!=args.msites )
        args.eprob = (double*) realloc(args.eprob, sizeof(double)*args.msites*args.nstates);

    args.sites[args.nsites-1] = rec->pos;

    double *row = args.eprob + args.nstates*(args.nsites-1);
    row[UNRL_xxxx] = prob_not_shared(af,i0,j0) * prob_not_shared(af,i0,j1) * prob_not_shared(af,i1,j0) * prob_not_shared(af,i1,j1);
    row[UNRL_0x0x] = prob_shared(af,i0,j0) * prob_not_shared(af,i1,j1);
    row[UNRL_0xx0] = prob_shared(af,i0,j1) * prob_not_shared(af,i1,j0);
    row[UNRL_x00x] = prob_shared(af,i1,j0) * prob_not_shared(af,i0,j1);
    row[UNRL_x0x0] = prob_shared(af,i1,j1) * prob_not_shared(af,i0,j0);
    row[UNRL_0101] = prob_shared(af,i0,j0) * prob_shared(af,i1,j1);
    row[UNRL_0110] = prob_shared(af,i0,j1) * prob_shared(af,i1,j0);

#if 0
    // Debugging output, disabled
    static int x = 0;
    if ( !x++)
    {
        printf("p(0==0) .. %f\n", prob_shared(af,0,0));
        printf("p(0!=0) .. %f\n", prob_not_shared(af,0,0));
        printf("p(0==1) .. %f\n", prob_shared(af,0,1));
        printf("p(0!=1) .. %f\n", prob_not_shared(af,0,1));
    }
    printf("%d|%d %d|%d  x:%f 11:%f 12:%f 21:%f 22:%f 11,22:%f 12,21:%f  %d\n", i0,i1,j0,j1,
            row[UNRL_xxxx], row[UNRL_0x0x], row[UNRL_0xx0], row[UNRL_x00x], row[UNRL_x0x0], row[UNRL_0101], row[UNRL_0110], rec->pos+1);
#endif
}
Exemple #5
0
/*
 * process_region_precise() - sample ploidy observations from one region and
 * update the per-sample sex probabilities.
 *
 * Up to args->nsites positions evenly spaced over the region itr->reg[itr->i]
 * are visited by seeking in args->sr. At each visited record the number of
 * leading non-missing GT values of every sample is taken as the observed
 * ploidy and tallied in args->counts. Finally, for each sample with at least
 * one observation, args->sex2prob is multiplied by the fraction of
 * observations that match the ploidy of each sex (args->sex2ploidy).
 *
 * Returns the number of consecutive regions, starting at itr->i, that share
 * the same start and end. Returns early, without updating the probabilities,
 * when a seek fails or no record follows.
 */
int process_region_precise(args_t *args, char *seq, regitr_t *itr)
{
    int k = 1;
    uint32_t start = itr->reg[itr->i].start, end = itr->reg[itr->i].end;
    while ( itr->i+k<itr->n && start==itr->reg[itr->i+k].start && end==itr->reg[itr->i+k].end ) k++;

    int ret = ploidy_query(args->ploidy, seq, start, args->sex2ploidy, NULL, NULL);
    assert(ret);
    (void) ret;     // silence the unused-variable warning in NDEBUG builds

    memset(args->counts,0,args->ncounts*sizeof(int));

    // Select 'nsites' sites spaced so that they evenly cover the whole region
    // to get a representative sample. We index-jump as we should be checking
    // a few sites only.
    //
    // rid remembers the contig of the first record read; it must not be reset
    // inside the loop, otherwise the contig test below can never fire.
    int i, rid = -1, pos, prev_pos = -1, ismpl;
    for (i=0; i<args->nsites; i++)
    {
        pos = ((i+1.0)/(args->nsites+1))*(end - start) + start;
        if ( i>0 && pos <= prev_pos ) continue;     // the vcf is too sparse
        if ( bcf_sr_seek(args->sr,seq,pos)!=0 ) return k;   // sequence not present
        if ( !bcf_sr_next_line(args->sr) ) return k;        // no sites found
        bcf1_t *rec = bcf_sr_get_line(args->sr,0);
        if ( rid==-1 ) rid = rec->rid;
        if ( rid!=rec->rid || rec->pos > end ) break;
        prev_pos = rec->pos;

        int ngts = bcf_get_genotypes(args->hdr,rec,&args->gts,&args->ngts);
        if ( ngts<=0 ) continue;    // no GT at this site: not an observation of ploidy 0
        ngts /= args->nsample;
        for (ismpl=0; ismpl<args->nsample; ismpl++)
        {
            int32_t *gts = args->gts + ngts*ismpl;
            int igt, ploidy = 0;
            for (igt=0; igt<ngts; igt++)
            {
                if ( gts[igt]==bcf_int32_vector_end || bcf_gt_is_missing(gts[igt]) ) break;
                else ploidy++;
            }
            // each sample owns max_ploidy+1 counters; a higher ploidy would
            // spill into the next sample's counters or past the array
            if ( ploidy <= args->max_ploidy )
                args->counts[ismpl*(args->max_ploidy+1) + ploidy]++;
            if ( args->verbose )
                fprintf(stderr,"%s:%d\t%s\tploidy=%d\n", seq,rec->pos+1,args->hdr->samples[ismpl],ploidy);
        }
    }

    for (ismpl=0; ismpl<args->nsample; ismpl++)
    {
        float sum = 0, *probs = args->sex2prob + ismpl*args->nsex;
        int *counts = args->counts + ismpl*(args->max_ploidy+1);
        for (i=0; i<args->max_ploidy+1; i++) sum += counts[i];
        if ( !sum ) continue;   // no observations for this sample
        for (i=0; i<args->nsex; i++)
        {
            int ploidy = args->sex2ploidy[i];
            probs[i] *= counts[ploidy]/sum;
        }
    }

    return k;
}
Exemple #6
0
/*
 * phase_update() - swap the two alleles of every sample flagged in
 * args->swap_phase and write the genotypes back into the record.
 *
 * After the swap the phase bit is set on the second allele. Samples with a
 * missing first allele or with a haploid call are left as they are. Records
 * without GT, or whose GT layout is not exactly two values per sample, are
 * left untouched.
 */
static void phase_update(args_t *args, bcf_hdr_t *hdr, bcf1_t *rec)
{
    int i, nsmpl = bcf_hdr_nsamples(hdr);
    int nGTs = bcf_get_genotypes(hdr, rec, &args->GTa, &args->mGTa);
    if ( nGTs <= 0 ) return;            // GT field is not present
    if ( nGTs != 2*nsmpl ) return;      // the indexing below assumes two values per sample

    for (i=0; i<nsmpl; i++)
    {
        if ( !args->swap_phase[i] ) continue;
        int *gt = &args->GTa[i*2];
        if ( bcf_gt_is_missing(gt[0]) || gt[1]==bcf_int32_vector_end ) continue;
        SWAP(int, gt[0], gt[1]);
        gt[1] |= 1;     // lowest bit of a GT value is the phase flag
    }
    bcf_update_genotypes(hdr,rec,args->GTa,nGTs);
}
Exemple #7
0
/*
 * set_observed_prob_trio() - append one site's per-state probabilities for
 * the mother/father/child model.
 *
 * The record is used only when it holds exactly six GT values, all three
 * genotypes are complete with a phase bit set on at least one allele, and the
 * child shares at least one allele with each parent. For an accepted record
 * the heterozygous-parent counters are updated, the position is appended to
 * args.sites and one row of args.nstates values is filled in args.eprob,
 * indexed by the TRIO_* constants.
 */
static void set_observed_prob_trio(bcf1_t *rec)
{
    int ngt = bcf_get_genotypes(args.hdr, rec, &args.gt_arr, &args.ngt_arr);
    if ( ngt<0 || ngt!=6 ) return;   // GT absent, or not three diploid samples (chrX)

    // m0,m1: mother; p0,p1: father; k0,k1: child
    int32_t m0 = args.gt_arr[2*args.imother];
    int32_t m1 = args.gt_arr[2*args.imother+1];
    int32_t p0 = args.gt_arr[2*args.ifather];
    int32_t p1 = args.gt_arr[2*args.ifather+1];
    int32_t k0 = args.gt_arr[2*args.ichild];
    int32_t k1 = args.gt_arr[2*args.ichild+1];

    if ( bcf_gt_is_missing(m0) || bcf_gt_is_missing(m1) ) return;
    if ( bcf_gt_is_missing(p0) || bcf_gt_is_missing(p1) ) return;
    if ( bcf_gt_is_missing(k0) || bcf_gt_is_missing(k1) ) return;

    // only the second allele should be set when phased
    if ( !(bcf_gt_is_phased(m0) || bcf_gt_is_phased(m1)) ) return;
    if ( !(bcf_gt_is_phased(p0) || bcf_gt_is_phased(p1)) ) return;
    if ( !(bcf_gt_is_phased(k0) || bcf_gt_is_phased(k1)) ) return;

    m0 = bcf_gt_allele(m0);
    m1 = bcf_gt_allele(m1);
    p0 = bcf_gt_allele(p0);
    p1 = bcf_gt_allele(p1);
    k0 = bcf_gt_allele(k0);
    k1 = bcf_gt_allele(k1);

    // allele sets as bit masks
    int mother_set = (1<<m0) | (1<<m1);
    int father_set = (1<<p0) | (1<<p1);
    int child_set  = (1<<k0) | (1<<k1);

    // Mendelian-inconsistent site, skip
    if ( (mother_set&child_set)==0 || (father_set&child_set)==0 ) return;

    if ( m0!=m1 ) args.nhet_mother++;
    if ( p0!=p1 ) args.nhet_father++;

    // grow args.sites; args.eprob follows whenever the capacity changed
    int prev_msites = args.msites;
    args.nsites++;
    hts_expand(uint32_t,args.nsites,args.msites,args.sites);
    if ( prev_msites!=args.msites )
        args.eprob = (double*) realloc(args.eprob, sizeof(double)*args.msites*args.nstates);

    args.sites[args.nsites-1] = rec->pos;

    double *row = args.eprob + args.nstates*(args.nsites-1);
    row[TRIO_AC] = prob_shared(0,k0,m0) * prob_shared(0,k1,p0);
    row[TRIO_AD] = prob_shared(0,k0,m0) * prob_shared(0,k1,p1);
    row[TRIO_BC] = prob_shared(0,k0,m1) * prob_shared(0,k1,p0);
    row[TRIO_BD] = prob_shared(0,k0,m1) * prob_shared(0,k1,p1);
    row[TRIO_CA] = prob_shared(0,k0,p0) * prob_shared(0,k1,m0);
    row[TRIO_DA] = prob_shared(0,k0,p1) * prob_shared(0,k1,m0);
    row[TRIO_CB] = prob_shared(0,k0,p0) * prob_shared(0,k1,m1);
    row[TRIO_DB] = prob_shared(0,k0,p1) * prob_shared(0,k1,m1);
}
Exemple #8
0
/*
 * phased_flush() - write out the buffered records of two readers and work
 * out the relative phase between them.
 *
 * args->buf is walked in pairs: buf[i] is read with the header of reader 0
 * ("a"), buf[i+1] with the header of reader 1 ("b").
 *   1) For every pair and every sample with phased heterozygous diploid
 *      genotypes in both records, nmatch/nmism count whether the two records
 *      agree on the phase, taking the current swap_phase setting into account.
 *   2) The "a" records of the leading pairs (i < nbuf/2) are written.
 *   3) swap_phase and phase_qual are recomputed per sample from the counters,
 *      which are then reset.
 *   4) The "b" records of the remaining pairs are written; the first one
 *      carries PQ and may start a new phase set.
 * NOTE(review): the buf[i+1] accesses presumably rely on nbuf being even --
 * confirm against the code that fills the buffer.
 */
static void phased_flush(args_t *args)
{
    if ( !args->nbuf ) return;

    bcf_hdr_t *ahdr = args->files->readers[0].header;
    bcf_hdr_t *bhdr = args->files->readers[1].header;

    int i, j, nsmpl = bcf_hdr_nsamples(args->out_hdr);
    static int gt_absent_warned = 0;

    // Step 1: count phase agreement between the paired records
    for (i=0; i<args->nbuf; i+=2)
    {
        bcf1_t *arec = args->buf[i];
        bcf1_t *brec = args->buf[i+1];

        int nGTs = bcf_get_genotypes(ahdr, arec, &args->GTa, &args->mGTa);
        if ( nGTs < 0 ) 
        {
            if ( !gt_absent_warned )
            {
                fprintf(stderr,"GT is not present at %s:%d. (This warning is printed only once.)\n", bcf_seqname(ahdr,arec), arec->pos+1);
                gt_absent_warned = 1;
            }
            continue;
        }
        if ( nGTs != 2*nsmpl ) continue;    // not diploid
        nGTs = bcf_get_genotypes(bhdr, brec, &args->GTb, &args->mGTb);
        if ( nGTs < 0 )
        {
            if ( !gt_absent_warned )
            {
                fprintf(stderr,"GT is not present at %s:%d. (This warning is printed only once.)\n", bcf_seqname(bhdr,brec), brec->pos+1);
                gt_absent_warned = 1;
            }
            continue;
        }
        if ( nGTs != 2*nsmpl ) continue;    // not diploid

        for (j=0; j<nsmpl; j++)
        {
            int *gta = &args->GTa[j*2];
            int *gtb = &args->GTb[j*2];
            // Use only complete, phased, heterozygous genotypes present in both records
            if ( gta[1]==bcf_int32_vector_end || gtb[1]==bcf_int32_vector_end ) continue;
            if ( bcf_gt_is_missing(gta[0]) || bcf_gt_is_missing(gta[1]) || bcf_gt_is_missing(gtb[0]) || bcf_gt_is_missing(gtb[1]) ) continue;
            if ( !bcf_gt_is_phased(gta[1]) || !bcf_gt_is_phased(gtb[1]) ) continue;
            if ( bcf_gt_allele(gta[0])==bcf_gt_allele(gta[1]) || bcf_gt_allele(gtb[0])==bcf_gt_allele(gtb[1]) ) continue;
            // Same allele order in both records
            if ( bcf_gt_allele(gta[0])==bcf_gt_allele(gtb[0]) && bcf_gt_allele(gta[1])==bcf_gt_allele(gtb[1]) )
            {
                if ( args->swap_phase[j] ) args->nmism[j]++; else args->nmatch[j]++;
            }
            // Opposite allele order
            if ( bcf_gt_allele(gta[0])==bcf_gt_allele(gtb[1]) && bcf_gt_allele(gta[1])==bcf_gt_allele(gtb[0]) )
            {
                if ( args->swap_phase[j] ) args->nmatch[j]++; else args->nmism[j]++;
            }
        }
    }
    // Step 2: write the "a" record of the leading pairs. The value of i on
    // exit is where the loop of step 4 picks up.
    for (i=0; i<args->nbuf/2; i+=2)
    {
        bcf1_t *arec = args->buf[i];
        bcf_translate(args->out_hdr, args->files->readers[0].header, arec);
        if ( args->nswap )
            phase_update(args, args->out_hdr, arec);
        if ( !args->compact_PS || args->phase_set_changed )
        {
            bcf_update_format_int32(args->out_hdr,arec,"PS",args->phase_set,nsmpl);
            args->phase_set_changed = 0;
        }
        bcf_write(args->out_fh, args->out_hdr, arec);

        if ( arec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%d vs %d  [1]\n", bcf_seqname(args->files->readers[0].header,arec),arec->pos+1,args->prev_pos_check+1);
        args->prev_pos_check = arec->pos;
    }
    // Step 3: decide per sample whether the phase must be swapped from now on
    // and derive a quality from the match/mismatch counts
    args->nswap = 0;
    for (j=0; j<nsmpl; j++)
    {
        if ( args->nmatch[j] >= args->nmism[j] )
            args->swap_phase[j] = 0;
        else
        {
            args->swap_phase[j] = 1;
            args->nswap++;
        }
        if ( args->nmatch[j] && args->nmism[j] )
        {
            // Entropy-inspired quality. The factor 0.7 shifts and scales to (0,1)
            double f = (double)args->nmatch[j]/(args->nmatch[j]+args->nmism[j]);
            args->phase_qual[j] = 99*(0.7 + f*log(f) + (1-f)*log(1-f))/0.7;
        }
        else
            args->phase_qual[j] = 99;   // the evidence is unanimous or there is none
        args->nmatch[j] = 0;
        args->nmism[j]  = 0;
    }
    // Step 4: write the "b" record of the remaining pairs; i continues from step 2
    int PQ_printed = 0;
    for (; i<args->nbuf; i+=2)
    {
        bcf1_t *brec = args->buf[i+1];
        bcf_translate(args->out_hdr, args->files->readers[1].header, brec);
        if ( !PQ_printed )
        {
            // PQ goes only on the first "b" record. Samples whose quality is
            // below min_PQ start a new phase set at this position.
            bcf_update_format_int32(args->out_hdr,brec,"PQ",args->phase_qual,nsmpl);
            PQ_printed = 1;
            for (j=0; j<nsmpl; j++)
                if ( args->phase_qual[j] < args->min_PQ ) 
                {
                    args->phase_set[j] = brec->pos+1;
                    args->phase_set_changed = 1;
                }
                else if ( args->compact_PS ) args->phase_set[j] = bcf_int32_missing;
        }
        if ( args->nswap )
            phase_update(args, args->out_hdr, brec);
        if ( !args->compact_PS || args->phase_set_changed )
        {
            bcf_update_format_int32(args->out_hdr,brec,"PS",args->phase_set,nsmpl);
            args->phase_set_changed = 0;
        }
        bcf_write(args->out_fh, args->out_hdr, brec);

        if ( brec->pos < args->prev_pos_check ) error("FIXME, disorder: %s:%d vs %d  [2]\n", bcf_seqname(args->files->readers[1].header,brec),brec->pos+1,args->prev_pos_check+1);
        args->prev_pos_check = brec->pos;
    }
    args->nbuf = 0;
}
Exemple #9
0
/*
 * apply_variant() - apply one VCF record to the buffered fasta sequence.
 *
 * The record is skipped when it has no ALT allele, when it starts at or
 * before args->fa_frz_pos (overlap with an already applied variant), when it
 * overlaps args->mask, or when the selected sample carries a missing or
 * reference genotype. Otherwise the chosen allele (ialt) replaces the REF
 * bases in args->fa_buf, and the buffer length, the coordinate offset
 * args->fa_mod_off and args->fa_frz_pos are updated. With args->chain set,
 * length-changing variants are also recorded via push_chain_gap().
 *
 * Calls error() on malformed input: haplotype index too big, allele index
 * beyond the record's alleles, position outside the buffer, unsupported
 * symbolic allele, or REF not matching the fasta sequence.
 */
static void apply_variant(args_t *args, bcf1_t *rec)
{
    if ( rec->n_allele==1 ) return;

    if ( rec->pos <= args->fa_frz_pos )
    {
        fprintf(pysamerr,"The site %s:%d overlaps with another variant, skipping...\n", bcf_seqname(args->hdr,rec),rec->pos+1);
        return;
    }
    if ( args->mask )
    {
        char *chr = (char*)bcf_hdr_id2name(args->hdr,args->rid);
        int start = rec->pos;
        int end   = rec->pos + rec->rlen - 1;
        if ( regidx_overlap(args->mask, chr,start,end,NULL) ) return;
    }

    // Choose the allele to apply. Without a sample the first ALT is used.
    // NOTE(review): the GT pointer arithmetic below adds the allele index as
    // a byte offset; presumably this relies on one-byte GT values -- confirm.
    int i, ialt = 1;
    if ( args->isample >= 0 )
    {
        bcf_fmt_t *fmt = bcf_get_fmt(args->hdr, rec, "GT");
        if ( !fmt ) return;
        if ( args->haplotype )
        {
            // Take the allele of the requested (1-based) haplotype
            if ( args->haplotype > fmt->n ) error("Can't apply %d-th haplotype at %s:%d\n", args->haplotype,bcf_seqname(args->hdr,rec),rec->pos+1);
            uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample + args->haplotype - 1;
            ialt = bcf_dec_int1(ptr, fmt->type, &ignore);
            if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
            ialt = bcf_gt_allele(ialt);
        }
        else if ( args->output_iupac ) 
        {
            // Look at the first two alleles; the second falls back to the
            // first when it is missing or the genotype is haploid
            uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample;
            ialt = bcf_dec_int1(ptr, fmt->type, &ignore);
            if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
            ialt = bcf_gt_allele(ialt);

            int jalt;
            if ( fmt->n>1 )
            {
                ptr = fmt->p + fmt->size*args->isample + 1;
                jalt = bcf_dec_int1(ptr, fmt->type, &ignore);
                if ( bcf_gt_is_missing(jalt) || jalt==bcf_int32_vector_end ) jalt = ialt;
                else jalt = bcf_gt_allele(jalt);
            }
            else jalt = ialt;
            if ( rec->n_allele <= ialt || rec->n_allele <= jalt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
            if ( ialt!=jalt && !rec->d.allele[ialt][1] && !rec->d.allele[jalt][1] ) // is this a het snp?
            {
                // The record's allele string is overwritten with the value returned by gt2iupac()
                char ial = rec->d.allele[ialt][0];
                char jal = rec->d.allele[jalt][0];
                rec->d.allele[ialt][0] = gt2iupac(ial,jal);
            }
        }
        else
        {
            // Take the first non-reference allele of the genotype
            for (i=0; i<fmt->n; i++)
            {
                uint8_t *ignore, *ptr = fmt->p + fmt->size*args->isample + i;
                ialt = bcf_dec_int1(ptr, fmt->type, &ignore);
                if ( bcf_gt_is_missing(ialt) || ialt==bcf_int32_vector_end ) return;
                ialt = bcf_gt_allele(ialt);
                if ( ialt ) break;
            }
        }
        if ( !ialt ) return;  // ref allele
        if ( rec->n_allele <= ialt ) error("Broken VCF, too few alts at %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);
    }
    else if ( args->output_iupac && !rec->d.allele[0][1] && !rec->d.allele[1][1] )
    {
        // No sample selected: single-base REF and first ALT are combined via gt2iupac()
        char ial = rec->d.allele[0][0];
        char jal = rec->d.allele[1][0];
        rec->d.allele[1][0] = gt2iupac(ial,jal);
    }

    // Index of the variant in the buffer, corrected for the length changes applied so far
    int idx = rec->pos - args->fa_ori_pos + args->fa_mod_off;
    if ( idx<0 || idx>=args->fa_buf.l ) 
        error("FIXME: %s:%d .. idx=%d, ori_pos=%d, len=%d, off=%d\n",bcf_seqname(args->hdr,rec),rec->pos+1,idx,args->fa_ori_pos,args->fa_buf.l,args->fa_mod_off);

    // sanity check the reference base
    // len_diff: change of sequence length; alen: number of bases written
    int len_diff = 0, alen = 0;
    if ( rec->d.allele[ialt][0]=='<' )
    {
        if ( strcasecmp(rec->d.allele[ialt], "<DEL>") )
            error("Symbolic alleles other than <DEL> are currently not supported: %s at %s:%d\n",rec->d.allele[ialt],bcf_seqname(args->hdr,rec),rec->pos+1);
        assert( rec->d.allele[0][1]==0 );           // todo: for now expecting strlen(REF) = 1
        len_diff = 1-rec->rlen;
        rec->d.allele[ialt] = rec->d.allele[0];     // according to VCF spec, REF must precede the event
        alen = strlen(rec->d.allele[ialt]);
    }
    else if ( strncasecmp(rec->d.allele[0],args->fa_buf.s+idx,rec->rlen) )
    {
        // fprintf(pysamerr,"%d .. [%s], idx=%d ori=%d off=%d\n",args->fa_ori_pos,args->fa_buf.s,idx,args->fa_ori_pos,args->fa_mod_off);
        // Cut the buffer after the REF span so the message can show the
        // mismatching bases separately from the base that follows them
        char tmp = 0;
        if ( args->fa_buf.l - idx > rec->rlen ) 
        { 
            tmp = args->fa_buf.s[idx+rec->rlen];
            args->fa_buf.s[idx+rec->rlen] = 0;
        }
        error(
            "The fasta sequence does not match the REF allele at %s:%d:\n"
            "   .vcf: [%s]\n" 
            "   .vcf: [%s] <- (ALT)\n" 
            "   .fa:  [%s]%c%s\n",
            bcf_seqname(args->hdr,rec),rec->pos+1, rec->d.allele[0], rec->d.allele[ialt], args->fa_buf.s+idx, 
            tmp?tmp:' ',tmp?args->fa_buf.s+idx+rec->rlen+1:""
            );
    }
    else
    {
        alen = strlen(rec->d.allele[ialt]);
        len_diff = alen - rec->rlen;
    }

    // Convert the allele to upper case when args->fa_case is set, to lower case otherwise
    if ( args->fa_case )
        for (i=0; i<alen; i++) rec->d.allele[ialt][i] = toupper(rec->d.allele[ialt][i]);
    else
        for (i=0; i<alen; i++) rec->d.allele[ialt][i] = tolower(rec->d.allele[ialt][i]);

    if ( len_diff <= 0 )
    {
        // deletion or same size event
        for (i=0; i<alen; i++)
            args->fa_buf.s[idx+i] = rec->d.allele[ialt][i];
        // for a deletion, close the gap by shifting the tail to the left
        if ( len_diff )
            memmove(args->fa_buf.s+idx+alen,args->fa_buf.s+idx+rec->rlen,args->fa_buf.l-idx-rec->rlen);
    }
    else
    {
        // insertion
        // grow the buffer and shift the tail to the right before writing the allele
        ks_resize(&args->fa_buf, args->fa_buf.l + len_diff);
        memmove(args->fa_buf.s + idx + rec->rlen + len_diff, args->fa_buf.s + idx + rec->rlen, args->fa_buf.l - idx - rec->rlen);
        for (i=0; i<alen; i++)
            args->fa_buf.s[idx+i] = rec->d.allele[ialt][i];
    }
    if (args->chain && len_diff != 0)
    {
        // If first nucleotide of both REF and ALT are the same... (indels typically include the nucleotide before the variant)
        if ( strncasecmp(rec->d.allele[0],rec->d.allele[ialt],1) == 0)
        {
            // ...extend the block by 1 bp: start is 1 bp further and alleles are 1 bp shorter
            push_chain_gap(args->chain, rec->pos + 1, rec->rlen - 1, rec->pos + 1 + args->fa_mod_off, alen - 1);
        }
        else
        {
            // otherwise, just the coordinates of the variant as given
            push_chain_gap(args->chain, rec->pos, rec->rlen, rec->pos + args->fa_mod_off, alen);
        }
    }
    // Account for the length change and remember the last reference position touched
    args->fa_buf.l += len_diff;
    args->fa_mod_off += len_diff;
    args->fa_frz_pos  = rec->pos + rec->rlen - 1;
}
Exemple #10
0
/*
 * check_gt() - score the concordance of one query sample with every sample
 * of the -g (target) file and print a table sorted by score.
 *
 * Reader 0 of args->files is the query file (header args->sm_hdr), reader 1
 * the target file (header args->gt_hdr). Only sites present in both files
 * with diploid target genotypes are used. The query sample's PLs are read
 * from the PL tag or, when PL is unavailable or args->no_PLs is set, made up
 * from GT by fake_PLs(). For each target sample i, args->lks[i] accumulates
 * the score of the target genotype under the query PLs and args->sites[i]
 * the number of sites compared.
 *
 * Output goes to "<args->plot>.tab" when args->plot is set (followed by
 * plot_check()), otherwise to stdout. With args->all_sites, an "SC" line is
 * printed per site in addition to the final "CN" lines.
 *
 * args->pl_arr and args->tmp_arr are released here and reset to NULL.
 * Calls error() when required tags or samples are missing.
 */
static void check_gt(args_t *args)
{
    int i,ret, *gt2ipl = NULL, m_gt2ipl = 0, *gt_arr = NULL, ngt_arr = 0;
    int fake_pls = args->no_PLs;

    // Initialize things: check which tags are defined in the header, sample names etc.
    if ( bcf_hdr_id2int(args->gt_hdr, BCF_DT_ID, "GT")<0 ) error("[E::%s] GT not present in the header of %s?\n", __func__, args->files->readers[1].fname);
    if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "PL")<0 )
    {
        if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 )
            error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname);
        if ( !args->no_PLs )
            fprintf(stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname);
        fake_pls = 1;
    }

    FILE *fp = args->plot ? open_file(NULL, "w", "%s.tab", args->plot) : stdout;
    print_header(args, fp);

    int tgt_isample = -1, query_isample = 0;
    if ( args->target_sample )
    {
        tgt_isample = bcf_hdr_id2int(args->gt_hdr, BCF_DT_SAMPLE, args->target_sample);
        if ( tgt_isample<0 ) error("No such sample in %s: [%s]\n", args->files->readers[1].fname, args->target_sample);
    }
    if ( args->all_sites )
    {
        if ( tgt_isample==-1 )
        {
            fprintf(stderr,"No target sample selected for comparison, using the first sample in %s: %s\n", args->gt_fname,args->gt_hdr->samples[0]);
            tgt_isample = 0;
        }
    }
    if ( args->query_sample )
    {
        query_isample = bcf_hdr_id2int(args->sm_hdr, BCF_DT_SAMPLE, args->query_sample);
        if ( query_isample<0 ) error("No such sample in %s: [%s]\n", args->files->readers[0].fname, args->query_sample);
    }
    if ( args->all_sites )
        fprintf(fp, "# [1]SC, Site by Site Comparison\t[2]Chromosome\t[3]Position\t[4]-g alleles\t[5]-g GT (%s)\t[6]match log LK\t[7]Query alleles\t[8-]Query PLs (%s)\n",
                args->gt_hdr->samples[tgt_isample],args->sm_hdr->samples[query_isample]);

    // Main loop
    float prev_lk = 0;
    while ( (ret=bcf_sr_next_line(args->files)) )
    {
        if ( ret!=2 ) continue;     // use only sites present in both files
        bcf1_t *sm_line = args->files->readers[0].buffer[0];    // the query file
        bcf1_t *gt_line = args->files->readers[1].buffer[0];    // the -g target file
        bcf_unpack(sm_line, BCF_UN_FMT);
        bcf_unpack(gt_line, BCF_UN_FMT);

        // Init mapping from target genotype index to the sample's PL fields
        int n_gt2ipl = gt_line->n_allele*(gt_line->n_allele + 1)/2;
        if ( n_gt2ipl > m_gt2ipl )
        {
            m_gt2ipl = n_gt2ipl;
            gt2ipl   = (int*) realloc(gt2ipl, sizeof(int)*m_gt2ipl);
        }
        if ( !init_gt2ipl(args, gt_line, sm_line, gt2ipl, n_gt2ipl) ) continue;

        // Target genotypes
        int ngt, npl;
        if ( (ngt=bcf_get_genotypes(args->gt_hdr, gt_line, &gt_arr, &ngt_arr)) <= 0 )
            error("GT not present at %s:%d?\n", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, gt_line->pos+1);
        ngt /= bcf_hdr_nsamples(args->gt_hdr);
        if ( ngt!=2 ) continue; // checking only diploid genotypes

        // Sample PLs
        if ( !fake_pls )
        {
            if ( (npl=bcf_get_format_int32(args->sm_hdr, sm_line, "PL", &args->pl_arr, &args->npl_arr)) <= 0 )
                error("PL not present at %s:%d?\n", args->sm_hdr->id[BCF_DT_CTG][sm_line->rid].key, sm_line->pos+1);
            npl /= bcf_hdr_nsamples(args->sm_hdr);
        }
        else
            npl = fake_PLs(args, args->sm_hdr, sm_line);

        // Calculate likelihoods for all samples, assuming diploid genotypes

        // For faster access to genotype likelihoods (PLs) of the query sample.
        // On exit from the loop max_ipl is the number of PL slots in use.
        int max_ipl, *pl_ptr = args->pl_arr + query_isample*npl;
        double sum_pl = 0; // for converting PLs to probs
        for (max_ipl=0; max_ipl<npl; max_ipl++)
        {
            if ( pl_ptr[max_ipl]==bcf_int32_vector_end ) break;
            if ( pl_ptr[max_ipl]==bcf_int32_missing ) continue;
            sum_pl += pow(10, -0.1*pl_ptr[max_ipl]);
        }
        if ( sum_pl==0 ) continue; // no PLs present
        // a negative sum_pl selects plain -PL scoring below
        if ( fake_pls && args->no_PLs==1 ) sum_pl = -1;

        // The main stats: concordance of the query sample with the target -g samples
        for (i=0; i<bcf_hdr_nsamples(args->gt_hdr); i++)
        {
            int *gt_ptr = gt_arr + i*ngt;
            if ( gt_ptr[1]==bcf_int32_vector_end ) continue;    // skip haploid genotypes
            if ( bcf_gt_is_missing(gt_ptr[0]) || bcf_gt_is_missing(gt_ptr[1]) ) continue;
            int a = bcf_gt_allele(gt_ptr[0]);
            int b = bcf_gt_allele(gt_ptr[1]);
            if ( args->hom_only && a!=b ) continue; // heterozygous genotype
            int igt_tgt = bcf_alleles2gt(a,b); // genotype index in the target file
            int igt_qry = gt2ipl[igt_tgt];  // corresponding genotype in query file
            if ( igt_qry>=max_ipl || pl_ptr[igt_qry]<0 ) continue;   // genotype not present in query sample: haploid or missing
            args->lks[i] += sum_pl<0 ? -pl_ptr[igt_qry] : log(pow(10, -0.1*pl_ptr[igt_qry])/sum_pl);
            args->sites[i]++;
        }
        if ( args->all_sites )
        {
            // Print LKs at all sites for debugging
            int *gt_ptr = gt_arr + tgt_isample*ngt;
            if ( gt_ptr[1]==bcf_int32_vector_end ) continue;    // skip haploid genotypes
            int a = bcf_gt_allele(gt_ptr[0]);
            int b = bcf_gt_allele(gt_ptr[1]);
            if ( args->hom_only && a!=b ) continue; // heterozygous genotype
            fprintf(fp, "SC\t%s\t%d", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, gt_line->pos+1);
            for (i=0; i<gt_line->n_allele; i++) fprintf(fp, "%c%s", i==0?'\t':',', gt_line->d.allele[i]);
            fprintf(fp, "\t%s/%s", a>=0 ? gt_line->d.allele[a] : ".", b>=0 ? gt_line->d.allele[b] : ".");
            fprintf(fp, "\t%f", args->lks[query_isample]-prev_lk);
            prev_lk = args->lks[query_isample];

            // Query alleles followed by the PLs of the query sample
            int igt;
            for (i=0; i<sm_line->n_allele; i++) fprintf(fp, "%c%s", i==0?'\t':',', sm_line->d.allele[i]);
            for (igt=0; igt<npl; igt++)
            {
                if ( pl_ptr[igt]==bcf_int32_vector_end ) break;
                // a missing value gets its own tab-separated column like any other PL
                if ( pl_ptr[igt]==bcf_int32_missing ) fprintf(fp, "\t.");
                else fprintf(fp, "\t%d", pl_ptr[igt]);
            }
            fprintf(fp, "\n");
        }
    }
    free(gt2ipl);
    free(gt_arr);
    // reset the shared buffers so that no stale pointer or size stays behind in args
    free(args->pl_arr);
    args->pl_arr  = NULL;
    args->npl_arr = 0;
    free(args->tmp_arr);
    args->tmp_arr  = NULL;
    args->ntmp_arr = 0;

    // To be able to plot total discordance (=number of mismatching GTs with -G1) in the same
    // plot as discordance per site, the latter must be scaled to the same range
    int nsamples = bcf_hdr_nsamples(args->gt_hdr);
    double extreme_lk = 0, extreme_lk_per_site = 0;
    for (i=0; i<nsamples; i++)
    {
        if ( args->lks[i] < extreme_lk ) extreme_lk = args->lks[i];
        if ( args->sites[i] && args->lks[i]/args->sites[i] < extreme_lk_per_site ) extreme_lk_per_site = args->lks[i]/args->sites[i];
    }

    // Sorted output: sort pointers into args->lks so that the sample index
    // can be recovered from the pointer difference
    double **p = (double**) malloc(sizeof(double*)*nsamples);
    for (i=0; i<nsamples; i++) p[i] = &args->lks[i];
    qsort(p, nsamples, sizeof(*p), cmp_doubleptr);

    fprintf(fp, "# [1]CN\t[2]Discordance with %s (total)\t[3]Discordance (score per site)\t[4]Number of sites compared\t[5]Sample\t[6]Sample ID\n", args->sm_hdr->samples[query_isample]);
    for (i=0; i<nsamples; i++)
    {
        int idx = p[i] - args->lks;
        double per_site = 0;
        if ( args->sites[idx] && extreme_lk_per_site )
        {
            per_site = args->lks[idx]/args->sites[idx];
            per_site *= extreme_lk / extreme_lk_per_site;
        }
        fprintf(fp, "CN\t%e\t%e\t%.0f\t%s\t%d\n", fabs(args->lks[idx]), fabs(per_site), args->sites[idx], args->gt_hdr->samples[idx], i);
    }
    free(p);

    if ( args->plot )
    {
        fclose(fp);
        plot_check(args, args->target_sample ? args->target_sample : "", args->sm_hdr->samples[query_isample]);
    }
}
Exemple #11
0
/*
 *  bcf_remove_alleles() - drop selected alleles from a record and trim all
 *  per-allele annotations accordingly.
 *
 *  @header:   header describing the INFO/FORMAT tags of @line
 *  @line:     record to modify in place
 *  @rm_mask:  bit mask of alleles to remove; bit i set means allele i is
 *             dropped. The ALT loop starts at index 1, so the reference
 *             allele is never removed from the allele list.
 *
 *  Steps performed:
 *    1. rebuild the comma-separated allele string and the old->new index map
 *    2. trim Number=A, Number=R and Number=G INFO fields
 *    3. renumber GT allele indexes if the numbering changed
 *    4. trim Number=A, Number=R and Number=G FORMAT fields (haploid or
 *       diploid genotypes are assumed for Number=G)
 *
 *  Returns nothing. The record is left untouched when no allele is selected
 *  for removal. On failure to read or update a tag the function prints a
 *  message to stderr and calls exit(1).
 *
 *  NOTE(review): rm_mask is an int and is tested with 1<<i, so only allele
 *  indexes that fit in an int shift can be addressed - confirm callers never
 *  pass records with more alleles than that.
 */
void bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int rm_mask)
{
    // map[old allele index] = new allele index; calloc leaves removed alleles (and REF) at 0
    int *map = (int*) calloc(line->n_allele, sizeof(int));

    // create map of indexes from old to new ALT numbering and modify ALT
    kstring_t str = {0,0,0};
    kputs(line->d.allele[0], &str);

    int nrm = 0, i,j;  // i: ori alleles, j: new alleles
    for (i=1, j=1; i<line->n_allele; i++)
    {
        if ( rm_mask & 1<<i )
        {
            // remove this allele
            line->d.allele[i] = NULL;
            nrm++;
            continue;
        }
        kputc(',', &str);
        kputs(line->d.allele[i], &str);
        map[i] = j;
        j++;
    }
    if ( !nrm ) { free(map); free(str.s); return; }

    // Expected vector lengths before (ori) and after (new) the removal:
    // R = all alleles, A = ALT alleles only, G = diploid genotypes
    int nR_ori = line->n_allele;
    int nR_new = line->n_allele-nrm;
    assert(nR_new > 0); // should not be able to remove reference allele
    int nA_ori = nR_ori-1;
    int nA_new = nR_new-1;

    int nG_ori = nR_ori*(nR_ori + 1)/2;
    int nG_new = nR_new*(nR_new + 1)/2;

    bcf_update_alleles_str(header, line, str.s);

    // remove from Number=G, Number=R and Number=A INFO fields.
    // dat is a scratch buffer shared by all tags; its capacity is tracked in
    // bytes (mdat_bytes) and converted to elements (mdat) for each tag type.
    uint8_t *dat = NULL;
    int mdat = 0, ndat = 0, mdat_bytes = 0, nret;
    for (i=0; i<line->n_info; i++)
    {
        bcf_info_t *info = &line->d.info[i];
        int vlen = bcf_hdr_id2length(header,BCF_HL_INFO,info->key);

        if ( vlen!=BCF_VL_A && vlen!=BCF_VL_G && vlen!=BCF_VL_R ) continue; // no need to change

        int type = bcf_hdr_id2type(header,BCF_HL_INFO,info->key);
        if ( type==BCF_HT_FLAG ) continue;
        int size = 1;
        if ( type==BCF_HT_REAL || type==BCF_HT_INT ) size = 4;

        mdat = mdat_bytes / size;
        nret = bcf_get_info_values(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void**)&dat, &mdat, type);
        mdat_bytes = mdat * size;
        if ( nret<0 )
        {
            fprintf(stderr,"[%s:%d %s] Could not access INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
                bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret);
            exit(1);
        }
        if ( type==BCF_HT_STR )
        {
            // String tags: walk the comma-separated fields (ss..se delimits the
            // current field) and copy the kept ones into str
            str.l = 0;
            char *ss = (char*) dat, *se = (char*) dat;
            if ( vlen==BCF_VL_A || vlen==BCF_VL_R )
            {
                // inc shifts the field index to the allele index: Number=A
                // fields start at allele 1, Number=R fields at allele 0
                int nexp, inc = 0;
                if ( vlen==BCF_VL_A )
                {
                    nexp = nA_ori;
                    inc  = 1;
                }
                else
                    nexp = nR_ori;
                for (j=0; j<nexp; j++)
                {
                    if ( !*se ) break;
                    while ( *se && *se!=',' ) se++;
                    if ( rm_mask & 1<<(j+inc) )
                    {
                        if ( *se ) se++;
                        ss = se;
                        continue;
                    }
                    if ( str.l ) kputc(',',&str);
                    kputsn(ss,se-ss,&str);
                    if ( *se ) se++;
                    ss = se;
                }
                assert( j==nexp );
            }
            else    // Number=G, assuming diploid genotype
            {
                // Fields are visited as genotype pairs (k,j) with k<=j; a field
                // is dropped when either of its alleles is removed
                int k = 0, n = 0;
                for (j=0; j<nR_ori; j++)
                {
                    for (k=0; k<=j; k++)
                    {
                        if ( !*se ) break;
                        while ( *se && *se!=',' ) se++;
                        n++;
                        if ( rm_mask & 1<<j || rm_mask & 1<<k )
                        {
                            if ( *se ) se++;
                            ss = se;
                            continue;
                        }
                        if ( str.l ) kputc(',',&str);
                        kputsn(ss,se-ss,&str);
                        if ( *se ) se++;
                        ss = se;
                    }
                    if ( !*se ) break;
                }
                // n counts the fields actually seen; it must match the expected Number=G length
                assert( n==nG_ori );
            }

            nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)str.s, str.l, type);
            if ( nret<0 )
            {
                fprintf(stderr,"[%s:%d %s] Could not update INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
                        bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret);
                exit(1);
            }
            continue;
        }

        if ( vlen==BCF_VL_A || vlen==BCF_VL_R )
        {
            int inc = 0, ntop;
            if ( vlen==BCF_VL_A )
            {
                assert( nret==nA_ori );
                ntop = nA_ori;
                ndat = nA_new;
                inc  = 1;
            }
            else
            {
                assert( nret==nR_ori );
                ntop = nR_ori;
                ndat = nR_new;
            }
            int k = 0;

            // Compact the kept values in place; a vector-end marker is copied
            // to the output position and terminates the scan
            #define BRANCH(type_t,is_vector_end) \
            { \
                type_t *ptr = (type_t*) dat; \
                int size = sizeof(type_t); \
                for (j=0; j<ntop; j++) /* j:ori, k:new */ \
                { \
                    if ( is_vector_end ) { memcpy(dat+k*size, dat+j*size, size); break; } \
                    if ( rm_mask & 1<<(j+inc) ) continue; \
                    if ( j!=k ) memcpy(dat+k*size, dat+j*size, size); \
                    k++; \
                } \
            }
            switch (type)
            {
                case BCF_HT_INT:  BRANCH(int32_t,ptr[j]==bcf_int32_vector_end); break;
                case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr[j])); break;
            }
            #undef BRANCH
        }
        else    // Number=G
        {
            assert( nret==nG_ori );
            int k, l_ori = -1, l_new = 0;
            ndat = nG_new;

            // l_ori/l_new: running index of the genotype pair (k,j) in the
            // original and the compacted vector
            #define BRANCH(type_t,is_vector_end) \
            { \
                type_t *ptr = (type_t*) dat; \
                int size = sizeof(type_t); \
                for (j=0; j<nR_ori; j++) \
                { \
                    for (k=0; k<=j; k++) \
                    { \
                        l_ori++; \
                        if ( is_vector_end ) { memcpy(dat+l_new*size, dat+l_ori*size, size); break; } \
                        if ( rm_mask & 1<<j || rm_mask & 1<<k ) continue; \
                        if ( l_ori!=l_new ) memcpy(dat+l_new*size, dat+l_ori*size, size); \
                        l_new++; \
                    } \
                } \
            }
            switch (type)
            {
                case BCF_HT_INT:  BRANCH(int32_t,ptr[l_ori]==bcf_int32_vector_end); break;
                case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr[l_ori])); break;
            }
            #undef BRANCH
        }

        nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)dat, ndat, type);
        if ( nret<0 )
        {
            fprintf(stderr,"[%s:%d %s] Could not update INFO/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
                    bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret);
            exit(1);
        }
    }

    // Update GT fields, the allele indexes might have changed
    for (i=1; i<line->n_allele; i++) if ( map[i]!=i ) break;
    if ( i<line->n_allele )
    {
        mdat = mdat_bytes / 4;  // sizeof(int32_t)
        nret = bcf_get_genotypes(header,line,(void**)&dat,&mdat);
        mdat_bytes = mdat * 4;
        if ( nret>0 )
        {
            nret /= line->n_sample;     // number of GT values per sample
            int32_t *ptr = (int32_t*) dat;
            for (i=0; i<line->n_sample; i++)
            {
                for (j=0; j<nret; j++)
                {
                    if ( bcf_gt_is_missing(ptr[j]) ) continue;
                    if ( ptr[j]==bcf_int32_vector_end ) break;
                    int al = bcf_gt_allele(ptr[j]);
                    assert( al<nR_ori && map[al]>=0 );
                    // re-encode with the new allele index, keeping the lowest bit of the old value
                    ptr[j] = (map[al]+1)<<1 | (ptr[j]&1);
                }
                ptr += nret;
            }
            bcf_update_genotypes(header, line, (void*)dat, nret*line->n_sample);
        }
    }

    // Remove from Number=G, Number=R and Number=A FORMAT fields.
    // Assuming haploid or diploid GTs
    for (i=0; i<line->n_fmt; i++)
    {
        bcf_fmt_t *fmt = &line->d.fmt[i];
        int vlen = bcf_hdr_id2length(header,BCF_HL_FMT,fmt->id);

        if ( vlen!=BCF_VL_A && vlen!=BCF_VL_G && vlen!=BCF_VL_R ) continue; // no need to change

        int type = bcf_hdr_id2type(header,BCF_HL_FMT,fmt->id);
        if ( type==BCF_HT_FLAG ) continue;

        int size = 1;
        if ( type==BCF_HT_REAL || type==BCF_HT_INT ) size = 4;

        mdat = mdat_bytes / size;
        nret = bcf_get_format_values(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void**)&dat, &mdat, type);
        mdat_bytes = mdat * size;
        if ( nret<0 )
        {
            fprintf(stderr,"[%s:%d %s] Could not access FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
                    bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret);
            exit(1);
        }

        if ( type==BCF_HT_STR )
        {
            // Each sample occupies a fixed-width slot of `size` bytes. The
            // trimmed output is written into str using the same slot width,
            // padding every sample with zero bytes.
            int size = nret/line->n_sample;     // number of bytes per sample
            str.l = 0;
            if ( vlen==BCF_VL_A || vlen==BCF_VL_R )
            {
                int nexp, inc = 0;
                if ( vlen==BCF_VL_A )
                {
                    nexp = nA_ori;
                    inc  = 1;
                }
                else
                    nexp = nR_ori;
                for (j=0; j<line->n_sample; j++)
                {
                    char *ss = ((char*)dat) + j*size, *se = ss + size, *ptr = ss;
                    int k_src = 0, k_dst = 0, l = str.l;
                    for (k_src=0; k_src<nexp; k_src++)
                    {
                        if ( ptr>=se || !*ptr) break;
                        while ( ptr<se && *ptr && *ptr!=',' ) ptr++;
                        if ( rm_mask & 1<<(k_src+inc) )
                        {
                            ss = ++ptr;
                            continue;
                        }
                        if ( k_dst ) kputc(',',&str);
                        kputsn(ss,ptr-ss,&str);
                        ss = ++ptr;
                        k_dst++;
                    }
                    assert( k_src==nexp );
                    l = str.l - l;
                    for (; l<size; l++) kputc(0, &str);
                }
            }
            else    // Number=G, diploid or haploid
            {
                for (j=0; j<line->n_sample; j++)
                {
                    char *ss = ((char*)dat) + j*size, *se = ss + size, *ptr = ss;
                    int k_src = 0, k_dst = 0, l = str.l;
                    int nexp = 0; // diploid or haploid?
                    // count the comma-separated fields of this sample
                    while ( ptr<se )
                    {
                        if ( !*ptr ) break;
                        if ( *ptr==',' ) nexp++;
                        ptr++;
                    }
                    if ( ptr!=ss ) nexp++;
                    assert( nexp==nG_ori || nexp==nR_ori );
                    ptr = ss;
                    if ( nexp==nG_ori ) // diploid
                    {
                        int ia, ib;
                        for (ia=0; ia<nR_ori; ia++)
                        {
                            for (ib=0; ib<=ia; ib++)
                            {
                                if ( ptr>=se || !*ptr ) break;
                                while ( ptr<se && *ptr && *ptr!=',' ) ptr++;
                                if ( rm_mask & 1<<ia || rm_mask & 1<<ib )
                                {
                                    ss = ++ptr;
                                    continue;
                                }
                                if ( k_dst ) kputc(',',&str);
                                kputsn(ss,ptr-ss,&str);
                                ss = ++ptr;
                                k_dst++;
                            }
                            if ( ptr>=se || !*ptr ) break;
                        }
                    }
                    else    // haploid
                    {
                        for (k_src=0; k_src<nR_ori; k_src++)
                        {
                            if ( ptr>=se || !*ptr ) break;
                            while ( ptr<se && *ptr && *ptr!=',' ) ptr++;
                            if ( rm_mask & 1<<k_src )
                            {
                                ss = ++ptr;
                                continue;
                            }
                            if ( k_dst ) kputc(',',&str);
                            kputsn(ss,ptr-ss,&str);
                            ss = ++ptr;
                            k_dst++;
                        }
                        assert( k_src==nR_ori );
                    }
                    // Pad to the per-sample width for both ploidies so that
                    // every sample keeps a slot of exactly `size` bytes
                    l = str.l - l;
                    for (; l<size; l++) kputc(0, &str);
                }
            }
            nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)str.s, str.l, type);
            if ( nret<0 )
            {
                fprintf(stderr,"[%s:%d %s] Could not update FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
                        bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret);
                exit(1);
            }
            continue;
        }

        int nori = nret / line->n_sample;   // number of values per sample before trimming
        if ( vlen==BCF_VL_A || vlen==BCF_VL_R || (vlen==BCF_VL_G && nori==nR_ori) ) // Number=A, R or haploid Number=G
        {
            int inc = 0, nnew;
            if ( vlen==BCF_VL_A )
            {
                assert( nori==nA_ori );     // todo: will fail if all values are missing
                ndat = nA_new*line->n_sample;
                nnew = nA_new;
                inc  = 1;
            }
            else
            {
                assert( nori==nR_ori );     // todo: will fail if all values are missing
                ndat = nR_new*line->n_sample;
                nnew = nR_new;
            }

            // Compact each sample's values in place: sample j moves from
            // offset j*nori to j*nnew within the same buffer
            #define BRANCH(type_t,is_vector_end) \
            { \
                for (j=0; j<line->n_sample; j++) \
                { \
                    type_t *ptr_src = ((type_t*)dat) + j*nori; \
                    type_t *ptr_dst = ((type_t*)dat) + j*nnew; \
                    int size = sizeof(type_t); \
                    int k_src, k_dst = 0; \
                    for (k_src=0; k_src<nori; k_src++) \
                    { \
                        if ( is_vector_end ) { memcpy(ptr_dst+k_dst, ptr_src+k_src, size); break; } \
                        if ( rm_mask & 1<<(k_src+inc) ) continue; \
                        memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
                        k_dst++; \
                    } \
                } \
            }
            switch (type)
            {
                case BCF_HT_INT:  BRANCH(int32_t,ptr_src[k_src]==bcf_int32_vector_end); break;
                case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr_src[k_src])); break;
            }
            #undef BRANCH
        }
        else    // Number=G, diploid or mixture of haploid+diploid
        {
            assert( nori==nG_ori );
            ndat = nG_new*line->n_sample;

            // nset counts the values preceding the first vector-end marker;
            // exactly nR_ori values means the sample is treated as haploid
            #define BRANCH(type_t,is_vector_end) \
            { \
                for (j=0; j<line->n_sample; j++) \
                { \
                    type_t *ptr_src = ((type_t*)dat) + j*nori; \
                    type_t *ptr_dst = ((type_t*)dat) + j*nG_new; \
                    int size = sizeof(type_t); \
                    int ia, ib, k_dst = 0, k_src; \
                    int nset = 0;   /* haploid or diploid? */ \
                    for (k_src=0; k_src<nG_ori; k_src++) { if ( is_vector_end ) break; nset++; } \
                    if ( nset==nR_ori ) /* haploid */ \
                    { \
                        for (k_src=0; k_src<nR_ori; k_src++) \
                        { \
                            if ( rm_mask & 1<<k_src ) continue; \
                            memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
                            k_dst++; \
                        } \
                        memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
                    } \
                    else /* diploid */ \
                    { \
                        k_src = -1; \
                        for (ia=0; ia<nR_ori; ia++) \
                        { \
                            for (ib=0; ib<=ia; ib++) \
                            { \
                                k_src++; \
                                if ( is_vector_end ) { memcpy(ptr_dst+k_dst, ptr_src+k_src, size); ia = nR_ori; break; }  \
                                if ( rm_mask & 1<<ia || rm_mask & 1<<ib ) continue; \
                                memcpy(ptr_dst+k_dst, ptr_src+k_src, size); \
                                k_dst++; \
                            } \
                        } \
                    } \
                } \
            }
            switch (type)
            {
                case BCF_HT_INT:  BRANCH(int32_t,ptr_src[k_src]==bcf_int32_vector_end); break;
                case BCF_HT_REAL: BRANCH(float,bcf_float_is_vector_end(ptr_src[k_src])); break;
            }
            #undef BRANCH
        }
        nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)dat, ndat, type);
        if ( nret<0 )
        {
            fprintf(stderr,"[%s:%d %s] Could not update FORMAT/%s at %s:%d [%d]\n", __FILE__,__LINE__,__FUNCTION__,
                    bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret);
            exit(1);
        }
    }
    free(dat);
    free(str.s);
    free(map);
}
Exemple #12
0
/*
 *  process() - check one record for Mendelian consistency in every trio.
 *
 *  For each trio the mother's, father's and child's alleles are turned into
 *  64-bit allele bitmasks (bit k set = allele k present) and the child's
 *  genotype is tested against the parents, either with the default diploid
 *  rule or with the region-specific rules overlapping the site.
 *
 *  @rec:  the record to check; its GT field is rewritten in MODE_DELETE when
 *         an inconsistent trio is found.
 *
 *  Returns, depending on args.mode:
 *    MODE_DELETE     the record (inconsistent trio genotypes set to missing)
 *    MODE_LIST_GOOD  the record if no trio was inconsistent, NULL otherwise
 *    MODE_LIST_BAD   the record if some trio was inconsistent, NULL otherwise
 *  Records that cannot be evaluated (more than 63 alleles, GT absent, ploidy
 *  other than 1 or 2 values per sample) return the record only in
 *  MODE_LIST_GOOD and NULL otherwise.
 */
bcf1_t *process(bcf1_t *rec)
{
    bcf1_t *dflt = args.mode&MODE_LIST_GOOD ? rec : NULL;
    args.nrec++;

    if ( rec->n_allele > 63 ) return dflt;      // we use 64bit bitmask below

    int ngt = bcf_get_genotypes(args.hdr, rec, &args.gt_arr, &args.ngt_arr);
    if ( ngt<0 ) return dflt;
    if ( ngt!=2*bcf_hdr_nsamples(args.hdr) && ngt!=bcf_hdr_nsamples(args.hdr) ) return dflt;
    ngt /= bcf_hdr_nsamples(args.hdr);          // GT values per sample: 1 or 2

    // non-zero when region rules overlap this site; the rules are then iterated below
    int itr_set = regidx_overlap(args.rules, bcf_seqname(args.hdr,rec),rec->pos,rec->pos, args.itr_ori);

    int i, has_bad = 0, needs_update = 0;
    for (i=0; i<args.ntrios; i++)
    {
        // a,b: mother's alleles; c,d: father's; e,f: child's. The second
        // allele is set to vector_end when only one value per sample is stored.
        int32_t a,b,c,d,e,f;
        trio_t *trio = &args.trios[i];

        a = args.gt_arr[ngt*trio->imother];
        b = ngt==2 ? args.gt_arr[ngt*trio->imother+1] : bcf_int32_vector_end;
        c = args.gt_arr[ngt*trio->ifather];
        d = ngt==2 ? args.gt_arr[ngt*trio->ifather+1] : bcf_int32_vector_end;
        e = args.gt_arr[ngt*trio->ichild];
        f = ngt==2 ? args.gt_arr[ngt*trio->ichild+1] : bcf_int32_vector_end;

        // skip sites with missing data in child
        if ( bcf_gt_is_missing(e) || bcf_gt_is_missing(f) ) continue;

        uint64_t mother = 0, father = 0,child1,child2;

        // All masks are built with a 64-bit one: up to 63 alleles are accepted
        // above, so shifting a plain int would overflow for allele indexes >= 31.
        int is_ok = 0;
        if ( !itr_set )
        {
            if ( f==bcf_int32_vector_end ) { warn_ploidy(rec); continue; }

            // All M,F,C genotypes are diploid. Missing data are considered consistent.
            child1 = (uint64_t)1<<bcf_gt_allele(e);
            child2 = (uint64_t)1<<bcf_gt_allele(f);
            mother  = bcf_gt_is_missing(a) ? child1|child2 : (uint64_t)1<<bcf_gt_allele(a);
            mother |= bcf_gt_is_missing(b) || b==bcf_int32_vector_end ? child1|child2 : (uint64_t)1<<bcf_gt_allele(b);
            father  = bcf_gt_is_missing(c) ? child1|child2 : (uint64_t)1<<bcf_gt_allele(c);
            father |= bcf_gt_is_missing(d) || d==bcf_int32_vector_end ? child1|child2 : (uint64_t)1<<bcf_gt_allele(d);

            if ( (mother&child1 && father&child2) || (mother&child2 && father&child1) ) is_ok = 1;
        }
        else
        {
            // Missing or absent alleles contribute no bits here; an empty
            // parental mask means "nothing known about this parent"
            child1  = (uint64_t)1<<bcf_gt_allele(e);
            child2  = bcf_gt_is_missing(f) || f==bcf_int32_vector_end ? 0 : (uint64_t)1<<bcf_gt_allele(f);
            mother |= bcf_gt_is_missing(a) ? 0 : (uint64_t)1<<bcf_gt_allele(a);
            mother |= bcf_gt_is_missing(b) || b==bcf_int32_vector_end ? 0 : (uint64_t)1<<bcf_gt_allele(b);
            father |= bcf_gt_is_missing(c) ? 0 : (uint64_t)1<<bcf_gt_allele(c);
            father |= bcf_gt_is_missing(d) || d==bcf_int32_vector_end ? 0 : (uint64_t)1<<bcf_gt_allele(d);

            regitr_copy(args.itr, args.itr_ori);
            while ( !is_ok && regitr_overlap(args.itr) )
            {
                rule_t *rule = &regitr_payload(args.itr,rule_t);
                if ( child1 && child2 )
                {
                    if ( !rule->mal || !rule->fal ) continue;   // wrong rule (haploid), but this is a diploid GT
                    if ( !mother ) mother = child1|child2;
                    if ( !father ) father = child1|child2;
                    if ( (mother&child1 && father&child2) || (mother&child2 && father&child1) ) is_ok = 1;
                    continue;
                }
                // haploid child: the single allele must be found in each
                // parent the rule refers to, unless that parent is unknown
                if ( rule->mal )
                {
                    if ( mother && !(child1&mother) ) continue;
                }
                if ( rule->fal )
                {
                    if ( father && !(child1&father) ) continue;
                }
                is_ok = 1;
            }
        }
        if ( is_ok )
        {
            trio->nok++;
        }
        else
        {
            trio->nbad++;
            has_bad = 1;
            if ( args.mode&MODE_DELETE )
            {
                // blank the genotypes of all three trio members
                args.gt_arr[ngt*trio->imother] = bcf_gt_missing;
                if ( b!=bcf_int32_vector_end ) args.gt_arr[ngt*trio->imother+1] = bcf_gt_missing; // should be always true
                args.gt_arr[ngt*trio->ifather] = bcf_gt_missing;
                if ( d!=bcf_int32_vector_end ) args.gt_arr[ngt*trio->ifather+1] = bcf_gt_missing;
                args.gt_arr[ngt*trio->ichild] = bcf_gt_missing;
                if ( f!=bcf_int32_vector_end ) args.gt_arr[ngt*trio->ichild+1]  = bcf_gt_missing;
                needs_update = 1;
            }
        }
    }

    if ( needs_update && bcf_update_genotypes(args.hdr,rec,args.gt_arr,ngt*bcf_hdr_nsamples(args.hdr)) )
        error("Could not update GT field at %s:%d\n", bcf_seqname(args.hdr,rec),rec->pos+1);

    if ( args.mode&MODE_DELETE ) return rec;
    if ( args.mode&MODE_LIST_GOOD ) return has_bad ? NULL : rec;
    if ( args.mode&MODE_LIST_BAD ) return has_bad ? rec : NULL;

    return NULL;
}
Exemple #13
0
/*
 *  parse_line() - extract the alternate allele frequency and the genotype
 *  likelihoods P(D|G) of the analysed sample from one record.
 *
 *  @args:      program state; args->itmp/nitmp are used as scratch space
 *  @line:      the record to parse
 *  @alt_freq:  output, alternate allele frequency. Taken from, in this order
 *              of precedence: the user-supplied INFO tag (args->af_tag), an
 *              external file (args->af_fname), INFO/AC and INFO/AN, or an
 *              estimate from the genotypes.
 *  @pdg:       output, three values for the genotypes RR, RA, AA. Either
 *              derived from the GT field (args->fake_PLs) or from FORMAT/PL,
 *              normalised to sum to one in the latter case.
 *
 *  Returns 0 on success and a negative value when the site should be skipped
 *  (frequency not available, AF=0 with no default, missing or non-diploid
 *  GT/PL data). Calls error() on a type mismatch of the user-supplied tag.
 */
int parse_line(args_t *args, bcf1_t *line, double *alt_freq, double *pdg)
{
    // nitmp==0 signals that genotypes have not been loaded into args->itmp yet
    args->nitmp = 0;

    // Set allele frequency
    int ret;
    if ( args->af_tag )
    {
        // Use an INFO tag provided by the user. When the tag holds several
        // values the first one is used, the same way AC is handled below.
        ret = bcf_get_info_float(args->hdr, line, args->af_tag, &args->AFs, &args->mAFs);
        if ( ret>0 )
            *alt_freq = args->AFs[0];
        if ( ret==-2 )
            error("Type mismatch for INFO/%s tag at %s:%d\n", args->af_tag, bcf_seqname(args->hdr,line), line->pos+1);
    }
    else if ( args->af_fname )
    {
        // Read AF from a file
        ret = read_AF(args->files->targets, line, alt_freq);
    }
    else
    {
        // Use GTs or AC/AN: GTs when AC/AN not present or when GTs explicitly requested by --estimate-AF
        ret = -1;
        if ( !args->estimate_AF )
        {
            int AC = -1, AN = 0;
            ret = bcf_get_info_int32(args->hdr, line, "AN", &args->itmp, &args->mitmp);
            if ( ret==1 )
            {
                AN = args->itmp[0];
                ret = bcf_get_info_int32(args->hdr, line, "AC", &args->itmp, &args->mitmp);
                if ( ret>0 )
                    AC = args->itmp[0];
            }
            if ( AN<=0 || AC<0 )
                ret = -1;
            else
                *alt_freq = (double) AC/AN;
        }
        if ( ret==-1 )
            ret = estimate_AF(args, line, alt_freq);    // reads GTs into args->itmp
    }

    if ( ret<0 ) return ret;
    if ( *alt_freq==0.0 )
    {
        if ( args->dflt_AF==0 ) return -1;       // we skip sites with AF=0
        *alt_freq = args->dflt_AF;
    }

    // Set P(D|G)
    if ( args->fake_PLs )
    {
        // Genotypes may already be in args->itmp if estimate_AF() ran above
        if ( !args->nitmp )
        {
            args->nitmp = bcf_get_genotypes(args->hdr, line, &args->itmp, &args->mitmp);
            if ( args->nitmp != 2*args->nsmpl ) return -1;     // not diploid?
            args->nitmp /= args->nsmpl;
        }

        int32_t *gt = &args->itmp[args->ismpl*args->nitmp];
        if ( bcf_gt_is_missing(gt[0]) || bcf_gt_is_missing(gt[1]) ) return -1;

        // The called genotype gets probability 1-2*unseen_PL, the other two unseen_PL each
        int a = bcf_gt_allele(gt[0]);
        int b = bcf_gt_allele(gt[1]);
        if ( a!=b )
        {
            pdg[0] = pdg[2] = args->unseen_PL;
            pdg[1] = 1 - 2*args->unseen_PL;
        }
        else if ( a==0 )
        {
            pdg[0] = 1 - 2*args->unseen_PL;
            pdg[1] = pdg[2] = args->unseen_PL;
        }
        else
        {
            pdg[0] = pdg[1] = args->unseen_PL;
            pdg[2] = 1 - 2*args->unseen_PL;
        }
    }
    else
    {
        args->nitmp = bcf_get_format_int32(args->hdr, line, "PL", &args->itmp, &args->mitmp);
        if ( args->nitmp != args->nsmpl*line->n_allele*(line->n_allele+1)/2. ) return -1;     // not diploid?
        args->nitmp /= args->nsmpl;

        int32_t *pl = &args->itmp[args->ismpl*args->nitmp];

        // Missing and vector-end values are negative and must not be used as
        // an index into the pl2p lookup table
        if ( pl[0]<0 || pl[1]<0 || pl[2]<0 ) return -1;

        pdg[0] = pl[0] < 256 ? args->pl2p[ pl[0] ] : 1.0;
        pdg[1] = pl[1] < 256 ? args->pl2p[ pl[1] ] : 1.0;
        pdg[2] = pl[2] < 256 ? args->pl2p[ pl[2] ] : 1.0;

        double sum = pdg[0] + pdg[1] + pdg[2];
        if ( !sum ) return -1;
        pdg[0] /= sum;
        pdg[1] /= sum;
        pdg[2] /= sum;
    }

    return 0;
}