Exemple #1
0
/**
 * Builds substitute PL values from hard genotype calls for a record that
 * carries no PL tag.
 *
 * For each sample the called genotype receives PL 0 and every other genotype
 * receives fake_PL (args->no_PLs when non-zero, 99 otherwise).  A sample gets
 * -1 in every slot when its genotype is missing, when the record stores fewer
 * than two alleles per sample, or when the genotype index would fall outside
 * the PL vector.
 *
 * The values are written to args->pl_arr, which is grown with hts_expand();
 * args->tmp_arr receives the raw genotypes.
 *
 * Returns the number of PL slots per sample, n_allele*(n_allele+1)/2.
 * Reports through error() when the record has no GT field.
 */
static int fake_PLs(args_t *args, bcf_hdr_t *hdr, bcf1_t *line)
{
    // PLs not present, use GTs instead.
    int fake_PL = args->no_PLs ? args->no_PLs : 99;    // with 1, discordance is the number of non-matching GTs
    int nsmpl = bcf_hdr_nsamples(hdr);
    int nsm_gt, i, j;
    if ( (nsm_gt=bcf_get_genotypes(hdr, line, &args->tmp_arr, &args->ntmp_arr)) <= 0 )
        error("GT not present at %s:%d?\n", hdr->id[BCF_DT_CTG][line->rid].key, line->pos+1);
    nsm_gt /= nsmpl;    // number of allele slots stored per sample
    int npl = line->n_allele*(line->n_allele+1)/2;
    hts_expand(int,npl*nsmpl,args->npl_arr,args->pl_arr);
    for (i=0; i<nsmpl; i++)
    {
        int *gt_ptr = args->tmp_arr + i*nsm_gt;
        int *pl_ptr = args->pl_arr + i*npl;

        // Read the second allele only when the record really stores one;
        // otherwise gt_ptr[1] would belong to the next sample or lie past
        // the end of the buffer.
        int a = -1, b = -1;
        if ( nsm_gt>=2 )
        {
            a = bcf_gt_allele(gt_ptr[0]);
            b = bcf_gt_allele(gt_ptr[1]);
        }
        int idx = ( a>=0 && b>=0 ) ? bcf_alleles2gt(a,b) : -1;

        if ( idx<0 || idx>=npl ) // missing, haploid or out-of-range genotype
        {
            for (j=0; j<npl; j++) pl_ptr[j] = -1;
            continue;
        }

        for (j=0; j<npl; j++) pl_ptr[j] = fake_PL;
        pl_ptr[idx] = 0;
    }
    return npl;
}
Exemple #2
0
/**
 * Compares the genotype likelihoods of one query sample against the
 * genotypes of every sample in the -g target file and prints a per-sample
 * concordance table.
 *
 * readers[0] of args->files is the query file (args->sm_hdr) and readers[1]
 * is the -g target file (args->gt_hdr).  When the query header has no PL tag,
 * or args->no_PLs is set, PLs are synthesized from GT via fake_PLs().  Only
 * sites present in both files with diploid target genotypes are used.
 *
 * Output goes to "<plot>.tab" when args->plot is set, otherwise to stdout.
 * With args->all_sites an additional "SC" line is printed per site.
 * args->lks and args->sites are accumulated in place; args->pl_arr and
 * args->tmp_arr are released before returning.
 */
static void check_gt(args_t *args)
{
    int i,ret, *gt2ipl = NULL, m_gt2ipl = 0, *gt_arr = NULL, ngt_arr = 0;
    int fake_pls = args->no_PLs;

    // Initialize things: check which tags are defined in the header, sample names etc.
    if ( bcf_hdr_id2int(args->gt_hdr, BCF_DT_ID, "GT")<0 ) error("[E::%s] GT not present in the header of %s?\n", __func__, args->files->readers[1].fname);
    if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "PL")<0 )
    {
        if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 )
            error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname);
        fprintf(stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname);
        fake_pls = 1;
    }

    FILE *fp = args->plot ? open_file(NULL, "w", "%s.tab", args->plot) : stdout;
    print_header(args, fp);

    int tgt_isample = -1, query_isample = 0;
    if ( args->target_sample )
    {
        tgt_isample = bcf_hdr_id2int(args->gt_hdr, BCF_DT_SAMPLE, args->target_sample);
        if ( tgt_isample<0 ) error("No such sample in %s: [%s]\n", args->files->readers[1].fname, args->target_sample);
    }
    if ( args->all_sites )
    {
        if ( tgt_isample==-1 )
        {
            fprintf(stderr,"No target sample selected for comparison, using the first sample in %s: %s\n", args->gt_fname,args->gt_hdr->samples[0]);
            tgt_isample = 0;
        }
    }
    if ( args->query_sample )
    {
        query_isample = bcf_hdr_id2int(args->sm_hdr, BCF_DT_SAMPLE, args->query_sample);
        if ( query_isample<0 ) error("No such sample in %s: [%s]\n", args->files->readers[0].fname, args->query_sample);
    }
    // NOTE(review): the header below advertises a "[6]Coverage" column, but the
    // SC lines printed in the main loop contain no such field - confirm intent.
    if ( args->all_sites )
        fprintf(fp, "# [1]SC, Site by Site Comparison\t[2]Chromosome\t[3]Position\t[4]-g alleles\t[5]-g GT (%s)\t[6]Coverage\t[7]Query alleles\t[8-]Query PLs (%s)\n", args->gt_hdr->samples[tgt_isample],args->sm_hdr->samples[query_isample]);

    // Main loop
    while ( (ret=bcf_sr_next_line(args->files)) )
    {
        if ( ret!=2 ) continue;     // use only sites present in both files
        bcf1_t *sm_line = args->files->readers[0].buffer[0];    // the query file
        bcf1_t *gt_line = args->files->readers[1].buffer[0];    // the -g target file
        bcf_unpack(sm_line, BCF_UN_FMT);
        bcf_unpack(gt_line, BCF_UN_FMT);

        // Init mapping from target genotype index to the sample's PL fields
        int n_gt2ipl = gt_line->n_allele*(gt_line->n_allele + 1)/2;
        if ( n_gt2ipl > m_gt2ipl )
        {
            // Grow through a temporary so the old buffer is not lost on failure
            int *tmp = (int*) realloc(gt2ipl, sizeof(int)*n_gt2ipl);
            if ( !tmp ) error("[E::%s] Could not allocate memory for %d genotype indexes\n", __func__, n_gt2ipl);
            gt2ipl   = tmp;
            m_gt2ipl = n_gt2ipl;
        }
        if ( !init_gt2ipl(args, gt_line, sm_line, gt2ipl, n_gt2ipl) ) continue;

        // Target genotypes
        int ngt, npl;
        if ( (ngt=bcf_get_genotypes(args->gt_hdr, gt_line, &gt_arr, &ngt_arr)) <= 0 )
            error("GT not present at %s:%d?", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, gt_line->pos+1);
        ngt /= bcf_hdr_nsamples(args->gt_hdr);
        if ( ngt!=2 ) continue; // checking only diploid genotypes

        // Sample PLs
        if ( !fake_pls )
        {
            if ( (npl=bcf_get_format_int32(args->sm_hdr, sm_line, "PL", &args->pl_arr, &args->npl_arr)) <= 0 )
                error("PL not present at %s:%d?", args->sm_hdr->id[BCF_DT_CTG][sm_line->rid].key, sm_line->pos+1);
            npl /= bcf_hdr_nsamples(args->sm_hdr);
        }
        else
            npl = fake_PLs(args, args->sm_hdr, sm_line);

        // Calculate likelihoods for all samples, assuming diploid genotypes

        // For faster access to genotype likelihoods (PLs) of the query sample.
        // After the loop max_ipl is the number of PL slots actually filled.
        int max_ipl, *pl_ptr = args->pl_arr + query_isample*npl;
        double sum_pl = 0;  // for converting PLs to probs
        for (max_ipl=0; max_ipl<npl; max_ipl++)
        {
            if ( pl_ptr[max_ipl]==bcf_int32_vector_end ) break;
            if ( pl_ptr[max_ipl]==bcf_int32_missing ) continue;
            sum_pl += pow(10, -0.1*pl_ptr[max_ipl]);
        }
        if ( sum_pl==0 ) continue; // no PLs present

        // The main stats: concordance of the query sample with the target -g samples
        for (i=0; i<bcf_hdr_nsamples(args->gt_hdr); i++)
        {
            int *gt_ptr = gt_arr + i*ngt;
            if ( gt_ptr[1]==bcf_int32_vector_end ) continue;    // skip haploid genotypes
            int a = bcf_gt_allele(gt_ptr[0]);
            int b = bcf_gt_allele(gt_ptr[1]);
            if ( a<0 || b<0 ) continue; // missing genotypes
            if ( args->hom_only && a!=b ) continue; // heterozygous genotype
            int igt_tgt = bcf_alleles2gt(a,b); // genotype index in the target file
            if ( igt_tgt>=n_gt2ipl ) continue;  // allele index beyond the record's alleles
            int igt_qry = gt2ipl[igt_tgt];  // corresponding genotype in query file
            if ( igt_qry>=max_ipl || pl_ptr[igt_qry]<0 ) continue;   // genotype not present in query sample: haploid or missing
            args->lks[i] += log(pow(10, -0.1*pl_ptr[igt_qry])/sum_pl);
            args->sites[i]++;
        }
        if ( args->all_sites )
        {
            // Print LKs at all sites for debugging
            int *gt_ptr = gt_arr + tgt_isample*ngt;
            if ( gt_ptr[1]==bcf_int32_vector_end ) continue;    // skip haploid genotypes
            int a = bcf_gt_allele(gt_ptr[0]);
            int b = bcf_gt_allele(gt_ptr[1]);
            if ( args->hom_only && a!=b ) continue; // heterozygous genotype
            fprintf(fp, "SC\t%s\t%d", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, gt_line->pos+1);
            for (i=0; i<gt_line->n_allele; i++) fprintf(fp, "%c%s", i==0?'\t':',', gt_line->d.allele[i]);
            fprintf(fp, "\t%s/%s", a>=0 ? gt_line->d.allele[a] : ".", b>=0 ? gt_line->d.allele[b] : ".");

            // Query alleles followed by the PLs of the query sample
            int igt;
            for (i=0; i<sm_line->n_allele; i++) fprintf(fp, "%c%s", i==0?'\t':',', sm_line->d.allele[i]);
            for (igt=0; igt<npl; igt++)
            {
                if ( pl_ptr[igt]==bcf_int32_vector_end ) break;
                // Missing values get their own tab-separated column as well
                if ( pl_ptr[igt]==bcf_int32_missing ) fprintf(fp, "\t.");
                else fprintf(fp, "\t%d", pl_ptr[igt]);
            }
            fprintf(fp, "\n");
        }
    }
    free(gt2ipl);
    free(gt_arr);
    // Reset the shared buffers so that no stale pointer or size is left in args
    free(args->pl_arr);
    args->pl_arr  = NULL;
    args->npl_arr = 0;
    free(args->tmp_arr);
    args->tmp_arr  = NULL;
    args->ntmp_arr = 0;

    // Scale LKs and certainties: every total is divided by the smallest total
    int nsamples = bcf_hdr_nsamples(args->gt_hdr);
    double min = args->lks[0];
    for (i=1; i<nsamples; i++) if ( min>args->lks[i] ) min = args->lks[i];
    for (i=0; i<nsamples; i++) args->lks[i] = min ? args->lks[i] / min : 0;
    double max_avg = args->sites[0] ? args->lks[0]/args->sites[0] : 0;
    for (i=1; i<nsamples; i++)
    {
        double val = args->sites[i] ? args->lks[i]/args->sites[i] : 0;
        if ( max_avg<val ) max_avg = val;
    }

    // Sorted output: sort pointers into lks so the sample index can be recovered
    double **p = (double**) malloc(sizeof(*p)*nsamples);
    if ( !p ) error("[E::%s] Could not allocate memory for %d samples\n", __func__, nsamples);
    for (i=0; i<nsamples; i++) p[i] = &args->lks[i];
    qsort(p, nsamples, sizeof(*p), cmp_doubleptr);

    fprintf(fp, "# [1]CN\t[2]Concordance with %s (total)\t[3]Concordance (average)\t[4]Number of sites compared\t[5]Sample\t[6]Sample ID\n", args->sm_hdr->samples[query_isample]);
    for (i=0; i<nsamples; i++)
    {
        int idx = p[i] - args->lks;     // index of the sample in the target header
        double avg = args->sites[idx] ? args->lks[idx]/args->sites[idx] : 0;
        fprintf(fp, "CN\t%e\t%e\t%.0f\t%s\t%d\n", 1-args->lks[idx], 1-avg/max_avg, args->sites[idx], args->gt_hdr->samples[idx], i);
    }
    free(p);

    if ( args->plot )
    {
        fclose(fp);
        plot_check(args, args->target_sample ? args->target_sample : "", args->sm_hdr->samples[query_isample]);
    }
}
Exemple #3
0
/**
 * Computes Allele Balance from genotype likelihoods.
 *
 * Only diploid data is handled: for any other ploidy n is set to 0 and ab is
 * left untouched.  Samples whose first PL is missing are skipped; in the
 * biallelic case samples with zero depth are skipped as well.
 *
 * @pls        - PHRED genotype likelihoods, one group of genotypes per sample
 * @no_samples - number of samples
 * @ploidy     - ploidy
 * @dps        - depths, indexed by sample
 * @GF         - estimated GF
 * @no_alleles - number of alleles
 * @ab         - estimate of allele balance
 * @n          - effective sample size
 */
void Estimator::compute_gl_ab(int32_t *pls, int32_t no_samples, int32_t ploidy,
                              int32_t *dps,
                              float* GF, int32_t no_alleles,
                              float& ab, int32_t& n)
{
    n = 0;

    if (ploidy!=2) return;

    float num = 0, denum = 0;

    if (no_alleles==2)
    {
        for (int32_t k=0; k<no_samples; ++k)
        {
            size_t offset = (size_t)k*3;
            if (pls[offset]==bcf_int32_missing || dps[k]==0)
            {
                continue;
            }

            float nrefnum = pls[offset+2]-pls[offset+0];
            float nrefdenum = pls[offset]+pls[offset+2]-2*pls[offset+1] +6*dps[k];
            float nref = 0.5*dps[k]*(1+(nrefdenum?nrefnum/nrefdenum:0));

            //posterior weight of the heterozygous genotype; the zero guard
            //and its fallback value mirror the multiallelic branch below
            float het = lt->pl2prob(pls[offset+1])*GF[1];
            float total = lt->pl2prob(pls[offset])*GF[0]
                          +het
                          +lt->pl2prob(pls[offset+2])*GF[2];
            float phet = total?het/total:0.333;

            num += phet*nref;
            denum += phet*dps[k];
            ++n;
        }
    }
    else
    {
        int32_t no_genotypes = bcf_an2gn(no_alleles);
        for (int32_t k=0; k<no_samples; ++k)
        {
            size_t offset = (size_t)k*no_genotypes;
            //NOTE(review): unlike the biallelic branch, zero depth samples
            //are not skipped here and still count towards n - confirm
            if (pls[offset]==bcf_int32_missing)
            {
                continue;
            }

            //each alternate allele j is treated as its own REF/j comparison
            for (int32_t j=1; j<no_alleles; ++j)
            {
                size_t het_index = bcf_alleles2gt(0,j);
                size_t homalt_index = bcf_alleles2gt(j,j);
                float nrefnum = pls[offset+homalt_index]-pls[offset];
                float nrefdenum = pls[offset]+pls[offset+homalt_index]-2*pls[offset+het_index] +6*dps[k];
                float nref = 0.5*dps[k]*(1+(nrefdenum?nrefnum/nrefdenum:0));

                float het = lt->pl2prob(pls[offset+het_index])*GF[het_index];
                float total = (lt->pl2prob(pls[offset])*GF[0]
                               +het
                               +lt->pl2prob(pls[offset+homalt_index])*GF[homalt_index]);
                float phet = total?het/total:0.333;
                num += phet*nref;
                denum += phet*dps[k];
            }

            ++n;
        }
    }

    //the added constants keep the ratio defined when no sample contributed
    ab = (0.05+num)/(0.10+denum);
}
Exemple #4
0
/**
 * Computes the Inbreeding Coefficient Statistic from Genotype likelihoods.
 *
 * Only diploid data is handled: for any other ploidy n is set to 0 and F is
 * left untouched.  Samples whose first PL is missing are skipped.
 *
 * F is computed as 1 - num/denum, where num and denum sum the per sample
 * heterozygote weights under GF and under the HWE genotype frequencies
 * derived from HWE_AF.  No guard is applied to that division, so callers
 * should check n before using F.
 *
 * @pls        - PHRED genotype likelihoods
 * @no_samples - number of samples
 * @ploidy     - ploidy
 * @HWE_AF     - AF under HWE assumption
 * @no_alleles - number of alleles
 * @GF         - GF
 * @F          - estimated inbreeding coefficient
 * @n          - effective sample size
 */
void Estimator::compute_gl_fic(int32_t * pls, int32_t no_samples, int32_t ploidy,
                               float* HWE_AF, int32_t no_alleles, float* GF,
                               float& F, int32_t& n)
{
    n = 0;

    if (ploidy!=2)
    {
        return;
    }

    float num = 0, denum = 0;

    if (no_alleles==2)
    {
        int32_t no_genotypes = 3;

        //derived only in this branch so that HWE_AF[1] is never read for a
        //site with fewer than two alleles
        float HWE_GF[3];
        HWE_GF[0] = HWE_AF[0]*HWE_AF[0];
        HWE_GF[1] = 2*HWE_AF[0]*HWE_AF[1];
        HWE_GF[2] = HWE_AF[1]*HWE_AF[1];

        for (int32_t k=0; k<no_samples; ++k)
        {
            size_t offset = (size_t)k*no_genotypes;

            if (pls[offset]==bcf_int32_missing)
            {
                continue;
            }

            ++n;

            //observed: weights under the supplied genotype frequencies
            float o_het_sum = lt->pl2prob(pls[offset+1])*GF[1];
            float o_sum = lt->pl2prob(pls[offset])*GF[0];
            o_sum += o_het_sum;
            o_sum += lt->pl2prob(pls[offset+2])*GF[2];

            //expected: weights under HWE genotype frequencies
            float e_het_sum = lt->pl2prob(pls[offset+1])*HWE_GF[1];
            float e_sum = lt->pl2prob(pls[offset])*HWE_GF[0];
            e_sum += e_het_sum;
            e_sum += lt->pl2prob(pls[offset+2])*HWE_GF[2];

            num += o_het_sum/o_sum;
            denum += e_het_sum/e_sum;
        }
    }
    else
    {
        int32_t no_genotypes = bcf_an2gn(no_alleles);

        float HWE_GF[no_genotypes];

        for (int32_t i=0; i<no_alleles; ++i)
        {
            for (int32_t j=0; j<=i; ++j)
            {
                HWE_GF[bcf_alleles2gt(i,j)] = (i!=j?2:1)*HWE_AF[i]*HWE_AF[j];
            }
        }

        for (int32_t k=0; k<no_samples; ++k)
        {
            size_t offset = (size_t)k*no_genotypes;
            if (pls[offset]==bcf_int32_missing)
            {
                continue;
            }

            ++n;

            float o_het_sum = 0;
            float o_sum = 0;
            float e_het_sum = 0;
            float e_sum = 0;

            //gt_index walks the genotypes in storage order: for each allele i
            //the heterozygotes j/i with j<i come first, then the homozygote i/i
            int32_t gt_index = 0;
            for (int32_t i=0; i<no_alleles; ++i)
            {
                for (int32_t j=0; j<i; ++j)
                {
                    float p = lt->pl2prob(pls[offset+gt_index]);
                    o_het_sum += p * GF[gt_index];
                    o_sum += p * GF[gt_index];

                    e_het_sum += p * HWE_GF[gt_index];
                    e_sum += p * HWE_GF[gt_index];

                    ++gt_index;
                }

                //for homozygote
                o_sum += lt->pl2prob(pls[offset+gt_index]) * GF[gt_index];
                e_sum += lt->pl2prob(pls[offset+gt_index]) * HWE_GF[gt_index];
                ++gt_index;
            }

            num += o_het_sum/o_sum;
            denum += e_het_sum/e_sum;
        }
    }

    F = 1-num/denum;
}
Exemple #5
0
/**
 * Computes allele frequencies using hard calls.
 *
 * All counters (AC, AN, GC, GN, NS) are reset before counting.  Genotype
 * counts are collected for diploid data only.  A frequency whose denominator
 * is zero is reported as 0.  In the biallelic diploid case AF and GF are left
 * untouched when no sample has data (NS==0).
 *
 * @gts        - genotypes, ploidy entries per sample
 * @no_samples - number of samples
 * @ploidy     - ploidy
 * @no_alleles - number of alleles
 * @AC         - alternate allele counts
 * @AN         - total number of allele counts
 * @AF         - alternate allele frequency
 * @GC         - genotype counts
 * @GN         - total number of genotype counts
 * @GF         - genotype frequency
 * @NS         - number of samples with data
 */
void Estimator::compute_af(int32_t *gts, int32_t no_samples, int32_t ploidy,
                int32_t no_alleles, int32_t *AC, int32_t& AN, float *AF,
                int32_t *GC,  int32_t& GN, float *GF, int32_t& NS)
{
    //reset the totals for both branches; the biallelic branch previously
    //kept accumulating onto whatever AN the caller passed in
    NS = 0;
    AN = 0;
    GN = 0;

    if (no_alleles==2 && ploidy==2)
    {
        AC[0] = 0;
        AC[1] = 0;

        GC[0] = 0;
        GC[1] = 0;
        GC[2] = 0;

        for (int32_t k=0; k<no_samples; ++k)
        {
            size_t offset = (size_t)k*ploidy;

            int32_t g1 = bcf_gt_allele(gts[offset]);
            if (g1>=0)
            {
                ++AC[g1];
                ++AN;
            }
            int32_t g2 = bcf_gt_allele(gts[offset+1]);
            if (g2>=0)
            {
                ++AC[g2];
                ++AN;
            }

            //with alleles 0 and 1 the sum of both alleles is the genotype index
            if (g1>=0 && g2>=0)
            {
                ++GC[g1+g2];
                ++GN;
            }

            if (g1>=0||g2>=0) ++NS;
        }

        if (!NS)
        {
            return;
        }

        //NS>0 implies AN>0
        AF[0] = (float)AC[0]/AN;
        AF[1] = (float)AC[1]/AN;

        //GN is still 0 when every genotype is only partially called
        GF[0] = GN ? (float)GC[0]/GN : 0;
        GF[1] = GN ? (float)GC[1]/GN : 0;
        GF[2] = GN ? (float)GC[2]/GN : 0;
    }
    else
    {
        int32_t no_genotypes = bcf_an2gn(no_alleles);
        for (int32_t i=0; i<no_alleles; ++i)
        {
            AC[i] = 0;
        }

        for (int32_t i=0; i<no_genotypes; ++i)
        {
            GC[i] = 0;
        }

        for (int32_t k=0; k<no_samples; ++k)
        {
            size_t offset = (size_t)k*ploidy;

            //only the first two alleles are needed later for the genotype
            //index, so no per sample buffer is kept
            int32_t first = -1, second = -1;

            int32_t last_AN = AN;
            for (int32_t i=0; i<ploidy; ++i)
            {
                int32_t allele = bcf_gt_allele(gts[offset+i]);

                if (allele>=0)
                {
                    ++AC[allele];
                    ++AN;
                }

                if (i==0) first = allele;
                else if (i==1) second = allele;
            }

            if (last_AN<AN) ++NS;

            if (ploidy==2 && first>=0 && second>=0)
            {
                //order the pair so that the smaller allele index comes first
                if (second<first)
                {
                    int32_t tmp = first;
                    first = second;
                    second = tmp;
                }

                ++GC[bcf_alleles2gt(first,second)];
                ++GN;
            }
        }

        for (int32_t i=0; i<no_alleles; ++i)
        {
            AF[i] = AN ? (float)AC[i]/AN : 0;
        }

        for (int32_t i=0; i<no_genotypes; ++i)
        {
            GF[i] = GN ? (float)GC[i]/GN : 0;
        }
    }
}
Exemple #6
0
/**
 * Computes allele frequencies using EM algorithm from genotype likelihoods.
 *
 * Samples whose first PL is missing are ignored.  The iteration stops once
 * the squared change of the genotype frequencies is at most e, or after 50
 * rounds.  When no sample has data, n is set to 0 and MLE_AF and MLE_GF are
 * left untouched.
 *
 * @pls        - PHRED genotype likelihoods
 * @nsamples   - number of samples
 * @ploidy     - ploidy
 * @n_allele   - number of alleles
 * @MLE_AF     - estimated AF
 * @MLE_GF     - estimated GF
 * @n          - effective sample size
 * @e          - error
 */
void Estimator::compute_gl_af(int32_t *pls, int32_t nsamples, int32_t ploidy,
                int32_t n_allele, float *MLE_AF, float *MLE_GF, int32_t& n,
                double e)
{
    int32_t iter = 0;

    n = 0;

    //nothing to estimate; also keeps the sample index array below non-empty
    if (nsamples<=0)
    {
        return;
    }

    if (n_allele==2 && ploidy==2)
    {
        //imap lists the indexes of the samples that have data
        int32_t imap[nsamples];

        for (int32_t i=0; i<nsamples; ++i)
        {
            if (pls[i*3]!=bcf_int32_missing)
            {
                imap[n] = i;
                ++n;
            }
        }

        if (!n)
        {
            return;
        }

        float gf[3];
        float gf_indiv[3];

        float mse = e+1;
        float diff = 0;

        //start from uniform genotype frequencies
        gf[0] = 1.0/3;
        gf[1] = gf[0];
        gf[2] = gf[1];

        while (mse>e && iter<50)
        {
            MLE_GF[0] = 0;
            MLE_GF[1] = 0;
            MLE_GF[2] = 0;

            for (int32_t i=0; i<n; ++i)
            {
                size_t offset = (size_t)imap[i]*3;

                float prob_data = (gf_indiv[0] = gf[0]*lt->pl2prob(pls[offset]));
                prob_data += (gf_indiv[1] = gf[1]*lt->pl2prob(pls[offset+1]));
                prob_data += (gf_indiv[2] = gf[2]*lt->pl2prob(pls[offset+2]));

                MLE_GF[0] += (gf_indiv[0] /= prob_data);
                MLE_GF[1] += (gf_indiv[1] /= prob_data);
                MLE_GF[2] += (gf_indiv[2] /= prob_data);
            }

            MLE_GF[0] /= n;
            MLE_GF[1] /= n;
            MLE_GF[2] /= n;

            diff = (gf[0]-MLE_GF[0]);
            mse = diff*diff;
            diff = (gf[1]-MLE_GF[1]);
            mse += diff*diff;
            diff = (gf[2]-MLE_GF[2]);
            mse += diff*diff;

            gf[0] = MLE_GF[0];
            gf[1] = MLE_GF[1];
            gf[2] = MLE_GF[2];

            ++iter;
        }

        MLE_AF[0] = MLE_GF[0]+0.5*MLE_GF[1];
        MLE_AF[1] = MLE_GF[2]+0.5*MLE_GF[1];
    }
    else
    {
        int32_t imap[nsamples];
        int32_t no_genotypes = bcf_an2gn(n_allele);
        for (int32_t i=0; i<nsamples; ++i)
        {
            if (pls[i*no_genotypes]!=bcf_int32_missing)
            {
                imap[n] = i;
                ++n;
            }
        }

        if (!n) return;

        float gf[no_genotypes];
        float gf_indiv[no_genotypes];

        //initialization
        gf[0] = 1.0/no_genotypes;
        for (int32_t i=1; i<no_genotypes; ++i)
        {
            gf[i] = gf[0];
        }

        float mse = e+1;
        while (mse>e && iter<50)
        {
            //initialization
            for (int32_t i=0; i<no_genotypes; ++i)
            {
                MLE_GF[i] = 0;
            }

            //iterate through individuals
            for (int32_t i=0; i<n; ++i)
            {
                size_t offset = (size_t)imap[i]*no_genotypes;

                float prob_data = 0;
                for (int32_t j=0; j<no_genotypes; ++j)
                {
                    prob_data += (gf_indiv[j] = gf[j]*lt->pl2prob(pls[offset+j]));
                }

                for (int32_t j=0; j<no_genotypes; ++j)
                {
                    MLE_GF[j] += (gf_indiv[j] /= prob_data);
                }
            }

            mse = 0;
            float diff;
            for (int32_t i=0; i<no_genotypes; ++i)
            {
                MLE_GF[i] /= n;
                diff = gf[i]-MLE_GF[i];
                mse += (diff *= diff);
                gf[i] = MLE_GF[i];
            }

            ++iter;
        }

        //the sums below accumulate, so start every allele from zero
        for (int32_t i=0; i<n_allele; ++i)
        {
            MLE_AF[i] = 0;
        }

        //each genotype i/j gives half of its frequency to allele i and half
        //to allele j; a homozygote therefore gives all of it to one allele
        for (int32_t i=0; i<n_allele; ++i)
        {
            for (int32_t j=0; j<=i; ++j)
            {
                int32_t index = bcf_alleles2gt(i,j);
                MLE_AF[i] += 0.5*MLE_GF[index];
                MLE_AF[j] += 0.5*MLE_GF[index];
            }
        }
    }
}
Exemple #7
0
/**
 * Computes allele frequencies using EM algorithm from genotype likelihoods
 * under assumption of Hardy-Weinberg Equilibrium.
 *
 * Only diploid data is handled: for any other ploidy n is set to 0 and the
 * estimates are left untouched.  Samples whose first PL is missing are
 * ignored.  The iteration stops once the squared change of the allele
 * frequencies is at most e, or after 50 rounds.  When no sample has data,
 * n is 0 and MLE_HWE_AF and MLE_HWE_GF are left untouched.
 *
 * @pls        - PHRED genotype likelihoods
 * @no_samples - number of samples
 * @ploidy     - ploidy
 * @no_alleles - number of alleles
 * @MLE_HWE_AF - estimated AF
 * @MLE_HWE_GF - estimated GF
 * @n          - effective sample size
 * @e          - error
 */
void Estimator::compute_gl_af_hwe(int32_t *pls, int32_t no_samples, int32_t ploidy,
                int32_t no_alleles, float *MLE_HWE_AF, float *MLE_HWE_GF, int32_t& n,
                double e)
{
    int32_t iter = 0;

    //reset before any early return so that callers never see a stale count
    n = 0;

    //no_samples<=0 also has to leave before the sample index arrays below
    if (ploidy!=2 || no_samples<=0)
    {
        return;
    }

    if (no_alleles==2)
    {
        //imap lists the indexes of the samples that have data
        int32_t imap[no_samples];

        for (int32_t i=0; i<no_samples; ++i)
        {
            if (pls[i*3]!=bcf_int32_missing)
            {
                imap[n] = i;
                ++n;
            }
        }

        if (!n)
        {
            return;
        }

        float af[2] = {0.5, 0.5};
        float gf[3];
        float gf_indiv[3];

        float mse = e+1;
        float diff = 0;
        while (mse>e && iter<50)
        {
            //genotype frequencies implied by the current allele frequencies
            gf[0] = af[0]*af[0];
            gf[1] = 2*af[0]*af[1];
            gf[2] = af[1]*af[1];

            MLE_HWE_AF[0] = 0;
            MLE_HWE_AF[1] = 0;

            for (int32_t i=0; i<n; ++i)
            {
                size_t offset = (size_t)imap[i]*3;

                float prob_data = (gf_indiv[0] = gf[0]*lt->pl2prob(pls[offset]));
                prob_data += (gf_indiv[1] = gf[1]*lt->pl2prob(pls[offset+1]));
                prob_data += (gf_indiv[2] = gf[2]*lt->pl2prob(pls[offset+2]));

                gf_indiv[0] /= prob_data;
                gf_indiv[1] /= prob_data;
                gf_indiv[2] /= prob_data;

                MLE_HWE_AF[0] += gf_indiv[0] + 0.5*gf_indiv[1];
                MLE_HWE_AF[1] += gf_indiv[2] + 0.5*gf_indiv[1];
            }

            MLE_HWE_AF[0] /= n;
            MLE_HWE_AF[1] /= n;

            diff = (af[0]-MLE_HWE_AF[0]);
            mse = diff*diff;
            diff = (af[1]-MLE_HWE_AF[1]);
            mse += diff*diff;

            af[0] = MLE_HWE_AF[0];
            af[1] = MLE_HWE_AF[1];

            ++iter;
        }

        MLE_HWE_GF[0] = MLE_HWE_AF[0]*MLE_HWE_AF[0];
        MLE_HWE_GF[1] = 2*MLE_HWE_AF[0]*MLE_HWE_AF[1];
        MLE_HWE_GF[2] = MLE_HWE_AF[1]*MLE_HWE_AF[1];
    }
    else
    {
        int32_t imap[no_samples];
        int32_t no_genotypes = bcf_an2gn(no_alleles);
        for (int32_t k=0; k<no_samples; ++k)
        {
            if (pls[k*no_genotypes]!=bcf_int32_missing)
            {
                imap[n] = k;
                ++n;
            }
        }

        if (!n) return;

        //start from uniform allele frequencies
        float af[no_alleles];
        float p = 1.0/no_alleles;
        for (int32_t i=0; i<no_alleles; ++i)
        {
            af[i] = p;
        }
        float gf[no_genotypes];
        float gf_indiv[no_genotypes];

        float mse = e+1;
        while (mse>e && iter<50)
        {
            //reset the estimates and derive genotype frequencies under HWE
            for (int32_t i=0; i<no_alleles; ++i)
            {
                MLE_HWE_AF[i] = 0;
                for (int32_t j=0; j<=i; ++j)
                {
                    gf[bcf_alleles2gt(i,j)] = (i!=j?2:1)*af[i]*af[j];
                }
            }

            //iterate through individuals
            for (int32_t k=0; k<n; ++k)
            {
                size_t offset = (size_t)imap[k]*no_genotypes;

                float prob_data = 0;
                for (int32_t i=0; i<no_genotypes; ++i)
                {
                    prob_data += (gf_indiv[i] = gf[i]*lt->pl2prob(pls[offset+i]));
                }

                for (int32_t i=0; i<no_genotypes; ++i)
                {
                    gf_indiv[i] /= prob_data;
                }

                //each genotype i/j gives half of its weight to each allele
                for (int32_t i=0; i<no_alleles; ++i)
                {
                    for (int32_t j=0; j<=i; ++j)
                    {
                        int32_t gf_index = bcf_alleles2gt(i,j);
                        MLE_HWE_AF[i] += 0.5*gf_indiv[gf_index];
                        MLE_HWE_AF[j] += 0.5*gf_indiv[gf_index];
                    }
                }
            }

            //normalize to frequency
            mse = 0;
            float diff;
            for (int32_t i=0; i<no_alleles; ++i)
            {
                MLE_HWE_AF[i] /= n;
                diff = af[i]-MLE_HWE_AF[i];
                mse += (diff*diff);
                af[i] = MLE_HWE_AF[i];
            }

            ++iter;
        }

        for (int32_t i=0; i<no_alleles; ++i)
        {
            for (int32_t j=0; j<=i; ++j)
            {
                MLE_HWE_GF[bcf_alleles2gt(i,j)] = (i!=j?2:1)*MLE_HWE_AF[i]*MLE_HWE_AF[j];
            }
        }
    }
}