/*
 *  Fabricate PL values from hard genotype calls for records that carry no PL.
 *
 *  For every sample in `hdr`, npl = n_allele*(n_allele+1)/2 values are written
 *  to args->pl_arr (grown via hts_expand when too small):
 *    - the called diploid genotype gets 0, every other genotype gets
 *      args->no_PLs, or 99 when args->no_PLs is zero;
 *    - samples with a missing or non-diploid genotype get -1 in every slot.
 *
 *  Calls error() when the record has no GT field.
 *  Returns the number of PL values written per sample.
 */
static int fake_PLs(args_t *args, bcf_hdr_t *hdr, bcf1_t *line)
{
    // PLs not present, use GTs instead.
    int fake_PL = args->no_PLs ? args->no_PLs : 99;    // with 1, discordance is the number of non-matching GTs
    int nsmpl = bcf_hdr_nsamples(hdr);
    int nsm_gt, i, j;

    if ( (nsm_gt=bcf_get_genotypes(hdr, line, &args->tmp_arr, &args->ntmp_arr)) <= 0 )
        error("GT not present at %s:%d?\n", hdr->id[BCF_DT_CTG][line->rid].key, line->pos+1);
    nsm_gt /= nsmpl;    // number of GT values stored per sample

    int npl = line->n_allele*(line->n_allele+1)/2;
    hts_expand(int,npl*nsmpl,args->npl_arr,args->pl_arr);

    for (i=0; i<nsmpl; i++)
    {
        int *gt_ptr = args->tmp_arr + i*nsm_gt;
        int *pl_ptr = args->pl_arr + i*npl;

        // Only read the second allele when the record really stores two values
        // per sample; otherwise gt_ptr[1] belongs to the next sample (or lies
        // past the end of the array for the last one).
        int a = -1, b = -1;
        if ( nsm_gt>=2 )
        {
            a = bcf_gt_allele(gt_ptr[0]);
            b = bcf_gt_allele(gt_ptr[1]);
        }

        if ( a<0 || b<0 ) // missing (or non-diploid) genotype
        {
            for (j=0; j<npl; j++) pl_ptr[j] = -1;
            continue;
        }

        for (j=0; j<npl; j++) pl_ptr[j] = fake_PL;
        int idx = bcf_alleles2gt(a,b);
        if ( idx<npl ) pl_ptr[idx] = 0;     // guard against allele indexes beyond n_allele
    }
    return npl;
}
/*
 *  Compare the query sample (reader 0, args->sm_hdr) against every sample of
 *  the -g file (reader 1, args->gt_hdr) and print a per-sample concordance table.
 *
 *  For each record on which bcf_sr_next_line() returns 2, the query sample's
 *  PLs (or PLs fabricated from GT by fake_PLs() when PL is absent from the
 *  header or args->no_PLs is set) are turned into a normalised likelihood of
 *  each target sample's diploid genotype; its log is accumulated in
 *  args->lks[] and the number of contributing sites in args->sites[].
 *  With args->all_sites, one "SC" line per site is printed as well.
 *
 *  Output goes to "<args->plot>.tab" when args->plot is set, otherwise to
 *  stdout; with args->plot the file is closed and plot_check() is called.
 *  args->pl_arr and args->tmp_arr are released before returning.
 *  error() is called for missing tags, unknown sample names and sites that
 *  lack the GT/PL field.
 */
static void check_gt(args_t *args)
{
    int i,ret, *gt2ipl = NULL, m_gt2ipl = 0, *gt_arr = NULL, ngt_arr = 0;
    int fake_pls = args->no_PLs;

    // Initialize things: check which tags are defined in the header, sample names etc.
    if ( bcf_hdr_id2int(args->gt_hdr, BCF_DT_ID, "GT")<0 ) error("[E::%s] GT not present in the header of %s?\n", __func__, args->files->readers[1].fname);
    if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "PL")<0 )
    {
        if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 )
            error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname);
        fprintf(stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname);
        fake_pls = 1;
    }

    FILE *fp = args->plot ? open_file(NULL, "w", "%s.tab", args->plot) : stdout;
    print_header(args, fp);

    int tgt_isample = -1, query_isample = 0;
    if ( args->target_sample )
    {
        tgt_isample = bcf_hdr_id2int(args->gt_hdr, BCF_DT_SAMPLE, args->target_sample);
        if ( tgt_isample<0 ) error("No such sample in %s: [%s]\n", args->files->readers[1].fname, args->target_sample);
    }
    if ( args->all_sites )
    {
        if ( tgt_isample==-1 )
        {
            fprintf(stderr,"No target sample selected for comparison, using the first sample in %s: %s\n", args->gt_fname,args->gt_hdr->samples[0]);
            tgt_isample = 0;
        }
    }
    if ( args->query_sample )
    {
        query_isample = bcf_hdr_id2int(args->sm_hdr, BCF_DT_SAMPLE, args->query_sample);
        if ( query_isample<0 ) error("No such sample in %s: [%s]\n", args->files->readers[0].fname, args->query_sample);
    }
    // NOTE(review): the header announces a "[6]Coverage" column, but the SC
    // lines printed below never emit one - confirm which of the two is intended.
    if ( args->all_sites )
        fprintf(fp, "# [1]SC, Site by Site Comparison\t[2]Chromosome\t[3]Position\t[4]-g alleles\t[5]-g GT (%s)\t[6]Coverage\t[7]Query alleles\t[8-]Query PLs (%s)\n",
                args->gt_hdr->samples[tgt_isample],args->sm_hdr->samples[query_isample]);

    // Main loop
    while ( (ret=bcf_sr_next_line(args->files)) )
    {
        if ( ret!=2 ) continue;     // presumably: the record must be present in both readers
        bcf1_t *sm_line = args->files->readers[0].buffer[0];    // the query file
        bcf1_t *gt_line = args->files->readers[1].buffer[0];    // the -g target file
        bcf_unpack(sm_line, BCF_UN_FMT);
        bcf_unpack(gt_line, BCF_UN_FMT);

        // Init mapping from target genotype index to the sample's PL fields
        int n_gt2ipl = gt_line->n_allele*(gt_line->n_allele + 1)/2;
        if ( n_gt2ipl > m_gt2ipl )
        {
            // keep the old pointer until the new one is known to be valid
            int *tmp = (int*) realloc(gt2ipl, sizeof(*tmp)*n_gt2ipl);
            if ( !tmp ) error("Could not allocate memory for %d genotype indexes\n", n_gt2ipl);
            gt2ipl   = tmp;
            m_gt2ipl = n_gt2ipl;
        }
        if ( !init_gt2ipl(args, gt_line, sm_line, gt2ipl, n_gt2ipl) ) continue;

        // Target genotypes
        int ngt, npl;
        if ( (ngt=bcf_get_genotypes(args->gt_hdr, gt_line, &gt_arr, &ngt_arr)) <= 0 )
            error("GT not present at %s:%d?", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, gt_line->pos+1);
        ngt /= bcf_hdr_nsamples(args->gt_hdr);
        if ( ngt!=2 ) continue; // checking only diploid genotypes

        // Sample PLs
        if ( !fake_pls )
        {
            if ( (npl=bcf_get_format_int32(args->sm_hdr, sm_line, "PL", &args->pl_arr, &args->npl_arr)) <= 0 )
                error("PL not present at %s:%d?", args->sm_hdr->id[BCF_DT_CTG][sm_line->rid].key, sm_line->pos+1);
            npl /= bcf_hdr_nsamples(args->sm_hdr);
        }
        else
            npl = fake_PLs(args, args->sm_hdr, sm_line);

        // Calculate likelihoods for all samples, assuming diploid genotypes

        // For faster access to genotype likelihoods (PLs) of the query sample
        int max_ipl, *pl_ptr = args->pl_arr + query_isample*npl;
        double sum_pl = 0; // for converting PLs to probs
        for (max_ipl=0; max_ipl<npl; max_ipl++)
        {
            if ( pl_ptr[max_ipl]==bcf_int32_vector_end ) break;
            if ( pl_ptr[max_ipl]==bcf_int32_missing ) continue;
            sum_pl += pow(10, -0.1*pl_ptr[max_ipl]);
        }
        if ( sum_pl==0 ) continue; // no PLs present

        // The main stats: concordance of the query sample with the target -g samples
        for (i=0; i<bcf_hdr_nsamples(args->gt_hdr); i++)
        {
            int *gt_ptr = gt_arr + i*ngt;
            if ( gt_ptr[1]==bcf_int32_vector_end ) continue;    // skip haploid genotypes
            int a = bcf_gt_allele(gt_ptr[0]);
            int b = bcf_gt_allele(gt_ptr[1]);
            if ( a<0 || b<0 ) continue; // missing genotypes
            if ( args->hom_only && a!=b ) continue; // heterozygous genotype
            int igt_tgt = bcf_alleles2gt(a,b);  // genotype index in the target file
            if ( igt_tgt>=n_gt2ipl ) continue;  // allele index beyond the record's alleles
            int igt_qry = gt2ipl[igt_tgt];      // corresponding genotype in query file
            // genotype not present in query sample: haploid or missing
            if ( igt_qry<0 || igt_qry>=max_ipl || pl_ptr[igt_qry]<0 ) continue;
            args->lks[i] += log(pow(10, -0.1*pl_ptr[igt_qry])/sum_pl);
            args->sites[i]++;
        }

        if ( args->all_sites )
        {
            // Print LKs at all sites for debugging
            int *gt_ptr = gt_arr + tgt_isample*ngt;
            if ( gt_ptr[1]==bcf_int32_vector_end ) continue;    // skip haploid genotypes
            int a = bcf_gt_allele(gt_ptr[0]);
            int b = bcf_gt_allele(gt_ptr[1]);
            if ( args->hom_only && a!=b ) continue; // heterozygous genotype
            fprintf(fp, "SC\t%s\t%d", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, gt_line->pos+1);
            for (i=0; i<gt_line->n_allele; i++) fprintf(fp, "%c%s", i==0?'\t':',', gt_line->d.allele[i]);
            fprintf(fp, "\t%s/%s", a>=0 ? gt_line->d.allele[a] : ".", b>=0 ? gt_line->d.allele[b] : ".");

            int igt, *qry_pl = args->pl_arr + query_isample*npl; // PLs of the query sample
            for (i=0; i<sm_line->n_allele; i++) fprintf(fp, "%c%s", i==0?'\t':',', sm_line->d.allele[i]);
            for (igt=0; igt<npl; igt++)
            {
                if ( qry_pl[igt]==bcf_int32_vector_end ) break;
                // a missing value is a column of its own and needs the separator too
                if ( qry_pl[igt]==bcf_int32_missing ) fprintf(fp, "\t.");
                else fprintf(fp, "\t%d", qry_pl[igt]);
            }
            fprintf(fp, "\n");
        }
    }
    free(gt2ipl);
    free(gt_arr);
    // Reset pointer and capacity together so that later users of these buffers
    // (or a later free) see a consistent empty state instead of a dangling pointer.
    free(args->pl_arr);  args->pl_arr  = NULL; args->npl_arr  = 0;
    free(args->tmp_arr); args->tmp_arr = NULL; args->ntmp_arr = 0;

    // Scale LKs and certainties
    int nsamples = bcf_hdr_nsamples(args->gt_hdr);
    double min = nsamples>0 ? args->lks[0] : 0;
    for (i=1; i<nsamples; i++) if ( min>args->lks[i] ) min = args->lks[i];
    for (i=0; i<nsamples; i++) args->lks[i] = min ? args->lks[i] / min : 0;
    double max_avg = (nsamples>0 && args->sites[0]) ? args->lks[0]/args->sites[0] : 0;
    for (i=1; i<nsamples; i++)
    {
        double val = args->sites[i] ? args->lks[i]/args->sites[i] : 0;
        if ( max_avg<val ) max_avg = val;
    }

    // Sorted output: sort pointers into lks[] so the sample index can be recovered
    double **p = (double**) malloc(sizeof(*p)*nsamples);
    if ( !p && nsamples>0 ) error("Could not allocate memory for %d samples\n", nsamples);
    for (i=0; i<nsamples; i++) p[i] = &args->lks[i];
    qsort(p, nsamples, sizeof(*p), cmp_doubleptr);

    fprintf(fp, "# [1]CN\t[2]Concordance with %s (total)\t[3]Concordance (average)\t[4]Number of sites compared\t[5]Sample\t[6]Sample ID\n", args->sm_hdr->samples[query_isample]);
    for (i=0; i<nsamples; i++)
    {
        int idx = p[i] - args->lks;
        double avg = args->sites[idx] ? args->lks[idx]/args->sites[idx] : 0;
        // NOTE(review): when max_avg is 0 the third column evaluates 0/0 - confirm the wanted output.
        fprintf(fp, "CN\t%e\t%e\t%.0f\t%s\t%d\n", 1-args->lks[idx], 1-avg/max_avg, args->sites[idx], args->gt_hdr->samples[idx], i);
    }
    free(p);

    if ( args->plot )
    {
        fclose(fp);
        plot_check(args, args->target_sample ? args->target_sample : "", args->sm_hdr->samples[query_isample]);
    }
}
/** * Computes Allele Balance from genotype likelihoods. * * @pls - PHRED genotype likelihoods * @no_samples - number of samples * @ploidy - ploidy * @dps - depths * @GF - estimated GF * @no_alleles - number of alleles * @ab - estimate of allele balance * @n - effective sample size */ void Estimator::compute_gl_ab(int32_t *pls, int32_t no_samples, int32_t ploidy, int32_t *dps, float* GF, int32_t no_alleles, float& ab, int32_t& n) { n = 0; if (ploidy!=2) return; if (no_alleles==2) { float num = 0, denum = 0; for (size_t k=0; k<no_samples; ++k) { size_t offset = k*3; if(pls[offset]!=bcf_int32_missing && dps[k]!=0) { float nrefnum = pls[offset+2]-pls[offset+0]; float nrefdenum = pls[offset]+pls[offset+2]-2*pls[offset+1] +6*dps[k]; float nref = 0.5*dps[k]*(1+(nrefdenum?nrefnum/nrefdenum:0)); float phet = lt->pl2prob(pls[offset+1])*GF[1] / ( lt->pl2prob(pls[offset])*GF[0] +lt->pl2prob(pls[offset+1])*GF[1] +lt->pl2prob(pls[offset+2])*GF[2]); num += phet*nref; denum += phet*dps[k]; ++n; } } ab = (0.05+num)/(0.10+denum); } else { int32_t no_genotypes = bcf_an2gn(no_alleles); float num = 0, denum = 0; for (size_t k=0; k<no_samples; ++k) { size_t offset = k*no_genotypes; if(pls[offset]!=bcf_int32_missing) { float prob_data, p_ref; int32_t gt_index = 0; for (size_t j=1; j<no_alleles; ++j) { size_t het_index = bcf_alleles2gt(0,j); size_t homalt_index = bcf_alleles2gt(j,j); float nrefnum = pls[offset+homalt_index]-pls[offset]; float nrefdenum = pls[offset]+pls[offset+homalt_index]-2*pls[offset+het_index] +6*dps[k]; float nref = 0.5*dps[k]*(1+(nrefdenum?nrefnum/nrefdenum:0)); float n = lt->pl2prob(pls[offset+het_index])*GF[het_index] ; float d = (lt->pl2prob(pls[offset])*GF[0] +n +lt->pl2prob(pls[offset+homalt_index])*GF[homalt_index]); float phet = d?n/d:0.333; num += phet*nref; denum += phet*dps[k]; } ++n; } } ab = (0.05+num)/(0.10+denum); } };
/** * Computes the Inbreeding Coefficient Statistic from Genotype likelihoods. * * @pls - PHRED genotype likelihoods * @no_samples - number of samples * @ploidy - ploidy * @GF - GF * @HWE_AF - AF under HWE assumption * @no_alleles - number of alleles * @F - estimated inbreeding coefficient * @n - effective sample size */ void Estimator::compute_gl_fic(int32_t * pls, int32_t no_samples, int32_t ploidy, float* HWE_AF, int32_t no_alleles, float* GF, float& F, int32_t& n) { n = 0; if (ploidy!=2) { return; } float HWE_GF[3]; HWE_GF[0] = HWE_AF[0]*HWE_AF[0]; HWE_GF[1] = 2*HWE_AF[0]*HWE_AF[1]; HWE_GF[2] = HWE_AF[1]*HWE_AF[1]; if (no_alleles==2) { int32_t no_genotypes = 3; float num = 0, denum=0; for (size_t k=0; k<no_samples; ++k) { size_t offset = k*no_genotypes; if (pls[offset]==bcf_int32_missing) { continue; } ++n; float o_het_sum = lt->pl2prob(pls[offset+1])*GF[1]; float o_sum = lt->pl2prob(pls[offset])*GF[0]; o_sum += o_het_sum; o_sum += lt->pl2prob(pls[offset+2])*GF[2]; float e_het_sum = lt->pl2prob(pls[offset+1])*HWE_GF[1]; float e_sum = lt->pl2prob(pls[offset])*HWE_GF[0]; e_sum += e_het_sum; e_sum += lt->pl2prob(pls[offset+2])*HWE_GF[2]; num += o_het_sum/o_sum; denum += e_het_sum/e_sum; } F = 1-num/denum; } else { int32_t no_genotypes = bcf_an2gn(no_alleles); float HWE_GF[no_genotypes]; for (size_t i=0; i<no_alleles; ++i) { for (size_t j=0; j<=i; ++j) { HWE_GF[bcf_alleles2gt(i,j)] = (i!=j?2:1)*HWE_AF[i]*HWE_AF[j]; } } float num=0, denum=0; float o_het_sum; float o_sum; float e_het_sum; float e_sum; for (size_t k=0; k<no_samples; ++k) { size_t offset = k*no_genotypes; if (pls[offset]==bcf_int32_missing) { continue; } ++n; o_het_sum = 0; o_sum = 0; e_het_sum = 0; e_sum = 0; int32_t gt_index = 0; for (size_t i=0; i<no_alleles; ++i) { for (size_t j=0; j<i; ++j) { float p = lt->pl2prob(pls[offset+gt_index]); o_het_sum += p * GF[gt_index]; o_sum += p * GF[gt_index]; e_het_sum += p * HWE_GF[gt_index]; e_sum += p * HWE_GF[gt_index]; ++gt_index; } //for homozygote o_sum += 
lt->pl2prob(pls[offset+gt_index]) * GF[gt_index]; e_sum += lt->pl2prob(pls[offset+gt_index]) * HWE_GF[gt_index]; ++gt_index; } num += o_het_sum/o_sum; denum += e_het_sum/e_sum; } F = 1-num/denum; } };
/** * Computes allele frequencies using hard calls. * * @gts - genotypes * @no_samples - number of samples * @ploidy - ploidy * @no_alleles - number of alleles * @AC - alternate allele counts * @AN - total number of allele counts * @AF - alternate allele frequency * @GC - genotype counts * @GN - total number of genotype counts * @GF - genotype frequency * @NS - number of samples with data */ void Estimator::compute_af(int32_t *gts, int32_t no_samples, int32_t ploidy, int32_t no_alleles, int32_t *AC, int32_t& AN, float *AF, int32_t *GC, int32_t& GN, float *GF, int32_t& NS) { int32_t iter = 0; if (no_alleles==2 && ploidy==2) { NS = 0; int32_t no_genotypes = 3; AC[0] = 0; AC[1] = 0; GC[0] = 0; GC[1] = 0; GC[2] = 0; GN=0; for (size_t k=0; k<no_samples; ++k) { size_t offset = k*ploidy; int32_t g1 = bcf_gt_allele(gts[offset]); if (g1>=0) { ++AC[g1]; ++AN; } int32_t g2 = bcf_gt_allele(gts[offset+1]); if (g2>=0) { ++AC[g2]; ++AN; } if (g1>=0 && g2>=0) { ++GC[g1+g2]; ++GN; } if (g1>=0||g2>=0) ++NS; } if (!NS) { return; } AF[0] = (float)AC[0]/AN; AF[1] = (float)AC[1]/AN; GF[0] = (float)GC[0]/GN; GF[1] = (float)GC[1]/GN; GF[2] = (float)GC[2]/GN; } else { NS = 0; AN = 0; GN = 0; int32_t no_genotypes = bcf_an2gn(no_alleles); for (size_t i=0; i<no_alleles; ++i) { AC[i] = 0; } for (size_t i=0; i<no_genotypes; ++i) { GC[i] = 0; } int32_t gt_indiv[ploidy]; for (size_t k=0; k<no_samples; ++k) { size_t offset = k*ploidy; int32_t last_AN = AN; for (size_t i=0; i<ploidy; ++i) { gt_indiv[i] = bcf_gt_allele(gts[offset+i]); if (gt_indiv[i]>=0) { ++AC[gt_indiv[i]]; ++AN; } } if (last_AN<AN) ++NS; if (ploidy==2 && gt_indiv[0]>=0 && gt_indiv[1]>=0) { if (gt_indiv[1]<gt_indiv[0]) { gt_indiv[0] += gt_indiv[1]; gt_indiv[1] = gt_indiv[0] - gt_indiv[1]; gt_indiv[0] -= gt_indiv[1]; } ++GC[bcf_alleles2gt(gt_indiv[0],gt_indiv[1])]; ++GN; } } for (size_t i=0; i<no_alleles; ++i) { AF[i] = (float)AC[i]/AN; } for (size_t i=0; i<no_genotypes; ++i) { GF[i] = (float)GC[i]/GN; } } }
/** * Computes allele frequencies using EM algorithm from genotype likelihoods. * * @pls - PHRED genotype likelihoods * @no_samples - number of samples * @ploidy - ploidy * @n_alleles - number of alleles * @MLE_AF - estimated AF * @MLE_GF - estimated GF * @n - effective sample size * @e - error */ void Estimator::compute_gl_af(int32_t *pls, int32_t nsamples, int32_t ploidy, int32_t n_allele, float *MLE_AF, float *MLE_GF, int32_t& n, double e) { int32_t iter = 0; if (n_allele==2 && ploidy==2) { n = 0; int32_t imap[nsamples]; for (size_t i=0; i<nsamples; ++i) { if (pls[i*3]!=bcf_int32_missing) { imap[n] = i; ++n; } } if (!n) { return; } float gf[3]; float gf_indiv[3]; float mse = e+1; float diff = 0; gf[0] = 1.0/3; gf[1] = gf[0]; gf[2] = gf[1]; while (mse>e && iter<50) { MLE_GF[0] = 0; MLE_GF[1] = 0; MLE_GF[2] = 0; for (size_t i=0; i<n; ++i) { size_t offset = imap[i]*3; float prob_data = (gf_indiv[0] = gf[0]*lt->pl2prob(pls[offset])); prob_data += (gf_indiv[1] = gf[1]*lt->pl2prob(pls[offset+1])); prob_data += (gf_indiv[2] = gf[2]*lt->pl2prob(pls[offset+2])); MLE_GF[0] += (gf_indiv[0] /= prob_data); MLE_GF[1] += (gf_indiv[1] /= prob_data); MLE_GF[2] += (gf_indiv[2] /= prob_data); } MLE_GF[0] /= n; MLE_GF[1] /= n; MLE_GF[2] /= n; diff = (gf[0]-MLE_GF[0]); mse = diff*diff; diff = (gf[1]-MLE_GF[1]); mse += diff*diff; diff = (gf[2]-MLE_GF[2]); mse += diff*diff; gf[0] = MLE_GF[0]; gf[1] = MLE_GF[1]; gf[2] = MLE_GF[2]; ++iter; } MLE_AF[0] = MLE_GF[0]+0.5*MLE_GF[1]; MLE_AF[1] = MLE_GF[2]+0.5*MLE_GF[1]; } else { n = 0; int32_t imap[nsamples]; int32_t no_genotypes = bcf_an2gn(n_allele); for (size_t i=0; i<nsamples; ++i) { if (pls[i*no_genotypes]!=bcf_int32_missing) { imap[n] = i; ++n; } } if (!n) return; float gf[no_genotypes]; float gf_indiv[no_genotypes]; //initialization gf[0] = 1.0/no_genotypes; for (size_t i=1; i<no_genotypes; ++i) { gf[i] = gf[0]; } float mse = e+1; while (mse>e && iter<50) { //initialization for (size_t i=0; i<no_genotypes; ++i) { MLE_GF[i] = 0; } 
//iterate through individuals for (size_t i=0; i<n; ++i) { size_t offset = imap[i]*no_genotypes; float prob_data = 0; for (size_t j=0; j<no_genotypes; ++j) { prob_data += (gf_indiv[j] = gf[j]*lt->pl2prob(pls[offset+j])); } for (size_t j=0; j<no_genotypes; ++j) { MLE_GF[j] += (gf_indiv[j] /= prob_data); } } mse = 0; float diff; for (size_t i=0; i<no_genotypes; ++i) { MLE_GF[i] /= n; diff = gf[i]-MLE_GF[i]; mse += (diff *= diff); gf[i] = MLE_GF[i]; } ++iter; } for (size_t i=0; i<n_allele; ++i) { for (size_t j=0; j<=i; ++j) { int32_t index = bcf_alleles2gt(i,j); MLE_AF[i] += 0.5*MLE_GF[index]; MLE_AF[i] += 0.5*MLE_GF[index]; } } } }
/** * Computes allele frequencies using EM algorithm from genotype likelihoods * under assumption of Hardy-Weinberg Equilibrium. * * @pls - PHRED genotype likelihoods * @no_samples - number of samples * @ploidy - ploidy * @no_alleles - number of alleles * @MLE_HWE_AF - estimated AF * @MLE_HWE_GF - estimated GF * @n - effective sample size * @e - error */ void Estimator::compute_gl_af_hwe(int32_t *pls, int32_t no_samples, int32_t ploidy, int32_t no_alleles, float *MLE_HWE_AF, float *MLE_HWE_GF, int32_t& n, double e) { int32_t iter = 0; if (ploidy!=2) { return; } if (no_alleles==2) { n = 0; int32_t imap[no_samples]; for (size_t i=0; i<no_samples; ++i) { if (pls[i*3]!=bcf_int32_missing) { imap[n] = i; ++n; } } if (!n) { return; } float af[2] = {0.5, 0.5}; float gf[3]; float gf_indiv[3]; float mse = e+1; float diff = 0; while (mse>e && iter<50) { gf[0] = af[0]*af[0]; gf[1] = 2*af[0]*af[1]; gf[2] = af[1]*af[1]; MLE_HWE_AF[0] = 0; MLE_HWE_AF[1] = 0; for (size_t i=0; i<n; ++i) { size_t offset = imap[i]*3; float prob_data = (gf_indiv[0] = gf[0]*lt->pl2prob(pls[offset])); prob_data += (gf_indiv[1] = gf[1]*lt->pl2prob(pls[offset+1])); prob_data += (gf_indiv[2] = gf[2]*lt->pl2prob(pls[offset+2])); gf_indiv[0] /= prob_data; gf_indiv[1] /= prob_data; gf_indiv[2] /= prob_data; MLE_HWE_AF[0] += gf_indiv[0] + 0.5*gf_indiv[1]; MLE_HWE_AF[1] += gf_indiv[2] + 0.5*gf_indiv[1]; } MLE_HWE_AF[0] /= n; MLE_HWE_AF[1] /= n; diff = (af[0]-MLE_HWE_AF[0]); mse = diff*diff; diff = (af[1]-MLE_HWE_AF[1]); mse += diff*diff; af[0] = MLE_HWE_AF[0]; af[1] = MLE_HWE_AF[1]; ++iter; } MLE_HWE_GF[0] = MLE_HWE_AF[0]*MLE_HWE_AF[0]; MLE_HWE_GF[1] = 2*MLE_HWE_AF[0]*MLE_HWE_AF[1]; MLE_HWE_GF[2] = MLE_HWE_AF[1]*MLE_HWE_AF[1]; } else { n = 0; int32_t imap[no_samples]; int32_t no_genotypes = bcf_an2gn(no_alleles); for (size_t k=0; k<no_samples; ++k) { if (pls[k*no_genotypes]!=bcf_int32_missing) { imap[n] = k; ++n; } } if (!n) return; float af[no_alleles]; float p = 1.0/no_alleles; for (size_t i=0; i<no_alleles; 
++i) { af[i] = p; } float gf[no_genotypes]; float gf_indiv[no_genotypes]; bool debug = false; float mse = e+1; while (mse>e && iter<50) { for (size_t i=0; i<no_alleles; ++i) { MLE_HWE_AF[i] = 0; for (size_t j=0; j<=i; ++j) { gf[bcf_alleles2gt(i,j)] = (i!=j?2:1)*af[i]*af[j]; } } //iterate through individuals for (size_t k=0; k<n; ++k) { size_t offset = imap[k]*no_genotypes; float prob_data = 0; for (size_t i=0; i<no_genotypes; ++i) { prob_data += (gf_indiv[i] = gf[i]*lt->pl2prob(pls[offset+i])); } for (size_t i=0; i<no_genotypes; ++i) { gf_indiv[i] /= prob_data; } for (size_t i=0; i<no_alleles; ++i) { for (size_t j=0; j<=i; ++j) { int32_t gf_index = bcf_alleles2gt(i,j); MLE_HWE_AF[i] += 0.5*gf_indiv[gf_index]; MLE_HWE_AF[j] += 0.5*gf_indiv[gf_index]; } } } //normalize to frequency mse = 0; float diff; for (size_t i=0; i<no_alleles; ++i) { MLE_HWE_AF[i] /= n; diff = af[i]-MLE_HWE_AF[i]; mse += (diff*diff); af[i] = MLE_HWE_AF[i]; } ++iter; } for (size_t i=0; i<no_alleles; ++i) { for (size_t j=0; j<=i; ++j) { MLE_HWE_GF[bcf_alleles2gt(i,j)] = (i!=j?2:1)*MLE_HWE_AF[i]*MLE_HWE_AF[j]; } } } }