double emFrequency(double *loglike,int numInds, int iter,double start,char *keep,int keepInd){ if(keepInd == 0) return 0.0; double W0; double W1; double W2; // fprintf(stderr,"start=%f\n",start); double p=(double)start; double temp_p=(double)start; double accu=0.00001; double accu2=0; double sum; int it=0; for(it=0;it<iter;it++){ sum= 0; double pl = log(p); double mpl = log(1-p); for(int i=0;i<numInds;i++){ if(keep!=NULL && keep[i]==0) continue; W0=loglike[i*3+0]+2*mpl; W1=loglike[i*3+1]+M_LN2+pl+mpl; W2=loglike[i*3+2]+2*pl; sum+= exp(addProtect2(W1,M_LN2+W2)-M_LN2-addProtect3(W0,W1,W2)); // fprintf(stderr,"%f %f %f\n",W0,W1,W2); if(0&&isnan(sum)){ //fprintf(stderr,"PRE[%d]: W %f\t%f\t%f sum=%f\n",i,W0,W1,W2,sum); exit(0); } } p=sum/keepInd; // fprintf(stderr,"it=%d\tp=%f\tsum=%f\tkeepInd=%d\n",it,p,log(sum),keepInd); if((p-temp_p<accu&&temp_p-p<accu)||(p/temp_p<1+accu2&&p/temp_p>1-accu2)) break; temp_p=p; } if(isnan(p)){ fprintf(stderr,"[emFrequency] caught nan will not exit\n"); fprintf(stderr,"logLike (3*nInd). nInd=%d\n",numInds); //print_array(stderr,loglike,3*numInds); fprintf(stderr,"keepList (nInd)\n"); //print_array(stderr,keep,numInds); fprintf(stderr,"used logLike (3*length(keep))=%d\n",keepInd); for(int ii=0;0&&ii<numInds;ii++){ if(keep!=NULL && keep[ii]==1) fprintf(stderr,"1\t"); for(int gg=0;gg<3;gg++) fprintf(stderr,"%f\t",loglike[ii*3+gg]); fprintf(stderr,"\n"); } sum=0; for(int i=0;i<numInds;i++){ if(keep!=NULL && keep[i]==0) continue; W0=(loglike[i*3+0])*pow(1-p,2); W1=(loglike[i*3+1])*2*p*(1-p); W2=(loglike[i*3+2])*(pow(p,2)); sum+=(W1+2*W2)/(2*(W0+W1+W2)); //fprintf(stderr,"p=%f W %f\t%f\t%f sum=%f loglike: %f\n",p,W0,W1,W2,sum,exp(loglike[i*3+2])*pow(1-p,2)); } p=-999; // exit(0); } return(p); }
// One EM pass over a chunk of sites: for every site/individual pair compute
// genotype posteriors and accumulate the numerators/denominators used to
// update the site frequencies and the per-individual F values.
//
//   pars                 run parameters; this function reads n_ind,
//                        approx_EM and verbose, and uses _F_lock
//   chunk_data           chunk_data[s][i*3 + g] for site s (chunk-relative),
//                        individual i, genotype g = 0,1,2; values are used as
//                        log-scale likelihoods (added to log priors)
//   chunk_abs_start_pos  offset of the chunk's first site in the output arrays
//   chunk_size           number of sites in the chunk
//   output               accumulators and current estimates (site_freq, indF,
//                        site_freq_num/den, indF_num/den, site_tmpprob_var)
//   iter                 iteration number; 0 selects the "initial guess"
//                        path with uniform genotype priors
//
// NOTE(review): addProtect3 and check_interv are defined elsewhere;
// addProtect3 is assumed to be a protected log-sum-exp of three values and
// check_interv to clamp its argument into a valid interval -- confirm.
void EM_iter(params *pars, double **chunk_data, uint64_t chunk_abs_start_pos, uint64_t chunk_size, out_data *output, int iter) {
  // Loop over all sites
  for(uint64_t s = 0; s < chunk_size; s++) {
    uint64_t abs_s = s + chunk_abs_start_pos;
    double p = output->site_freq[abs_s];

    // Skip site if freq == 0
    if(p == 0)
      continue;

    // Loop over all individuals
    for(uint64_t i = 0; i < pars->n_ind; i++) {
      double F = output->indF[i];

      // Genotype priors given frequency p and inbreeding coefficient F
      double p0 = pow(1-p,2) + p*(1-p)*F;
      double p1 = 2*p*(1-p) * (1 - F);
      double p2 = pow(p,2) + p*(1-p)*F;

      // If initial guess assumes uniform priors
      if(iter == 0)
        p0 = p1 = p2 = 1;

      // Log of the normalising constant, then the genotype posteriors
      double norm = addProtect3(log(p0)+chunk_data[s][i*3+0],
                                log(p1)+chunk_data[s][i*3+1],
                                log(p2)+chunk_data[s][i*3+2]);
      double pp0 = p0 * exp(chunk_data[s][i*3+0]-norm);
      double pp1 = p1 * exp(chunk_data[s][i*3+1]-norm);
      double pp2 = p2 * exp(chunk_data[s][i*3+2]-norm);

      // Track, per site, the largest 1 - pp0 seen over individuals
      output->site_tmpprob_var[abs_s] = fmax(output->site_tmpprob_var[abs_s], 1 - pp0);

      double IBD = 0;
      double indF_num = 0;
      double indF_den = 0;

      // P(IBD)
      if(iter == 0) {
        // if initial guess do not use any prior
        IBD = check_interv(1 - (pp1/(2*(1-p)*p)), false);
        indF_num = IBD;
        indF_den = 1;
      } else if(pars->approx_EM) {
        // Vieira et al. algorithm
        double a0 = pp0*(1-p)*p;
        double b0 = pow(1-p,2);
        double c0 = (1-p)*p;
        double a1 = pp1*2*(1-p)*p;
        double b1 = 2*(1-p)*p;
        double c1 = -2*(1-p)*p;
        double a2 = pp2*(1-p)*p;
        double b2 = pow(p,2);
        double c2 = (1-p)*p;

        double s_num0 = a0/(b0+F*c0) + (a0*c0*F)/pow(b0+F*c0,2);
        double s_num1 = a1/(b1+F*c1) + (a1*c1*F)/pow(b1+F*c1,2);
        double s_num2 = a2/(b2+F*c2) + (a2*c2*F)/pow(b2+F*c2,2);
        double s_den0 = (a0*c0)/pow(b0+F*c0,2);
        double s_den1 = (a1*c1)/pow(b1+F*c1,2);
        double s_den2 = (a2*c2)/pow(b2+F*c2,2);

        indF_num = s_num0 - s_num1 + s_num2;
        indF_den = s_den0 - s_den1 + s_den2;

        // x != x is true only for NaN: dump every intermediate when the
        // update degenerates, or unconditionally at high verbosity.
        // uint64_t values are cast to unsigned long long so the format
        // matches the argument type on every platform.
        if( indF_num != indF_num || indF_den != indF_den || indF_num/indF_den != indF_num/indF_den || pars->verbose >= 7 )
          printf("site IBD: %llu %llu %f %f / %f %f %f %f %f %f %f %f %f / %f %f %f %f %f %f %f %f %f / %f %f %f %f %f %f / %f %f\n",
                 (unsigned long long) abs_s, (unsigned long long) i, p, F,
                 chunk_data[s][i*3+0], chunk_data[s][i*3+1], chunk_data[s][i*3+2],
                 p0, p1, p2, pp0, pp1, pp2,
                 a0, b0, c0, a1, b1, c1, a2, b2, c2,
                 s_num0, s_num1, s_num2, s_den0, s_den1, s_den2,
                 indF_num, indF_den);

        IBD = check_interv(indF_num/indF_den, false);
      } else {
        // Hall et al. algorithm
        IBD = pp0 * (1-p) * F / ((1-p) * F + pow(1-p,2) * (1-F)) + pp2 * p * F / (p * F + pow(p,2) * (1-F));
        indF_num = IBD;
        indF_den = 1;
      }

      // Update site freq
      // NOTE(review): updated without the lock -- presumably each site is
      // handled by exactly one thread; confirm against the caller.
      output->site_freq_num[abs_s] += pp1 + pp2 * (2-IBD);
      output->site_freq_den[abs_s] += pp1 + pp2 * (2-IBD) + pp1 + pp0*(2-IBD);

      // Update indiv F. The per-individual accumulators are guarded by
      // _F_lock; snapshot them inside the critical section so the verbose
      // print below does not read them unprotected.
      pthread_mutex_lock(&pars->_F_lock);
      output->indF_num[i] += indF_num;
      output->indF_den[i] += indF_den;
      double cum_F_num = output->indF_num[i];
      double cum_F_den = output->indF_den[i];
      pthread_mutex_unlock(&pars->_F_lock);

      if( pars->verbose >= 7 )
        printf("Ind: %llu\t%.10f %.10f %.10f\tmaf: %f\tindF: %f\tp: %f %f %f\tpp: %f %f %f\tCum_freq: %f (%f/%f)\tCumF: %f (%f/%f)\n",
               (unsigned long long) (i+1),
               chunk_data[s][i*3+0], chunk_data[s][i*3+1], chunk_data[s][i*3+2],
               p, F, p0, p1, p2, pp0, pp1, pp2,
               output->site_freq_num[abs_s]/output->site_freq_den[abs_s], output->site_freq_num[abs_s], output->site_freq_den[abs_s],
               cum_F_num/cum_F_den, cum_F_num, cum_F_den);
    }

    if( pars->verbose >= 6 )
      printf("\t\t%llu\t%f (%f / %f) %f\n",
             (unsigned long long) (abs_s+1),
             output->site_freq_num[abs_s]/output->site_freq_den[abs_s], output->site_freq_num[abs_s], output->site_freq_den[abs_s],
             output->site_prob_var[abs_s]);
  }
}