// qual:6, strand:1, base:4 int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q) { call_aux_t aux; int i, j, k, w[32]; if (m > m) return -1; memset(q, 0, m * m * sizeof(float)); if (n == 0) return 0; // calculate aux.esum and aux.fsum if (n > 255) { // then sample 255 bases ks_shuffle(uint16_t, n, bases); n = 255; } ks_introsort(uint16_t, n, bases); memset(w, 0, 32 * sizeof(int)); memset(&aux, 0, sizeof(call_aux_t)); for (j = n - 1; j >= 0; --j) { // calculate esum and fsum uint16_t b = bases[j]; int q = b>>5 < 4? 4 : b>>5; if (q > 63) q = 63; k = b&0x1f; aux.fsum[k&0xf] += em->coef->fk[w[k]]; aux.bsum[k&0xf] += em->coef->fk[w[k]] * em->coef->beta[q<<16|n<<8|aux.c[k&0xf]]; ++aux.c[k&0xf]; ++w[k]; } // generate likelihood for (j = 0; j != m; ++j) { float tmp1, tmp3; int tmp2, bar_e; // homozygous for (k = 0, tmp1 = tmp3 = 0.0, tmp2 = 0; k != m; ++k) { if (k == j) continue; tmp1 += aux.bsum[k]; tmp2 += aux.c[k]; tmp3 += aux.fsum[k]; } if (tmp2) { bar_e = (int)(tmp1 / tmp3 + 0.499); if (bar_e > 63) bar_e = 63; q[j*m+j] = tmp1; } // heterozygous for (k = j + 1; k < m; ++k) { int cjk = aux.c[j] + aux.c[k]; for (i = 0, tmp2 = 0, tmp1 = tmp3 = 0.0; i < m; ++i) { if (i == j || i == k) continue; tmp1 += aux.bsum[i]; tmp2 += aux.c[i]; tmp3 += aux.fsum[i]; } if (tmp2) { bar_e = (int)(tmp1 / tmp3 + 0.499); if (bar_e > 63) bar_e = 63; q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]] + tmp1; } else q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]]; // all the bases are either j or k } for (k = 0; k != m; ++k) if (q[j*m+k] < 0.0) q[j*m+k] = 0.0; } return 0; }
// // em: error model to fit to data // m: number of alleles across all samples // n: number of bases observed in sample // bases[i]: bases observed in pileup [6 bit quality|1 bit strand|4 bit base] // q[i*m+j]: (Output) phred-scaled likelihood of each genotype (i,j) int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q) { // Aux // aux.c is total count of each base observed (ignoring strand) call_aux_t aux; // Loop variables int i, j, k; // The total count of each base observed per strand int w[32]; /* zero out q */ memset(q, 0, m * m * sizeof(float)); if (n == 0) return 0; // calculate aux.esum and aux.fsum if (n > 255) { // then sample 255 bases ks_shuffle(uint16_t, n, bases); n = 255; } ks_introsort(uint16_t, n, bases); /* zero out w and aux */ memset(w, 0, 32 * sizeof(int)); memset(&aux, 0, sizeof(call_aux_t)); for (j = n - 1; j >= 0; --j) { // calculate esum and fsum uint16_t b = bases[j]; /* extract quality and cap at 63 */ int qual = b>>5 < 4? 4 : b>>5; if (qual > 63) qual = 63; /* extract base ORed with strand */ int basestrand = b&0x1f; /* extract base */ int base = b&0xf; aux.fsum[base] += em->coef->fk[w[basestrand]]; aux.bsum[base] += em->coef->fk[w[basestrand]] * em->coef->beta[qual<<16|n<<8|aux.c[base]]; ++aux.c[base]; ++w[basestrand]; } // generate likelihood for (j = 0; j < m; ++j) { float tmp1, tmp3; int tmp2; // homozygous for (k = 0, tmp1 = tmp3 = 0.0, tmp2 = 0; k < m; ++k) { if (k == j) continue; tmp1 += aux.bsum[k]; tmp2 += aux.c[k]; tmp3 += aux.fsum[k]; } if (tmp2) { q[j*m+j] = tmp1; } // heterozygous for (k = j + 1; k < m; ++k) { int cjk = aux.c[j] + aux.c[k]; for (i = 0, tmp2 = 0, tmp1 = tmp3 = 0.0; i < m; ++i) { if (i == j || i == k) continue; tmp1 += aux.bsum[i]; tmp2 += aux.c[i]; tmp3 += aux.fsum[i]; } if (tmp2) { q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]] + tmp1; } else q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]]; // all the bases are either j or k } /* clamp to greater than 0 */ for (k = 0; k < m; ++k) if (q[j*m+k] < 0.0) q[j*m+k] = 0.0; } return 0; }
// qual:6, strand:1, base:4 int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q) { // fprintf(stderr,"n=%d\n",n); if (n == 0) return 0; //no data //reset results array; memset(q, 0, m * m * sizeof(float)); //set "counts" array int w[32]; memset(w, 0, 32 * sizeof(int)); for(int i=0;0&&i<32;i++) fprintf(stderr,"pre w[%d]=%d\n",i,w[i]); call_aux_t aux; memset(&aux, 0, sizeof(call_aux_t)); //sample 255 if depth >255 if ((n > 255)) { // then sample 255 bases //THIS MAKES ks_shuffle(uint16_t, n, bases); n = 255; } ks_introsort(uint16_t, n, bases); for (int j = n - 1; j >= 0; --j) { uint16_t b = bases[j]; // fprintf(stderr,"j=%d q=%d strand=%d base=%d\n",j,b>>5,0x1&(b>>4),b&0xf); // fprintf(stderr,"j=%d q=%d\tc=%c\t",j,(b>>5) + 33,(b>>5) + 33); //cap quality at [4,63] int q = b>>5 < 4? 4 : b>>5; if (q > 63) q = 63; int k = b&0x1f; aux.bsum[k&0xf] += em->coef->fk[w[k]] * em->coef->beta[q<<16|n<<8|aux.c[k&0xf]]; ++aux.c[k&0xf]; ++w[k]; for(int i=0;0&&i<32;i++) fprintf(stderr,"w[%d]=%d\n",i,w[i]); } if(0){ //floating point inprecision compared with samtools binary output. But is correct //the genotype like p(data|A1=g1,A2=g2) = p(data|A1=g2,A2=g1) for (int g1 = 0; g1 <5; ++g1) {//allele1=0,1,2,3,4 for (int g2 = g1; g2<5; ++g2) {//allele2=0,1,2,3,4 if(g1!=g2){ // A1!=A2 - heterozygoues int cjk = aux.c[g1] + aux.c[g2];//total depth for allele g1+allele g2 //binomial when ignoring non A1/A2 alleles: Bin (n,k,p);n=cjk=#g1+#g2 , k= aux.c[g2] = #g2 ; p=0.5. returns log q[g1*5+g2] = -4.343 * em->coef->lhet[cjk<<8|aux.c[g2]]; } for (int k = 0; k <5; ++k){ if(k!=g1 && k!=g2) //if a read has a non A1/A2 alleles it is an error. add the log of the prob of these reads q[g1*5+g2] += aux.bsum[k]; } //mirror if(g1!=g2) q[g2*5+g1] = q[g1*5+g2]; if (q[g1*5+g2] < 0.0) q[g1*5+g2] = 0.0; } } return 0; } // generate likelihood THIS WORKS PERFECTLY june 4 ande for (int g1 = 0; g1 <5; ++g1) {//j=0,1,2,3,4 for (int g2 = g1; g2<5; ++g2) {//j=0,1,2,3,4 if(g1==g2){ for (int k = 0; k <5; ++k){ if(k!=g1) q[g1*5+g2] += aux.bsum[k]; } } else{ int other=0; float tmp1=0; for (int k = 0; k < 5; ++k) if (k != g1 && k != g2) { tmp1 += aux.bsum[k]; other = 1; } int cjk = aux.c[g1] + aux.c[g2]; if (other) q[g1*5+g2] = q[g2*5+g1] = -4.343 * em->coef->lhet[cjk<<8|aux.c[g2]] + tmp1; else q[g1*5+g2] = q[g2*5+g1] = -4.343 * em->coef->lhet[cjk<<8|aux.c[g2]]; // all the bases are either j or k } if (q[g1*5+g2] < 0.0) q[g1*5+g2] = 0.0; } } return 0; //old original almost for (int j = 0; j != m; ++j) {//j=0,1,2,3,4 // homozygous for (int k = 0; k != m; ++k){//only updates if k!=j and aux.c[k]!=0 fprintf(stderr,"\t-> j=%d aux.c[%d]=%d aux.bsum[%d]=%f res=%d ",j,k,aux.c[k],k,aux.bsum[k],j*m+j); if (k != j && aux.c[k]) { fprintf(stderr,"USING \n"); q[j*m+j] += aux.bsum[k]; }else fprintf(stderr,"skipping\n"); } // heterozygous for (int k = j + 1; k < m; ++k) {//k=1,...,4 float tmp1=0.0; int isHe=0; for (int i = 0; i < m; ++i) if (i != j && i != k) { tmp1 += aux.bsum[i]; isHe += aux.c[i]; } int cjk = aux.c[j] + aux.c[k]; fprintf(stderr,"j=%d k=%d RES=%d\n",j,k,j*m+k); if(1) { if (isHe) q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]] + tmp1; else q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]]; // all the bases are either j or k }else{ q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]]; if(isHe) q[j*m+k] = q[k*m+j] = q[k*m+j]+tmp1; } } //set to zero if negative, shoulnd't happen for (int k = 0; k != m; ++k) if (q[j*m+k] < 0.0) q[j*m+k] = 0.0; } // exit(0); return 0; }