Exemple #1
0
// Encoding of each bases[] entry -- qual:6, strand:1, base:4
//
// Fills q with one phred-scaled likelihood per genotype (j,k).
//
// em:    error model; only the tables reachable through em->coef are read
// n:     number of entries in bases[]
// m:     number of alleles; q must hold m*m floats
// bases: observed bases; reordered in place (shuffled when n > 255, then sorted)
// q:     (output) q[j*m+k] == q[k*m+j] is the value for genotype (j,k)
//
// Always returns 0.
int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q)
{
	call_aux_t aux;
	int i, j, k, w[32];

	// NOTE(review): a guard `if (m > m) return -1;` used to sit here. It could
	// never fire, so it has been dropped. It was presumably meant to bound m
	// by the capacity of the call_aux_t arrays (indexed by 0..m-1 below) --
	// confirm the intended limit and add a real check.
	memset(q, 0, m * m * sizeof(float));
	if (n == 0) return 0;
	// calculate aux.bsum and aux.fsum
	if (n > 255) { // then sample 255 bases
		ks_shuffle(uint16_t, n, bases);
		n = 255;
	}
	ks_introsort(uint16_t, n, bases);
	memset(w, 0, 32 * sizeof(int));
	memset(&aux, 0, sizeof(call_aux_t));
	for (j = n - 1; j >= 0; --j) { // walk the sorted bases from last to first
		uint16_t b = bases[j];
		// quality clamped to [4,63]; named `qual` so it no longer shadows the output q
		int qual = b>>5 < 4? 4 : b>>5;
		if (qual > 63) qual = 63;
		k = b&0x1f; // strand bit + base code; k&0xf is the base code alone
		aux.fsum[k&0xf] += em->coef->fk[w[k]];
		aux.bsum[k&0xf] += em->coef->fk[w[k]] * em->coef->beta[qual<<16|n<<8|aux.c[k&0xf]];
		++aux.c[k&0xf]; // per-base count (strand ignored)
		++w[k];         // per-(strand,base) count
	}
	// generate likelihood
	for (j = 0; j < m; ++j) {
		float tmp1; // sum of bsum over the alleles that are not part of the genotype
		int tmp2;   // number of observed bases that are not part of the genotype
		// homozygous
		for (k = 0, tmp1 = 0.0, tmp2 = 0; k < m; ++k) {
			if (k == j) continue;
			tmp1 += aux.bsum[k]; tmp2 += aux.c[k];
		}
		if (tmp2) q[j*m+j] = tmp1;
		// heterozygous
		for (k = j + 1; k < m; ++k) {
			int cjk = aux.c[j] + aux.c[k];
			for (i = 0, tmp2 = 0, tmp1 = 0.0; i < m; ++i) {
				if (i == j || i == k) continue;
				tmp1 += aux.bsum[i]; tmp2 += aux.c[i];
			}
			if (tmp2) {
				q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]] + tmp1;
			} else q[j*m+k] = q[k*m+j] = -4.343 * em->coef->lhet[cjk<<8|aux.c[k]]; // all the bases are either j or k
		}
		// clamp negative entries of row j to zero
		for (k = 0; k < m; ++k) if (q[j*m+k] < 0.0) q[j*m+k] = 0.0;
	}
	return 0;
}
Exemple #2
0
//
// Compute the genotype likelihood matrix for one sample from its pileup.
//
// em: error model to fit to data (only em->coef tables are read here)
// m: number of alleles across all samples
// n: number of bases observed in sample
// bases[i]: bases observed in pileup [6 bit quality|1 bit strand|4 bit base];
//           the array is shuffled (when n > 255) and sorted in place
// q[i*m+j]: (Output) phred-scaled likelihood of each genotype (i,j)
//
// Always returns 0.
int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q)
{
    /* per-base accumulators; acc.c counts each base regardless of strand */
    call_aux_t acc;
    /* how many times each (strand, base) pair has been seen so far */
    int per_strand[32];

    /* start from an all-zero result */
    memset(q, 0, m * m * sizeof(float));
    if (n == 0) return 0;

    /* at most 255 bases are used: shuffle first so the kept ones are a sample */
    if (n > 255) {
        ks_shuffle(uint16_t, n, bases);
        n = 255;
    }
    ks_introsort(uint16_t, n, bases);

    memset(&acc, 0, sizeof(call_aux_t));
    memset(per_strand, 0, 32 * sizeof(int));

    /* accumulate acc.fsum / acc.bsum / acc.c, walking the sorted array backwards */
    for (int pos = n; pos-- > 0; ) {
        const uint16_t obs = bases[pos];
        /* quality, clamped to [4, 63] */
        int qual = obs >> 5;
        if (qual < 4) qual = 4;
        else if (qual > 63) qual = 63;
        /* low five bits: strand + base; low four bits: base only */
        const int strand_base = obs & 0x1f;
        const int base = obs & 0xf;
        /* count of this (strand, base) pair before the current observation */
        const int seen = per_strand[strand_base]++;

        acc.fsum[base] += em->coef->fk[seen];
        acc.bsum[base] += em->coef->fk[seen] * em->coef->beta[qual<<16|n<<8|acc.c[base]];
        ++acc.c[base];
    }

    /* fill the matrix one row (first allele) at a time */
    for (int a1 = 0; a1 < m; ++a1) {
        float err_sum = 0.0;   /* bsum over alleles outside the genotype */
        float fsum_rest = 0.0; /* fsum over the same alleles; accumulated but not consumed */
        int other_cnt = 0;     /* number of bases outside the genotype */

        /* homozygous a1/a1 */
        for (int o = 0; o < m; ++o) {
            if (o != a1) {
                err_sum += acc.bsum[o];
                other_cnt += acc.c[o];
                fsum_rest += acc.fsum[o];
            }
        }
        if (other_cnt != 0)
            q[a1*m + a1] = err_sum;

        /* heterozygous a1/a2, mirrored into both triangles */
        for (int a2 = a1 + 1; a2 < m; ++a2) {
            int pair_cnt = acc.c[a1] + acc.c[a2];

            err_sum = fsum_rest = 0.0;
            other_cnt = 0;
            for (int o = 0; o < m; ++o) {
                if (o != a1 && o != a2) {
                    err_sum += acc.bsum[o];
                    other_cnt += acc.c[o];
                    fsum_rest += acc.fsum[o];
                }
            }

            /* when every base is a1 or a2 only the lhet term contributes */
            const float lk = other_cnt
                ? -4.343 * em->coef->lhet[pair_cnt<<8|acc.c[a2]] + err_sum
                : -4.343 * em->coef->lhet[pair_cnt<<8|acc.c[a2]];
            q[a2*m + a1] = lk;
            q[a1*m + a2] = lk;
        }

        /* no negative values in the finished row */
        for (int col = 0; col < m; ++col) {
            float *cell = &q[a1*m + col];
            if (*cell < 0.0) *cell = 0.0;
        }
    }

    return 0;
}
Exemple #3
0
// Encoding of each bases[] entry -- qual:6, strand:1, base:4
//
// Fills q with one value per genotype (g1,g2), with q[g1*m+g2] == q[g2*m+g1].
//
// em:    error model; only the tables reachable through em->coef are read
// n:     number of entries in bases[]
// m:     number of alleles; q must hold m*m floats
// bases: observed bases; reordered in place (shuffled when n > 255, then sorted)
// q:     (output) genotype matrix, zeroed on every call, including n == 0
//
// Always returns 0.
int errmod_cal(const errmod_t *em, int n, int m, uint16_t *bases, float *q)
{
  //reset results array before the no-data exit, so q is defined for n == 0 too
  memset(q, 0, m * m * sizeof(float));
  if (n == 0)
    return 0; //no data

  //number of times each (strand,base) pair has been seen so far
  int w[32];
  memset(w, 0, 32 * sizeof(int));

  call_aux_t aux;
  memset(&aux, 0, sizeof(call_aux_t));

  //sample 255 if depth >255
  if (n > 255) {
    ks_shuffle(uint16_t, n, bases);
    n = 255;
  }
  ks_introsort(uint16_t, n, bases);

  for (int j = n - 1; j >= 0; --j) {
    uint16_t b = bases[j];
    //cap quality at [4,63]; named qual so it does not shadow the output q
    int qual = b>>5 < 4? 4 : b>>5;
    if (qual > 63)
      qual = 63;

    int k = b&0x1f; //strand bit + base code; k&0xf is the base code alone

    aux.bsum[k&0xf] += em->coef->fk[w[k]] * em->coef->beta[qual<<16|n<<8|aux.c[k&0xf]];
    ++aux.c[k&0xf];
    ++w[k];
  }

  // generate likelihood; the matrix is m x m, matching the memset above
  // (identical to the previous hard-coded 5 x 5 layout when m == 5)
  for (int g1 = 0; g1 < m; ++g1) {
    for (int g2 = g1; g2 < m; ++g2) {
      if (g1 == g2) {
        // homozygous: every base that is not g1 counts as an error
        for (int k = 0; k < m; ++k) {
          if (k != g1)
            q[g1*m+g2] += aux.bsum[k];
        }
      } else {
        // heterozygous: bases that are neither g1 nor g2 count as errors
        int other = 0;
        float tmp1 = 0;
        for (int k = 0; k < m; ++k) {
          if (k != g1 && k != g2) {
            tmp1 += aux.bsum[k];
            other = 1;
          }
        }
        //total depth for allele g1 + allele g2
        int cjk = aux.c[g1] + aux.c[g2];
        //lhet lookup keyed by (cjk, count of g2); per the original author's note
        //this is the log of a binomial term with p=0.5 -- TODO confirm against the table setup
        if (other)
          q[g1*m+g2] = q[g2*m+g1] = -4.343 * em->coef->lhet[cjk<<8|aux.c[g2]] + tmp1;
        else
          q[g1*m+g2] = q[g2*m+g1] = -4.343 * em->coef->lhet[cjk<<8|aux.c[g2]]; // all the bases are either g1 or g2
      }
      //set to zero if negative; clamp the mirrored cell too so q stays symmetric
      if (q[g1*m+g2] < 0.0)
        q[g1*m+g2] = q[g2*m+g1] = 0.0;
    }
  }

  return 0;
}