/******************************************************************************* Update the motif counts from a putative "real" start. This is done in three stages. In stage 0, all motifs sizes 3-6bp in the region with spacer 3-15bp are counted. In stage 1, only the best motif and all its subsets are counted (e.g. for AGGAG, we would count AGGAG, AGGA, GGAG, AGG, GGA, and GAG). In stage 2, only the best single motif is counted. *******************************************************************************/ void update_motif_counts(double mcnt[4][4][4096], double *zero, unsigned char *seq, unsigned char *rseq, int slen, struct _node *nod, int stage) { int i, j, k, start, spacendx; unsigned char *wseq; struct _motif *mot = &(nod->mot); if(nod->type == STOP || nod->edge == 1) return; if(mot->len == 0) { *zero += 1.0; return; } if(nod->strand == 1) { wseq = seq; start = nod->ndx; } else { wseq = rseq; start = slen-1-nod->ndx; } /* Stage 0: Count all motifs. If a motif is detected, */ /* it is counted for every distance in stage 0. This */ /* is done to make sure off-distance good motifs are */ /* recognized. */ if(stage == 0) { for(i = 3; i >= 0; i--) { for(j = start-18-i; j <= start-6-i; j++) { if(j < 0) continue; if(j <= start-16-i) spacendx = 3; else if(j <= start-14-i) spacendx = 2; else if(j >= start-7-i) spacendx = 1; else spacendx = 0; for(k = 0; k < 4; k++) mcnt[i][k][mer_ndx(i+3, wseq, j)] += 1.0; } } } /* Stage 1: Count only the best motif, but also count */ /* all its sub-motifs. */ else if(stage == 1) { mcnt[mot->len-3][mot->spacendx][mot->ndx] += 1.0; for(i = 0; i < mot->len-3; i++) { for(j = start-(mot->spacer)-(mot->len); j <= start-(mot->spacer)-(i+3); j++) { if(j < 0) continue; if(j <= start-16-i) spacendx = 3; else if(j <= start-14-i) spacendx = 2; else if(j >= start-7-i) spacendx = 1; else spacendx = 0; mcnt[i][spacendx][mer_ndx(i+3, wseq, j)] += 1.0; } } } /* Stage 2: Only count the highest scoring motif. 
*/ else if(stage == 2) mcnt[mot->len-3][mot->spacendx][mot->ndx] += 1.0; }
/*******************************************************************************
  Builds a 'len'-mer background for the whole sequence.

  Every window of 'len' bases on both the forward (seq) and reverse (rseq)
  strand is tallied, and bg[] receives the relative frequency of each word.
  bg must have room for 4^len doubles.

  The tallies are accumulated directly in bg[] (whole numbers are exact in
  a double), so no scratch buffer is allocated and there is no allocation
  failure to handle.  If the sequence is shorter than 'len' there are no
  windows at all; bg[] is then left as all zeros instead of being filled
  with 0/0 results.

  NOTE(review): mer_ndx is defined elsewhere; this relies on it returning an
  index in [0, 4^len) for every window, as the original code did.
*******************************************************************************/
void calc_mer_bg(int len, unsigned char *seq, unsigned char *rseq, int slen,
                 double *bg) {
  int i, size = 1;
  double total = 0.0;

  for(i = 1; i <= len; i++) size *= 4;
  for(i = 0; i < size; i++) bg[i] = 0.0;

  for(i = 0; i < slen-len+1; i++) {
    bg[mer_ndx(len, seq, i)] += 1.0;
    bg[mer_ndx(len, rseq, i)] += 1.0;
    total += 2.0;
  }

  /* Nothing counted: keep the all-zero table rather than dividing by zero. */
  if(total == 0.0) return;

  for(i = 0; i < size; i++) bg[i] /= total;
}
void calc_dicodon_gene(struct _training *tinf, unsigned char *seq, unsigned char *rseq, int slen, struct _node *nod, int dbeg) { int i, path, counts[4096], glob = 0; int left, right, in_gene; double prob[4096], bg[4096]; for(i = 0; i < 4096; i++) { counts[i] = 0; prob[i] = 0.0; bg[i] = 0.0; } left = -1; right = -1; calc_mer_bg(6, seq, rseq, slen, bg); path = dbeg; in_gene = 0; while(path != -1) { if(nod[path].strand == -1 && nod[path].type != STOP) { in_gene = -1; left = slen-nod[path].ndx-1; } if(nod[path].strand == 1 && nod[path].type == STOP) { in_gene = 1; right = nod[path].ndx+2; } if(in_gene == -1 && nod[path].strand == -1 && nod[path].type == STOP) { right = slen-nod[path].ndx+1; for(i = left; i < right-5; i+=3) { counts[mer_ndx(6, rseq, i)]++; glob++; } in_gene = 0; } if(in_gene == 1 && nod[path].strand == 1 && nod[path].type != STOP) { left = nod[path].ndx; for(i = left; i < right-5; i+=3) { counts[mer_ndx(6, seq, i)]++; glob++; } in_gene = 0; } path = nod[path].traceb; } for(i = 0; i < 4096; i++) { prob[i] = (counts[i]*1.0)/(glob*1.0); if(prob[i] == 0 && bg[i] != 0) tinf->gene_dc[i] = -5.0; else if(bg[i] == 0) tinf->gene_dc[i] = 0.0; else tinf->gene_dc[i] = log(prob[i]/bg[i]); if(tinf->gene_dc[i] > 5.0) tinf->gene_dc[i] = 5.0; if(tinf->gene_dc[i] < -5.0) tinf->gene_dc[i] = -5.0; } }
/*******************************************************************************
  For a given start, record the base composition of the upstream region at
  positions -1 and -2 and -15 to -44.  This will be used to supplement the
  SD (or other) motif finder with additional information.

  'seq' is indexed directly at start-i, where start is 'pos' when strand is
  1 and slen-1-pos otherwise.  NOTE(review): this presumably means the
  caller passes the sequence matching 'strand' -- confirm against callers.

  tinf->ups_comp[slot][base] is incremented for each sampled position; slot
  runs 0..31 in order of increasing distance from the start.  Positions
  that would fall before the beginning of the sequence are skipped, but
  the slot counter still advances so that every slot always refers to the
  same upstream offset.
*******************************************************************************/
void count_upstream_composition(unsigned char *seq, int slen, int strand,
                                int pos, struct _training *tinf) {
  int i, start, count = 0;

  if(strand == 1) start = pos;
  else start = slen-1-pos;

  for(i = 1; i < 45; i++) {
    if(i > 2 && i < 15) continue;
    /* Starts near the sequence edge have no base this far upstream. */
    if(start-i >= 0) tinf->ups_comp[count][mer_ndx(1, seq, start-i)]++;
    count++;
  }
}