Пример #1
0
/*******************************************************************************
  Update the motif counts from a putative "real" start.  This is done in three
  stages.  In stage 0, all motifs sizes 3-6bp in the region with spacer 3-15bp
  are counted.  In stage 1, only the best motif and all its subsets are
  counted (e.g. for AGGAG, we would count AGGAG, AGGA, GGAG, AGG, GGA, and
  GAG).  In stage 2, only the best single motif is counted.

  mcnt   - counts, indexed as [motif length - 3][spacer bin][motif index]
  zero   - incremented (instead of mcnt) when the node's motif has length 0
  seq    - sequence scanned when nod->strand == 1
  rseq   - sequence scanned otherwise; the start is then slen-1-nod->ndx
  slen   - sequence length
  nod    - node to count; STOP nodes and nodes with edge == 1 are ignored
  stage  - 0, 1 or 2 as described above; any other value counts nothing
*******************************************************************************/
void update_motif_counts(double mcnt[4][4][4096], double *zero, unsigned char
                         *seq, unsigned char *rseq, int slen, struct _node *nod,
                         int stage) {
  int i, j, k, start, spacendx, ndx;
  unsigned char *wseq;
  struct _motif *mot = &(nod->mot);

  if(nod->type == STOP || nod->edge == 1) return;
  if(mot->len == 0) { *zero += 1.0; return; }

  if(nod->strand == 1) { wseq = seq; start = nod->ndx; }
  else { wseq = rseq; start = slen-1-nod->ndx; }

  /* Stage 0:  Count all motifs.  If a motif is detected, */
  /* it is counted for every distance in stage 0.  This   */
  /* is done to make sure off-distance good motifs are    */
  /* recognized.  Because every spacer bin is incremented */
  /* no bin needs to be worked out here, and the motif    */
  /* index is looked up once per window.                  */
  if(stage == 0) {
    for(i = 3; i >= 0; i--) {
      for(j = start-18-i; j <= start-6-i; j++) {
        if(j < 0) continue;   /* window would begin before the sequence */
        ndx = mer_ndx(i+3, wseq, j);
        for(k = 0; k < 4; k++) mcnt[i][k][ndx] += 1.0;
      }
    }
  }
  /* Stage 1:  Count only the best motif, but also count  */
  /* all its sub-motifs.                                  */
  else if(stage == 1) {
    mcnt[mot->len-3][mot->spacendx][mot->ndx] += 1.0;
    for(i = 0; i < mot->len-3; i++) {
      /* Slide a window of i+3 bases across the span of the best motif. */
      for(j = start-(mot->spacer)-(mot->len); j <= start-(mot->spacer)-(i+3);
          j++) {
        if(j < 0) continue;
        /* Gap between the window's end and the start is start-j-(i+3): */
        /* 13 or more -> bin 3, 11-12 -> bin 2, 4 or less -> bin 1,      */
        /* and 5-10 -> bin 0.                                            */
        if(j <= start-16-i) spacendx = 3;
        else if(j <= start-14-i) spacendx = 2;
        else if(j >= start-7-i) spacendx = 1;
        else spacendx = 0;
        mcnt[i][spacendx][mer_ndx(i+3, wseq, j)] += 1.0;
      }
    }
  }
  /* Stage 2:  Only count the highest scoring motif. */
  else if(stage == 2) mcnt[mot->len-3][mot->spacendx][mot->ndx] += 1.0;
}
Пример #2
0
/* Builds a 'len'-mer background for whole sequence.                         */
/*                                                                           */
/* Every window of 'len' bases in 'seq' and in 'rseq' is tallied under the   */
/* index returned by mer_ndx(), and bg[k] ends up holding the fraction of    */
/* all tallied windows that had index k.  'bg' must have room for 4^len      */
/* doubles; every one of them is overwritten.                                */
/*                                                                           */
/* The tallies are accumulated directly in 'bg' (whole numbers are exact in  */
/* a double), so no scratch buffer has to be allocated and there is no       */
/* allocation failure to handle.  If the sequence is shorter than 'len'      */
/* there are no windows at all; 'bg' is then left as all zeros instead of    */
/* being filled with the result of 0/0.                                      */
void calc_mer_bg(int len, unsigned char *seq, unsigned char *rseq, int slen,
                 double *bg) {
  int i, size = 1;
  double total = 0.0;

  for(i = 1; i <= len; i++) size *= 4;
  for(i = 0; i < size; i++) bg[i] = 0.0;

  for(i = 0; i < slen-len+1; i++) {
    bg[mer_ndx(len, seq, i)] += 1.0;
    bg[mer_ndx(len, rseq, i)] += 1.0;
    total += 2.0;
  }

  /* Nothing was counted: keep the zeros rather than dividing by zero. */
  if(total == 0.0) return;

  for(i = 0; i < size; i++) bg[i] /= total;
}
Пример #3
0
/* Fills tinf->gene_dc with a log-ratio score for each of the 4096 possible  */
/* 6-mers: the frequency of the 6-mer inside the genes found on a traceback  */
/* path versus its background frequency from calc_mer_bg().                  */
/*                                                                           */
/* tinf  - receives the 4096 scores in gene_dc; each is limited to [-5, 5]   */
/* seq   - sequence scanned for genes whose nodes have strand == 1           */
/* rseq  - sequence scanned for genes whose nodes have strand == -1          */
/* slen  - sequence length                                                   */
/* nod   - node array; the path is followed through the 'traceb' links       */
/* dbeg  - index of the node the traceback begins at; -1 ends the walk       */
/*                                                                           */
/* If the path contains no complete gene, no 6-mer is counted.  The table is */
/* then set to 0.0 everywhere (no preference) rather than being computed     */
/* from 0/0.                                                                 */
void calc_dicodon_gene(struct _training *tinf, unsigned char *seq, unsigned
                       char *rseq, int slen, struct _node *nod, int dbeg) {
  int i, path, counts[4096], glob = 0;
  int left, right, in_gene;
  double prob, bg[4096];

  for(i = 0; i < 4096; i++) { counts[i] = 0; bg[i] = 0.0; }
  left = -1; right = -1;
  calc_mer_bg(6, seq, rseq, slen, bg);

  /* Walk the path.  in_gene is 1 or -1 while a gene on that strand has had  */
  /* its first boundary seen, and 0 otherwise.                               */
  path = dbeg; in_gene = 0;
  while(path != -1) {
    /* Reverse strand: a non-STOP node opens the gene... */
    if(nod[path].strand == -1 && nod[path].type != STOP) {
      in_gene = -1;
      left = slen-nod[path].ndx-1;
    }
    /* Forward strand: a STOP node opens the gene... */
    if(nod[path].strand == 1 && nod[path].type == STOP) {
      in_gene = 1;
      right = nod[path].ndx+2;
    }
    /* ...the reverse-strand STOP closes it; count 6-mers in steps of 3. */
    if(in_gene == -1 && nod[path].strand == -1 && nod[path].type == STOP) {
      right = slen-nod[path].ndx+1;
      for(i = left; i < right-5; i+=3) {
        counts[mer_ndx(6, rseq, i)]++;
        glob++;
      }
      in_gene = 0;
    }
    /* ...the forward-strand non-STOP node closes it; count likewise. */
    if(in_gene == 1 && nod[path].strand == 1 && nod[path].type != STOP) {
      left = nod[path].ndx;
      for(i = left; i < right-5; i+=3) { counts[mer_ndx(6, seq, i)]++; glob++; }
      in_gene = 0;
    }
    path = nod[path].traceb;
  }

  /* No 6-mers counted: dividing by glob would put NaN in every entry. */
  if(glob == 0) {
    for(i = 0; i < 4096; i++) tinf->gene_dc[i] = 0.0;
    return;
  }

  for(i = 0; i < 4096; i++) {
    prob = (counts[i]*1.0)/(glob*1.0);
    /* Never seen in a gene but present in the background: lowest score. */
    if(prob == 0 && bg[i] != 0) tinf->gene_dc[i] = -5.0;
    /* Absent from the background: no ratio can be formed, stay neutral. */
    else if(bg[i] == 0) tinf->gene_dc[i] = 0.0;
    else tinf->gene_dc[i] = log(prob/bg[i]);
    if(tinf->gene_dc[i] > 5.0) tinf->gene_dc[i] = 5.0;
    if(tinf->gene_dc[i] < -5.0) tinf->gene_dc[i] = -5.0;
  }
}
Пример #4
0
/*******************************************************************************
  For a given start, record the base composition of the upstream region at
  positions -1 and -2 and -15 to -44.  This will be used to supplement the
  SD (or other) motif finder with additional information.

  seq    - sequence the bases are read from
  slen   - sequence length
  strand - 1 means 'pos' is used as the start directly; any other value means
           the start is slen-1-pos
  pos    - position of the start
  tinf   - ups_comp[row][base] is incremented, with one row per recorded
           upstream position (32 rows, in the order -1, -2, -15, ..., -44)

  Upstream positions that fall before the beginning of the sequence are
  skipped, but they still use up their row so that the remaining positions
  land in the same rows regardless of how close the start is to the edge.
*******************************************************************************/
void count_upstream_composition(unsigned char *seq, int slen, int strand,
                                int pos, struct _training *tinf) {
  int i, start, count = 0;
  if(strand == 1) start = pos;
  else start = slen-1-pos;

  for(i = 1; i < 45; i++) {
    if(i > 2 && i < 15) continue;   /* positions -3 to -14 are not recorded */
    /* Do not read before the first base of the sequence. */
    if(start-i >= 0) tinf->ups_comp[count][mer_ndx(1, seq, start-i)]++;
    count++;
  }
}