Exemple #1
0
Fichier : seq.c Projet : CPFL/gmeme
/**************************************************************************
 * Prepare a sequence for recognition by
 *  - making sure it is uppercase,
 *  - making sure it doesn't contain illegal characters,
 *  - adding flanking Xs to match START/END states, and
 *  - converting it to an integer format
 *  - computing cumulative GC counts
 *
 * In the integer form, each character in the sequence is replaced by
 * the index of that character in the alphabet array.  Thus, if the
 * alphabet is 'ACGT', every occurence of the letter 'G' in the
 * sequence will be represented by the index 2.
 **************************************************************************/
void prepare_sequence
  (SEQ_T* sequence, ALPH_T alph)
{
  int i_seq;        // Index in the sequence.
  int badchar;      // Number of characters converted.
  char wildcard;    // Wildcard character

  wildcard = alph_wildcard(alph);
  badchar = 0;

  for (i_seq = 0; i_seq < get_seq_length(sequence); i_seq++) {
    // Make sure the sequence is uppercase.
    if (islower((int)(sequence->sequence)[i_seq])) {
      (sequence->sequence)[i_seq] = toupper((int)(sequence->sequence)[i_seq]);
    }

    // Convert non-alphabetic characters to ambiguous.
    if (alph_index(alph, (sequence->sequence)[i_seq]) == -1) {
      fprintf(stderr, "%c -> %c\n", (sequence->sequence)[i_seq], wildcard);
      (sequence->sequence)[i_seq] = wildcard;
      badchar++;
    }
  }

  // Tell the user about the conversions.
  if (badchar) {
    fprintf(stderr, "Warning: converted %d non-alphabetic ", badchar);
    fprintf(stderr, "characters to %c in sequence %s.\n", wildcard, 
        get_seq_name(sequence));
  }

  // Add flanking X's.
  add_flanking_xs(sequence, alph);

  // Make the integer sequence.
  sequence->intseq = (int *)mm_malloc(sizeof(int) * get_seq_length(sequence));
  for (i_seq = 0; i_seq < get_seq_length(sequence); i_seq++) {
    (sequence->intseq)[i_seq]
      = alph_index(alph, (sequence->sequence)[i_seq]);
  }

  //
  // Get cumulative GC counts.
  //
  if (alph == DNA_ALPH) {
    int len = get_seq_length(sequence);
    char c = (sequence->sequence)[0];		// first character

    sequence->gc = (int *)mm_malloc(sizeof(int) * get_seq_length(sequence));

    // set count at first position
    (sequence->gc)[0] = (c == 'G' || c == 'C') ? 1 : 0;
    // set cumulative counts at rest of postitions
    for (i_seq = 1; i_seq < len; i_seq++) {
      c = (sequence->sequence)[i_seq];
      (sequence->gc)[i_seq] = (c == 'G' || c == 'C') ?
        (sequence->gc)[i_seq-1] + 1 : (sequence->gc)[i_seq-1];
    }
  }
}
Exemple #2
0
/*************************************************************************
 * Calculate the log odds score for a single motif-sized window.
 *************************************************************************/
static inline BOOLEAN_T score_motif_site(
  ALPH_T alph,
  char *seq,
  PSSM_T *pssm,
  double *score // OUT
) {
  int asize = alph_size(alph, ALPH_SIZE);
  MATRIX_T* pssm_matrix = pssm->matrix;
  double scaled_log_odds = 0.0;

  // For each position in the site
  int motif_position;
  for (motif_position = 0; motif_position < pssm->w; motif_position++) {

    char c = seq[motif_position];
    int aindex = alph_index(alph, c);
    // Check for gaps and ambiguity codes at this site
    if(aindex == -1 || aindex >= asize) return FALSE;

    scaled_log_odds += get_matrix_cell(motif_position, aindex, pssm_matrix);
  }

  *score = get_unscaled_pssm_score(scaled_log_odds, pssm);

  // Handle scores that are out of range
  if ((int) scaled_log_odds >= get_array_length(pssm->pv)) {
    scaled_log_odds = (float)(get_array_length(pssm->pv) - 1);
    *score = scaled_to_raw(scaled_log_odds, pssm->w, pssm->scale, pssm->offset);
  }
  return TRUE;
}
/*
 * Make one position in an array the sum of a set of other positions.
 */
static void calc_ambig(ALPH_T alph, BOOLEAN_T log_space, char ambig, 
    char *sources, ARRAY_T *array) {
  char *source;
  PROB_T sum;
  PROB_T value;

  sum = 0.0;
  for (source = sources; *source != '\0'; ++source) {
    value = get_array_item(alph_index(alph, *source), array);
    if (log_space) {
      sum = LOG_SUM(sum, value);
    } else {
      sum += value;
    }
  }
  set_array_item(alph_index(alph, ambig), sum, array);
}
/*
 * Replace the elements an array of frequences with the average
 * over complementary bases.
 */
void average_freq_with_complement(ALPH_T alph, ARRAY_T *freqs) {
  int a_index, t_index, g_index, c_index;
  double at_freq, gc_freq;

  assert(alph == DNA_ALPH);

  a_index = alph_index(alph, 'A');
  t_index = alph_index(alph, 'T');
  g_index = alph_index(alph, 'G');
  c_index = alph_index(alph, 'C');

  at_freq = (get_array_item(a_index, freqs) + 
      get_array_item(t_index, freqs)) / 2.0;
  gc_freq = (get_array_item(g_index, freqs) + 
      get_array_item(c_index, freqs)) / 2.0;
  set_array_item(a_index, at_freq, freqs);
  set_array_item(t_index, at_freq, freqs);
  set_array_item(g_index, gc_freq, freqs);
  set_array_item(c_index, gc_freq, freqs);

}
/*
 * Take the counts from an ambiguous character and evenly distribute
 * them among the corresponding concrete characters.
 *
 * This function operates in log space.
 */
static void dist_ambig(ALPH_T alph, char ambig, char *concrete_chars, 
    ARRAY_T* freqs) {
  PROB_T ambig_count, concrete_count;
  int ambig_index, num_concretes, i, concrete_index;

  // Get the count to be distributed.
  ambig_index = alph_index(alph, ambig);
  ambig_count = get_array_item(ambig_index, freqs);
  // Divide it by the number of corresponding concrete characters.
  num_concretes = strlen(concrete_chars);
  ambig_count -= my_log2((PROB_T)num_concretes);
  // Distribute it in equal portions to the given concrete characters.
  for (i = 0; i < num_concretes; i++) {
    concrete_index = alph_index(alph, concrete_chars[i]);
    concrete_count = get_array_item(concrete_index, freqs);
    // Add the ambiguous counts.
    concrete_count = LOG_SUM(concrete_count, ambig_count);
    set_array_item(concrete_index, concrete_count, freqs);
  }
  // Set the ambiguous count to zero.
  set_array_item(ambig_index, LOG_ZERO, freqs);
}
Exemple #6
0
/****************************************************************************
*  Return an array containing the frequencies in the alignment for each 
*  character of the alphabet. Gaps and ambiguity characters other then
*  ANY_BASE are not counted. The freq. of ANY_BASE characters is stored
*  in the last element of the array.
****************************************************************************/
ARRAY_T* get_alignment_freqs(ALPH_T alph, ALIGNMENT_T* alignment) {
  char c = 0;
  int aindex = 0;
  int asize = 0;
  int i = 0;
  int s = 0;
  int total_bases = 0;
  int* num_bases = NULL;
  ARRAY_T* freqs = NULL;
  
  // Initialize counts for each character in the alphabet
  asize = alph_size(alph, ALPH_SIZE);
  num_bases = mm_malloc(asize * sizeof(int));
  for (i = 0; i < asize; i++) {
    num_bases[i] = 0;
  }

  for (s = 0; s < alignment->num_sequences; s++) {
    for (i = 0; i < alignment->length; i++) {
      c = get_seq_char(i, alignment->sequences[s]);
      aindex = alph_index(alph, c);
      // c might be an ambiguity code. We don't count ambiguity codes.
      if (aindex != -1 && aindex < asize) {
        num_bases[aindex]++;
        total_bases++;
      }
    }
  }

  freqs = allocate_array(asize);
  for (i = 0; i < asize; i++) {
    set_array_item(i, (double) num_bases[i] / (double) total_bases, freqs);
  }

  // Clean up the count of characters
  myfree(num_bases);

  return freqs;
}
Exemple #7
0
Fichier : seq.c Projet : CPFL/gmeme
/****************************************************************************
 *  Return an array containing the frequencies in the sequence for each
 *  character of the alphabet. Characters not in the alphabet are not
 *  counted.
 ****************************************************************************/
ARRAY_T* get_sequence_freqs
  (SEQ_T* seq, ALPH_T alph)
{
  char c = 0;
  int a_index = 0;
  int a_size = 0;
  int i = 0;
  int total_bases = 0;
  int* num_bases = NULL;
  ARRAY_T* freqs = NULL;

  // Initialize counts for each character in the alphabet
  a_size = alph_size(alph, ALPH_SIZE);
  num_bases = mm_malloc(a_size * sizeof(int));
  for (i = 0; i < a_size; i++) {
    num_bases[i] = 0;
  }

  for (i = 0; i < seq->length; i++) {
    c = get_seq_char(i, seq);
    if (c != '-' && c != '-') {
      a_index = alph_index(alph, c);
      if (a_index == -1 || a_index >= a_size) continue;
      num_bases[a_index]++;
      total_bases++;
    }
  }

  freqs = allocate_array(a_size);
  for (i = 0; i < a_size; i++) {
    set_array_item(i, (double) num_bases[i] / (double) total_bases, freqs);
  }

  // Clean up the count of characters
  myfree(num_bases);

  return freqs;
}
Exemple #8
0
/*************************************************************************
 * Convert the string representing an alignment column into an integer
 * which will be the column index for that alignment column in the PSSM.
 * If the alphabet has m characters, and the alignment columns have n entries,
 * the array of all alignment columns is conveniently numbered by the set of
 * consecutive n-digit base m numerals: 
 *   AAAA = 0000, AAAC = 0001, ..., TTTG = 3332, TTTT = 3333.
 *************************************************************************/
int hash_alignment_col(ALPH_T alph, char* alignment_col, int alignment_col_size) {

  // The alignment column must have at least
  // one character aside from the terminating null.
  assert(alignment_col_size >= 1);
  assert(alignment_col != NULL);

  int asize = alph_size(alph, ALPH_SIZE);
  int hash = 0;
  int place = 1;
  int i, j;
  for (i = alignment_col_size - 1; i >= 0; i--) {
    // Find alignment column character in alphabet
    j = alph_index(alph, alignment_col[i]);
    if (j == -1 || j >= asize)  {
      die("Alignment character %c not found in alphabet\n", alignment_col[i]);
    }
    hash += j * place;
    place *= asize;
  }

  return hash;
} // hash_alignment_col
Exemple #9
0
Fichier : seq.c Projet : CPFL/gmeme
/****************************************************************************
 *  Return an array containing the frequencies in the sequences for each
 *  character of the alphabet. Characters not in the alphabet are not
 *  counted.
 *
 *  When seq is provided it returns null, otherwise it converts the accumulated 
 *  result in bgcalc into a background.
 *
 *
 *  Pseudocode example:
 *    ALPH_T alph = ...
 *    BGCALC_T *bgcalc = NULL;
 *    for each seq:
 *      calculate_background(alph, seq, &bgcalc);
 *    ARRAY_T *bg = calculate_background(NULL, &bgcalc);
 ****************************************************************************/
ARRAY_T* calculate_background(
  ALPH_T alph,
  SEQ_T* seq,
  BGCALC_T** bgcalc
){
  BGCALC_T *calc;
  int a_size, i, a_index;
  char c;
  double freq, chunk_part, chunk_freq;
  ARRAY_T *background;
  assert(bgcalc != NULL);
  assert(seq != NULL || *bgcalc != NULL);
  // get the alphabet
  // get the alphabet size
  a_size = alph_size(alph, ALPH_SIZE);
  if (*bgcalc == NULL) {
    //allocate and initialize calc
    calc = mm_malloc(sizeof(BGCALC_T));
    calc->alph = alph;
    calc->chunk_seen = 0;
    calc->weight = 0;
    calc->chunk_counts = mm_malloc(a_size * sizeof(long));
    calc->bg = mm_malloc(a_size * sizeof(double));
    for (i = 0; i < a_size; ++i) {
      calc->chunk_counts[i] = 0;
      calc->bg[i] = 0;
    }
    *bgcalc = calc;
  } else {
    calc = *bgcalc;
    assert(alph == calc->alph);
    if (calc->weight == LONG_MAX) return NULL;
  }
  if (seq == NULL) {
    // no sequence so calculate the final result
    background = allocate_array(alph_size(alph, ALL_SIZE));
    if (calc->weight == 0) {
      if (calc->chunk_seen > 0) {
        // when we haven't had to approximate yet
        // just do a normal background calculation
        for (i = 0; i < a_size; i++) {
          freq = (double) calc->chunk_counts[i] / (double) calc->chunk_seen;
          set_array_item(i, freq, background);
        }
      } else {
        fputs("Uniform\n", stdout);
        // when there are no counts then return uniform
        freq = (double) 1 / (double) a_size;
        for (i = 0; i < a_size; i++) {
          set_array_item(i, freq, background);
        }
      }
    } else {
      if (calc->chunk_seen > 0) {
        // combine the frequencies for the existing chunks with the counts
        // for the partially completed chunk
        chunk_part = (double) calc->chunk_seen / (double) BG_CALC_CHUNK;
        for (i = 0; i < a_size; i++) {
          chunk_freq = (double) calc->chunk_counts[i] / 
              (double) calc->chunk_seen;
          freq = ((calc->bg[i] * calc->weight) + (chunk_freq * chunk_part)) / 
              (calc->weight + chunk_part);
          set_array_item(i, freq, background);
        }
      } else {
        // in the odd case we get to an integer number of chunks
        for (i = 0; i < a_size; i++) {
          set_array_item(i, calc->bg[i], background);
        }
      }
    }
    calc_ambigs(alph, FALSE, background);
    // free bgcalc structure
    free(calc->bg);
    free(calc->chunk_counts);
    free(calc);
    *bgcalc = NULL;
    return background;
  }
  // we have a sequence to add to the background calculation
  for (i = 0; i < seq->length; i++) {
    c = get_seq_char(i, seq);
    a_index = alph_index(alph, c);
    if (a_index == -1 || a_index >= a_size) continue;
    calc->chunk_counts[a_index]++;
    calc->chunk_seen++;
    if (calc->chunk_seen == BG_CALC_CHUNK) {
      if (calc->weight == 0) {
        for (i = 0; i < a_size; i++) {
          calc->bg[i] = (double) calc->chunk_counts[i] / (double) BG_CALC_CHUNK;
        }
      } else {
        for (i = 0; i < a_size; i++) {
          chunk_freq = (double) calc->chunk_counts[i] / (double) BG_CALC_CHUNK;
          calc->bg[i] = (calc->bg[i] * calc->weight + chunk_freq) / 
              (calc->weight + 1);
        }
      }
      calc->weight++;
      // reset the counts for the next chunk
      for (i = 0; i < a_size; i++) {
        calc->chunk_counts[i] = 0;
      }
      calc->chunk_seen = 0;
      // I don't think it is feasible to reach this limit
      // but I guess I'd better check anyway
      if (calc->weight == LONG_MAX) {
        fprintf(stderr, "Sequence data set is so large that even the "
            "approximation designed for large datasets can't handle it!");
        return NULL;
      }
    }
  }
  return NULL;
}
Exemple #10
0
/*************************************************************************
 * Calculate the odds score for each motif-sized window at each
 * site in the sequence using the given nucleotide frequencies.
 *
 * This function is a lightweight version based on the one contained in
 * motiph-scoring. Several calculations that are unnecessary for gomo
 * have been removed in order to speed up the process
 *************************************************************************/
static double score_sequence(
    SEQ_T *seq,         // sequence to scan (IN)
    MOTIF_T *motif,     // motif already converted to odds values (IN)
    PSSM_T *m_pssm,     // motif pssm (IN)
    MATRIX_T *m_odds,   // motif odds (IN)
    int method,         // method used for scoring (IN)
    double threshold,   // Threshold to use in TOTAL_HITS mode with a PWM
    ARRAY_T *bg_freqs   //background model
    )
{

  assert(seq != NULL);
  assert(motif != NULL);
  assert((method == TOTAL_HITS && m_pssm) || (method != TOTAL_HITS && m_odds));

  char* raw_seq = get_raw_sequence(seq);
  int seq_length = get_seq_length(seq);

  // Get the pv lookup table
  ARRAY_T* pv_lookup = NULL;
  if (NULL != m_pssm) {
    pv_lookup = m_pssm->pv;
    assert(get_array_length(pv_lookup) > 0);
  }

  // Prepare storage for the string representing the portion
  // of the reference sequence within the window.
  char* window_seq = (char *) mm_malloc(sizeof(char) * (get_motif_length(motif) + 1));
  window_seq[get_motif_length(motif)] = '\0';

  int max_index = seq_length - get_motif_length(motif);
  if (max_index < 0) max_index = 0;
  const int asize = alph_size(get_motif_alph(motif), ALPH_SIZE);
  double* odds =  (double*) mm_malloc(sizeof(double)*max_index);
  double* scaled_log_odds =  (double*) mm_malloc(sizeof(double)*max_index);

  // For each site in the sequence
  int seq_index;
  for (seq_index = 0; seq_index < max_index; seq_index++) {
    double odd = 1.0;
    scaled_log_odds[seq_index] = 0;

    // For each site in the motif window
    int motif_position;
    for (motif_position = 0; motif_position < get_motif_length(motif); motif_position++) {
      char c = raw_seq[seq_index + motif_position];
      window_seq[motif_position] = c;

      // Check for gaps at this site
      if(c == '-' || c == '.') {
        break;
      }

      // Check for ambiguity codes at this site
      //TODO: This next call is very expensive - it takes up approx. 10% of a
      //      programme's running time. It should be fixed up somehow.
      int aindex = alph_index(get_motif_alph(motif), c);
      if (aindex > asize) {
        break;
      }
      if (method == TOTAL_HITS) {
        //If we're in this mode, then we're using LOG ODDS.
        //scaled_log_odds[seq_index] += get_matrix_cell(motif_position, aindex, get_motif_freqs(motif));
        scaled_log_odds[seq_index] += get_matrix_cell(motif_position, aindex, m_pssm->matrix);
      } else {
        odd *= get_matrix_cell(motif_position, aindex, m_odds);
      }
    }
    odds[seq_index] = odd;
  }

  // return odds as requested (MAX or AVG scoring)
  double requested_odds = 0.0;
  if (method == AVG_ODDS){
    for (seq_index = 0; seq_index < max_index; seq_index++) {
      requested_odds += odds[seq_index];
    }
    requested_odds /= max_index + 1;		// Divide by 0 if max_index==0
  } else if (method == MAX_ODDS){
    for (seq_index = 0; seq_index < max_index; seq_index++) {
      if (odds[seq_index] > requested_odds){
        requested_odds = odds[seq_index];
      }
    }
  } else if (method == SUM_ODDS) {
    for (seq_index = 0; seq_index < max_index; seq_index++) {
      requested_odds += odds[seq_index];
    }
  } else if (method == TOTAL_HITS) {
    for (seq_index = 0; seq_index < max_index; seq_index++) {

      if (scaled_log_odds[seq_index] >= (double)get_array_length(pv_lookup)) {
        scaled_log_odds[seq_index] = (double)(get_array_length(pv_lookup) - 1);
      } 
      double pvalue = get_array_item((int) scaled_log_odds[seq_index], pv_lookup);

      //Figure out how to calculate the p-value of a hit
      //fprintf(stderr, "m: %s pv_l len: %i scaled_log_odds: %g seq index: %i pvalue: %g\n", 
      //    get_motif_id(motif), get_array_length(pv_lookup), scaled_log_odds[seq_index], seq_index, pvalue);

      if (pvalue < threshold) {
        requested_odds++; //Add another hit.
      }

      if (verbosity > HIGHER_VERBOSE) {
        fprintf(stderr, "Window Data: %s\t%s\t%i\t%g\t%g\t%g\n",
            get_seq_name(seq), get_motif_id(motif), seq_index, scaled_log_odds[seq_index], pvalue, threshold);
      }
    }
  }

  myfree(odds);
  myfree(scaled_log_odds);
  myfree(window_seq);
  return requested_odds;
}
/*
 * Load background file frequencies into the array.
 */
ARRAY_T* get_file_frequencies(ALPH_T *alph, char *bg_filename, ARRAY_T *freqs) {
  regmatch_t matches[4];
  STR_T *line;
  char chunk[BG_CHUNK_SIZE+1], letter[2], *key;
  int size, terminate, offset, i;
  FILE *fp;
  regex_t bgfreq;
  double freq;
  RBTREE_T *letters;
  RBNODE_T *node;
  
  regcomp_or_die("bg freq", &bgfreq, BGFREQ_RE, REG_EXTENDED);
  letters = rbtree_create(rbtree_strcasecmp, rbtree_strcpy, free, rbtree_dblcpy, free);
  line = str_create(100);
  if (!(fp = fopen(bg_filename, "r"))) {
    die("Unable to open background file \"%s\" for reading.\n", bg_filename);
  }
  
  terminate = feof(fp);
  while (!terminate) {
    size = fread(chunk, sizeof(char), BG_CHUNK_SIZE, fp);
    chunk[size] = '\0';
    terminate = feof(fp);
    offset = 0;
    while (offset < size) {
      // skip mac newline
      if (str_len(line) == 0 && chunk[offset] == '\r') {
        offset++;
        continue;
      }
      // find next new line
      for (i = offset; i < size; ++i) {
        if (chunk[i] == '\n') break;
      }
      // append portion up to the new line or end of chunk
      str_append(line, chunk+offset, i - offset);
      // read more if we didn't find a new line
      if (i == size && !terminate) break;
      // move the offset past the new line
      offset = i + 1;
      // handle windows new line
      if (str_char(line, -1) == '\r') str_truncate(line, -1);
      // remove everything to the right of a comment character
      for (i = 0; i < str_len(line); ++i) {
        if (str_char(line, i) == '#') {
          str_truncate(line, i);
          break;
        }
      }
      // check the line for a single letter followed by a number
      if (regexec_or_die("bg freq", &bgfreq, str_internal(line), 4, matches, 0)) {
        // parse the letter and frequency value
        regex_strncpy(matches+1, str_internal(line), letter, 2);
        freq = regex_dbl(matches+2, str_internal(line));
        // check the frequency is acceptable
        if (freq < 0 || freq > 1) {
          die("The background file lists the illegal probability %g for "
            "the letter %s.\n", freq, letter);
        } else if (freq == 0) {
          die("The background file lists a probability of zero for the "
            "letter %s\n", letter);
        }
        if (freq >= 0 && freq <= 1) rbtree_put(letters, letter, &freq);
      }
      str_clear(line);
    }
  }
  // finished with the file so clean up file parsing stuff
  fclose(fp);
  str_destroy(line, FALSE);
  regfree(&bgfreq);
  // guess the alphabet
  if (*alph == INVALID_ALPH) {
    switch (rbtree_size(letters)) {
      case PROTEIN_ASIZE:
        *alph = PROTEIN_ALPH;
        break;
      case DNA_ASIZE:
        *alph = DNA_ALPH;
        break;
      default:
        die("Number of single character entries in background does not match "
            "an alphabet.\n");
    }
  }
  // make the background
  if (freqs == NULL) freqs = allocate_array(alph_size(*alph, ALL_SIZE));
  assert(get_array_length(freqs) >= alph_size(*alph, ALL_SIZE));
  init_array(-1, freqs);
  for (node = rbtree_first(letters); node != NULL; node = rbtree_next(node)) {
    key = (char*)rbtree_key(node);
    i = alph_index(*alph, key[0]);
    freq = *((double*)rbtree_value(node));
    if (i == -1) {
      die("Background contains letter %s which is not in the %s alphabet.\n", 
          key, alph_name(*alph));
    }
    if (get_array_item(i, freqs) != -1) {
      die("Background contains letter %s which has the same meaning as an "
          "already listed letter.\n", key);
    }
    set_array_item(i, freq, freqs);
  }
  // check that all items were set
  for (i = 0; i < alph_size(*alph, ALPH_SIZE); i++) {
    if (get_array_item(i, freqs) == -1) {
      die("Background is missing letter %c.\n", alph_char(*alph, i));
    }
  }
  // disabled for backwards compatability (AMA test was failing)
  //normalize_subarray(0, ALPH_ASIZE[*alph], 0.0, freqs);
  // calculate the values of the ambiguous letters from the concrete ones
  calc_ambigs(*alph, FALSE, freqs);
  // cleanup
  rbtree_destroy(letters);
  // return result
  return freqs;
}
/*
 * Determine if a letter is ambiguous
 */
BOOLEAN_T alph_is_ambiguous(ALPH_T alph, char letter) {
  int index;
  index = alph_index(alph, letter);
  return (index >= ALPH_ASIZE[alph]);
}
/*
 * Determine if a letter is a concrete representation
 */
BOOLEAN_T alph_is_concrete(ALPH_T alph, char letter) {
  int index;
  index = alph_index(alph, letter);
  return (index != -1 && index < ALPH_ASIZE[alph]);
}
/*
 * Determine if a letter is known in this alphabet
 */
BOOLEAN_T alph_is_known(ALPH_T alph, char letter) {
  int index;
  index = alph_index(alph, letter);
  return (index != -1);
}
Exemple #15
0
void ramen_scan_sequences() {
		FILE* seq_file = NULL;
		MOTIF_T* motif = NULL;
		MOTIF_T* rev_motif = NULL;
		SEQ_T* sequence = NULL;
		SCANNED_SEQUENCE_T* scanned_seq = NULL;
		PATTERN_T* pattern;
		int i;
		int j;
		SEQ_T** seq_list;
		int num_seqs;
		int seq_len;
		//For the bdb_bg mode:
		ARRAY_T* seq_bg_freqs;
		double atcontent;
		double roundatcontent;
		double avg_seq_length = 0;

		//Open the file.
		if (open_file(args.sequence_filename, "r", FALSE, "FASTA", "sequences", &seq_file) == 0) {
				fprintf(stderr, "Couldn't open the file %s.\n", args.sequence_filename);
				ramen_terminate(1);
		}

		//Start reading in the sequences
		read_many_fastas(ramen_alph, seq_file, MAX_SEQ_LENGTH, &num_seqs, &seq_list);


		seq_ids = new_string_list();
		seq_fscores = allocate_array(num_seqs);

		//Allocate the required space for results
		results = malloc(sizeof(double*) * motifs.num);
		for (i=0;i<motifs.num;i++) {
				results[i] = malloc(sizeof(double)*num_seqs);
		}

		for (j=0;j<num_seqs;j++) {

				fprintf(stderr, "\rScanning %i of %i sequences...", j+1, num_seqs);

				//copy the pointer into our current object for clarity
				sequence = seq_list[j];

				//Read the fluorescence data from the description field.
				add_string(get_seq_name(sequence),seq_ids);
				seq_len = get_seq_length(sequence);
				set_array_item(j,atof(get_seq_description(sequence)),seq_fscores);

				//Scan with each motif.
				for (i=0;i<motifs.num;i++) {
						int motifindex = i*2;

						results[i][j] = ramen_sequence_scan(sequence, motif_at(motifs.motifs, motifindex), 
											      motif_at(motifs.motifs, motifindex+1),
											      NULL, NULL, //No need to pass PSSM.
										              AVG_ODDS, 0, TRUE, 0, motifs.bg_freqs);

						if (TRUE == args.linreg_normalise) {
								int k;
								double maxscore = 1;
								motif = motif_at(motifs.motifs,motifindex); 
								for (k=0;k<get_motif_length(motif);k++) {
										double maxprob = 0;
										if (maxprob < get_matrix_cell(k, alph_index(ramen_alph, 'A'), get_motif_freqs(motif)))
												maxprob = get_matrix_cell(k, alph_index(ramen_alph, 'A'), get_motif_freqs(motif));
										if (maxprob < get_matrix_cell(k, alph_index(ramen_alph, 'C'), get_motif_freqs(motif)))
												maxprob = get_matrix_cell(k, alph_index(ramen_alph, 'C'), get_motif_freqs(motif));
										if (maxprob < get_matrix_cell(k, alph_index(ramen_alph, 'G'), get_motif_freqs(motif)))
												maxprob = get_matrix_cell(k, alph_index(ramen_alph, 'G'), get_motif_freqs(motif));
										if (maxprob < get_matrix_cell(k, alph_index(ramen_alph, 'T'), get_motif_freqs(motif)))
												maxprob = get_matrix_cell(k, alph_index(ramen_alph, 'T'), get_motif_freqs(motif));
										maxscore *= maxprob;
								}
								results[i][j] /= maxscore;
						}
				}
		}

}