示例#1
0
/*****************************************************************************
 * Reads frequency attributes into the pre-allocated freqs array.
 ****************************************************************************/
static void parse_freq_attrs(PS_T *ps, const char* tag, const xmlChar **attrs) {
  int i, ncore, seen, *idx;
  char *end_ptr;
  double value, sum;
  RBNODE_T *node;
  bool seen_bad;
  ncore = rbtree_size(ps->alph_ids);
  // initilize the freqs array
  if (ps->freqs == NULL) ps->freqs = mm_malloc(sizeof(double) * ncore);
  // reset freqs array;
  for (i = 0; i < ncore; i++) ps->freqs[i] = -1;
  seen = 0;
  seen_bad = false;
  sum = 0.0;
  // iterate over attributes
  for (i = 0; attrs[i] != NULL; i += 2) {
    idx = (int*)rbtree_get(ps->alph_ids, attrs[i]);
    if (idx != NULL) {
      assert(*idx < ncore);
      if (ps->freqs[*idx] != -1) {
        dreme_attr_parse_error(ps, PARSE_ATTR_DUPLICATE, tag, (const char*)attrs[i], NULL);
        continue;
      }
      seen++;
      errno = 0; // reset because we're about to check it
      value = strtod((const char*)attrs[i+1], &end_ptr);
      // allow out of range values, mainly because freqs can be very close to zero
      if (end_ptr == (const char*)attrs[i+1] || (errno && errno != ERANGE) || value < 0 || value > 1) {
        dreme_attr_parse_error(ps, PARSE_ATTR_BAD_VALUE, tag, (const char*)attrs[i], (const char*)attrs[i+1]);
        ps->freqs[*idx] = 0; // mark frequence as seen, even though it's bad
        seen_bad = true;
        continue;
      }
      ps->freqs[*idx] = value;
      sum += value;
    }
  }
  // check we got everthing
  if (seen < ncore) {
    // identify what we're missing
    for (node = rbtree_first(ps->alph_ids); node != NULL; node = rbtree_next(node)) {
      idx = (int*)rbtree_value(node);
      if (ps->freqs[*idx] == -1) {
        dreme_attr_parse_error(ps, PARSE_ATTR_MISSING, tag, (char*)rbtree_key(node), NULL);
      }
    }
  } else if (!seen_bad) {
    // check the frequencies sum to 1
    double delta = sum - 1;
    delta = (delta < 0 ? -delta : delta);
    if (delta > (0.001 * ncore)) {
      // dreme writes background probabilities to 3 decimal places so assuming 
      // the error on each is at maximum 0.001 then the total error for the 
      // sum must be less than or equal to 0.004
      error(ps, "Probabilities of %s do not sum to 1, got %g .\n", tag, sum);
    }
  }
}
/*****************************************************************************
 * MEME > training_set > /alphabet
 * Read in the number of symbols in the alphabet and if it is nucleotide or 
 * amino-acid (RNA is apparently classed as nucleotide).
 ****************************************************************************/
void mxml_end_alphabet(void *ctx) {
  PARMSG_T *message;
  CTX_T *data;
  RBNODE_T *node;
  char *id, symbol;
  bool *exists;
  int i;

  data = (CTX_T*)ctx;
  if (data->alph == NULL) { // Custom alphabet
    alph_reader_done(data->alph_rdr);
    // report any errors that the alphabet reader found
    while (alph_reader_has_message(data->alph_rdr)) {
      message = alph_reader_next_message(data->alph_rdr);
      if (message->severity == SEVERITY_ERROR) {
        local_error(data, "Alphabet error: %s.\n", message->message);
      } else {
        local_warning(data, "Alphabet warning: %s.\n", message->message);
      }
      parmsg_destroy(message);
    }
    // try to get an alphabet
    data->alph = alph_reader_alphabet(data->alph_rdr);
    alph_reader_destroy(data->alph_rdr);
    data->alph_rdr = NULL;
  } else { // legacy alphabet
    exists = mm_malloc(sizeof(bool) * alph_size_core(data->alph));
    // set list to false
    for (i = 0; i < alph_size_core(data->alph); i++) exists[i] = false;
    // check that id's were defined for all the core alphabet symbols
    for (node = rbtree_first(data->letter_lookup); node != NULL; node = rbtree_next(node)) {
      id = (char*)rbtree_key(node);
      symbol = ((char*)rbtree_value(node))[0];
      if (exists[alph_indexc(data->alph, symbol)]) {
        // duplicate!
        local_error(data, "The letter identifier %s is not the first to refer to symbol %c.\n", id, symbol);
      }
      exists[alph_indexc(data->alph, symbol)] = true;
    }
    // now check for missing identifiers
    for (i = 0; i < alph_size_core(data->alph); i++) {
      if (!exists[i]) {
        // missing id for symbol
        local_error(data, "The symbol %c does not have an assigned identifier.\n", alph_char(data->alph, i));
      }
    }
    free(exists);
  }
}
示例#3
0
/*
 * Load background file frequencies into the array.
 */
ARRAY_T* get_file_frequencies(ALPH_T *alph, char *bg_filename, ARRAY_T *freqs) {
  regmatch_t matches[4];
  STR_T *line;
  char chunk[BG_CHUNK_SIZE+1], letter[2], *key;
  int size, terminate, offset, i;
  FILE *fp;
  regex_t bgfreq;
  double freq;
  RBTREE_T *letters;
  RBNODE_T *node;
  
  regcomp_or_die("bg freq", &bgfreq, BGFREQ_RE, REG_EXTENDED);
  letters = rbtree_create(rbtree_strcasecmp, rbtree_strcpy, free, rbtree_dblcpy, free);
  line = str_create(100);
  if (!(fp = fopen(bg_filename, "r"))) {
    die("Unable to open background file \"%s\" for reading.\n", bg_filename);
  }
  
  terminate = feof(fp);
  while (!terminate) {
    size = fread(chunk, sizeof(char), BG_CHUNK_SIZE, fp);
    chunk[size] = '\0';
    terminate = feof(fp);
    offset = 0;
    while (offset < size) {
      // skip mac newline
      if (str_len(line) == 0 && chunk[offset] == '\r') {
        offset++;
        continue;
      }
      // find next new line
      for (i = offset; i < size; ++i) {
        if (chunk[i] == '\n') break;
      }
      // append portion up to the new line or end of chunk
      str_append(line, chunk+offset, i - offset);
      // read more if we didn't find a new line
      if (i == size && !terminate) break;
      // move the offset past the new line
      offset = i + 1;
      // handle windows new line
      if (str_char(line, -1) == '\r') str_truncate(line, -1);
      // remove everything to the right of a comment character
      for (i = 0; i < str_len(line); ++i) {
        if (str_char(line, i) == '#') {
          str_truncate(line, i);
          break;
        }
      }
      // check the line for a single letter followed by a number
      if (regexec_or_die("bg freq", &bgfreq, str_internal(line), 4, matches, 0)) {
        // parse the letter and frequency value
        regex_strncpy(matches+1, str_internal(line), letter, 2);
        freq = regex_dbl(matches+2, str_internal(line));
        // check the frequency is acceptable
        if (freq < 0 || freq > 1) {
          die("The background file lists the illegal probability %g for "
            "the letter %s.\n", freq, letter);
        } else if (freq == 0) {
          die("The background file lists a probability of zero for the "
            "letter %s\n", letter);
        }
        if (freq >= 0 && freq <= 1) rbtree_put(letters, letter, &freq);
      }
      str_clear(line);
    }
  }
  // finished with the file so clean up file parsing stuff
  fclose(fp);
  str_destroy(line, FALSE);
  regfree(&bgfreq);
  // guess the alphabet
  if (*alph == INVALID_ALPH) {
    switch (rbtree_size(letters)) {
      case PROTEIN_ASIZE:
        *alph = PROTEIN_ALPH;
        break;
      case DNA_ASIZE:
        *alph = DNA_ALPH;
        break;
      default:
        die("Number of single character entries in background does not match "
            "an alphabet.\n");
    }
  }
  // make the background
  if (freqs == NULL) freqs = allocate_array(alph_size(*alph, ALL_SIZE));
  assert(get_array_length(freqs) >= alph_size(*alph, ALL_SIZE));
  init_array(-1, freqs);
  for (node = rbtree_first(letters); node != NULL; node = rbtree_next(node)) {
    key = (char*)rbtree_key(node);
    i = alph_index(*alph, key[0]);
    freq = *((double*)rbtree_value(node));
    if (i == -1) {
      die("Background contains letter %s which is not in the %s alphabet.\n", 
          key, alph_name(*alph));
    }
    if (get_array_item(i, freqs) != -1) {
      die("Background contains letter %s which has the same meaning as an "
          "already listed letter.\n", key);
    }
    set_array_item(i, freq, freqs);
  }
  // check that all items were set
  for (i = 0; i < alph_size(*alph, ALPH_SIZE); i++) {
    if (get_array_item(i, freqs) == -1) {
      die("Background is missing letter %c.\n", alph_char(*alph, i));
    }
  }
  // disabled for backwards compatability (AMA test was failing)
  //normalize_subarray(0, ALPH_ASIZE[*alph], 0.0, freqs);
  // calculate the values of the ambiguous letters from the concrete ones
  calc_ambigs(*alph, FALSE, freqs);
  // cleanup
  rbtree_destroy(letters);
  // return result
  return freqs;
}