/*****************************************************************************
 * Reads frequency attributes into the freqs array of the parser state.
 *
 * ps    - parser state; ps->alph_ids maps attribute names to int indexes
 *         into ps->freqs, which is allocated here on first use.
 * tag   - name of the element being parsed, used only in error reports.
 * attrs - NULL terminated list of alternating attribute names and values.
 *
 * Every entry of ps->freqs is reset to -1 (meaning "not seen") before the
 * attributes are read. Attributes whose name is not in ps->alph_ids are
 * ignored. Duplicate, unparsable, out of range and missing frequencies are
 * reported through dreme_attr_parse_error; a bad value is stored as 0 so it
 * is not also reported as missing. When every frequency was read
 * successfully the sum is checked to be 1 within a tolerance.
 ****************************************************************************/
static void parse_freq_attrs(PS_T *ps, const char* tag, const xmlChar **attrs) {
  int i, ncore, seen, *idx;
  char *end_ptr;
  double value, sum;
  RBNODE_T *node;
  bool seen_bad;

  ncore = rbtree_size(ps->alph_ids);
  // initialize the freqs array
  if (ps->freqs == NULL) ps->freqs = mm_malloc(sizeof(double) * ncore);
  // reset freqs array; -1 marks a frequency that has not been seen yet
  for (i = 0; i < ncore; i++) ps->freqs[i] = -1;
  seen = 0;
  seen_bad = false;
  sum = 0.0;
  // iterate over attributes (name at i, value at i+1)
  for (i = 0; attrs[i] != NULL; i += 2) {
    idx = (int*)rbtree_get(ps->alph_ids, attrs[i]);
    if (idx != NULL) {
      assert(*idx >= 0 && *idx < ncore);
      if (ps->freqs[*idx] != -1) {
        dreme_attr_parse_error(ps, PARSE_ATTR_DUPLICATE, tag,
            (const char*)attrs[i], NULL);
        continue;
      }
      seen++;
      errno = 0; // reset because we're about to check it
      value = strtod((const char*)attrs[i+1], &end_ptr);
      // allow out of range values, mainly because freqs can be very close to
      // zero. The range test is written as a negated "in range" check so
      // that NaN (which strtod accepts and which fails every comparison) is
      // rejected as well. Trailing characters after the number are not
      // treated as an error.
      if (end_ptr == (const char*)attrs[i+1] || (errno && errno != ERANGE) ||
          !(value >= 0 && value <= 1)) {
        dreme_attr_parse_error(ps, PARSE_ATTR_BAD_VALUE, tag,
            (const char*)attrs[i], (const char*)attrs[i+1]);
        ps->freqs[*idx] = 0; // mark frequency as seen, even though it's bad
        seen_bad = true;
        continue;
      }
      ps->freqs[*idx] = value;
      sum += value;
    }
  }
  // check we got everything
  if (seen < ncore) {
    // identify what we're missing
    for (node = rbtree_first(ps->alph_ids); node != NULL;
        node = rbtree_next(node)) {
      idx = (int*)rbtree_value(node);
      if (ps->freqs[*idx] == -1) {
        dreme_attr_parse_error(ps, PARSE_ATTR_MISSING, tag,
            (char*)rbtree_key(node), NULL);
      }
    }
  } else if (!seen_bad) {
    // check the frequencies sum to 1
    double delta = sum - 1;
    delta = (delta < 0 ? -delta : delta);
    // dreme writes background probabilities to 3 decimal places so assuming
    // the error on each is at maximum 0.001 then the total error for the
    // sum must be less than or equal to 0.001 times the number of
    // frequencies (0.004 for a 4 letter alphabet)
    if (delta > (0.001 * ncore)) {
      error(ps, "Probabilities of %s do not sum to 1, got %g .\n", tag, sum);
    }
  }
}
/***************************************************************************** * MEME > training_set > /alphabet * Read in the number of symbols in the alphabet and if it is nucleotide or * amino-acid (RNA is apparently classed as nucleotide). ****************************************************************************/ void mxml_end_alphabet(void *ctx) { PARMSG_T *message; CTX_T *data; RBNODE_T *node; char *id, symbol; bool *exists; int i; data = (CTX_T*)ctx; if (data->alph == NULL) { // Custom alphabet alph_reader_done(data->alph_rdr); // report any errors that the alphabet reader found while (alph_reader_has_message(data->alph_rdr)) { message = alph_reader_next_message(data->alph_rdr); if (message->severity == SEVERITY_ERROR) { local_error(data, "Alphabet error: %s.\n", message->message); } else { local_warning(data, "Alphabet warning: %s.\n", message->message); } parmsg_destroy(message); } // try to get an alphabet data->alph = alph_reader_alphabet(data->alph_rdr); alph_reader_destroy(data->alph_rdr); data->alph_rdr = NULL; } else { // legacy alphabet exists = mm_malloc(sizeof(bool) * alph_size_core(data->alph)); // set list to false for (i = 0; i < alph_size_core(data->alph); i++) exists[i] = false; // check that id's were defined for all the core alphabet symbols for (node = rbtree_first(data->letter_lookup); node != NULL; node = rbtree_next(node)) { id = (char*)rbtree_key(node); symbol = ((char*)rbtree_value(node))[0]; if (exists[alph_indexc(data->alph, symbol)]) { // duplicate! local_error(data, "The letter identifier %s is not the first to refer to symbol %c.\n", id, symbol); } exists[alph_indexc(data->alph, symbol)] = true; } // now check for missing identifiers for (i = 0; i < alph_size_core(data->alph); i++) { if (!exists[i]) { // missing id for symbol local_error(data, "The symbol %c does not have an assigned identifier.\n", alph_char(data->alph, i)); } } free(exists); } }
/*
 * Load background file frequencies into the array.
 *
 * alph        - in/out alphabet; when it is INVALID_ALPH on entry it is
 *               guessed from the number of distinct letters in the file.
 * bg_filename - path of the background file to read.
 * freqs       - array to fill, or NULL to have one allocated.
 *
 * The file is read in chunks and split into lines; unix and windows line
 * endings are handled and everything after a '#' is ignored. Lines matching
 * BGFREQ_RE give a letter and its frequency, which must be greater than 0
 * and at most 1. Lines that do not match are skipped.
 *
 * Returns the filled array (freqs when it was supplied). Calls die() when
 * the file cannot be opened or read, a frequency is illegal, the alphabet
 * cannot be guessed, or the letters do not exactly cover the alphabet.
 */
ARRAY_T* get_file_frequencies(ALPH_T *alph, char *bg_filename, ARRAY_T *freqs) {
  regmatch_t matches[4];
  STR_T *line;
  char chunk[BG_CHUNK_SIZE+1], letter[2], *key;
  int size, terminate, offset, i;
  FILE *fp;
  regex_t bgfreq;
  double freq;
  RBTREE_T *letters;
  RBNODE_T *node;

  regcomp_or_die("bg freq", &bgfreq, BGFREQ_RE, REG_EXTENDED);
  letters = rbtree_create(rbtree_strcasecmp, rbtree_strcpy, free, rbtree_dblcpy, free);
  line = str_create(100);
  if (!(fp = fopen(bg_filename, "r"))) {
    die("Unable to open background file \"%s\" for reading.\n", bg_filename);
  }
  terminate = feof(fp);
  while (!terminate) {
    size = fread(chunk, sizeof(char), BG_CHUNK_SIZE, fp);
    // a read error never sets the end-of-file indicator, so without this
    // check the loop would spin forever on zero length reads
    if (ferror(fp)) {
      die("Error reading background file \"%s\".\n", bg_filename);
    }
    chunk[size] = '\0';
    terminate = feof(fp);
    offset = 0;
    // the second condition flushes a line buffered from the previous chunk
    // when the end of the file is only discovered by an empty read (file
    // size a multiple of the chunk size and no trailing new line)
    while (offset < size || (terminate && str_len(line) > 0)) {
      // skip mac newline
      if (str_len(line) == 0 && chunk[offset] == '\r') {
        offset++;
        continue;
      }
      // find next new line
      for (i = offset; i < size; ++i) {
        if (chunk[i] == '\n') break;
      }
      // append portion up to the new line or end of chunk
      str_append(line, chunk+offset, i - offset);
      // read more if we didn't find a new line
      if (i == size && !terminate) break;
      // move the offset past the new line
      offset = i + 1;
      // handle windows new line
      if (str_char(line, -1) == '\r') str_truncate(line, -1);
      // remove everything to the right of a comment character
      for (i = 0; i < str_len(line); ++i) {
        if (str_char(line, i) == '#') {
          str_truncate(line, i);
          break;
        }
      }
      // check the line for a single letter followed by a number
      if (regexec_or_die("bg freq", &bgfreq, str_internal(line), 4, matches, 0)) {
        // parse the letter and frequency value
        regex_strncpy(matches+1, str_internal(line), letter, 2);
        freq = regex_dbl(matches+2, str_internal(line));
        // check the frequency is acceptable
        if (freq < 0 || freq > 1) {
          die("The background file lists the illegal probability %g for "
              "the letter %s.\n", freq, letter);
        } else if (freq == 0) {
          die("The background file lists a probability of zero for the "
              "letter %s\n", letter);
        }
        // re-check the range so that a value which fails every comparison
        // above (such as NaN) is never stored
        if (freq >= 0 && freq <= 1) rbtree_put(letters, letter, &freq);
      }
      str_clear(line);
    }
  }
  // finished with the file so clean up file parsing stuff
  fclose(fp);
  str_destroy(line, FALSE);
  regfree(&bgfreq);
  // guess the alphabet
  if (*alph == INVALID_ALPH) {
    switch (rbtree_size(letters)) {
      case PROTEIN_ASIZE:
        *alph = PROTEIN_ALPH;
        break;
      case DNA_ASIZE:
        *alph = DNA_ALPH;
        break;
      default:
        die("Number of single character entries in background does not match "
            "an alphabet.\n");
    }
  }
  // make the background; -1 marks an entry that has not been set
  if (freqs == NULL) freqs = allocate_array(alph_size(*alph, ALL_SIZE));
  assert(get_array_length(freqs) >= alph_size(*alph, ALL_SIZE));
  init_array(-1, freqs);
  for (node = rbtree_first(letters); node != NULL; node = rbtree_next(node)) {
    key = (char*)rbtree_key(node);
    i = alph_index(*alph, key[0]);
    freq = *((double*)rbtree_value(node));
    if (i == -1) {
      die("Background contains letter %s which is not in the %s alphabet.\n",
          key, alph_name(*alph));
    }
    if (get_array_item(i, freqs) != -1) {
      die("Background contains letter %s which has the same meaning as an "
          "already listed letter.\n", key);
    }
    set_array_item(i, freq, freqs);
  }
  // check that all items were set
  for (i = 0; i < alph_size(*alph, ALPH_SIZE); i++) {
    if (get_array_item(i, freqs) == -1) {
      die("Background is missing letter %c.\n", alph_char(*alph, i));
    }
  }
  // disabled for backwards compatibility (AMA test was failing)
  //normalize_subarray(0, ALPH_ASIZE[*alph], 0.0, freqs);
  // calculate the values of the ambiguous letters from the concrete ones
  calc_ambigs(*alph, FALSE, freqs);
  // cleanup
  rbtree_destroy(letters);
  // return result
  return freqs;
}