/*
 * Tests the letter against the alphabet. If the alphabet is unknown
 * it attempts to work it out and set it from the letter.
 * For simplicity this assumes you will pass indexes in ascending order.
 *
 * While *alpha is INVALID_ALPH the letters are matched against the expected
 * leading letters: 'A' at index 0, 'C' at index 1, then 'D' (which selects
 * PROTEIN_ALPH) or 'G' at index 2, and finally 'T' or 'U' at index 3 (both
 * currently select DNA_ALPH). Once the alphabet is known the letter is
 * compared, case-insensitively, with alph_char(*alpha, index).
 *
 * Returns false if the letter is unacceptable.
 */
BOOLEAN_T alph_test(ALPH_T *alpha, int index, char letter) {
  char uc_letter;

  // The <ctype.h> functions require a value representable as unsigned char
  // (or EOF); passing a negative plain char is undefined behaviour.
  uc_letter = (char)toupper((unsigned char)letter);
  if (*alpha == INVALID_ALPH) {
    switch (index) {
      case 0:
        return (uc_letter == 'A');
      case 1:
        return (uc_letter == 'C');
      case 2:
        if (uc_letter == 'D') {
          *alpha = PROTEIN_ALPH;
          return TRUE;
        }
        return (uc_letter == 'G'); // DNA or RNA
      case 3:
        if (uc_letter == 'T') {
          *alpha = DNA_ALPH;
        } else if (uc_letter == 'U') {
          *alpha = DNA_ALPH; //FIXME need RNA but substitute DNA for now
        } else {
          return FALSE;
        }
        return TRUE;
      default: // Bad state!
        die("Should not still be attempting to guess by the 5th letter "
            "(index = %d).", index);
        return FALSE;
    }
  } else {
    // reject indexes outside the alphabet (negative or too big)
    if (index < 0 || index >= alph_size(*alpha, ALPH_SIZE)) return FALSE;
    return (uc_letter == alph_char(*alpha, index));
  }
}
/***************************************************************************** * MEME > model > /background_frequencies ****************************************************************************/ void mxml_end_background(void *ctx) { CTX_T *data; int i; bool error; double sum, delta; data = (CTX_T*)ctx; sum = 0; error = false; for (i = 0; i < get_array_length(data->nums); i++) { if (get_array_item(i, data->nums) == -1) { local_error(data, "Background frequency was not provided for letter %c.\n", alph_char(data->alph, i)); error = true; } else { sum += get_array_item(i, data->nums); } } delta = sum - 1.0; if (delta < 0) delta = -delta; if (delta > 0.01) { local_error(data, "The background frequencies summed to %f but they should sum to 1.0.\n", sum); error = true; } if (error) { free_array(data->nums); } else { data->fscope.background = data->nums; data->nums = NULL; } }
/*************************************************************************
 * Convert an integer representing a column in a PSSM into the
 * corresponding alignment column string.
 * If the alphabet has m characters, and the alignment columns have n entries,
 * the array of all alignment columns is conveniently numbered by the set of
 * consecutive n-digit base m numerals:
 *   AAAA = 0000, AAAC = 0001, ..., TTTG = 3332, TTTT = 3333.
 * The caller must allocate the memory for the alignment column string.
 * The memory required is the number of sequences in the alignment, plus one
 * for the terminating null.
 *
 * alph                 alphabet supplying the digit -> letter mapping
 * alignment_col_index  column number, 0 <= index < m^n
 * alignment_col        output buffer of at least alignment_col_size + 1 chars
 * alignment_col_size   number of entries (n) in the alignment column
 *************************************************************************/
void unhash_alignment_col(
  ALPH_T alph,
  int alignment_col_index,
  char *alignment_col,
  int alignment_col_size
)
{
  int asize = alph_size(alph, ALPH_SIZE);
  int i;

  assert(asize > 0);
  assert(alignment_col != NULL);
  assert(alignment_col_size >= 1);
  assert(alignment_col_index >= 0);
  // An n-digit base-m numeral must be below m^n. The exponent is the column
  // size; using the index itself as exponent made the check vacuous.
  assert(
    alignment_col_index < pow((double) asize, (double) alignment_col_size)
  );

  alignment_col[alignment_col_size] = '\0';
  // Peel off base-asize digits, least significant first, filling the string
  // from its last character backwards.
  for (i = alignment_col_size - 1; i >= 0; i--) {
    alignment_col[i] = alph_char(alph, alignment_col_index % asize);
    alignment_col_index /= asize;
  }
} // unhash_alignment_col
/***********************************************************************
 * Returns the string that is the best possible match to the given motif.
 *
 * For every motif position the core alphabet letter with the largest
 * frequency is chosen; when several letters tie, the one with the highest
 * alphabet index wins. Only the first alph_size(alph, ALPH_SIZE) columns
 * of each row are examined.
 *
 * Caller is responsible for freeing string.
 ***********************************************************************/
char *get_best_possible_match(MOTIF_T *motif) {
  int mpos, apos, asize;
  char *match_string;
  ALPH_SIZE_T size;

  // Validate the motif before it is dereferenced for the first time.
  assert(motif != NULL);
  assert(motif->freqs != NULL);
  assert(motif->length == motif->freqs->num_rows);
  size = (motif->flags & MOTIF_HAS_AMBIGS ? ALL_SIZE : ALPH_SIZE);
  assert(alph_size(motif->alph, size) == motif->freqs->num_cols);

  asize = alph_size(motif->alph, ALPH_SIZE);
  match_string = mm_malloc(sizeof(char) * (motif->length + 1));

  // Find the highest scoring character at each position in the motif.
  for (mpos = 0; mpos < motif->length; ++mpos) {
    ARRAY_T *row = motif->freqs->rows[mpos];
    double max_v = row->items[0];
    int max_i = 0;
    for (apos = 1; apos < asize; ++apos) {
      // ">=" so that the last of several equal maxima is kept
      if (row->items[apos] >= max_v) {
        max_i = apos;
        max_v = row->items[apos];
      }
    }
    match_string[mpos] = alph_char(motif->alph, max_i);
  }

  // Add null termination
  match_string[motif->length] = '\0';

  return match_string;
}
/*
 * Build the target frequency matrix for a substitution score matrix.
 *
 * The score matrix is obtained from get_score_matrix() and converted with
 * convert_score_to_target() using the supplied background. When
 * SUBST_MATRIX_DEBUG is set the score matrix is also printed to stdout as
 * a labelled table followed by the sum of its entries.
 *
 * The intermediate score matrix is freed; the target matrix is returned.
 */
MATRIX_T *get_subst_target_matrix(
  char *score_filename,   /* name of score file */
  ALPH_T alph,            /* alphabet */
  int dist,               /* PAM distance (ignored if score_filename != NULL) */
  ARRAY_T *back           /* background frequencies of standard alphabet */
)
{
  MATRIX_T *score_mat = get_score_matrix(score_filename, alph, dist);
  MATRIX_T *target_mat = convert_score_to_target(score_mat, back);

  if (SUBST_MATRIX_DEBUG) {
    const int n_letters = alph_size(alph, ALPH_SIZE);
    double total = 0;
    int row, col;

    /* where the scores came from */
    if (score_filename != NULL) {
      printf("From file %s\n", score_filename);
    } else {
      printf("Generated PAM %d\n", dist);
    }

    /* header row: a blank corner cell, then one label per letter */
    printf("%6c ", ' ');
    for (col = 0; col < n_letters; col++) {
      printf("%6c ", alph_char(alph, col));
    }
    printf("\n");

    /* one labelled row per letter, accumulating the grand total */
    for (row = 0; row < n_letters; row++) {
      printf("%6c ", alph_char(alph, row));
      for (col = 0; col < n_letters; col++) {
        const double entry = get_matrix_cell(row, col, score_mat);
        total += entry;
        printf("%6.4f ", entry);
      }
      printf("\n");
    }
    printf("sum of entries = %f\n", total);
  }

  free_matrix(score_mat);
  return target_mat;
} /* get_subst_target_matrix */
/***********************************************************************
 * Read the background letter frequencies from XML.
 *
 * The frequencies are looked up one letter at a time, in alphabet order,
 * through the XPath context. The core-alphabet entries are then normalised
 * and the ambiguous-character entries filled in by calc_ambigs().
 * The returned array has alph_size(alph, ALL_SIZE) entries.
 *
 * Caller is responsible for freeing the returned array.
 ***********************************************************************/
ARRAY_T* read_bg_freqs_from_xml(xmlXPathContextPtr xpath_ctxt, ALPH_T alph) {
  // A genuine compile-time constant, so the buffer below is an ordinary
  // array rather than a variable-length array.
  enum { MAX_XPATH_EXPRESSION = 200 };

  xmlXPathObjectPtr xpathObj = NULL;
  ATYPE value;
  ARRAY_T* bg_freqs;
  int a_size = alph_size(alph, ALPH_SIZE);

  // Use XPATH to get the background frequencies from XML
  xpathObj = xpath_query(
    xpath_ctxt,
    "//*/background_frequencies/alphabet_array/value"
  );
  int num_values = (xpathObj->nodesetval ? xpathObj->nodesetval->nodeNr : 0);
  xmlXPathFreeObject(xpathObj);

  // The number of background frequences should match the alphabet size.
  assert(num_values == a_size);

  // Allocate the array.
  bg_freqs = allocate_array(alph_size(alph, ALL_SIZE));

  // XML doesn't enforce any order on the emission probability values,
  // so force reading bg frequency values in alphabet order.
  char xpath_expression[MAX_XPATH_EXPRESSION];
  int i_node = 0;
  for (i_node = 0; i_node < a_size; i_node++) {
    // Build the XPATH expression to get bg freq for a character.
    snprintf(
      xpath_expression,
      sizeof(xpath_expression),
      "//*/background_frequencies/"
      "alphabet_array/value[@letter_id='letter_%c']",
      alph_char(alph, i_node)
    );
    // Read the selected bg frequency.
    xpathObj = xpath_query(xpath_ctxt, xpath_expression);
    // Should only find one node. The node set itself may be absent, so
    // check it before looking at the count (as the first query does).
    assert(xpathObj->nodesetval != NULL);
    assert(xpathObj->nodesetval->nodeNr == 1);
    // Decode from node set to numeric value for bg freq while the result
    // object that holds the node pointer is still alive.
    value = xmlXPathCastNodeToNumber(xpathObj->nodesetval->nodeTab[0]);
    xmlXPathFreeObject(xpathObj);
    set_array_item(i_node, value, bg_freqs);
  }

  // Make sure the frequencies add up to 1.0.
  normalize_subarray(0, a_size, 0.0, bg_freqs);

  // Fill in ambiguous characters.
  calc_ambigs(alph, FALSE, bg_freqs);

  return bg_freqs;
}
/***************************************************************************** * MEME > motifs > motif > probabilities > alphabet_matrix > /alphabet_array * Check that all letters have a probability and update the current matrix row. ****************************************************************************/ void mxml_end_probability_pos(void *ctx) { CTX_T *data; ARRAY_T *pos; int i; data = (CTX_T*)ctx; pos = get_matrix_row(data->current_pos, data->mscope.motif->freqs); for (i = 0; i < get_array_length(pos); i++) { if (get_array_item(i, pos) == -1) { local_error(data, "Probability for letter %c in position %d is missing.\n", alph_char(data->alph, i), i + 1); } } data->current_pos++; }
/***************************************************************************** * MEME > training_set > /alphabet * Read in the number of symbols in the alphabet and if it is nucleotide or * amino-acid (RNA is apparently classed as nucleotide). ****************************************************************************/ void mxml_end_alphabet(void *ctx) { PARMSG_T *message; CTX_T *data; RBNODE_T *node; char *id, symbol; bool *exists; int i; data = (CTX_T*)ctx; if (data->alph == NULL) { // Custom alphabet alph_reader_done(data->alph_rdr); // report any errors that the alphabet reader found while (alph_reader_has_message(data->alph_rdr)) { message = alph_reader_next_message(data->alph_rdr); if (message->severity == SEVERITY_ERROR) { local_error(data, "Alphabet error: %s.\n", message->message); } else { local_warning(data, "Alphabet warning: %s.\n", message->message); } parmsg_destroy(message); } // try to get an alphabet data->alph = alph_reader_alphabet(data->alph_rdr); alph_reader_destroy(data->alph_rdr); data->alph_rdr = NULL; } else { // legacy alphabet exists = mm_malloc(sizeof(bool) * alph_size_core(data->alph)); // set list to false for (i = 0; i < alph_size_core(data->alph); i++) exists[i] = false; // check that id's were defined for all the core alphabet symbols for (node = rbtree_first(data->letter_lookup); node != NULL; node = rbtree_next(node)) { id = (char*)rbtree_key(node); symbol = ((char*)rbtree_value(node))[0]; if (exists[alph_indexc(data->alph, symbol)]) { // duplicate! local_error(data, "The letter identifier %s is not the first to refer to symbol %c.\n", id, symbol); } exists[alph_indexc(data->alph, symbol)] = true; } // now check for missing identifiers for (i = 0; i < alph_size_core(data->alph); i++) { if (!exists[i]) { // missing id for symbol local_error(data, "The symbol %c does not have an assigned identifier.\n", alph_char(data->alph, i)); } } free(exists); } }
/**
 * to_str_seed
 *
 * Translates an integer encoded seed into its ascii form by mapping each
 * of the w encoded values through alph_char(). The result is null
 * terminated. The string is allocated here (via Resize) and must later be
 * freed by the caller.
 */
char *to_str_seed(
  ALPH_T *alph,     // alphabet
  uint8_t *e_seed,  // Integer encoded representation.
  int w             // The length of the string.
)
{
  char *str_seed = NULL;
  int pos = 0;

  // room for w letters plus the terminator
  Resize(str_seed, w+1, char);

  while (pos < w) {
    str_seed[pos] = alph_char(alph, e_seed[pos]);
    pos++;
  }
  str_seed[w] = '\0';

  return str_seed;
}
/*
 * Write the background frequency of every core alphabet letter to output
 * as "<letter>: <freq> " pairs with three decimals. A newline is emitted
 * before the first pair and before every ninth pair after that, so at most
 * nine pairs share a line.
 */
void mcast_print_bg_freqs(
  FILE *output,
  ARRAY_T *bgfreqs,
  MHMMSCAN_OPTIONS_T *options
)
{
  const int n_letters = alph_size(options->alphabet, ALPH_SIZE);
  int idx = 0;

  while (idx < n_letters) {
    // start a new output line every nine letters (including the first)
    if ((idx % 9) == 0) {
      fputc('\n', output);
    }
    fprintf(
      output,
      "%c: %1.3f ",
      alph_char(options->alphabet, idx),
      get_array_item(idx, bgfreqs)
    );
    idx++;
  }
}
/*
 * Load background file frequencies into the array.
 *
 * The file is read in chunks and split into lines ("\n", "\r\n" and a
 * leading "\r" are handled). Everything after a '#' is ignored. Lines
 * matching BGFREQ_RE (a single letter followed by a number) contribute one
 * letter frequency; frequencies outside (0, 1] are fatal.
 *
 * alph         in/out: when *alph is INVALID_ALPH the alphabet is guessed
 *              from the number of distinct letters found in the file.
 * bg_filename  path of the background file.
 * freqs        array to fill, or NULL to have one allocated with
 *              alph_size(*alph, ALL_SIZE) entries.
 *
 * Every core letter must be present. The ambiguous-letter entries are then
 * derived with calc_ambigs(). Returns the filled array.
 * Any problem (unreadable file, bad value, unknown, duplicate or missing
 * letter) is reported through die().
 */
ARRAY_T* get_file_frequencies(ALPH_T *alph, char *bg_filename, ARRAY_T *freqs) {
  regmatch_t matches[4];
  STR_T *line;
  char chunk[BG_CHUNK_SIZE+1], letter[2], *key;
  int size, terminate, offset, i;
  FILE *fp;
  regex_t bgfreq;
  double freq;
  RBTREE_T *letters;
  RBNODE_T *node;

  regcomp_or_die("bg freq", &bgfreq, BGFREQ_RE, REG_EXTENDED);
  letters = rbtree_create(rbtree_strcasecmp, rbtree_strcpy, free, rbtree_dblcpy, free);
  line = str_create(100);
  if (!(fp = fopen(bg_filename, "r"))) {
    die("Unable to open background file \"%s\" for reading.\n", bg_filename);
  }

  // NOTE(review): if the file ends exactly on a chunk boundary without a
  // final newline, the last partial line appears to be left unprocessed
  // (the next read returns 0 bytes and the inner loop is skipped) -- confirm.
  terminate = feof(fp);
  while (!terminate) {
    size = fread(chunk, sizeof(char), BG_CHUNK_SIZE, fp);
    // A read error never sets the end-of-file indicator, so without this
    // check the loop would spin forever on an unreadable stream.
    if (ferror(fp)) {
      die("Error reading background file \"%s\".\n", bg_filename);
    }
    chunk[size] = '\0';
    terminate = feof(fp);
    offset = 0;
    while (offset < size) {
      // skip mac newline
      if (str_len(line) == 0 && chunk[offset] == '\r') {
        offset++;
        continue;
      }
      // find next new line
      for (i = offset; i < size; ++i) {
        if (chunk[i] == '\n') break;
      }
      // append portion up to the new line or end of chunk
      str_append(line, chunk+offset, i - offset);
      // read more if we didn't find a new line
      if (i == size && !terminate) break;
      // move the offset past the new line
      offset = i + 1;
      // handle windows new line
      if (str_char(line, -1) == '\r') str_truncate(line, -1);
      // remove everything to the right of a comment character
      for (i = 0; i < str_len(line); ++i) {
        if (str_char(line, i) == '#') {
          str_truncate(line, i);
          break;
        }
      }
      // check the line for a single letter followed by a number
      if (regexec_or_die("bg freq", &bgfreq, str_internal(line), 4, matches, 0)) {
        // parse the letter and frequency value
        regex_strncpy(matches+1, str_internal(line), letter, 2);
        freq = regex_dbl(matches+2, str_internal(line));
        // check the frequency is acceptable
        if (freq < 0 || freq > 1) {
          die("The background file lists the illegal probability %g for "
            "the letter %s.\n", freq, letter);
        } else if (freq == 0) {
          die("The background file lists a probability of zero for the "
            "letter %s\n", letter);
        }
        // kept as a final guard: a value that compares false to everything
        // above (e.g. NaN) is not stored
        if (freq >= 0 && freq <= 1) rbtree_put(letters, letter, &freq);
      }
      str_clear(line);
    }
  }
  // finished with the file so clean up file parsing stuff
  fclose(fp);
  str_destroy(line, FALSE);
  regfree(&bgfreq);
  // guess the alphabet
  if (*alph == INVALID_ALPH) {
    switch (rbtree_size(letters)) {
      case PROTEIN_ASIZE:
        *alph = PROTEIN_ALPH;
        break;
      case DNA_ASIZE:
        *alph = DNA_ALPH;
        break;
      default:
        die("Number of single character entries in background does not match "
            "an alphabet.\n");
    }
  }
  // make the background; -1 marks entries that have not been set yet
  if (freqs == NULL) freqs = allocate_array(alph_size(*alph, ALL_SIZE));
  assert(get_array_length(freqs) >= alph_size(*alph, ALL_SIZE));
  init_array(-1, freqs);
  for (node = rbtree_first(letters); node != NULL; node = rbtree_next(node)) {
    key = (char*)rbtree_key(node);
    i = alph_index(*alph, key[0]);
    freq = *((double*)rbtree_value(node));
    if (i == -1) {
      die("Background contains letter %s which is not in the %s alphabet.\n",
          key, alph_name(*alph));
    }
    if (get_array_item(i, freqs) != -1) {
      die("Background contains letter %s which has the same meaning as an "
          "already listed letter.\n", key);
    }
    set_array_item(i, freq, freqs);
  }
  // check that all items were set
  for (i = 0; i < alph_size(*alph, ALPH_SIZE); i++) {
    if (get_array_item(i, freqs) == -1) {
      die("Background is missing letter %c.\n", alph_char(*alph, i));
    }
  }
  // Normalisation of the core frequencies (normalize_subarray) is
  // deliberately disabled for backwards compatability (AMA test was failing).
  // calculate the values of the ambiguous letters from the concrete ones
  calc_ambigs(*alph, FALSE, freqs);
  // cleanup
  rbtree_destroy(letters);
  // return result
  return freqs;
}
main(int argc, char **argv) { int i, j, alength; int dist = 0; ALPH_T alph = PROTEIN_ALPH; char *score_filename = NULL; char *alpha; MATRIX_T *matrix; ARRAY_T *probs; double *freqs; KARLIN_INPUT_T *karlin_input; int nscores; double sum; char usage[1000] = ""; // Define the usage message. strcat(usage, "USAGE: subst_matrix [options] <score file>\n"); strcat(usage, "\n"); strcat(usage, " Options:\n"); strcat(usage, " --dna\n"); strcat(usage, " --dist <float>\n"); strcat(usage, "\n"); // Parse the command line. while (1) { int c; int option_index = 0; const char* option_name; // Define command line options. static struct option long_options[] = { {"dna", 0, 0, 0}, {"dist", 1, 0, 0}, }; // Read the next option, and break if we're done. c = getopt_long_only(argc, argv, "+", long_options, &option_index); if (c == -1) { break; } else if (c != 0) { die("Invalid return from getopt (%d)\n", c); } // Get the option name (we only use long options). option_name = long_options[option_index].name; if (strcmp(option_name, "dna") == 0) { alph = DNA_ALPH; } else if (strcmp(option_name, "dist") == 0) { dist = atoi(optarg); } else { die("Invalid option (%s).\n", option_name); } } // Read the single required argument. if (optind + 1 != argc) { fprintf(stderr, usage); exit(1); } score_filename = argv[optind]; alength = alph_size(alph, ALPH_SIZE); /* background frequencies */ probs = allocate_array(alength); freqs = alph == DNA_ALPH ? 
pam_dna_freq : pam_prot_freq; fill_array(freqs, probs); /* copy freqs into ARRAY_T */ if (dist > 1) { printf("From gen_pam_matrix:\n"); matrix = gen_pam_matrix(alph, dist, FALSE); printf("%6c ", ' '); for (i=0; i<alength; i++) { printf("%6c ", alph_char(alph, i)); } printf("\n"); sum = 0; for (i=0; i<alength; i++) { printf("%6c ", alph_char(alph, i)); for (j=0; j<alength; j++) { double x = get_matrix_cell(i,j,matrix); sum += x; printf("%6.4f ", x); } printf("\n"); } printf("sum of entries = %f\n", sum); } printf("From get_subst_target_matrix:\n"); matrix = get_subst_target_matrix(score_filename, alph, dist, probs); } /* main */