/***********************************************************************
 * Normalize an array in log space.
 *
 * The log-space total returned by log_array_total is subtracted from
 * every element.  The array is left untouched when that total is
 * already within close_enough of 0.0.  When the total is below
 * LOG_SMALL, every element is set to LOG_ZERO instead.
 ***********************************************************************/
void log_normalize
  (ATYPE    close_enough,
   ARRAY_T* array)
{
  ATYPE log_total;
  ATYPE shifted;
  int   length;
  int   idx;

  log_total = log_array_total(array);

  /* Already normalized: nothing to do. */
  if (almost_equal(log_total, 0.0, close_enough)) {
    return;
  }

  /* Nothing of significance in the array: return all zeroes. */
  if (log_total < LOG_SMALL) {
    init_array(LOG_ZERO, array);
    return;
  }

  length = get_array_length(array);
  idx = 0;
  while (idx < length) {
    shifted = get_array_item(idx, array) - log_total;

    /* Results that drop below LOG_SMALL are clamped to LOG_ZERO. */
    if (!(shifted < LOG_SMALL)) {
      set_array_item(idx, shifted, array);
    } else {
      set_array_item(idx, LOG_ZERO, array);
    }
    idx++;
  }
}
/***********************************************************************
 * Make a DNA frequency array symmetric under complementation.
 *
 * Items 0 and 3 are both replaced by their mean, and likewise items
 * 1 and 2.  fill_in_ambiguous_chars is then called on the array.
 *
 * Assumes DNA alphabet in order ACGT.
 ***********************************************************************/
void balance_complementary_dna_freqs
  (ARRAY_T* source)
{
  enum { BASE_A = 0, BASE_C = 1, BASE_G = 2, BASE_T = 3 };

  const double mean_at =
    (get_array_item(BASE_A, source) + get_array_item(BASE_T, source)) / 2.0;
  const double mean_cg =
    (get_array_item(BASE_C, source) + get_array_item(BASE_G, source)) / 2.0;

  /* Each base receives the same value as its complement. */
  set_array_item(BASE_A, mean_at, source);
  set_array_item(BASE_T, mean_at, source);
  set_array_item(BASE_C, mean_cg, source);
  set_array_item(BASE_G, mean_cg, source);

  fill_in_ambiguous_chars(FALSE, source);
}
/***********************************************************************
 * Average each DNA base frequency with that of its complement.
 *
 * After the call items 0 and 3 hold the mean of the original items
 * 0 and 3, and items 1 and 2 hold the mean of the original items 1
 * and 2.  calc_ambigs is then called for DNA_ALPH on the array.
 *
 * Apparently no-one uses this.
 *
 * Assumes DNA alphabet in order ACGT.
 ***********************************************************************/
void balance_complementary_dna_freqs
  (ARRAY_T* source)
{
  double outer_mean;  /* Mean of items 0 (A) and 3 (T). */
  double inner_mean;  /* Mean of items 1 (C) and 2 (G). */

  outer_mean = (get_array_item(0, source) + get_array_item(3, source)) / 2.0;
  inner_mean = (get_array_item(1, source) + get_array_item(2, source)) / 2.0;

  /* A and T share one value ... */
  set_array_item(0, outer_mean, source);
  set_array_item(3, outer_mean, source);

  /* ... and so do C and G. */
  set_array_item(1, inner_mean, source);
  set_array_item(2, inner_mean, source);

  calc_ambigs(DNA_ALPH, FALSE, source);
}
/*
 * Parser callback: store the four frequencies for one motif position.
 *
 * ctx is treated as a CTX_T*.  The values are written, in the order
 * A, C, G, T, to items 0..3 of row (pos - 1) of the frequency matrix
 * of the motif currently referenced by the context.
 */
void dxml_handle_pos(void *ctx, int pos, double A, double C, double G, double T) {
  CTX_T *data = (CTX_T*)ctx;
  MOTIF_T *motif = data->motif;
  /* The row index is one less than the position passed in. */
  ARRAY_T *row = get_matrix_row(pos - 1, motif->freqs);
  const double in_order[4] = { A, C, G, T };
  int col;

  for (col = 0; col < 4; col++) {
    set_array_item(col, in_order[col], row);
  }
}
/**************************************************************************
 * get_scaled_lo_prior_dist
 *
 * Takes a scaled distribution of priors and creates a scaled distribution of
 * log odds priors. The parameters for the scaling of the input priors are
 * in the PRIOR_DIST_T data structure. The output distribution of log odds
 * priors is scaled to be in the same range as the PSSM log odds using
 * the input parameters pssm_range, pssm_scale, and pssm_offset.
 *
 * Special handling is required for a uniform distribution of priors.
 * In that case the max_prior == min_prior, and the distribution only
 * contains one bin.
 *
 * The result array has pssm_range + 1 elements and is always fully
 * initialized to zero before any mass is added.  Mass whose scaled
 * index falls outside the accepted range is dropped rather than being
 * written outside the array.
 *
 * Returns a new array containing the scaled log odds priors.
 **************************************************************************/
ARRAY_T *get_scaled_lo_prior_dist(
  PRIOR_DIST_T *prior_dist,
  double alpha,
  int pssm_range,
  double pssm_scale,
  double pssm_offset
) {

  assert(prior_dist != NULL);

  // Allocate enough space for elements in [0 ... pssm_range]
  ARRAY_T *scaled_lo_prior_dist = allocate_array(pssm_range + 1);

  // Zero the result up front so that no path (including a NULL prior_dist
  // in a build with assertions disabled) returns uninitialized memory.
  init_array(0.0L, scaled_lo_prior_dist);

  if (prior_dist == NULL) {
    return scaled_lo_prior_dist;
  }

  ARRAY_T *dist_array = get_prior_dist_array(prior_dist);
  int len_prior_dist = get_array_length(dist_array);
  double max_prior = get_prior_dist_maximum(prior_dist);
  double min_prior = get_prior_dist_minimum(prior_dist);
  double prior_dist_scale = get_prior_dist_scale(prior_dist);
  double prior_dist_offset = get_prior_dist_offset(prior_dist);

  if (max_prior == min_prior) {
    // Special case for uniform priors: all of the mass goes in one bin.
    double value = 1.0;
    double lo_prior = my_log2(alpha * max_prior / (1.0L - (alpha * max_prior)));
    // Convert lo_prior to PSSM scale
    int scaled_index = raw_to_scaled(lo_prior, 1.0L, pssm_scale, pssm_offset);
    // Only write inside the allocated range [0 ... pssm_range].
    if (scaled_index >= 0 && scaled_index <= pssm_range) {
      set_array_item(scaled_index, value, scaled_lo_prior_dist);
    }
  }
  else {
    int prior_index = 0;
    for (prior_index = 0; prior_index < len_prior_dist; ++prior_index) {
      double value = get_array_item(prior_index, dist_array);
      // Convert index giving scaled prior to raw prior.
      double scaled_prior = ((double) prior_index) + 0.5L;
      double prior
        = scaled_to_raw(scaled_prior, 1, prior_dist_scale, prior_dist_offset);
      double lo_prior = my_log2(alpha * prior / (1.0L - (alpha * prior)));
      // Scale raw lo_prior using parameters from PSSM.
      int scaled_index = raw_to_scaled(lo_prior, 1.0L, pssm_scale, pssm_offset);
      // The upper limit is unchanged; the lower limit is new and prevents
      // a negative index from being read and written.
      if (scaled_index >= 0 && scaled_index < pssm_range) {
        double old_value = get_array_item(scaled_index, scaled_lo_prior_dist);
        set_array_item(scaled_index, value + old_value, scaled_lo_prior_dist);
      }
    }
  }

  return scaled_lo_prior_dist;
}
/***********************************************************************
 * Compute the reverse complement of one DNA frequency distribution.
 *
 * Item 0 of dest receives item 3 of source, item 1 receives item 2,
 * item 2 receives item 1 and item 3 receives item 0.  All four source
 * values are read before anything is written, so source and dest may
 * be the same array.
 *
 * If source has more than four items, fill_in_ambiguous_chars is
 * called on dest afterwards.
 *
 * Assumes DNA alphabet in order ACGT.
 ***********************************************************************/
void complement_dna_freqs
  (ARRAY_T* source,
   ARRAY_T* dest)
{
  // Snapshot the inputs first; writing dest while still reading source
  // would corrupt the result when both refer to the same array.
  const double freq_a = get_array_item(0, source);
  const double freq_c = get_array_item(1, source);
  const double freq_g = get_array_item(2, source);
  const double freq_t = get_array_item(3, source);

  set_array_item(0, freq_t, dest); // A <- T
  set_array_item(1, freq_g, dest); // C <- G
  set_array_item(2, freq_c, dest); // G <- C
  set_array_item(3, freq_a, dest); // T <- A

  // Check if the frequencies have ambiguous characters;
  // for example meme does not use ambiguous characters.
  if (get_array_length(source) > 4) {
    fill_in_ambiguous_chars(FALSE, dest);
  }
}
/*
 * Parser callback: record the background frequencies.
 *
 * ctx is treated as a CTX_T*.  The callback sets file_type_match to 4,
 * sets the file-scope alphabet to DNA_ALPH, allocates a new
 * four-element background array and stores A, C, G, T in items 0..3.
 *
 * The alpha, source, file and last_mod_date arguments are not used.
 *
 * NOTE(review): any array already stored in fscope.background is
 * overwritten without being freed -- confirm this callback can only
 * fire once per file.
 */
void dxml_handle_background(void *ctx, DREME_ALPH_EN alpha, double A, double C,
    double G, double T, DREME_BG_EN source, char *file, char *last_mod_date) {
  CTX_T *data;
  ARRAY_T *bg;

  data = (CTX_T*)ctx;
  data->file_type_match = 4; // it must be a dreme xml file!
  data->fscope.alphabet = DNA_ALPH;
  data->fscope.background = allocate_array(4);
  bg = data->fscope.background;
  set_array_item(0, A, bg);
  set_array_item(1, C, bg);
  set_array_item(2, G, bg);
  set_array_item(3, T, bg);
}
/**************************************************************************
 * Get pseudocount frequencies.
 *
 * For each letter i of the basic alphabet the pseudocount is
 *     g[i] = sum over j of (f[j] / b[j]) * target_freq[i][j]
 * with j also ranging over the basic alphabet only.
 *
 * The target_freq matrix only has values for the basic alphabet, so
 * the entries for ambiguous characters are filled in afterwards by
 * calc_ambigs.
 *
 * Returns a newly allocated array sized for the full alphabet.
 **************************************************************************/
ARRAY_T *get_pseudocount_freqs(
  ALPH_T alph,
  ARRAY_T * f,            /* Foreground distribution. */
  ARRAY_T * b,            /* Background distribution. */
  MATRIX_T * target_freq  /* Target frequency matrix. */
)
{
  const int core_size = alph_size(alph, ALPH_SIZE);            // excludes ambigs
  ARRAY_T *pseudo = allocate_array(alph_size(alph, ALL_SIZE)); // includes ambigs
  int row;
  int col;

  /* Create pseudocount frequencies. */
  for (row = 0; row < core_size; row++) {      /* non-ambiguous freqs */
    double acc = 0;

    col = 0;
    while (col < core_size) {                  /* non-ambiguous freqs */
      double target = get_matrix_cell(row, col, target_freq);
      double fg = get_array_item(col, f);
      double bg = get_array_item(col, b);
      acc += (fg/bg) * target;
      col++;
    }

    set_array_item(row, acc, pseudo);
    if (SUBST_MATRIX_DEBUG) {
      printf("%g %g, ", get_array_item(row, f), acc);
    }
  }

  calc_ambigs(alph, FALSE, pseudo);            /* fills in the ambiguous entries */
  if (SUBST_MATRIX_DEBUG) {
    printf("\n");
  }

  return(pseudo);                              /* return the pseudocounts */
} /* get_pseudocount_freqs */
/***********************************************************************
 * Mix two arrays in log space.
 *
 * Every element of array2 is replaced by
 *     LOG_SUM(log2(1 - mixing) + array1[i], log2(mixing) + array2[i]).
 * array1 is only read.  Both arrays must be non-NULL and of equal
 * length, and mixing must lie in [0, 1]; otherwise the program dies.
 ***********************************************************************/
void mix_log_arrays
  (float    mixing, /* Percent of array2 that will be retained. */
   ARRAY_T* array1,
   ARRAY_T* array2)
{
  ATYPE blended;
  int   length;
  int   idx;

  check_null_array(array1);
  check_null_array(array2);

  /* The two arrays have to be the same length. */
  check_array_dimensions(TRUE, array1, array2);

  /* Reject a mixing parameter outside [0, 1]. */
  if ((mixing > 1.0) || (mixing < 0.0)) {
    die("Invalid mixing parameter (%g).\n", mixing);
  }

  length = get_array_length(array1);
  idx = 0;
  while (idx < length) {
    blended = LOG_SUM(my_log2(1.0 - mixing) + get_array_item(idx, array1),
                      my_log2(mixing) + get_array_item(idx, array2));
    set_array_item(idx, blended, array2);
    idx++;
  }
}
/***********************************************************************
 * Read an array of unknown length from a file.
 *
 * Values are scanned with the ASCAN format until fscanf stops
 * returning 1.  The array starts with room for 100 items and is
 * doubled whenever it fills up; it is trimmed to the number of items
 * actually read before being returned.
 *
 * Dies if the file cannot be opened, or if scanning stops on input
 * that does not match ASCAN (fscanf returns 0).
 *
 * Caller is responsible for freeing the array.
 ***********************************************************************/
ARRAY_T *read_array_from_file(const char* filename) {

  int capacity = 100;
  int num_items = 0;
  int num_read = 0;
  ATYPE value;

  FILE *array_file = fopen(filename, "r");
  if (array_file == NULL) {
    die(
      "Unable to open file: %s.\nError message: %s.\n",
      filename,
      strerror(errno)
    );
  }

  ARRAY_T *array = allocate_array(capacity);
  while ((num_read = fscanf(array_file, ASCAN, &value)) == 1) {
    set_array_item(num_items, value, array);
    ++num_items;
    if (num_items >= capacity) {
      capacity = 2 * capacity;
      // resize_array returns the array to use from now on; keep that
      // pointer instead of assuming the original is still the right one.
      array = resize_array(array, capacity);
    }
  }

  // A return of 0 (as opposed to EOF) means unparsable input was found.
  if (num_read == 0) {
    die("Error reading array at position %d.\n", num_items);
  }
  fclose(array_file);

  // Trim the array to the number of items actually read.
  array = resize_array(array, num_items);
  return array;
}
/***********************************************************************
 * A helper function to normalize a subarray.
 *
 * The `length` items starting at `start_index` are divided by their
 * sum.  Nothing is changed when that sum is already within
 * `tolerance` of 1.0.  The sum is asserted to be non-zero.
 ***********************************************************************/
void normalize_subarray(
  int start_index,
  int length,
  double tolerance,
  ARRAY_T* array
)
{
  const int end_index = start_index + length;  /* One past the last item. */
  ATYPE sum = 0.0;
  int pos;

  /* Add up the items in the window. */
  for (pos = start_index; pos < end_index; pos++) {
    sum += get_array_item(pos, array);
  }
  assert(sum != 0.0);

  /* Close enough to normalized already. */
  if (almost_equal(1.0, sum, tolerance)) {
    return;
  }

  /* Scale every item in the window by the sum. */
  pos = start_index;
  while (pos < end_index) {
    set_array_item(pos, get_array_item(pos, array) / sum, array);
    pos++;
  }
}
/***********************************************************************
 * Read the background letter frequencies from XML.
 *
 * The number of background_frequencies/alphabet_array/value nodes is
 * asserted to equal the size of the basic alphabet.  Each letter's
 * value is then looked up by its letter_id attribute, so the result
 * is in alphabet order regardless of the order in the document.  The
 * basic-alphabet entries are normalized and calc_ambigs fills in the
 * remaining entries.
 *
 * Returns a newly allocated array sized for the full alphabet.
 * Caller is responsible for freeing the returned array.
 ***********************************************************************/
ARRAY_T* read_bg_freqs_from_xml(xmlXPathContextPtr xpath_ctxt, ALPH_T alph) {

  // A true compile-time constant, so the buffer below is an ordinary
  // fixed-size array rather than a variable-length array.
  enum { MAX_XPATH_EXPRESSION = 200 };

  xmlXPathObjectPtr xpathObj = NULL;
  ATYPE    value;
  ARRAY_T* bg_freqs;

  int a_size = alph_size(alph, ALPH_SIZE);

  // Use XPATH to get the background frequencies from XML
  xpathObj = xpath_query(
    xpath_ctxt,
    "//*/background_frequencies/alphabet_array/value"
  );
  int num_values = (xpathObj->nodesetval ? xpathObj->nodesetval->nodeNr : 0);
  xmlXPathFreeObject(xpathObj);

  // The number of background frequencies should match the alphabet size.
  assert(num_values == a_size);

  // Allocate the array.
  bg_freqs = allocate_array(alph_size(alph, ALL_SIZE));

  // XML doesn't enforce any order on the emission probability values,
  // so force reading bg frequency values in alphabet order.
  char xpath_expression[MAX_XPATH_EXPRESSION];
  xmlNodePtr currValueNode = NULL;
  int i_node = 0;
  for (i_node = 0; i_node < a_size; i_node++) {
    // Build the XPATH expression to get bg freq for a character.
    snprintf(
      xpath_expression,
      sizeof(xpath_expression),
      "//*/background_frequencies/"
      "alphabet_array/value[@letter_id='letter_%c']",
      alph_char(alph, i_node)
    );
    // Read the selected bg frequency.
    xpathObj = xpath_query(xpath_ctxt, xpath_expression);
    // Should only find one node (an empty result may have no node set).
    assert(xpathObj->nodesetval != NULL);
    assert(xpathObj->nodesetval->nodeNr == 1);
    // Decode from node set to numeric value for bg freq.  Do this while
    // the query result is still alive, and release the result afterwards.
    currValueNode = xpathObj->nodesetval->nodeTab[0];
    value = xmlXPathCastNodeToNumber(currValueNode);
    xmlXPathFreeObject(xpathObj);
    set_array_item(i_node, value, bg_freqs);
  }

  // Make sure the frequencies add up to 1.0.
  normalize_subarray(0, a_size, 0.0, bg_freqs);

  // Fill in ambiguous characters.
  calc_ambigs(alph, FALSE, bg_freqs);

  return bg_freqs;
}
/***********************************************************************
 * Fill an array with a given raw array of values.
 *
 * Copies get_array_length(array) values from raw_array, so raw_array
 * must supply at least that many.
 ***********************************************************************/
void fill_array
  (ATYPE*   raw_array,
   ARRAY_T* array)
{
  const int count = get_array_length(array);
  int idx = 0;

  while (idx < count) {
    set_array_item(idx, raw_array[idx], array);
    idx++;
  }
}
/*
 * Build the input for the Karlin statistics calculation.
 *
 * Scans the square scoring matrix for its lowest and highest scores,
 * then accumulates, for every score s, the probability pi*pj of all
 * letter pairs (i, j) scoring s, together with the expected score.
 *
 * Dies unless the lowest score is negative and the highest positive.
 *
 * Returns a newly allocated KARLIN_INPUT_T holding the low and high
 * scores, the expected score and the score probability array (indexed
 * by score - low).
 */
KARLIN_INPUT_T *make_karlin_input(
  MATRIX_T *matrix,    /* scoring matrix */
  ARRAY_T *probs       /* letter freq distribution */
)
{
  const int alen = get_num_rows(matrix);   /* size of alphabet */
  long lowest = 1;
  long highest = -1;
  double escore = 0;
  int nscores;
  int row, col;
  ARRAY_T *score_probs;
  KARLIN_INPUT_T *result;                  /* data to return */

  /* Pass 1: find the extreme scores in the scoring matrix. */
  for (row = 0; row < alen; row++) {
    for (col = 0; col < alen; col++) {
      double cell = get_matrix_cell(row, col, matrix);
      if (cell < lowest) {
        lowest = cell;
      }
      if (cell > highest) {
        highest = cell;
      }
    }
  }

  if (lowest >= 0) {
    die("Lowest score in scoring matrix must be negative, is %f.", (double)lowest);
  }
  if (highest<= 0) {
    die("Highest score in scoring matrix must be positve, is %f.", (double)highest);
  }

  /* One zero-initialized slot per score in [lowest, highest]. */
  nscores = highest - lowest + 1;
  score_probs = allocate_array(nscores);
  init_array(0, score_probs);

  /* Pass 2: accumulate the probability of each score. */
  for (row = 0; row < alen; row++) {
    for (col = 0; col < alen; col++) {
      int score = get_matrix_cell(row, col, matrix);
      double p_row = get_array_item(row, probs);
      double p_col = get_array_item(col, probs);
      double so_far = get_array_item(score-lowest, score_probs);
      /* cumulative prob. of score */
      set_array_item(score-lowest, so_far + p_row*p_col, score_probs);
      escore += p_row*p_col*score;
    }
  }

  result = (KARLIN_INPUT_T *)mm_malloc(sizeof(KARLIN_INPUT_T));
  result->low = lowest;
  result->high = highest;
  result->escore = escore;
  result->prob = score_probs;

  return(result);
} /* make_karlin_input */
/*
 * Load uniform frequencies into the array.
 *
 * Each of the first ALPH_ASIZE[alph] items is set to one over that
 * size; calc_ambigs then fills in the rest.  If freqs is NULL a new
 * array sized for the full alphabet is allocated; otherwise freqs is
 * asserted to be at least that long.
 *
 * Returns the array that was filled.
 */
ARRAY_T* get_uniform_frequencies(ALPH_T alph, ARRAY_T *freqs) {
  const int core_size = ALPH_ASIZE[alph];
  int idx;

  if (freqs == NULL) {
    freqs = allocate_array(alph_size(alph, ALL_SIZE));
  }
  assert(get_array_length(freqs) >= alph_size(alph, ALL_SIZE));

  idx = 0;
  while (idx < core_size) {
    set_array_item(idx, 1.0/core_size, freqs);
    idx++;
  }

  calc_ambigs(alph, FALSE, freqs);
  return freqs;
}
/***********************************************************************
 * Initialize a given array with a given value.
 *
 * Every item, from index 0 up to get_array_length(array) - 1, is set
 * to `value`.
 ***********************************************************************/
void init_array
  (ATYPE    value,
   ARRAY_T* array)
{
  const int count = get_array_length(array);
  int idx = 0;

  while (idx < count) {
    set_array_item(idx, value, array);
    idx++;
  }
}
/*
 * Replace the elements of an array of frequencies with the average
 * over complementary bases.
 *
 * The entries for 'A' and 'T' both become the mean of the two, as do
 * the entries for 'G' and 'C'.  Indices are looked up via alph_index;
 * the alphabet is asserted to be DNA_ALPH.
 */
void average_freq_with_complement(ALPH_T alph, ARRAY_T *freqs) {
  assert(alph == DNA_ALPH);

  const int idx_a = alph_index(alph, 'A');
  const int idx_t = alph_index(alph, 'T');
  const int idx_g = alph_index(alph, 'G');
  const int idx_c = alph_index(alph, 'C');

  const double mean_at =
    (get_array_item(idx_a, freqs) + get_array_item(idx_t, freqs)) / 2.0;
  const double mean_gc =
    (get_array_item(idx_g, freqs) + get_array_item(idx_c, freqs)) / 2.0;

  /* Write each mean back to both members of its pair. */
  set_array_item(idx_a, mean_at, freqs);
  set_array_item(idx_t, mean_at, freqs);
  set_array_item(idx_g, mean_gc, freqs);
  set_array_item(idx_c, mean_gc, freqs);
}
/***********************************************************************
 * Convert an array out of log space, in place.
 *
 * Every item x is replaced by EXP2(x).  The array must not be NULL.
 ***********************************************************************/
void unlog_array
  (ARRAY_T* array)
{
  int length;
  int idx;

  check_null_array(array);

  length = get_array_length(array);
  idx = 0;
  while (idx < length) {
    set_array_item(idx, EXP2(get_array_item(idx, array)), array);
    idx++;
  }
}
/*
 * Take the counts from an ambiguous character and evenly distribute
 * them among the corresponding concrete characters.
 *
 * The ambiguous count is divided by the number of characters in
 * concrete_chars (a subtraction of log2 of that number) and the share
 * is LOG_SUMmed into each concrete character's entry.  The ambiguous
 * character's own entry is then set to LOG_ZERO.
 *
 * This function operates in log space.
 */
static void dist_ambig(ALPH_T alph, char ambig, char *concrete_chars,
    ARRAY_T* freqs) {
  const int ambig_pos = alph_index(alph, ambig);
  const int n_targets = strlen(concrete_chars);
  PROB_T share;
  PROB_T updated;
  int target_pos;
  int k;

  // The count to hand out, split evenly between the targets.
  share = get_array_item(ambig_pos, freqs);
  share -= my_log2((PROB_T)n_targets);

  // Add one share to every concrete character.
  k = 0;
  while (k < n_targets) {
    target_pos = alph_index(alph, concrete_chars[k]);
    updated = get_array_item(target_pos, freqs);
    updated = LOG_SUM(updated, share);
    set_array_item(target_pos, updated, freqs);
    k++;
  }

  // Nothing is left on the ambiguous character.
  set_array_item(ambig_pos, LOG_ZERO, freqs);
}
/***********************************************************************
 * Multiply each element of an array by a scalar value.
 *
 * The array is modified in place and must not be NULL.
 ***********************************************************************/
void scalar_mult
  (ATYPE    value,
   ARRAY_T* array)
{
  int length;
  int idx;

  check_null_array(array);

  length = get_array_length(array);
  idx = 0;
  while (idx < length) {
    set_array_item(idx, get_array_item(idx, array) * value, array);
    idx++;
  }
}
/***********************************************************************
 * Fill an array with random values between 0 and a given maximum.
 *
 * Each item is set to my_drand() * magnitude, drawing one random
 * number per item in index order.  The array must not be NULL.
 *
 * Assumes that the random number generator is initialized.
 ***********************************************************************/
void randomize_array
  (ATYPE    magnitude,
   ARRAY_T* array)
{
  int length;
  int idx;

  check_null_array(array);

  length = get_array_length(array);
  idx = 0;
  while (idx < length) {
    set_array_item(idx, my_drand() * magnitude, array);
    idx++;
  }
}
/************************************************************************
 * Compute the indices and values of transitions to or from a state.
 *
 * For every cell (i_row, i_col) of the HMM's transition matrix that
 * is_zero does not report as zero, the transition is appended to the
 * outgoing lists of state i_row (itrans_out / trans_out) and to the
 * incoming lists of state i_col (itrans_in / trans_in), and the
 * matching ntrans_out / ntrans_in counter is incremented.
 *
 * NOTE(review): the counters are incremented from whatever value they
 * already hold, so this presumably expects them to be zero on entry --
 * confirm against the caller.
 ************************************************************************/
void compute_ins_and_outs
  (MHMM_T* the_hmm,
   BOOLEAN_T log_form) /* Is the transition matrix in log form? */
{
  int i_row, i_col;
  int n = the_hmm->num_states;
  MATRIX_T *trans = the_hmm->trans;

  //
  // Visit the transition matrix cells just once each
  // to update ntrans, itrans and trans arrays.
  // This is quadratic in n.
  //
  for (i_row = 0; i_row < n; i_row++) {
    for (i_col = 0; i_col < n; i_col++) {
      double p;          // The transition probability.
      int old_n, new_n;  // Number of transitions before / after appending.
      // Only cells that is_zero does not treat as zero are recorded.
      if (!is_zero((p = get_matrix_cell(i_row, i_col, trans)), log_form)) {
        MHMM_STATE_T * out_state = &(the_hmm->states[i_row]);
        MHMM_STATE_T * in_state = &(the_hmm->states[i_col]);
        // out: grow both outgoing lists by one and append at the old end.
        old_n = out_state->ntrans_out;
        new_n = ++out_state->ntrans_out;
        mm_resize(out_state->itrans_out, new_n, int);
        out_state->trans_out = resize_array(out_state->trans_out, new_n);
        out_state->itrans_out[old_n] = i_col;
        set_array_item(old_n, p, out_state->trans_out);
        // in: the same for the incoming lists of the destination state.
        old_n = in_state->ntrans_in;
        new_n = ++in_state->ntrans_in;
        mm_resize(in_state->itrans_in, new_n, int);
        in_state->trans_in = resize_array(in_state->trans_in, new_n);
        in_state->itrans_in[old_n] = i_row;
        set_array_item(old_n, p, in_state->trans_in);
      }
    } // col
  } // row
/*
 * Load the non-redundant database frequencies into the array.
 *
 * The first ALPH_ASIZE[alph] items are copied from ALPH_NRDB[alph]
 * and normalized; calc_ambigs then fills in the rest.  If freqs is
 * NULL a new array sized for the full alphabet is allocated;
 * otherwise freqs is asserted to be at least that long.
 *
 * Returns the array that was filled.
 */
ARRAY_T* get_nrdb_frequencies(ALPH_T alph, ARRAY_T *freqs) {
  const int core_size = ALPH_ASIZE[alph];
  const PROB_T *table;
  int idx;

  if (freqs == NULL) {
    freqs = allocate_array(alph_size(alph, ALL_SIZE));
  }
  assert(get_array_length(freqs) >= alph_size(alph, ALL_SIZE));

  table = ALPH_NRDB[alph];
  idx = 0;
  while (idx < core_size) {
    set_array_item(idx, table[idx], freqs);
    idx++;
  }

  normalize_subarray(0, core_size, 0.0, freqs);
  calc_ambigs(alph, FALSE, freqs);
  return freqs;
}
/***********************************************************************
 * Return one column of a motif, as a newly allocated array of counts.
 *
 * Item i of the result is motif->num_sites multiplied by the motif's
 * frequency for letter i at the given position.  The result has
 * motif->alph_size items.
 ***********************************************************************/
ARRAY_T* get_motif_counts
  (int      position,
   MOTIF_T* motif)
{
  ARRAY_T* counts = allocate_array(motif->alph_size);
  int letter = 0;

  while (letter < motif->alph_size) {
    set_array_item(letter,
                   motif->num_sites *
                   get_matrix_cell(position, letter, motif->freqs),
                   counts);
    letter++;
  }

  return(counts);
}
/*************************************************************************
 * Convert spacer states in a given HMM to free-insertion modules.
 * Every outgoing transition of every SPACER_STATE is set to 1.0.
 *
 * Note that this function only affects the transitions stored in each
 * individual state.  It is assumed that build_transition_matrix will
 * be called subsequently.
 *************************************************************************/
static void convert_to_fims
  (MHMM_T *the_hmm)
{
  int state_idx;          /* Index of the state being examined. */
  int out_idx;            /* Index of an outgoing transition. */
  MHMM_STATE_T *state;    /* The state being examined. */

  for (state_idx = 0; state_idx < the_hmm->num_states; state_idx++) {
    state = &(the_hmm->states[state_idx]);

    /* Only spacer states are converted. */
    if (state->type != SPACER_STATE) {
      continue;
    }

    for (out_idx = 0; out_idx < state->ntrans_out; out_idx++) {
      set_array_item(out_idx, 1.0, state->trans_out);
    }
  }
}
/***********************************************************************
 * Divide corresponding elements in two arrays.
 *
 * Each item of array2 is replaced by array1[i] / array2[i]; array1 is
 * only read.  Both arrays must be non-NULL and of the same length.
 ***********************************************************************/
void dot_divide
  (ARRAY_T* array1,
   ARRAY_T* array2)
{
  int length;
  int idx;

  check_null_array(array1);
  check_null_array(array2);
  check_array_dimensions(TRUE, array1, array2);

  length = get_array_length(array1);
  idx = 0;
  while (idx < length) {
    /* The quotient goes into the second array. */
    set_array_item(idx,
                   get_array_item(idx, array1) / get_array_item(idx, array2),
                   array2);
    idx++;
  }
}
/*
 * Make one position in an array the sum of a set of other positions.
 *
 * The entry for the character `ambig` is overwritten with the sum of
 * the entries for every character in the NUL-terminated string
 * `sources`.  When log_space is true the entries are combined with
 * LOG_SUM, otherwise with ordinary addition.
 *
 * In log space the running sum is seeded with the first source value.
 * Starting it at 0.0 would be wrong there, because 0.0 is the log of
 * one, not of zero, and would inflate the result.
 *
 * If `sources` is empty the entry is set to 0.0 in either mode.
 */
static void calc_ambig(ALPH_T alph, BOOLEAN_T log_space, char ambig,
    char *sources, ARRAY_T *array) {
  char *source;
  PROB_T sum;
  PROB_T value;

  sum = 0.0;
  for (source = sources; *source != '\0'; ++source) {
    value = get_array_item(alph_index(alph, *source), array);
    if (!log_space) {
      sum += value;
    } else if (source == sources) {
      // First term: take it as-is instead of adding it to log(1).
      sum = value;
    } else {
      sum = LOG_SUM(sum, value);
    }
  }
  set_array_item(alph_index(alph, ambig), sum, array);
}
/***********************************************************************
 * Read an array of known length from a file.
 *
 * Exactly get_array_length(array) values are scanned from infile with
 * the ASCAN format and stored in order.  Dies, reporting the item
 * index, as soon as a value cannot be read.
 ***********************************************************************/
void read_array
  (FILE *   infile,
   ARRAY_T* array)
{
  ATYPE scanned;
  int   length;
  int   idx;

  check_null_array(array);

  length = get_array_length(array);
  idx = 0;
  while (idx < length) {
    const int converted = fscanf(infile, ASCAN, &scanned);
    if (converted != 1) {
      die("Error reading array at position %d.\n", idx);
    }
    set_array_item(idx, scanned, array);
    idx++;
  }
}
/***********************************************************************
 * Create a bootstrapped copy of the given array.
 *
 * Builds an array of num_items values, each copied from a randomly
 * chosen position of source_array (the same position may be chosen
 * more than once).
 *
 * Allocates the bootstrap array; must be freed by caller.
 * Assumes that the random number generator is initialized.
 ***********************************************************************/
ARRAY_T* bootstrap_array
  (ARRAY_T* source_array,
   int      num_items)
{
  ARRAY_T* sample;
  int      pool_size;
  int      slot;

  check_null_array(source_array);

  // Allocate the bootstrap array.
  sample = allocate_array(num_items);

  // Draw one source position per output slot.
  pool_size = get_array_length(source_array);
  slot = 0;
  while (slot < num_items) {
    int pick = (int)(my_drand() * (double)(pool_size));
    ATYPE drawn = get_array_item(pick, source_array);
    set_array_item(slot, drawn, sample);
    slot++;
  }

  return(sample);
}
/***********************************************************************
 * Remove one item from an array, shifting everything else left.
 *
 * Items after item_index move down by one position, the storage is
 * shrunk to fit, and the item count is decremented.
 *
 * Dies if item_index is not a valid index of the array (which covers
 * an empty array) or if the storage cannot be re-allocated.
 *
 * When the last remaining item is removed the storage is kept as-is:
 * a zero-byte realloc may return NULL, which would otherwise be
 * mistaken for an allocation failure.
 ***********************************************************************/
void remove_array_item
  (int      item_index,
   ARRAY_T* array)
{
  int i_item;
  int num_items;
  int new_count;

  num_items = get_array_length(array);

  // Refuse indices that would read or write outside the array.
  if (item_index < 0 || item_index >= num_items) {
    die("Invalid array index (%d).\n", item_index);
  }

  // Shift everything left one position.
  for (i_item = item_index + 1; i_item < num_items; i_item++) {
    set_array_item(i_item - 1, get_array_item(i_item, array), array);
  }

  // Reallocate, keeping the old pointer until the new one is known good.
  new_count = num_items - 1;
  if (new_count > 0) {
    ATYPE* shrunk = (ATYPE*)mm_realloc(array->items,
                                       sizeof(ATYPE) * new_count);
    if (shrunk == NULL) {
      die("Error re-allocating array.\n");
    }
    array->items = shrunk;
  }
  array->num_items = new_count;
}