Beispiel #1
0
/***********************************************************************
 * Returns the string that is the best possible match to the given motif.
 *
 * For every motif position the core-alphabet letter with the largest
 * frequency is chosen. When several letters tie, the one with the
 * highest alphabet index wins (the comparison is ">="). Columns for
 * ambiguous letters, if the motif has them, are never considered.
 *
 * The returned string has motif->length characters plus a terminating
 * null. Caller is responsible for freeing string.
 ***********************************************************************/
char *get_best_possible_match(MOTIF_T *motif) {
  int mpos, apos, asize; 
  char *match_string;
  ALPH_SIZE_T size;

  // Validate the motif before reading any of its fields.
  assert(motif != NULL);
  assert(motif->freqs != NULL);
  assert(motif->length == motif->freqs->num_rows);
  size = (motif->flags & MOTIF_HAS_AMBIGS ? ALL_SIZE : ALPH_SIZE);
  assert(alph_size(motif->alph, size) == motif->freqs->num_cols); 

  // Only the core letters compete for the best match.
  asize = alph_size(motif->alph, ALPH_SIZE);

  match_string = mm_malloc(sizeof(char) * (motif->length + 1));

  // Find the highest scoring character at each position in the motif.
  for(mpos = 0; mpos < motif->length; ++mpos) {
    ARRAY_T *row = motif->freqs->rows[mpos];
    double max_v = row->items[0];
    int max_i = 0;
    for(apos = 1; apos < asize; ++apos) {
      if (row->items[apos] >= max_v) {
        max_i = apos;
        max_v = row->items[apos];
      }
    }
    match_string[mpos] = alph_char(motif->alph, max_i);
  }

  //  Add null termination
  match_string[motif->length] = '\0';

  return match_string;
}
Beispiel #2
0
/**************************************************************************
 * Get pseudocount frequencies.
 *
 * For each core letter i the pseudocount is
 *     g[i] = sum over core letters j of (f[j] / b[j]) * target_freq[i][j]
 *
 * The result array is sized for the full alphabet (ALL_SIZE). The slots
 * past the core letters are filled by calc_ambigs() afterwards
 * (presumably with the average over the letters each ambiguous
 * character matches - confirm against calc_ambigs).
 *
 * Returns a newly allocated array.
 **************************************************************************/
ARRAY_T *get_pseudocount_freqs(
   ALPH_T alph,
   ARRAY_T *f,             /* Foreground distribution. */
   ARRAY_T *b,             /* Background distribution. */
   MATRIX_T *target_freq   /* Target frequency matrix. */
)
{
  const int n_core = alph_size(alph, ALPH_SIZE);          /* core letters only */
  ARRAY_T *pseudo = allocate_array(alph_size(alph, ALL_SIZE)); /* core + ambigs */
  int row, col;

  for (row = 0; row < n_core; row++) {
    double acc = 0;
    for (col = 0; col < n_core; col++) {
      double q = get_matrix_cell(row, col, target_freq);
      double fg = get_array_item(col, f);
      double bg = get_array_item(col, b);
      acc += (fg/bg) * q;
    }
    set_array_item(row, acc, pseudo);
    if (SUBST_MATRIX_DEBUG) {
      printf("%g %g, ", get_array_item(row, f), acc);
    }
  }

  /* Derive the ambiguous-letter entries from the core ones. */
  calc_ambigs(alph, FALSE, pseudo);
  if (SUBST_MATRIX_DEBUG) {
    printf("\n");
  }

  return pseudo;
} /* get_pseudocount_freqs */
Beispiel #3
0
/***********************************************************************
 * Turn a given motif into its own reverse complement.
 *
 * The frequency matrix (and the score matrix, when present) is
 * reversed row-wise with each row complemented, the left/right trim
 * values are exchanged and the strand indicator is flipped.
 * Only DNA motifs are accepted (asserted).
 ***********************************************************************/
void reverse_complement_motif
  (MOTIF_T* a_motif)
{
  ALPH_SIZE_T width_kind;
  ARRAY_T* scratch;    // Holds a copy of the near-end row during the swap.
  ARRAY_T* near_row;
  ARRAY_T* far_row;
  int pos, half, saved_trim;

  assert(a_motif->alph == DNA_ALPH);

  // Rows are paired from the outside in; the middle row of an odd-length
  // motif is paired with itself.
  half = (int)((a_motif->length + 1) / 2);

  // Frequencies: the row width depends on whether ambiguous columns exist.
  width_kind = (a_motif->flags & MOTIF_HAS_AMBIGS ? ALL_SIZE : ALPH_SIZE);
  scratch = allocate_array(alph_size(a_motif->alph, width_kind));
  for (pos = 0; pos < half; pos++) {
    near_row = get_matrix_row(pos, a_motif->freqs);
    far_row = get_matrix_row(a_motif->length - (pos + 1), a_motif->freqs);

    copy_array(near_row, scratch);
    complement_dna_freqs(far_row, near_row);
    complement_dna_freqs(scratch, far_row);
  }
  free_array(scratch);

  // Scores: same procedure, always core-alphabet width.
  if (a_motif->scores) {
    scratch = allocate_array(alph_size(a_motif->alph, ALPH_SIZE));
    for (pos = 0; pos < half; pos++) {
      near_row = get_matrix_row(pos, a_motif->scores);
      far_row = get_matrix_row(a_motif->length - (pos + 1), a_motif->scores);

      copy_array(near_row, scratch);
      complement_dna_freqs(far_row, near_row);
      complement_dna_freqs(scratch, far_row);
    }
    free_array(scratch);
  }

  // Exchange the trimming values.
  saved_trim = a_motif->trim_left;
  a_motif->trim_left = a_motif->trim_right;
  a_motif->trim_right = saved_trim;

  // Flip the strand; anything other than '-' (including '?') counts as '+'.
  set_motif_strand((get_motif_strand(a_motif) == '-') ? '+' : '-', a_motif);
}
Beispiel #4
0
/***********************************************************************
 * Read the background letter frequencies from XML.
 *
 * One value per core letter is looked up by its letter_id attribute so
 * that the result is in alphabet order regardless of document order.
 * The core frequencies are normalized and the ambiguous-letter slots
 * are then filled by calc_ambigs().
 *
 * The returned array has alph_size(alph, ALL_SIZE) entries.
 * Caller is responsible for freeing the returned array.
 ***********************************************************************/
ARRAY_T* read_bg_freqs_from_xml(xmlXPathContextPtr xpath_ctxt, ALPH_T alph) {

  xmlXPathObjectPtr xpathObj = NULL;
  ATYPE    value;
  ARRAY_T* bg_freqs;
  // Fixed-size buffer for the per-letter XPath expression.
  char xpath_expression[200];
  int num_values;
  int i_node;

  int a_size = alph_size(alph, ALPH_SIZE);

  // Use XPATH to get the background frequencies from XML
  xpathObj = xpath_query(
    xpath_ctxt, 
    "//*/background_frequencies/alphabet_array/value"
  );
  assert(xpathObj != NULL);
  num_values = (xpathObj->nodesetval ? xpathObj->nodesetval->nodeNr : 0);
  xmlXPathFreeObject(xpathObj);

  // The number of background frequences should match the alphabet size.
  assert(num_values == a_size);

  // Allocate the array.
  bg_freqs= allocate_array(alph_size(alph, ALL_SIZE));

  // XML doesn't enforce any order on the emission probability values,
  // so force reading bg frequency values in alphabet order.
  for (i_node = 0; i_node < a_size; i_node++) {
    // Build the XPATH expression to get bg freq for a character.
    snprintf(
      xpath_expression,
      sizeof(xpath_expression),
      "//*/background_frequencies/"
      "alphabet_array/value[@letter_id='letter_%c']",
      alph_char(alph, i_node)
    );
    // Read the selected bg frequency.
    xpathObj = xpath_query(xpath_ctxt, xpath_expression);
    // Should only find one node. The node set itself may be absent, so
    // check it before looking at its size.
    assert(xpathObj != NULL);
    assert(xpathObj->nodesetval != NULL);
    assert(xpathObj->nodesetval->nodeNr == 1);
    // Decode from node set to numeric value for bg freq while the result
    // object that owns the node table is still alive, then release it.
    value = xmlXPathCastNodeToNumber(xpathObj->nodesetval->nodeTab[0]);
    xmlXPathFreeObject(xpathObj);
    set_array_item(i_node, value, bg_freqs);
  }

  // Make sure the frequencies add up to 1.0. 
  normalize_subarray(0, a_size, 0.0, bg_freqs);

  // Fill in ambiguous characters. 
  calc_ambigs(alph, FALSE, bg_freqs);

  return bg_freqs;

}
/*
 * Load uniform frequencies into the array.
 *
 * Every core letter gets 1/(number of core letters); calc_ambigs() then
 * fills the remaining slots. If freqs is NULL a new array sized for the
 * full alphabet is allocated; otherwise freqs must already have at least
 * that many entries (asserted). Returns the array that was filled.
 */
ARRAY_T* get_uniform_frequencies(ALPH_T alph, ARRAY_T *freqs) {
  int letter;
  int core = ALPH_ASIZE[alph];

  if (freqs == NULL) {
    freqs = allocate_array(alph_size(alph, ALL_SIZE));
  }
  assert(get_array_length(freqs) >= alph_size(alph, ALL_SIZE));

  letter = 0;
  while (letter < core) {
    set_array_item(letter, 1.0/core, freqs);
    letter++;
  }

  calc_ambigs(alph, FALSE, freqs);
  return freqs;
}
/*
 * Load the non-redundant database frequencies into the array.
 *
 * The core-letter values are copied from the ALPH_NRDB table for the
 * alphabet, normalized, and then calc_ambigs() fills the remaining
 * slots. If freqs is NULL a new array sized for the full alphabet is
 * allocated; otherwise freqs must already have at least that many
 * entries (asserted). Returns the array that was filled.
 */
ARRAY_T* get_nrdb_frequencies(ALPH_T alph, ARRAY_T *freqs) {
  const PROB_T *table;
  int letter;
  int core = ALPH_ASIZE[alph];

  if (freqs == NULL) {
    freqs = allocate_array(alph_size(alph, ALL_SIZE));
  }
  assert(get_array_length(freqs) >= alph_size(alph, ALL_SIZE));

  table = ALPH_NRDB[alph];
  letter = 0;
  while (letter < core) {
    set_array_item(letter, table[letter], freqs);
    ++letter;
  }

  normalize_subarray(0, core, 0.0, freqs);
  calc_ambigs(alph, FALSE, freqs);
  return freqs;
}
Beispiel #7
0
/***********************************************************************
 * Compute the complexity of a motif.
 *
 * The "motif background" is the mean of all motif positions (column
 * sums of the frequency matrix divided by the motif length). The
 * complexity is the K-L distance
 *
 *  \sum_i p_i log(p_i/f_i)
 *
 * between each position's distribution p and that background f, summed
 * over the core letters and averaged over the positions.
 *
 * This value increases with increasing complexity.
 ***********************************************************************/
double compute_motif_complexity
  (MOTIF_T* a_motif)
{
  const int width = alph_size(a_motif->alph, ALPH_SIZE);
  const int length = a_motif->length;
  ARRAY_T* mean_dist;   // Mean emission distribution.
  double kl_total = 0;
  int pos, letter;

  // Average the positions to get the motif's own background.
  mean_dist = get_matrix_col_sums(a_motif->freqs);
  scalar_mult(1.0 / (double)length, mean_dist);

  // Accumulate the K-L terms position by position.
  for (pos = 0; pos < length; pos++) {
    ARRAY_T* emission = get_matrix_row(pos, a_motif->freqs);
    for (letter = 0; letter < width; letter++) {
      ATYPE p = get_array_item(letter, emission);
      ATYPE f = get_array_item(letter, mean_dist);

      // log(p) - log(f) rather than log(p/f): no divide-by-zero special case.
      kl_total += p * (my_log(p) - my_log(f));
    }
  }

  free_array(mean_dist);
  return kl_total / (double)length;
}
Beispiel #8
0
/***********************************************************************
 * Apply a pseudocount to the motif pspm.
 *
 * Each core-letter probability p at each position is replaced by
 *   (p * sites + pseudocount * background[letter]) / (sites + pseudocount)
 * where sites is motif->num_sites, or DEFAULT_SITE_COUNT when the motif
 * has no positive site count.
 *
 * motif       - motif whose freqs matrix is updated in place.
 * background  - background frequencies, or NULL for a uniform background.
 * pseudocount - must be >= 0; zero leaves the motif unchanged.
 ***********************************************************************/
void apply_pseudocount_to_motif
  (MOTIF_T* motif, ARRAY_T *background, double pseudocount)
{
  int pos, letter, len, asize, sites;
  double prob, count, total;
  ARRAY_T *temp;

  // no point in doing work when it makes no difference
  if (pseudocount == 0) return;
  assert(pseudocount > 0);
  // motif dimensions
  asize = alph_size(motif->alph, ALPH_SIZE);
  len = motif->length;
  // create a uniform background if none is given
  temp = NULL;
  if (background == NULL) {
    // get_uniform_frequencies requires room for the full alphabet
    // (core plus ambiguous letters), so let it allocate the array itself
    // instead of handing it one sized for the core letters only.
    temp = get_uniform_frequencies(motif->alph, NULL);
    background = temp;
  }
  // calculate the counts
  sites = (motif->num_sites > 0 ? motif->num_sites : DEFAULT_SITE_COUNT);
  total = sites + pseudocount;
  for (pos = 0; pos < len; ++pos) {
    for (letter = 0; letter < asize; ++letter) {
      prob = get_matrix_cell(pos, letter, motif->freqs);
      count = (prob * sites) + (pseudocount * get_array_item(letter, background));
      prob = count / total;
      set_matrix_cell(pos, letter, prob, motif->freqs);
    }
  }
  if (temp) free_array(temp);
}
Beispiel #9
0
/*
 * Parser callback for the start of a motif element.
 *
 * Allocates a zeroed MOTIF_T, stores it in the context and fills in the
 * fields available at this point: the id is taken from seq, the
 * secondary id and url are empty, the strand is '+', and a frequency
 * matrix of length x core-alphabet-size is allocated. The id, p_hits,
 * n_hits, pvalue and uevalue arguments are not used here.
 */
void dxml_start_motif(void *ctx, char *id, char *seq, int length,
                      long num_sites, long p_hits, long n_hits,
                      double pvalue, double evalue, double uevalue) {
    CTX_T *data = (CTX_T*)ctx;
    MOTIF_T *motif = (MOTIF_T*)mm_malloc(sizeof(MOTIF_T));

    memset(motif, 0, sizeof(MOTIF_T));
    data->motif = motif;

    // identification
    set_motif_id(seq, strlen(seq), motif);
    set_motif_id2("", 0, motif);
    set_motif_strand('+', motif);

    // statistics reported with the motif
    motif->length = length;
    motif->num_sites = num_sites;
    motif->evalue = evalue;

    // alphabet comes from the file-level scope of the parser context
    motif->alph = data->fscope.alphabet;
    // DREME does not support the concept of single strand scanning (yet)
    motif->flags = MOTIF_BOTH_STRANDS;

    // matrices: frequencies only, there are no scores in DREME xml
    motif->freqs = allocate_matrix(motif->length, alph_size(motif->alph, ALPH_SIZE));
    motif->scores = NULL;

    // no url in DREME
    motif->url = strdup("");

    // filled in later by postprocessing
    motif->complexity = -1;
    motif->trim_left = 0;
    motif->trim_right = 0;
}
Beispiel #10
0
/*************************************************************************
 * Convert an integer representing a column in a PSSM into the
 * corresponding alignment column string. 
 * If the alphabet has m characters, and the alignment columns have n entries,
 * the array of all alignment columns is conveniently numbered by the set of
 * consecutive n-digit base m numerals: 
 *   AAAA = 0000, AAAC = 0001, ..., TTTG = 3332, TTTT = 3333.
 *
 * alph                 - alphabet supplying the digits (core letters only).
 * alignment_col_index  - column number, 0 <= index < m^n.
 * alignment_col        - OUT buffer of at least alignment_col_size + 1 chars.
 * alignment_col_size   - n, the number of sequences in the alignment.
 *
 * The caller must allocate the memory for the alignment column string. 
 * The memory required is the number of sequences in the alignment, plus one
 * for the terminating null.
 *************************************************************************/
void unhash_alignment_col(
  ALPH_T alph,
  int alignment_col_index, 
  char *alignment_col, 
  int alignment_col_size
) {
  int asize = alph_size(alph, ALPH_SIZE);
  int i, j;

  assert(alignment_col != NULL);
  assert(alignment_col_size >= 1);
  assert(alignment_col_index >= 0);
  // The index must be representable with alignment_col_size base-asize
  // digits, so the exponent is the column size, not the index.
  assert(
    alignment_col_index < pow(
      (double) asize, 
      (double) alignment_col_size
    )
  );
  
  alignment_col[alignment_col_size] = '\0';
  // Peel off base-asize digits, least significant (rightmost) first.
  for (i = alignment_col_size - 1; i >= 0; i--) {
    j = alignment_col_index % asize;
    alignment_col[i] = alph_char(alph, j);
    alignment_col_index /= asize;
  }
} // unhash_alignment_col
Beispiel #11
0
/*
 * Post-selection step, run once the parser format is known.
 *
 * Asks the first format entry for the alphabet and the background.
 * When a background is supplied, its core part is normalized, the array
 * is resized to the full alphabet and the ambiguous entries are
 * calculated; otherwise uniform frequencies are loaded. The pseudocount
 * background is then set up.
 */
static void parser_selected(MREAD_T *mread) {
  MFORMAT_T *fmt = mread->formats;
  ALPH_T alphabet = fmt->get_alphabet(fmt->data);

  if (!fmt->get_bg(fmt->data, &(mread->motif_bg))) {
    // no background in the file: fall back to uniform
    mread->motif_bg = get_uniform_frequencies(alphabet, mread->motif_bg);
  } else {
    // background from the file: normalize, then extend with ambigs
    normalize_subarray(0, alph_size(alphabet, ALPH_SIZE), 0.0, mread->motif_bg);
    resize_array(mread->motif_bg, alph_size(alphabet, ALL_SIZE));
    calc_ambigs(alphabet, FALSE, mread->motif_bg);
  }

  set_pseudo_bg(mread);
}
Beispiel #12
0
/**************************************************************************
*
*	reverse_complement_pssm
*
*	Turn a pssm matrix into its own reverse complement, in place.
*	Rows are paired from the two ends towards the middle; each row is
*	replaced by the complement of its partner.
*
 *************************************************************************/
static void reverse_complement_pssm (
  ALPH_T alph,
  MATRIX_T* pssm_matrix
)
{
  const int n_rows = get_num_rows(pssm_matrix);
  const int half = (int)((n_rows+1) / 2);
  // Scratch row that keeps the near-end scores while they are overwritten.
  ARRAY_T* scratch = allocate_array(alph_size(alph, ALL_SIZE));
  int pos;

  for (pos = 0; pos < half; pos++) {
    ARRAY_T* near_row = get_matrix_row(pos, pssm_matrix);
    ARRAY_T* far_row = get_matrix_row(n_rows - (pos + 1), pssm_matrix);

    copy_array(near_row, scratch);
    complement_dna_freqs(far_row, near_row);
    complement_dna_freqs(scratch, far_row);
  }

  free_array(scratch);
} // reverse_complement_pssm
Beispiel #13
0
/*************************************************************************
 * Calculate the log odds score for a single motif-sized window.
 *
 * seq must provide at least pssm->w characters. Returns FALSE, leaving
 * *score untouched, as soon as a character is not a core letter of the
 * alphabet (unknown character or ambiguity code). Otherwise stores the
 * unscaled score in *score and returns TRUE. A scaled total that lies
 * beyond the end of the pssm's p-value table is clamped to the last
 * table index before being converted back to a raw score.
 *************************************************************************/
static inline BOOLEAN_T score_motif_site(
  ALPH_T alph,
  char *seq,
  PSSM_T *pssm,
  double *score // OUT
) {
  const int core = alph_size(alph, ALPH_SIZE);
  MATRIX_T* scores = pssm->matrix;
  double scaled_total = 0.0;
  int pos;

  for (pos = 0; pos < pssm->w; pos++) {
    int letter = alph_index(alph, seq[pos]);
    // Only core letters can be scored.
    if (!(letter != -1 && letter < core)) {
      return FALSE;
    }
    scaled_total += get_matrix_cell(pos, letter, scores);
  }

  *score = get_unscaled_pssm_score(scaled_total, pssm);

  // Clamp totals that run past the p-value table.
  if ((int) scaled_total >= get_array_length(pssm->pv)) {
    scaled_total = (float)(get_array_length(pssm->pv) - 1);
    *score = scaled_to_raw(scaled_total, pssm->w, pssm->scale, pssm->offset);
  }

  return TRUE;
}
Beispiel #14
0
/**************************************************************************
*
*	hash_sequence
*
*	Hash a sequence, compressing hash_n letters into 1.
*
*	Position i of the result is the base-(ALL_SIZE + 1) number whose
*	digits are int_sequence[i .. i + hash_n - 1]. Digits that would
*	lie past the end of the sequence are taken to be 0.
*
*	Return the newly allocated sequence (seq_length entries).
*
 *************************************************************************/
static int* hash_sequence(
  ALPH_T alph,
  int *int_sequence,				// Sequence in integer format.
  int seq_length,				// Length of sequence.
  int hash_n					// Number of letters to compress to 1.
)
{
  int i, j;
  int base = alph_size(alph, ALL_SIZE) + 1;	// Base to hash to.
  int* hashed_sequence = NULL;

  // Allocate the hashed sequence.
  mm_resize(hashed_sequence, seq_length, int);

  for(i=0; i<seq_length; i++) {
    int c = int_sequence[i];			// Character in hashed alphabet.
    if ((seq_length - i - hash_n) >= 0) {	// Hash window is within sequence.
      for(j=1; j<hash_n; j++) {
        c = (base * c) + int_sequence[i+j];
      }
    } else {					// Hash window runs off sequence end.
      for(j=1; j<hash_n; j++) {
        c = (base * c);
        // Never read past the last element; missing letters contribute 0.
        if (i+j < seq_length) c += int_sequence[i+j];
      }
    }
    hashed_sequence[i] = c; 			// Record the hashed character.
  }

  return(hashed_sequence);
} // hash_sequence
/*
 *  Tests the letter against the alphabet. If the alphabet is unknown
 *  it attempts to work it out and set it from the letter.
 *  For simplicity this assumes you will pass indexes in ascending order.
 *
 *  While the alphabet is unknown the first letters must be A, C and then
 *  either D (protein) or G; a G must be followed by T or U, both of which
 *  currently select the DNA alphabet. Once the alphabet is known the
 *  letter must equal the alphabet's character at that index.
 *
 *  The comparison is case-insensitive.
 *  Returns false if the letter is unacceptable.
 */
BOOLEAN_T alph_test(ALPH_T *alpha, int index, char letter) {
  char uc_letter;
  // toupper requires a value representable as unsigned char (or EOF);
  // a plain char may be negative, so convert first.
  uc_letter = (char)toupper((unsigned char)letter);
  if (*alpha == INVALID_ALPH) {
    switch (index) {
      case 0:
        return (uc_letter == 'A');
      case 1:
        return (uc_letter == 'C');
      case 2:
        if (uc_letter == 'D') {
          *alpha = PROTEIN_ALPH;
          return TRUE;
        }
        return (uc_letter == 'G'); // DNA or RNA
      case 3:
        if (uc_letter == 'T') {
          *alpha = DNA_ALPH;
        } else if (uc_letter == 'U') {
          *alpha = DNA_ALPH; //FIXME need RNA but substitute DNA for now
        } else {
          return FALSE;
        }
        return TRUE;
      default:// Bad state!
        die("Should not still be attempting to guess by the 5th letter "
            "(index = %d).", index);
        return FALSE;
    }
  } else {
    if (index >= alph_size(*alpha, ALPH_SIZE)) return FALSE; // index too big
    return (uc_letter == alph_char(*alpha, index));
  }
}
Beispiel #16
0
/*
 * Generate a PAM matrix for the DNA or protein alphabet.
 *
 * A PAM 1 conditional matrix is built from the transition table for the
 * alphabet and the standard letter frequencies, and is then multiplied
 * by itself until the requested distance is reached. The result is
 * converted either to a rounded, scaled log-odds matrix or to a joint
 * (target frequency) matrix. Returns a newly allocated matrix.
 */
extern MATRIX_T* gen_pam_matrix(
  ALPH_T alph,                  /* alphabet */
  int dist,                     /* PAM distance */
  BOOLEAN_T logodds             /* true: generate log-odds matrix
                                   false: generate target frequency matrix */
)
{
  assert(alph == DNA_ALPH || alph == PROTEIN_ALPH);
  int row, col, step;
  MATRIX_T *pam, *pam1;
  BOOLEAN_T is_dna = (alph == DNA_ALPH);
  double *std_freq = is_dna ? pam_dna_freq : pam_prot_freq; /* standard frequencies */
  int n = alph_size(alph, ALPH_SIZE);                       /* core alphabet length */
  double scale = dist < 170 ? 2/log(2) : 3/log(2);          /* same as in "pam" Version 1.0.6 */

  pam = allocate_matrix(n, n);
  pam1 = allocate_matrix(n, n);

  /* PAM 1: the two estimates of the joint frequency of a letter pair are
     averaged to cancel roundoff, then turned into conditionals. Only the
     lower triangle is visited; the mirrored cell is written alongside. */
  for (row = 0; row < n; row++) {
    for (col = 0; col <= row; col++) {
      double fwd = is_dna ? trans[row][col] : dayhoff[row][col];
      double rev = is_dna ? trans[col][row] : dayhoff[col][row];
      double joint = ((fwd * std_freq[col]) + (rev * std_freq[row]))/20000;
      set_matrix_cell(row, col, joint/std_freq[col], pam);
      if (row != col) {
        set_matrix_cell(col, row, joint/std_freq[row], pam);
      }
    }
  }

  /* Raise PAM 1 to the requested power, one multiplication per step. */
  copy_matrix(pam, pam1);
  for (step = dist; step > 1; step--) {
    MATRIX_T *next = matrix_multiply(pam, pam1);
    free_matrix(pam);
    pam = next;
  }
  free_matrix(pam1);

  /* Final conversion:
     target:  J_ij = Pr(i,j) = Mij pr(j)
     logodds: L_ij = log (Pr(i,j)/(Pr(i)Pr(j)) = log(Mij/pr(i)) */
  for (row = 0; row < n; row++) {
    for (col = 0; col < n; col++) {
      double cell = get_matrix_cell(row, col, pam);
      cell = logodds ? nint(scale * log((cell+EPSILON)/std_freq[row])) : cell * std_freq[col];
      set_matrix_cell(row, col, cell, pam);
    }
  }

  return pam;
} /* gen_pam_matrix */
Beispiel #17
0
/***********************************************************************
 * Normalize the motif's pspm.
 *
 * The core-letter part of every row of the frequency matrix is passed
 * to normalize_subarray() with the given tolerance.
 ***********************************************************************/
void normalize_motif
  (MOTIF_T *motif, double tolerance)
{
  const int core = alph_size(motif->alph, ALPH_SIZE);
  int pos = 0;

  while (pos < motif->length) {
    ARRAY_T *row = get_matrix_row(pos, motif->freqs);
    normalize_subarray(0, core, tolerance, row);
    ++pos;
  }
}
Beispiel #18
0
/***********************************************************************
 * Calculate the ambiguous letters from the concrete ones.
 *
 * The frequency matrix is widened to the full alphabet, the motif is
 * flagged as having ambiguous columns, and calc_ambigs() is applied to
 * every row.
 ***********************************************************************/
void calc_motif_ambigs
  (MOTIF_T *motif)
{
  const int full_width = alph_size(motif->alph, ALL_SIZE);
  int pos;

  resize_matrix(motif->length, full_width, 0, motif->freqs);
  motif->flags |= MOTIF_HAS_AMBIGS;

  pos = 0;
  while (pos < motif->length) {
    ARRAY_T *row = get_matrix_row(pos, motif->freqs);
    calc_ambigs(motif->alph, FALSE, row);
    ++pos;
  }
}
Beispiel #19
0
/***********************************************************************
 * Copy a motif from one place to another.
 *
 * dest is zeroed before anything is copied, so whatever it held before
 * the call (matrices, url) is overwritten without being released; pass
 * an uninitialized or already-freed destination.
 *
 * The ids, scalar fields and url are copied, and the freqs and scores
 * matrices are duplicated when the source has them (NULL otherwise).
 ***********************************************************************/
void copy_motif
  (MOTIF_T* source,
   MOTIF_T* dest)
{
  ALPH_SIZE_T size;
  memset(dest, 0, sizeof(MOTIF_T));
  strcpy(dest->id, source->id);
  strcpy(dest->id2, source->id2);
  dest->length = source->length;
  dest->alph = source->alph;
  dest->flags = source->flags;
  dest->evalue = source->evalue;
  dest->num_sites = source->num_sites;
  dest->complexity = source->complexity;
  dest->trim_left = source->trim_left;
  dest->trim_right = source->trim_right;
  if (source->freqs) {
    // The frequency matrix is wider when it carries ambiguous columns.
    size = (dest->flags & MOTIF_HAS_AMBIGS ? ALL_SIZE : ALPH_SIZE);
    // Allocate memory for the matrix.
    dest->freqs = allocate_matrix(dest->length, alph_size(dest->alph, size));
    // Copy the matrix.
    copy_matrix(source->freqs, dest->freqs);
  } else {
    dest->freqs = NULL;
  }
  if (source->scores) {
    // Allocate memory for the matrix. Note that scores don't contain ambigs.
    dest->scores = allocate_matrix(dest->length, alph_size(dest->alph, ALPH_SIZE));
    // Copy the matrix.
    copy_matrix(source->scores, dest->scores);
  } else {
    dest->scores = NULL;
  }
  // dest was just cleared, so there is no previous url to free; start
  // from an explicit NULL and let copy_string fill it in.
  dest->url = NULL;
  copy_string(&(dest->url), source->url);
}
Beispiel #20
0
/*************************************************************************
 * Converts the motif frequency matrix into an odds matrix, in place:
 * every core-letter frequency is divided by that letter's background
 * frequency. (Taken from old ama-scan.c.)
 *************************************************************************/
void convert_to_odds_matrix(MOTIF_T* motif, ARRAY_T* bg_freqs){
  const int core = alph_size(get_motif_alph(motif), ALPH_SIZE);
  MATRIX_T *matrix = get_motif_freqs(motif);
  const int n_pos = get_num_rows(matrix);
  int letter, pos;

  letter = 0;
  while (letter < core) {
    const double bg = get_array_item(letter, bg_freqs);
    pos = 0;
    while (pos < n_pos) {
      ARRAY_T *row = matrix->rows[pos];
      row->items[letter] /= bg;
      ++pos;
    }
    ++letter;
  }
}
Beispiel #21
0
/*************************************************************************
 * Copies the motif frequency matrix and converts the copy into an odds
 * matrix: every core-letter frequency is divided by that letter's
 * background frequency. The motif itself is left unchanged; the new
 * matrix is returned.
 *************************************************************************/
MATRIX_T* create_odds_matrix(MOTIF_T *motif, ARRAY_T* bg_freqs){
  const int core = alph_size(get_motif_alph(motif), ALPH_SIZE);
  MATRIX_T *result = duplicate_matrix(get_motif_freqs(motif));
  const int n_pos = get_num_rows(result);
  int letter, row_i;

  letter = 0;
  while (letter < core) {
    const double bg = get_array_item(letter, bg_freqs);
    row_i = 0;
    while (row_i < n_pos) {
      ARRAY_T *row = result->rows[row_i];
      row->items[letter] /= bg;
      ++row_i;
    }
    ++letter;
  }

  return result;
}
Beispiel #22
0
/***********************************************************************
 * Return one column of a motif, as a newly allocated array of counts.
 *
 * Each core-letter frequency at the given position is multiplied by
 * motif->num_sites, so this assumes num_sites is a reasonable value and
 * not zero.
 ***********************************************************************/
ARRAY_T* get_motif_counts
  (int      position,
   MOTIF_T* motif)
{
  const int core = alph_size(motif->alph, ALPH_SIZE);
  ARRAY_T* counts = allocate_array(core);
  int letter = 0;

  while (letter < core) {
    set_array_item(
      letter,
      motif->num_sites * get_matrix_cell(position, letter, motif->freqs),
      counts
    );
    letter++;
  }

  return counts;
}
Beispiel #23
0
/***********************************************************************
 * Calculates the information content of a position of the motif:
 * log2(core alphabet size) minus the entropy of the position's
 * core-letter frequencies.
 ***********************************************************************/
static inline double position_information_content(
  MOTIF_T *a_motif,
  int position
) {
  const int core = alph_size(a_motif->alph, ALPH_SIZE);
  ARRAY_T *column = get_matrix_row(position, a_motif->freqs);
  double entropy = 0;
  int letter = 0;

  while (letter < core) {
    double p = get_array_item(letter, column);
    entropy -= p*my_log2(p);
    ++letter;
  }

  return my_log2(core) - entropy;
}
Beispiel #24
0
/*************************************************************************
 * Output JSON data for a motif.
 *
 * Writes one object holding the motif's identification, its frequency
 * matrix ("pwm", one array of core-letter values per position), the
 * statistics from stats, and the "sites" array, which takes every
 * second entry of counts->sites between index len-1 and
 * counts->allocated-(len-1).
 *************************************************************************/
static void output_motif_json(JSONWR_T* json, MOTIF_STATS_T* stats, 
    SITE_COUNTS_T* counts) {
  MOTIF_T *motif = stats->motif;
  MATRIX_T *pwm = get_motif_freqs(motif);
  int core = alph_size(get_motif_alph(motif), ALPH_SIZE);
  int len, pos, letter, site, stop;

  jsonwr_start_object_value(json);

  // identification
  jsonwr_lng_prop(json, "db", stats->db->id);
  jsonwr_str_prop(json, "id", get_motif_id(motif));
  if (*(get_motif_id2(motif))) {
    jsonwr_str_prop(json, "alt", get_motif_id2(motif));
  }
  len = get_motif_length(motif);
  jsonwr_lng_prop(json, "len", len);
  jsonwr_dbl_prop(json, "motif_evalue", get_motif_evalue(motif));
  jsonwr_dbl_prop(json, "motif_nsites", get_motif_nsites(motif));
  if (get_motif_url(motif) && *get_motif_url(motif)) {
    jsonwr_str_prop(json, "url", get_motif_url(motif));
  }

  // frequency matrix, one inner array per motif position
  jsonwr_property(json, "pwm");
  jsonwr_start_array_value(json);
  for (pos = 0; pos < len; pos++) {
    jsonwr_start_array_value(json);
    for (letter = 0; letter < core; letter++) {
      jsonwr_dbl_value(json, get_matrix_cell(pos, letter, pwm));
    }
    jsonwr_end_array_value(json);
  }
  jsonwr_end_array_value(json);

  // statistics
  jsonwr_lng_prop(json, "bin_width", stats->central_window+1);
  jsonwr_dbl_prop(json, "bin_sites", stats->central_sites);
  jsonwr_lng_prop(json, "total_sites", counts->total_sites);
  jsonwr_dbl_prop(json, "log_pvalue", stats->log_adj_pvalue);
  jsonwr_dbl_prop(json, "max_prob", stats->max_prob);

  // site counts, every second entry
  jsonwr_property(json, "sites");
  jsonwr_start_array_value(json);
  stop = counts->allocated - (len - 1);
  for (site = (len - 1); site < stop; site += 2) {
    jsonwr_dbl_value(json, counts->sites[site]);
  }
  jsonwr_end_array_value(json);

  jsonwr_end_object_value(json);
}
Beispiel #25
0
/*
 * Build the target frequency matrix for a substitution score matrix.
 *
 * The score matrix is obtained from get_score_matrix() (file name,
 * alphabet and PAM distance are passed through) and converted with
 * convert_score_to_target() using the background frequencies. When
 * SUBST_MATRIX_DEBUG is set the score matrix and the sum of its entries
 * are printed. The score matrix is freed; the target matrix is returned.
 */
MATRIX_T *get_subst_target_matrix(
  char *score_filename,		/* name of score file */
  ALPH_T alph,                  /* alphabet */
  int dist,			/* PAM distance (ignored if score_filename != NULL) */
  ARRAY_T *back			/* background frequencies of standard alphabet */
)
{
  MATRIX_T *score_mtx = get_score_matrix(score_filename, alph, dist);
  MATRIX_T *target_mtx = convert_score_to_target(score_mtx, back);

  if (SUBST_MATRIX_DEBUG) {
    int n = alph_size(alph, ALPH_SIZE);
    double total = 0;
    int r, c;

    /* where the scores came from */
    if (score_filename) {
      printf("From file %s\n", score_filename);
    } else {
      printf("Generated PAM %d\n", dist);
    }

    /* header row of letters */
    printf("%6c ", ' ');
    for (c = 0; c < n; c++) {
      printf("%6c ", alph_char(alph, c));
    }
    printf("\n");

    /* one labelled row per letter */
    for (r = 0; r < n; r++) {
      printf("%6c ", alph_char(alph, r));
      for (c = 0; c < n; c++) {
        double cell = get_matrix_cell(r, c, score_mtx);
        total += cell;
        printf("%6.4f ", cell);
      }
      printf("\n");
    }
    printf("sum of entries = %f\n", total);
  }

  free_matrix(score_mtx);

  return target_mtx;
} /* get_subst_target_matrix */
Beispiel #26
0
/*
 * Print the background frequency of every core letter of the alphabet
 * in options to output as "X: 0.123 " items, starting a new line before
 * the first item and before every ninth item after it.
 */
void mcast_print_bg_freqs(
  FILE *output,
  ARRAY_T *bgfreqs,
  MHMMSCAN_OPTIONS_T *options
) {
  const int core = alph_size(options->alphabet, ALPH_SIZE);
  int letter = 0;

  while (letter < core) {
    if (letter % 9 == 0) {
      fputc('\n', output);
    }
    fprintf(
      output,
      "%c: %1.3f ",
      alph_char(options->alphabet, letter),
      get_array_item(letter, bgfreqs)
    );
    letter++;
  }
}
Beispiel #27
0
/***********************************************************************
 * Takes a matrix of meme scores and converts them into letter 
 * probabilities.
 *
 * The probability is recovered from a score s with
 *   p = (2 ^ (s / 100)) * bg
 * after which the pseudocount contribution is removed again:
 *   p = (p * (site_count + pseudo_count) - bg * pseudo_count) / site_count
 * Values are clamped to [0, 1] and each row is normalized.
 *
 * Returns a newly allocated matrix with one column per core letter.
 ***********************************************************************/
MATRIX_T* convert_scores_into_freqs
  (ALPH_T alph,
   MATRIX_T *scores,
   ARRAY_T *bg,
   int site_count,
   double pseudo_count)
{
  int n_rows, n_cols;
  int r, c;
  double total;
  MATRIX_T *result;

  assert(alph != INVALID_ALPH);
  assert(scores != NULL);
  assert(bg != NULL);

  n_rows = get_num_rows(scores);
  n_cols = alph_size(alph, ALPH_SIZE);

  result = allocate_matrix(n_rows, n_cols);
  total = site_count + pseudo_count;

  for (c = 0; c < n_cols; ++c) {
    double bg_p = get_array_item(c, bg);
    for (r = 0; r < n_rows; ++r) {
      double s = get_matrix_cell(r, c, scores);
      // score -> probability
      double p = pow(2.0, s / 100.0) * bg_p;
      // take the pseudocount back out
      p = ((p * total) - (bg_p * pseudo_count)) / site_count;
      // keep the value a valid probability
      if (p > 1) {
        p = 1;
      } else if (p < 0) {
        p = 0;
      }
      set_matrix_cell(r, c, p, result);
    }
  }

  for (r = 0; r < n_rows; ++r) {
    normalize_subarray(0, n_cols, 0.0, get_matrix_row(r, result));
  }

  return result;
}
Beispiel #28
0
/*******************************************************************
  Print the column frequency distribution.

  For every entry of alignment_column_freqs one line is printed with
  the alignment column string for that index, the 1-based index and
  the frequency. The number of letters per column is derived from the
  array length as log(length) / log(core alphabet size).
 ********************************************************************/
static void print_col_frequencies(
  ALPH_T alph,
  ARRAY_T* alignment_column_freqs
)
{
  int i;
  int num_freqs = get_array_length(alignment_column_freqs);
  int asize = alph_size(alph, ALPH_SIZE);
  int num_leaves = NINT(log(num_freqs)/log(asize));
  char* alignment_col = mm_malloc((num_leaves + 1) * sizeof(char));
  for (i=0; i<num_freqs; i++) {
    unhash_alignment_col(
      alph,
      i,                              //col_index
      alignment_col,
      num_leaves
    );
    printf("%s %d %g\n", alignment_col, i+1, 
      get_array_item(i, alignment_column_freqs));
  }
  // Release the scratch buffer for the column string.
  myfree(alignment_col);
} // print_col_freqs
Beispiel #29
0
/*************************************************************************
 *  Build array containing the counts of columns in the alignment
 *  Caller is responsible for freeing the returned array.
 *  If input parameter "counts" is NULL, allocates the array
 *  (one entry per possible column: asize ^ number of sequences).
 *  Otherwise, the counts are added to the existing counts in the counts
 *  array.  Ignores all columns containing gaps or ambiguity characters:
 *    [.-nNxX]
 *************************************************************************/
static ARRAY_T* build_alignment_column_counts(
  ALPH_T alph,
  ALIGNMENT_T* alignment,
  ARRAY_T* counts 
) 
{

  assert(alignment != NULL);

  int asize = alph_size(alph, ALPH_SIZE);

  // Calculate number of possible alignment columns
  // and create storage for counting occurences.
  int num_seqs = get_num_aligned_sequences(alignment);
  int num_alignment_cols = (int) pow((double) asize, (double) num_seqs);
  if (counts == NULL) {
    counts = allocate_array(num_alignment_cols);
  }

  // Count how many examples of each column occur in the alignment.
  // Skip columns that contain gaps or ambiguity characters.
  int alignment_length = get_alignment_length(alignment);
  char* alignment_col = mm_malloc(sizeof(char) * (num_seqs + 1));
  alignment_col[num_seqs] = 0;
  int i, h;
  for(i = 0; i < alignment_length; i++) {
    get_alignment_col(i, alignment_col, alignment);
    // One scan for any of the excluded characters.
    if (strpbrk(alignment_col, "-.NnXx") != NULL) { continue; }
    h = hash_alignment_col(alph, alignment_col, num_seqs);
    incr_array_item(h, 1, counts);
  }

  // Release the scratch buffer for the column string.
  myfree(alignment_col);

  return counts;
} // build_alignment_column_counts
Beispiel #30
0
/****************************************************************************
*  Return a newly allocated array containing the frequency in the alignment
*  of each core character of the alphabet. Characters that are not core
*  letters (gaps, ambiguity codes, anything unknown to the alphabet) are
*  not counted. The array has one entry per core letter.
*
*  NOTE(review): when the alignment holds no countable characters the
*  frequencies are computed as 0/0 - confirm callers never hit this.
****************************************************************************/
ARRAY_T* get_alignment_freqs(ALPH_T alph, ALIGNMENT_T* alignment) {
  int core = alph_size(alph, ALPH_SIZE);
  int* tally = mm_malloc(core * sizeof(int));
  int counted = 0;
  int seq_i, pos, k;
  ARRAY_T* result;

  // Start every letter's tally at zero.
  for (k = 0; k < core; k++) {
    tally[k] = 0;
  }

  // Tally the core letters over all sequences and positions.
  for (seq_i = 0; seq_i < alignment->num_sequences; seq_i++) {
    for (pos = 0; pos < alignment->length; pos++) {
      char letter = get_seq_char(pos, alignment->sequences[seq_i]);
      int idx = alph_index(alph, letter);
      // Skip anything that is not a core letter.
      if (idx == -1 || idx >= core) {
        continue;
      }
      tally[idx]++;
      counted++;
    }
  }

  // Turn the tallies into frequencies.
  result = allocate_array(core);
  for (k = 0; k < core; k++) {
    set_array_item(k, (double) tally[k] / (double) counted, result);
  }

  myfree(tally);

  return result;
}