Ejemplo n.º 1
0
/*
 *  Tests a letter against the alphabet. If the alphabet is unknown
 *  (*alpha == INVALID_ALPH) it attempts to work it out from the letters
 *  seen so far and stores the result in *alpha once it is determined.
 *  For simplicity this assumes you will pass indexes in ascending order.
 *
 *  alpha   in/out: the alphabet to test against, or INVALID_ALPH to guess.
 *  index   zero-based position of the letter within the alphabet.
 *  letter  the letter to test; the comparison is case-insensitive.
 *
 *  Returns FALSE if the letter is unacceptable, TRUE otherwise.
 *  Calls die() if the alphabet is still unknown for index >= 4 (or a
 *  negative index).
 */
BOOLEAN_T alph_test(ALPH_T *alpha, int index, char letter) {
  // <ctype.h> functions are only defined for values representable as
  // unsigned char (or EOF), so a possibly negative char must be cast first.
  const char uc_letter = (char)toupper((unsigned char)letter);

  if (*alpha == INVALID_ALPH) {
    switch (index) {
      case 0:
        return (uc_letter == 'A');
      case 1:
        return (uc_letter == 'C');
      case 2:
        // A 'D' in third place identifies the protein alphabet.
        if (uc_letter == 'D') {
          *alpha = PROTEIN_ALPH;
          return TRUE;
        }
        return (uc_letter == 'G'); // DNA or RNA
      case 3:
        if (uc_letter == 'T') {
          *alpha = DNA_ALPH;
        } else if (uc_letter == 'U') {
          *alpha = DNA_ALPH; //FIXME need RNA but substitute DNA for now
        } else {
          return FALSE;
        }
        return TRUE;
      default:// Bad state!
        die("Should not still be attempting to guess by the 5th letter "
            "(index = %d).", index);
        return FALSE;
    }
  } else {
    // Reject indexes outside the core alphabet (negative or too big)
    // before using them to look up the expected letter.
    if (index < 0 || index >= alph_size(*alpha, ALPH_SIZE)) return FALSE;
    return (uc_letter == alph_char(*alpha, index));
  }
}
/*****************************************************************************
 * MEME > model > /background_frequencies
 *
 * Validates the background frequencies collected in data->nums:
 *  - every entry must have been provided (an entry of -1 is reported as
 *    missing), and
 *  - the provided entries must sum to 1.0 within a tolerance of 0.01.
 * On success ownership of the array moves to data->fscope.background;
 * on failure the array is freed. In both cases data->nums is reset to NULL
 * so that no dangling pointer to a freed or handed-over array remains.
 *
 * ctx  the parser context; cast to CTX_T*.
 ****************************************************************************/
void mxml_end_background(void *ctx) {
  CTX_T *data;
  int i;
  bool error;
  double sum, delta;
  data = (CTX_T*)ctx;
  sum = 0;
  error = false;
  for (i = 0; i < get_array_length(data->nums); i++) {
    if (get_array_item(i, data->nums) == -1) {
      local_error(data, "Background frequency was not provided for letter %c.\n", alph_char(data->alph, i));
      error = true;
    } else {
      sum += get_array_item(i, data->nums);
    }
  }
  // absolute deviation of the sum from 1.0
  delta = sum - 1.0;
  if (delta < 0) delta = -delta;
  if (delta > 0.01) {
    local_error(data, "The background frequencies summed to %f but they should sum to 1.0.\n", sum);
    error = true;
  }
  if (error) {
    free_array(data->nums);
  } else {
    data->fscope.background = data->nums;
  }
  // the array has either been freed or handed over; never keep the pointer
  data->nums = NULL;
}
Ejemplo n.º 3
0
/*************************************************************************
 * Convert an integer representing a column in a PSSM into the
 * corresponding alignment column string. 
 * If the alphabet has m characters, and the alignment columns have n entries,
 * the array of all alignment columns is conveniently numbered by the set of
 * consecutive n-digit base m numerals: 
 *   AAAA = 0000, AAAC = 0001, ..., TTTG = 3332, TTTT = 3333.
 * The caller must allocate the memory for the alignment column string. 
 * The memory required is the number of sequences in the alignment, plus one
 * for the terminating null.
 *
 * alph                 the alphabet supplying the m characters.
 * alignment_col_index  the column number; must be in [0, m^n).
 * alignment_col        output buffer of at least alignment_col_size + 1 chars.
 * alignment_col_size   n, the number of entries in the column (>= 1).
 *************************************************************************/
void unhash_alignment_col(
  ALPH_T alph,
  int alignment_col_index, 
  char *alignment_col, 
  int alignment_col_size
) {
  int asize = alph_size(alph, ALPH_SIZE);
  int i;

  assert(alignment_col != NULL);
  assert(alignment_col_size >= 1);
  assert(alignment_col_index >= 0);
  // The index must be a valid n-digit base-m numeral, i.e. less than m^n
  // where n is the column size (not the index itself).
  assert(
    alignment_col_index < pow(
      (double) asize, 
      (double) alignment_col_size
    )
  );
  
  alignment_col[alignment_col_size] = '\0';
  // Peel off base-m digits, least significant first, filling the string
  // from right to left. Integer division discards the digit just used.
  for (i = alignment_col_size - 1; i >= 0; i--) {
    alignment_col[i] = alph_char(alph, alignment_col_index % asize);
    alignment_col_index /= asize;
  }
} // unhash_alignment_col
Ejemplo n.º 4
0
Archivo: motif.c Proyecto: CPFL/gmeme
/***********************************************************************
 * Returns the string that is the best possible match to the given motif.
 *
 * For each motif position the core alphabet letter with the highest
 * frequency is chosen; when several letters tie, the one with the highest
 * alphabet index wins.
 *
 * motif  the motif to summarise; must not be NULL and must have a
 *        frequency matrix with one row per motif position.
 *
 * Returns a newly allocated, null-terminated string of motif->length
 * letters. Caller is responsible for freeing string.
 ***********************************************************************/
char *get_best_possible_match(MOTIF_T *motif) {
  int mpos, apos, asize; 
  char *match_string;
  ALPH_SIZE_T size;

  // Validate the motif before anything dereferences it.
  assert(motif != NULL);
  assert(motif->freqs != NULL);
  assert(motif->length == motif->freqs->num_rows);
  size = (motif->flags & MOTIF_HAS_AMBIGS ? ALL_SIZE : ALPH_SIZE);
  assert(alph_size(motif->alph, size) == motif->freqs->num_cols); 

  // Only the core letters are candidates, even if the matrix also has
  // columns for ambiguous letters.
  asize = alph_size(motif->alph, ALPH_SIZE);

  match_string = mm_malloc(sizeof(char) * (motif->length + 1));

  // Find the highest scoring character at each position in the motif.
  for(mpos = 0; mpos < motif->length; ++mpos) {
    ARRAY_T *row = motif->freqs->rows[mpos];
    double max_v = row->items[0];
    int max_i = 0;
    for(apos = 1; apos < asize; ++apos) {
      if (row->items[apos] >= max_v) {
        max_i = apos;
        max_v = row->items[apos];
      }
    }
    match_string[mpos] = alph_char(motif->alph, max_i);
  }

  //  Add null termination
  match_string[motif->length] = '\0';

  return match_string;
}
Ejemplo n.º 5
0
/*
 * Build a target frequency matrix from a substitution score matrix.
 *
 * The score matrix is obtained from get_score_matrix() and converted with
 * convert_score_to_target() using the given background. When
 * SUBST_MATRIX_DEBUG is set the score matrix is also dumped to stdout
 * together with the sum of its entries. The intermediate score matrix is
 * freed before returning.
 *
 * Returns the target frequency matrix.
 */
MATRIX_T *get_subst_target_matrix(
  char *score_filename,   /* name of score file */
  ALPH_T alph,            /* alphabet */
  int dist,               /* PAM distance (ignored if score_filename != NULL) */
  ARRAY_T *back           /* background frequencies of standard alphabet */
)
{
  MATRIX_T *score = get_score_matrix(score_filename, alph, dist);
  MATRIX_T *target = convert_score_to_target(score, back);

  if (SUBST_MATRIX_DEBUG)
  {
    const int n_letters = alph_size(alph, ALPH_SIZE);
    double total = 0;
    int row, col;

    /* where the scores came from */
    if (score_filename == NULL) {
      printf("Generated PAM %d\n", dist);
    } else {
      printf("From file %s\n", score_filename);
    }

    /* header line: a blank corner cell followed by the column letters */
    printf("%6c ", ' ');
    for (col = 0; col < n_letters; col++) {
      printf("%6c ", alph_char(alph, col));
    }
    printf("\n");

    /* one line per row: the row letter followed by its scores */
    for (row = 0; row < n_letters; row++) {
      printf("%6c ", alph_char(alph, row));
      for (col = 0; col < n_letters; col++) {
        double cell = get_matrix_cell(row, col, score);
        total += cell;
        printf("%6.4f ", cell);
      }
      printf("\n");
    }
    printf("sum of entries = %f\n", total);
  }

  free_matrix(score);

  return(target);
} /* get_subst_target_matrix */
Ejemplo n.º 6
0
/***********************************************************************
 * Read the background letter frequencies from XML.
 *
 * The frequencies of the core alphabet letters are read in alphabet order
 * from the "value" elements under background_frequencies/alphabet_array,
 * normalized, and then the entries for ambiguous letters are filled in
 * by calc_ambigs().
 *
 * xpath_ctxt  XPath context for the parsed XML document.
 * alph        the alphabet the frequencies belong to.
 *
 * Returns an array with one entry per letter of the full alphabet
 * (alph_size(alph, ALL_SIZE)).
 * Caller is responsible for freeing the returned array.
 ***********************************************************************/
ARRAY_T* read_bg_freqs_from_xml(xmlXPathContextPtr xpath_ctxt, ALPH_T alph) {

  // A true compile-time constant so the buffer below is not a VLA.
  enum { MAX_XPATH_EXPRESSION = 200 };

  xmlXPathObjectPtr xpathObj = NULL;
  xmlNodePtr currValueNode = NULL;
  char xpath_expression[MAX_XPATH_EXPRESSION];
  ATYPE    value;
  ARRAY_T* bg_freqs;
  int num_values;
  int i_node = 0;

  int a_size = alph_size(alph, ALPH_SIZE);

  // Use XPATH to get the background frequencies from XML
  xpathObj = xpath_query(
    xpath_ctxt, 
    "//*/background_frequencies/alphabet_array/value"
  );
  assert(xpathObj != NULL);
  // An empty result may be reported with a NULL node set.
  num_values = (xpathObj->nodesetval ? xpathObj->nodesetval->nodeNr : 0);
  xmlXPathFreeObject(xpathObj);

  // The number of background frequencies should match the alphabet size.
  assert(num_values == a_size);

  // Allocate the array.
  bg_freqs= allocate_array(alph_size(alph, ALL_SIZE));

  // XML doesn't enforce any order on the emission probability values,
  // so force reading bg frequency values in alphabet order.
  for (i_node = 0; i_node < a_size; i_node++) {
    // Build the XPATH expression to get bg freq for a character.
    snprintf(
      xpath_expression,
      MAX_XPATH_EXPRESSION,
      "//*/background_frequencies/"
      "alphabet_array/value[@letter_id='letter_%c']",
      alph_char(alph, i_node)
    );
    // Read the selected bg frequency.
    xpathObj = xpath_query(xpath_ctxt, xpath_expression);
    assert(xpathObj != NULL);
    // Should only find one node (and the node set may be NULL if none).
    assert(xpathObj->nodesetval != NULL);
    assert(xpathObj->nodesetval->nodeNr == 1);
    // Decode from node set to numeric value for bg freq. The node is
    // converted before the result object that referenced it is released.
    currValueNode = xpathObj->nodesetval->nodeTab[0];
    value = xmlXPathCastNodeToNumber(currValueNode);
    xmlXPathFreeObject(xpathObj);
    set_array_item(i_node, value, bg_freqs);
  }

  // Make sure the frequencies add up to 1.0. 
  normalize_subarray(0, a_size, 0.0, bg_freqs);

  // Fill in ambiguous characters. 
  calc_ambigs(alph, FALSE, bg_freqs);

  return bg_freqs;

}
/*****************************************************************************
 * MEME > motifs > motif > probabilities > alphabet_matrix > /alphabet_array
 * Check that all letters have a probability and update the current matrix row.
 *
 * Every entry of the current row of the motif's frequency matrix that is
 * still -1 is reported as a missing probability, naming the letter and the
 * 1-based motif position. The current position is then advanced by one.
 *
 * ctx  the parser context; cast to CTX_T*.
 ****************************************************************************/
void mxml_end_probability_pos(void *ctx) {
  CTX_T *data;
  ARRAY_T *pos;
  int i;
  data = (CTX_T*)ctx;
  pos = get_matrix_row(data->current_pos, data->mscope.motif->freqs);
  for (i = 0; i < get_array_length(pos); i++) {
    if (get_array_item(i, pos) == -1) {
      // i indexes the letter; the position is the current matrix row.
      local_error(data, "Probability for letter %c in position %d is missing.\n", alph_char(data->alph, i), data->current_pos + 1);
    }
  }
  data->current_pos++;
}
/*****************************************************************************
 * MEME > training_set > /alphabet
 *
 * Finishes processing of the alphabet element.
 *
 * Custom alphabet (data->alph is NULL): the alphabet reader is told that
 * input is done, its queued messages are reported as errors or warnings,
 * the resulting alphabet is stored in data->alph and the reader is destroyed.
 *
 * Legacy alphabet (data->alph already set): checks the letter lookup table
 * so that every core symbol has exactly one identifier; duplicates and
 * missing identifiers are reported as errors.
 *
 * ctx  the parser context; cast to CTX_T*.
 ****************************************************************************/
void mxml_end_alphabet(void *ctx) {
  CTX_T *data = (CTX_T*)ctx;
  RBNODE_T *node;
  bool *exists;
  int i, n_core, sym_index;

  if (data->alph == NULL) { // Custom alphabet
    PARMSG_T *message;

    alph_reader_done(data->alph_rdr);
    // drain the reader's message queue, reporting each message
    while (alph_reader_has_message(data->alph_rdr)) {
      message = alph_reader_next_message(data->alph_rdr);
      if (message->severity != SEVERITY_ERROR) {
        local_warning(data, "Alphabet warning: %s.\n", message->message);
      } else {
        local_error(data, "Alphabet error: %s.\n", message->message);
      }
      parmsg_destroy(message);
    }
    // try to get an alphabet, then dispose of the reader
    data->alph = alph_reader_alphabet(data->alph_rdr);
    alph_reader_destroy(data->alph_rdr);
    data->alph_rdr = NULL;
    return;
  }

  // Legacy alphabet: one flag per core symbol, initially all unset.
  n_core = alph_size_core(data->alph);
  exists = mm_malloc(sizeof(bool) * n_core);
  for (i = 0; i < n_core; i++) {
    exists[i] = false;
  }

  // Mark the symbol of every identifier, reporting any symbol that has
  // already been claimed by an earlier identifier.
  node = rbtree_first(data->letter_lookup);
  while (node != NULL) {
    char *id = (char*)rbtree_key(node);
    char symbol = ((char*)rbtree_value(node))[0];
    // NOTE(review): the index is used unchecked; presumably the lookup only
    // ever contains core symbols - confirm against the code that fills it.
    sym_index = alph_indexc(data->alph, symbol);
    if (exists[sym_index]) {
      // duplicate!
      local_error(data, "The letter identifier %s is not the first to refer to symbol %c.\n", id, symbol);
    }
    exists[sym_index] = true;
    node = rbtree_next(node);
  }

  // Any flag still unset is a core symbol without an identifier.
  for (i = 0; i < n_core; i++) {
    if (exists[i]) continue;
    local_error(data, "The symbol %c does not have an assigned identifier.\n", alph_char(data->alph, i));
  }
  free(exists);
}
Ejemplo n.º 9
0
/**
 * to_str_seed
 *
 * Translates an integer encoded seed into its ASCII form by mapping each
 * of the w encoded values through the alphabet. The returned string is
 * null-terminated and dynamically allocated here; the caller owns it and
 * must free it.
 */
char *to_str_seed(
  ALPH_T *alph,      // alphabet
  uint8_t *e_seed,   // Integer encoded representation.
  int w              // The length of the string.
)
{
  char *str_seed = NULL;
  int pos = 0;

  // room for w letters plus the terminating null
  Resize(str_seed, w+1, char);

  while (pos < w) {
    str_seed[pos] = alph_char(alph, e_seed[pos]);
    pos++;
  }
  str_seed[w] = '\0';

  return str_seed;
}
Ejemplo n.º 10
0
/*
 * Print the background frequency of every core alphabet letter.
 *
 * Each entry is written as "<letter>: <frequency> " with three decimal
 * places. A line break is written before every ninth entry, including the
 * very first one; no line break is written after the last entry.
 *
 * output   the stream to print to.
 * bgfreqs  the background frequencies, indexed by alphabet position.
 * options  supplies the alphabet (options->alphabet).
 */
void mcast_print_bg_freqs(
  FILE *output,
  ARRAY_T *bgfreqs,
  MHMMSCAN_OPTIONS_T *options
) {
  int asize = alph_size(options->alphabet, ALPH_SIZE);
  int i;
  for (i = 0; i < asize; i++) {
    // start a new line every 9 entries
    if (i % 9 == 0) {
      fputc('\n', output);
    }
    fprintf(
      output,
      "%c: %1.3f ",
      alph_char(options->alphabet, i),
      get_array_item(i, bgfreqs)
    );
  }
}
Ejemplo n.º 11
0
/*
 * Load background file frequencies into the array.
 */
ARRAY_T* get_file_frequencies(ALPH_T *alph, char *bg_filename, ARRAY_T *freqs) {
  regmatch_t matches[4];
  STR_T *line;
  char chunk[BG_CHUNK_SIZE+1], letter[2], *key;
  int size, terminate, offset, i;
  FILE *fp;
  regex_t bgfreq;
  double freq;
  RBTREE_T *letters;
  RBNODE_T *node;
  
  regcomp_or_die("bg freq", &bgfreq, BGFREQ_RE, REG_EXTENDED);
  letters = rbtree_create(rbtree_strcasecmp, rbtree_strcpy, free, rbtree_dblcpy, free);
  line = str_create(100);
  if (!(fp = fopen(bg_filename, "r"))) {
    die("Unable to open background file \"%s\" for reading.\n", bg_filename);
  }
  
  terminate = feof(fp);
  while (!terminate) {
    size = fread(chunk, sizeof(char), BG_CHUNK_SIZE, fp);
    chunk[size] = '\0';
    terminate = feof(fp);
    offset = 0;
    while (offset < size) {
      // skip mac newline
      if (str_len(line) == 0 && chunk[offset] == '\r') {
        offset++;
        continue;
      }
      // find next new line
      for (i = offset; i < size; ++i) {
        if (chunk[i] == '\n') break;
      }
      // append portion up to the new line or end of chunk
      str_append(line, chunk+offset, i - offset);
      // read more if we didn't find a new line
      if (i == size && !terminate) break;
      // move the offset past the new line
      offset = i + 1;
      // handle windows new line
      if (str_char(line, -1) == '\r') str_truncate(line, -1);
      // remove everything to the right of a comment character
      for (i = 0; i < str_len(line); ++i) {
        if (str_char(line, i) == '#') {
          str_truncate(line, i);
          break;
        }
      }
      // check the line for a single letter followed by a number
      if (regexec_or_die("bg freq", &bgfreq, str_internal(line), 4, matches, 0)) {
        // parse the letter and frequency value
        regex_strncpy(matches+1, str_internal(line), letter, 2);
        freq = regex_dbl(matches+2, str_internal(line));
        // check the frequency is acceptable
        if (freq < 0 || freq > 1) {
          die("The background file lists the illegal probability %g for "
            "the letter %s.\n", freq, letter);
        } else if (freq == 0) {
          die("The background file lists a probability of zero for the "
            "letter %s\n", letter);
        }
        if (freq >= 0 && freq <= 1) rbtree_put(letters, letter, &freq);
      }
      str_clear(line);
    }
  }
  // finished with the file so clean up file parsing stuff
  fclose(fp);
  str_destroy(line, FALSE);
  regfree(&bgfreq);
  // guess the alphabet
  if (*alph == INVALID_ALPH) {
    switch (rbtree_size(letters)) {
      case PROTEIN_ASIZE:
        *alph = PROTEIN_ALPH;
        break;
      case DNA_ASIZE:
        *alph = DNA_ALPH;
        break;
      default:
        die("Number of single character entries in background does not match "
            "an alphabet.\n");
    }
  }
  // make the background
  if (freqs == NULL) freqs = allocate_array(alph_size(*alph, ALL_SIZE));
  assert(get_array_length(freqs) >= alph_size(*alph, ALL_SIZE));
  init_array(-1, freqs);
  for (node = rbtree_first(letters); node != NULL; node = rbtree_next(node)) {
    key = (char*)rbtree_key(node);
    i = alph_index(*alph, key[0]);
    freq = *((double*)rbtree_value(node));
    if (i == -1) {
      die("Background contains letter %s which is not in the %s alphabet.\n", 
          key, alph_name(*alph));
    }
    if (get_array_item(i, freqs) != -1) {
      die("Background contains letter %s which has the same meaning as an "
          "already listed letter.\n", key);
    }
    set_array_item(i, freq, freqs);
  }
  // check that all items were set
  for (i = 0; i < alph_size(*alph, ALPH_SIZE); i++) {
    if (get_array_item(i, freqs) == -1) {
      die("Background is missing letter %c.\n", alph_char(*alph, i));
    }
  }
  // disabled for backwards compatability (AMA test was failing)
  //normalize_subarray(0, ALPH_ASIZE[*alph], 0.0, freqs);
  // calculate the values of the ambiguous letters from the concrete ones
  calc_ambigs(*alph, FALSE, freqs);
  // cleanup
  rbtree_destroy(letters);
  // return result
  return freqs;
}
Ejemplo n.º 12
0
/*
 * Test driver for the substitution matrix code.
 *
 * Usage: subst_matrix [options] <score file>
 *   --dna          use the DNA alphabet (the default is protein)
 *   --dist <n>     PAM distance, parsed as an integer
 *
 * If the distance is greater than 1 the matrix generated by
 * gen_pam_matrix() is printed together with the sum of its entries.
 * get_subst_target_matrix() is then called with the score file.
 *
 * Returns 0 on success; exits with status 1 after printing the usage
 * message if exactly one score file argument is not given.
 */
int main(int argc, char **argv) {
  int i, j, alength;
  int dist = 0;
  ALPH_T alph = PROTEIN_ALPH;
  char *score_filename = NULL;
  MATRIX_T *matrix;
  ARRAY_T *probs;
  double *freqs;
  double sum;

  // Define the usage message.
  // NOTE(review): the message advertises "--dist <float>" but the value is
  // parsed with atoi(); confirm which of the two is intended.
  static const char usage[] =
    "USAGE: subst_matrix [options] <score file>\n"
    "\n"
    "   Options:\n"
    "     --dna\n"
    "     --dist <float>\n"
    "\n";

  // Parse the command line.
  while (1) { 
    int c;
    int option_index = 0;
    const char* option_name;

    // Define command line options.
    // getopt_long requires the last element of the array to be all zeros.
    static struct option long_options[] = {
      {"dna", 0, 0, 0},
      {"dist", 1, 0, 0},
      {0, 0, 0, 0}
    };

    // Read the next option, and break if we're done.
    c = getopt_long_only(argc, argv, "+", long_options, &option_index);
    if (c == -1) {
      break;
    } else if (c != 0) {
      die("Invalid return from getopt (%d)\n", c);
    }

    // Get the option name (we only use long options).
    option_name = long_options[option_index].name;
    if (strcmp(option_name, "dna") == 0) {
      alph = DNA_ALPH;
    } else if (strcmp(option_name, "dist") == 0) {
      dist = atoi(optarg);
    } else {
      die("Invalid option (%s).\n", option_name);
    }
  }

  // Read the single required argument.
  if (optind + 1 != argc) {
    // Never pass a non-literal string as the format argument.
    fputs(usage, stderr);
    exit(1);
  }
  score_filename = argv[optind];

  alength = alph_size(alph, ALPH_SIZE);

  /* background frequencies */
  probs = allocate_array(alength);
  freqs = alph == DNA_ALPH ? pam_dna_freq : pam_prot_freq;
  fill_array(freqs, probs);			/* copy freqs into ARRAY_T */

  if (dist > 1) {
    printf("From gen_pam_matrix:\n");
    matrix = gen_pam_matrix(alph, dist, FALSE);
    /* header line: a blank corner cell followed by the column letters */
    printf("%6c ", ' ');
    for (i=0; i<alength; i++) {
      printf("%6c ", alph_char(alph, i));
    }
    printf("\n");
    /* one line per row: the row letter followed by its entries */
    sum = 0;
    for (i=0; i<alength; i++) {
      printf("%6c ", alph_char(alph, i));
      for (j=0; j<alength; j++) {
        double x = get_matrix_cell(i,j,matrix);
        sum += x;
        printf("%6.4f ", x);
      }
      printf("\n");
    }
    printf("sum of entries = %f\n", sum);
  }

  printf("From get_subst_target_matrix:\n");
  matrix = get_subst_target_matrix(score_filename, alph, dist, probs);

  return 0;
} /* main */