Exemplo n.º 1
0
Arquivo: motif.c Projeto: CPFL/gmeme
/***********************************************************************
 * Normalize every row of the motif's frequency matrix.
 *
 * For each of the motif->length rows of motif->freqs, the first
 * alph_size(motif->alph, ALPH_SIZE) entries are handed to
 * normalize_subarray together with the caller's tolerance.
 * The matrix is modified in place; nothing is returned.
 ***********************************************************************/
void normalize_motif
  (MOTIF_T *motif, double tolerance)
{
  // Number of leading entries of each row that take part in normalization.
  const int num_letters = alph_size(motif->alph, ALPH_SIZE);
  int pos = 0;

  while (pos < motif->length) {
    normalize_subarray(
      0,
      num_letters,
      tolerance,
      get_matrix_row(pos, motif->freqs)
    );
    ++pos;
  }
}
Exemplo n.º 2
0
/***********************************************************************
 * Read the background letter frequencies from XML.
 *
 * xpath_ctxt  XPath context the queries are evaluated against.
 * alph        alphabet whose letters select the values to read.
 *
 * The values are looked up one letter at a time (in alphabet order),
 * normalized over the first alph_size(alph, ALPH_SIZE) entries and then
 * passed to calc_ambigs to fill in the ambiguous characters.
 *
 * Returns a newly allocated array of alph_size(alph, ALL_SIZE) items.
 * Caller is responsible for freeing the returned array.
 ***********************************************************************/
ARRAY_T* read_bg_freqs_from_xml(xmlXPathContextPtr xpath_ctxt, ALPH_T alph) {

  // Size of the buffer used to build the per-letter XPath expression.
  // An enum keeps this a true compile-time constant (no VLA).
  enum { MAX_XPATH_EXPRESSION = 200 };

  xmlXPathObjectPtr xpathObj = NULL;
  ATYPE    value;
  ARRAY_T* bg_freqs;

  int a_size = alph_size(alph, ALPH_SIZE);

  // Use XPATH to get the background frequencies from XML
  xpathObj = xpath_query(
    xpath_ctxt, 
    "//*/background_frequencies/alphabet_array/value"
  );
  int num_values = (xpathObj->nodesetval ? xpathObj->nodesetval->nodeNr : 0);
  xmlXPathFreeObject(xpathObj);

  // The number of background frequencies should match the alphabet size.
  assert(num_values == a_size);
  (void) num_values; // only read by the assert; avoid an unused warning with NDEBUG

  // Allocate the array.
  bg_freqs= allocate_array(alph_size(alph, ALL_SIZE));

  // XML doesn't enforce any order on the emission probability values,
  // so force reading bg frequency values in alphabet order.
  char xpath_expression[MAX_XPATH_EXPRESSION];
  xmlNodePtr currValueNode = NULL;
  int i_node = 0;
  for (i_node = 0; i_node < a_size; i_node++) {
    // Build the XPATH expression to get bg freq for a character.
    snprintf(
      xpath_expression,
      sizeof(xpath_expression),
      "//*/background_frequencies/"
      "alphabet_array/value[@letter_id='letter_%c']",
      alph_char(alph, i_node)
    );
    // Read the selected bg frequency.
    xpathObj = xpath_query(xpath_ctxt, xpath_expression);
    // Should only find one node. The node set pointer itself may be NULL
    // (the count query above already allows for that), so check it before
    // looking at the node count.
    assert(xpathObj->nodesetval != NULL && xpathObj->nodesetval->nodeNr == 1);
    // Decode from node set to numeric value for bg freq.
    currValueNode = xpathObj->nodesetval->nodeTab[0];
    value = xmlXPathCastNodeToNumber(currValueNode);
    // Release the query result only after the node taken from it
    // has been converted.
    xmlXPathFreeObject(xpathObj);
    set_array_item(i_node, value, bg_freqs);
  }

  // Make sure the frequencies add up to 1.0. 
  normalize_subarray(0, a_size, 0.0, bg_freqs);

  // Fill in ambiguous characters. 
  calc_ambigs(alph, FALSE, bg_freqs);

  return bg_freqs;

}
Exemplo n.º 3
0
/*
 * Load the non-redundant database frequencies into the array.
 *
 * alph   selects the entry of ALPH_ASIZE and ALPH_NRDB to use.
 * freqs  destination array; when NULL a new array of
 *        alph_size(alph, ALL_SIZE) items is allocated instead.
 *
 * The first ALPH_ASIZE[alph] items are copied from the table and
 * normalized, then calc_ambigs is applied to the array.
 * Returns the array that was filled (the one passed in, or the new one).
 */
ARRAY_T* get_nrdb_frequencies(ALPH_T alph, ARRAY_T *freqs) {
  const int core_size = ALPH_ASIZE[alph];
  const PROB_T *table;
  int idx;

  if (freqs == NULL) {
    freqs = allocate_array(alph_size(alph, ALL_SIZE));
  }
  // A caller-supplied array must be long enough for the full alphabet.
  assert(get_array_length(freqs) >= alph_size(alph, ALL_SIZE));

  table = ALPH_NRDB[alph];
  idx = 0;
  while (idx < core_size) {
    set_array_item(idx, table[idx], freqs);
    idx++;
  }

  normalize_subarray(0, core_size, 0.0, freqs);
  calc_ambigs(alph, FALSE, freqs);
  return freqs;
}
Exemplo n.º 4
0
/*
 * Post-processing performed once the parser has been selected.
 *
 * Takes the format at mread->formats, asks it for the alphabet and the
 * background, and leaves mread->motif_bg holding a background for that
 * alphabet before calling set_pseudo_bg.
 */
static void parser_selected(MREAD_T *mread) {
  MFORMAT_T* chosen = mread->formats;
  // get the alphabet
  ALPH_T alph = chosen->get_alphabet(chosen->data);

  // get the background
  if (!chosen->get_bg(chosen->data, &(mread->motif_bg))) {
    // get_bg reported false (presumably no background was available),
    // so fall back to uniform frequencies
    mread->motif_bg = get_uniform_frequencies(alph, mread->motif_bg);
  } else {
    // normalize the core letters, then grow the array to the full
    // alphabet size and fill in the ambiguous characters
    normalize_subarray(0, alph_size(alph, ALPH_SIZE), 0.0, mread->motif_bg);
    resize_array(mread->motif_bg, alph_size(alph, ALL_SIZE));
    calc_ambigs(alph, FALSE, mread->motif_bg);
  }

  set_pseudo_bg(mread);
}
Exemplo n.º 5
0
Arquivo: motif.c Projeto: CPFL/gmeme
/***********************************************************************
 * Takes a matrix of meme scores and converts them into letter
 * probabilities.
 *
 * The probability is obtained from a score s with:
 *   p = (2 ^ (s / 100)) * bg
 * after which the pseudo count contribution is removed, the value is
 * clamped to [0, 1] and every row is normalized.
 *
 * alph          alphabet; must not be INVALID_ALPH.
 * scores        score matrix; must not be NULL.
 * bg            background frequencies; must not be NULL.
 * site_count    number of sites (divisor when removing the pseudo count).
 * pseudo_count  pseudo count that is removed from each probability.
 *
 * Returns a newly allocated matrix with one row per row of scores and
 * alph_size(alph, ALPH_SIZE) columns.
 ***********************************************************************/
MATRIX_T* convert_scores_into_freqs
  (ALPH_T alph,
   MATRIX_T *scores,
   ARRAY_T *bg,
   int site_count,
   double pseudo_count)
{
  MATRIX_T *result;
  int num_rows, num_cols;
  int r, c;
  double total_count;

  assert(alph != INVALID_ALPH);
  assert(scores != NULL);
  assert(bg != NULL);

  num_rows = get_num_rows(scores);
  num_cols = alph_size(alph, ALPH_SIZE);

  result = allocate_matrix(num_rows, num_cols);
  total_count = site_count + pseudo_count;

  // Cells are independent of one another, so each row is filled and
  // then normalized before moving on to the next one.
  for (r = 0; r < num_rows; ++r) {
    for (c = 0; c < num_cols; ++c) {
      double letter_bg = get_array_item(c, bg);
      double score = get_matrix_cell(r, c, scores);
      // convert to a probability
      double prob = pow(2.0, score / 100.0) * letter_bg;
      // remove the pseudo count
      prob = ((prob * total_count) - (letter_bg * pseudo_count)) / site_count;
      // clamp into [0, 1]
      prob = (prob < 0) ? 0 : ((prob > 1) ? 1 : prob);
      set_matrix_cell(r, c, prob, result);
    }
    normalize_subarray(0, num_cols, 0.0, get_matrix_row(r, result));
  }

  return result;
}
Exemplo n.º 6
0
/*
 * Build the log cumulative background for a single sequence.
 *
 * alph        alphabet used by the Markov model helpers.
 * sdbg_order  order of the background model; a negative order is fatal.
 * sequence    sequence supplying both the model input and the length.
 *
 * Returns a buffer of (sequence length + 1) doubles obtained from
 * mm_malloc and filled by calculate_log_cumulative_background;
 * it is not released here, so the caller presumably owns it.
 */
static double * log_cumulative_background(ALPH_T *alph, const int sdbg_order, SEQ_T *sequence) {
  if (sdbg_order < 0) die("No such thing as a negative background order");

  double *result = mm_malloc(sizeof(double) * (get_seq_length(sequence)+1));
  const char *residues = get_raw_sequence(sequence);

  // calculate background model: the first call is given the sequence,
  // the second call (NULL sequence) returns the model array
  BGCALC_T *state = NULL;
  calculate_markov_model(alph, sdbg_order, 1.0, false, residues, &state);
  ARRAY_T *model = calculate_markov_model(alph, sdbg_order, 1.0, false, NULL, &state);

  // add x-tuples to model
  extend_markov_model(alph, true, SUM_FREQS, model);

  // normalize for each prefix (convert to conditional probability);
  // the wildcard entry of every group is forced to 1.0
  int offset = 0;
  while (offset < get_array_length(model)) {
    normalize_subarray(offset, alph_size_core(alph), 0, model); 
    set_array_item(offset + alph_wild(alph), 1.0, model);
    offset += alph_size_wild(alph);
  }

  calculate_log_cumulative_background(alph, true, sdbg_order, model, residues, result);
  free_array(model);
  return result;
}
Exemplo n.º 7
0
/*************************************************************************
 * Entry point for centrimo
 *
 * Steps, in order:
 *   1. process the command line and read the sequences;
 *   2. obtain a background (from the sequences unless a background
 *      source was given) and load every motif database;
 *   3. create the output directory and open the sites/HTML outputs;
 *   4. for each motif, score all sequences, write the site counts and
 *      compute the statistics, recording motifs that pass the threshold;
 *   5. write the text output and release everything.
 *
 * Returns 0 on completion; failures are reported through die().
 *************************************************************************/
int main(int argc, char *argv[]) {
  CENTRIMO_OPTIONS_T options;
  SEQ_SITES_T seq_sites;
  SITE_COUNTS_T counts;
  int seqN, motifN, seqlen, db_i, motif_i, i;
  double log_pvalue_thresh;
  SEQ_T** sequences = NULL;
  ARRAY_T* bg_freqs = NULL;
  ARRAYLST_T *stats_list;
  MOTIF_DB_T **dbs, *db;
  MREAD_T *mread;
  MOTIF_STATS_T *stats;
  MOTIF_T *motif, *rev_motif;
  PSSM_T *pos_pssm, *rev_pssm;
  char *sites_path, *desc;
  FILE *sites_file;
  HTMLWR_T *html;
  JSONWR_T *json;

  // COMMAND LINE PROCESSING
  process_command_line(argc, argv, &options);

  // load the sequences
  read_sequences(options.alphabet, options.seq_source, &sequences, &seqN);
  // the length of the first sequence is used as the length for all
  seqlen = (seqN ? get_seq_length(sequences[0]) : 0);
  // calculate a sequence background (unless other background is given)
  if (!options.bg_source) {
    bg_freqs = calc_bg_from_fastas(options.alphabet, seqN, sequences);
  }

  // load the motifs
  motifN = 0;
  dbs = mm_malloc(sizeof(MOTIF_DB_T*) * arraylst_size(options.motif_sources));
  for (i = 0; i < arraylst_size(options.motif_sources); i++) {
    char* db_source;
    db_source = (char*)arraylst_get(i, options.motif_sources);
    dbs[i] = read_motifs(i, db_source, options.bg_source, &bg_freqs, 
        options.pseudocount, options.selected_motifs, options.alphabet);
    motifN += arraylst_size(dbs[i]->motifs);
  }
  // E-value threshold divided by the number of motifs, in log space
  log_pvalue_thresh = log(options.evalue_thresh) - log(motifN);
  // Setup some things for double strand scanning
  if (options.scan_both_strands == TRUE) {
    // Set up hash tables for computing reverse complement
    setup_hash_alph(DNAB);
    setalph(0);
    // Correct background by averaging on freq. for both strands.
    average_freq_with_complement(options.alphabet, bg_freqs);
    normalize_subarray(0, alph_size(options.alphabet, ALPH_SIZE), 0.0, bg_freqs);
    calc_ambigs(options.alphabet, FALSE, bg_freqs);
  }
  // Create output directory
  if (create_output_directory(options.output_dirname, options.allow_clobber, 
        (verbosity >= NORMAL_VERBOSE))) {
    die("Couldn't create output directory %s.\n", options.output_dirname);
  }
  // open output files
  sites_path = make_path_to_file(options.output_dirname, SITES_FILENAME);
  sites_file = fopen(sites_path, "w");
  // The file is written for every motif and closed at the end, so a
  // failed open must be reported here rather than crash later on.
  if (sites_file == NULL) {
    die("Couldn't open the file %s for writing.\n", sites_path);
  }
  free(sites_path);
  // setup html monolith writer
  json = NULL;
  if ((html = htmlwr_create(get_meme_etc_dir(), TEMPLATE_FILENAME))) {
    htmlwr_set_dest_name(html, options.output_dirname, HTML_FILENAME);
    htmlwr_replace(html, "centrimo_data.js", "data");
    json = htmlwr_output(html);
    if (json == NULL) die("Template does not contain data section.\n");
  } else {
    DEBUG_MSG(QUIET_VERBOSE, "Failed to open html template file.\n");
  }
  if (json) {
    // output some top level variables
    jsonwr_str_prop(json, "version", VERSION);
    jsonwr_str_prop(json, "revision", REVISION);
    jsonwr_str_prop(json, "release", ARCHIVE_DATE);
    jsonwr_str_array_prop(json, "cmd", argv, argc);
    jsonwr_property(json, "options");
    jsonwr_start_object_value(json);
    jsonwr_dbl_prop(json, "motif-pseudo", options.pseudocount);
    jsonwr_dbl_prop(json, "score", options.score_thresh);
    jsonwr_dbl_prop(json, "ethresh", options.evalue_thresh);
    jsonwr_lng_prop(json, "maxbin", options.max_window+1);
    jsonwr_bool_prop(json, "norc", !options.scan_both_strands);
    jsonwr_bool_prop(json, "noflip", options.no_flip);
    jsonwr_end_object_value(json);
    // output the description
    desc = prepare_description(&options);
    if (desc) {
      jsonwr_str_prop(json, "job_description", desc);
      free(desc);
    }
    // output size metrics
    jsonwr_lng_prop(json, "seqlen", seqlen);
    jsonwr_lng_prop(json, "tested", motifN);
    // output the fasta db
    jsonwr_property(json, "sequence_db");
    jsonwr_start_object_value(json);
    jsonwr_str_prop(json, "source", options.seq_source);
    jsonwr_lng_prop(json, "count", seqN);
    jsonwr_end_object_value(json);
    // output the motif dbs
    jsonwr_property(json, "motif_dbs");
    jsonwr_start_array_value(json);
    for (db_i = 0; db_i < arraylst_size(options.motif_sources); db_i++) {
      db = dbs[db_i];
      jsonwr_start_object_value(json);
      jsonwr_str_prop(json, "source", db->source);
      jsonwr_lng_prop(json, "count", arraylst_size(db->motifs));
      jsonwr_end_object_value(json);
    }
    jsonwr_end_array_value(json);
    // start the motif array (closed after the motif loop below)
    jsonwr_property(json, "motifs");
    jsonwr_start_array_value(json);
  }
  /**************************************************************
   * Tally the positions of the best sites for each of the 
   * selected motifs.
   **************************************************************/
  // prepare the sequence sites
  memset(&seq_sites, 0, sizeof(SEQ_SITES_T));
  // prepare the site counts
  // NOTE(review): when no sequences were read seqlen is 0 and this
  // becomes -1 -- confirm read_sequences rejects empty input.
  counts.allocated = ((2 * seqlen) - 1);
  counts.sites = mm_malloc(sizeof(double) * counts.allocated);
  // prepare the motifs stats list
  stats_list = arraylst_create();
  // prepare the other vars
  motif = NULL; pos_pssm = NULL; rev_motif = NULL; rev_pssm = NULL;
  for (db_i = 0; db_i < arraylst_size(options.motif_sources); db_i++) {
    db = dbs[db_i];
    for (motif_i = 0; motif_i < arraylst_size(db->motifs); motif_i++) {
      motif = (MOTIF_T *) arraylst_get(motif_i, db->motifs);
      DEBUG_FMT(NORMAL_VERBOSE, "Using motif %s of width %d.\n",  
          get_motif_id(motif), get_motif_length(motif));
      // reset the counts
      for (i = 0; i < counts.allocated; i++) counts.sites[i] = 0;
      counts.total_sites = 0;
      // create the pssm 
      pos_pssm = make_pssm(bg_freqs, motif);
      // If required, do the same for the reverse complement motif.
      // (otherwise rev_motif and rev_pssm stay NULL for the whole run)
      if (options.scan_both_strands) {
        rev_motif = dup_rc_motif(motif);
        rev_pssm = make_pssm(bg_freqs, rev_motif);
      }
      // scan the sequences
      for (i = 0; i < seqN; i++)
        score_sequence(&options, sequences[i], pos_pssm, rev_pssm, 
            &seq_sites, &counts);
      // DEBUG check that the sum of the sites is close to the site count
      double sum_check = 0, sum_diff;
      for (i = 0; i < counts.allocated; i++) sum_check += counts.sites[i];
      sum_diff = counts.total_sites - sum_check;
      if (sum_diff < 0) sum_diff = -sum_diff;
      if (sum_diff > 0.1) {
        fprintf(stderr, "Warning: site counts don't sum to accurate value! "
            "%g != %ld", sum_check, counts.total_sites);
      }
      // output the plain text site counts
      output_site_counts(sites_file, seqlen, db, motif, &counts);
      // compute the best central window
      stats = compute_stats(options.max_window, seqlen, db, motif, &counts);
      // check if it passes the threshold; stats kept in the list are
      // released with the list at the end, the others right away
      if (json && stats->log_adj_pvalue <= log_pvalue_thresh) {
        output_motif_json(json, stats, &counts);
        arraylst_add(stats, stats_list);
      } else {
        free(stats);
      }
      // Free memory associated with this motif.
      free_pssm(pos_pssm);
      free_pssm(rev_pssm);
      destroy_motif(rev_motif);
    }
  }
  if (json) jsonwr_end_array_value(json);
  // finish writing sites
  fclose(sites_file);
  // finish writing html file
  if (html) {
    if (htmlwr_output(html) != NULL) {
      die("Found another JSON replacement!\n");
    }
    htmlwr_destroy(html);
  }
  // write text file
  output_centrimo_text(&options, motifN, stats_list);
  // Clean up.
  for (i = 0; i < seqN; ++i) {
    free_seq(sequences[i]); 
  }
  free(sequences);
  for (i = 0; i < arraylst_size(options.motif_sources); i++) {
    free_db(dbs[i]);
  }
  free(dbs);
  free_array(bg_freqs);
  free(counts.sites);
  free(seq_sites.sites);
  arraylst_destroy(free, stats_list);
  cleanup_options(&options);
  return 0;

}