Ejemplo n.º 1
0
void mcast_print_motif_list(FILE * output, MOTIF_T* motifs, int num_motifs) {
  fputs("\n", output);
  int i;
  for (i = 0; i < num_motifs; i++) {
    MOTIF_T *motif = motif_at(motifs, i);
    MOTIF_T *rc_motif = NULL;
    char *motif_id = get_motif_id(motif);
    int width = get_motif_length(motif);
    char *rc_motif_id = NULL;
    if (i < (num_motifs - 1)) {
      rc_motif = motif_at(motifs, i + 1);
      rc_motif_id = get_motif_id(rc_motif);
    }
    char *best_possible_match = get_best_possible_match(motif);
    char *colored_best_possible_match = color_dna_sequence(best_possible_match);
    char *best_possible_rc_match = NULL;
    char *colored_best_possible_rc_match = NULL;
    if (rc_motif_id && strcmp(motif_id, rc_motif_id) == 0) {
      ++i; // Pair of identiical motif ids indicate forward/reverse pair.
      best_possible_rc_match = get_best_possible_match(rc_motif);
      colored_best_possible_rc_match = color_dna_sequence(best_possible_rc_match);
    }
    const char *indent = "               ";
    fprintf(output, "%s<tr>\n", indent);
    fprintf(output, "%s<td>%s</td>\n", indent, motif_id);
    fprintf(output, "%s<td>%d</td>\n", indent, width);
    fprintf(output, "%s<td class=\"sequence\">%s</td>\n", indent, colored_best_possible_match);
    fprintf(output, "%s<td class=\"sequence\">%s</td>\n", indent, colored_best_possible_rc_match);
    fprintf(output, "%s</tr>\n", indent);
    myfree(best_possible_match);
    myfree(best_possible_rc_match);
    myfree(colored_best_possible_match);
    myfree(colored_best_possible_rc_match);
  }
};
Ejemplo n.º 2
0
/**************************************************************************
 * Makes the histogram file name from the details in the motifs and the
 * file extension.
 * Caller is responsible for freeing memory
 **************************************************************************/
static char* make_pattern_file_name(char *prefix, char *ext, MOTIF_T* primary, SECONDARY_MOTIF_T *secondary) {
  const char *fmt = "%s_%s_db%d_%s.%s";
  char dummy[1];
  char *ret;
  int len;
  len = snprintf(dummy, 1, fmt, prefix, get_motif_id(primary), secondary->db->id, get_motif_id(secondary->motif), ext);
  ret = mm_malloc(sizeof(char) * (len+1));
  snprintf(ret, (len+1), fmt, prefix, get_motif_id(primary), secondary->db->id, get_motif_id(secondary->motif), ext);
  return ret;
}
Ejemplo n.º 3
0
/*************************************************************************
 * Read a motif database
 *************************************************************************/
static MOTIF_DB_T* read_motifs(int id, char* motif_source, char* bg_source, 
    ARRAY_T** bg, double pseudocount, RBTREE_T *selected, ALPH_T alph) {
  // vars
  int read_motifs;
  MOTIF_DB_T* motifdb;
  MREAD_T *mread;
  MOTIF_T *motif;
  ARRAYLST_T *motifs;
  // open the motif file for reading
  mread = mread_create(motif_source, OPEN_MFILE);
  mread_set_pseudocount(mread, pseudocount);
  // determine background to use
  if (*bg != NULL) mread_set_background(mread, *bg);
  else mread_set_bg_source(mread, bg_source);
  // load motifs
  read_motifs = 0;
  if (rbtree_size(selected) > 0) {
    motifs = arraylst_create();
    while(mread_has_motif(mread)) {
      motif = mread_next_motif(mread);
      read_motifs++;
      if (rbtree_find(selected, get_motif_id(motif))) {
        arraylst_add(motif, motifs);
      } else {
        DEBUG_FMT(NORMAL_VERBOSE, "Discarding motif %s in %s.\n", 
            get_motif_id(motif), motif_source);
        destroy_motif(motif);
      }
    }
  } else {
    motifs = mread_load(mread, NULL);
    read_motifs = arraylst_size(motifs);
  }
  arraylst_fit(motifs);
  if (read_motifs > 0) {
    // check the alphabet
    if (mread_get_alphabet(mread) != alph) {
      die("Expected %s alphabet motifs\n", alph_name(alph));
    }
    // get the background
    if (*bg == NULL) *bg = mread_get_background(mread);
  } else {
    fprintf(stderr, "Warning: Motif file %s contains no motifs.\n", motif_source);
  }
  // clean up motif reader
  mread_destroy(mread);
  // create motif db
  motifdb = mm_malloc(sizeof(MOTIF_DB_T));
  memset(motifdb, 0, sizeof(MOTIF_DB_T));
  motifdb->id = id;
  motifdb->source = strdup(motif_source);
  motifdb->motifs = motifs;
  return motifdb; 
}
Ejemplo n.º 4
0
/**************************************************************************
 * Outputs txt for the spacings
 **************************************************************************/
void output_secondary_motif_txt(FILE *txt_output, 
    MOTIF_DB_T *primary_db, MOTIF_T *primary_motif, 
    SECONDARY_MOTIF_T *parent, SECONDARY_MOTIF_T *smotif, 
    int n_secondary_motifs, LINKLST_T *rmotifs) {
  LINK_T *node;
  SIGSPACE_T sig;
  int i, quad;
  char* orient_names[NORIENTS];
  // this maintains the previous definition of orientation but actual names would be better
  orient_names[LEFT | SAME] = "0";
  orient_names[LEFT | OPPO] = "1";
  orient_names[RIGHT | SAME] = "2";
  orient_names[RIGHT | OPPO] = "3";
  orient_names[UP_SEC_PAL] = "4";
  orient_names[UP_PRI_PAL] = "5";
  orient_names[DOWN_PRI_PAL] = "6";
  orient_names[DOWN_SEC_PAL] = "7";
  orient_names[BOTH_PAL] = "8";

  for (i = 0; i < smotif->sig_count; ++i) {
    sig = smotif->sigs[i];
    // Output txt line.
    fprintf(txt_output, "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%d\t%d\t%s\t%s\t%s\t%.2e\t%d\t%s\t%d\t%d\t%.2e\t%.2e\n",
      primary_db->name,
      get_motif_id(primary_motif),
      strcmp(get_motif_id2(primary_motif), "") ? get_motif_id2(primary_motif) : ".",
      get_motif_consensus(primary_motif),
      smotif->db->name,
      get_motif_id(smotif->motif),
      strcmp(get_motif_id2(smotif->motif), "") ? get_motif_id2(smotif->motif) : ".",
      get_motif_consensus(smotif->motif),
      get_motif_trim_left(smotif->motif),
      get_motif_trim_right(smotif->motif),
      parent ? smotif->db->name : ".",
      parent ? get_motif_id(parent->motif) : ".",
      parent && strcmp(get_motif_id2(parent->motif), "") ? get_motif_id2(parent->motif) : ".",
      smotif->min_pvalue * n_secondary_motifs, // E-value
      sig.bin, // gap
      orient_names[sig.orient], // orientation
      smotif->spacings[sig.orient].count[sig.bin], // count
      smotif->total_spacings, // total occurrences of secondary in margins
      sig.pvalue < 1e-10 ? sig.pvalue/NORIENTS : 1-pow((1-sig.pvalue),1.0/NORIENTS),
      sig.pvalue
    );
  }

  if (rmotifs != NULL && linklst_size(rmotifs) > 0) {
    for (node = linklst_first(rmotifs); node != NULL; node = linklst_next(node)) {
      SECONDARY_MOTIF_T *rmotif = linklst_get(node);
      output_secondary_motif_txt(txt_output, primary_db, primary_motif, smotif, rmotif, n_secondary_motifs, NULL);
    }
  }
}
Ejemplo n.º 5
0
/*************************************************************************
 * Compare two motif_stats in an arraylst
 * Takes pointers to pointers to MOTIF_STATS_T.
 *************************************************************************/
static int motif_stats_compar(const void *s1, const void *s2) {
  MOTIF_STATS_T *m_stats1, *m_stats2;
  double diff;
  int diff2;
  m_stats1 = *((MOTIF_STATS_T**)s1);
  m_stats2 = *((MOTIF_STATS_T**)s2);
  diff = m_stats1->log_adj_pvalue - m_stats2->log_adj_pvalue;
  if (diff != 0) return (diff < 0 ? -1 : 1);
  diff2 = strcmp(get_motif_id(m_stats1->motif), get_motif_id(m_stats2->motif));
  if (diff2 != 0) return diff2;
  return m_stats1->db->id - m_stats2->db->id;
}
Ejemplo n.º 6
0
static inline char* get_full_motif_id
  (MOTIF_T *motif)
{
  if (get_motif_strands(motif)) {
    return get_motif_st_id(motif);
  }
  return get_motif_id(motif);
}
Ejemplo n.º 7
0
/**************************************************************************
 * Callback invoked when matching an opening matched_element tag for a 
 * CISML file of a secondary motif database. A hit must pass the checks:
 * 1) The current match is for a sequence/motif that we're interested in.
 * 2) A score is supplied.
 * 3) The score supplied is better or equal to the existing best score.
 * 4) Consistant with CISML format so start and stop are larger than 0.
 * 5) The distance between the start and stop matches the motif.
 * 6) It fits within the margin region around the primary motif
 * 7) It does not overlap the primary motif. 
 * Provided all those checks pass then the hit is calculated relative to
 * the start of the matched region. If the score is equal to the current
 * best then the relative hit position is added to the list of best hits,
 * otherwise the list is cleared, the best score is updated and the hit
 * is added to the previously empty list.
 **************************************************************************/
void match_secondary(void *ctx, long start, long stop, double *score, double *pvalue, char *clusterId) {
  SECONDARY_LOADER_T *loader = (SECONDARY_LOADER_T*)ctx;
  int lpos, rpos, rc, relative_position, match;
  //check if we're loading this match
  if (loader->current_sequence == NULL) return;
  //check if this match has enough information to be considered
  if (score == NULL) return;
  //check to see if the existing match is better
  if (loader->hit_count > 0 && loader->secondary_score > *score) return;
  //convert the coordinates of the match into easier to use ones
  if (start <= 0 || stop <= 0) {
    die("Expected start and stop fields in cisml to be 1 or larger.\n");
  }
  if (start < stop) {
    lpos = start;
    rpos = stop;
    rc = FALSE;
  } else {
    lpos = stop;
    rpos = start;
    rc = TRUE;
  }
  //check that gap makes sense
  if ((rpos - lpos + 1) != get_motif_length(loader->secondary_motif->motif)) {
    die("Motif %s has length %d but a match in a CISML file had a start of %ld and stop of %ld which evaluates to a length of %d\n", 
        get_motif_id(loader->secondary_motif->motif), get_motif_length(loader->secondary_motif->motif), start, stop, (rpos - lpos + 1) );
  }
  //check for overlap with the primary match
  //and that the secondary motif fits within the margin
  if (rpos < loader->primary_lpos) { // left side (upstream)
    if ((loader->primary_lpos - lpos) > loader->margin) return;//outside margin    
  } else if (lpos > loader->primary_rpos) { // right side (downstream)
    if ((rpos - loader->primary_rpos) > loader->margin) return;//outside margin
  } else {
    return;//overlap
  }
  //match seems valid and better than anything we've seen previous so update
  //note that stored position is relative to the start of the margin, as if
  //this was scored on a trimmed sequence indexing from 1 
  //this has the advantage that we only need the width of the primary
  //motif and the size of the margin to calculate the offset
  relative_position = lpos - (loader->primary_lpos - loader->margin) + 1;
  //now make the scale pos/neg dependent on if the match is with a
  //reverse complement
  match = (rc ? -relative_position : relative_position);
  if (loader->hit_count == 0 || loader->secondary_score > *score) {
    loader->secondary_score = *score;
    loader->hit_count = 1;
    loader->hits[0] = match;
  } else if (loader->secondary_score == *score) {
    if (loader->hit_count >= loader->hits_size) {
      loader->hits_size = loader->hit_count + 10;
      loader->hits = mm_realloc(loader->hits, sizeof(int) * loader->hits_size);
    }
    loader->hits[loader->hit_count++] = match;
  }
}
Ejemplo n.º 8
0
/***********************************************************************
 * Discard motifs that are not connected.
 ***********************************************************************/
static void throw_out_unused_motifs
  (MATRIX_T* transp_freq,
   MATRIX_T* spacer_ave,
   int*      num_motifs,
   MOTIF_T*  motifs)
{
  int i_motif, j_motif;
  ARRAY_T* row_sums;
  ARRAY_T* col_sums;

  // Extract the margins of the transition matrix.
  row_sums = get_matrix_row_sums(transp_freq);
  col_sums = get_matrix_col_sums(transp_freq);

  for (i_motif = 0; i_motif < *num_motifs; i_motif++) {

    // Is this row or column empty?
    if ((get_array_item(i_motif + 1, row_sums) == 0.0) ||
      (get_array_item(i_motif + 1, col_sums) == 0.0)) {

      if (verbosity >= NORMAL_VERBOSE) {
        fprintf(stderr, "Removing unused motif %s. No occurrences of this motif were found.\n",
        get_motif_id(&(motifs[i_motif])));
      }

      // Remove the row and column from the transition matrix.
      remove_matrix_row(i_motif + 1, transp_freq);
      remove_matrix_col(i_motif + 1, transp_freq);
      assert(get_num_rows(transp_freq) == get_num_cols(transp_freq));

      remove_matrix_row(i_motif + 1, spacer_ave);
      remove_matrix_col(i_motif + 1, spacer_ave);
      assert(get_num_rows(spacer_ave) == get_num_cols(spacer_ave));

      // Remove the motif from the array.
      for (j_motif = i_motif + 1; j_motif < *num_motifs; j_motif++) {
        free_motif(&(motifs[j_motif - 1]));
        copy_motif(&(motifs[j_motif]), &(motifs[j_motif - 1]));
      }
      free_motif(&(motifs[j_motif - 1]));
      (*num_motifs)--;
      i_motif--;

      // Recalculate the row and column sums.
      free_array(row_sums);
      free_array(col_sums);
      row_sums = get_matrix_row_sums(transp_freq);
      col_sums = get_matrix_col_sums(transp_freq);

    }
  }

  free_array(row_sums);
  free_array(col_sums);
}
Ejemplo n.º 9
0
/***********************************************************************
 * Should the given motif be inserted into the model?
 * FIXME: These tests needn't be mutually exclusive.
 ***********************************************************************/
static BOOLEAN_T retain_motif(
  STRING_LIST_T* requested_motifs, // IDs of motifs to include.
  double         e_threshold,      // E-value to include motifs. 
  double         complexity_threshold, // Complexity threshold to include.
  ORDER_T*       order_spacing,    // Motif order and spacing (linear HMM). 
  MOTIF_T*       motif             // The motif. 
) {
  int num_requested;
  int i;
  char* motif_id;

  /* Method 1: Select motifs by index. */
  num_requested = get_num_strings(requested_motifs);
  if (num_requested > 0) {
    motif_id = get_motif_id(motif);
    for (i = 0; i < num_requested; i++) {
      if (strcmp(get_nth_string(i, requested_motifs), motif_id) == 0) {
        return(TRUE);
      }
    }
    return(FALSE);
  }

  /* Method 2: Select motifs below a certain E-value threshold. */
  else if (e_threshold != 0.0) {
    return (get_motif_evalue(motif) <= e_threshold);
  }

  /* Method 3: Select motifs that are included in the order string. */
  else if (order_spacing != NULL) {
    return order_contains(get_motif_id(motif), order_spacing);
  }

  // Method 4: Select motifs by their complexity score.
  else if (complexity_threshold != 0.0) {
    return(motif->complexity >= complexity_threshold);
  }

  /* Default is to include all motifs. */
  return(TRUE);
}
Ejemplo n.º 10
0
/*************************************************************************
 * Output CentriMo text output
 *************************************************************************/
static void output_centrimo_text(CENTRIMO_OPTIONS_T *options, int motifN,
    ARRAYLST_T *stats_list) {
  MOTIF_STATS_T *stats;
  char *file_path;
  int i, pad;
  double log_motifN;
  FILE *text_file;
  MOTIF_T *motif;
  // find the evalue conversion factor
  log_motifN = log(motifN);
  // Sort and write centrimo.txt
  arraylst_qsort(motif_stats_compar, stats_list);
  // open centrimo text file
  file_path = make_path_to_file(options->output_dirname, TEXT_FILENAME);
  text_file = fopen(file_path, "w");
  free(file_path);
  fputs("# motif             \tE-value\tadj_p-value\tlog_adj_p-value\t"
      "bin_width\ttotal_width\tsites_in_bin\ttotal_sites\tp_success\t"
      "p-value\tmult_tests\n", text_file);
  fprintf(text_file, "# Found %d motifs with E-values <= %g\n", 
      arraylst_size(stats_list), options->evalue_thresh);
  // write centrimo text output
  for (i = 0; i < arraylst_size(stats_list); i++) {
    stats = arraylst_get(i, stats_list);
    motif = stats->motif;
    pad = 19 - strlen(get_motif_id(motif));
    fprintf(text_file, "%s %-*s\t", get_motif_id(motif), 
        pad, get_motif_id2(motif));
    print_log_value(text_file, stats->log_adj_pvalue + log_motifN, 1);
    fputs("\t", text_file);
    print_log_value(text_file, stats->log_adj_pvalue, 1);
    fprintf(text_file, "\t%.2f\t%d\t%d\t%.0f\t%ld\t%.5f\t", 
        stats->log_adj_pvalue, stats->central_window+1, stats->all_window, 
        stats->central_sites, stats->total_sites, 
        stats->central_prob);
    print_log_value(text_file, stats->log_pvalue, 1);
    fprintf(text_file, "\t%d\n", stats->n_win_tested);
  }
  fclose(text_file);
}
Ejemplo n.º 11
0
/******************************************************************************
  Print JavaScript code defining an array of motifs 
  and their best possible matches.
 *****************************************************************************/
void mcast_print_motif_array(FILE *output, MOTIF_T *motifs, int num_motifs) {
   int i;
   fputs("\n", output);
   for (i = 0; i < num_motifs; i++) {
     MOTIF_T *motif = motif_at(motifs, i);
     MOTIF_T *rc_motif = NULL;
     char *motif_id = get_motif_id(motif);
     char *rc_motif_id = NULL;
     if (i < (num_motifs - 1)) {
       rc_motif = motif_at(motifs, i + 1);
       rc_motif_id = get_motif_id(rc_motif);
     }
     char *best_possible_match = get_best_possible_match(motif);
     if (rc_motif_id && strcmp(motif_id, rc_motif_id) == 0) {
       ++i; // Pair of identiical motif ids indicate forward/reverse pair.
       char *best_possible_rc_match = get_best_possible_match(rc_motif);
       fprintf(
           output,
          "      motifs[\"%s\"] = new Motif(\"%s\", \"nucleotide\", \"%s\", \"%s\");\n",
          motif_id,
          motif_id,
          best_possible_match,
          best_possible_rc_match
       );
       myfree(best_possible_rc_match);
     }
     else {
       fprintf(
           output,
          "      motifs[\"%s\"] = new Motif(\"%s\", \"nucleotide\", \"%s\", \"%s\");\n",
          motif_id,
          motif_id,
          best_possible_match,
          "" 
       );
     }
     myfree(best_possible_match);
   }
};
Ejemplo n.º 12
0
/**************************************************************************
 * Callback invoked when matching a matched_element tag in the CISML file
 * for the primary motif. If we are recording scores for this motif and
 * sequence then it:
 * 1) Checks that a score was supplied
 * 2) Checks that the start and stop are correctly spaced for the expected 
 *    motif.
 * 3) Checks that the hit does not overlap the margin on each end of the
 *    sequence.
 * 4) If we don't have a best score, or this score is better: 
 *      - clear the list of best hits and add this one.
 * 5) Alternately if this score is equal to the existing one:
 *      - add the hit to the list of best hits.
 **************************************************************************/
void match_primary(void *ctx, long start, long stop, double *score, double *pvalue, char *clusterId) {
  PRIMARY_LOADER_T *loader = (PRIMARY_LOADER_T*)ctx;
  int lpos, rpos, rc;
  //check we're actually loading data
  if (loader->current_sequence == NULL) return;
  //check that this match is worth investigating further
  if (score == NULL) return;
  if (start <= 0 || stop <= 0) {
    die("Expected start and stop fields in cisml to be 1 or larger.\n");
  }
  if (start < stop) {
    lpos = start;
    rpos = stop;
    rc = FALSE;
  } else {
    lpos = stop;
    rpos = start;
    rc = TRUE;
  }
  //check that gap makes sense
  if ((rpos - lpos + 1) != get_motif_length(loader->motif)) {
    die("Motif %s has length %d but a match in a CISML file had a start of %ld and stop of %ld which evaluates to a length of %d\n", 
        get_motif_id(loader->motif), get_motif_length(loader->motif), start, stop, (rpos - lpos + 1) );
  }
  //check left margin
  // For example if we had a margin of 1 then the primary motif must start at 
  // 2 or larger which would allow a secondary motif of length 1 to fit at 
  // position 1
  if (lpos <= loader->margin) return;
  //check right margin
  // For example if we had a sequence of length 5 and a margin of 1 then the 
  // primary motif must finish at 4 or smaller which would allow a secondary 
  // motif of length 1 to fit at position 5
  if (rpos > (loader->current_sequence->length - loader->margin)) return;
  //now see if our existing best match is worse than this one
  if (loader->hit_count == 0 || *score > loader->current_score) {
    loader->current_score = *score;
    loader->hit_count = 1;
    loader->hits[0] = (rc ? -lpos : lpos);
  } else if (*score == loader->current_score) {
    if (loader->hit_count >= loader->hits_size) {
      loader->hits_size = loader->hit_count + 10;
      loader->hits = mm_realloc(loader->hits, sizeof(int) * loader->hits_size);
    }
    loader->hits[loader->hit_count++] = (rc ? -lpos : lpos);
  }
}
Ejemplo n.º 13
0
/*************************************************************************
 * Output motif site counts
 *************************************************************************/
static void output_site_counts(FILE* fh, int sequence_length, 
    MOTIF_DB_T* db, MOTIF_T* motif, SITE_COUNTS_T* counts) {
  // vars
  int i, w, end;
  char *alt;
  fprintf(fh, "DB %d MOTIF\t%s", db->id, get_motif_id(motif));
  alt = get_motif_id2(motif);
  if (alt[0]) fprintf(fh, "\t%s", alt);
  fprintf(fh, "\n");
  w = get_motif_length(motif);
  end = counts->allocated - (w - 1);
  for (i = (w - 1); i < end; i += 2) {
    fprintf(fh, "% 6.1f\t%g\n", 
        ((double)(i - sequence_length + 1)) / 2.0, 
        counts->sites[i]);
  }
}
Ejemplo n.º 14
0
/*************************************************************************
 * Output JSON data for a motif
 *************************************************************************/
static void output_motif_json(JSONWR_T* json, MOTIF_STATS_T* stats, 
    SITE_COUNTS_T* counts) {
  //vars
  MOTIF_T *motif;
  MATRIX_T *freqs;
  int i, j, mlen, asize, end;
  motif = stats->motif;
  freqs = get_motif_freqs(motif);
  asize = alph_size(get_motif_alph(motif), ALPH_SIZE);
  jsonwr_start_object_value(json);
  jsonwr_lng_prop(json, "db", stats->db->id);
  jsonwr_str_prop(json, "id", get_motif_id(motif));
  if (*(get_motif_id2(motif))) {
    jsonwr_str_prop(json, "alt", get_motif_id2(motif));
  }
  mlen = get_motif_length(motif);
  jsonwr_lng_prop(json, "len", mlen);
  jsonwr_dbl_prop(json, "motif_evalue", get_motif_evalue(motif));
  jsonwr_dbl_prop(json, "motif_nsites", get_motif_nsites(motif));
  if (get_motif_url(motif) && *get_motif_url(motif)) {
    jsonwr_str_prop(json, "url", get_motif_url(motif));
  }
  jsonwr_property(json, "pwm");
  jsonwr_start_array_value(json);
  for (i = 0; i < mlen; i++) {
    jsonwr_start_array_value(json);
    for (j = 0; j < asize; j++) {
      jsonwr_dbl_value(json, get_matrix_cell(i, j, freqs));
    }
    jsonwr_end_array_value(json);
  }
  jsonwr_end_array_value(json);
  jsonwr_lng_prop(json, "bin_width", stats->central_window+1);
  jsonwr_dbl_prop(json, "bin_sites", stats->central_sites);
  jsonwr_lng_prop(json, "total_sites", counts->total_sites);
  jsonwr_dbl_prop(json, "log_pvalue", stats->log_adj_pvalue);
  jsonwr_dbl_prop(json, "max_prob", stats->max_prob);
  jsonwr_property(json, "sites");
  jsonwr_start_array_value(json);
  end = counts->allocated - (mlen - 1);
  for (i = (mlen - 1); i < end; i += 2) {
    jsonwr_dbl_value(json, counts->sites[i]);
  }
  jsonwr_end_array_value(json);
  jsonwr_end_object_value(json);
}
Ejemplo n.º 15
0
/**************************************************************************
 * Generate logos for a motif
 * Warning, this may modify the path and motif arguments.
 **************************************************************************/
static void generate_motif_logos(OPTIONS_T *options, STR_T *path, MOTIF_T *motif) {
  int path_len;
  char name[MAX_MOTIF_ID_LENGTH + 1];

  copy_and_sanatise_name(name, get_motif_id(motif), MAX_MOTIF_ID_LENGTH);
  name[MAX_MOTIF_ID_LENGTH] = '\0';
  path_len = str_len(path);
  
  str_appendf(path, "logo%s", name);
  CL_create1(motif, FALSE, FALSE, "MEME (no SSC)", str_internal(path), 
      options->eps, options->png);

  if (options->rc) {
    str_truncate(path, path_len);
    str_appendf(path, "logo_rc%s", name);
    reverse_complement_motif(motif);
    CL_create1(motif, FALSE, FALSE, "MEME (no SSC)", str_internal(path), 
        options->eps, options->png);
  }

  str_truncate(path, path_len);
}
Ejemplo n.º 16
0
/***********************************************************************
 * Eliminate motifs that don't meet thresholds or are unused
 ***********************************************************************/
static void filter_motifs(
  STRING_LIST_T* requested_motifs, // Indices of motifs to include. 
  double         e_threshold,      // E-value to include motifs. 
  double         complexity_threshold, // For eliminate low-complexity motifs.
  ORDER_T**      order_spacing,    // Motif order and spacing (linear HMM). 
  int*           num_motifs,       // Number of motifs retrieved. 
  MOTIF_T*       motifs            // The retrieved motifs. 
) {
  int i_motif = 0;
  int num_motifs_kept = 0;

  for(i_motif = 0, num_motifs_kept = 0; i_motif < *num_motifs; i_motif++) {
    if (retain_motif(requested_motifs, e_threshold, complexity_threshold, 
         *order_spacing, &(motifs[i_motif]))) {
      if (i_motif > num_motifs_kept) {
        motifs[num_motifs_kept] = motifs[i_motif];
      }
      num_motifs_kept++;
    }
  }
  
  *num_motifs = num_motifs_kept;

  /* Tell the user which motifs were retained. */
  if (verbosity >= NORMAL_VERBOSE && *num_motifs > 0) {
    fprintf(stderr, "Retaining motif");
    for (i_motif = 0; i_motif < *num_motifs; i_motif++) {
      fprintf(stderr, " %s", get_motif_id(&(motifs[i_motif])));
    }
    fprintf(stderr, ".\n");
  }

  /* No point in proceeding if no motifs were retained. */
  if (*num_motifs == 0) {
    die("No motifs satisfied filters");
  }
    
}
Ejemplo n.º 17
0
/*************************************************************************
 * Entry point for centrimo
 *************************************************************************/
int main(int argc, char *argv[]) {
  CENTRIMO_OPTIONS_T options;
  SEQ_SITES_T seq_sites;
  SITE_COUNTS_T counts;
  int seqN, motifN, seqlen, db_i, motif_i, i;
  double log_pvalue_thresh;
  SEQ_T** sequences = NULL;
  ARRAY_T* bg_freqs = NULL;
  ARRAYLST_T *stats_list;
  MOTIF_DB_T **dbs, *db;
  MREAD_T *mread;
  MOTIF_STATS_T *stats;
  MOTIF_T *motif, *rev_motif;
  PSSM_T *pos_pssm, *rev_pssm;
  char *sites_path, *desc;
  FILE *sites_file;
  HTMLWR_T *html;
  JSONWR_T *json;

  // COMMAND LINE PROCESSING
  process_command_line(argc, argv, &options);

  // load the sequences
  read_sequences(options.alphabet, options.seq_source, &sequences, &seqN);
  seqlen = (seqN ? get_seq_length(sequences[0]) : 0);
  // calculate a sequence background (unless other background is given)
  if (!options.bg_source) {
    bg_freqs = calc_bg_from_fastas(options.alphabet, seqN, sequences);
  }

  // load the motifs
  motifN = 0;
  dbs = mm_malloc(sizeof(MOTIF_DB_T*) * arraylst_size(options.motif_sources));
  for (i = 0; i < arraylst_size(options.motif_sources); i++) {
    char* db_source;
    db_source = (char*)arraylst_get(i, options.motif_sources);
    dbs[i] = read_motifs(i, db_source, options.bg_source, &bg_freqs, 
        options.pseudocount, options.selected_motifs, options.alphabet);
    motifN += arraylst_size(dbs[i]->motifs);
  }
  log_pvalue_thresh = log(options.evalue_thresh) - log(motifN);
  // Setup some things for double strand scanning
  if (options.scan_both_strands == TRUE) {
    // Set up hash tables for computing reverse complement
    setup_hash_alph(DNAB);
    setalph(0);
    // Correct background by averaging on freq. for both strands.
    average_freq_with_complement(options.alphabet, bg_freqs);
    normalize_subarray(0, alph_size(options.alphabet, ALPH_SIZE), 0.0, bg_freqs);
    calc_ambigs(options.alphabet, FALSE, bg_freqs);
  }
  // Create output directory
  if (create_output_directory(options.output_dirname, options.allow_clobber, 
        (verbosity >= NORMAL_VERBOSE))) {
    die("Couldn't create output directory %s.\n", options.output_dirname);
  }
  // open output files
  sites_path = make_path_to_file(options.output_dirname, SITES_FILENAME);
  sites_file = fopen(sites_path, "w");
  free(sites_path);
  // setup html monolith writer
  json = NULL;
  if ((html = htmlwr_create(get_meme_etc_dir(), TEMPLATE_FILENAME))) {
    htmlwr_set_dest_name(html, options.output_dirname, HTML_FILENAME);
    htmlwr_replace(html, "centrimo_data.js", "data");
    json = htmlwr_output(html);
    if (json == NULL) die("Template does not contain data section.\n");
  } else {
    DEBUG_MSG(QUIET_VERBOSE, "Failed to open html template file.\n");
  }
  if (json) {
    // output some top level variables
    jsonwr_str_prop(json, "version", VERSION);
    jsonwr_str_prop(json, "revision", REVISION);
    jsonwr_str_prop(json, "release", ARCHIVE_DATE);
    jsonwr_str_array_prop(json, "cmd", argv, argc);
    jsonwr_property(json, "options");
    jsonwr_start_object_value(json);
    jsonwr_dbl_prop(json, "motif-pseudo", options.pseudocount);
    jsonwr_dbl_prop(json, "score", options.score_thresh);
    jsonwr_dbl_prop(json, "ethresh", options.evalue_thresh);
    jsonwr_lng_prop(json, "maxbin", options.max_window+1);
    jsonwr_bool_prop(json, "norc", !options.scan_both_strands);
    jsonwr_bool_prop(json, "noflip", options.no_flip);
    jsonwr_end_object_value(json);
    // output the description
    desc = prepare_description(&options);
    if (desc) {
      jsonwr_str_prop(json, "job_description", desc);
      free(desc);
    }
    // output size metrics
    jsonwr_lng_prop(json, "seqlen", seqlen);
    jsonwr_lng_prop(json, "tested", motifN);
    // output the fasta db
    jsonwr_property(json, "sequence_db");
    jsonwr_start_object_value(json);
    jsonwr_str_prop(json, "source", options.seq_source);
    jsonwr_lng_prop(json, "count", seqN);
    jsonwr_end_object_value(json);
    // output the motif dbs
    jsonwr_property(json, "motif_dbs");
    jsonwr_start_array_value(json);
    for (db_i = 0; db_i < arraylst_size(options.motif_sources); db_i++) {
      db = dbs[db_i];
      jsonwr_start_object_value(json);
      jsonwr_str_prop(json, "source", db->source);
      jsonwr_lng_prop(json, "count", arraylst_size(db->motifs));
      jsonwr_end_object_value(json);
    }
    jsonwr_end_array_value(json);
    // start the motif array
    jsonwr_property(json, "motifs");
    jsonwr_start_array_value(json);
  }
  /**************************************************************
   * Tally the positions of the best sites for each of the 
   * selected motifs.
   **************************************************************/
  // prepare the sequence sites
  memset(&seq_sites, 0, sizeof(SEQ_SITES_T));
  // prepare the site counts
  counts.allocated = ((2 * seqlen) - 1);
  counts.sites = mm_malloc(sizeof(double) * counts.allocated);
  // prepare the motifs stats list
  stats_list = arraylst_create();
  // prepare the other vars
  motif = NULL; pos_pssm = NULL; rev_motif = NULL; rev_pssm = NULL;
  for (db_i = 0; db_i < arraylst_size(options.motif_sources); db_i++) {
    db = dbs[db_i];
    for (motif_i = 0; motif_i < arraylst_size(db->motifs); motif_i++) {
      motif = (MOTIF_T *) arraylst_get(motif_i, db->motifs);
      DEBUG_FMT(NORMAL_VERBOSE, "Using motif %s of width %d.\n",  
          get_motif_id(motif), get_motif_length(motif));
      // reset the counts
      for (i = 0; i < counts.allocated; i++) counts.sites[i] = 0;
      counts.total_sites = 0;
      // create the pssm 
      pos_pssm = make_pssm(bg_freqs, motif);
      // If required, do the same for the reverse complement motif.
      if (options.scan_both_strands) {
        rev_motif = dup_rc_motif(motif);
        rev_pssm = make_pssm(bg_freqs, rev_motif);
      }
      // scan the sequences
      for (i = 0; i < seqN; i++)
        score_sequence(&options, sequences[i], pos_pssm, rev_pssm, 
            &seq_sites, &counts);
      // DEBUG check that the sum of the sites is close to the site count
      double sum_check = 0, sum_diff;
      for (i = 0; i < counts.allocated; i++) sum_check += counts.sites[i];
      sum_diff = counts.total_sites - sum_check;
      if (sum_diff < 0) sum_diff = -sum_diff;
      if (sum_diff > 0.1) {
        fprintf(stderr, "Warning: site counts don't sum to accurate value! "
            "%g != %ld", sum_check, counts.total_sites);
      }
      // output the plain text site counts
      output_site_counts(sites_file, seqlen, db, motif, &counts);
      // compute the best central window
      stats = compute_stats(options.max_window, seqlen, db, motif, &counts);
      // check if it passes the threshold
      if (json && stats->log_adj_pvalue <= log_pvalue_thresh) {
        output_motif_json(json, stats, &counts);
        arraylst_add(stats, stats_list);
      } else {
        free(stats);
      }
      // Free memory associated with this motif.
      free_pssm(pos_pssm);
      free_pssm(rev_pssm);
      destroy_motif(rev_motif);
    }
  }
  if (json) jsonwr_end_array_value(json);
  // finish writing sites
  fclose(sites_file);
  // finish writing html file
  if (html) {
    if (htmlwr_output(html) != NULL) {
      die("Found another JSON replacement!\n");
    }
    htmlwr_destroy(html);
  }
  // write text file
  output_centrimo_text(&options, motifN, stats_list);
  // Clean up.
  for (i = 0; i < seqN; ++i) {
    free_seq(sequences[i]); 
  }
  free(sequences);
  for (i = 0; i < arraylst_size(options.motif_sources); i++) {
    free_db(dbs[i]);
  }
  free(dbs);
  free_array(bg_freqs);
  free(counts.sites);
  free(seq_sites.sites);
  arraylst_destroy(free, stats_list);
  cleanup_options(&options);
  return 0;

}
Ejemplo n.º 18
0
/*************************************************************************
 * Entry point for ama
 *************************************************************************/
int main(int argc, char **argv) {
  AMA_OPTIONS_T options;
  ARRAYLST_T *motifs;
  clock_t c0, c1; // measuring cpu_time
  MOTIF_AND_PSSM_T *combo;
  CISML_T *cisml;
  PATTERN_T** patterns;
  PATTERN_T *pattern;
  FILE *fasta_file, *text_output, *cisml_output;
  int i, seq_loading_num, seq_counter, unique_seqs, seq_len, scan_len, x1, x2, y1, y2;
  char *seq_name, *path;
  bool need_postprocessing, created;
  SEQ_T *sequence;
  RBTREE_T *seq_ids;
  RBNODE_T *seq_node;
  double *logcumback;
  ALPH_T *alph;

  // process the command
  process_command_line(argc, argv, &options);

  // load DNA motifs
  motifs = load_motifs(&options);

  // get the alphabet
  if (arraylst_size(motifs) > 0) {
    combo = (MOTIF_AND_PSSM_T*)arraylst_get(0, motifs);
    alph = alph_hold(get_motif_alph(combo->motif));
  } else {
    alph = alph_dna();
  }

  // pick columns for GC operations
  x1 = -1; x2 = -1; y1 = -1; y2 = -1;
  if (alph_size_core(alph) == 4 && alph_size_pairs(alph) == 2) {
    x1 = 0; // A
    x2 = alph_complement(alph, x1); // T
    y1 = (x2 == 1 ? 2 : 1); // C
    y2 = alph_complement(alph, y1); // G
    assert(x1 != x2 && y1 != y2 && x1 != y1 && x2 != y2 && x1 != y2 && x2 != y1);
  }

  // record starting time
  c0 = clock();

  // Create cisml data structure for recording results
  cisml = allocate_cisml(PROGRAM_NAME, options.command_line, options.motif_filename, options.fasta_filename);
  set_cisml_background_file(cisml, options.bg_filename);

  // make a CISML pattern to hold scores for each motif
  for (i = 0; i < arraylst_size(motifs); i++) {
    combo = (MOTIF_AND_PSSM_T*)arraylst_get(i, motifs);
    add_cisml_pattern(cisml, allocate_pattern(get_motif_id(combo->motif), ""));
  }

  // Open the FASTA file for reading.
  fasta_file = NULL;
  if (!open_file(options.fasta_filename, "r", false, "FASTA", "sequences", &fasta_file)) {
    die("Couldn't open the file %s.\n", options.fasta_filename);
  }
  if (verbosity >= NORMAL_VERBOSE) {
    if (options.last == 0) {
      fprintf(stderr, "Using entire sequence\n");
    } else {
      fprintf(stderr, "Limiting sequence to last %d positions.\n", options.last);
    }
  }

  //
  // Read in all sequences and score with all motifs
  //
  seq_loading_num = 0;  // keeps track on the number of sequences read in total
  seq_counter = 0;      // holds the index to the seq in the pattern
  unique_seqs = 0;      // keeps track on the number of unique sequences
  need_postprocessing = false;
  sequence = NULL;
  logcumback = NULL;
  seq_ids = rbtree_create(rbtree_strcasecmp,rbtree_strcpy,free,rbtree_intcpy,free);
  while (read_one_fasta(alph, fasta_file, options.max_seq_length, &sequence)) {
    ++seq_loading_num;
    seq_name = get_seq_name(sequence);
    seq_len = get_seq_length(sequence);
    scan_len = (options.last != 0 ? options.last : seq_len);
    // red-black trees are only required if duplicates should be combined
    if (options.combine_duplicates){
      //lookup seq id and create new entry if required, return sequence index
      seq_node = rbtree_lookup(seq_ids, get_seq_name(sequence), true, &created);
      if (created) { // assign it a loading number
        rbtree_set(seq_ids, seq_node, &unique_seqs);
        seq_counter = unique_seqs;
        ++unique_seqs;
      } else {
        seq_counter = *((int*)rbnode_get(seq_node));
      }
    }
          
    //
    // Set up sequence-dependent background model and compute
    // log cumulative probability of sequence.
    // This needs the sequence in raw format.
    //
    if (options.sdbg_order >= 0)
      logcumback = log_cumulative_background(alph, options.sdbg_order, sequence);

    // Index the sequence, throwing away the raw format and ambiguous characters
    index_sequence(sequence, alph, SEQ_NOAMBIG);

    // Get the GC content of the sequence if binning p-values by GC
    // and store it in the sequence object.
    if (options.num_gc_bins > 1) {
      ARRAY_T *freqs = get_sequence_freqs(sequence, alph);
      set_total_gc_sequence(sequence, get_array_item(y1, freqs) + get_array_item(y2, freqs)); // f(C) + f(G)
      free_array(freqs);                        // clean up
    } else {
      set_total_gc_sequence(sequence, -1);      // flag ignore
    }

    // Scan with motifs.
    for (i = 0; i < arraylst_size(motifs); i++) {
      pattern = get_cisml_patterns(cisml)[i];
      combo = (MOTIF_AND_PSSM_T*)arraylst_get(i, motifs);
      if (verbosity >= HIGHER_VERBOSE) {
        fprintf(stderr, "Scanning %s sequence with length %d "
            "abbreviated to %d with motif %s with length %d.\n",
            seq_name, seq_len, scan_len, 
            get_motif_id(combo->motif), get_motif_length(combo->motif));
      }
      SCANNED_SEQUENCE_T* scanned_seq = NULL;
      if (!options.combine_duplicates || get_pattern_num_scanned_sequences(pattern) <= seq_counter) {
        // Create a scanned_sequence record and save it in the pattern.
        scanned_seq = allocate_scanned_sequence(seq_name, seq_name, pattern);
        set_scanned_sequence_length(scanned_seq, scan_len);
      } else {
        // get existing sequence record
        scanned_seq = get_pattern_scanned_sequences(pattern)[seq_counter];
        set_scanned_sequence_length(scanned_seq, max(scan_len, get_scanned_sequence_length(scanned_seq)));
      }
      
      // check if scanned component of sequence has sufficient length for the motif
      if (scan_len < get_motif_length(combo->motif)) {
        // set score to zero and p-value to 1 if not set yet
        if(!has_scanned_sequence_score(scanned_seq)){
          set_scanned_sequence_score(scanned_seq, 0.0);
        }
        if(options.pvalues && !has_scanned_sequence_pvalue(scanned_seq)){
          set_scanned_sequence_pvalue(scanned_seq, 1.0);
        } 
        add_scanned_sequence_scanned_position(scanned_seq); 
        if (get_scanned_sequence_num_scanned_positions(scanned_seq) > 0L) {
          need_postprocessing = true;
        }
        if (verbosity >= HIGH_VERBOSE) {
          fprintf(stderr, "%s too short for motif %s. Score set to 0.\n",
              seq_name, get_motif_id(combo->motif));
        }
      } else {
        // scan the sequence using average/maximum motif affinity
        ama_sequence_scan(alph, sequence, logcumback, combo->pssm_pair,
            options.scoring, options.pvalues, options.last, scanned_seq,
            &need_postprocessing);
      }
    } // All motifs scanned

    free_seq(sequence);
    if (options.sdbg_order >= 0) myfree(logcumback);

  } // read sequences

  fclose(fasta_file);
  if (verbosity >= HIGH_VERBOSE) fprintf(stderr, "(%d) sequences read in.\n", seq_loading_num);
  if (verbosity >= NORMAL_VERBOSE) fprintf(stderr, "Finished          \n");

        
  // if any sequence identifier was multiple times in the sequence set  then
  // postprocess of the data is required
  if (need_postprocessing || options.normalize_scores) {
    post_process(cisml, motifs, options.normalize_scores);
  }
        
  // output results
  if (options.output_format == DIRECTORY_FORMAT) {
    if (create_output_directory(options.out_dir, options.clobber, verbosity > QUIET_VERBOSE)) {
      // only warn in higher verbose modes
      fprintf(stderr, "failed to create output directory `%s' or already exists\n", options.out_dir);
      exit(1);
    }
    path = make_path_to_file(options.out_dir, text_filename);
    //FIXME check for errors: MEME doesn't either and we at least know we have a good directory
    text_output = fopen(path, "w");
    free(path);
    path = make_path_to_file(options.out_dir, cisml_filename);
    //FIXME check for errors
    cisml_output = fopen(path, "w");
    free(path);
    print_cisml(cisml_output, cisml, true, NULL, false);
    print_score(cisml, text_output);
    fclose(cisml_output);
    fclose(text_output);
  } else if (options.output_format == GFF_FORMAT) {
    print_score(cisml, stdout);
  } else if (options.output_format == CISML_FORMAT) {
    print_cisml(stdout, cisml, true, NULL, false);
  } else {
    die("Output format invalid!\n");
  }

  //
  // Clean up.
  //
  rbtree_destroy(seq_ids);
  arraylst_destroy(motif_and_pssm_destroy, motifs);
  free_cisml(cisml);
  rbtree_destroy(options.selected_motifs);
  alph_release(alph);
        
  // measure time
  if (verbosity >= NORMAL_VERBOSE) { // starting time
    c1 = clock();
    fprintf(stderr, "cycles (CPU);            %ld cycles\n", (long) c1);
    fprintf(stderr, "elapsed CPU time:        %f seconds\n", (float) (c1-c0) / CLOCKS_PER_SEC);
  }
  return 0;
}
Ejemplo n.º 19
0
ARRAYLST_T* load_motifs(AMA_OPTIONS_T *opts) {
  ARRAYLST_T *motifs;
  ARRAY_T *pos_bg_freqs, *rev_bg_freqs;
  MREAD_T *mread;
  MOTIF_T *motif, *motif_rc;
  double range;
  PSSM_T *pos_pssm, *neg_pssm;
  int total_motifs;
  ALPH_T *alph;

  //
  // Read the motifs and background model.
  //
  //this reads any meme file, xml, txt and html
  mread = mread_create(opts->motif_filename, OPEN_MFILE);
  mread_set_bg_source(mread, opts->bg_filename);
  mread_set_pseudocount(mread, opts->pseudocount);

  // sanity check, since the rest of the code relies on the motifs being complementable
  alph = alph_hold(mread_get_alphabet(mread));
  if (alph == NULL) die("Unable to determine alphabet from motifs");
  if (opts->scan_both_strands && !alph_has_complement(alph)) {
    opts->scan_both_strands = false;
  }
  if (opts->num_gc_bins > 1 && alph_size_core(alph) != 4 && alph_size_pairs(alph) != 2) {
    fprintf(stderr, "Warning: The motif alphabet does not have exactly 2 complementary pairs so \"GC binning\" will be disabled.\n");
    opts->num_gc_bins = 1;
  }

  pos_bg_freqs = mread_get_background(mread);
  rev_bg_freqs = NULL;
  if (opts->scan_both_strands) {
    rev_bg_freqs = allocate_array(get_array_length(pos_bg_freqs));
    copy_array(pos_bg_freqs, rev_bg_freqs);
    complement_swap_freqs(alph, rev_bg_freqs, rev_bg_freqs);
  }

  // allocate memory for motifs
  motifs = arraylst_create();
  //
  // Convert motif matrices into log-odds matrices.
  // Scale them.
  // Compute the lookup tables for the PDF of scaled log-odds scores.
  //
  range = 300; // 100 is not very good; 1000 is great but too slow
  neg_pssm = NULL;
  total_motifs = 0;
  while (mread_has_motif(mread)) {
    motif = mread_next_motif(mread);
    total_motifs++;
    if (rbtree_size(opts->selected_motifs) == 0 || rbtree_find(opts->selected_motifs, get_motif_id(motif)) != NULL) {
      if (verbosity >= HIGH_VERBOSE) {
        fprintf(stderr, "Using motif %s of width %d.\n", get_motif_id(motif), get_motif_length(motif));
      }
      pos_pssm =
        build_motif_pssm(
          motif, 
          pos_bg_freqs, 
          pos_bg_freqs, 
          NULL, // Priors not used
          0.0L, // alpha not used
          range, 
          opts->num_gc_bins, 
          true 
        );
      //
      //  Note: If scanning both strands, we complement the motif frequencies
      //  but not the background frequencies so the motif looks the same.
      //  However, the given frequencies are used in computing the p-values
      //  since they represent the frequencies on the negative strands.
      //  (If we instead were to complement the input sequence, keeping the
      //  the motif fixed, we would need to use the complemented frequencies
      //  in computing the p-values.  Is that any clearer?)
      //
      if (opts->scan_both_strands) {
        motif_rc = dup_rc_motif(motif);
        neg_pssm =
          build_motif_pssm(
            motif_rc, 
            rev_bg_freqs, 
            pos_bg_freqs, 
            NULL, // Priors not used
            0.0L, // alpha not used
            range, 
            opts->num_gc_bins, 
            true
          );
        destroy_motif(motif_rc);
      }
      arraylst_add(motif_and_pssm_create(motif, pos_pssm, neg_pssm), motifs);
    } else {
      if (verbosity >= HIGH_VERBOSE) fprintf(stderr, "Skipping motif %s.\n",
          get_motif_id(motif));
      destroy_motif(motif);
    }
  }
  mread_destroy(mread);
  free_array(pos_bg_freqs);
  free_array(rev_bg_freqs);
  alph_release(alph);
  if (verbosity >= NORMAL_VERBOSE) {
    fprintf(stderr, "Loaded %d/%d motifs from %s.\n", 
        arraylst_size(motifs), total_motifs, opts->motif_filename);
  }
  return motifs;
}
Ejemplo n.º 20
0
Archivo: ramen.c Proyecto: CPFL/gmeme
/*
 * Using the linreg test,
 *
 * this method returns the lowest scoring subdivision of a set of sequences for a given motif.
 * It's not self-contained, as it requires to hook into the global variables results, motifs, seq_ids.
 */
ramen_result_t* ramen_do_linreg_test(int motif_num) {
  //Assorted vars
  int seq_num;
  int j,k;
  int motif_index = motif_num * 2; //This is a workaround to the change in the motif datastructure where it now
                      // goes +MOTIFA -MOTIFA +MOTIFB etc. rather than all + then all - motifs.

  //Vars for the regression
  double* x;
  double* y;
  double m = 0;
  double b = 0;
  double mse = 0;

  //Vars for scoring
  ramen_result_t* r;

  //Allocate memory or set initial values
  seq_num = get_num_strings(seq_ids); //number of sequences
  r = malloc(sizeof(ramen_result_t)); //allocate space, as a ptr to this will go in the array later
                    //that's why we don't free it in this loop.
  x = malloc(sizeof(double)*seq_num);
  y = malloc(sizeof(double)*seq_num);

  //Now we need to copy the scores into two double arrays
  //Use LOG macro so that log(0) 'works'
  for (j=0; j < seq_num; j++) {
    if (args.log_fscores == TRUE) {
      y[j] = LOG(get_array_item(j, seq_fscores));
    } else {
      y[j] = get_array_item(j, seq_fscores);
    }

    if (args.log_pwmscores == TRUE) {
      x[j] = LOG(results[motif_num][j]);
    } else {
      x[j] = results[motif_num][j];
    }
  }

    //Switch x&y if they're to be switched
    if (args.linreg_switchxy) {
        SWAP(double*, x, y);
    }

  // TODO: Tidy and/or remove this for production
  if(args.linreg_dump_dir > 0) {
    FILE *fh;
    char* filename;
    filename = malloc(sizeof(char)*(strlen(args.linreg_dump_dir) + 50));
    sprintf(filename, "%s/%s.tsv", args.linreg_dump_dir, get_motif_id(motif_at(motifs.motifs, motif_index)));

    fh = fopen(filename, "w");
    fputs("PWM_Score\tFluorescence_Score\n", fh);
    for (j=0; j < seq_num; j++) {
      fprintf(fh, "%.10e %.10e\n", x[j], y[j]);
    }
    fclose(fh);
    free(filename);
  }


  /*extern double regress(
    int n,                        / number of points /
    double *x,                    / x values /
    double *y,                    / y values /
    double *m,                    / slope /
    double *b                     / y intercept /
    );*/
    mse = regress(seq_num, x, y, &m, &b);

  if (args.verbose >= 3) {
    printf("LinReg MSE of motif %s on %i seqs: %.4g (m: %.4g b: %.4g)\n",
           get_motif_id(motif_at(motifs.motifs, motif_index)), seq_num, mse, m, b);
  }

  //Add to our motif list if lowest MSE
  r->motif_id = strdup(get_motif_id(motif_at(motifs.motifs, motif_index)));
  r->m = m; //Not p-values, but they'll do when we re-use this structure...
  r->b = b;
  r->mse = mse;
  r->p = -1;

    //Do stochastic sampling if required.
    if (args.repeats > 0) {
        int repeat_wins = 0;
        for (j=0;j<args.repeats;j++) {
            double repeat_mse = 0;
            shuffle(x,seq_num); //Shuffle and break the associations between x and y
            repeat_mse = regress(seq_num, x, y, &m, &b);
            //fprintf(stderr, "Motif %d Repeat %d RMSE: %g MSE: %g\n",motif_index,j,repeat_mse,mse);
            if (repeat_mse <= mse) {
                repeat_wins++;
            }
        }
        r->p = repeat_wins*1.0/ args.repeats*1.0;
    }
    free(x);
    free(y);

  return r;
}
Ejemplo n.º 21
0
Archivo: ama.c Proyecto: a1aks/Haystack
/*************************************************************************
 * Entry point for ama
 *************************************************************************/
int main(int argc, char *argv[]) {
  int max_seq_length = MAX_SEQ;
  STRING_LIST_T* selected_motifs = NULL;
  double pseudocount = 0.01;
  int output_format = CISML_FORMAT;
  program_name = "ama";
  int scoring = AVG_ODDS;
  BOOLEAN_T pvalues = FALSE;
  BOOLEAN_T normalize_scores = FALSE;
  BOOLEAN_T combine_duplicates = FALSE;
  int num_gc_bins = 1;
  int sdbg_order = -1;				// don't use sequence background
  BOOLEAN_T scan_both_strands = TRUE;
  ARRAY_T* pos_bg_freqs = NULL;
  ARRAY_T* rev_bg_freqs = NULL;
  clock_t c0, c1; /* measuring cpu_time */
  CISML_T *cisml;
  char * out_dir = NULL;
  BOOLEAN_T clobber = FALSE;
  int i;
  int last = 0;
  ALPH_T alph = INVALID_ALPH;

  /**********************************************
   * COMMAND LINE PROCESSING
   **********************************************/

  const int num_options = 16;
  cmdoption const motif_scan_options[] = {
    { "max-seq-length", REQUIRED_VALUE },
    { "motif", REQUIRED_VALUE },
    { "motif-pseudo", REQUIRED_VALUE },
    { "rma", NO_VALUE },
    { "pvalues", NO_VALUE },
    { "sdbg", REQUIRED_VALUE },
    { "norc", NO_VALUE },
    { "cs", NO_VALUE },
    { "o-format", REQUIRED_VALUE },
    { "o", REQUIRED_VALUE },
    { "oc", REQUIRED_VALUE },
    { "scoring", REQUIRED_VALUE },
    { "verbosity", REQUIRED_VALUE },
    { "gcbins", REQUIRED_VALUE },
    { "last", REQUIRED_VALUE },
    { "version", NO_VALUE }
  };

  int option_index = 0;

  // Define the usage message.
  char usage[] = "USAGE: ama [options] <motif file> <sequence file> [<background file>]\n"
    "\n"
    "   Options:\n"
    "     --sdbg <order>\t\t\tUse Markov background model of\n"
    "       \t\t\t\t\torder <order> derived from the sequence\n"
    "       \t\t\t\t\tto compute its likelihood ratios.\n"
    "       \t\t\t\t\tOverrides --pvalues, --gcbins and --rma;\n"
    "       \t\t\t\t\t<background file> is required unless\n"
    "       \t\t\t\t\t--sdbg is given.\n"
    "     --motif <id>\t\t\tUse only the motif identified by <id>.\n"
    "       \t\t\t\t\tThis option may be repeated.\n"
    "     --motif-pseudo <float>\t\tThe value <float> times the background\n"
    "       \t\t\t\t\tfrequency is added to the count of each\n"
    "       \t\t\t\t\tletter when creating the likelihood \n"
    "       \t\t\t\t\tratio matrix (default: %g).\n"
    "     --norc\t\t\t\tDisables the scanning of the reverse\n"
    "       \t\t\t\t\tcomplement strand.\n"
    "     --scoring [avg-odds|max-odds]\tIndicates whether the average or \n"
    "       \t\t\t\t\tthe maximum odds should be calculated\n"
    "       \t\t\t\t\t(default: avg-odds)\n"
    "     --rma\t\t\t\tScale motif scores to the range 0-1.\n"
    "       \t\t\t\t\t(Relative Motif Affinity).\n"
    "       \t\t\t\t\tMotif scores are scaled by the maximum\n"
    "       \t\t\t\t\tscore achievable by that PWM. (default:\n"
    "       \t\t\t\t\tmotif scores are not normalized)\n"
    "     --pvalues\t\t\t\tPrint p-value of avg-odds score in cisml\n"
    "       \t\t\t\t\toutput. Ignored for max-odds scoring.\n"
    "       \t\t\t\t\t(default: p-values are not printed)\n"
    "     --gcbins <bins>\t\t\tCompensate p-values for GC content of\n"
    "       \t\t\t\t\teach sequence using given number of \n"
    "       \t\t\t\t\tGC range bins. Recommended bins: 41.\n"
    "       \t\t\t\t\t(default: p-values are based on\n"
    "       \t\t\t\t\tfrequencies in background file)\n"
    "     --cs\t\t\t\tEnable combining sequences with same\n"
    "       \t\t\t\t\tidentifier by taking the average score\n"
    "       \t\t\t\t\tand the Sidac corrected p-value.\n"
    "     --o-format [gff|cisml]\t\tOutput file format (default: cisml)\n"
    "       \t\t\t\t\tignored if --o or --oc option used\n"
    "     --o <directory>\t\t\tOutput all available formats to\n"
    "       \t\t\t\t\t<directory>; give up if <directory>\n"
    "       \t\t\t\t\texists\n"
    "     --oc <directory>\t\t\tOutput all available formats to\n"
    "       \t\t\t\t\t<directory>; if <directory> exists\n"
    "       \t\t\t\t\toverwrite contents\n"
    "     --verbosity [1|2|3|4]\t\tControls amount of screen output\n"
    "       \t\t\t\t\t(default: %d)\n"
    "     --max-seq-length <int>\t\tSet the maximum length allowed for \n"
    "       \t\t\t\t\tinput sequences. (default: %d)\n"
    "     --last <int>\t\t\tUse only scores of (up to) last <n>\n"
    "       \t\t\t\t\tsequence positions to compute AMA.\n"
    "     --version   \t\t\tPrint version and exit.\n"
    "\n";

  // Parse the command line.
  if (simple_setopt(argc, argv, num_options, motif_scan_options) != NO_ERROR) {
    die("Error processing command line options: option name too long.\n");
  }
    
    BOOLEAN_T setoutputformat = FALSE;
    BOOLEAN_T setoutputdirectory = FALSE;

  while (TRUE) {
    int c = 0;
    char* option_name = NULL;
    char* option_value = NULL;
    const char * message = NULL;

    // Read the next option, and break if we're done.
    c = simple_getopt(&option_name, &option_value, &option_index);
    if (c == 0) {
      break;
    } else if (c < 0) {
      (void) simple_getopterror(&message);
      die("Error processing command line options (%s).\n", message);
    } else if (strcmp(option_name, "max-seq-length") == 0) {
	max_seq_length = atoi(option_value);
    } else if (strcmp(option_name, "norc") == 0) {
	scan_both_strands = FALSE;
    } else if (strcmp(option_name, "cs") == 0) {
		combine_duplicates = TRUE;
    } else if (strcmp(option_name, "motif") == 0) {
	if (selected_motifs == NULL) {
	  selected_motifs = new_string_list();
	}
	add_string(option_value, selected_motifs);
    } else if (strcmp(option_name, "motif-pseudo") == 0) {
	pseudocount = atof(option_value);
    } else if (strcmp(option_name, "o-format") == 0) {
        if (setoutputdirectory) {
            if (verbosity >= NORMAL_VERBOSE)
                fprintf(stderr, "output directory specified, ignoring --o-format\n");
        } else {
            setoutputformat = TRUE;
            if (strcmp(option_value, "gff") == 0)
                output_format = GFF_FORMAT;
            else if (strcmp(option_value, "cisml") == 0)
                output_format = CISML_FORMAT;
            else {
                if (verbosity >= NORMAL_VERBOSE)
                  fprintf(stderr, "Output format not known. Using standard instead (cisML).\n");
                  output_format = CISML_FORMAT;
            }
        }
    } else if (strcmp(option_name, "o") == 0 || strcmp(option_name, "oc") == 0) {
        setoutputdirectory = TRUE;
        if (setoutputformat) {
            if (verbosity >= NORMAL_VERBOSE)
                fprintf(stderr, "output directory specified, ignoring --o-format\n");
        }
        clobber = strcmp(option_name, "oc") == 0;
        out_dir = (char*) malloc (sizeof(char)*(strlen(option_value)+1));
        strcpy(out_dir, option_value);
        output_format = DIRECTORY_FORMAT;
    } else if (strcmp(option_name, "verbosity") == 0) {
	verbosity = atoi(option_value);
    } else if (strcmp(option_name, "scoring") == 0) {
      if (strcmp(option_value, "max-odds") == 0)
	scoring = MAX_ODDS;
      else if (strcmp(option_value, "avg-odds") == 0)
	scoring = AVG_ODDS;
      else if (strcmp(option_value, "sum-odds") == 0)
	scoring = SUM_ODDS;
	  else
	die("Specified scoring scheme not known.\n", message);
    } else if (strcmp(option_name, "pvalues") == 0) {
      pvalues = TRUE;
    } else if (strcmp(option_name, "rma") == 0) {
      normalize_scores = TRUE;
      fprintf(stderr, "Normalizing motif scores using RMA method.\n");
    } else if (strcmp(option_name, "gcbins") == 0) {
      num_gc_bins = atoi(option_value);
      pvalues = TRUE;
      if (num_gc_bins <= 1) die("Number of bins in --gcbins must be greater than 1.\n", message);
    } else if (strcmp(option_name, "sdbg") == 0) {
      sdbg_order = atoi(option_value);			// >=0 means use sequence bkg
    }
    else if (strcmp(option_name, "last") == 0) {
      int i = 0;
      if (option_value[0] == '-') ++i;
      while (option_value[i] != '\0') {
        if (!isdigit(option_value[i])) {
          die("Specified parameter 'last' contains non-numeric characters.\n");
        }
        ++i;
      }
      last = atoi(option_value);
      if (errno != 0) {
        die("Specified parameter 'last' could not be parsed as a number as:\n%s\n",strerror(errno));
      }
      if (last < 0) {
        die("Specified parameter 'last' had negative value (%d) when only postive or zero values are allowed \n", last);
      }
    }
    else if (strcmp(option_name, "version") == 0) {
      fprintf(stdout, VERSION "\n");
      exit(EXIT_SUCCESS);
    }
  }

  // --sdbg overrides --pvalues and --gcbins and --rma
  int req_args = 3;
  if (sdbg_order >= 0) {
    pvalues = FALSE;
    normalize_scores = FALSE;
    num_gc_bins = 1;
    req_args = 2;
  }

  // Check all required arguments given
  if (sdbg_order >= 0 && argc > option_index + req_args) {
    die("<background file> cannot be given together with --sdbg.\n");
  } else if (argc != option_index + req_args) {
    fprintf(stderr, usage, pseudocount, verbosity, max_seq_length);
    exit(EXIT_FAILURE);
  }

  // Get required arguments. 
  char* motif_filename = argv[option_index];
  option_index++;
  char* fasta_filename = argv[option_index];
  option_index++;
  char* bg_filename;
  if (req_args == 3) {			// required unless --sdbg given
    bg_filename = argv[option_index];
    option_index++;
  } else {
    bg_filename = "--uniform--";	// So PSSMs will use uniform background;
					// we can multiply them out later.
  }

  // measure time
  c0 = clock();

  // Set up hash tables for computing reverse complement if doing --sdbg
  if (sdbg_order >= 0) setup_hash_alph(DNAB);

  // Create cisml data structure for recording results
  cisml = allocate_cisml(program_name, motif_filename, fasta_filename);
  set_cisml_background_file(cisml, bg_filename);

  /**********************************************
   * Read the motifs and background model.
   **********************************************/
  int num_motifs = 0;
  MREAD_T *mread;
  ARRAYLST_T *motifs;
  PSSM_PAIR_T** pssm_pairs;	// note pssm_pairs is an array of pointers

  //this reads any meme file, xml, txt and html
  mread = mread_create(motif_filename, OPEN_MFILE);
  mread_set_bg_source(mread, bg_filename);
  mread_set_pseudocount(mread, pseudocount);

  motifs = mread_load(mread, NULL);
  alph = mread_get_alphabet(mread);
  pos_bg_freqs = mread_get_background(mread);

  mread_destroy(mread);

  num_motifs = arraylst_size(motifs);

  // allocate memory for PSSM pairs
  pssm_pairs = (PSSM_PAIR_T**)mm_malloc(sizeof(PSSM_PAIR_T*) * num_motifs);

  if (verbosity >= NORMAL_VERBOSE) 
    fprintf(stderr, "Number of motifs in file %d.\n", num_motifs);

  // make a CISML pattern to hold scores for each motif
  PATTERN_T** patterns = NULL;
  Resize(patterns, num_motifs, PATTERN_T*);
  int motif_index;
  for (motif_index = 0; motif_index < num_motifs; motif_index++) {
    MOTIF_T* motif = (MOTIF_T*)arraylst_get(motif_index, motifs);
    patterns[motif_index] = allocate_pattern(get_motif_id(motif), "");
    add_cisml_pattern(cisml, patterns[motif_index]);
  }

  // make reverse complement motifs and background frequencies.
  if (scan_both_strands == TRUE) {
    add_reverse_complements(motifs);
    assert(arraylst_size(motifs) == (2 * num_motifs));
    rev_bg_freqs = allocate_array(get_array_length(pos_bg_freqs));
    complement_dna_freqs(pos_bg_freqs, rev_bg_freqs);
  }

  /**************************************************************
   * Convert motif matrices into log-odds matrices.
   * Scale them.
   * Compute the lookup tables for the PDF of scaled log-odds scores.
   **************************************************************/
  int ns = scan_both_strands ? 2 : 1;	// number of strands
  for (motif_index = 0; motif_index < num_motifs; motif_index++) {
    MOTIF_T *motif, *motif_rc;
    motif = (MOTIF_T*)arraylst_get(motif_index*ns, motifs);
    if (scan_both_strands)
      motif_rc = (MOTIF_T*)arraylst_get(motif_index*ns + 1, motifs);
    else
      motif_rc = NULL;
    /*
     *  Note: If scanning both strands, we complement the motif frequencies
     *  but not the background frequencies so the motif looks the same.
     *  However, the given frequencies are used in computing the p-values
     *  since they represent the frequencies on the negative strands.
     *  (If we instead were to complement the input sequence, keeping the
     *  the motif fixed, we would need to use the complemented frequencies
     *  in computing the p-values.  Is that any clearer?)
    */
    double range = 300;		// 100 is not very good; 1000 is great but too slow
    PSSM_T* pos_pssm =
      build_motif_pssm(
        motif, 
        pos_bg_freqs, 
        pos_bg_freqs, 
        NULL, // Priors not used
        0.0L, // alpha not used
        range, 
        num_gc_bins, 
        TRUE
      );
    PSSM_T* neg_pssm = (scan_both_strands ?
      build_motif_pssm(
        motif_rc, 
        rev_bg_freqs, 
        pos_bg_freqs, 
        NULL, // Priors not used
        0.0L, // alpha not used
        range, 
        num_gc_bins, 
        TRUE
      )
      : NULL
    );
    pssm_pairs[motif_index] = create_pssm_pair(pos_pssm, neg_pssm);
  }

  // Open the FASTA file for reading.
  FILE* fasta_file = NULL;
  if (open_file(fasta_filename, "r", FALSE, "FASTA", "sequences", &fasta_file) == 0) {
    die("Couldn't open the file %s.\n", fasta_filename);
  }
  if (verbosity >= NORMAL_VERBOSE) {
    if (last == 0) {
      fprintf(stderr, "Using entire sequence\n");
    } else {
      fprintf(stderr, "Limiting sequence to last %d positions.\n", last);
    }
  }

  /**************************************************************
   * Read in all sequences and score with all motifs
   **************************************************************/
  int seq_loading_num = 0;  // keeps track on the number of sequences read in total
  int seq_counter = 0;		// holds the index to the seq in the pattern
  int unique_seqs = 0;      // keeps track on the number of unique sequences
  BOOLEAN_T need_postprocessing = FALSE;
  SEQ_T* sequence = NULL;
  RBTREE_T* seq_ids = rbtree_create(rbtree_strcasecmp,NULL,free,rbtree_intcpy,free);
  RBNODE_T* seq_node;
  BOOLEAN_T created;
  while (read_one_fasta(alph, fasta_file, max_seq_length, &sequence)) {
    ++seq_loading_num;
	created = FALSE;
    char* seq_name = get_seq_name(sequence);
    int seq_len = get_seq_length(sequence);
    int scan_len;
    if (last != 0) {
      scan_len = last;
    } else {
      scan_len = seq_len;
    }
	  
	// red-black trees are only required if duplicates should be combined
	if (combine_duplicates){
		//lookup seq id and create new entry if required, return sequence index
		char *tmp_id = mm_malloc(strlen(seq_name)+1); // required copy for rb-tree
		strncpy(tmp_id,seq_name,strlen(seq_name)+1);
		seq_node = rbtree_lookup(seq_ids, tmp_id, TRUE, &created);
		if (created) {// assign it a loading number
			rbtree_set(seq_ids, seq_node, &unique_seqs);
			seq_counter = unique_seqs;
			++unique_seqs;
		} else {
			seq_counter = *((int*)rbnode_get(seq_node));
		}
	}
	  
    //
    // Set up sequence-dependent background model and compute
    // log cumulative probability of sequence.
    //
    double *logcumback = NULL;                    // array of log cumulative probs.
    if (sdbg_order >= 0) {
      Resize(logcumback, seq_len+1, double);
      char* raw_seq = get_raw_sequence(sequence);
      BOOLEAN rc = FALSE;
      double *a_cp = get_markov_from_sequence(raw_seq, alph_string(alph), rc, sdbg_order, 0);
      log_cum_back(raw_seq, a_cp, sdbg_order, logcumback);
      myfree(a_cp);
    }

    // Get the GC content of the sequence if binning p-values by GC
    // and store it in the sequence object.
    if (num_gc_bins > 1) {
      ARRAY_T *freqs = get_sequence_freqs(sequence, alph);
      set_total_gc_sequence(sequence,
        get_array_item(1,freqs) + get_array_item(2,freqs));	// f(C) + f(G)
      free_array(freqs);			// clean up
    } else {
      set_total_gc_sequence(sequence, -1);	// flag ignore
    }

    /**************************************************************
     * Process all motifs.
     **************************************************************/
    int ns = scan_both_strands ? 2 : 1;
    for (motif_index = 0; motif_index < num_motifs; motif_index++) {
      PATTERN_T *pattern = patterns[motif_index];
      MOTIF_T* motif = (MOTIF_T*)arraylst_get(ns*motif_index, motifs);
      char* motif_id = (scan_both_strands ? get_motif_st_id(motif) : get_motif_id(motif));
      if (verbosity >= HIGH_VERBOSE) {
        fprintf(stderr, "Using motif %s of width %d.\n", motif_id, get_motif_length(motif));
      }
      if ((selected_motifs == NULL) || (have_string(get_motif_id(motif), selected_motifs) == TRUE)) {
        if (verbosity >= HIGHER_VERBOSE) {
          fprintf(stderr, "Scanning %s sequence with length %d "
              "abbreviated to %d with motif %s with length %d.\n",
              seq_name, seq_len, scan_len, motif_id, get_motif_length(motif));
        }
		SCANNED_SEQUENCE_T* scanned_seq = NULL;

		
		if (!combine_duplicates || get_pattern_num_scanned_sequences(pattern) <= seq_counter){
			// Create a scanned_sequence record and save it in the pattern.
			scanned_seq = allocate_scanned_sequence(seq_name, seq_name, pattern);
			set_scanned_sequence_length(scanned_seq, scan_len);
		} else {
			// get existing sequence record
			scanned_seq = get_pattern_scanned_sequences(pattern)[seq_counter];
			set_scanned_sequence_length(scanned_seq, max(scan_len, get_scanned_sequence_length(scanned_seq)));
		}
		
		// check if scanned component of sequence has sufficient length for the motif
		if (scan_len < get_motif_length(motif)) {
			// set score to zero and p-value to 1 if not set yet
			if(!has_scanned_sequence_score(scanned_seq)){
				set_scanned_sequence_score(scanned_seq, 0.0);
			}
			if(pvalues && !has_scanned_sequence_pvalue(scanned_seq)){
				set_scanned_sequence_pvalue(scanned_seq, 1.0);
			} 
			add_scanned_sequence_scanned_position(scanned_seq); 
			if (get_scanned_sequence_num_scanned_positions(scanned_seq) > 0L) need_postprocessing = TRUE;
			if (verbosity >= HIGH_VERBOSE) fprintf(stderr, "%s too short for motif %s. Score set to 0!\n", seq_name, motif_id);
		} else {  
			// scan the sequence using average/maximum motif affinity
			ama_sequence_scan(alph, sequence, logcumback, pssm_pairs[motif_index], scoring, 
							  pvalues, last, scanned_seq, &need_postprocessing);
		}

      } else {
        if (verbosity >= HIGH_VERBOSE) fprintf(stderr, "Skipping motif %s.\n", motif_id);
      }
    } // All motifs parsed

    free_seq(sequence);
    if (sdbg_order >= 0) myfree(logcumback);

  } // read sequences
Ejemplo n.º 22
0
/*************************************************************************
 * Calculate the odds score for each motif-sized window at each
 * site in the sequence using the given nucleotide frequencies.
 *
 * This function is a lightweight version based on the one contained in
 * motiph-scoring. Several calculations that are unnecessary for gomo
 * have been removed in order to speed up the process
 *************************************************************************/
static double score_sequence(
    SEQ_T *seq,         // sequence to scan (IN)
    MOTIF_T *motif,     // motif already converted to odds values (IN)
    PSSM_T *m_pssm,     // motif pssm (IN)
    MATRIX_T *m_odds,   // motif odds (IN)
    int method,         // method used for scoring (IN)
    double threshold,   // Threshold to use in TOTAL_HITS mode with a PWM
    ARRAY_T *bg_freqs   //background model
    )
{

  assert(seq != NULL);
  assert(motif != NULL);
  assert((method == TOTAL_HITS && m_pssm) || (method != TOTAL_HITS && m_odds));

  char* raw_seq = get_raw_sequence(seq);
  int seq_length = get_seq_length(seq);

  // Get the pv lookup table
  ARRAY_T* pv_lookup = NULL;
  if (NULL != m_pssm) {
    pv_lookup = m_pssm->pv;
    assert(get_array_length(pv_lookup) > 0);
  }

  // Prepare storage for the string representing the portion
  // of the reference sequence within the window.
  char* window_seq = (char *) mm_malloc(sizeof(char) * (get_motif_length(motif) + 1));
  window_seq[get_motif_length(motif)] = '\0';

  int max_index = seq_length - get_motif_length(motif);
  if (max_index < 0) max_index = 0;
  const int asize = alph_size(get_motif_alph(motif), ALPH_SIZE);
  double* odds =  (double*) mm_malloc(sizeof(double)*max_index);
  double* scaled_log_odds =  (double*) mm_malloc(sizeof(double)*max_index);

  // For each site in the sequence
  int seq_index;
  for (seq_index = 0; seq_index < max_index; seq_index++) {
    double odd = 1.0;
    scaled_log_odds[seq_index] = 0;

    // For each site in the motif window
    int motif_position;
    for (motif_position = 0; motif_position < get_motif_length(motif); motif_position++) {
      char c = raw_seq[seq_index + motif_position];
      window_seq[motif_position] = c;

      // Check for gaps at this site
      if(c == '-' || c == '.') {
        break;
      }

      // Check for ambiguity codes at this site
      //TODO: This next call is very expensive - it takes up approx. 10% of a
      //      programme's running time. It should be fixed up somehow.
      int aindex = alph_index(get_motif_alph(motif), c);
      if (aindex > asize) {
        break;
      }
      if (method == TOTAL_HITS) {
        //If we're in this mode, then we're using LOG ODDS.
        //scaled_log_odds[seq_index] += get_matrix_cell(motif_position, aindex, get_motif_freqs(motif));
        scaled_log_odds[seq_index] += get_matrix_cell(motif_position, aindex, m_pssm->matrix);
      } else {
        odd *= get_matrix_cell(motif_position, aindex, m_odds);
      }
    }
    odds[seq_index] = odd;
  }

  // return odds as requested (MAX or AVG scoring)
  double requested_odds = 0.0;
  if (method == AVG_ODDS){
    for (seq_index = 0; seq_index < max_index; seq_index++) {
      requested_odds += odds[seq_index];
    }
    requested_odds /= max_index + 1;		// Divide by 0 if max_index==0
  } else if (method == MAX_ODDS){
    for (seq_index = 0; seq_index < max_index; seq_index++) {
      if (odds[seq_index] > requested_odds){
        requested_odds = odds[seq_index];
      }
    }
  } else if (method == SUM_ODDS) {
    for (seq_index = 0; seq_index < max_index; seq_index++) {
      requested_odds += odds[seq_index];
    }
  } else if (method == TOTAL_HITS) {
    for (seq_index = 0; seq_index < max_index; seq_index++) {

      if (scaled_log_odds[seq_index] >= (double)get_array_length(pv_lookup)) {
        scaled_log_odds[seq_index] = (double)(get_array_length(pv_lookup) - 1);
      } 
      double pvalue = get_array_item((int) scaled_log_odds[seq_index], pv_lookup);

      //Figure out how to calculate the p-value of a hit
      //fprintf(stderr, "m: %s pv_l len: %i scaled_log_odds: %g seq index: %i pvalue: %g\n", 
      //    get_motif_id(motif), get_array_length(pv_lookup), scaled_log_odds[seq_index], seq_index, pvalue);

      if (pvalue < threshold) {
        requested_odds++; //Add another hit.
      }

      if (verbosity > HIGHER_VERBOSE) {
        fprintf(stderr, "Window Data: %s\t%s\t%i\t%g\t%g\t%g\n",
            get_seq_name(seq), get_motif_id(motif), seq_index, scaled_log_odds[seq_index], pvalue, threshold);
      }
    }
  }

  myfree(odds);
  myfree(scaled_log_odds);
  myfree(window_seq);
  return requested_odds;
}
Ejemplo n.º 23
0
/**************************************************************************
 * Callback invoked when matching an opening pattern tag in the CISML file 
 * for the primary motif. Checks to see if the pattern refers to the primary
 * motif and sets the flag in_motif to indicate this.
 **************************************************************************/
void motif_primary(void *ctx, char *accession, char *name, char *db, char *lsId, double *pvalue, double *score) {
  PRIMARY_LOADER_T *loader = (PRIMARY_LOADER_T*)ctx;
  loader->in_motif = (strcmp(get_motif_id(loader->motif), accession) == 0);
}
Ejemplo n.º 24
0
/*************************************************************************
 * Entry point for pmp_bf
 *************************************************************************/
int main(int argc, char *argv[]) {

  char* bg_filename = NULL;
  char* motif_name = "motif"; // Use this motif name in the output.
  STRING_LIST_T* selected_motifs = NULL;
  double fg_rate = 1.0;
  double bg_rate = 1.0;
  double purine_pyrimidine = 1.0; // r
  double transition_transversion = 0.5; // R
  double pseudocount = 0.1;
  GAP_SUPPORT_T gap_support = SKIP_GAPS;
  MODEL_TYPE_T model_type = F81_MODEL;
  BOOLEAN_T use_halpern_bruno = FALSE;
  char* ustar_label = NULL;	// TLB; create uniform star tree
  int i;

  program_name = "pmp_bf";

  /**********************************************
   * COMMAND LINE PROCESSING
   **********************************************/

  // Define command line options. (FIXME: Repeated code)
  // FIXME: Note that if you add or remove options you
  // must change n_options.
  int n_options = 12;
  cmdoption const pmp_options[] = {
    {"hb", NO_VALUE},
    {"ustar", REQUIRED_VALUE},
    {"model", REQUIRED_VALUE},
    {"pur-pyr", REQUIRED_VALUE},
    {"transition-transversion", REQUIRED_VALUE},
    {"bg", REQUIRED_VALUE},
    {"fg", REQUIRED_VALUE},
    {"motif", REQUIRED_VALUE},
    {"motif-name", REQUIRED_VALUE},
    {"bgfile", REQUIRED_VALUE},
    {"pseudocount", REQUIRED_VALUE},
    {"verbosity", REQUIRED_VALUE}
  };

  int option_index = 0;

  // Define the usage message.
  char      usage[1000] = "";
  strcat(usage, "USAGE: pmp [options] <tree file> <MEME file>\n");
  strcat(usage, "\n");
  strcat(usage, "   Options:\n");

  // Evolutionary model parameters.
  strcat(usage, "     --hb\n");
  strcat(usage, "     --model single|average|jc|k2|f81|f84|hky|tn");
  strcat(usage, " (default=f81)\n");
  strcat(usage, "     --pur-pyr <float> (default=1.0)\n");
  strcat(usage, "     --transition-transversion <float> (default=0.5)\n");
  strcat(usage, "     --bg <float> (default=1.0)\n");
  strcat(usage, "     --fg <float> (default=1.0)\n");

  // Motif parameters.
  strcat(usage, "     --motif <id> (default=all)\n");
  strcat(usage, "     --motif-name <string> (default from motif file)\n");

  // Miscellaneous parameters
  strcat(usage, "     --bgfile <background> (default from motif file)\n");
  strcat(usage, "     --pseudocount <float> (default=0.1)\n");
  strcat(usage, "     --ustar <label>\n");	// TLB; create uniform star tree
  strcat(usage, "     --verbosity [1|2|3|4] (default 2)\n");
  strcat(usage, "\n    Prints the FP and FN rate at each of 10000 score values.\n");
  strcat(usage, "\n    Output format: [<motif_id> score <score> FPR <fpr> TPR <tpr>]+\n");

  // Parse the command line.
  if (simple_setopt(argc, argv, n_options, pmp_options) != NO_ERROR) {
    die("Error processing command line options: option name too long.\n");
  }

  while (TRUE) { 
    int c = 0;
    char* option_name = NULL;
    char* option_value = NULL;
    const char * message = NULL;

    // Read the next option, and break if we're done.
    c = simple_getopt(&option_name, &option_value, &option_index);
    if (c == 0) {
      break;
    } else if (c < 0) {
      (void) simple_getopterror(&message);
      die("Error processing command line options (%s)\n", message);
    }
    
    if (strcmp(option_name, "model") == 0) {
      if (strcmp(option_value, "jc") == 0) {
        model_type = JC_MODEL;
      } else if (strcmp(option_value, "k2") == 0) {
        model_type = K2_MODEL;
      } else if (strcmp(option_value, "f81") == 0) {
        model_type = F81_MODEL;
      } else if (strcmp(option_value, "f84") == 0) {
        model_type = F84_MODEL;
      } else if (strcmp(option_value, "hky") == 0) {
        model_type = HKY_MODEL;
      } else if (strcmp(option_value, "tn") == 0) {
        model_type = TAMURA_NEI_MODEL;
      } else if (strcmp(option_value, "single") == 0) {
        model_type = SINGLE_MODEL;
      } else if (strcmp(option_value, "average") == 0) {
        model_type = AVERAGE_MODEL;
      } else {
        die("Unknown model: %s\n", option_value);
      }
    } else if (strcmp(option_name, "hb") == 0){
        use_halpern_bruno = TRUE;
    } else if (strcmp(option_name, "ustar") == 0){	// TLB; create uniform star tree
        ustar_label = option_value;
    } else if (strcmp(option_name, "pur-pyr") == 0){
        purine_pyrimidine = atof(option_value);
    } else if (strcmp(option_name, "transition-transversion") == 0){
        transition_transversion = atof(option_value);
    } else if (strcmp(option_name, "bg") == 0){
      bg_rate = atof(option_value);
    } else if (strcmp(option_name, "fg") == 0){
      fg_rate = atof(option_value);
    } else if (strcmp(option_name, "motif") == 0){
        if (selected_motifs == NULL) {
          selected_motifs = new_string_list();
        }
       add_string(option_value, selected_motifs);
    } else if (strcmp(option_name, "motif-name") == 0){
        motif_name = option_value;
    } else if (strcmp(option_name, "bgfile") == 0){
      bg_filename = option_value;
    } else if (strcmp(option_name, "pseudocount") == 0){
        pseudocount = atof(option_value);
    } else if (strcmp(option_name, "verbosity") == 0){
        verbosity = atoi(option_value);
    }
  }

  // Must have tree and motif file names
  if (argc != option_index + 2) {
    fprintf(stderr, "%s", usage);
    exit(EXIT_FAILURE);
  } 

  /**********************************************
   * Read the phylogenetic tree.
   **********************************************/
  char* tree_filename = NULL;
  TREE_T* tree = NULL;
  tree_filename = argv[option_index];
  option_index++;
  tree = read_tree_from_file(tree_filename);

  // get the species names
  STRING_LIST_T* alignment_species = make_leaf_list(tree);
  char *root_label = get_label(tree);	// in case target in center
  if (strlen(root_label)>0) add_string(root_label, alignment_species);
  //write_string_list(" ", alignment_species, stderr);

  // TLB; Convert the tree to a uniform star tree with
  // the target sequence at its center.
  if (ustar_label != NULL) {
    tree = convert_to_uniform_star_tree(tree, ustar_label);
    if (tree == NULL) 
      die("Tree or alignment missing target %s\n", ustar_label);
    if (verbosity >= NORMAL_VERBOSE) {
      fprintf(stderr, 
	"Target %s placed at center of uniform (d=%.3f) star tree:\n", 
          ustar_label, get_total_length(tree) / get_num_children(tree) 
      );
      write_tree(tree, stderr);
    }
  }

  /**********************************************
   * Read the motifs.
   **********************************************/
  char* meme_filename = argv[option_index];
  option_index++;
  int num_motifs = 0; 

  MREAD_T *mread;
  ALPH_T alph;
  ARRAYLST_T *motifs;
  ARRAY_T *bg_freqs;

  mread = mread_create(meme_filename, OPEN_MFILE);
  mread_set_bg_source(mread, bg_filename);
  mread_set_pseudocount(mread, pseudocount);
  // read motifs
  motifs = mread_load(mread, NULL);
  alph = mread_get_alphabet(mread);
  bg_freqs = mread_get_background(mread);
  // check
  if (arraylst_size(motifs) == 0) die("No motifs in %s.", meme_filename);

  

  // TLB; need to resize bg_freqs array to ALPH_SIZE items
  // or copy array breaks in HB mode.  This throws away
  // the freqs for the ambiguous characters;
  int asize = alph_size(alph, ALPH_SIZE);
  resize_array(bg_freqs, asize);

  /**************************************************************
  * Compute probability distributions for each of the selected motifs.
  **************************************************************/
  int motif_index;
  for (motif_index = 0; motif_index < arraylst_size(motifs); motif_index++) {

    MOTIF_T* motif = (MOTIF_T*)arraylst_get(motif_index, motifs);
    char* motif_id = get_motif_id(motif);
    char* bare_motif_id = motif_id;

    // We may have specified on the command line that
    // only certain motifs were to be used.
    if (selected_motifs != NULL) {
      if (*bare_motif_id == '+' || *bare_motif_id == '-') {
        // The selected  motif id won't included a strand indicator.
        bare_motif_id++;
      }
      if (have_string(bare_motif_id, selected_motifs) == FALSE) {
        continue;
      }
    }

    if (verbosity >= NORMAL_VERBOSE) {
      fprintf(
        stderr, 
        "Using motif %s of width %d.\n",
        motif_id, get_motif_length(motif)
      );
    }

    // Build an array of evolutionary models for each position in the motif.
    EVOMODEL_T** models = make_motif_models(
      motif, 
      bg_freqs,
      model_type,
      fg_rate, 
      bg_rate, 
      purine_pyrimidine, 
      transition_transversion, 
      use_halpern_bruno
    );

    // Get the frequencies under the background model (row 0) 
    // and position-dependent scores (rows 1..w)
    // for each possible alignment column.
    MATRIX_T* pssm_matrix = build_alignment_pssm_matrix(
      alph,
      alignment_species,
      get_motif_length(motif) + 1, 
      models, 
      tree, 
      gap_support
    );
    ARRAY_T* alignment_col_freqs = allocate_array(get_num_cols(pssm_matrix)); 
    copy_array(get_matrix_row(0, pssm_matrix), alignment_col_freqs);
    remove_matrix_row(0, pssm_matrix);		// throw away first row
    //print_col_frequencies(alph, alignment_col_freqs);

    //
    // Get the position-dependent null model alignment column frequencies
    //
    int w = get_motif_length(motif);
    int ncols = get_num_cols(pssm_matrix); 
    MATRIX_T* pos_dep_bkg = allocate_matrix(w, ncols);
    for (i=0; i<w; i++) {
      // get the evo model corresponding to this column of the motif
      // and store it as the first evolutionary model.
      myfree(models[0]);
      // Use motif PSFM for equilibrium freqs. for model.
      ARRAY_T* site_specific_freqs = allocate_array(asize);
      int j = 0;
      for(j = 0; j < asize; j++) {
	double value = get_matrix_cell(i, j, get_motif_freqs(motif));
	set_array_item(j, value, site_specific_freqs);
      }
      if (use_halpern_bruno == FALSE) {
	models[0] = make_model(
	  model_type,
	  fg_rate,
	  transition_transversion,
	  purine_pyrimidine,
	  site_specific_freqs,
          NULL
	);
      } else {
        models[0] = make_model(
	  model_type,
	  fg_rate,
	  transition_transversion,
	  purine_pyrimidine,
	  bg_freqs,
	  site_specific_freqs
	);
      }
      // get the alignment column frequencies using this model
      MATRIX_T* tmp_pssm_matrix = build_alignment_pssm_matrix(
        alph,
	alignment_species,
	2,				// only interested in freqs under bkg
	models, 
	tree, 
	gap_support
      );
      // assemble the position-dependent background alignment column freqs.
      set_matrix_row(i, get_matrix_row(0, tmp_pssm_matrix), pos_dep_bkg);
      // chuck the pssm (not his real name)
      free_matrix(tmp_pssm_matrix);
    }

    //
    // Compute and print the score distribution under the background model
    // and under the (position-dependent) motif model.
    //
    int range = 10000;	// 10^4 gives same result as 10^5, but 10^3 differs

    // under background model
    PSSM_T* pssm = build_matrix_pssm(alph, pssm_matrix, alignment_col_freqs, range);

    // under position-dependent background (motif) model
    PSSM_T* pssm_pos_dep = build_matrix_pssm(alph, pssm_matrix, alignment_col_freqs, range);
    get_pv_lookup_pos_dep(
      pssm_pos_dep, 
      pos_dep_bkg, 
      NULL // no priors used
    );

    // print FP and FN distributions
    int num_items = get_pssm_pv_length(pssm_pos_dep);
    for (i=0; i<num_items; i++) {
      double pvf = get_pssm_pv(i, pssm);
      double pvt = get_pssm_pv(i, pssm_pos_dep);
      double fpr = pvf;
      double fnr = 1 - pvt;
      if (fpr >= 0.99999 || fnr == 0) continue;
      printf("%s score %d FPR %.3g FNR %.3g\n", motif_id, i, fpr, fnr);
    }

    // free stuff
    free_pssm(pssm);
    free_pssm(pssm_pos_dep);
    if (models != NULL) {
      int model_index;
      int num_models = get_motif_length(motif) + 1;
      for (model_index = 0; model_index < num_models; model_index++) {
        free_model(models[model_index]);
      }
      myfree(models);
    }

  } // motif

  arraylst_destroy(destroy_motif, motifs);

  /**********************************************
   * Clean up.
   **********************************************/
  // TLB may have encountered a memory corruption bug here
  // CEG has not been able to reproduce it. valgrind says all is well.
  free_array(bg_freqs);
  free_tree(TRUE, tree);
  free_string_list(selected_motifs);

  return(0);
} // main
Ejemplo n.º 25
0
void generate_ceq_logos(char *meme_path, char *output_dir) {
  int i, dir_len, prefix_len, path_len;
  ARRAY_T *background;
  BOOLEAN_T has_reverse_strand;
  char *path, *alphabet;
  double logo_height, logo_width;
  ARRAYLST_T *motifs;
  MOTIF_T *motif;

  motifs = arraylst_create();

  logo_height = LOGOHEIGHT;
  //make the path
  dir_len = strlen(output_dir);
  prefix_len = strlen(LOGO_PREFIX);
  path_len = dir_len + 1 + prefix_len + MAX_MOTIF_ID_LENGTH + 1;
  path = malloc(sizeof(char)*path_len);
  strncpy(path, output_dir, path_len);
  if (path[dir_len-1] != '/') {
    path[dir_len] = '/';
    path[++dir_len] = '\0';
  }
  strncpy(path+dir_len, LOGO_PREFIX, path_len - dir_len);

  // Read all motifs into an array.
  read_meme_file2(meme_path,
		 NULL, // bg file name
		 DEFAULT_PSEUDOCOUNTS,
     REQUIRE_PSPM,
		 motifs, 
		 NULL,//motif occurrences
		 &has_reverse_strand,
		 &background);

  // global alphabet is set by read_meme_file
  alphabet = get_alphabet(FALSE);

  if (create_output_directory(output_dir, TRUE, (verbosity >= NORMAL_VERBOSE))) {
    // Failed to create output directory.
    exit(1);
  }

  for(i = 0; i < arraylst_size(motifs); i++) {
    motif = (MOTIF_T*)arraylst_get(i, motifs);
    logo_width = get_motif_length(motif);
    if (logo_width > MAXLOGOWIDTH) logo_width = MAXLOGOWIDTH;
    copy_and_sanatise_name(path+(dir_len+prefix_len), get_motif_id(motif), path_len - (dir_len + prefix_len)); 
    CL_create2(
      motif, 			        // motif
      "", 			          // no title 
      NULL, 			        // no second motif
      "", 			          // no x-axis label
      FALSE, 			        // no error bars
      FALSE,			        // ssc
      logo_height,		    // logo height (cm)
      logo_width,		      // logo width (cm)
      alphabet, 	        // alphabet
      0, 			            // no offset to second motif
      path,			          // output file path
      "MEME (no SSC)"		  // program name
    );
  }
  free_motifs(motifs);
  free_array(background); // not used 
  free(path);
}