Esempio n. 1
0
/*
  setup_hash_alph

  Set up the letter-to-index (hash) and index-to-letter (unhash) tables
  for the given alphabet.

  The current alphabet slot is selected with setalph(): 0 if the alphabet
  equals DNAB, 1 if it equals PROTEINB, and 2 otherwise.  Every entry
  hash(0..MAXASCII-1) is first set to -1, then each letter of the alphabet
  (converted to uppercase) is mapped to its position in the string.

  If the alphabet equals DNA0 or PROTEIN0, 'X' is additionally mapped to
  index alength, and every letter of DNAB (resp. PROTEINB) that is not in
  DNA0 (resp. PROTEIN0) is hashed to that same index.

  Exits with status 1 if the alphabet is longer than MAXALPH.

  Returns the length of the alphabet string.
*/
extern int setup_hash_alph(
  char *alphabet			/* the alphabet to set up hashing for */
)
{
  int i, alength;
  char c;

  /* 
   get length of alphabet 
  */
  alength = (int)strlen(alphabet);
  if (alength > MAXALPH) {
    fprintf(stderr, "Alphabet too long (> %d).\n", MAXALPH);
    exit(1);
  }

  /*
    determine if current alphabet DNAB or PROTEINB or other
  */
  if (!strcmp(alphabet, DNAB)) {
    setalph(0);					/* current alphabet DNAB */
  } else if (!strcmp(alphabet, PROTEINB)) {
    setalph(1);					/* current alphabet PROTEINB */
  } else {
    setalph(2);					/* current alphabet other */
  }

  /* 
    flag unused letters 
  */
  for (i=0; i<MAXASCII; i++) hash(i) = -1;

  /* 
    set up the hashing and unhashing indices
  */
  for (i = 0; (c = alphabet[i]); i++) {
    /*
      convert to uppercase; the argument to toupper() must be representable
      as unsigned char, so go through unsigned char rather than passing a
      possibly negative plain char
    */
    c = (char)toupper((unsigned char)c);
    hash(c) = i;
    unhash(i) = c;
  }

  /* 
    MEME: convert ambiguous characters to X
  */
  if (!strcmp(alphabet, DNA0)) { 		/* DNA */
    hash('X') = alength;
    unhash(alength) = 'X';
    /* letters of DNAB that are not in DNA0 share the index of 'X' */
    for (i=0; (c=DNAB[i]); i++) if (!strchr(DNA0, c)) 
      hash(c) = alength;
  } else if (!strcmp(alphabet, PROTEIN0)) { 	/* PROTEIN */
    hash('X') = alength;
    unhash(alength) = 'X';
    /* letters of PROTEINB that are not in PROTEIN0 share the index of 'X' */
    for (i=0; (c=PROTEINB[i]); i++) if (!strchr(PROTEIN0, c)) 
      hash(c) = alength;
  }

  return alength;
} /* setup_hash_alph */
Esempio n. 2
0
/*
  get_blast_alphabet

  Map a motif alphabet onto the complete BLAST alphabet of the same type.

  old_alph is converted to uppercase in place.  If it consists only of
  letters in DNAB the new alphabet is DNAB (setalph(0)); otherwise, if it
  consists only of letters in PROTEINB the new alphabet is PROTEINB
  (setalph(1)).  Any other alphabet is reported on stderr and the program
  exits with status 1.

  For each position i in the new alphabet, p[i] is set to a list of
  positions in old_alph terminated by -1:
    - if the new letter occurs in old_alph, the list holds its position;
    - otherwise the list holds the positions of the letters named in the
      substitution list for that letter (dna_subst[i] or prot_subst[i]).
  If a letter required by a substitution list is missing from old_alph,
  an error is printed and the program exits with status 1.

  The rows p[i] are sized with Resize(); presumably the caller owns them
  afterwards -- confirm against the callers.

  Returns the new alphabet (DNAB or PROTEINB).
*/
extern char *get_blast_alphabet(
  char *old_alph, 		/* old alphabet IN (but converted to uppercase) */
  int *p[MAXASCII]		/* permutation and substitution matrix OUT */
)
{
  int i, j;
  int old_alen = (int)strlen(old_alph);	/* length of old alphabet */
  char *new_alph;			/* new (complete) alphabet */
  int new_alen;				/* length of new alphabet */
  char *to;				/* list of substitutions */
  int to_len;				/* length of to */
  char **subst;				/* substitution list */

  /*
    convert alphabet to uppercase; <ctype.h> functions require an argument
    representable as unsigned char, so convert via unsigned char.  Only
    lowercase letters are written back, so an already-uppercase alphabet
    is never modified.
  */
  for (i=0; i<old_alen; i++) {
    unsigned char uc = (unsigned char)old_alph[i];
    if (islower(uc)) old_alph[i] = (char)toupper(uc);
  }

  /* 
    determine what type of alphabet we have 
  */
  if (strspn(old_alph, DNAB) == (size_t)old_alen) {
    setalph(0);			/* set alphabet hash function */
    new_alph = DNAB; 		/* BLAST DNA alphabet */
    subst = dna_subst;		/* list of substitutions */
  } else if (strspn(old_alph, PROTEINB) == (size_t)old_alen) {
    setalph(1);			/* set alphabet hash function */
    new_alph = PROTEINB;	/* BLAST PROTEIN alphabet */
    subst = prot_subst;		/* list of substitutions */
  } else {
    fprintf(stderr, "Don't recognize the motif alphabet: %s\n", old_alph);
    exit(1);
  }

  /* 
    create mapping from new position to old positions 
  */
  new_alen = (int)strlen(new_alph);	/* length of new alphabet */
  for (i=0; i < new_alen; i++) { 	/* init mapping matrix */
    p[i] = NULL; 			/* create row of substitutions */
    Resize(p[i], 2, int); 		/* make row length 2 */
    p[i][0] = -1;			/* flag end of substitution list */
  }
  for (i=0; i < new_alen; i++) {	/* new position */
    char c = new_alph[i];		/* new letter */
    char *o;				/* pointer to c in old alphabet */ 
    if ((o = strchr(old_alph, c)) != NULL) {	/* letter is in old alphabet */
      p[i][0] = (int)(o-old_alph);	/* put in list */
      p[i][1] = -1;			/* flag end of list */ 
    } else {				/* letter not in old alphabet */
      to = subst[i];			/* letters to substitute */
      to_len = (int)strlen(to);		/* number of substitutions */
      Resize(p[i], to_len+1, int);	/* make list of substitutions */
      for (j=0; j<to_len; j++) {
        if ((o = strchr(old_alph, to[j])) != NULL) {	/* letter found */
          p[i][j] = (int) (o - old_alph);	/* put in list */
        } else {			/* required letter missing */
          char *a = (subst==dna_subst ? "DNA" : "protein");
          fprintf(stderr, 
            "The motif alphabet %s appears to be a %s alphabet\n", old_alph, a);
          fprintf(stderr, "but is missing the required letter `%c'.\n", to[j]);
          exit(1);
        }
      }
      p[i][j] = -1;			/* flag end of list (j == to_len here) */ 
    }
  }					/* new position */

  return new_alph;
} /* get_blast_alphabet */
Esempio n. 3
0
/*************************************************************************
 * Entry point for centrimo
 *
 * Processes the command line, loads the sequences and the motif
 * databases, then for every motif scores all sequences, writes the
 * per-motif site counts to the sites file, computes the statistics for
 * the best central window and, when an HTML template was loaded and the
 * motif passes the p-value threshold, writes the motif to the embedded
 * JSON data and records it for the text output.
 *
 * Returns 0 on completion; failures are reported through die().
 *************************************************************************/
int main(int argc, char *argv[]) {
  CENTRIMO_OPTIONS_T options;
  SEQ_SITES_T seq_sites;
  SITE_COUNTS_T counts;
  int seqN, motifN, seqlen, db_i, motif_i, i;
  double log_pvalue_thresh;
  SEQ_T** sequences = NULL;
  ARRAY_T* bg_freqs = NULL;
  ARRAYLST_T *stats_list;
  MOTIF_DB_T **dbs, *db;
  MREAD_T *mread;
  MOTIF_STATS_T *stats;
  MOTIF_T *motif, *rev_motif;
  PSSM_T *pos_pssm, *rev_pssm;
  char *sites_path, *desc;
  FILE *sites_file;
  HTMLWR_T *html;
  JSONWR_T *json;

  // COMMAND LINE PROCESSING
  process_command_line(argc, argv, &options);

  // load the sequences
  read_sequences(options.alphabet, options.seq_source, &sequences, &seqN);
  // the length of the first sequence is used as the length for all of them
  seqlen = (seqN ? get_seq_length(sequences[0]) : 0);
  // calculate a sequence background (unless other background is given)
  if (!options.bg_source) {
    bg_freqs = calc_bg_from_fastas(options.alphabet, seqN, sequences);
  }

  // load the motifs
  motifN = 0;
  dbs = mm_malloc(sizeof(MOTIF_DB_T*) * arraylst_size(options.motif_sources));
  for (i = 0; i < arraylst_size(options.motif_sources); i++) {
    char* db_source;
    db_source = (char*)arraylst_get(i, options.motif_sources);
    dbs[i] = read_motifs(i, db_source, options.bg_source, &bg_freqs, 
        options.pseudocount, options.selected_motifs, options.alphabet);
    motifN += arraylst_size(dbs[i]->motifs);
  }
  // threshold on the log p-value: log(evalue_thresh / motifN)
  log_pvalue_thresh = log(options.evalue_thresh) - log(motifN);
  // Setup some things for double strand scanning
  if (options.scan_both_strands == TRUE) {
    // Set up hash tables for computing reverse complement
    setup_hash_alph(DNAB);
    setalph(0);
    // Correct background by averaging on freq. for both strands.
    average_freq_with_complement(options.alphabet, bg_freqs);
    normalize_subarray(0, alph_size(options.alphabet, ALPH_SIZE), 0.0, bg_freqs);
    calc_ambigs(options.alphabet, FALSE, bg_freqs);
  }
  // Create output directory
  if (create_output_directory(options.output_dirname, options.allow_clobber, 
        (verbosity >= NORMAL_VERBOSE))) {
    die("Couldn't create output directory %s.\n", options.output_dirname);
  }
  // open output files
  sites_path = make_path_to_file(options.output_dirname, SITES_FILENAME);
  sites_file = fopen(sites_path, "w");
  // fopen returns NULL on failure; stop here rather than writing to and
  // closing a NULL stream later on
  if (sites_file == NULL) {
    die("Couldn't open %s for writing.\n", sites_path);
  }
  free(sites_path);
  // setup html monolith writer
  json = NULL;
  if ((html = htmlwr_create(get_meme_etc_dir(), TEMPLATE_FILENAME))) {
    htmlwr_set_dest_name(html, options.output_dirname, HTML_FILENAME);
    htmlwr_replace(html, "centrimo_data.js", "data");
    json = htmlwr_output(html);
    if (json == NULL) die("Template does not contain data section.\n");
  } else {
    DEBUG_MSG(QUIET_VERBOSE, "Failed to open html template file.\n");
  }
  if (json) {
    // output some top level variables
    jsonwr_str_prop(json, "version", VERSION);
    jsonwr_str_prop(json, "revision", REVISION);
    jsonwr_str_prop(json, "release", ARCHIVE_DATE);
    jsonwr_str_array_prop(json, "cmd", argv, argc);
    jsonwr_property(json, "options");
    jsonwr_start_object_value(json);
    jsonwr_dbl_prop(json, "motif-pseudo", options.pseudocount);
    jsonwr_dbl_prop(json, "score", options.score_thresh);
    jsonwr_dbl_prop(json, "ethresh", options.evalue_thresh);
    jsonwr_lng_prop(json, "maxbin", options.max_window+1);
    jsonwr_bool_prop(json, "norc", !options.scan_both_strands);
    jsonwr_bool_prop(json, "noflip", options.no_flip);
    jsonwr_end_object_value(json);
    // output the description
    desc = prepare_description(&options);
    if (desc) {
      jsonwr_str_prop(json, "job_description", desc);
      free(desc);
    }
    // output size metrics
    jsonwr_lng_prop(json, "seqlen", seqlen);
    jsonwr_lng_prop(json, "tested", motifN);
    // output the fasta db
    jsonwr_property(json, "sequence_db");
    jsonwr_start_object_value(json);
    jsonwr_str_prop(json, "source", options.seq_source);
    jsonwr_lng_prop(json, "count", seqN);
    jsonwr_end_object_value(json);
    // output the motif dbs
    jsonwr_property(json, "motif_dbs");
    jsonwr_start_array_value(json);
    for (db_i = 0; db_i < arraylst_size(options.motif_sources); db_i++) {
      db = dbs[db_i];
      jsonwr_start_object_value(json);
      jsonwr_str_prop(json, "source", db->source);
      jsonwr_lng_prop(json, "count", arraylst_size(db->motifs));
      jsonwr_end_object_value(json);
    }
    jsonwr_end_array_value(json);
    // start the motif array (closed after the motif loop below)
    jsonwr_property(json, "motifs");
    jsonwr_start_array_value(json);
  }
  /**************************************************************
   * Tally the positions of the best sites for each of the 
   * selected motifs.
   **************************************************************/
  // prepare the sequence sites
  memset(&seq_sites, 0, sizeof(SEQ_SITES_T));
  // prepare the site counts
  // NOTE(review): when seqN is 0, seqlen is 0 and this becomes -1, which
  // makes the allocation size below wrap; confirm whether read_sequences
  // can return zero sequences.
  counts.allocated = ((2 * seqlen) - 1);
  counts.sites = mm_malloc(sizeof(double) * counts.allocated);
  // prepare the motifs stats list
  stats_list = arraylst_create();
  // prepare the other vars
  motif = NULL; pos_pssm = NULL; rev_motif = NULL; rev_pssm = NULL;
  for (db_i = 0; db_i < arraylst_size(options.motif_sources); db_i++) {
    db = dbs[db_i];
    for (motif_i = 0; motif_i < arraylst_size(db->motifs); motif_i++) {
      motif = (MOTIF_T *) arraylst_get(motif_i, db->motifs);
      DEBUG_FMT(NORMAL_VERBOSE, "Using motif %s of width %d.\n",  
          get_motif_id(motif), get_motif_length(motif));
      // reset the counts
      for (i = 0; i < counts.allocated; i++) counts.sites[i] = 0;
      counts.total_sites = 0;
      // create the pssm 
      pos_pssm = make_pssm(bg_freqs, motif);
      // If required, do the same for the reverse complement motif.
      if (options.scan_both_strands) {
        rev_motif = dup_rc_motif(motif);
        rev_pssm = make_pssm(bg_freqs, rev_motif);
      }
      // scan the sequences
      for (i = 0; i < seqN; i++)
        score_sequence(&options, sequences[i], pos_pssm, rev_pssm, 
            &seq_sites, &counts);
      // DEBUG check that the sum of the sites is close to the site count
      double sum_check = 0, sum_diff;
      for (i = 0; i < counts.allocated; i++) sum_check += counts.sites[i];
      sum_diff = counts.total_sites - sum_check;
      if (sum_diff < 0) sum_diff = -sum_diff;
      if (sum_diff > 0.1) {
        fprintf(stderr, "Warning: site counts don't sum to accurate value! "
            "%g != %ld\n", sum_check, counts.total_sites);
      }
      // output the plain text site counts
      output_site_counts(sites_file, seqlen, db, motif, &counts);
      // compute the best central window
      stats = compute_stats(options.max_window, seqlen, db, motif, &counts);
      // check if it passes the threshold; stats are only kept (and so only
      // reach the text output) when the JSON writer is available
      if (json && stats->log_adj_pvalue <= log_pvalue_thresh) {
        output_motif_json(json, stats, &counts);
        arraylst_add(stats, stats_list);
      } else {
        free(stats);
      }
      // Free memory associated with this motif.
      // NOTE(review): rev_pssm and rev_motif stay NULL when only one strand
      // is scanned; presumably free_pssm/destroy_motif accept NULL - confirm.
      free_pssm(pos_pssm);
      free_pssm(rev_pssm);
      destroy_motif(rev_motif);
    }
  }
  if (json) jsonwr_end_array_value(json);
  // finish writing sites
  fclose(sites_file);
  // finish writing html file
  if (html) {
    // a second data section in the template is treated as an error
    if (htmlwr_output(html) != NULL) {
      die("Found another JSON replacement!\n");
    }
    htmlwr_destroy(html);
  }
  // write text file
  output_centrimo_text(&options, motifN, stats_list);
  // Clean up.
  for (i = 0; i < seqN; ++i) {
    free_seq(sequences[i]); 
  }
  free(sequences);
  for (i = 0; i < arraylst_size(options.motif_sources); i++) {
    free_db(dbs[i]);
  }
  free(dbs);
  free_array(bg_freqs);
  free(counts.sites);
  free(seq_sites.sites);
  arraylst_destroy(free, stats_list);
  cleanup_options(&options);
  return 0;

}