/**
 * Creates a motif for a given mod using a simple frequency matrix.
 *
 * A count matrix is built from every sequence in mod_info->seq_list and
 * its rows are normalized. The matrix is wrapped in a new MOTIF_INFO_T
 * together with the raw sequences, and the result is appended to
 * mod_info->motifinfos.
 *
 * summary   supplies the alphabet passed to allocate_motif.
 * options   options->eliminate_repeats selects how list entries are
 *           unwrapped (hash table entry vs. direct sequence pointer).
 * mod_info  source of the sequences and destination of the new motif.
 *
 * NOTE(review): only the motif, seqs and fg_size fields of the new
 * MOTIF_INFO_T are assigned here; confirm that mm_malloc zeroes memory or
 * that no caller reads the remaining fields for simple motifs.
 */
void create_simple_motif(SUMMARY_T* summary,
                         MOMO_OPTIONS_T* options,
                         MOD_INFO_T * mod_info) {
  int i;
  // The list is not modified below, so its size can be read once.
  int num_seqs = arraylst_size(mod_info->seq_list);
  
  // Create the frequency matrix (counts first, then normalize each row)
  MATRIX_T* freqs = get_count_matrix(NULL, mod_info->seq_list, NULL, options, summary);
  normalize_rows(0.0, freqs);
  
  // Create the motif
  MOTIF_INFO_T* motifinfo = mm_malloc(sizeof(MOTIF_INFO_T));
  motifinfo->motif = allocate_motif(mod_info->mod_name, "", summary->alph, freqs, NULL);
  motifinfo->seqs = arraylst_create();
  for (i = 0; i < num_seqs; ++i) {
    // With eliminate_repeats the list holds hash table entries whose value
    // is the sequence; otherwise it holds the sequences directly.
    void* entry = arraylst_get(i, mod_info->seq_list);
    SEQ_T* seqobject = options->eliminate_repeats ? hash_get_entry_value(entry) : entry;
    arraylst_add(get_raw_sequence(seqobject), motifinfo->seqs);
  }
  motifinfo->fg_size = num_seqs;
  arraylst_add(motifinfo, mod_info->motifinfos);
  
  // clean up: the matrix is released here, so allocate_motif is presumed
  // to have taken its own copy -- TODO confirm
  free_matrix(freqs);
}
// Example no. 2
// 0
/***********************************************************************
 * Converts a list of TRANSFAC motifs into a list of MEME motifs.
 * When use_accession is TRUE each MEME motif is named after the
 * TRANSFAC accession; otherwise the TRANSFAC ID is used. A missing
 * name of the requested kind is fatal.
 * The caller is responsible for freeing the returned ARRAYLST.
 ***********************************************************************/
ARRAYLST_T *convert_transfac_motifs_to_meme_motifs(
    BOOLEAN_T use_accession,
    int pseudocount,
    ARRAY_T *bg,
    ARRAYLST_T *tfac_motifs
) {
    const int total = arraylst_size(tfac_motifs);
    ARRAYLST_T *converted = arraylst_create_sized(total);
    int idx;

    for (idx = 0; idx < total; idx++) {
        TRANSFAC_MOTIF_T *source
            = (TRANSFAC_MOTIF_T *) arraylst_get(idx, tfac_motifs);
        char *label;

        // Pick the naming field requested by the caller.
        if (use_accession != TRUE) {
            label = get_transfac_id(source);
            if (!label) {
                die("No ID string found in TRANSFAC motif.");
            }
        }
        else {
            label = get_transfac_accession(source);
            if (!label) {
                die("No accession string found in TRANSFAC motif.");
            }
        }

        arraylst_add(
            convert_transfac_motif_to_meme_motif(label, pseudocount, bg, source),
            converted
        );
    }

    return converted;
}
// Example no. 3
// 0
/*
 * Drains the motif reader into a list.
 * Every motif that mread_next_motif still yields is appended, in
 * order, to the given list. When no list is supplied (NULL) a new one
 * is created first.
 * Returns the list that received the motifs.
 */
ARRAYLST_T* mread_load(MREAD_T *mread, ARRAYLST_T *motifs) {
  MOTIF_T *next;

  if (!motifs) {
    motifs = arraylst_create();
  }
  // A NULL motif marks the end of what the reader can supply.
  for (next = mread_next_motif(mread); next != NULL; next = mread_next_motif(mread)) {
    arraylst_add(next, motifs);
  }
  return motifs;
}
// Example no. 4
// 0
/*************************************************************************
 * Read a motif database
 *
 * Opens motif_source, loads its motifs (only those whose id is in
 * "selected" when that tree is non-empty) and returns them wrapped in a
 * newly allocated MOTIF_DB_T tagged with "id".
 *
 * If *bg is already set it is handed to the reader; otherwise the
 * reader is pointed at bg_source and, provided at least one motif was
 * read, *bg receives the reader's background.
 * Dies when motifs were read but the file's alphabet differs from alph.
 *************************************************************************/
static MOTIF_DB_T* read_motifs(int id, char* motif_source, char* bg_source, 
    ARRAY_T** bg, double pseudocount, RBTREE_T *selected, ALPH_T alph) {
  MREAD_T *reader;
  ARRAYLST_T *kept;
  MOTIF_DB_T *db;
  int num_read = 0; // motifs seen in the file, including discarded ones

  // configure the reader
  reader = mread_create(motif_source, OPEN_MFILE);
  mread_set_pseudocount(reader, pseudocount);
  if (*bg == NULL) {
    mread_set_bg_source(reader, bg_source);
  } else {
    mread_set_background(reader, *bg);
  }

  // pull the motifs out of the file
  if (rbtree_size(selected) > 0) {
    // filtering: keep the selected ids, destroy everything else
    kept = arraylst_create();
    for (; mread_has_motif(reader); num_read++) {
      MOTIF_T *candidate = mread_next_motif(reader);
      if (!rbtree_find(selected, get_motif_id(candidate))) {
        DEBUG_FMT(NORMAL_VERBOSE, "Discarding motif %s in %s.\n", 
            get_motif_id(candidate), motif_source);
        destroy_motif(candidate);
        continue;
      }
      arraylst_add(candidate, kept);
    }
  } else {
    // no filter: take everything
    kept = mread_load(reader, NULL);
    num_read = arraylst_size(kept);
  }
  arraylst_fit(kept);

  if (num_read <= 0) {
    fprintf(stderr, "Warning: Motif file %s contains no motifs.\n", motif_source);
  } else {
    // the file must use the expected alphabet
    if (mread_get_alphabet(reader) != alph) {
      die("Expected %s alphabet motifs\n", alph_name(alph));
    }
    // adopt the reader's background when the caller had none
    if (*bg == NULL) {
      *bg = mread_get_background(reader);
    }
  }
  mread_destroy(reader);

  // package the result
  db = mm_malloc(sizeof(MOTIF_DB_T));
  memset(db, 0, sizeof(MOTIF_DB_T));
  db->motifs = kept;
  db->source = strdup(motif_source);
  db->id = id;
  return db; 
}
/*****************************************************************************
 * MEME > scanned_sites_summary > scanned_sites
 *
 * Callback for the start of a scanned sequence. Does nothing unless the
 * SCANNED_SITES flag is set in the context options. Otherwise it resets
 * the current site counter, looks the sequence up by seq_id and appends a
 * new scanned-sequence record (name, length, log10 p-value, site count)
 * to data->fscope.scanned_sites. An unknown seq_id is reported through
 * local_error and no record is added.
 ****************************************************************************/
void mxml_start_scanned_seq(void *ctx, char *seq_id, double log10pvalue, int site_count) {
  CTX_T *data = (CTX_T*)ctx;
  struct seqinfo *seq;

  // scanned sites were not requested
  if (!(data->options & SCANNED_SITES)) return;

  data->current_site = 0;
  seq = (struct seqinfo *)rbtree_get(data->sequence_lookup, seq_id);
  if (seq == NULL) {
    local_error(data, "Scanned sites references unknown sequence \"%s\".\n", seq_id);
    return;
  }
  arraylst_add(sseq_create(seq->name, seq->length, log10pvalue, site_count), data->fscope.scanned_sites);
}
// Example no. 6
// 0
/*
 * Loads the motifs named by opts->motif_filename and builds a PSSM for
 * each one (plus a reverse-complement PSSM when both strands are scanned).
 *
 * Side effects on opts:
 *  - scan_both_strands is cleared when the alphabet has no complement.
 *  - num_gc_bins is reset to 1 (with a warning) when the alphabet does not
 *    consist of exactly 4 core symbols forming 2 complementary pairs.
 *
 * Only motifs whose id is in opts->selected_motifs are kept, unless that
 * set is empty, in which case all motifs are kept.
 *
 * Returns a new list of motif_and_pssm_create() results.
 */
ARRAYLST_T* load_motifs(AMA_OPTIONS_T *opts) {
  ARRAYLST_T *motifs;
  ARRAY_T *pos_bg_freqs, *rev_bg_freqs;
  MREAD_T *mread;
  MOTIF_T *motif, *motif_rc;
  double range;
  PSSM_T *pos_pssm, *neg_pssm;
  int total_motifs;
  ALPH_T *alph;

  //
  // Read the motifs and background model.
  //
  //this reads any meme file, xml, txt and html
  mread = mread_create(opts->motif_filename, OPEN_MFILE);
  mread_set_bg_source(mread, opts->bg_filename);
  mread_set_pseudocount(mread, opts->pseudocount);

  // sanity check, since the rest of the code relies on the motifs being complementable
  alph = alph_hold(mread_get_alphabet(mread));
  if (alph == NULL) die("Unable to determine alphabet from motifs");
  if (opts->scan_both_strands && !alph_has_complement(alph)) {
    opts->scan_both_strands = false;
  }
  // GC binning needs exactly 4 core symbols AND exactly 2 complementary
  // pairs, so it must be disabled when EITHER requirement fails. (The
  // previous "&&" only disabled it when both failed at once.)
  if (opts->num_gc_bins > 1 && (alph_size_core(alph) != 4 || alph_size_pairs(alph) != 2)) {
    fprintf(stderr, "Warning: The motif alphabet does not have exactly 2 complementary pairs so \"GC binning\" will be disabled.\n");
    opts->num_gc_bins = 1;
  }

  pos_bg_freqs = mread_get_background(mread);
  rev_bg_freqs = NULL;
  if (opts->scan_both_strands) {
    // background as seen from the reverse strand
    rev_bg_freqs = allocate_array(get_array_length(pos_bg_freqs));
    copy_array(pos_bg_freqs, rev_bg_freqs);
    complement_swap_freqs(alph, rev_bg_freqs, rev_bg_freqs);
  }

  // allocate memory for motifs
  motifs = arraylst_create();
  //
  // Convert motif matrices into log-odds matrices.
  // Scale them.
  // Compute the lookup tables for the PDF of scaled log-odds scores.
  //
  range = 300; // 100 is not very good; 1000 is great but too slow
  neg_pssm = NULL; // stays NULL for every motif when scanning one strand
  total_motifs = 0;
  while (mread_has_motif(mread)) {
    motif = mread_next_motif(mread);
    total_motifs++;
    if (rbtree_size(opts->selected_motifs) == 0 || rbtree_find(opts->selected_motifs, get_motif_id(motif)) != NULL) {
      if (verbosity >= HIGH_VERBOSE) {
        fprintf(stderr, "Using motif %s of width %d.\n", get_motif_id(motif), get_motif_length(motif));
      }
      pos_pssm =
        build_motif_pssm(
          motif, 
          pos_bg_freqs, 
          pos_bg_freqs, 
          NULL, // Priors not used
          0.0L, // alpha not used
          range, 
          opts->num_gc_bins, 
          true 
        );
      //
      //  Note: If scanning both strands, we complement the motif frequencies
      //  but not the background frequencies so the motif looks the same.
      //  However, the given frequencies are used in computing the p-values
      //  since they represent the frequencies on the negative strands.
      //  (If we instead were to complement the input sequence, keeping the
      //  the motif fixed, we would need to use the complemented frequencies
      //  in computing the p-values.  Is that any clearer?)
      //
      if (opts->scan_both_strands) {
        motif_rc = dup_rc_motif(motif);
        neg_pssm =
          build_motif_pssm(
            motif_rc, 
            rev_bg_freqs, 
            pos_bg_freqs, 
            NULL, // Priors not used
            0.0L, // alpha not used
            range, 
            opts->num_gc_bins, 
            true
          );
        // only the PSSM is kept for the reverse strand
        destroy_motif(motif_rc);
      }
      arraylst_add(motif_and_pssm_create(motif, pos_pssm, neg_pssm), motifs);
    } else {
      if (verbosity >= HIGH_VERBOSE) fprintf(stderr, "Skipping motif %s.\n",
          get_motif_id(motif));
      destroy_motif(motif);
    }
  }
  mread_destroy(mread);
  free_array(pos_bg_freqs);
  free_array(rev_bg_freqs);
  alph_release(alph);
  if (verbosity >= NORMAL_VERBOSE) {
    fprintf(stderr, "Loaded %d/%d motifs from %s.\n", 
        arraylst_size(motifs), total_motifs, opts->motif_filename);
  }
  return motifs;
}
/**
 * Recursive function. Creates and stores a motif using the motif-x
 * algorithm until no more are left.
 *
 * Each call asks add_to_pattern for the next pattern among the still
 * active sequences. If a pattern is found, a motif named
 * "<left half>_<modname>_<right half>" is appended to
 * mod_info->motifinfos, the matched sequences are retired via
 * delete_sequences, *num_active / *num_bg_active are reduced by the number
 * of matches, the count matrices are recomputed, and the function
 * recurses. If no pattern is found the recursion ends.
 *
 * The scratch buffers are released exactly once on every path; the
 * earlier version released them both before the recursive call and again
 * afterwards (double free).
 *
 * NOTE(review): result_count_matrix is not freed in this function;
 * confirm whether allocate_motif takes ownership of it, otherwise it is
 * leaked once per call.
 */
void create_motifx_motif(ARRAYLST_T* phospho_seqs,
                         ARRAYLST_T* bg_seqs,
                         MOTIFX_STATUS_T** phospho_status,
                         MOTIFX_STATUS_T** bg_status,
                         MATRIX_T* phospho_count,
                         MATRIX_T* bg_count,
                         int* num_active,
                         int* num_bg_active,
                         char* modname,
                         MOD_INFO_T* mod_info,
                         MOMO_OPTIONS_T* options,
                         SUMMARY_T* summary) {
  int i;
  int j;
  
  const char* alph_letters = summary->alph_letters;
  // computed once instead of on every inner-loop iteration
  const int num_letters = (int) strlen(alph_letters);
  
  // Initialize the pattern to all wildcards.
  char* pattern = mm_malloc(options->width + 1);
  for (i = 0; i < options->width; ++i) {
    pattern[i] = 'X';
  }
  pattern[options->width] = '\0';
  
  // Working copies that add_to_pattern updates to describe the match:
  // matching foreground/background sequence counts and the motif score.
  int num_active_copy = *num_active;
  int num_bg_active_copy = *num_bg_active;
  double motif_score = 0;
  
  // Set the pattern, num active copy, num bg active copy, motif score, and get a count of the sequences
  MATRIX_T* result_count_matrix = add_to_pattern(pattern, phospho_seqs, bg_seqs, phospho_status, bg_status, &num_active_copy, &num_bg_active_copy, phospho_count, bg_count, &motif_score, summary, options);
  
  // If any of the characters are not X, then we have found a pattern
  BOOLEAN_T found_pattern = FALSE;
  for (i = 0; i < options->width; ++i) {
    if (pattern[i] != 'X') {
      found_pattern = TRUE;
      break;
    }
  }
  
  // No pattern: nothing more to extract, stop recursing.
  if (!found_pattern) {
    myfree(pattern);
    return;
  }
  
  // fill out the rest of the pattern (e.g. if you have pattern ..ASAAA, and realize the actual pattern is A.ASAAA
  for (i = 0; i < options->width; i++) {
    for (j = 0; j < num_letters; j++) {
      // a letter present in every matching sequence is fixed at this position
      if ((int) get_matrix_cell_defcheck(i, j, result_count_matrix) == num_active_copy) {
        pattern[i] = alph_letters[j];
      }
    }
  }
  
  // create the pattern name: left half, "_", modname, "_", right half
  // (the central character of the pattern is skipped)
  size_t pattern_len = strlen(pattern);
  size_t half = pattern_len / 2;
  char* pattern_name = mm_malloc(pattern_len + strlen(modname) + 3);
  pattern_name[0] = '\0';
  strncat(pattern_name, pattern, half);
  strncat(pattern_name, "_", 1);
  strncat(pattern_name, modname, strlen(modname));
  strncat(pattern_name, "_", 1);
  strncat(pattern_name, pattern + half + 1, half);
  
  // convert this count matrix into frequencies
  normalize_rows(0.0, result_count_matrix);
  
  // Store this motif
  MOTIF_INFO_T* motifinfo = mm_malloc(sizeof(MOTIF_INFO_T));
  MOTIF_T* motif = allocate_motif(pattern_name, "", summary->alph, result_count_matrix, NULL);
  set_motif_nsites(motif, num_active_copy);
  motifinfo->motif = motif;
  motifinfo->seqs = arraylst_create();
  motifinfo->score = motif_score;
  motifinfo->fg_match = num_active_copy;
  motifinfo->fg_size = *num_active;
  motifinfo->bg_match = num_bg_active_copy;
  motifinfo->bg_size = *num_bg_active;
  for (i = 0; i < arraylst_size(phospho_seqs); ++i) {
    MOTIFX_STATUS_T status = (*phospho_status)[i];
    if (status == ACTIVE) {
      // With eliminate_repeats the list holds hash table entries whose
      // value is the sequence; otherwise it holds the sequences directly.
      SEQ_T* active_sequence = (options->eliminate_repeats) ? hash_get_entry_value(arraylst_get(i, phospho_seqs)) : arraylst_get(i, phospho_seqs);
      arraylst_add(get_raw_sequence(active_sequence), motifinfo->seqs);
    }
  }
  arraylst_add(motifinfo, mod_info->motifinfos);
  
  // delete the sequences from this motif. turn inactive into active.
  delete_sequences(phospho_status, arraylst_size(phospho_seqs));
  delete_sequences(bg_status, arraylst_size(bg_seqs));
  
  // update the count of number of actives
  *num_active = *num_active - num_active_copy;
  *num_bg_active = *num_bg_active - num_bg_active_copy;
  
  // recalculate phospho count and bg count.
  phospho_count = get_count_matrix(phospho_count, phospho_seqs, phospho_status, options, summary);
  bg_count = get_count_matrix(bg_count, bg_seqs, bg_status, options, summary);
  
  // free up space before recursing so buffers do not pile up on deep recursion
  myfree(pattern);
  myfree(pattern_name);
  
  // try to create another motif.
  create_motifx_motif(phospho_seqs,
                      bg_seqs,
                      phospho_status,
                      bg_status,
                      phospho_count,
                      bg_count,
                      num_active,
                      num_bg_active,
                      modname,
                      mod_info,
                      options,
                      summary);
}
// Example no. 8
// 0
/***********************************************************************
 * Read TRANSFAC motifs from a TRANSFAC file.
 * Returns an arraylist of pointers to TRANSFAC_MOTIF_T
 *
 * Each matrix starts at an "AC" line. Header lines (ID, BF, NA, DE) are
 * collected up to the "PO"/"P0" line; the count rows that follow are
 * first counted, then the file is rewound and the rows are parsed into
 * a positions x 4 matrix plus a consensus string.
 *
 * The function exits if the file cannot be opened and dies on malformed
 * input: a missing PO line, a row number outside 1..number of rows, a
 * row that does not contain four integer counts, or a motif too long
 * for the consensus buffer.
 *
 * NOTE(review): buffers returned by getline2 and the strdup'ed header
 * strings are never released here; confirm the ownership contracts of
 * getline2 and new_transfac_motif.
 ***********************************************************************/
ARRAYLST_T *read_motifs_from_transfac_file (
    const char* transfac_filename  // Name of TRANSFAC file or '-' for stdin IN
) {
    // Number of count columns in a TRANSFAC row.
    enum { TRANSFAC_NUM_BASES = 4 };

    // Create dynamic storage for motifs
    ARRAYLST_T *motif_list = arraylst_create();

    // Open the TRANSFAC file for reading.
    FILE *transfac_file = NULL;
    if (open_file(
                transfac_filename,
                "r",
                TRUE, // Allow '-' for stdin
                "transfac file",
                "",
                &transfac_file
            ) == FALSE) {
        exit(1);
    }

    // Read and parse the TRANSFAC file.
    char *line = NULL;
    while ((line = getline2(transfac_file)) != NULL) {

        // Split the line into an initial tag and everything else.
        char *this_accession = split(line, ' ');
        char *tag = line;

        // Skip everything until a new matrix starts.
        if (strcmp(tag, "AC") != 0) {
            continue;
        }

        trim(this_accession);

        char *this_id = NULL;
        char *this_name = NULL;
        char *this_descr = NULL;
        // Zero-filled so the consensus is always terminated, even if
        // some row numbers are skipped in the file.
        char this_consensus[MAX_CONSENSUS_LENGTH] = {0};
        STRING_LIST_T *species_list = new_string_list();

        // Old versions of TRANSFAC use pee-zero; new use pee-oh.
        while (strcmp(tag, "PO") != 0 && strcmp(tag, "P0") != 0) {

            line = getline2(transfac_file);
            if (line == NULL) {
                die ("Can't find PO line for TRANSFAC matrix %s.\n", this_accession);
            }
            char *data = split(line, ' ');
            tag = line;
            if (data == NULL) {
                // A tag without a payload carries nothing to store.
                continue;
            }
            trim(data);

            // Store the id line.
            if (strcmp(tag, "ID") == 0) {
                this_id = strdup(data);
            }
            // Store the species line.
            else if (strcmp(tag, "BF") == 0) {
                add_string(data, species_list);
            }
            // Store the name line.
            else if (strcmp(tag, "NA") == 0) {
                this_name = strdup(data);
            }
            // Store the description line.
            else if (strcmp(tag, "DE") == 0) {
                this_descr = strdup(data);
            }
        }

        // Check how many positions in the motif
        // Mark current position in file
        fpos_t file_position;
        errno = 0;
        int status = fgetpos(transfac_file, &file_position);
        if (status) {
            die("Error reading file %s: %s", transfac_filename, strerror(errno));
        }

        int num_motif_positions = 0;
        while (TRUE) {

            // Read till we reach the end of the counts or the end of the motif
            line = getline2(transfac_file);
            if (line == NULL) {
                break;
            }

            // Isolate the tag; the row contents are not needed in this pass.
            split(line, ' ');
            tag = line;

            if ((strcmp(tag, "XX\n") == 0) || (strcmp(tag, "//\n") == 0)) {
                break;
            }

            ++num_motif_positions;
        }
        // Rewind file
        errno = 0;
        status = fsetpos(transfac_file, &file_position);
        if (status) {
            die("Error reading file %s: %s", transfac_filename, strerror(errno));
        }

        // Read the motif counts.
        MATRIX_T *motif_counts = allocate_matrix(num_motif_positions, TRANSFAC_NUM_BASES);
        int position = 0;
        while (TRUE) {

            line = getline2(transfac_file);
            if (line == NULL) {
                break;
            }

            char *data = split(line, ' ');
            if (data != NULL) {
                trim(data);
            }
            tag = line;

            // Look for the end of the motif.
            if ((strcmp(tag, "XX\n") == 0) || (strcmp(tag, "//\n") == 0)) {
                break;
            }

            // The tag of a count row is its 1-based position. Reject
            // anything outside the matrix: atoi yields 0 for a
            // non-numeric tag, which would otherwise index row -1.
            position = atoi(tag);
            if (position < 1 || position > num_motif_positions) {
                die(
                    "Error reading motif counts at position %d of motif %s in file %s",
                    position,  this_accession, transfac_filename
                );
            }
            // Leave room for the terminating NUL written after the loop.
            if (position >= MAX_CONSENSUS_LENGTH) {
                die(
                    "Motif %s in file %s is longer than the maximum supported length of %d",
                    this_accession, transfac_filename, MAX_CONSENSUS_LENGTH - 1
                );
            }

            // Store the contents of this row.
            int count[TRANSFAC_NUM_BASES] = {0};
            // Used when the row has no consensus column.
            // NOTE(review): 'N' is an assumed placeholder -- confirm.
            char consensus = 'N';
            int num_parsed = 0;
            if (data != NULL) {
                num_parsed = sscanf(
                    data,
                    "%d %d %d %d %c",
                    &(count[0]),
                    &(count[1]),
                    &(count[2]),
                    &(count[3]),
                    &consensus
                );
            }
            // All four counts are mandatory; only the consensus is optional.
            if (num_parsed < TRANSFAC_NUM_BASES) {
                die(
                    "Error reading motif counts at position %d of motif %s in file %s",
                    position,  this_accession, transfac_filename
                );
            }
            int i_base;
            for (i_base = 0; i_base < TRANSFAC_NUM_BASES; i_base++) {
                set_matrix_cell(position - 1, i_base, count[i_base], motif_counts);
            }
            this_consensus[position - 1] = consensus;

        }

        // Terminate the consensus after the last row read.
        this_consensus[position] = 0;
        TRANSFAC_MOTIF_T *motif = new_transfac_motif(
                                      this_accession,
                                      this_id,
                                      this_name,
                                      this_descr,
                                      this_consensus,
                                      species_list,
                                      motif_counts
                                  );
        arraylst_add(motif, motif_list);
    }

    fclose(transfac_file);
    return motif_list;

}
// Example no. 9
// 0
/*************************************************************************
 * Entry point for centrimo
 *
 * Reads the sequences and the motif databases named on the command
 * line, scores every motif against every sequence, tallies the site
 * positions, and writes the site counts file, the HTML/JSON report
 * (when the HTML template can be opened) and the text report.
 *
 * The set of motifs in the text report does not depend on whether the
 * HTML template could be opened: every motif passing the adjusted
 * p-value threshold is recorded, and JSON is written for it only when a
 * JSON writer exists.
 *
 * NOTE(review): with zero input sequences seqlen is 0 and
 * counts.allocated becomes -1; confirm read_sequences rejects empty input.
 *************************************************************************/
int main(int argc, char *argv[]) {
  CENTRIMO_OPTIONS_T options;
  SEQ_SITES_T seq_sites;
  SITE_COUNTS_T counts;
  int seqN, motifN, seqlen, db_i, motif_i, i;
  double log_pvalue_thresh;
  SEQ_T** sequences = NULL;
  ARRAY_T* bg_freqs = NULL;
  ARRAYLST_T *stats_list;
  MOTIF_DB_T **dbs, *db;
  MOTIF_STATS_T *stats;
  MOTIF_T *motif, *rev_motif;
  PSSM_T *pos_pssm, *rev_pssm;
  char *sites_path, *desc;
  FILE *sites_file;
  HTMLWR_T *html;
  JSONWR_T *json;

  // COMMAND LINE PROCESSING
  process_command_line(argc, argv, &options);

  // load the sequences
  read_sequences(options.alphabet, options.seq_source, &sequences, &seqN);
  seqlen = (seqN ? get_seq_length(sequences[0]) : 0);
  // calculate a sequence background (unless other background is given)
  if (!options.bg_source) {
    bg_freqs = calc_bg_from_fastas(options.alphabet, seqN, sequences);
  }

  // load the motifs
  motifN = 0;
  dbs = mm_malloc(sizeof(MOTIF_DB_T*) * arraylst_size(options.motif_sources));
  for (i = 0; i < arraylst_size(options.motif_sources); i++) {
    char* db_source;
    db_source = (char*)arraylst_get(i, options.motif_sources);
    dbs[i] = read_motifs(i, db_source, options.bg_source, &bg_freqs, 
        options.pseudocount, options.selected_motifs, options.alphabet);
    motifN += arraylst_size(dbs[i]->motifs);
  }
  // E-value threshold divided by the number of motifs tested, in log space
  log_pvalue_thresh = log(options.evalue_thresh) - log(motifN);
  // Setup some things for double strand scanning
  if (options.scan_both_strands == TRUE) {
    // Set up hash tables for computing reverse complement
    setup_hash_alph(DNAB);
    setalph(0);
    // Correct background by averaging on freq. for both strands.
    average_freq_with_complement(options.alphabet, bg_freqs);
    normalize_subarray(0, alph_size(options.alphabet, ALPH_SIZE), 0.0, bg_freqs);
    calc_ambigs(options.alphabet, FALSE, bg_freqs);
  }
  // Create output directory
  if (create_output_directory(options.output_dirname, options.allow_clobber, 
        (verbosity >= NORMAL_VERBOSE))) {
    die("Couldn't create output directory %s.\n", options.output_dirname);
  }
  // open output files
  sites_path = make_path_to_file(options.output_dirname, SITES_FILENAME);
  sites_file = fopen(sites_path, "w");
  if (sites_file == NULL) {
    // every later write and the final fclose need a valid stream
    die("Couldn't open %s for writing.\n", sites_path);
  }
  free(sites_path);
  // setup html monolith writer
  json = NULL;
  if ((html = htmlwr_create(get_meme_etc_dir(), TEMPLATE_FILENAME))) {
    htmlwr_set_dest_name(html, options.output_dirname, HTML_FILENAME);
    htmlwr_replace(html, "centrimo_data.js", "data");
    json = htmlwr_output(html);
    if (json == NULL) die("Template does not contain data section.\n");
  } else {
    DEBUG_MSG(QUIET_VERBOSE, "Failed to open html template file.\n");
  }
  if (json) {
    // output some top level variables
    jsonwr_str_prop(json, "version", VERSION);
    jsonwr_str_prop(json, "revision", REVISION);
    jsonwr_str_prop(json, "release", ARCHIVE_DATE);
    jsonwr_str_array_prop(json, "cmd", argv, argc);
    jsonwr_property(json, "options");
    jsonwr_start_object_value(json);
    jsonwr_dbl_prop(json, "motif-pseudo", options.pseudocount);
    jsonwr_dbl_prop(json, "score", options.score_thresh);
    jsonwr_dbl_prop(json, "ethresh", options.evalue_thresh);
    jsonwr_lng_prop(json, "maxbin", options.max_window+1);
    jsonwr_bool_prop(json, "norc", !options.scan_both_strands);
    jsonwr_bool_prop(json, "noflip", options.no_flip);
    jsonwr_end_object_value(json);
    // output the description
    desc = prepare_description(&options);
    if (desc) {
      jsonwr_str_prop(json, "job_description", desc);
      free(desc);
    }
    // output size metrics
    jsonwr_lng_prop(json, "seqlen", seqlen);
    jsonwr_lng_prop(json, "tested", motifN);
    // output the fasta db
    jsonwr_property(json, "sequence_db");
    jsonwr_start_object_value(json);
    jsonwr_str_prop(json, "source", options.seq_source);
    jsonwr_lng_prop(json, "count", seqN);
    jsonwr_end_object_value(json);
    // output the motif dbs
    jsonwr_property(json, "motif_dbs");
    jsonwr_start_array_value(json);
    for (db_i = 0; db_i < arraylst_size(options.motif_sources); db_i++) {
      db = dbs[db_i];
      jsonwr_start_object_value(json);
      jsonwr_str_prop(json, "source", db->source);
      jsonwr_lng_prop(json, "count", arraylst_size(db->motifs));
      jsonwr_end_object_value(json);
    }
    jsonwr_end_array_value(json);
    // start the motif array
    jsonwr_property(json, "motifs");
    jsonwr_start_array_value(json);
  }
  /**************************************************************
   * Tally the positions of the best sites for each of the 
   * selected motifs.
   **************************************************************/
  // prepare the sequence sites
  memset(&seq_sites, 0, sizeof(SEQ_SITES_T));
  // prepare the site counts
  counts.allocated = ((2 * seqlen) - 1);
  counts.sites = mm_malloc(sizeof(double) * counts.allocated);
  // prepare the motifs stats list
  stats_list = arraylst_create();
  // prepare the other vars
  motif = NULL; pos_pssm = NULL; rev_motif = NULL; rev_pssm = NULL;
  for (db_i = 0; db_i < arraylst_size(options.motif_sources); db_i++) {
    db = dbs[db_i];
    for (motif_i = 0; motif_i < arraylst_size(db->motifs); motif_i++) {
      motif = (MOTIF_T *) arraylst_get(motif_i, db->motifs);
      DEBUG_FMT(NORMAL_VERBOSE, "Using motif %s of width %d.\n",  
          get_motif_id(motif), get_motif_length(motif));
      // reset the counts
      for (i = 0; i < counts.allocated; i++) counts.sites[i] = 0;
      counts.total_sites = 0;
      // create the pssm 
      pos_pssm = make_pssm(bg_freqs, motif);
      // If required, do the same for the reverse complement motif.
      if (options.scan_both_strands) {
        rev_motif = dup_rc_motif(motif);
        rev_pssm = make_pssm(bg_freqs, rev_motif);
      }
      // scan the sequences
      for (i = 0; i < seqN; i++)
        score_sequence(&options, sequences[i], pos_pssm, rev_pssm, 
            &seq_sites, &counts);
      // DEBUG check that the sum of the sites is close to the site count
      double sum_check = 0, sum_diff;
      for (i = 0; i < counts.allocated; i++) sum_check += counts.sites[i];
      sum_diff = counts.total_sites - sum_check;
      if (sum_diff < 0) sum_diff = -sum_diff;
      if (sum_diff > 0.1) {
        fprintf(stderr, "Warning: site counts don't sum to accurate value! "
            "%g != %ld", sum_check, counts.total_sites);
      }
      // output the plain text site counts
      output_site_counts(sites_file, seqlen, db, motif, &counts);
      // compute the best central window
      stats = compute_stats(options.max_window, seqlen, db, motif, &counts);
      // Keep every motif that passes the threshold so the text report is
      // complete even without an HTML template; JSON is written only
      // when a JSON writer exists.
      if (stats->log_adj_pvalue <= log_pvalue_thresh) {
        if (json) output_motif_json(json, stats, &counts);
        arraylst_add(stats, stats_list);
      } else {
        free(stats);
      }
      // Free memory associated with this motif.
      // (rev_pssm and rev_motif are NULL when scanning a single strand.)
      free_pssm(pos_pssm);
      free_pssm(rev_pssm);
      destroy_motif(rev_motif);
    }
  }
  if (json) jsonwr_end_array_value(json);
  // finish writing sites
  fclose(sites_file);
  // finish writing html file
  if (html) {
    if (htmlwr_output(html) != NULL) {
      die("Found another JSON replacement!\n");
    }
    htmlwr_destroy(html);
  }
  // write text file
  output_centrimo_text(&options, motifN, stats_list);
  // Clean up.
  for (i = 0; i < seqN; ++i) {
    free_seq(sequences[i]); 
  }
  free(sequences);
  for (i = 0; i < arraylst_size(options.motif_sources); i++) {
    free_db(dbs[i]);
  }
  free(dbs);
  free_array(bg_freqs);
  free(counts.sites);
  free(seq_sites.sites);
  arraylst_destroy(free, stats_list);
  cleanup_options(&options);
  return 0;

}
// Example no. 10
// 0
/***********************************************************************
  Process command line options

  Fills *options with defaults, then applies every option found on the
  command line. The first positional argument is recorded as the
  sequence source and all remaining ones as motif sources. Prints the
  usage message and exits with EXIT_FAILURE on an option error or when
  fewer than two positional arguments are given.

  Option values are stored as pointers into argv and are not copied.
 ***********************************************************************/
static void process_command_line(
  int argc,
  char* argv[],
  CENTRIMO_OPTIONS_T *options
) {

  // Define command line options.
  // "motif" is listed here because the parse loop below handles it; it
  // was missing from the table, so that branch could never be reached.
  cmdoption const centrimo_options[] = {
    {"bgfile", REQUIRED_VALUE},
    {"o", REQUIRED_VALUE},
    {"oc", REQUIRED_VALUE},
    {"score", REQUIRED_VALUE},
    {"motif", REQUIRED_VALUE},
    {"motif-pseudo", REQUIRED_VALUE},
    {"ethresh", REQUIRED_VALUE},
    {"maxbin", REQUIRED_VALUE},
    {"norc", NO_VALUE},
    {"noflip", NO_VALUE},
    {"desc", REQUIRED_VALUE},
    {"dfile", REQUIRED_VALUE},
    {"verbosity", REQUIRED_VALUE}
  };
  // Derived from the table so the count cannot drift out of sync with it.
  const int num_options =
    (int) (sizeof(centrimo_options) / sizeof(centrimo_options[0]));

  int option_index = 0;

  /* Make sure various options are set to NULL or defaults. */
  options->alphabet = DNA_ALPH;
  options->allow_clobber = TRUE;
  options->scan_both_strands = TRUE;
  options->no_flip = FALSE;

  options->description = NULL;
  options->desc_file = NULL;
  options->bg_source = NULL;
  options->output_dirname = "centrimo_out";
  options->seq_source = NULL;
  options->motif_sources = arraylst_create();

  options->score_thresh = DEFAULT_SCORE_THRESH;

  options->pseudocount = DEFAULT_PSEUDOCOUNT;

  options->evalue_thresh = DEFAULT_EVALUE_THRESH;

  options->max_window = DEFAULT_MAX_WINDOW;

  // no need to copy, as string is declared in argv array
  options->selected_motifs = rbtree_create(rbtree_strcmp, NULL, NULL, NULL, NULL);

  verbosity = NORMAL_VERBOSE;

  simple_setopt(argc, argv, num_options, centrimo_options);

  // Parse the command line.
  while (TRUE) {
    int c = 0;
    char* option_name = NULL;
    char* option_value = NULL;
    const char * message = NULL;

    // Read the next option, and break if we're done.
    c = simple_getopt(&option_name, &option_value, &option_index);
    if (c == 0) {
      break;
    }
    else if (c < 0) {
      (void) simple_getopterror(&message);
      fprintf(stderr, "Error processing command line options (%s)\n", message);
      fprintf(stderr, CENTRIMO_USAGE, DEFAULT_PSEUDOCOUNT, DEFAULT_SCORE_THRESH,
          DEFAULT_EVALUE_THRESH, NORMAL_VERBOSE);
      exit(EXIT_FAILURE);
    }
    if (strcmp(option_name, "bgfile") == 0){
      options->bg_source = option_value;
    }
    else if (strcmp(option_name, "ethresh") == 0){
      options->evalue_thresh = atof(option_value);
    }
    else if (strcmp(option_name, "maxbin") == 0){
      // max_window is one less than the number of places a motif can align
      // within the central window
      options->max_window = atoi(option_value) - 1;  
    }
    else if (strcmp(option_name, "motif") == 0){
      // may be given several times; each id is added to the selection
      rbtree_put(options->selected_motifs, option_value, NULL);
    }
    else if (strcmp(option_name, "motif-pseudo") == 0){
      options->pseudocount = atof(option_value);
    }
    else if (strcmp(option_name, "norc") == 0){
      options->scan_both_strands = FALSE;
    }
    else if (strcmp(option_name, "noflip") == 0){
      options->no_flip = TRUE;
    }
    else if (strcmp(option_name, "o") == 0){
      // Set output directory with no clobber
      options->output_dirname = option_value;
      options->allow_clobber = FALSE;
    }
    else if (strcmp(option_name, "oc") == 0){
      // Set output directory with clobber
      options->output_dirname = option_value;
      options->allow_clobber = TRUE;
    }
    else if (strcmp(option_name, "score") == 0){
      options->score_thresh = atof(option_value);
    }
    else if (strcmp(option_name, "desc") == 0) {
      options->description = option_value;
    } 
    else if (strcmp(option_name, "dfile") == 0) {
      options->desc_file = option_value;
    }
    else if (strcmp(option_name, "verbosity") == 0){
      verbosity = atoi(option_value);
    }
  }
  // Must have sequence and motif file names
  if (argc < option_index + 2) {
    fprintf(stderr, "Sequences and motifs are both required\n");
    fprintf(stderr, CENTRIMO_USAGE, DEFAULT_PSEUDOCOUNT, DEFAULT_SCORE_THRESH,
        DEFAULT_EVALUE_THRESH, NORMAL_VERBOSE);
    exit(EXIT_FAILURE);
  }

  // Record the input file names: first the sequences, then one or more
  // motif databases.
  options->seq_source = argv[option_index++];
  for (;option_index < argc; option_index++) 
    arraylst_add(argv[option_index], options->motif_sources);

  // Set up path values for needed stylesheets and output files.
}