コード例 #1
0
/*************************************************************************
 * Converts the motif frequency matrix into a odds matrix: taken from old ama-scan.c
 *************************************************************************/
void convert_to_odds_matrix(MOTIF_T* motif, ARRAY_T* bg_freqs){
  const int asize = alph_size(get_motif_alph(motif), ALPH_SIZE);
  int motif_position_index,alph_index;
  MATRIX_T *freqs;
  freqs = get_motif_freqs(motif);

  const int num_motif_positions = get_num_rows(freqs);
  for (alph_index=0;alph_index<asize;++alph_index){
    double bg_likelihood = get_array_item(alph_index, bg_freqs);
    for (motif_position_index=0;motif_position_index<num_motif_positions;++motif_position_index){
      freqs->rows[motif_position_index]->items[alph_index] /= bg_likelihood;
    }
  }
}
コード例 #2
0
/*************************************************************************
 * Copies the motif frequency matrix and converts it into a odds matrix
 *************************************************************************/
MATRIX_T* create_odds_matrix(MOTIF_T *motif, ARRAY_T* bg_freqs){
  const int asize = alph_size(get_motif_alph(motif), ALPH_SIZE);
  int pos, aidx;
  MATRIX_T *odds;
  
  odds = duplicate_matrix(get_motif_freqs(motif));
  const int num_pos = get_num_rows(odds);
  for (aidx = 0; aidx < asize; ++aidx) {
    double bg_likelihood = get_array_item(aidx, bg_freqs);
    for (pos = 0; pos < num_pos; ++pos) {
      odds->rows[pos]->items[aidx] /= bg_likelihood;
    }
  }
  return odds;
}
コード例 #3
0
ファイル: centrimo.c プロジェクト: CPFL/gmeme
/*************************************************************************
 * Output JSON data for a motif
 *************************************************************************/
static void output_motif_json(JSONWR_T* json, MOTIF_STATS_T* stats, 
    SITE_COUNTS_T* counts) {
  //vars
  MOTIF_T *motif;
  MATRIX_T *freqs;
  int i, j, mlen, asize, end;
  motif = stats->motif;
  freqs = get_motif_freqs(motif);
  asize = alph_size(get_motif_alph(motif), ALPH_SIZE);
  jsonwr_start_object_value(json);
  jsonwr_lng_prop(json, "db", stats->db->id);
  jsonwr_str_prop(json, "id", get_motif_id(motif));
  if (*(get_motif_id2(motif))) {
    jsonwr_str_prop(json, "alt", get_motif_id2(motif));
  }
  mlen = get_motif_length(motif);
  jsonwr_lng_prop(json, "len", mlen);
  jsonwr_dbl_prop(json, "motif_evalue", get_motif_evalue(motif));
  jsonwr_dbl_prop(json, "motif_nsites", get_motif_nsites(motif));
  if (get_motif_url(motif) && *get_motif_url(motif)) {
    jsonwr_str_prop(json, "url", get_motif_url(motif));
  }
  jsonwr_property(json, "pwm");
  jsonwr_start_array_value(json);
  for (i = 0; i < mlen; i++) {
    jsonwr_start_array_value(json);
    for (j = 0; j < asize; j++) {
      jsonwr_dbl_value(json, get_matrix_cell(i, j, freqs));
    }
    jsonwr_end_array_value(json);
  }
  jsonwr_end_array_value(json);
  jsonwr_lng_prop(json, "bin_width", stats->central_window+1);
  jsonwr_dbl_prop(json, "bin_sites", stats->central_sites);
  jsonwr_lng_prop(json, "total_sites", counts->total_sites);
  jsonwr_dbl_prop(json, "log_pvalue", stats->log_adj_pvalue);
  jsonwr_dbl_prop(json, "max_prob", stats->max_prob);
  jsonwr_property(json, "sites");
  jsonwr_start_array_value(json);
  end = counts->allocated - (mlen - 1);
  for (i = (mlen - 1); i < end; i += 2) {
    jsonwr_dbl_value(json, counts->sites[i]);
  }
  jsonwr_end_array_value(json);
  jsonwr_end_object_value(json);
}
コード例 #4
0
ファイル: pmp_bf.c プロジェクト: brsaran/FuzzyApp
/*************************************************************************
 * Entry point for pmp_bf
 *************************************************************************/
int main(int argc, char *argv[]) {

  char* bg_filename = NULL;
  char* motif_name = "motif"; // Use this motif name in the output.
  STRING_LIST_T* selected_motifs = NULL;
  double fg_rate = 1.0;
  double bg_rate = 1.0;
  double purine_pyrimidine = 1.0; // r
  double transition_transversion = 0.5; // R
  double pseudocount = 0.1;
  GAP_SUPPORT_T gap_support = SKIP_GAPS;
  MODEL_TYPE_T model_type = F81_MODEL;
  BOOLEAN_T use_halpern_bruno = FALSE;
  char* ustar_label = NULL;	// TLB; create uniform star tree
  int i;

  program_name = "pmp_bf";

  /**********************************************
   * COMMAND LINE PROCESSING
   **********************************************/

  // Define command line options. (FIXME: Repeated code)
  // FIXME: Note that if you add or remove options you
  // must change n_options.
  int n_options = 12;
  cmdoption const pmp_options[] = {
    {"hb", NO_VALUE},
    {"ustar", REQUIRED_VALUE},
    {"model", REQUIRED_VALUE},
    {"pur-pyr", REQUIRED_VALUE},
    {"transition-transversion", REQUIRED_VALUE},
    {"bg", REQUIRED_VALUE},
    {"fg", REQUIRED_VALUE},
    {"motif", REQUIRED_VALUE},
    {"motif-name", REQUIRED_VALUE},
    {"bgfile", REQUIRED_VALUE},
    {"pseudocount", REQUIRED_VALUE},
    {"verbosity", REQUIRED_VALUE}
  };

  int option_index = 0;

  // Define the usage message.
  char      usage[1000] = "";
  strcat(usage, "USAGE: pmp [options] <tree file> <MEME file>\n");
  strcat(usage, "\n");
  strcat(usage, "   Options:\n");

  // Evolutionary model parameters.
  strcat(usage, "     --hb\n");
  strcat(usage, "     --model single|average|jc|k2|f81|f84|hky|tn");
  strcat(usage, " (default=f81)\n");
  strcat(usage, "     --pur-pyr <float> (default=1.0)\n");
  strcat(usage, "     --transition-transversion <float> (default=0.5)\n");
  strcat(usage, "     --bg <float> (default=1.0)\n");
  strcat(usage, "     --fg <float> (default=1.0)\n");

  // Motif parameters.
  strcat(usage, "     --motif <id> (default=all)\n");
  strcat(usage, "     --motif-name <string> (default from motif file)\n");

  // Miscellaneous parameters
  strcat(usage, "     --bgfile <background> (default from motif file)\n");
  strcat(usage, "     --pseudocount <float> (default=0.1)\n");
  strcat(usage, "     --ustar <label>\n");	// TLB; create uniform star tree
  strcat(usage, "     --verbosity [1|2|3|4] (default 2)\n");
  strcat(usage, "\n    Prints the FP and FN rate at each of 10000 score values.\n");
  strcat(usage, "\n    Output format: [<motif_id> score <score> FPR <fpr> TPR <tpr>]+\n");

  // Parse the command line.
  if (simple_setopt(argc, argv, n_options, pmp_options) != NO_ERROR) {
    die("Error processing command line options: option name too long.\n");
  }

  while (TRUE) { 
    int c = 0;
    char* option_name = NULL;
    char* option_value = NULL;
    const char * message = NULL;

    // Read the next option, and break if we're done.
    c = simple_getopt(&option_name, &option_value, &option_index);
    if (c == 0) {
      break;
    } else if (c < 0) {
      (void) simple_getopterror(&message);
      die("Error processing command line options (%s)\n", message);
    }
    
    if (strcmp(option_name, "model") == 0) {
      if (strcmp(option_value, "jc") == 0) {
        model_type = JC_MODEL;
      } else if (strcmp(option_value, "k2") == 0) {
        model_type = K2_MODEL;
      } else if (strcmp(option_value, "f81") == 0) {
        model_type = F81_MODEL;
      } else if (strcmp(option_value, "f84") == 0) {
        model_type = F84_MODEL;
      } else if (strcmp(option_value, "hky") == 0) {
        model_type = HKY_MODEL;
      } else if (strcmp(option_value, "tn") == 0) {
        model_type = TAMURA_NEI_MODEL;
      } else if (strcmp(option_value, "single") == 0) {
        model_type = SINGLE_MODEL;
      } else if (strcmp(option_value, "average") == 0) {
        model_type = AVERAGE_MODEL;
      } else {
        die("Unknown model: %s\n", option_value);
      }
    } else if (strcmp(option_name, "hb") == 0){
        use_halpern_bruno = TRUE;
    } else if (strcmp(option_name, "ustar") == 0){	// TLB; create uniform star tree
        ustar_label = option_value;
    } else if (strcmp(option_name, "pur-pyr") == 0){
        purine_pyrimidine = atof(option_value);
    } else if (strcmp(option_name, "transition-transversion") == 0){
        transition_transversion = atof(option_value);
    } else if (strcmp(option_name, "bg") == 0){
      bg_rate = atof(option_value);
    } else if (strcmp(option_name, "fg") == 0){
      fg_rate = atof(option_value);
    } else if (strcmp(option_name, "motif") == 0){
        if (selected_motifs == NULL) {
          selected_motifs = new_string_list();
        }
       add_string(option_value, selected_motifs);
    } else if (strcmp(option_name, "motif-name") == 0){
        motif_name = option_value;
    } else if (strcmp(option_name, "bgfile") == 0){
      bg_filename = option_value;
    } else if (strcmp(option_name, "pseudocount") == 0){
        pseudocount = atof(option_value);
    } else if (strcmp(option_name, "verbosity") == 0){
        verbosity = atoi(option_value);
    }
  }

  // Must have tree and motif file names
  if (argc != option_index + 2) {
    fprintf(stderr, "%s", usage);
    exit(EXIT_FAILURE);
  } 

  /**********************************************
   * Read the phylogenetic tree.
   **********************************************/
  char* tree_filename = NULL;
  TREE_T* tree = NULL;
  tree_filename = argv[option_index];
  option_index++;
  tree = read_tree_from_file(tree_filename);

  // get the species names
  STRING_LIST_T* alignment_species = make_leaf_list(tree);
  char *root_label = get_label(tree);	// in case target in center
  if (strlen(root_label)>0) add_string(root_label, alignment_species);
  //write_string_list(" ", alignment_species, stderr);

  // TLB; Convert the tree to a uniform star tree with
  // the target sequence at its center.
  if (ustar_label != NULL) {
    tree = convert_to_uniform_star_tree(tree, ustar_label);
    if (tree == NULL) 
      die("Tree or alignment missing target %s\n", ustar_label);
    if (verbosity >= NORMAL_VERBOSE) {
      fprintf(stderr, 
	"Target %s placed at center of uniform (d=%.3f) star tree:\n", 
          ustar_label, get_total_length(tree) / get_num_children(tree) 
      );
      write_tree(tree, stderr);
    }
  }

  /**********************************************
   * Read the motifs.
   **********************************************/
  char* meme_filename = argv[option_index];
  option_index++;
  int num_motifs = 0; 

  MREAD_T *mread;
  ALPH_T alph;
  ARRAYLST_T *motifs;
  ARRAY_T *bg_freqs;

  mread = mread_create(meme_filename, OPEN_MFILE);
  mread_set_bg_source(mread, bg_filename);
  mread_set_pseudocount(mread, pseudocount);
  // read motifs
  motifs = mread_load(mread, NULL);
  alph = mread_get_alphabet(mread);
  bg_freqs = mread_get_background(mread);
  // check
  if (arraylst_size(motifs) == 0) die("No motifs in %s.", meme_filename);

  

  // TLB; need to resize bg_freqs array to ALPH_SIZE items
  // or copy array breaks in HB mode.  This throws away
  // the freqs for the ambiguous characters;
  int asize = alph_size(alph, ALPH_SIZE);
  resize_array(bg_freqs, asize);

  /**************************************************************
  * Compute probability distributions for each of the selected motifs.
  **************************************************************/
  int motif_index;
  for (motif_index = 0; motif_index < arraylst_size(motifs); motif_index++) {

    MOTIF_T* motif = (MOTIF_T*)arraylst_get(motif_index, motifs);
    char* motif_id = get_motif_id(motif);
    char* bare_motif_id = motif_id;

    // We may have specified on the command line that
    // only certain motifs were to be used.
    if (selected_motifs != NULL) {
      if (*bare_motif_id == '+' || *bare_motif_id == '-') {
        // The selected  motif id won't included a strand indicator.
        bare_motif_id++;
      }
      if (have_string(bare_motif_id, selected_motifs) == FALSE) {
        continue;
      }
    }

    if (verbosity >= NORMAL_VERBOSE) {
      fprintf(
        stderr, 
        "Using motif %s of width %d.\n",
        motif_id, get_motif_length(motif)
      );
    }

    // Build an array of evolutionary models for each position in the motif.
    EVOMODEL_T** models = make_motif_models(
      motif, 
      bg_freqs,
      model_type,
      fg_rate, 
      bg_rate, 
      purine_pyrimidine, 
      transition_transversion, 
      use_halpern_bruno
    );

    // Get the frequencies under the background model (row 0) 
    // and position-dependent scores (rows 1..w)
    // for each possible alignment column.
    MATRIX_T* pssm_matrix = build_alignment_pssm_matrix(
      alph,
      alignment_species,
      get_motif_length(motif) + 1, 
      models, 
      tree, 
      gap_support
    );
    ARRAY_T* alignment_col_freqs = allocate_array(get_num_cols(pssm_matrix)); 
    copy_array(get_matrix_row(0, pssm_matrix), alignment_col_freqs);
    remove_matrix_row(0, pssm_matrix);		// throw away first row
    //print_col_frequencies(alph, alignment_col_freqs);

    //
    // Get the position-dependent null model alignment column frequencies
    //
    int w = get_motif_length(motif);
    int ncols = get_num_cols(pssm_matrix); 
    MATRIX_T* pos_dep_bkg = allocate_matrix(w, ncols);
    for (i=0; i<w; i++) {
      // get the evo model corresponding to this column of the motif
      // and store it as the first evolutionary model.
      myfree(models[0]);
      // Use motif PSFM for equilibrium freqs. for model.
      ARRAY_T* site_specific_freqs = allocate_array(asize);
      int j = 0;
      for(j = 0; j < asize; j++) {
	double value = get_matrix_cell(i, j, get_motif_freqs(motif));
	set_array_item(j, value, site_specific_freqs);
      }
      if (use_halpern_bruno == FALSE) {
	models[0] = make_model(
	  model_type,
	  fg_rate,
	  transition_transversion,
	  purine_pyrimidine,
	  site_specific_freqs,
          NULL
	);
      } else {
        models[0] = make_model(
	  model_type,
	  fg_rate,
	  transition_transversion,
	  purine_pyrimidine,
	  bg_freqs,
	  site_specific_freqs
	);
      }
      // get the alignment column frequencies using this model
      MATRIX_T* tmp_pssm_matrix = build_alignment_pssm_matrix(
        alph,
	alignment_species,
	2,				// only interested in freqs under bkg
	models, 
	tree, 
	gap_support
      );
      // assemble the position-dependent background alignment column freqs.
      set_matrix_row(i, get_matrix_row(0, tmp_pssm_matrix), pos_dep_bkg);
      // chuck the pssm (not his real name)
      free_matrix(tmp_pssm_matrix);
    }

    //
    // Compute and print the score distribution under the background model
    // and under the (position-dependent) motif model.
    //
    int range = 10000;	// 10^4 gives same result as 10^5, but 10^3 differs

    // under background model
    PSSM_T* pssm = build_matrix_pssm(alph, pssm_matrix, alignment_col_freqs, range);

    // under position-dependent background (motif) model
    PSSM_T* pssm_pos_dep = build_matrix_pssm(alph, pssm_matrix, alignment_col_freqs, range);
    get_pv_lookup_pos_dep(
      pssm_pos_dep, 
      pos_dep_bkg, 
      NULL // no priors used
    );

    // print FP and FN distributions
    int num_items = get_pssm_pv_length(pssm_pos_dep);
    for (i=0; i<num_items; i++) {
      double pvf = get_pssm_pv(i, pssm);
      double pvt = get_pssm_pv(i, pssm_pos_dep);
      double fpr = pvf;
      double fnr = 1 - pvt;
      if (fpr >= 0.99999 || fnr == 0) continue;
      printf("%s score %d FPR %.3g FNR %.3g\n", motif_id, i, fpr, fnr);
    }

    // free stuff
    free_pssm(pssm);
    free_pssm(pssm_pos_dep);
    if (models != NULL) {
      int model_index;
      int num_models = get_motif_length(motif) + 1;
      for (model_index = 0; model_index < num_models; model_index++) {
        free_model(models[model_index]);
      }
      myfree(models);
    }

  } // motif

  arraylst_destroy(destroy_motif, motifs);

  /**********************************************
   * Clean up.
   **********************************************/
  // TLB may have encountered a memory corruption bug here
  // CEG has not been able to reproduce it. valgrind says all is well.
  free_array(bg_freqs);
  free_tree(TRUE, tree);
  free_string_list(selected_motifs);

  return(0);
} // main
コード例 #5
0
ファイル: build-hmm.c プロジェクト: rreja/CRM_discovery
/*************************************************************************
 * Build a completely connected HMM.
 *************************************************************************/
void build_complete_hmm
  (ARRAY_T* background,
   int spacer_states, 
   MOTIF_T *motifs,
   int nmotifs,
   MATRIX_T *transp_freq,
   MATRIX_T *spacer_ave,
   BOOLEAN_T fim,
   MHMM_T **the_hmm)
{
  ALPH_T    alph;
  int motif_states; // Total length of the motifs.
  int num_spacers;  // Total number of spacer states.
  int num_states;   // Total number of states in the model.
  int i_motif;      // Index of the current "from" motif.
  int j_motif;      // Index of the current "to" motif.
  int i_position;   // Index within the current motif or spacer.
  int i_state = 0;  // Index of the current state.

  assert(nmotifs > 0);
  alph = get_motif_alph(motifs);// get the alphabet from the first motif

  // Count the width of the motifs.
  for (motif_states = 0, i_motif = 0; i_motif < nmotifs; i_motif++)
    motif_states += get_motif_length(motif_at(motifs, i_motif));
  // Count the spacer states adjacent to begin and end.
  num_spacers = nmotifs * 2;
  // Add the spacer states between motifs.
  num_spacers += nmotifs * nmotifs;
  // Total states = motifs + spacer_states + begin/end
  num_states = motif_states + (num_spacers * spacer_states) + 2;

  // Allocate the model.
  *the_hmm = allocate_mhmm(alph, num_states);

  // Record that this is a completely connected model.
  (*the_hmm)->type = COMPLETE_HMM;

  // Record the number of motifs in the model.
  (*the_hmm)->num_motifs = nmotifs;

  // Record the number of states in the model.
  (*the_hmm)->num_states = num_states;
  (*the_hmm)->num_spacers = ((nmotifs + 1) * (nmotifs + 1)) - 1;
  (*the_hmm)->spacer_states = spacer_states;

  // Put the background distribution into the model.
  copy_array(background, (*the_hmm)->background);

  // Build the begin state.
  build_complete_state(
      START_STATE, 
      i_state,
      alph,
      0, // expected length
      NULL, // Emissions.
      0, // Number of sites.
      NON_MOTIF_INDEX,
      NON_MOTIF_POSITION,
      nmotifs,
      0, // previous motif
      0, // next motif
      transp_freq,
      spacer_states,
      num_spacers,
      motifs,
      &((*the_hmm)->states[i_state]));
  i_state++;

  int from_motif_state, to_motif_state;
  // Build the spacer states. No transitions from the end state.
  for (i_motif = 0; i_motif <= nmotifs; i_motif++) {
    // No transitions to the start state.
    for (j_motif = 1; j_motif <= nmotifs+1; j_motif++) {
      // No transitions from start to end.
      if ((i_motif == 0) && (j_motif == nmotifs+1))
        continue;
      // Allow multi-state spacers.
      for (i_position = 0; i_position < spacer_states; i_position++, i_state++) {
        build_complete_state(
            SPACER_STATE, 
            i_state, 
            alph,
            get_matrix_cell(i_motif, j_motif, spacer_ave),
            background,
            SPACER_NUMSITES,
            NON_MOTIF_INDEX,
            i_position,
            nmotifs,
            i_motif,
            j_motif,
            transp_freq,
            spacer_states,
            num_spacers,
            motifs,
            &((*the_hmm)->states[i_state]));
      }
    }
  }

  // Build the motif states.
  for (i_motif = 0; i_motif < nmotifs; i_motif++) {
    MOTIF_T *this_motif = motif_at(motifs, i_motif);
    STATE_T state;
    for (i_position = 0; i_position < get_motif_length(this_motif); i_position++, i_state++) {
      if (i_position == 0) {
        state = START_MOTIF_STATE;
      } else if (i_position == (get_motif_length(this_motif) - 1)) {
        state = END_MOTIF_STATE;
      } else {
        state = MID_MOTIF_STATE;
      }
      build_complete_state(
          MID_MOTIF_STATE, 
          i_state,
          alph,
          0, // Expected spacer length. 
          get_matrix_row(i_position, get_motif_freqs(this_motif)),
          get_motif_nsites(this_motif),
          i_motif,
          i_position, 
          nmotifs,
          0, // Previous motif index.
          0, // Next motif index.
          transp_freq,
          spacer_states,
          num_spacers,
          motifs,
          &((*the_hmm)->states[i_state]));
    }
  }

  // Build the end state.
  build_complete_state(
      END_STATE, 
      i_state,
      alph,
      0, // Expected spacer length.
      NULL, // Emissions
      0, // Number of sites.
      NON_MOTIF_INDEX,
      NON_MOTIF_POSITION,
      nmotifs,
      0, // Previous motif index.
      0, // Next motif index.
      transp_freq,
      spacer_states,
      num_spacers,
      motifs,
      &((*the_hmm)->states[i_state]));
  i_state++;

  // Convert spacers to FIMs if requested.
  if (fim) {
    convert_to_fims(*the_hmm);
  }

  // Fill in the transition matrix.
  build_transition_matrix(*the_hmm);
}
コード例 #6
0
ファイル: build-hmm.c プロジェクト: rreja/CRM_discovery
/*************************************************************************
 * Build a linear HMM.
 *************************************************************************/
void build_linear_hmm
  (ARRAY_T*  background,
   ORDER_T*  order_spacing,
   int       spacer_states, 
   RBTREE_T* motifs, // motifs with key as in order_spacing
   BOOLEAN_T fim,
   MHMM_T**  the_hmm)
{
  ALPH_T    alph;
  int       model_length; // Total number of states in the model.
  int       i_state;      // Index of the current state.
  int       i_order;      // Index within the order and spacing.
  int       i_position;   // Index within the current motif or spacer.
  int       motif_i;      // motif key in order spacing
  MOTIF_T  *motif;        // motif
  RBNODE_T *node;

  alph = get_motif_alph((MOTIF_T*)rbtree_value(rbtree_first(motifs)));

  // Calculate the total length of the model.
  model_length = 2; // start and end state
  for (i_order = 0; i_order < get_order_occurs(order_spacing); i_order++) {
    motif_i = get_order_motif(order_spacing, i_order);
    motif = (MOTIF_T*)rbtree_get(motifs, &motif_i);
    model_length += get_motif_length(motif);
  }
  model_length += (get_order_occurs(order_spacing) + 1) * spacer_states;


  // Allocate the model.
  *the_hmm = allocate_mhmm(alph, model_length);
  check_sq_matrix((*the_hmm)->trans, model_length);

  // Record that this is a linear model.
  (*the_hmm)->type = LINEAR_HMM;

  // Record the number of motifs in the model. 
  // It doesn't want the distinct count
  (*the_hmm)->num_motifs = get_order_occurs(order_spacing);

  // Record the number of states in the model.
  (*the_hmm)->num_states = model_length;
  (*the_hmm)->num_spacers = get_order_occurs(order_spacing) + 1;
  (*the_hmm)->spacer_states = spacer_states;

  // Put the background distribution into the model.
  copy_array(background, (*the_hmm)->background);

  // Begin the model with a non-emitting state.
  i_state = 0;
  check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states);
  build_linear_state(
      alph,
      START_STATE,
      i_state,
      get_spacer_length(order_spacing, 0),
      NULL, // Emissions.
      0, // Number of sites.
      NON_MOTIF_INDEX,
      NON_MOTIF_POSITION, // position within state (not relevant to start state)
      NULL, // no motif
      &((*the_hmm)->states[i_state]));
  ++i_state;

  // Build the first spacer.
  for (i_position = 0; i_position < spacer_states; i_position++, i_state++) {
    check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states);
    build_linear_state(
        alph,
        SPACER_STATE,
        i_state, 
        get_spacer_length(order_spacing, 0),
        background, 
        SPACER_NUMSITES,
        NON_MOTIF_INDEX,
        i_position, // position within spacer
        NULL, // no motif
        &((*the_hmm)->states[i_state]));
  }

  // Build each motif and subsequent spacer.
  for (i_order = 0; i_order < get_order_occurs(order_spacing); i_order++) {
    STATE_T state;
    int spacer_len;
    motif_i = get_order_motif(order_spacing, i_order);
    motif = (MOTIF_T*)rbtree_get(motifs, &motif_i);

    // Build the motif.
    for (i_position = 0; i_position < get_motif_length(motif); i_position++, i_state++) {
      if (i_position == 0) {
        state = START_MOTIF_STATE;
        spacer_len = get_spacer_length(order_spacing, i_order);
      } else if (i_position == (get_motif_length(motif) - 1)) {
        state = END_MOTIF_STATE;
        spacer_len = get_spacer_length(order_spacing, i_order+1);
      } else {
        state = MID_MOTIF_STATE;
        spacer_len = 0;
      }
      check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states);
      build_linear_state(
          alph, 
          state, 
          i_state, 
          spacer_len, // Expected spacer length.
          get_matrix_row(i_position, get_motif_freqs(motif)),
          get_motif_nsites(motif),
          i_order,
          i_position, // position within motif (middle)
          motif,
          &((*the_hmm)->states[i_state]));
    }

    // Build the following spacer.
    for (i_position = 0; i_position < spacer_states; i_position++, i_state++) {
      check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states);
      build_linear_state(
          alph, 
          SPACER_STATE, 
          i_state, 
          get_spacer_length(order_spacing, i_order+1),
          background,
          SPACER_NUMSITES,
          NON_MOTIF_INDEX, 
          i_position, // position within spacer
          NULL, // no motif
          &((*the_hmm)->states[i_state]));
    }
  }

  check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states);
  // Finish up the model with a non-emitting end state.
  build_linear_state(
      alph, 
      END_STATE, 
      i_state, 
      get_spacer_length(order_spacing, i_order),
      NULL, // Emissions.
      0, // Number of sites.
      NON_MOTIF_INDEX,
      NON_MOTIF_POSITION, // position within state (not relevant to end state)
      NULL, // no motif
      &((*the_hmm)->states[i_state]));
  ++i_state;
  assert(i_state == model_length);

  check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states);
  // Convert spacers to FIMs if requested.
  if (fim) {
    convert_to_fims(*the_hmm);
  }

  // Fill in the transition matrix.
  build_transition_matrix(*the_hmm);
}
コード例 #7
0
ファイル: build-hmm.c プロジェクト: rreja/CRM_discovery
/*************************************************************************
 * Build a star topology HMM.
 *************************************************************************/
void build_star_hmm
  (ARRAY_T*  background,
   int       spacer_states, 
   MOTIF_T*  motifs,
   int       nmotifs,
   BOOLEAN_T fim,
   MHMM_T**  the_hmm)
{
  ALPH_T alph;
  int       motif_states; /* Total length of the motifs. */
  int       num_spacers;  /* Total number of spacer states. */
  int       num_states;   /* Total number of states in the model. */
  int       i_motif;      /* Index of the current "from" motif. */
  int       i_position;   /* Index within the current motif or spacer. */
  int       i_state = 0;  /* Index of the current state. */

  alph = get_motif_alph(motif_at(motifs, 0));

  /* Count the width of the motifs. */
  for (motif_states = 0, i_motif = 0; i_motif < nmotifs; i_motif++)
    motif_states += get_motif_length(motif_at(motifs, i_motif));
  // Only 1 spacer.
  num_spacers = 1;
  /* Total states = motifs + spacer_states + begin/end */
  num_states = motif_states + (num_spacers * spacer_states) + 2;
  /* fprintf(stderr, "motif_states=%d num_spacers=%d num_states=%d\n",
	  motif_states, num_spacers, num_states); */

  /* Allocate the model. */
  *the_hmm = allocate_mhmm(alph, num_states);

  /* Record that this is a star model. */
  (*the_hmm)->type = STAR_HMM;

  /* Record the number of motifs in the model. */
  (*the_hmm)->num_motifs = nmotifs;

  /* Record the number of states in the model. */
  (*the_hmm)->num_states = num_states;
  (*the_hmm)->num_spacers = 1;
  (*the_hmm)->spacer_states = spacer_states;

  // Put the background distribution into the model.
  copy_array(background, (*the_hmm)->background);

  /* Build the begin state. */
  build_star_state(
    alph,
    START_STATE,
		i_state,
		0, // expected length
		NULL,
		0, // Number of sites.
		NON_MOTIF_INDEX,
		NON_MOTIF_POSITION,
		nmotifs,
		spacer_states,
		motifs,
		&((*the_hmm)->states[i_state])
  );
  i_state++;

  // Build the spacer state (state 0).  Allow multi-state spacers.
  for (i_position = 0; i_position < spacer_states; i_position++) {
    build_star_state(
      alph,
      SPACER_STATE, 
		  i_state, 
		  DEFAULT_SPACER_LENGTH,
		  background,
		  SPACER_NUMSITES,
		  NON_MOTIF_INDEX,
		  i_position,
		  nmotifs,
		  spacer_states,
		  motifs,
		  &((*the_hmm)->states[i_state])
    );
    i_state++;
  }

  /* Build the motif states. */
  for (i_motif = 0; i_motif < nmotifs; i_motif++) {
    MOTIF_T *this_motif = motif_at(motifs, i_motif);
    assert(get_motif_length(this_motif) > 1);
    i_position = 0;
    build_star_state(
      alph,
      START_MOTIF_STATE, 
		  i_state,
		  0, // Expected spacer length.
		  get_matrix_row(i_position, get_motif_freqs(this_motif)),
		  get_motif_nsites(this_motif),
		  i_motif,
		  i_position,
		  nmotifs,
		  spacer_states,
		  motifs,
		  &((*the_hmm)->states[i_state])
    );
    i_state++;
    for (i_position = 1; i_position < get_motif_length(this_motif) - 1; i_position++) {
      build_star_state(
        alph,
        MID_MOTIF_STATE, 
		    i_state,
		    0, // Expected spacer length. 
		    get_matrix_row(i_position, get_motif_freqs(this_motif)),
		    get_motif_nsites(this_motif),
		    i_motif,
		    i_position, 
		    nmotifs,
		    spacer_states,
		    motifs,
		    &((*the_hmm)->states[i_state])
      );
      i_state++;
    }
    build_star_state(
      alph,
      END_MOTIF_STATE, 
		  i_state,
		  0, // Expected spacer length.
		  get_matrix_row(i_position, get_motif_freqs(this_motif)),
		  get_motif_nsites(this_motif),
		  i_motif,
		  i_position,
		  nmotifs,
		  spacer_states,
		  motifs,
		  &((*the_hmm)->states[i_state])
    );
    i_state++;
  }

  /* Build the end state. */
  build_star_state(
    alph,
    END_STATE, 
		i_state,
		0, // Expected spacer length.
		NULL, // Emissions
		0, // Number of sites.
		NON_MOTIF_INDEX,
		NON_MOTIF_POSITION,
		nmotifs,
		spacer_states,
		motifs,
		&((*the_hmm)->states[i_state])
  );
  i_state++;

  /* Convert spacers to FIMs if requested. */
  if (fim) {
    convert_to_fims(*the_hmm);
  }

  /* Fill in the transition matrix. */
  build_transition_matrix(*the_hmm);
} // build_star_hmm
コード例 #8
0
ファイル: ramen.c プロジェクト: CPFL/gmeme
void ramen_scan_sequences() {
		FILE* seq_file = NULL;
		MOTIF_T* motif = NULL;
		MOTIF_T* rev_motif = NULL;
		SEQ_T* sequence = NULL;
		SCANNED_SEQUENCE_T* scanned_seq = NULL;
		PATTERN_T* pattern;
		int i;
		int j;
		SEQ_T** seq_list;
		int num_seqs;
		int seq_len;
		//For the bdb_bg mode:
		ARRAY_T* seq_bg_freqs;
		double atcontent;
		double roundatcontent;
		double avg_seq_length = 0;

		//Open the file.
		if (open_file(args.sequence_filename, "r", FALSE, "FASTA", "sequences", &seq_file) == 0) {
				fprintf(stderr, "Couldn't open the file %s.\n", args.sequence_filename);
				ramen_terminate(1);
		}

		//Start reading in the sequences
		read_many_fastas(ramen_alph, seq_file, MAX_SEQ_LENGTH, &num_seqs, &seq_list);


		seq_ids = new_string_list();
		seq_fscores = allocate_array(num_seqs);

		//Allocate the required space for results
		results = malloc(sizeof(double*) * motifs.num);
		for (i=0;i<motifs.num;i++) {
				results[i] = malloc(sizeof(double)*num_seqs);
		}

		for (j=0;j<num_seqs;j++) {

				fprintf(stderr, "\rScanning %i of %i sequences...", j+1, num_seqs);

				//copy the pointer into our current object for clarity
				sequence = seq_list[j];

				//Read the fluorescence data from the description field.
				add_string(get_seq_name(sequence),seq_ids);
				seq_len = get_seq_length(sequence);
				set_array_item(j,atof(get_seq_description(sequence)),seq_fscores);

				//Scan with each motif.
				for (i=0;i<motifs.num;i++) {
						int motifindex = i*2;

						results[i][j] = ramen_sequence_scan(sequence, motif_at(motifs.motifs, motifindex), 
											      motif_at(motifs.motifs, motifindex+1),
											      NULL, NULL, //No need to pass PSSM.
										              AVG_ODDS, 0, TRUE, 0, motifs.bg_freqs);

						if (TRUE == args.linreg_normalise) {
								int k;
								double maxscore = 1;
								motif = motif_at(motifs.motifs,motifindex); 
								for (k=0;k<get_motif_length(motif);k++) {
										double maxprob = 0;
										if (maxprob < get_matrix_cell(k, alph_index(ramen_alph, 'A'), get_motif_freqs(motif)))
												maxprob = get_matrix_cell(k, alph_index(ramen_alph, 'A'), get_motif_freqs(motif));
										if (maxprob < get_matrix_cell(k, alph_index(ramen_alph, 'C'), get_motif_freqs(motif)))
												maxprob = get_matrix_cell(k, alph_index(ramen_alph, 'C'), get_motif_freqs(motif));
										if (maxprob < get_matrix_cell(k, alph_index(ramen_alph, 'G'), get_motif_freqs(motif)))
												maxprob = get_matrix_cell(k, alph_index(ramen_alph, 'G'), get_motif_freqs(motif));
										if (maxprob < get_matrix_cell(k, alph_index(ramen_alph, 'T'), get_motif_freqs(motif)))
												maxprob = get_matrix_cell(k, alph_index(ramen_alph, 'T'), get_motif_freqs(motif));
										maxscore *= maxprob;
								}
								results[i][j] /= maxscore;
						}
				}
		}

}
コード例 #9
0
ファイル: ramen_scan.c プロジェクト: CPFL/gmeme
/*************************************************************************
 * Calculate the odds score for each motif-sized window at each
 * site in the sequence using the given nucleotide frequencies.
 *
 * This function is a lightweight version based on the one contained in
 * motiph-scoring. Several calculations that are unnecessary for gomo
 * have been removed in order to speed up the process
 *************************************************************************/
double score_sequence(
		SEQ_T*        seq,		// sequence to scan (IN)
		MOTIF_T*      motif,		// motif already converted to odds values (IN)
		PSSM_T*       pssm,		// motif pssm
		char*         feature_type,	// motif name (IN)
		char*         seq_name,		// sequence name (IN)
		int 		  method,	// method used for scoring (IN)
		double 		  threshold,	// Threshold to use in TOTAL_HITS mode with a PWM
		ARRAY_T*	  bg_freqs 	//background model
	)
{

	assert(seq != NULL);
	assert(motif != NULL);

	// Get data for sequence
	if (seq_name == NULL) {
		seq_name = get_seq_name(seq);
	}
	char* raw_seq = get_raw_sequence(seq);
	int seq_length = get_seq_length(seq);

	// Get the pv lookup table
	ARRAY_T* pv_lookup = NULL;
	if (NULL != pssm) {
		pv_lookup = pssm->pv;
		assert(get_array_length(pv_lookup) > 0);
        }

	// Prepare storage for the string representing the portion
	// of the reference sequence within the window.
	char* window_seq = (char *) mm_malloc(sizeof(char) * (get_motif_length(motif) + 1));
	window_seq[get_motif_length(motif)] = '\0';

	// Get a character for the strand.
	char strand = 0;
	if (get_motif_id(motif)!= NULL) {
		get_motif_strand(motif);
	} else {
		strand = '.';
	}

	int max_index = seq_length - get_motif_length(motif);
	const int asize = alph_size(get_motif_alph(motif), ALPH_SIZE);
	double* odds =  (double*) mm_malloc(sizeof(double)*max_index);
	double* scaled_log_odds =  (double*) mm_malloc(sizeof(double)*max_index);

	// For each site in the sequence
	int seq_index;
	for (seq_index = 0; seq_index < max_index; seq_index++) {
		BOOLEAN_T skip_window = FALSE;
		BOOLEAN_T found_gap = FALSE;
		BOOLEAN_T found_ambiguity = FALSE;
		double odd = 1.0;
		scaled_log_odds[seq_index] = 0;

		// For each site in the motif window
		int motif_position;
		for (motif_position = 0; motif_position < get_motif_length(motif); motif_position++) {
			char c = raw_seq[seq_index + motif_position];
			window_seq[motif_position] = c;

			// Check for gaps at this site
			if(c == '-' || c == '.') {
				found_gap = TRUE;
				skip_window = TRUE;
				break;
			}

			// Check for ambiguity codes at this site
			//TODO: This next call is very expensive - it takes up approx. 10% of a
			//      programme's running time. It should be fixed up somehow.
			int aindex = alph_index(get_motif_alph(motif), c);
			if (aindex > asize) {
				found_ambiguity = TRUE;
				skip_window = TRUE;
				break;
			}
			if (method == TOTAL_HITS) {
				//If we're in this mode, then we're using LOG ODDS.
				//scaled_log_odds[seq_index] += get_matrix_cell(motif_position, aindex, get_motif_freqs(motif));
				scaled_log_odds[seq_index] += get_matrix_cell(motif_position, aindex, pssm->matrix);
			} else {
				odd *= get_matrix_cell(motif_position, aindex, get_motif_freqs(motif));
			}
		}
		odds[seq_index] = odd;
	}

	// return odds as requested (MAX or AVG scoring)
	double requested_odds = 0.0;
	if (method == AVG_ODDS){
		for (seq_index = 0; seq_index < max_index; seq_index++) {
			requested_odds += odds[seq_index];
		}
		requested_odds /= max_index;
	} else if (method == MAX_ODDS){
		for (seq_index = 0; seq_index < max_index; seq_index++) {
			if (odds[seq_index] > requested_odds){
				requested_odds = odds[seq_index];
			}
		}
	} else if (method == SUM_ODDS) {
		for (seq_index = 0; seq_index < max_index; seq_index++) {
			requested_odds += odds[seq_index];
		}
	} else if (method == TOTAL_HITS) {
		for (seq_index = 0; seq_index < max_index; seq_index++) {

			if (scaled_log_odds[seq_index] >= (double)get_array_length(pv_lookup)) {
				scaled_log_odds[seq_index] = (double)(get_array_length(pv_lookup) - 1);
			} 
			double pvalue = get_array_item((int) scaled_log_odds[seq_index], pv_lookup);
			
			//Figure out how to calculate the p-value of a hit
			//fprintf(stderr, "m: %s pv_l len: %i scaled_log_odds: %g seq index: %i pvalue: %g\n", 
			//	get_motif_id(motif), get_array_length(pv_lookup), scaled_log_odds[seq_index], seq_index, pvalue);

			if (pvalue < threshold) {
				requested_odds++; //Add another hit.
			}

			if (verbosity > HIGHER_VERBOSE) {
			fprintf(stderr, "Window Data: %s\t%s\t%i\t%g\t%g\t%g\n",
					get_seq_name(seq), get_motif_id(motif), seq_index, scaled_log_odds[seq_index], pvalue, threshold);
			}
		}
	}

	myfree(odds);
	myfree(scaled_log_odds);
	myfree(window_seq);
	return requested_odds;
}