示例#1
0
/****************************************************************************
 * Remove from an alignment any sequence whose ID is not in a given list.
 *
 * Returns a newly allocated alignment built from copies of the retained
 * sequences, in the order in which their IDs appear in seqs_to_keep.
 * The input alignment is not modified.
 *
 * N.B. It is NOT an error for the given list to contain sequence IDs that 
 * are not in the alignment.
 *
 * N.B. An ID that occurs more than once in seqs_to_keep is copied once per
 * occurrence.
 ****************************************************************************/
ALIGNMENT_T* remove_alignment_seqs
  (STRING_LIST_T* seqs_to_keep,
   ALIGNMENT_T*   alignment)
{
  // Extract the names of the sequences in the alignment.
  // NOTE(review): alignment_species is never released in this function;
  // confirm whether get_species_names() hands ownership to the caller.
  STRING_LIST_T* alignment_species = get_species_names(alignment);
  int num_species = get_num_strings(alignment_species);
  int i_species;

  // Report the sequences that are about to be dropped.
  for (i_species = 0; i_species < num_species; i_species++) {
    char* this_species = get_nth_string(i_species, alignment_species);

    if (!have_string(this_species, seqs_to_keep)
        && verbosity >= NORMAL_VERBOSE) {
      fprintf(stderr, "Removing %s from alignment.\n", this_species);
    }
  }

  // Allocate space for the new sequences.  The copy loop below walks
  // seqs_to_keep, so the number of requested IDs is the true upper bound
  // on the number of sequences copied.  (Sizing the array from a count
  // taken over the alignment overflows when seqs_to_keep holds duplicate
  // IDs, and leaves unset slots when the alignment holds duplicate names.)
  int num_requested = get_num_strings(seqs_to_keep);
  SEQ_T** new_sequences
    = (SEQ_T**)mm_malloc(num_requested * sizeof(SEQ_T*));

  // Copy the sequences.
  int final_index = 0;
  for (i_species = 0; i_species < num_requested; i_species++) {
    char* this_species = get_nth_string(i_species, seqs_to_keep);

    // If the requested ID is in the alignment, then copy over the sequence.
    if (have_string(this_species, alignment_species)) {
      SEQ_T* this_seq
        = get_alignment_sequence_by_name(this_species, alignment);
      new_sequences[final_index] = copy_sequence(this_seq);
      final_index++;
    }
  }

  // Allocate and return the new alignment, reporting exactly as many
  // sequences as were copied.
  char *consensus = NULL;
  copy_string(&consensus, get_consensus_string(alignment));
  ALIGNMENT_T* new_alignment
    = allocate_alignment(get_alignment_name(alignment),
                         get_alignment_description(alignment),
                         final_index,
                         new_sequences,
                         consensus);

  return(new_alignment);
}
示例#2
0
文件: subst-matrix.c 项目: CPFL/gmeme
/*
 * Read a substitution score matrix from a file.
 *
 * The file is parsed with read_rdb_matrix().  The alphabet is built from
 * the first character of each column name and returned through alpha1 as
 * a NUL-terminated string allocated here with mm_malloc (this function
 * does not free it).
 *
 * Exits with status 1 if the file cannot be opened.
 * Returns the matrix held by the parsed RDB matrix.
 */
MATRIX_T *read_score_matrix(
  char *score_filename,			/* name of score file */
  char **alpha1				/* alphabet in score matrix */
)
{
  int i;
  char *alpha;				/* alphabet in file */
  int alen;
  FILE *score_file;
  RDB_MATRIX_T *rdb_matrix;

  /* open the score file */
  if (open_file(score_filename, "r", FALSE, "score", "substitution scores", &score_file) == 0) 
    exit(1);

  /* read in the score file */
  rdb_matrix = read_rdb_matrix(" ", FALSE, 0, FALSE, NULL, score_file);

  /* The file is fully consumed at this point; release the handle.
     NOTE(review): the stdin guard assumes open_file() may return stdin
     for some filenames -- confirm against open_file(). */
  if (score_file != stdin) {
    fclose(score_file);
  }

  /* get alphabet: one letter per column, taken from the column name */
  alen = get_num_strings(rdb_matrix->col_names);
  alpha = (char *)mm_malloc(sizeof(char) * (alen+1));
  for (i=0; i<alen; i++) alpha[i] = get_nth_string(i, rdb_matrix->col_names)[0];
  alpha[i] = '\0';
  *alpha1 = alpha;			/* return alphabet */

  /* NOTE(review): only the inner matrix is returned; the enclosing
     rdb_matrix wrapper is not released here -- confirm intended
     ownership. */
  return(rdb_matrix->matrix);
} /* read_score_matrix */
示例#3
0
/****************************************************************************
 * Create a new alignment with any sequence that contains nothing but 
 * gap ('-') characters removed. Returns the new alignment.  Does not 
 * change the old alignment.
 * If there are no all-gap sequences, the returned alignment is the
 * same object as the original alignment.
 ****************************************************************************/
static ALIGNMENT_T* remove_allgap_sequences(ALIGNMENT_T* alignment)
{
  ALIGNMENT_T* new_alignment;
  int i_aln;
  int l_aln = get_num_aligned_sequences(alignment);
  STRING_LIST_T* keeper_seqs = new_string_list();

  // Identify the sequences that contain at least one non-gap character.
  for (i_aln=0; i_aln<l_aln; i_aln++) {
    SEQ_T* sequence = get_alignment_sequence(i_aln, alignment);
    int i_seq;
    int l_seq = get_seq_length(sequence);
    // Add sequence to keepers if it contains a non-gap.
    for (i_seq=0; i_seq<l_seq; i_seq++) {
      if (get_seq_char(i_seq, sequence) != '-') {           // not gap?
        add_string(get_seq_name(sequence), keeper_seqs);    // non-gap: keeper
        break;
      }
    }
  }

  // Remove any sequences not in keeper list.
  if (get_num_strings(keeper_seqs) < l_aln) {
    new_alignment = remove_alignment_seqs(keeper_seqs, alignment);
  } else {
    new_alignment = alignment;
  }

  // The keeper list is scratch data; release it on both paths (it used
  // to be leaked whenever no sequence had to be removed).
  free_string_list(keeper_seqs);

  return(new_alignment);
} // remove_allgap_sequences
示例#4
0
文件: ramen.c 项目: CPFL/gmeme
/*
 * Score every motif with the linear regression test and sort the results.
 *
 * Allocates the global rsr array (one result pointer per motif), fills it
 * by calling ramen_do_linreg_test() for each motif index, and sorts it
 * with ramen_compare_mse.  Progress is reported on stderr.
 * Exits if the result array cannot be allocated.
 */
void ramen_get_scores() {
  int i;

  //allocate space for final one result per motif array.
  rsr = malloc(sizeof(ramen_result_t*)*motifs.num);
  // malloc(0) may legitimately return NULL, so only treat NULL as a
  // failure when there is something to store.
  if (rsr == NULL && motifs.num > 0) {
    fprintf(stderr, "Unable to allocate memory for %i motif results.\n",
            motifs.num);
    exit(EXIT_FAILURE);
  }

  for(i=0;i<motifs.num;i++) {
    fprintf(stderr, "\rScoring %i of %i motifs...", i+1, motifs.num);
    rsr[i] = ramen_do_linreg_test(i);
  }

  //Order by MSE.
  qsort(rsr, motifs.num, sizeof(ramen_result_t*), ramen_compare_mse);

  fprintf(stderr, "\n");

}
示例#5
0
/***********************************************************************
 * Decide whether a motif belongs in the model.
 *
 * The selection criteria are tried in a fixed priority order and only
 * the first active one is consulted:
 *   1. an explicit list of requested motif IDs,
 *   2. an E-value threshold,
 *   3. membership in the order-and-spacing object,
 *   4. a complexity threshold.
 * When no criterion is active every motif is retained.
 *
 * FIXME: These tests needn't be mutually exclusive.
 ***********************************************************************/
static BOOLEAN_T retain_motif(
  STRING_LIST_T* requested_motifs, // IDs of motifs to include.
  double         e_threshold,      // E-value to include motifs. 
  double         complexity_threshold, // Complexity threshold to include.
  ORDER_T*       order_spacing,    // Motif order and spacing (linear HMM). 
  MOTIF_T*       motif             // The motif. 
) {
  const int n_wanted = get_num_strings(requested_motifs);

  // Criterion 1: the caller named the motifs explicitly.
  if (n_wanted > 0) {
    char* this_id = get_motif_id(motif);
    int idx = 0;
    while (idx < n_wanted) {
      if (strcmp(get_nth_string(idx, requested_motifs), this_id) == 0) {
        return(TRUE);
      }
      idx++;
    }
    return(FALSE);
  }

  // Criterion 2: keep motifs at or below the E-value threshold.
  if (e_threshold != 0.0) {
    return (get_motif_evalue(motif) <= e_threshold);
  }

  // Criterion 3: keep motifs mentioned in the order string.
  if (order_spacing != NULL) {
    return order_contains(get_motif_id(motif), order_spacing);
  }

  // Criterion 4: keep motifs at or above the complexity threshold.
  if (complexity_threshold != 0.0) {
    return(motif->complexity >= complexity_threshold);
  }

  // Nothing was requested: include every motif.
  return(TRUE);
}
示例#6
0
/*************************************************************************
 * int main
 *
 * Entry point for mhmm.  Parses the command line, reads motifs from the
 * MEME file named by the single positional argument, filters them
 * according to the options, builds a linear, complete or star HMM and
 * writes it (plus optional header and parameter summary) to stdout.
 *
 * Returns 0 on success.  Prints the usage message and exits with status
 * 1 when the positional argument is missing; other option errors are
 * reported through die().
 *************************************************************************/
int main(int argc, char *argv[])
{
  /* Data structures. */
  int       num_motifs;         /* The number of motifs in the model. */
  MOTIF_T   motifs[2 * MAX_MOTIFS]; /* The motifs. */
  STRING_LIST_T* motif_occurrences = NULL; /* Strings describing occurrences of
                                              motifs */
  BOOLEAN_T has_reverse_strand = FALSE;    /* MEME file contained both strands */
  ARRAY_T*  background;         /* Background probs for alphabet. */
  ORDER_T*  order_spacing;      /* Linear HMM order and spacing. */
  MATRIX_T* transp_freq = NULL; /* Matrix of inter-motif transitions freqs. */
  MATRIX_T* spacer_ave = NULL;  /* Matrix of average spacer lengths. */
  MHMM_T *  the_hmm = NULL;     /* The HMM being constructed. */

  /* Command line parameters. */
  char *    meme_filename;      /* Input file containg motifs. */
  char *    hmm_type_str;       /* HMM type. */
  HMM_T     hmm_type;
  STRING_LIST_T* requested_motifs; /* Indices of requested motifs. */
  int       request_n;          /* The user asked for the first n motifs. */
  double    e_threshold;        /* E-value threshold for motif inclusion. */
  double    complexity_threshold; // For eliminating low-complexity motifs.
  double    p_threshold;        /* p-value threshold for motif occurences. */
  char*     order_string;       /* Motif order and spacing. */
  int       spacer_states;      /* Number of states in each spacer. */
  BOOLEAN_T fim;                /* Represent spacers as free insertion
                                   modules? */
  BOOLEAN_T keep_unused;        // Drop unused inter-motif transitions?
  double    trans_pseudo;       /* Transition pseudocount. */
  double    spacer_pseudo;      // Spacer (self-loop) pseudocount.
  char*     description;        // Descriptive text to be stored in model.
  BOOLEAN_T print_header;       /* Print file header? */
  BOOLEAN_T print_params;       /* Print parameter summary? */
  BOOLEAN_T print_time;         /* Print timing data (dummy: always false). */

  /* Local variables. */
  int       i_motif;

  /**********************************************
   * COMMAND LINE PROCESSING
   **********************************************/
  // Define command line options.
  cmdoption const options[] = {
    {"type", OPTIONAL_VALUE},
    {"description", REQUIRED_VALUE},
    {"motif", REQUIRED_VALUE},
    {"nmotifs", REQUIRED_VALUE},
    {"ethresh", REQUIRED_VALUE},
    {"lowcomp", REQUIRED_VALUE},
    {"pthresh", REQUIRED_VALUE},
    {"order", REQUIRED_VALUE},
    {"nspacer", REQUIRED_VALUE},
    {"fim", NO_VALUE},
    {"keep-unused", NO_VALUE},
    {"transpseudo", REQUIRED_VALUE},
    {"spacerpseudo", REQUIRED_VALUE},
    {"verbosity", REQUIRED_VALUE},
    {"noheader", NO_VALUE},
    {"noparams", NO_VALUE},
    {"notime", NO_VALUE},
    {"quiet", NO_VALUE},
  };
  // Derive the count from the table so the two cannot drift apart.
  int option_count = (int)(sizeof(options) / sizeof(options[0]));
  int option_index = 0;

  // Define the usage message.
  static const char usage[] =
    "USAGE: mhmm [options] <MEME file>\n"
    "\n"
    "   Options:\n"
    "     --type [linear|complete|star] (default=linear)\n"
    "     --description <string> (may be repeated)\n"
    "     --motif <motif #> (may be repeated)\n"
    "     --nmotifs <#>\n"
    "     --ethresh <E-value>\n"
    "     --lowcomp <value>\n"
    "     --pthresh <p-value>\n"
    "     --order <string>\n"
    "     --nspacer <spacer length> (default=1)\n"
    "     --fim\n"
    "     --keep-unused\n"
    "     --transpseudo <pseudocount>\n"
    "     --spacerpseudo <pseudocount>\n"
    "     --verbosity 1|2|3|4|5 (default=2)\n"
    "     --noheader\n"
    "     --noparams\n"
    "     --notime\n"
    "     --quiet\n"
    "\n";

  /* Make sure various options are set to NULL or defaults. */
  meme_filename = NULL;
  hmm_type_str = NULL;
  hmm_type = INVALID_HMM;
  requested_motifs = new_string_list();
  request_n = 0;
  e_threshold = 0.0;
  complexity_threshold = 0.0;
  p_threshold = 0.0;
  order_string = NULL;
  spacer_states = DEFAULT_SPACER_STATES;
  fim = FALSE;
  keep_unused = FALSE;
  trans_pseudo = DEFAULT_TRANS_PSEUDO;
  spacer_pseudo = DEFAULT_SPACER_PSEUDO;
  description = NULL;
  print_header = TRUE;
  print_params = TRUE;
  print_time = FALSE;

  simple_setopt(argc, argv, option_count, options);

  // Parse the command line.
  while (1) { 
    int c = 0;
    char* option_name = NULL;
    char* option_value = NULL;
    const char * message = NULL;

    // Read the next option, and break if we're done.
    c = simple_getopt(&option_name, &option_value, &option_index);
    if (c == 0) {
      break;
    } else if (c < 0) {
      simple_getopterror(&message);
      die("Error processing command line options (%s)\n", message);
    }

    if (strcmp(option_name, "type") == 0) {
      // The value is optional; keep the default when it is absent.
      if (option_value != NULL) {
        hmm_type_str = option_value;
      }
    } else if (strcmp(option_name, "description") == 0) {
      description = option_value;
    } else if (strcmp(option_name, "motif") == 0) {
      add_string(option_value, requested_motifs);
    } else if (strcmp(option_name, "nmotifs") == 0) {
      request_n = atoi(option_value);
    } else if (strcmp(option_name, "ethresh") == 0) {
      e_threshold = atof(option_value);
    } else if (strcmp(option_name, "lowcomp") == 0) {
      complexity_threshold = atof(option_value);
    } else if (strcmp(option_name, "pthresh") == 0) {
      p_threshold = atof(option_value);
    } else if (strcmp(option_name, "order") == 0) {
      order_string = option_value;
    } else if (strcmp(option_name, "nspacer") == 0) {
      spacer_states = atoi(option_value);
    } else if (strcmp(option_name, "fim") == 0) {
      fim = TRUE;
    } else if (strcmp(option_name, "keep-unused") == 0) {
      keep_unused = TRUE;
    } else if (strcmp(option_name, "transpseudo") == 0) {
      trans_pseudo = atof(option_value);
    } else if (strcmp(option_name, "spacerpseudo") == 0) {
      spacer_pseudo = atof(option_value);
    } else if (strcmp(option_name, "verbosity") == 0) {
      verbosity = (VERBOSE_T)atoi(option_value);
    } else if (strcmp(option_name, "noheader") == 0) {
      print_header = FALSE;
    } else if (strcmp(option_name, "noparams") == 0) {
      print_params = FALSE;
    } else if (strcmp(option_name, "notime") == 0) {
      print_time = FALSE;
    } else if (strcmp(option_name, "quiet") == 0) {
      print_header = print_params = print_time = FALSE;
      verbosity = QUIET_VERBOSE;
    }
  }

  // Read the single required argument.
  if (option_index + 1 != argc) {
    fprintf(stderr, "%s", usage);
    exit(1);
  }
  meme_filename = argv[option_index];

  // Set up motif requests: "--nmotifs n" is shorthand for requesting
  // motifs "1" through "n".
  if (request_n != 0) {
    if (get_num_strings(requested_motifs) != 0) {
      die("Can't combine the -motif and -nmotifs options.\n");
    } else {
      for (i_motif = 0; i_motif < request_n; i_motif++) {
        char motif_id[MAX_MOTIF_ID_LENGTH + 1];
        // Bounded write: the ID buffer has a fixed size.
        snprintf(motif_id, sizeof(motif_id), "%d", i_motif + 1);
        add_string(motif_id, requested_motifs);
      }
    }
  }

  /* Set the model type. */
  hmm_type = convert_enum_type_str(hmm_type_str, LINEAR_HMM, HMM_STRS, 
                                   NUM_HMM_T);

  /* Gotta have positive spacer length. */
  if (spacer_states <= 0) {
    die("Negative spacer length (%d).\n", spacer_states);
  }

  /* Make sure motifs weren't selected redundantly. */
  // FIXME: Add tests for complexity threshold.
  if ((get_num_strings(requested_motifs) != 0) && (e_threshold != 0.0)) {
    die("Can't use -motif or -nmotifs with -ethresh.");
  }
  if ((get_num_strings(requested_motifs) != 0) && (order_string != NULL)) {
    die("Can't use -motif or -nmotifs with -order.");
  }
  if ((order_string != NULL) && (e_threshold != 0.0)) {
    die("Can't use -ethresh and -order.");
  }

  /* Prevent trying to build a complete or star model with ordering. */
  if (order_string != NULL) {
    if (hmm_type == COMPLETE_HMM) {
      die("Can't specify motif order with a completely connected model.");
    } else if (hmm_type == STAR_HMM) {
      die("Can't specify motif order with a star model.");
    }
  } 

  // Parse the order string. 
  order_spacing = create_order(order_string);

  /**********************************************
   * READING THE MOTIFS
   **********************************************/

  double pseudocount = 0;

  read_meme_file(
    meme_filename,
    "motif-file", // Take bg freq. from motif file.
    pseudocount,
    REQUIRE_PSPM,
    &num_motifs,
    motifs,
    &motif_occurrences,
    &has_reverse_strand,
    &background
  );

  process_raw_motifs_for_model(
    &num_motifs,
    motifs,
    motif_occurrences,
    requested_motifs,
    has_reverse_strand,
    keep_unused,
    p_threshold,
    e_threshold, 
    complexity_threshold, 
    &order_spacing,
    &transp_freq,
    &spacer_ave,
    trans_pseudo,
    spacer_pseudo
  );

  /**********************************************
   * BUILDING THE HMM
   **********************************************/

  /* Build the motif-based HMM. */
  if (hmm_type == LINEAR_HMM) {

    if (order_spacing != NULL) {
      reorder_motifs(order_spacing, &num_motifs, motifs);
    }
    else {
      die("No order specified for the motifs.\n"
          "For the linear model the motif file must contain motif occurence\n" 
          "data or the motif order must be specified using "
          "the --order option.");
    }

    build_linear_hmm(
      background,
      order_spacing,
      spacer_states,
      motifs,
      num_motifs, 
      fim,
      &the_hmm
    );

  } else if (hmm_type == COMPLETE_HMM) {

    build_complete_hmm(
      background,
      spacer_states,
      motifs,
      num_motifs,
      transp_freq,
      spacer_ave,
      fim,
      &the_hmm
    );

  } else if (hmm_type == STAR_HMM) {

    build_star_hmm(
      background,
      spacer_states,
      motifs,
      num_motifs,
      fim,
      &the_hmm
    );

  } else {
    // Without this branch the_hmm would still be NULL and be dereferenced
    // just below.
    die("Unsupported HMM type.\n");
  }

  // Add some global information.
  copy_string(&(the_hmm->motif_file), meme_filename);

  /**********************************************
   * WRITING THE HMM
   **********************************************/

  /* Print the header. */
  if (print_header) {
    write_header(
      program, 
      "",
      description,
      meme_filename,
      NULL,
      NULL, 
      stdout
    );
  }

  /* Write the HMM. */
  write_mhmm(verbosity, the_hmm, stdout);

  /* Print the program parameters. */
  if (print_params) {
    printf("Program parameters for mhmm\n");
    printf("  MEME file: %s\n", meme_filename);
    printf("  Motifs:");
    write_string_list(" ", requested_motifs, stdout);
    printf("\n");
    printf("  Model topology: %s\n",
           convert_enum_type(hmm_type, HMM_STRS, NUM_HMM_T));
    printf("  States per spacer: %d\n", spacer_states);
    printf("  Spacers are free-insertion modules: %s\n",
           boolean_to_string(fim));
    printf("\n");
  }

  /* Release everything that was built along the way. */
  free_array(background);
  free_string_list(requested_motifs);
  free_order(order_spacing);
  free_matrix(transp_freq);
  free_matrix(spacer_ave);
  for (i_motif = 0; i_motif < num_motifs; i_motif++) {
    free_motif(&(motifs[i_motif]));
  }
  free_mhmm(the_hmm);
  return(0);
}
示例#7
0
/***********************************************************************
 *  Turn the raw motifs read from a motif file into the pieces needed
 *  to build a model.
 *
 *  In order: optionally add reverse complements, filter the motifs
 *  against the selection criteria, derive the transition and spacer
 *  matrices (from the recorded motif occurrences when there are any,
 *  otherwise via compute_naive_transitions_and_spacers), normalize the
 *  spacer counts, and finally discard unused motifs.
 ***********************************************************************/
void process_raw_motifs_for_model(
     int* num_motifs,                  // Number of motifs. IN, OUT
     MOTIF_T* motifs,                  // Array of motifs IN, OUT
     STRING_LIST_T* motif_occurrences, // List of motif occurrences. OUT
     STRING_LIST_T* requested_motifs,  // Explicitly requested motifs. IN
     BOOLEAN_T has_reverse_strand,     // Did file contain both strands? IN
     BOOLEAN_T keep_unused,            // Retain unsed motifs? IN
     double p_threshold,               // Motif p-value threshold IN
     double e_threshold,               // Motif e-value threshold IN
     double complexity_threshold,      // Motif complexity threshold IN
     ORDER_T** order_spacing,          // Motif/spacer order IN, OUT
     MATRIX_T** transp_freq,           // Motif transition freqs OUT
     MATRIX_T** spacer_ave,            // Spacer transition freqs OUT
     double trans_pseudo,              // Motif transition pseudo-counts IN
     double spacer_pseudo              // Spacer transition pseudo-counts IN
) {

  // Both strands present: add the reverse complement motifs first.
  if (has_reverse_strand) {
    add_reverse_complements(num_motifs, motifs);
  }

  // Drop every motif the command line parameters rule out.
  filter_motifs(requested_motifs, e_threshold, complexity_threshold,
                order_spacing, num_motifs, motifs);

  // Build the transition and spacer matrices.
  const BOOLEAN_T occurrences_available =
    (motif_occurrences != NULL && get_num_strings(motif_occurrences) > 0);

  if (!occurrences_available) {
    // Nothing was recorded, so fall back to the naive initialization.
    compute_naive_transitions_and_spacers(*num_motifs, transp_freq,
                                          spacer_ave);
  } else {
    // Derive the model elements from the recorded occurrences.
    parse_motif_occurrences(motif_occurrences, has_reverse_strand,
                            p_threshold, order_spacing, transp_freq,
                            spacer_ave, *num_motifs, motifs);
  }

  // Convert spacer info to probabilities.
  normalize_spacer_counts(trans_pseudo, spacer_pseudo, keep_unused,
                          *transp_freq, *spacer_ave);

  // Finally, throw out the motifs that ended up unused.
  throw_out_unused_motifs(*transp_freq, *spacer_ave, num_motifs, motifs);
}
示例#8
0
/***********************************************************************
 * Parse the motif occurrences.
 *
 * Each motif occurence string contains the following items
 *  - sequence id,
 *  - sequence p-value,
 *  - number n of motif occurrences, and
 *  - length of sequence.
 *
 * This is followed by n triples containing
 *  - motif id,
 *  - occurrence position, and
 *  - occurrence p-value.
 *
 ***********************************************************************/
static void parse_motif_occurrences(
  STRING_LIST_T*  motif_occurrences, // List of motif occurences OUT
  BOOLEAN_T  has_reverse_strand, // File included both strands? IN
  double     p_threshold,        // P-value to include motif occurences. OUT
  ORDER_T**  order_spacing,      // Motif order and spacing (linear HMM) 
                                 // IN OUT.
  MATRIX_T** transp_freq,        // Motif-to-motif transitions. OUT
  MATRIX_T** spacer_ave,         // Average inter-motif distances. OUT
  int        num_motifs,         // Number of motifs retrieved. IN
  MOTIF_T*   motifs              // The retrieved motifs. IN
) {

  ORDER_T*  new_order;      // New order and spacing. 
  BOOLEAN_T find_order;     // Should we look for the motif order? 

  // If we already have a motif order and spacing, don't find any more. 
  if (*order_spacing == NULL) {
    find_order = TRUE;
  } else {
    find_order = FALSE;
  }
  new_order = NULL;
  
  // Allocate the matrices. 
  *transp_freq = allocate_matrix(num_motifs + 2, num_motifs + 2);
  *spacer_ave = allocate_matrix(num_motifs + 2, num_motifs + 2);
  init_matrix(0.0, *transp_freq);
  init_matrix(0.0, *spacer_ave);

  int num_occurrence_strings = get_num_strings(motif_occurrences);
  int i;
  for (i = 0; i < num_occurrence_strings; i++) {
    char*  sequence_id;       // ID of the current sequence. 
    float  sequence_p;        // pvalue of the entire sequence. 
    int    num_occurs;        // Number of motif occurences in this sequence. 
    int    seq_length;        // Length of the current sequence. 
    int    i_occur;           // Index of the current occurrence. 
    char   prev_motif[MAX_MOTIF_ID_LENGTH + 1]; // Index of the previous motif. 
    int    prev_position;     // Location of the right edge of previous motif. 
    float  motif_p;           // P-value of the current occurrence. 
    char *c;          // Dummy to hold return of strtok.

    char* line = get_nth_string(i, motif_occurrences);

    /* Read the sequence identifier, p-value, number of occurrences
       and length. */
    // tlb; sscanf crashes if strtok returns NULL so pass it "" then
    sequence_id = strtok(line, " ");
    if (sequence_id == NULL) {
      die("Error reading motif occurrences.\n%s", line);
    }
    if (sscanf((c=strtok(NULL, " "))?c:"", "%f", &sequence_p) != 1) {
      die("Can't read p-value of sequence %s.", sequence_id);
    }
    if (sscanf((c=strtok(NULL, " "))?c:"", "%d", &num_occurs) != 1) {
      die("Can't read number of motif occurences in sequence %s.",
    sequence_id);
    }
    if (sscanf((c=strtok(NULL, " "))?c:"", "%d", &seq_length) != 1) {
      die("Can't read length of sequence %s.", sequence_id);
    }

    if (verbosity > NORMAL_VERBOSE) {
      fprintf(stderr, "Reading motif occurrences for sequence %s.\n", 
        sequence_id);
    }

    // If requested, try to create an order string. 
    if (find_order) {
      new_order = create_empty_order(num_occurs, sequence_p);
    }

    // Accumulate motif occurence data. 
    sprintf(prev_motif, "%d", 0);
    prev_position = 0;
    for (i_occur = 0; i_occur < num_occurs; i_occur++) {
      char  motif_id[MAX_MOTIF_ID_LENGTH + 1]; // ID of the current motif. 
      int   motif_position;    // Position of the current motif occurrence. 
      char *c;       // Dummy to hold return of strtok.
      
      // Read the three values. 
      if (sscanf((c=strtok(NULL, " "))?c:"", "%s", motif_id) != 1) {
        die("Can't read index of occurrence %d in sequence %s.",
        i_occur, sequence_id);
      }
      if (sscanf((c=strtok(NULL, " "))?c:"", "%d", &motif_position) != 1) {
        die("Can't read position of occurrence %d in sequence %s.",
        i_occur, sequence_id);
      }
      if (sscanf((c=strtok(NULL, " "))?c:"", "%f", &motif_p) != 1) {
        die("Can't read p-value of occurrence %d in sequence %s.",
        i_occur, sequence_id);
      }

      // Only include motifs that have been retained
      if (have_motif(motif_id, num_motifs, motifs)) {
        // Make sure we have strand information in the ID.
        if (has_reverse_strand) {
          add_strand(motif_id);
        }
  
        // Record this occurrence.
        record_occurrence(sequence_id,
          motif_id,
          p_threshold,
          motif_p,
          prev_motif,
          &prev_position,
          motif_position,
          *transp_freq,
          *spacer_ave,
          new_order,
          num_motifs, 
          motifs);
  
        /* Motifs are stored in order of their motif IDs, but they are
            indexed from zero rather than one. */
        prev_position = motif_position +
          (motifs[find_matrix_location(motifs, motif_id, num_motifs) - 1]).length;
      }
    }
  
    assert(seq_length >= prev_position);
  
    // Record the transition to the end state.
    record_occurrence(sequence_id,
          END_TRANSITION,
          p_threshold,
          motif_p,
          prev_motif,
          &prev_position,
          seq_length,
          *transp_freq,
          *spacer_ave,
          new_order,
          num_motifs,
          motifs);

    // Decide whether to keep the new order object. 
    if (find_order) {
      if ((get_num_distinct(new_order) > get_num_distinct(*order_spacing)) ||
         (((get_num_distinct(new_order) == get_num_distinct(*order_spacing))
         && (get_pvalue(new_order) < get_pvalue(*order_spacing))))) {
            if (verbosity > NORMAL_VERBOSE) {
              fprintf(stderr, "Storing order from sequence %s (%g < %g).\n",
                sequence_id, get_pvalue(new_order), 
                get_pvalue(*order_spacing));
              print_order_and_spacing(stderr, new_order);
            }
            free_order(*order_spacing);
            *order_spacing = new_order;
      } else {
        free_order(new_order);
      }
    }
  }
}
示例#9
0
文件: ramen.c 项目: CPFL/gmeme
/*
 * Using the linreg test,
 *
 * this method fits a linear regression between the sequence scores and the
 * scores of one motif, and returns a newly allocated result record holding
 * the motif ID, slope, intercept and MSE.  When args.repeats > 0 the x
 * values are shuffled that many times and r->p is set to the fraction of
 * shuffles whose MSE is no worse than the observed one; otherwise r->p is -1.
 *
 * It's not self-contained, as it requires to hook into the global variables
 * results, motifs, seq_ids (and args, seq_fscores).
 *
 * The caller owns the returned record.  Exits if memory cannot be allocated.
 */
ramen_result_t* ramen_do_linreg_test(int motif_num) {
  //Assorted vars
  int seq_num;
  int j;
  int motif_index = motif_num * 2; //This is a workaround to the change in the motif datastructure where it now
                      // goes +MOTIFA -MOTIFA +MOTIFB etc. rather than all + then all - motifs.

  //Vars for the regression
  double* x;
  double* y;
  double m = 0;
  double b = 0;
  double mse = 0;

  //Vars for scoring
  ramen_result_t* r;

  //Allocate memory or set initial values
  seq_num = get_num_strings(seq_ids); //number of sequences
  r = malloc(sizeof(ramen_result_t)); //allocate space, as a ptr to this will go in the array later
                    //that's why we don't free it in this function.
  x = malloc(sizeof(double)*seq_num);
  y = malloc(sizeof(double)*seq_num);
  // malloc(0) may return NULL, so the score arrays only count as failed
  // allocations when there is at least one sequence.
  if (r == NULL || (seq_num > 0 && (x == NULL || y == NULL))) {
    fprintf(stderr, "Unable to allocate memory for the linear regression test.\n");
    exit(EXIT_FAILURE);
  }

  //Now we need to copy the scores into two double arrays
  //Use LOG macro so that log(0) 'works'
  for (j=0; j < seq_num; j++) {
    if (args.log_fscores == TRUE) {
      y[j] = LOG(get_array_item(j, seq_fscores));
    } else {
      y[j] = get_array_item(j, seq_fscores);
    }

    if (args.log_pwmscores == TRUE) {
      x[j] = LOG(results[motif_num][j]);
    } else {
      x[j] = results[motif_num][j];
    }
  }

  //Switch x&y if they're to be switched
  if (args.linreg_switchxy) {
    SWAP(double*, x, y);
  }

  // TODO: Tidy and/or remove this for production
  // Optional debugging dump of the (x, y) pairs, one file per motif.
  if (args.linreg_dump_dir != NULL) {
    const char* dump_motif_id = get_motif_id(motif_at(motifs.motifs, motif_index));
    // Exact size: directory + '/' + motif ID + ".tsv" + terminating NUL.
    size_t filename_size = strlen(args.linreg_dump_dir) + strlen(dump_motif_id)
                           + sizeof("/.tsv");
    char* filename = malloc(filename_size);
    FILE *fh = NULL;

    if (filename != NULL) {
      snprintf(filename, filename_size, "%s/%s.tsv", args.linreg_dump_dir, dump_motif_id);
      fh = fopen(filename, "w");
    }

    if (fh == NULL) {
      // The dump is a best-effort diagnostic, so warn and carry on.
      fprintf(stderr, "Warning: unable to write regression dump for motif %s.\n",
              dump_motif_id);
    } else {
      fputs("PWM_Score\tFluorescence_Score\n", fh);
      // NOTE(review): the header is tab-separated but the rows below are
      // space-separated; left as is to keep the output format unchanged.
      for (j=0; j < seq_num; j++) {
        fprintf(fh, "%.10e %.10e\n", x[j], y[j]);
      }
      fclose(fh);
    }
    free(filename);
  }

  // regress(n, x, y, &m, &b): n points, x values, y values, slope (out),
  // y intercept (out); its return value is used here as the MSE.
  mse = regress(seq_num, x, y, &m, &b);

  if (args.verbose >= 3) {
    printf("LinReg MSE of motif %s on %i seqs: %.4g (m: %.4g b: %.4g)\n",
           get_motif_id(motif_at(motifs.motifs, motif_index)), seq_num, mse, m, b);
  }

  //Fill in the result record for this motif.
  r->motif_id = strdup(get_motif_id(motif_at(motifs.motifs, motif_index)));
  r->m = m; //Not p-values, but they'll do when we re-use this structure...
  r->b = b;
  r->mse = mse;
  r->p = -1;

  //Do stochastic sampling if required.
  if (args.repeats > 0) {
    int repeat_wins = 0;
    for (j=0;j<args.repeats;j++) {
      double repeat_mse = 0;
      shuffle(x,seq_num); //Shuffle and break the associations between x and y
      repeat_mse = regress(seq_num, x, y, &m, &b);
      if (repeat_mse <= mse) {
        repeat_wins++;
      }
    }
    r->p = repeat_wins*1.0/ args.repeats*1.0;
  }
  free(x);
  free(y);

  return r;
}
示例#10
0
/*******************************************************************************
 * Return the number of species strings held by a TRANSFAC motif, or 0 when
 * the motif has no species list.
 ******************************************************************************/
int get_transfac_num_species(TRANSFAC_MOTIF_T *motif) {
    if (motif->species_list != NULL) {
        return get_num_strings(motif->species_list);
    }
    return 0;
}
示例#11
0
/****************************************************************************
 *  Build the empirical column frequency distributions for a set of
 *  alignment files.
 *
 *  Every file named in the list is read and its species list determined.
 *  Column counts are tallied per alignment, and alignments that share a
 *  species list have their counts accumulated into the same array.  Once
 *  all files are processed, each array of counts is divided by its total
 *  to turn it into frequencies.
 *
 *  The returned object list holds one distribution per distinct species
 *  list, keyed by that species list.
 ****************************************************************************/
OBJECT_LIST_T* get_alignment_column_freqs_list
  (ALPH_T alph, 
   STRING_LIST_T* filenames,
  BOOLEAN_T remove_allgap_seqs) 
{
  const int n_files = get_num_strings(filenames);
  OBJECT_LIST_T* freqs_by_species
    = new_object_list(equal_string_lists,
                      (void*)copy_string_list,
                      free_string_list,
                      free_array);
  ARRAY_T* counts = NULL;
  int i_file;

  // Pass 1: accumulate raw column counts, alignment by alignment.
  for (i_file = 0; i_file < n_files; i_file++) {
    char* path = get_nth_string(i_file, filenames);

    if (verbosity >= NORMAL_VERBOSE) {
      fprintf(
        stderr,
        "Computing column freqs: alignment file number %d of %d total files.\n",
        i_file + 1, n_files
      );
    }

    // Load the alignment and work out which species it covers.
    int ref_seq_index = 0;
    ALIGNMENT_T* aln =
      read_alignment_from_file(path, TRUE, remove_allgap_seqs, &ref_seq_index);
    // NOTE(review): the species list is not freed in this function;
    // confirm who owns it after retrieve_object()/store_object().
    STRING_LIST_T* species = get_species_names(aln);

    // Look for counts already gathered for this exact species list.
    counts = (ARRAY_T*)retrieve_object(species, freqs_by_species);

    if (counts == NULL) {
      // First alignment with this species list: start a fresh array of
      // counts and register it under the species list.
      counts = build_alignment_column_counts(alph, aln, NULL);
      store_object(
        (void*)counts,
        (void*)species,
        0.0, // Score
        freqs_by_species
      );
    } else {
      // Seen before: add this alignment's counts to the stored array.
      // Objects in the list are references, so nothing needs re-storing.
      (void) build_alignment_column_counts(alph, aln, counts);
    }

    // The alignment itself is no longer needed.
    free_alignment(aln);
  }
  fprintf(stderr, "\n");

  // Pass 2: walk the stored arrays and normalize counts to frequencies.
  for (counts = retrieve_next_object(freqs_by_species);
       counts != NULL;
       counts = retrieve_next_object(freqs_by_species))
  {
    const int n_cols = get_array_length(counts);
    double total_counts = 0;
    int k;

    // Sum all the counts for this species list.
    for (k = 0; k < n_cols; k++) {
      total_counts += get_array_item(k, counts);
    }

    // Divide each count by the total.
    for (k = 0; k < n_cols; k++) {
      double raw = get_array_item(k, counts);
      set_array_item(k, raw/total_counts, counts);

#ifdef DEBUG
      int asize = alph_size(alph, ALPH_SIZE);
      int num_leaves = NINT(log(n_cols)/log(asize));
      char* alignment_col = mm_malloc((num_leaves + 1) * sizeof(char));
      unhash_alignment_col(
        alph,
        k,                              //col_index
        alignment_col,
        num_leaves
      );
      printf("%s %g %g\n", alignment_col, raw, raw/total_counts);
      myfree(alignment_col);
#endif
    }
  }

  return(freqs_by_species);
} // get_alignment_column_freqs_list