Ejemplo n.º 1
0
/***********************************************************************
 * Turn a given motif into its own reverse complement.
 * TODO this does not handle the scores matrix, and it should.
 ***********************************************************************/
void reverse_complement_motif
  (MOTIF_T* a_motif)
{
  int i, temp_trim;
  ARRAY_T* left_freqs;
  ARRAY_T* right_freqs;
  ARRAY_T* temp_freqs;   // Temporary space during swap.

  // Allocate space.
  //temp_freqs = allocate_array(get_alph_size(ALL_SIZE)); //this relys on a global which assumes DNA has ambigs which meme doesn't seem to use 
  temp_freqs = allocate_array(a_motif->alph_size + a_motif->ambigs);

  // Consider each row (position) in the motif.
  for (i = 0; i < (int)((a_motif->length + 1) / 2); i++) {
    left_freqs = get_matrix_row(i, a_motif->freqs);
    right_freqs = get_matrix_row(a_motif->length - (i + 1), a_motif->freqs);

    // Make a temporary copy of one row.
    copy_array(left_freqs, temp_freqs);

    // Compute reverse complements in both directions.
    complement_dna_freqs(right_freqs, left_freqs);
    complement_dna_freqs(temp_freqs, right_freqs);
  }
  free_array(temp_freqs);
  //swap the trimming variables
  temp_trim = a_motif->trim_left;
  a_motif->trim_left = a_motif->trim_right;
  a_motif->trim_right = temp_trim;
}
Ejemplo n.º 2
0
void copy_row(int *from_matrix, int *to_matrix, int from_row_idx, int to_row_idx, int num_cols)
{
  int *from_row = get_matrix_row(from_matrix, from_row_idx, num_cols);
  int *to_row = get_matrix_row(to_matrix, to_row_idx, num_cols);

  memcpy(to_row, from_row, sizeof(int)*num_cols);
}
Ejemplo n.º 3
0
/**************************************************************************
*
*	reverse_complement_pssm_matrix
*
*	Turn a pssm matrix into its own reverse complement.
*
 *************************************************************************/
static void reverse_complement_pssm (
  ALPH_T alph,
  MATRIX_T* pssm_matrix
)
{
  int i;
  ARRAY_T* left_scores;
  ARRAY_T* right_scores;
  ARRAY_T* temp_scores;   // Temporary space during swap.
  int length = get_num_rows(pssm_matrix);

  // Allocate space.
  temp_scores = allocate_array(alph_size(alph, ALL_SIZE));

  // Consider each row (position) in the motif.
  for (i = 0; i < (int)((length+1) / 2); i++) {
    left_scores = get_matrix_row(i, pssm_matrix);
    right_scores = get_matrix_row(length - (i + 1), pssm_matrix);

    // Make a temporary copy of one row.
    copy_array(left_scores, temp_scores);

    // Compute reverse complements in both directions.
    complement_dna_freqs(right_scores, left_scores);
    complement_dna_freqs(temp_scores, right_scores);
  }
  free_array(temp_scores);
} // reverse_complement_pssm_matrix
Ejemplo n.º 4
0
Archivo: motif.c Proyecto: CPFL/gmeme
/***********************************************************************
 * Turn a given motif into its own reverse complement.
 ***********************************************************************/
void reverse_complement_motif
  (MOTIF_T* a_motif)
{
  ALPH_SIZE_T size;
  int i, temp_trim;
  ARRAY_T* left_freqs;
  ARRAY_T* right_freqs;
  ARRAY_T* temp_freqs;   // Temporary space during swap.

  assert(a_motif->alph == DNA_ALPH);

  // Allocate space.
  size = (a_motif->flags & MOTIF_HAS_AMBIGS ? ALL_SIZE : ALPH_SIZE);
  temp_freqs = allocate_array(alph_size(a_motif->alph, size));

  // Consider each row (position) in the motif.
  for (i = 0; i < (int)((a_motif->length + 1) / 2); i++) {
    left_freqs = get_matrix_row(i, a_motif->freqs);
    right_freqs = get_matrix_row(a_motif->length - (i + 1), a_motif->freqs);

    // Make a temporary copy of one row.
    copy_array(left_freqs, temp_freqs);

    // Compute reverse complements in both directions.
    complement_dna_freqs(right_freqs, left_freqs);
    complement_dna_freqs(temp_freqs, right_freqs);
  }
  free_array(temp_freqs);
  if (a_motif->scores) {
    // Allocate space.
    temp_freqs = allocate_array(alph_size(a_motif->alph, ALPH_SIZE));

    // Consider each row (position) in the motif.
    for (i = 0; i < (int)((a_motif->length + 1) / 2); i++) {
      left_freqs = get_matrix_row(i, a_motif->scores);
      right_freqs = get_matrix_row(a_motif->length - (i + 1), a_motif->scores);

      // Make a temporary copy of one row.
      copy_array(left_freqs, temp_freqs);

      // Compute reverse complements in both directions.
      complement_dna_freqs(right_freqs, left_freqs);
      complement_dna_freqs(temp_freqs, right_freqs);
    }
    free_array(temp_freqs);
  }
  //swap the trimming variables
  temp_trim = a_motif->trim_left;
  a_motif->trim_left = a_motif->trim_right;
  a_motif->trim_right = temp_trim;
  //swap the strand indicator
  //this assumes a ? is equalivant to +
  if (get_motif_strand(a_motif) == '-') {
    set_motif_strand('+', a_motif);
  } else {
    set_motif_strand('-', a_motif);
  }
}
Ejemplo n.º 5
0
/***********************************************************************
 * Convert transition counts to transition probabilities, and compute
 * average spacer lengths.
 *
 * Each matrix is indexed 0 ... n+1, where n is the number of motifs.
 * The entry at [i,j] corresponds to the transition from motif i to
 * motif j.  Hence, after normalization, each row in the transition
 * matrix should sum to 1.
 ***********************************************************************/
static void normalize_spacer_counts(
  double    trans_pseudo,
  double    spacer_pseudo,    // Pseudocount for self-loop.
  BOOLEAN_T keep_unused,
  MATRIX_T* transp_freq,
  MATRIX_T* spacer_ave
) {
  int i_row;
  int i_col;
  int num_rows;
  double total_spacer;
  double num_transitions;
  double ave_spacer;
  
  /* Divide the spacer lengths by the number of occurrences. */
  num_rows = get_num_rows(transp_freq);
  for (i_row = 0; i_row < num_rows; i_row++) {
    for (i_col = 0; i_col < num_rows; i_col++) {
      total_spacer = get_matrix_cell(i_row, i_col, spacer_ave) + spacer_pseudo;
      num_transitions = get_matrix_cell(i_row, i_col, transp_freq);
      if (spacer_pseudo > 0) num_transitions++;
      if (num_transitions != 0.0) {
        ave_spacer = total_spacer / num_transitions;
        set_matrix_cell(i_row, i_col, ave_spacer, spacer_ave);
      }
    }
  }

  // Add pseudocounts.
  for (i_row = 0; i_row < num_rows; i_row++) {
    for (i_col = 0; i_col < num_rows; i_col++) {

      // Force some transitions to zero.
      if (// No transitions to the start state.
        (i_col == 0) || 
        // No transitions from the end state.
        (i_row == num_rows - 1) ||
        // No transition from start to end.
        ((i_row == 0) && (i_col == num_rows - 1))) {
        set_matrix_cell(i_row, i_col, 0.0, transp_freq);
      }
      else {
        // Only increment the used transitions.
        if ((keep_unused) || 
            (get_matrix_cell(i_row, i_col, transp_freq) > 0.0)) {
          incr_matrix_cell(i_row, i_col, trans_pseudo, transp_freq);
        }
      }
    }
  }

  // Normalize rows.
  for (i_row = 0; i_row < num_rows - 1; i_row++) {
    if (array_total(get_matrix_row(i_row, transp_freq)) > 0.0) {
      normalize(SLOP, get_matrix_row(i_row, transp_freq));
    }
  }
}
Ejemplo n.º 6
0
BOOLEAN_T verify_trans_matrix
  (BOOLEAN_T log_form,    /* Is the transition matrix in log form? */
   int       num_states,  /* Number of states in the (square) matrix. */
   MATRIX_T* trans)       /* The matrix. */
{
  int    i_state;
  PROB_T total;

  for (i_state = 0; i_state < num_states - 1; i_state++) {

    /* Cf. Rabiner, formula (43b), p. 265. */
    if (log_form) {
      total = log_array_total(get_matrix_row(i_state, trans));
      if ((!almost_equal(total, 0.0, SLOP)) &&
	  (!almost_equal(total, 1.0, SLOP)) && // Allow for FIMS.
	  (!almost_equal(EXP2(total), 0.0, SLOP))) { 
	fprintf(stderr,
		"Warning: Row %d of transition matrix differs from 0.0 by %g.\n",
		i_state, EXP2(total));
	return(FALSE);
      }
    } else {
      total = array_total(get_matrix_row(i_state, trans));
      if ((!almost_equal(total, 1.0, SLOP)) &&
	  (!almost_equal(total, 2.0, SLOP)) && // Allow FIMs.
	  (!almost_equal(total, 0.0, SLOP))) { // Allow inaccessible motifs.
	fprintf(stderr,
		"Warning: Row %d of transition matrix differs from 1.0 by %g.\n",
		i_state, 1.0 - total);
	return(FALSE);
      }
    }

    /* All transitions from the end state must be zero. */
    if ((log_form) &&
	(get_matrix_cell(num_states - 1, i_state, trans) > LOG_SMALL)) {
      fprintf(stderr,
	      "Warning: Transition %d from end state is non-zero (%g).\n", 
	      i_state, get_matrix_cell(num_states - 1, i_state, trans));
      return(FALSE);
    } else if (!(log_form) &&
	       (!almost_equal(get_matrix_cell(num_states - 1, i_state, trans), 
			      0.0, SLOP))) {
      fprintf(stderr,
	      "Warning: Transition %d from end state is non-zero (%g).\n", 
	      i_state, get_matrix_cell(num_states - 1, i_state, trans));
      return(FALSE);
    }
  }
  return(TRUE);
}  
Ejemplo n.º 7
0
/***********************************************************************
 * Compute the complexity of a motif as a number between 0 and 1.
 *
 * Motif complexity is the average K-L distance between the "motif
 * background distribution" and each column of the motif.  The motif
 * background is just the average distribution of all the columns.  The
 * K-L distance, which measures the difference between two
 * distributions, is the same as the information content:
 *
 *  \sum_i p_i log(p_i/f_i)
 *
 * This value increases with increasing complexity.
 ***********************************************************************/
double compute_motif_complexity
  (MOTIF_T* a_motif)
{
  double return_value;
  ARRAY_T* motif_background;  // Mean emission distribution.
  int num_rows;
  int i_row;
  int num_cols;
  int i_col;

  num_cols = get_alph_size(ALPH_SIZE);
  num_rows = get_num_rows(a_motif->freqs);

  // Compute the mean emission distribution.
  motif_background = get_matrix_col_sums(a_motif->freqs);
  scalar_mult(1.0 / (double)num_rows, motif_background);

  // Compute the K-L distance w.r.t. the background.
  return_value = 0;
  for (i_row = 0; i_row < num_rows; i_row++) {
    ARRAY_T* this_emission = get_matrix_row(i_row, a_motif->freqs);
    for (i_col = 0; i_col < num_cols; i_col++) {
      ATYPE this_item = get_array_item(i_col, this_emission);
      ATYPE background_item = get_array_item(i_col, motif_background);

      // Use two logs to avoid handling divide-by-zero as a special case.
      return_value += this_item 
	* (my_log(this_item) - my_log(background_item));
    }
  }

  free_array(motif_background);
  return(return_value / (double)num_rows);
}
Ejemplo n.º 8
0
void check_sq_matrix(MATRIX_T *sq, int expected_size) {
  int i;
  assert(get_num_rows(sq) == expected_size);
  assert(get_num_cols(sq) == expected_size);
  for (i = 0; i < expected_size; ++i) {
    assert(get_array_length(get_matrix_row(i, sq)) == expected_size);
  }
}
Ejemplo n.º 9
0
Archivo: motif.c Proyecto: CPFL/gmeme
/***********************************************************************
 * Normalize the motif's pspm
 ***********************************************************************/
void normalize_motif
  (MOTIF_T *motif, double tolerance)
{
  int i_row, asize;
  asize = alph_size(motif->alph, ALPH_SIZE);
  for (i_row = 0; i_row < motif->length; ++i_row) {
    normalize_subarray(0, asize, tolerance, get_matrix_row(i_row, motif->freqs));
  }
}
Ejemplo n.º 10
0
Archivo: motif.c Proyecto: CPFL/gmeme
/***********************************************************************
 * Calculate the ambiguous letters from the concrete ones.
 ***********************************************************************/
void calc_motif_ambigs
  (MOTIF_T *motif)
{
  int i_row;
  resize_matrix(motif->length, alph_size(motif->alph, ALL_SIZE), 0, motif->freqs);
  motif->flags |= MOTIF_HAS_AMBIGS;
  for (i_row = 0; i_row < motif->length; ++i_row) {
    calc_ambigs(motif->alph, FALSE, get_matrix_row(i_row, motif->freqs));
  }
}
Ejemplo n.º 11
0
void dxml_handle_pos(void *ctx, int pos, double A, double C, double G, double T) {
    CTX_T *data;
    MOTIF_T *motif;
    ARRAY_T *row;

    data = (CTX_T*)ctx;
    motif = data->motif;
    row = get_matrix_row(pos - 1, motif->freqs);
    set_array_item(0, A, row);
    set_array_item(1, C, row);
    set_array_item(2, G, row);
    set_array_item(3, T, row);
}
/*****************************************************************************
 * MEME > motifs > motif > probabilities > alphabet_matrix > /alphabet_array
 * Check that all letters have a probability and update the current matrix row.
 ****************************************************************************/
void mxml_end_probability_pos(void *ctx) {
  CTX_T *data;
  ARRAY_T *pos;
  int i;
  data = (CTX_T*)ctx;
  pos = get_matrix_row(data->current_pos, data->mscope.motif->freqs);
  for (i = 0; i < get_array_length(pos); i++) {
    if (get_array_item(i, pos) == -1) {
      local_error(data, "Probability for letter %c in position %d is missing.\n", alph_char(data->alph, i), i + 1);
    }
  }
  data->current_pos++;
}
Ejemplo n.º 13
0
/***********************************************************************
 * Calculates the information content of a position of the motif.
 *
 * Assumes that alph_size does not include ambigious characters.
 ***********************************************************************/
static inline double position_information_content(
  MOTIF_T *a_motif,
  int position
) {
  int i;
  double H, item;
  ARRAY_T *freqs;

  H = 0;
  freqs = get_matrix_row(position, a_motif->freqs);
  for (i = 0; i < a_motif->alph_size; ++i) {
    item = get_array_item(i, freqs);
    H -= item*my_log2(item);
  }
  return my_log2(a_motif->alph_size) - H;
}
Ejemplo n.º 14
0
Archivo: motif.c Proyecto: CPFL/gmeme
/***********************************************************************
 * Takes a matrix of meme scores and converts them into letter 
 * probabilities.
 *
 * The probablility can be got by:
 * p = (2 ^ (s / 100)) * bg
 *
 ***********************************************************************/
MATRIX_T* convert_scores_into_freqs
  (ALPH_T alph,
   MATRIX_T *scores,
   ARRAY_T *bg,
   int site_count,
   double pseudo_count)
{
  int asize, length;
  double freq, score, total_count, counts, bg_freq;
  MATRIX_T *freqs;
  int row, col;

  assert(alph != INVALID_ALPH);
  assert(scores != NULL);
  assert(bg != NULL);

  length = get_num_rows(scores);
  asize = alph_size(alph, ALPH_SIZE);

  freqs = allocate_matrix(length, asize);
  total_count = site_count + pseudo_count;

  for (col = 0; col < asize; ++col) {
    bg_freq = get_array_item(col, bg);
    for (row = 0; row < length; ++row) {
      score = get_matrix_cell(row, col, scores);
      // convert to a probability
      freq = pow(2.0, score / 100.0) * bg_freq;
      // remove the pseudo count
      freq = ((freq * total_count) - (bg_freq * pseudo_count)) / site_count;
      if (freq < 0) freq = 0;
      else if (freq > 1) freq = 1;
      set_matrix_cell(row, col, freq, freqs);
    }
  }
  for (row = 0; row < length; ++row) {
    normalize_subarray(0, asize, 0.0, get_matrix_row(row, freqs));
  }

  return freqs;
}
Ejemplo n.º 15
0
/*************************************************************************
 * Entry point for pmp_bf
 *************************************************************************/
int main(int argc, char *argv[]) {

  char* bg_filename = NULL;
  char* motif_name = "motif"; // Use this motif name in the output.
  STRING_LIST_T* selected_motifs = NULL;
  double fg_rate = 1.0;
  double bg_rate = 1.0;
  double purine_pyrimidine = 1.0; // r
  double transition_transversion = 0.5; // R
  double pseudocount = 0.1;
  GAP_SUPPORT_T gap_support = SKIP_GAPS;
  MODEL_TYPE_T model_type = F81_MODEL;
  BOOLEAN_T use_halpern_bruno = FALSE;
  char* ustar_label = NULL;	// TLB; create uniform star tree
  int i;

  program_name = "pmp_bf";

  /**********************************************
   * COMMAND LINE PROCESSING
   **********************************************/

  // Define command line options. (FIXME: Repeated code)
  // FIXME: Note that if you add or remove options you
  // must change n_options.
  int n_options = 12;
  cmdoption const pmp_options[] = {
    {"hb", NO_VALUE},
    {"ustar", REQUIRED_VALUE},
    {"model", REQUIRED_VALUE},
    {"pur-pyr", REQUIRED_VALUE},
    {"transition-transversion", REQUIRED_VALUE},
    {"bg", REQUIRED_VALUE},
    {"fg", REQUIRED_VALUE},
    {"motif", REQUIRED_VALUE},
    {"motif-name", REQUIRED_VALUE},
    {"bgfile", REQUIRED_VALUE},
    {"pseudocount", REQUIRED_VALUE},
    {"verbosity", REQUIRED_VALUE}
  };

  int option_index = 0;

  // Define the usage message.
  char      usage[1000] = "";
  strcat(usage, "USAGE: pmp [options] <tree file> <MEME file>\n");
  strcat(usage, "\n");
  strcat(usage, "   Options:\n");

  // Evolutionary model parameters.
  strcat(usage, "     --hb\n");
  strcat(usage, "     --model single|average|jc|k2|f81|f84|hky|tn");
  strcat(usage, " (default=f81)\n");
  strcat(usage, "     --pur-pyr <float> (default=1.0)\n");
  strcat(usage, "     --transition-transversion <float> (default=0.5)\n");
  strcat(usage, "     --bg <float> (default=1.0)\n");
  strcat(usage, "     --fg <float> (default=1.0)\n");

  // Motif parameters.
  strcat(usage, "     --motif <id> (default=all)\n");
  strcat(usage, "     --motif-name <string> (default from motif file)\n");

  // Miscellaneous parameters
  strcat(usage, "     --bgfile <background> (default from motif file)\n");
  strcat(usage, "     --pseudocount <float> (default=0.1)\n");
  strcat(usage, "     --ustar <label>\n");	// TLB; create uniform star tree
  strcat(usage, "     --verbosity [1|2|3|4] (default 2)\n");
  strcat(usage, "\n    Prints the FP and FN rate at each of 10000 score values.\n");
  strcat(usage, "\n    Output format: [<motif_id> score <score> FPR <fpr> TPR <tpr>]+\n");

  // Parse the command line.
  if (simple_setopt(argc, argv, n_options, pmp_options) != NO_ERROR) {
    die("Error processing command line options: option name too long.\n");
  }

  while (TRUE) { 
    int c = 0;
    char* option_name = NULL;
    char* option_value = NULL;
    const char * message = NULL;

    // Read the next option, and break if we're done.
    c = simple_getopt(&option_name, &option_value, &option_index);
    if (c == 0) {
      break;
    } else if (c < 0) {
      (void) simple_getopterror(&message);
      die("Error processing command line options (%s)\n", message);
    }
    
    if (strcmp(option_name, "model") == 0) {
      if (strcmp(option_value, "jc") == 0) {
        model_type = JC_MODEL;
      } else if (strcmp(option_value, "k2") == 0) {
        model_type = K2_MODEL;
      } else if (strcmp(option_value, "f81") == 0) {
        model_type = F81_MODEL;
      } else if (strcmp(option_value, "f84") == 0) {
        model_type = F84_MODEL;
      } else if (strcmp(option_value, "hky") == 0) {
        model_type = HKY_MODEL;
      } else if (strcmp(option_value, "tn") == 0) {
        model_type = TAMURA_NEI_MODEL;
      } else if (strcmp(option_value, "single") == 0) {
        model_type = SINGLE_MODEL;
      } else if (strcmp(option_value, "average") == 0) {
        model_type = AVERAGE_MODEL;
      } else {
        die("Unknown model: %s\n", option_value);
      }
    } else if (strcmp(option_name, "hb") == 0){
        use_halpern_bruno = TRUE;
    } else if (strcmp(option_name, "ustar") == 0){	// TLB; create uniform star tree
        ustar_label = option_value;
    } else if (strcmp(option_name, "pur-pyr") == 0){
        purine_pyrimidine = atof(option_value);
    } else if (strcmp(option_name, "transition-transversion") == 0){
        transition_transversion = atof(option_value);
    } else if (strcmp(option_name, "bg") == 0){
      bg_rate = atof(option_value);
    } else if (strcmp(option_name, "fg") == 0){
      fg_rate = atof(option_value);
    } else if (strcmp(option_name, "motif") == 0){
        if (selected_motifs == NULL) {
          selected_motifs = new_string_list();
        }
       add_string(option_value, selected_motifs);
    } else if (strcmp(option_name, "motif-name") == 0){
        motif_name = option_value;
    } else if (strcmp(option_name, "bgfile") == 0){
      bg_filename = option_value;
    } else if (strcmp(option_name, "pseudocount") == 0){
        pseudocount = atof(option_value);
    } else if (strcmp(option_name, "verbosity") == 0){
        verbosity = atoi(option_value);
    }
  }

  // Must have tree and motif file names
  if (argc != option_index + 2) {
    fprintf(stderr, "%s", usage);
    exit(EXIT_FAILURE);
  } 

  /**********************************************
   * Read the phylogenetic tree.
   **********************************************/
  char* tree_filename = NULL;
  TREE_T* tree = NULL;
  tree_filename = argv[option_index];
  option_index++;
  tree = read_tree_from_file(tree_filename);

  // get the species names
  STRING_LIST_T* alignment_species = make_leaf_list(tree);
  char *root_label = get_label(tree);	// in case target in center
  if (strlen(root_label)>0) add_string(root_label, alignment_species);
  //write_string_list(" ", alignment_species, stderr);

  // TLB; Convert the tree to a uniform star tree with
  // the target sequence at its center.
  if (ustar_label != NULL) {
    tree = convert_to_uniform_star_tree(tree, ustar_label);
    if (tree == NULL) 
      die("Tree or alignment missing target %s\n", ustar_label);
    if (verbosity >= NORMAL_VERBOSE) {
      fprintf(stderr, 
	"Target %s placed at center of uniform (d=%.3f) star tree:\n", 
          ustar_label, get_total_length(tree) / get_num_children(tree) 
      );
      write_tree(tree, stderr);
    }
  }

  /**********************************************
   * Read the motifs.
   **********************************************/
  char* meme_filename = argv[option_index];
  option_index++;
  int num_motifs = 0; 

  MREAD_T *mread;
  ALPH_T alph;
  ARRAYLST_T *motifs;
  ARRAY_T *bg_freqs;

  mread = mread_create(meme_filename, OPEN_MFILE);
  mread_set_bg_source(mread, bg_filename);
  mread_set_pseudocount(mread, pseudocount);
  // read motifs
  motifs = mread_load(mread, NULL);
  alph = mread_get_alphabet(mread);
  bg_freqs = mread_get_background(mread);
  // check
  if (arraylst_size(motifs) == 0) die("No motifs in %s.", meme_filename);

  

  // TLB; need to resize bg_freqs array to ALPH_SIZE items
  // or copy array breaks in HB mode.  This throws away
  // the freqs for the ambiguous characters;
  int asize = alph_size(alph, ALPH_SIZE);
  resize_array(bg_freqs, asize);

  /**************************************************************
  * Compute probability distributions for each of the selected motifs.
  **************************************************************/
  int motif_index;
  for (motif_index = 0; motif_index < arraylst_size(motifs); motif_index++) {

    MOTIF_T* motif = (MOTIF_T*)arraylst_get(motif_index, motifs);
    char* motif_id = get_motif_id(motif);
    char* bare_motif_id = motif_id;

    // We may have specified on the command line that
    // only certain motifs were to be used.
    if (selected_motifs != NULL) {
      if (*bare_motif_id == '+' || *bare_motif_id == '-') {
        // The selected  motif id won't included a strand indicator.
        bare_motif_id++;
      }
      if (have_string(bare_motif_id, selected_motifs) == FALSE) {
        continue;
      }
    }

    if (verbosity >= NORMAL_VERBOSE) {
      fprintf(
        stderr, 
        "Using motif %s of width %d.\n",
        motif_id, get_motif_length(motif)
      );
    }

    // Build an array of evolutionary models for each position in the motif.
    EVOMODEL_T** models = make_motif_models(
      motif, 
      bg_freqs,
      model_type,
      fg_rate, 
      bg_rate, 
      purine_pyrimidine, 
      transition_transversion, 
      use_halpern_bruno
    );

    // Get the frequencies under the background model (row 0) 
    // and position-dependent scores (rows 1..w)
    // for each possible alignment column.
    MATRIX_T* pssm_matrix = build_alignment_pssm_matrix(
      alph,
      alignment_species,
      get_motif_length(motif) + 1, 
      models, 
      tree, 
      gap_support
    );
    ARRAY_T* alignment_col_freqs = allocate_array(get_num_cols(pssm_matrix)); 
    copy_array(get_matrix_row(0, pssm_matrix), alignment_col_freqs);
    remove_matrix_row(0, pssm_matrix);		// throw away first row
    //print_col_frequencies(alph, alignment_col_freqs);

    //
    // Get the position-dependent null model alignment column frequencies
    //
    int w = get_motif_length(motif);
    int ncols = get_num_cols(pssm_matrix); 
    MATRIX_T* pos_dep_bkg = allocate_matrix(w, ncols);
    for (i=0; i<w; i++) {
      // get the evo model corresponding to this column of the motif
      // and store it as the first evolutionary model.
      myfree(models[0]);
      // Use motif PSFM for equilibrium freqs. for model.
      ARRAY_T* site_specific_freqs = allocate_array(asize);
      int j = 0;
      for(j = 0; j < asize; j++) {
	double value = get_matrix_cell(i, j, get_motif_freqs(motif));
	set_array_item(j, value, site_specific_freqs);
      }
      if (use_halpern_bruno == FALSE) {
	models[0] = make_model(
	  model_type,
	  fg_rate,
	  transition_transversion,
	  purine_pyrimidine,
	  site_specific_freqs,
          NULL
	);
      } else {
        models[0] = make_model(
	  model_type,
	  fg_rate,
	  transition_transversion,
	  purine_pyrimidine,
	  bg_freqs,
	  site_specific_freqs
	);
      }
      // get the alignment column frequencies using this model
      MATRIX_T* tmp_pssm_matrix = build_alignment_pssm_matrix(
        alph,
	alignment_species,
	2,				// only interested in freqs under bkg
	models, 
	tree, 
	gap_support
      );
      // assemble the position-dependent background alignment column freqs.
      set_matrix_row(i, get_matrix_row(0, tmp_pssm_matrix), pos_dep_bkg);
      // chuck the pssm (not his real name)
      free_matrix(tmp_pssm_matrix);
    }

    //
    // Compute and print the score distribution under the background model
    // and under the (position-dependent) motif model.
    //
    int range = 10000;	// 10^4 gives same result as 10^5, but 10^3 differs

    // under background model
    PSSM_T* pssm = build_matrix_pssm(alph, pssm_matrix, alignment_col_freqs, range);

    // under position-dependent background (motif) model
    PSSM_T* pssm_pos_dep = build_matrix_pssm(alph, pssm_matrix, alignment_col_freqs, range);
    get_pv_lookup_pos_dep(
      pssm_pos_dep, 
      pos_dep_bkg, 
      NULL // no priors used
    );

    // print FP and FN distributions
    int num_items = get_pssm_pv_length(pssm_pos_dep);
    for (i=0; i<num_items; i++) {
      double pvf = get_pssm_pv(i, pssm);
      double pvt = get_pssm_pv(i, pssm_pos_dep);
      double fpr = pvf;
      double fnr = 1 - pvt;
      if (fpr >= 0.99999 || fnr == 0) continue;
      printf("%s score %d FPR %.3g FNR %.3g\n", motif_id, i, fpr, fnr);
    }

    // free stuff
    free_pssm(pssm);
    free_pssm(pssm_pos_dep);
    if (models != NULL) {
      int model_index;
      int num_models = get_motif_length(motif) + 1;
      for (model_index = 0; model_index < num_models; model_index++) {
        free_model(models[model_index]);
      }
      myfree(models);
    }

  } // motif

  arraylst_destroy(destroy_motif, motifs);

  /**********************************************
   * Clean up.
   **********************************************/
  // TLB may have encountered a memory corruption bug here
  // CEG has not been able to reproduce it. valgrind says all is well.
  free_array(bg_freqs);
  free_tree(TRUE, tree);
  free_string_list(selected_motifs);

  return(0);
} // main
Ejemplo n.º 16
0
int mg_computepath(CombinedScoreMatrixEntry **combinedscore_matrix,
                   HitInformation *hit_information,
                   unsigned long rows,
                   unsigned long contig_len,
                   ParseStruct *parsestruct_ptr, GtError * err)
{
  int had_err = 0;

  /* Initialisieren der Matrix fuer die Pfadberechnung */
  PathMatrixEntry **path_matrix;

  /* i: Zaehlvariable fuer die Matrix-Zeilen; k: Zaehlvariable Precursors
     (von 0 bis max 2) maxpath_frame: Speichern des vorherigen Frames von
     dem der max-Wert berechnet wird */
  unsigned short row_index = 0,
    precursor_index = 0,
    precursors_row = 0,
    maxpath_frame = 0;

  /* Position in der Query-DNA */
  unsigned long column_index = 0;

  /* Variablen fuer den aktuellen Frame, den vorherigen Frame(speichert
     einen Wert aus precursors[], die Zeile des vorherigen Frames, GtArray
     mit den Precursors-Frames */
  short current_frame = 0,
    precursors_frame = 0,
    precursors[NUM_PRECURSORS];

  /* q ist der Wert, der bei Aus- oder Eintreten in ein Gen auf dem
     Forward- bzw. Reverse-Strang berechnet wird */
  double q = ARGUMENTSSTRUCT(leavegene_value),
    max_new = 1,
    max_old = 1;

  /* Speicherreservierung fuer die Path-Matrix - Groesse entsprechend der
     CombinedScore-Matrix */
  gt_array2dim_calloc(path_matrix, 7, contig_len);

  gt_error_check(err);

  /* fuer die erste Spalte der Path-Matrix wird die erste Spalte der
     CombinedScore-Matrix uebernommen */
  for (row_index = 0; row_index < rows; row_index++)
  {
    path_matrix[row_index][0].score =
      combinedscore_matrix[row_index][0].matrix_score;
    path_matrix[row_index][0].path_frame = row_index;
  }

  /* Spaltenweise Berechnung des opt. Pfades */
  for (column_index = 1; column_index < contig_len; column_index++)
  {
    for (row_index = 0; row_index < rows; row_index++)
    {
      /* Zaehlvariable fuer die Zeile wird umgerechnet in den entsprechenden
         Leserahmen */
      current_frame = get_current_frame(row_index);
      /* Aufruf der Methode zum Berechnen der moeglichen Leserahmen anhand von
         aktuellem Leserahmen und der Query-DNA-Sequenz */
      compute_precursors(current_frame,
                         column_index,
                         precursors);

      /* der max-Wert der moeglichen Vorgaenger wird berechnet */
      for (precursor_index = 0;
           precursor_index < NUM_PRECURSORS
             && (precursors[precursor_index] != UNDEFINED);
           ++precursor_index)
      {
        /* aktueller Vorgaengerleserahmen - es gibt max. 3 moegliche
           Vorgaenger */
        precursors_frame = precursors[precursor_index];
        /* Vorgaengerleserahmen wird umgerechnet in die entsprechende
           Matrix-Zeile */
        precursors_row = get_matrix_row(precursors_frame);

        /* der DP-Algo umfasst 3 moegliche Faelle
           1. Fall: Wechsel vom Reversen- auf den Forward-Strang bzw.
           umgekehrt */
        if ((current_frame < 0 && precursors_frame > 0) ||
            (current_frame > 0 && precursors_frame < 0))
        {
            max_new = path_matrix[precursors_row][column_index-1].score +
                      combinedscore_matrix[row_index][column_index].matrix_score
                      + 2*q;
        }
        /* 2. Fall: Einfacher Wechsel des Leserahmens, also von + zu +
           bzw.- zu - */
        else if (current_frame != 0 && precursors_frame != current_frame)
        {
            max_new = path_matrix[precursors_row][column_index-1].score +
                      combinedscore_matrix[row_index][column_index].matrix_score
                      + q;
        }
        /* 3. Fall: Leserahmen wird beibehalten bzw. Wechsel von kodierend zu
           nicht-kodierend oder umgekehrt */
        else
        {
            max_new = path_matrix[precursors_row][column_index-1].score +
                      combinedscore_matrix[row_index][column_index]
                      .matrix_score;
        }

        /* Bestimmen des Max-Wertes der max. 3 Moeglichkeiten und Speichern der
           Zeile, von der der Max-Wert stammt */
        if (gt_double_compare(max_new, max_old) > 0)
        {
            max_old = max_new;
            maxpath_frame = precursors_row;
        }
      }

      /* Speichern des Max-Wertes und der "Vorgaenger"-Zeile;
         zuruecksetzen der Variablen */
      path_matrix[row_index][column_index].score      = max_old;
      path_matrix[row_index][column_index].path_frame = maxpath_frame;

      max_new = DBL_MIN;
      max_old = DBL_MIN;
      maxpath_frame = 0;
    }
  }

  /* Aufruf der Methode zur Genvorhersage */
  had_err = mg_compute_gene_prediction(combinedscore_matrix,
                                       path_matrix,
                                       contig_len,
                                       hit_information,
                                       parsestruct_ptr, err);

  gt_array2dim_delete(path_matrix);

  return had_err;
}
Ejemplo n.º 17
0
int main(int argc, char **argv) {
	gettimeofday(&start_global, NULL);
	print_lib_version();

	mpz_init(N);
	mpz_t B;
	mpz_init(B);

	unsigned long int uBase;
	int64_t nb_primes;
	modular_root_t *modular_roots;

	uint64_t i, j;

	if (mpz_init_set_str(N, argv[1], 10) == -1) {
		printf("Cannot load N %s\n", argv[1]);
		exit(2);
	}

	mpz_t sqrtN, rem;
	mpz_init(sqrtN);
	mpz_init(rem);
	mpz_sqrtrem(sqrtN, rem, N);

	if (mpz_cmp_ui(rem, 0) != 0) /* if not perfect square, calculate the ceiling */
		mpz_add_ui(sqrtN, sqrtN, 1);
	else /* N is a perfect square, factored! */
	{
		printf("\n<<<[FACTOR]>>> %s\n", mpz_get_str(NULL, 10, sqrtN));
		return 0;
	}

	if (mpz_probab_prime_p(N, 10) > 0) /* don't bother factoring */
	{
		printf("N:%s is prime\n", mpz_get_str(NULL, 10, N));
		exit(0);
	}

	OPEN_LOG_FILE("freq");

//--------------------------------------------------------
//  calculate the smoothness base for the given N
//--------------------------------------------------------
	get_smoothness_base(B, N); /* if N is too small, the program will surely fail, please consider a pen and paper instead */
	uBase = mpz_get_ui(B);
	printf("n: %s\tBase: %s\n", mpz_get_str(NULL, 10, N),
			mpz_get_str(NULL, 10, B));

//--------------------------------------------------------
// sieve primes that are less than the smoothness base using Eratosthenes sieve
//--------------------------------------------------------
	START_TIMER();
	nb_primes = sieve_primes_up_to((int64_t) (uBase));

	printf("\nPrimes found %" PRId64 " [Smoothness Base %lu]\n", nb_primes,
			uBase);
	STOP_TIMER_PRINT_TIME("\tEratosthenes Sieving done");

//--------------------------------------------------------
// fill the primes array with primes to which n is a quadratic residue
//--------------------------------------------------------
	START_TIMER();
	primes = calloc(nb_primes, sizeof(int64_t));
	nb_qr_primes = fill_primes_with_quadratic_residue(primes, N);

	/*for(i=0; i<nb_qr_primes; i++)
	 printf("%" PRId64 "\n", primes[i]);*/

	printf("\nN-Quadratic primes found %" PRId64 "\n", nb_qr_primes);
	STOP_TIMER_PRINT_TIME("\tQuadratic prime filtering done");

//--------------------------------------------------------
// calculate modular roots
//--------------------------------------------------------
	START_TIMER();
	modular_roots = calloc(nb_qr_primes, sizeof(modular_root_t));
	mpz_t tmp, r1, r2;
	mpz_init(tmp);
	mpz_init(r1);
	mpz_init(r2);

	for (i = 0; i < nb_qr_primes; i++) {
		mpz_set_ui(tmp, (unsigned long) primes[i]);
		mpz_sqrtm(r1, N, tmp); /* calculate the modular root */
		mpz_neg(r2, r1); /* -q mod n */
		mpz_mod(r2, r2, tmp);

		modular_roots[i].root1 = mpz_get_ui(r1);
		modular_roots[i].root2 = mpz_get_ui(r2);
	}
	mpz_clear(tmp);
	mpz_clear(r1);
	mpz_clear(r2);
	STOP_TIMER_PRINT_TIME("\nModular roots calculation done");

	/*for(i=0; i<nb_qr_primes; i++)
	 {
	 printf("[%10" PRId64 "-> roots: %10u - %10u]\n", primes[i], modular_roots[i].root1, modular_roots[i].root2);
	 }*/

//--------------------------------------------------------
//         ***** initialize the matrix *****
//--------------------------------------------------------
	START_TIMER();
	init_matrix(&matrix, nb_qr_primes + NB_VECTORS_OFFSET, nb_qr_primes);
	mpz_init2(tmp_matrix_row, nb_qr_primes);
	STOP_TIMER_PRINT_TIME("\nMatrix initialized");

//--------------------------------------------------------
// [Sieving]
//--------------------------------------------------------
	START_TIMER();

	mpz_t x, sieving_index, next_sieving_index;
	unsigned long ui_index, SIEVING_STEP = 50000; /* we sieve for 50000 elements at each loop */
	uint64_t p_pow;
	smooth_number_t *x_squared;

	x_squared = calloc(SIEVING_STEP, sizeof(smooth_number_t));
	smooth_numbers = calloc(nb_qr_primes + NB_VECTORS_OFFSET,
			sizeof(smooth_number_t));

	mpz_init_set(x, sqrtN);
	mpz_init_set(sieving_index, x);
	mpz_init_set(next_sieving_index, x);

	mpz_t p;
	mpz_init(p);
	mpz_t str;
	mpz_init_set(str, sieving_index);
	printf("\nSieving ...\n");

//--------------------------------------------------------
// Init before sieving
//--------------------------------------------------------
	for (i = 0; i < SIEVING_STEP; i++) {
		mpz_init(x_squared[i].value_x);
		mpz_init(x_squared[i].value_x_squared);

		/* the factors_exp array is used to keep track of exponents */
		//x_squared[i].factors_exp = calloc(nb_qr_primes, sizeof(uint64_t));
		/* we use directly the exponents vector modulo 2 to preserve space */mpz_init2(
				x_squared[i].factors_vect, nb_qr_primes);
		mpz_add_ui(x, x, 1);
	}

	int nb_smooth_per_round = 0;
	char s[512];

//--------------------------------------------------------
// WHILE smooth numbers found less than the primes in the smooth base + NB_VECTORS_OFFSET
//--------------------------------------------------------
	while (nb_smooth_numbers_found < nb_qr_primes + NB_VECTORS_OFFSET) {
		nb_smooth_per_round = 0;
		mpz_set(x, next_sieving_index); /* sieve numbers from sieving_index to sieving_index + sieving_step */
		mpz_set(sieving_index, next_sieving_index);

		printf("\r");
		printf(
				"\t\tSieving at: %s30 <--> Smooth numbers found: %" PRId64 "/%" PRId64 "",
				mpz_get_str(NULL, 10, sieving_index), nb_smooth_numbers_found,
				nb_qr_primes);
		fflush(stdout);

		for (i = 0; i < SIEVING_STEP; i++) {
			mpz_set(x_squared[i].value_x, x);

			mpz_pow_ui(x_squared[i].value_x_squared, x, 2); /* calculate value_x_squared <- x²-n */
			mpz_sub(x_squared[i].value_x_squared, x_squared[i].value_x_squared,
					N);

			mpz_clear(x_squared[i].factors_vect);
			mpz_init2(x_squared[i].factors_vect, nb_qr_primes); /* reconstruct a new fresh 0ed vector of size nb_qr_primes bits */

			mpz_add_ui(x, x, 1);
		}
		mpz_set(next_sieving_index, x);

//--------------------------------------------------------
// eliminate factors in the x_squared array, those who are 'destructed' to 1 are smooth
//--------------------------------------------------------

		for (i = 0; i < nb_qr_primes; i++) {
			mpz_set_ui(p, (unsigned long) primes[i]);
			mpz_set(x, sieving_index);

			/* get the first multiple of p that is directly larger that sieving_index
			 * Quadratic SIEVING: all elements from this number and in positions multiples of root1 and root2
			 * are also multiples of p */
			get_sieving_start_index(x, x, p, modular_roots[i].root1);
			mpz_set(str, x);
			mpz_sub(x, x, sieving_index); /* x contains index of first number that is divisible by p */

			for (j = mpz_get_ui(x); j < SIEVING_STEP; j += primes[i]) {
				p_pow = mpz_remove(x_squared[j].value_x_squared,
						x_squared[j].value_x_squared, p); /* eliminate all factors of p */

				if (p_pow & 1) /* mark bit if odd power of p exists in this x_squared[j] */
				{
					mpz_setbit(x_squared[j].factors_vect, i);
				}

				if (mpz_cmp_ui(x_squared[j].value_x_squared, 1) == 0) {
					save_smooth_number(x_squared[j]);
					nb_smooth_per_round++;
				}
				/* sieve next element located p steps from here */
			}

			/* same goes for root2 */
			if (modular_roots[i].root2 == modular_roots[i].root1)
				continue;

			mpz_set(x, sieving_index);

			get_sieving_start_index(x, x, p, modular_roots[i].root2);
			mpz_set(str, x);
			mpz_sub(x, x, sieving_index);

			for (j = mpz_get_ui(x); j < SIEVING_STEP; j += primes[i]) {
				p_pow = mpz_remove(x_squared[j].value_x_squared,
						x_squared[j].value_x_squared, p);

				if (p_pow & 1) {
					mpz_setbit(x_squared[j].factors_vect, i);
				}

				if (mpz_cmp_ui(x_squared[j].value_x_squared, 1) == 0) {
					save_smooth_number(x_squared[j]);
					nb_smooth_per_round++;
				}
			}
		}
		//printf("\tSmooth numbers found %" PRId64 "\n", nb_smooth_numbers_found);
		/*sprintf(s, "[start: %s - end: %s - step: %" PRId64 "] nb_smooth_per_round: %d",
		 mpz_get_str(NULL, 10, sieving_index),
		 mpz_get_str(NULL, 10, next_sieving_index),
		 SIEVING_STEP,
		 nb_smooth_per_round);
		 APPEND_TO_LOG_FILE(s);*/
	}

	STOP_TIMER_PRINT_TIME("\nSieving DONE");

	uint64_t t = 0;

//--------------------------------------------------------
//the matrix ready, start Gauss elimination. The Matrix is filled on the call of save_smooth_number()
//--------------------------------------------------------
	START_TIMER();
	gauss_elimination(&matrix);
	STOP_TIMER_PRINT_TIME("\nGauss elimination done");
	//print_matrix_matrix(&matrix);
	//print_matrix_identity(&matrix);

	uint64_t row_index = nb_qr_primes + NB_VECTORS_OFFSET - 1; /* last row in the matrix */
	int nb_linear_relations = 0;
	mpz_t linear_relation_z, solution_z;
	mpz_init(linear_relation_z);
	mpz_init(solution_z);

	get_matrix_row(linear_relation_z, &matrix, row_index--); /* get the last few rows in the Gauss eliminated matrix*/
	while (mpz_cmp_ui(linear_relation_z, 0) == 0) {
		nb_linear_relations++;
		get_matrix_row(linear_relation_z, &matrix, row_index--);
	}

	printf("\tLinear dependent relations found : %d\n", nb_linear_relations);

//--------------------------------------------------------
// Factor
//--------------------------------------------------------
	//We use the last linear relation to reconstruct our solution
	START_TIMER();
	printf("\nFactorizing..\n");
	mpz_t solution_X, solution_Y;
	mpz_init(solution_X);
	mpz_init(solution_Y);

	/* we start testing from the first linear relation encountered in the matrix */
	for (j = nb_linear_relations; j > 0; j--) {
		printf("Trying %d..\n", nb_linear_relations - j + 1);
		mpz_set_ui(solution_X, 1);
		mpz_set_ui(solution_Y, 1);

		get_identity_row(solution_z, &matrix,
				nb_qr_primes + NB_VECTORS_OFFSET - j + 1);

		for (i = 0; i < nb_qr_primes; i++) {
			if (mpz_tstbit(solution_z, i)) {
				mpz_mul(solution_X, solution_X, smooth_numbers[i].value_x);
				mpz_mod(solution_X, solution_X, N); /* reduce x to modulo N */

				mpz_mul(solution_Y, solution_Y,
						smooth_numbers[i].value_x_squared);
				/*TODO: handling huge stuff here, there is no modulo N like in the solution_X case!
				 * eliminate squares as long as you go*/
			}
		}

		mpz_sqrt(solution_Y, solution_Y);
		mpz_mod(solution_Y, solution_Y, N); /* y = sqrt(MUL(xi²-n)) mod N */

		mpz_sub(solution_X, solution_X, solution_Y);

		mpz_gcd(solution_X, solution_X, N);

		if (mpz_cmp(solution_X, N) != 0 && mpz_cmp_ui(solution_X, 1) != 0) /* factor can be 1 or N, try another relation */
			break;
	}
	mpz_cdiv_q(solution_Y, N, solution_X);

	printf("\n>>>>>>>>>>> FACTORED %s =\n", mpz_get_str(NULL, 10, N));
	printf("\tFactor 1: %s \n\tFactor 2: %s", mpz_get_str(NULL, 10, solution_X),
			mpz_get_str(NULL, 10, solution_Y));

	/*sprintf(s, "\n>>>>>>>>>>> FACTORED %s =\n", mpz_get_str(NULL, 10, N));
	 APPEND_TO_LOG_FILE(s);
	 sprintf(s, "\tFactor 1: %s \n\tFactor 2: %s", mpz_get_str(NULL, 10, solution_X), mpz_get_str(NULL, 10, solution_Y));
	 APPEND_TO_LOG_FILE(s);

	 gettimeofday(&end_global, NULL);
	 timersub(&end_global, &start_global, &elapsed);
	 sprintf(s, "****** TOTAL TIME: %.3f ms\n", elapsed.tv_sec * 1000 + elapsed.tv_usec / (double) 1000);
	 APPEND_TO_LOG_FILE(s);*/

	STOP_TIMER_PRINT_TIME("\nFactorizing done");

	printf("Cleaning memory..\n");

	/********************** clear the x_squared array **********************/
	for (i = 0; i < SIEVING_STEP; i++) {
		mpz_clear(x_squared[i].value_x);
		mpz_clear(x_squared[i].value_x_squared);
		//free(x_squared[i].factors_exp);
		mpz_clear(x_squared[i].factors_vect);
	}
	free(x_squared);
	/********************** clear the x_squared array **********************/

	free(modular_roots);
	/********************** clear the smooth_numbers array **********************/
	for (i = 0; i < nb_qr_primes + NB_VECTORS_OFFSET; i++) {
		mpz_clear(smooth_numbers[i].value_x);
		mpz_clear(smooth_numbers[i].value_x_squared);
		//free(smooth_numbers[i].factors_exp);
	}
	free(smooth_numbers);
	/********************** clear the smooth_numbers array **********************/

	free(primes);
	/********************** clear mpz _t **********************/mpz_clear(B);
	mpz_clear(N);
	sqrtN, rem;
	mpz_clear(x);
	mpz_clear(sieving_index);
	mpz_clear(next_sieving_index);
	mpz_clear(p);
	mpz_clear(str);
	/********************** clear mpz _t **********************/

	free_matrix(&matrix);

	gettimeofday(&end_global, NULL);
	timersub(&end_global, &start_global, &elapsed);
	printf("****** TOTAL TIME: %.3f ms\n",
			elapsed.tv_sec * 1000 + elapsed.tv_usec / (double) 1000);
	show_mem_usage();
	return 0;
}
Ejemplo n.º 18
0
/*************************************************************************
 * Build a completely connected HMM.
 *************************************************************************/
void build_complete_hmm
  (ARRAY_T* background,
   int spacer_states, 
   MOTIF_T *motifs,
   int nmotifs,
   MATRIX_T *transp_freq,
   MATRIX_T *spacer_ave,
   BOOLEAN_T fim,
   MHMM_T **the_hmm)
{
  ALPH_T    alph;
  int motif_states; // Total length of the motifs.
  int num_spacers;  // Total number of spacer states.
  int num_states;   // Total number of states in the model.
  int i_motif;      // Index of the current "from" motif.
  int j_motif;      // Index of the current "to" motif.
  int i_position;   // Index within the current motif or spacer.
  int i_state = 0;  // Index of the current state.

  assert(nmotifs > 0);
  alph = get_motif_alph(motifs);// get the alphabet from the first motif

  // Count the width of the motifs.
  for (motif_states = 0, i_motif = 0; i_motif < nmotifs; i_motif++)
    motif_states += get_motif_length(motif_at(motifs, i_motif));
  // Count the spacer states adjacent to begin and end.
  num_spacers = nmotifs * 2;
  // Add the spacer states between motifs.
  num_spacers += nmotifs * nmotifs;
  // Total states = motifs + spacer_states + begin/end
  num_states = motif_states + (num_spacers * spacer_states) + 2;

  // Allocate the model.
  *the_hmm = allocate_mhmm(alph, num_states);

  // Record that this is a completely connected model.
  (*the_hmm)->type = COMPLETE_HMM;

  // Record the number of motifs in the model.
  (*the_hmm)->num_motifs = nmotifs;

  // Record the number of states in the model.
  (*the_hmm)->num_states = num_states;
  (*the_hmm)->num_spacers = ((nmotifs + 1) * (nmotifs + 1)) - 1;
  (*the_hmm)->spacer_states = spacer_states;

  // Put the background distribution into the model.
  copy_array(background, (*the_hmm)->background);

  // Build the begin state.
  build_complete_state(
      START_STATE, 
      i_state,
      alph,
      0, // expected length
      NULL, // Emissions.
      0, // Number of sites.
      NON_MOTIF_INDEX,
      NON_MOTIF_POSITION,
      nmotifs,
      0, // previous motif
      0, // next motif
      transp_freq,
      spacer_states,
      num_spacers,
      motifs,
      &((*the_hmm)->states[i_state]));
  i_state++;

  int from_motif_state, to_motif_state;
  // Build the spacer states. No transitions from the end state.
  for (i_motif = 0; i_motif <= nmotifs; i_motif++) {
    // No transitions to the start state.
    for (j_motif = 1; j_motif <= nmotifs+1; j_motif++) {
      // No transitions from start to end.
      if ((i_motif == 0) && (j_motif == nmotifs+1))
        continue;
      // Allow multi-state spacers.
      for (i_position = 0; i_position < spacer_states; i_position++, i_state++) {
        build_complete_state(
            SPACER_STATE, 
            i_state, 
            alph,
            get_matrix_cell(i_motif, j_motif, spacer_ave),
            background,
            SPACER_NUMSITES,
            NON_MOTIF_INDEX,
            i_position,
            nmotifs,
            i_motif,
            j_motif,
            transp_freq,
            spacer_states,
            num_spacers,
            motifs,
            &((*the_hmm)->states[i_state]));
      }
    }
  }

  // Build the motif states.
  for (i_motif = 0; i_motif < nmotifs; i_motif++) {
    MOTIF_T *this_motif = motif_at(motifs, i_motif);
    STATE_T state;
    for (i_position = 0; i_position < get_motif_length(this_motif); i_position++, i_state++) {
      if (i_position == 0) {
        state = START_MOTIF_STATE;
      } else if (i_position == (get_motif_length(this_motif) - 1)) {
        state = END_MOTIF_STATE;
      } else {
        state = MID_MOTIF_STATE;
      }
      build_complete_state(
          MID_MOTIF_STATE, 
          i_state,
          alph,
          0, // Expected spacer length. 
          get_matrix_row(i_position, get_motif_freqs(this_motif)),
          get_motif_nsites(this_motif),
          i_motif,
          i_position, 
          nmotifs,
          0, // Previous motif index.
          0, // Next motif index.
          transp_freq,
          spacer_states,
          num_spacers,
          motifs,
          &((*the_hmm)->states[i_state]));
    }
  }

  // Build the end state.
  build_complete_state(
      END_STATE, 
      i_state,
      alph,
      0, // Expected spacer length.
      NULL, // Emissions
      0, // Number of sites.
      NON_MOTIF_INDEX,
      NON_MOTIF_POSITION,
      nmotifs,
      0, // Previous motif index.
      0, // Next motif index.
      transp_freq,
      spacer_states,
      num_spacers,
      motifs,
      &((*the_hmm)->states[i_state]));
  i_state++;

  // Convert spacers to FIMs if requested.
  if (fim) {
    convert_to_fims(*the_hmm);
  }

  // Fill in the transition matrix.
  build_transition_matrix(*the_hmm);
}
Ejemplo n.º 19
0
/**************************************************************************
*
*	get_pdf_table
*
*	Compute the pdf of a pssm.
*
*	Returns an array of pdf values:
*		pdf[x] = Pr(score == x)
*	for 0<=x<=range*w.
*
*	Assumes:
*		1) motif scores are non-negative, integral
*		2) background model is position-dependent, 0-order Markov
*
**************************************************************************/
static ARRAY_T* get_pdf_table(
  PSSM_T* pssm,				          // The PSSM.
  MATRIX_T* background_matrix,  // Background model PSSM matrix.
  ARRAY_T* scaled_lo_prior_dist // Scaled distribution of log odds priors.
)
{
  int i, j, k;
  MATRIX_T* matrix = pssm->matrix;// The PSSM matrix.
  int w = pssm->w;// PSSM width.
  int alen = pssm->alphsize;
  if (alen == alph_size(pssm->alph, ALL_SIZE)) { 
    // CONSIDER We need to review how ambiguity characters are used 
    // in probability calculation throughout the code.
    // We don't want to include the background probabilities for ambiguity 
    // characters in this calculation, only the primary charcters of the alphabet.
    // However, Motiph the motiph 'alphabet' actually includes all possible
    // columns of the mutiple alignment, so we skip this clause if 
    // pssm->alphsize > get_alph_size(ALL_SIZE)
    alen = alph_size(pssm->alph, ALPH_SIZE);
  }
  int range = pssm->range;// Maximum score in PSSM.
  int size = w*range+1;
  if (scaled_lo_prior_dist != NULL) {
    // Having priors expands the range of possible scores
    size += range;
  }
  ARRAY_T* pdf_old = allocate_array(size);
  ARRAY_T* pdf_new = allocate_array(size);

  init_array(0, pdf_new);
  set_array_item(0, 1, pdf_new);		// Prob(0)

  if (scaled_lo_prior_dist != NULL) {
    // Use distribution of log odds priors to 
    // initialize starting probabilities.
    for (k=0; k<=range; k++) {
      double prob = get_array_item(k, scaled_lo_prior_dist);
      set_array_item(k, prob, pdf_new);
    }
  }

  // Compute the pdf recursively.
  for (i=0; i<w; i++) {
    int max;
    if (scaled_lo_prior_dist == NULL) {
      max = i * range;
    }
    else {
      // Having priors expands the range of possible scores
      max = (i + 1) * range;
    }
    // get position dependent background model
    ARRAY_T* background = get_matrix_row(i, background_matrix);
    SWAP(ARRAY_T*, pdf_new, pdf_old)
    for (k=0; k<=max+range; k++) {
      set_array_item(k, 0, pdf_new);
    }
    for (j=0; j<alen; j++) {
      int s = (int) get_matrix_cell(i, j, matrix);
      for(k=0; k<=max; k++) {
        double old = get_array_item(k, pdf_old);
        if (old != 0) {
          double new = get_array_item(k+s, pdf_new) +
            (old * get_array_item(j, background));
          set_array_item(k+s, new, pdf_new);
        } // old
      } // k
    } // j
  } // i
Ejemplo n.º 20
0
/*************************************************************************
 * Build a linear HMM.
 *************************************************************************/
void build_linear_hmm
  (ARRAY_T*  background,
   ORDER_T*  order_spacing,
   int       spacer_states, 
   RBTREE_T* motifs, // motifs with key as in order_spacing
   BOOLEAN_T fim,
   MHMM_T**  the_hmm)
{
  ALPH_T    alph;
  int       model_length; // Total number of states in the model.
  int       i_state;      // Index of the current state.
  int       i_order;      // Index within the order and spacing.
  int       i_position;   // Index within the current motif or spacer.
  int       motif_i;      // motif key in order spacing
  MOTIF_T  *motif;        // motif
  RBNODE_T *node;

  alph = get_motif_alph((MOTIF_T*)rbtree_value(rbtree_first(motifs)));

  // Calculate the total length of the model.
  model_length = 2; // start and end state
  for (i_order = 0; i_order < get_order_occurs(order_spacing); i_order++) {
    motif_i = get_order_motif(order_spacing, i_order);
    motif = (MOTIF_T*)rbtree_get(motifs, &motif_i);
    model_length += get_motif_length(motif);
  }
  model_length += (get_order_occurs(order_spacing) + 1) * spacer_states;


  // Allocate the model.
  *the_hmm = allocate_mhmm(alph, model_length);
  check_sq_matrix((*the_hmm)->trans, model_length);

  // Record that this is a linear model.
  (*the_hmm)->type = LINEAR_HMM;

  // Record the number of motifs in the model. 
  // It doesn't want the distinct count
  (*the_hmm)->num_motifs = get_order_occurs(order_spacing);

  // Record the number of states in the model.
  (*the_hmm)->num_states = model_length;
  (*the_hmm)->num_spacers = get_order_occurs(order_spacing) + 1;
  (*the_hmm)->spacer_states = spacer_states;

  // Put the background distribution into the model.
  copy_array(background, (*the_hmm)->background);

  // Begin the model with a non-emitting state.
  i_state = 0;
  check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states);
  build_linear_state(
      alph,
      START_STATE,
      i_state,
      get_spacer_length(order_spacing, 0),
      NULL, // Emissions.
      0, // Number of sites.
      NON_MOTIF_INDEX,
      NON_MOTIF_POSITION, // position within state (not relevant to start state)
      NULL, // no motif
      &((*the_hmm)->states[i_state]));
  ++i_state;

  // Build the first spacer.
  for (i_position = 0; i_position < spacer_states; i_position++, i_state++) {
    check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states);
    build_linear_state(
        alph,
        SPACER_STATE,
        i_state, 
        get_spacer_length(order_spacing, 0),
        background, 
        SPACER_NUMSITES,
        NON_MOTIF_INDEX,
        i_position, // position within spacer
        NULL, // no motif
        &((*the_hmm)->states[i_state]));
  }

  // Build each motif and subsequent spacer.
  for (i_order = 0; i_order < get_order_occurs(order_spacing); i_order++) {
    STATE_T state;
    int spacer_len;
    motif_i = get_order_motif(order_spacing, i_order);
    motif = (MOTIF_T*)rbtree_get(motifs, &motif_i);

    // Build the motif.
    for (i_position = 0; i_position < get_motif_length(motif); i_position++, i_state++) {
      if (i_position == 0) {
        state = START_MOTIF_STATE;
        spacer_len = get_spacer_length(order_spacing, i_order);
      } else if (i_position == (get_motif_length(motif) - 1)) {
        state = END_MOTIF_STATE;
        spacer_len = get_spacer_length(order_spacing, i_order+1);
      } else {
        state = MID_MOTIF_STATE;
        spacer_len = 0;
      }
      check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states);
      build_linear_state(
          alph, 
          state, 
          i_state, 
          spacer_len, // Expected spacer length.
          get_matrix_row(i_position, get_motif_freqs(motif)),
          get_motif_nsites(motif),
          i_order,
          i_position, // position within motif (middle)
          motif,
          &((*the_hmm)->states[i_state]));
    }

    // Build the following spacer.
    for (i_position = 0; i_position < spacer_states; i_position++, i_state++) {
      check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states);
      build_linear_state(
          alph, 
          SPACER_STATE, 
          i_state, 
          get_spacer_length(order_spacing, i_order+1),
          background,
          SPACER_NUMSITES,
          NON_MOTIF_INDEX, 
          i_position, // position within spacer
          NULL, // no motif
          &((*the_hmm)->states[i_state]));
    }
  }

  check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states);
  // Finish up the model with a non-emitting end state.
  build_linear_state(
      alph, 
      END_STATE, 
      i_state, 
      get_spacer_length(order_spacing, i_order),
      NULL, // Emissions.
      0, // Number of sites.
      NON_MOTIF_INDEX,
      NON_MOTIF_POSITION, // position within state (not relevant to end state)
      NULL, // no motif
      &((*the_hmm)->states[i_state]));
  ++i_state;
  assert(i_state == model_length);

  check_sq_matrix((*the_hmm)->trans, (*the_hmm)->num_states);
  // Convert spacers to FIMs if requested.
  if (fim) {
    convert_to_fims(*the_hmm);
  }

  // Fill in the transition matrix.
  build_transition_matrix(*the_hmm);
}
Ejemplo n.º 21
0
/***********************************************************************
 * Compute the number of positions from the start or end of a motif
 * that contain a given percentage of the information content.
 *
 * Information content is the same as relative entropy, and is computed
 * as
 *
 *  \sum_i p_i log(p_i/f_i)
 *
 ***********************************************************************/
int get_info_content_position
  (BOOLEAN_T from_start, // Count from start?  Otherwise, count from end.
   float     threshold,  // Information content threshold (in 0-100).
   ARRAY_T*  background, // Background distribution.
   MOTIF_T*  a_motif)
{
  // Make sure the given threshold is in the right range.
  if ((threshold < 0.0) || (threshold > 100.0)) {
    die(
      "Information threshold (%g) must be a percentage between 0 and 100.\n",
	    threshold
    );
  }

  // Get the dimensions of the motif.
  int num_cols = get_alph_size(ALPH_SIZE);
  int num_rows = get_num_rows(a_motif->freqs);

  // Compute and store the information content for each row
  // and the total information content for the motif.
  ATYPE total_information_content = 0.0;
  ARRAY_T* information_content = allocate_array(num_rows);
  int i_row;
  int i_col;
  for (i_row = 0; i_row < num_rows; i_row++) {
    ATYPE row_content = 0.0;
    ARRAY_T* this_emission = get_matrix_row(i_row, a_motif->freqs);
    for (i_col = 0; i_col < num_cols; i_col++) {
      ATYPE this_item = get_array_item(i_col, this_emission);
      ATYPE background_item = get_array_item(i_col, background);

      // Use two logs to avoid handling divide-by-zero as a special case.
      ATYPE partial_row_content = 
        this_item * (my_log(this_item) - my_log(background_item));

      row_content += partial_row_content;
      total_information_content += partial_row_content;

    }
    set_array_item(i_row, row_content, information_content);
  }

  // Search for the target position.
  int return_value = -1;
  ATYPE cumulative_content = 0.0;
  ATYPE percent = 0.0;
  if (from_start) {
    // Search from start for IC exceeding threshold.
    for (i_row = 0; i_row < num_rows; i_row++) {
      cumulative_content += get_array_item(i_row, information_content);
      percent = 100 *  cumulative_content / total_information_content;
      if (percent >= threshold) {
        return_value = i_row;
        break;
      }
    }
  }
  else {
    // Search from end for IC exceeding threshold.
    for (i_row = num_rows - 1; i_row >= 0; i_row--) {
      cumulative_content += get_array_item(i_row, information_content);
      percent = 100 *  cumulative_content / total_information_content;
      if (percent >= threshold) {
        return_value = i_row;
        break;
      }
    }
  }

  if (return_value == -1) {
    die(
      "Can't find a position that accounts for %g of information content.",
      threshold
    );
  }
  free_array(information_content);
  return(return_value);
}
Ejemplo n.º 22
0
/* assuming slaves (workers)) are all homogenous, let them all do the calculations
 regarding primes sieving, calculating the smoothness base and the modular roots */
int main(int argc, char **argv) {
	MPI_Init(&argc, &argv);
	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
	MPI_Comm_size(MPI_COMM_WORLD, &mpi_group_size);
	int len;
	MPI_Get_processor_name(processor_name, &len);

	gettimeofday(&start_global, NULL);
	print_lib_version();

	mpz_init(N);
	mpz_t B;
	mpz_init(B);

	unsigned long int uBase;
	int64_t nb_primes;
	modular_root_t *modular_roots;

	uint64_t i, j;

	if (argc < 2) {
		PRINT(my_rank, "usage: %s Number_to_factorize\n", argv[0]);
		exit(2);
	}

	if (mpz_init_set_str(N, argv[1], 10) == -1) {
		PRINT(my_rank, "Cannot load N %s\n", argv[1]);
		exit(2);
	}

	mpz_t sqrtN, rem;
	mpz_init(sqrtN);
	mpz_init(rem);
	mpz_sqrtrem(sqrtN, rem, N);

	if (mpz_cmp_ui(rem, 0) != 0) /* if not perfect square, calculate the ceiling */
		mpz_add_ui(sqrtN, sqrtN, 1);
	else /* N is a perfect square, factored! */
	{
		PRINT(my_rank, "\n<<<[FACTOR]>>> %s\n", mpz_get_str(NULL, 10, sqrtN));
		return 0;
	}

	if (mpz_probab_prime_p(N, 10) > 0) /* don't bother factoring */
	{
		PRINT(my_rank, "N:%s is prime\n", mpz_get_str(NULL, 10, N));
		exit(0);
	}

	OPEN_LOG_FILE("freq");

//--------------------------------------------------------
//  calculate the smoothness base for the given N
//--------------------------------------------------------
	get_smoothness_base(B, N); /* if N is too small, the program will surely fail, please consider a pen and paper instead */
	uBase = mpz_get_ui(B);
	PRINT(my_rank, "n: %s\tBase: %s\n",
			mpz_get_str(NULL, 10, N), mpz_get_str(NULL, 10, B));

//--------------------------------------------------------
// sieve primes that are less than the smoothness base using Eratosthenes sieve
//--------------------------------------------------------
	START_TIMER();
	nb_primes = sieve_primes_up_to((int64_t) (uBase));

	PRINT(my_rank, "\tPrimes found %" PRId64 " [Smoothness Base %lu]\n",
			nb_primes, uBase);
	STOP_TIMER_PRINT_TIME("\tEratosthenes Sieving done");

//--------------------------------------------------------
// fill the primes array with primes to which n is a quadratic residue
//--------------------------------------------------------
	START_TIMER();
	primes = calloc(nb_primes, sizeof(int64_t));
	nb_qr_primes = fill_primes_with_quadratic_residue(primes, N);

	/*for(i=0; i<nb_qr_primes; i++)
	 PRINT(my_rank, "%" PRId64 "\n", primes[i]);*/

	PRINT(my_rank, "\tN-Quadratic primes found %" PRId64 "\n", nb_qr_primes);
	STOP_TIMER_PRINT_TIME("\tQuadratic prime filtering done");

//--------------------------------------------------------
// calculate modular roots
//--------------------------------------------------------
	START_TIMER();
	modular_roots = calloc(nb_qr_primes, sizeof(modular_root_t));
	mpz_t tmp, r1, r2;
	mpz_init(tmp);
	mpz_init(r1);
	mpz_init(r2);

	for (i = 0; i < nb_qr_primes; i++) {
		mpz_set_ui(tmp, (unsigned long) primes[i]);
		mpz_sqrtm(r1, N, tmp); /* calculate the modular root */
		mpz_neg(r2, r1); /* -q mod n */
		mpz_mod(r2, r2, tmp);

		modular_roots[i].root1 = mpz_get_ui(r1);
		modular_roots[i].root2 = mpz_get_ui(r2);
	}
	mpz_clear(tmp);
	mpz_clear(r1);
	mpz_clear(r2);
	STOP_TIMER_PRINT_TIME("Modular roots calculation done");

//--------------------------------------------------------
//         ***** initialize the matrix *****
//--------------------------------------------------------
	if (my_rank == 0) /* only the master have the matrix */
	{
		START_TIMER();
		init_matrix(&matrix, nb_qr_primes + NB_VECTORS_OFFSET, nb_qr_primes);
		mpz_init2(tmp_matrix_row, nb_qr_primes);
		STOP_TIMER_PRINT_TIME("Matrix initialized");
	}

//--------------------------------------------------------
// [Sieving] - everyones sieves including the master
//--------------------------------------------------------
	START_TIMER();

	mpz_t x, sieving_index, next_sieving_index, relative_start, global_step;
	unsigned long ui_index, SIEVING_STEP = 50000; /* we sieve for 50000 elements at each loop */
	int LOCAL_SIEVING_ROUNDS = 10; /* number of iterations a worker sieves before communicating results to the master */
	unsigned long sieving_round = 0;
	unsigned long nb_big_rounds = 0;

	uint64_t p_pow;
	smooth_number_t *x_squared;

	x_squared = calloc(SIEVING_STEP, sizeof(smooth_number_t));

	if (my_rank == 0)
		smooth_numbers = calloc(nb_qr_primes + NB_VECTORS_OFFSET,
				sizeof(smooth_number_t));
	else
		temp_slaves_smooth_numbers = calloc(500, sizeof(smooth_number_t));
	/* TODO: this is not properly correct, using a linkedlist is better to keep track of temporary
	 * smooth numbers at the slaves nodes however it's pretty rare to find 500 smooth numbers in
	 * 50000 * 10 interval. */

	mpz_init_set(x, sqrtN);
	mpz_init(global_step);
	mpz_init(relative_start);
	mpz_init(sieving_index);
	mpz_init(next_sieving_index);

	mpz_t p;
	mpz_init(p);
	mpz_t str;
	mpz_init_set(str, sieving_index);
	PRINT(my_rank, "\n[%s] Sieving ...\n", processor_name);

//--------------------------------------------------------
// Init before sieving
//--------------------------------------------------------
	for (i = 0; i < SIEVING_STEP; i++) {
		mpz_init(x_squared[i].value_x);
		mpz_init(x_squared[i].value_x_squared);

		mpz_init2(x_squared[i].factors_vect, nb_qr_primes);
		mpz_add_ui(x, x, 1);
	}

	int nb_smooth_per_round = 0;
	char s[512];

//--------------------------------------------------------
// WHILE smooth numbers found less than the primes in the smooth base + NB_VECTORS_OFFSET for master
// Or master asked for more smooth numbers from slaves
//--------------------------------------------------------
	while (1) {
		mpz_set_ui(global_step, nb_big_rounds); /* calculates the coordinate where the workers start sieving from */
		mpz_mul_ui(global_step, global_step, (unsigned long) mpi_group_size);
		mpz_mul_ui(global_step, global_step, SIEVING_STEP);
		mpz_mul_ui(global_step, global_step, LOCAL_SIEVING_ROUNDS);
		mpz_add(global_step, global_step, sqrtN);

		mpz_set_ui(relative_start, SIEVING_STEP);
		mpz_mul_ui(relative_start, relative_start, LOCAL_SIEVING_ROUNDS);
		mpz_mul_ui(relative_start, relative_start, (unsigned long) my_rank);
		mpz_add(relative_start, relative_start, global_step);

		mpz_set(sieving_index, relative_start);
		mpz_set(next_sieving_index, relative_start);

		for (sieving_round = 0; sieving_round < LOCAL_SIEVING_ROUNDS; /* each slave sieves for LOCAL_SIEVING_ROUNDS rounds */
		sieving_round++) {
			nb_smooth_per_round = 0;
			mpz_set(x, next_sieving_index); /* sieve numbers from sieving_index to sieving_index + sieving_step */
			mpz_set(sieving_index, next_sieving_index);

			if (my_rank == 0) {
				printf("\r");
				printf(
						"\t\tSieving at: %s30 <--> Smooth numbers found: %" PRId64 "/%" PRId64 "",
						mpz_get_str(NULL, 10, sieving_index),
						nb_global_smooth_numbers_found, nb_qr_primes);
				fflush(stdout);
			}

			for (i = 0; i < SIEVING_STEP; i++) {
				mpz_set(x_squared[i].value_x, x);

				mpz_pow_ui(x_squared[i].value_x_squared, x, 2); /* calculate value_x_squared <- x²-n */
				mpz_sub(x_squared[i].value_x_squared,
						x_squared[i].value_x_squared, N);

				mpz_clear(x_squared[i].factors_vect);
				mpz_init2(x_squared[i].factors_vect, nb_qr_primes); /* reconstruct a new fresh 0ed vector of size nb_qr_primes bits */

				mpz_add_ui(x, x, 1);
			}
			mpz_set(next_sieving_index, x);

//--------------------------------------------------------
// eliminate factors in the x_squared array, those who are 'destructed' to 1 are smooth
//--------------------------------------------------------
			for (i = 0; i < nb_qr_primes; i++) {
				mpz_set_ui(p, (unsigned long) primes[i]);
				mpz_set(x, sieving_index);

				/* get the first multiple of p that is directly larger that sieving_index
				 * Quadratic SIEVING: all elements from this number and in positions multiples of root1 and root2
				 * are also multiples of p */
				get_sieving_start_index(x, x, p, modular_roots[i].root1);
				mpz_set(str, x);
				mpz_sub(x, x, sieving_index); /* x contains index of first number that is divisible by p */

				for (j = mpz_get_ui(x); j < SIEVING_STEP; j += primes[i]) {
					p_pow = mpz_remove(x_squared[j].value_x_squared,
							x_squared[j].value_x_squared, p); /* eliminate all factors of p */

					if (p_pow & 1) /* mark bit if odd power of p exists in this x_squared[j] */
					{
						mpz_setbit(x_squared[j].factors_vect, i);
					}

					if (mpz_cmp_ui(x_squared[j].value_x_squared, 1) == 0) {
						save_smooth_number(x_squared[j]);
						nb_smooth_per_round++;
					}
					/* sieve next element located p steps from here */
				}

				/* same goes for root2 */
				if (modular_roots[i].root2 == modular_roots[i].root1)
					continue;

				mpz_set(x, sieving_index);

				get_sieving_start_index(x, x, p, modular_roots[i].root2);
				mpz_set(str, x);
				mpz_sub(x, x, sieving_index);

				for (j = mpz_get_ui(x); j < SIEVING_STEP; j += primes[i]) {
					p_pow = mpz_remove(x_squared[j].value_x_squared,
							x_squared[j].value_x_squared, p);

					if (p_pow & 1) {
						mpz_setbit(x_squared[j].factors_vect, i);
					}

					if (mpz_cmp_ui(x_squared[j].value_x_squared, 1) == 0) {
						save_smooth_number(x_squared[j]);
						nb_smooth_per_round++;
					}
				}
			}
		}

		if (my_rank == 0) /* master gathers smooth numbers from slaves */
		{
			gather_smooth_numbers();
			notify_slaves();
		} else /* slaves send their smooth numbers to master */
		{
			send_smooth_numbers_to_master();
			nb_global_smooth_numbers_found = get_server_notification();
		}

		if (nb_global_smooth_numbers_found >= nb_qr_primes + NB_VECTORS_OFFSET)
			break;

		nb_big_rounds++;
	}

	STOP_TIMER_PRINT_TIME("\nSieving DONE");

	if (my_rank == 0) {
		uint64_t t = 0;

//--------------------------------------------------------
//the matrix ready, start Gauss elimination. The Matrix is filled on the call of save_smooth_number()
//--------------------------------------------------------
		START_TIMER();
		gauss_elimination(&matrix);
		STOP_TIMER_PRINT_TIME("\nGauss elimination done");

		uint64_t row_index = nb_qr_primes + NB_VECTORS_OFFSET - 1; /* last row in the matrix */
		int nb_linear_relations = 0;
		mpz_t linear_relation_z, solution_z;
		mpz_init(linear_relation_z);
		mpz_init(solution_z);

		get_matrix_row(linear_relation_z, &matrix, row_index--); /* get the last few rows in the Gauss eliminated matrix*/
		while (mpz_cmp_ui(linear_relation_z, 0) == 0) {
			nb_linear_relations++;
			get_matrix_row(linear_relation_z, &matrix, row_index--);
		}

		PRINT(my_rank, "\tLinear dependent relations found : %d\n",
				nb_linear_relations);

//--------------------------------------------------------
// Factor
//--------------------------------------------------------
		//We use the last linear relation to reconstruct our solution
		START_TIMER();
		PRINT(my_rank, "%s", "\nFactorizing..\n");
		mpz_t solution_X, solution_Y;
		mpz_init(solution_X);
		mpz_init(solution_Y);

		/* we start testing from the first linear relation encountered in the matrix */
		for (j = nb_linear_relations; j > 0; j--) {
			PRINT(my_rank, "Trying %d..\n", nb_linear_relations - j + 1);
			mpz_set_ui(solution_X, 1);
			mpz_set_ui(solution_Y, 1);

			get_identity_row(solution_z, &matrix,
					nb_qr_primes + NB_VECTORS_OFFSET - j + 1);

			for (i = 0; i < nb_qr_primes; i++) {
				if (mpz_tstbit(solution_z, i)) {
					mpz_mul(solution_X, solution_X, smooth_numbers[i].value_x);
					mpz_mod(solution_X, solution_X, N); /* reduce x to modulo N */

					mpz_mul(solution_Y, solution_Y,
							smooth_numbers[i].value_x_squared);
					/*TODO: handling huge stuff here, there is no modulo N like in the solution_X case!
					 * eliminate squares as long as you go*/
				}
			}

			mpz_sqrt(solution_Y, solution_Y);
			mpz_mod(solution_Y, solution_Y, N); /* y = sqrt(MUL(xi²-n)) mod N */

			mpz_sub(solution_X, solution_X, solution_Y);

			mpz_gcd(solution_X, solution_X, N);

			if (mpz_cmp(solution_X, N) != 0 && mpz_cmp_ui(solution_X, 1) != 0) /* factor can be 1 or N, try another relation */
				break;
		}
		mpz_cdiv_q(solution_Y, N, solution_X);

		PRINT(my_rank, "\n>>>>>>>>>>> FACTORED %s =\n",
				mpz_get_str(NULL, 10, N));
		PRINT(
				my_rank,
				"\tFactor 1: %s \n\tFactor 2: %s",
				mpz_get_str(NULL, 10, solution_X), mpz_get_str(NULL, 10, solution_Y));

		sprintf(s, "\n>>>>>>>>>>> FACTORED %s =\n", mpz_get_str(NULL, 10, N));
		APPEND_TO_LOG_FILE(s);
		sprintf(s, "\tFactor 1: %s \n\tFactor 2: %s",
				mpz_get_str(NULL, 10, solution_X),
				mpz_get_str(NULL, 10, solution_Y));
		APPEND_TO_LOG_FILE(s);

		gettimeofday(&end_global, NULL);
		timersub(&end_global, &start_global, &elapsed);
		sprintf(s, "****** TOTAL TIME: %.3f ms\n",
				elapsed.tv_sec * 1000 + elapsed.tv_usec / (double) 1000);
		APPEND_TO_LOG_FILE(s);

		STOP_TIMER_PRINT_TIME("\nFactorizing done");
	}

	PRINT(my_rank, "%s", "\nCleaning memory..\n");

	/********************** clear the x_squared array **********************/
	for (i = 0; i < SIEVING_STEP; i++) {
		mpz_clear(x_squared[i].value_x);
		mpz_clear(x_squared[i].value_x_squared);
		//free(x_squared[i].factors_exp);
		mpz_clear(x_squared[i].factors_vect);
	}
	free(x_squared);
	/********************** clear the x_squared array **********************/

	free(modular_roots);
	/********************** clear the smooth_numbers array **********************/
	if (my_rank == 0) {
		for (i = 0; i < nb_qr_primes + NB_VECTORS_OFFSET; i++) {
			mpz_clear(smooth_numbers[i].value_x);
			mpz_clear(smooth_numbers[i].value_x_squared);
			mpz_clear(smooth_numbers[i].factors_vect);
			//free(smooth_numbers[i].factors_exp);
		}
		free(smooth_numbers);
	} else {
		for (i = 0; i < 500; i++) {
			mpz_clear(temp_slaves_smooth_numbers[i].value_x);
			mpz_clear(temp_slaves_smooth_numbers[i].value_x_squared);
			mpz_clear(temp_slaves_smooth_numbers[i].factors_vect);
		}
		free(temp_slaves_smooth_numbers);
	}
	/********************** clear the smooth_numbers array **********************/

	free(primes);
	/********************** clear mpz _t **********************/mpz_clear(B);
	mpz_clear(N);
	sqrtN, rem;
	mpz_clear(x);
	mpz_clear(sieving_index);
	mpz_clear(next_sieving_index);
	mpz_clear(p);
	mpz_clear(str);
	/********************** clear mpz _t **********************/

	free_matrix(&matrix);

	gettimeofday(&end_global, NULL);
	timersub(&end_global, &start_global, &elapsed);
	PRINT(my_rank, "****** TOTAL TIME: %.3f ms\n",
			elapsed.tv_sec * 1000 + elapsed.tv_usec / (double) 1000);
	show_mem_usage();

	MPI_Finalize();

	return 0;
}
Ejemplo n.º 23
0
/*************************************************************************
 * Build a star topology HMM.
 *************************************************************************/
void build_star_hmm
  (ARRAY_T*  background,
   int       spacer_states, 
   MOTIF_T*  motifs,
   int       nmotifs,
   BOOLEAN_T fim,
   MHMM_T**  the_hmm)
{
  ALPH_T alph;
  int       motif_states; /* Total length of the motifs. */
  int       num_spacers;  /* Total number of spacer states. */
  int       num_states;   /* Total number of states in the model. */
  int       i_motif;      /* Index of the current "from" motif. */
  int       i_position;   /* Index within the current motif or spacer. */
  int       i_state = 0;  /* Index of the current state. */

  alph = get_motif_alph(motif_at(motifs, 0));

  /* Count the width of the motifs. */
  for (motif_states = 0, i_motif = 0; i_motif < nmotifs; i_motif++)
    motif_states += get_motif_length(motif_at(motifs, i_motif));
  // Only 1 spacer.
  num_spacers = 1;
  /* Total states = motifs + spacer_states + begin/end */
  num_states = motif_states + (num_spacers * spacer_states) + 2;
  /* fprintf(stderr, "motif_states=%d num_spacers=%d num_states=%d\n",
	  motif_states, num_spacers, num_states); */

  /* Allocate the model. */
  *the_hmm = allocate_mhmm(alph, num_states);

  /* Record that this is a star model. */
  (*the_hmm)->type = STAR_HMM;

  /* Record the number of motifs in the model. */
  (*the_hmm)->num_motifs = nmotifs;

  /* Record the number of states in the model. */
  (*the_hmm)->num_states = num_states;
  (*the_hmm)->num_spacers = 1;
  (*the_hmm)->spacer_states = spacer_states;

  // Put the background distribution into the model.
  copy_array(background, (*the_hmm)->background);

  /* Build the begin state. */
  build_star_state(
    alph,
    START_STATE,
		i_state,
		0, // expected length
		NULL,
		0, // Number of sites.
		NON_MOTIF_INDEX,
		NON_MOTIF_POSITION,
		nmotifs,
		spacer_states,
		motifs,
		&((*the_hmm)->states[i_state])
  );
  i_state++;

  // Build the spacer state (state 0).  Allow multi-state spacers.
  for (i_position = 0; i_position < spacer_states; i_position++) {
    build_star_state(
      alph,
      SPACER_STATE, 
		  i_state, 
		  DEFAULT_SPACER_LENGTH,
		  background,
		  SPACER_NUMSITES,
		  NON_MOTIF_INDEX,
		  i_position,
		  nmotifs,
		  spacer_states,
		  motifs,
		  &((*the_hmm)->states[i_state])
    );
    i_state++;
  }

  /* Build the motif states. */
  for (i_motif = 0; i_motif < nmotifs; i_motif++) {
    MOTIF_T *this_motif = motif_at(motifs, i_motif);
    assert(get_motif_length(this_motif) > 1);
    i_position = 0;
    build_star_state(
      alph,
      START_MOTIF_STATE, 
		  i_state,
		  0, // Expected spacer length.
		  get_matrix_row(i_position, get_motif_freqs(this_motif)),
		  get_motif_nsites(this_motif),
		  i_motif,
		  i_position,
		  nmotifs,
		  spacer_states,
		  motifs,
		  &((*the_hmm)->states[i_state])
    );
    i_state++;
    for (i_position = 1; i_position < get_motif_length(this_motif) - 1; i_position++) {
      build_star_state(
        alph,
        MID_MOTIF_STATE, 
		    i_state,
		    0, // Expected spacer length. 
		    get_matrix_row(i_position, get_motif_freqs(this_motif)),
		    get_motif_nsites(this_motif),
		    i_motif,
		    i_position, 
		    nmotifs,
		    spacer_states,
		    motifs,
		    &((*the_hmm)->states[i_state])
      );
      i_state++;
    }
    build_star_state(
      alph,
      END_MOTIF_STATE, 
		  i_state,
		  0, // Expected spacer length.
		  get_matrix_row(i_position, get_motif_freqs(this_motif)),
		  get_motif_nsites(this_motif),
		  i_motif,
		  i_position,
		  nmotifs,
		  spacer_states,
		  motifs,
		  &((*the_hmm)->states[i_state])
    );
    i_state++;
  }

  /* Build the end state. */
  build_star_state(
    alph,
    END_STATE, 
		i_state,
		0, // Expected spacer length.
		NULL, // Emissions
		0, // Number of sites.
		NON_MOTIF_INDEX,
		NON_MOTIF_POSITION,
		nmotifs,
		spacer_states,
		motifs,
		&((*the_hmm)->states[i_state])
  );
  i_state++;

  /* Convert spacers to FIMs if requested. */
  if (fim) {
    convert_to_fims(*the_hmm);
  }

  /* Fill in the transition matrix. */
  build_transition_matrix(*the_hmm);
} // build_star_hmm
Ejemplo n.º 24
0
void read_regexp_file(
   char*      filename,          // Name of MEME file  IN
   int*       num_motifs,             // Number of motifs retrieved  OUT
   MOTIF_T*   motifs                 // The retrieved motifs - NOT ALLOCATED!
) {
	FILE*      motif_file;         // MEME file containing the motifs.
	char motif_name[MAX_MOTIF_ID_LENGTH+1];
	char motif_regexp[MAX_MOTIF_WIDTH];
	ARRAY_T* these_freqs;
	MOTIF_T* m;
	int i;

	//Set things to the defaults.
	*num_motifs = 0;

	// Open the given MEME file.
	if (open_file(filename, "r", TRUE, "motif", "motifs", &motif_file) == 0)
		exit(1);

	//Set alphabet - ONLY supports dna.
	set_alphabet(verbosity, "ACGT");

	while (fscanf(motif_file, "%s\t%s", motif_name, motif_regexp) == 2) {
		/*
		 * Now we:
		 * 1. Fill in new motif (preallocated)
		 * 2. Assign name
		 * 3. Convert regexp into frequency table.
		 */

		m = &(motifs[*num_motifs]);
		set_motif_id(motif_name, m);
		m->length = strlen(motif_regexp);
		/* Store the alphabet size in the motif. */
		m->alph_size = get_alph_size(ALPH_SIZE);
		m->ambigs = get_alph_size(AMBIG_SIZE);
		/* Allocate memory for the matrix. */
		m->freqs = allocate_matrix(m->length, get_alph_size(ALL_SIZE));

		//Set motif frequencies here.
		for (i=0;i<strlen(motif_regexp);i++) {
			switch(toupper(motif_regexp[i])) {
			case 'A':
				set_matrix_cell(i,alphabet_index('A',get_alphabet(TRUE)),1,m->freqs);
				break;
			case 'C':
				set_matrix_cell(i,alphabet_index('C',get_alphabet(TRUE)),1,m->freqs);
				break;
			case 'G':
				set_matrix_cell(i,alphabet_index('G',get_alphabet(TRUE)),1,m->freqs);
				break;
			case 'T':
				set_matrix_cell(i,alphabet_index('T',get_alphabet(TRUE)),1,m->freqs);
				break;
			case 'U':
				set_matrix_cell(i,alphabet_index('U',get_alphabet(TRUE)),1,m->freqs);
				break;
			case 'R': //purines
				set_matrix_cell(i,alphabet_index('G',get_alphabet(TRUE)),1,m->freqs);
				set_matrix_cell(i,alphabet_index('A',get_alphabet(TRUE)),1,m->freqs);
				break;
			case 'Y': //pyramidines
				set_matrix_cell(i,alphabet_index('T',get_alphabet(TRUE)),1,m->freqs);
				set_matrix_cell(i,alphabet_index('C',get_alphabet(TRUE)),1,m->freqs);
				break;
			case 'K': //keto
				set_matrix_cell(i,alphabet_index('G',get_alphabet(TRUE)),1,m->freqs);
				set_matrix_cell(i,alphabet_index('T',get_alphabet(TRUE)),1,m->freqs);
				break;
			case 'M': //amino
				set_matrix_cell(i,alphabet_index('A',get_alphabet(TRUE)),1,m->freqs);
				set_matrix_cell(i,alphabet_index('C',get_alphabet(TRUE)),1,m->freqs);
				break;
			case 'S': //strong
				set_matrix_cell(i,alphabet_index('G',get_alphabet(TRUE)),1,m->freqs);
				set_matrix_cell(i,alphabet_index('C',get_alphabet(TRUE)),1,m->freqs);
				break;
			case 'W': //weak
				set_matrix_cell(i,alphabet_index('A',get_alphabet(TRUE)),1,m->freqs);
				set_matrix_cell(i,alphabet_index('T',get_alphabet(TRUE)),1,m->freqs);
				break;
			case 'B':
				set_matrix_cell(i,alphabet_index('G',get_alphabet(TRUE)),1,m->freqs);
				set_matrix_cell(i,alphabet_index('T',get_alphabet(TRUE)),1,m->freqs);
				set_matrix_cell(i,alphabet_index('C',get_alphabet(TRUE)),1,m->freqs);
				break;
			case 'D':
				set_matrix_cell(i,alphabet_index('G',get_alphabet(TRUE)),1,m->freqs);
				set_matrix_cell(i,alphabet_index('A',get_alphabet(TRUE)),1,m->freqs);
				set_matrix_cell(i,alphabet_index('T',get_alphabet(TRUE)),1,m->freqs);
				break;
			case 'H':
				set_matrix_cell(i,alphabet_index('A',get_alphabet(TRUE)),1,m->freqs);
				set_matrix_cell(i,alphabet_index('C',get_alphabet(TRUE)),1,m->freqs);
				set_matrix_cell(i,alphabet_index('T',get_alphabet(TRUE)),1,m->freqs);
				break;
			case 'V':
				set_matrix_cell(i,alphabet_index('G',get_alphabet(TRUE)),1,m->freqs);
				set_matrix_cell(i,alphabet_index('C',get_alphabet(TRUE)),1,m->freqs);
				set_matrix_cell(i,alphabet_index('A',get_alphabet(TRUE)),1,m->freqs);
				break;
			case 'N':
				set_matrix_cell(i,alphabet_index('A',get_alphabet(TRUE)),1,m->freqs);
				set_matrix_cell(i,alphabet_index('C',get_alphabet(TRUE)),1,m->freqs);
				set_matrix_cell(i,alphabet_index('G',get_alphabet(TRUE)),1,m->freqs);
				set_matrix_cell(i,alphabet_index('T',get_alphabet(TRUE)),1,m->freqs);
				break;
			}
		}

	    /* Compute values for ambiguous characters. */
		for (i = 0; i < m->length; i++) {
		    these_freqs = get_matrix_row(i, m->freqs);
		    fill_in_ambiguous_chars(FALSE, these_freqs);
		}

		/* Compute and store the motif complexity. */
		m->complexity = compute_motif_complexity(m);

		//Move our pointer along to do the next motif.
		(*num_motifs)++;
	}
}