Exemplo n.º 1
0
static MOTIF_T* post_process_motif(MREAD_T *mread, MOTIF_T *motif) {
  ARRAY_T *bg;
  int site_count;
  if (motif == NULL) return NULL;
  assert(motif->alph != INVALID_ALPH);
  if (motif->freqs) normalize_motif(motif, 0.00001);
  site_count = (motif->num_sites > 0 ? motif->num_sites : DEFAULT_SITES);
  if (motif->freqs != NULL && motif->scores != NULL) {
    // validate? May not be possible for protein motifs as MEME tweaks the PSSM
  } else if (motif->scores != NULL) {
    // calculate the freqs
    motif->freqs = convert_scores_into_freqs(motif->alph, motif->scores, 
        mread->motif_bg, site_count, PSEUDO);
  } else if (motif->freqs != NULL) {
    // calculate the scores
    motif->scores = convert_freqs_into_scores(motif->alph, motif->freqs,
        mread->motif_bg, site_count, PSEUDO);
  } else {
    die("Motif with no PSPM or PSSM should not get here!\n");
  }

  apply_pseudocount_to_motif(motif, mread->pseudo_bg, mread->pseudo_total);
  motif->complexity = compute_motif_complexity(motif);
  if (mread->options & CALC_AMBIGS) calc_motif_ambigs(motif);
  if (mread->trim) trim_motif_by_bit_threshold(motif, mread->trim_bits);
  return motif;
}
Exemplo n.º 2
0
void read_regexp_file(
   char*      filename,          // Name of MEME file  IN
   int*       num_motifs,             // Number of motifs retrieved  OUT
   MOTIF_T*   motifs                 // The retrieved motifs - NOT ALLOCATED!
) {
	FILE*      motif_file;         // MEME file containing the motifs.
	char motif_name[MAX_MOTIF_ID_LENGTH+1];
	char motif_regexp[MAX_MOTIF_WIDTH];
	ARRAY_T* these_freqs;
	MOTIF_T* m;
	int i;

	//Set things to the defaults.
	*num_motifs = 0;

	// Open the given MEME file.
	if (open_file(filename, "r", TRUE, "motif", "motifs", &motif_file) == 0)
		exit(1);

	//Set alphabet - ONLY supports dna.
	set_alphabet(verbosity, "ACGT");

	while (fscanf(motif_file, "%s\t%s", motif_name, motif_regexp) == 2) {
		/*
		 * Now we:
		 * 1. Fill in new motif (preallocated)
		 * 2. Assign name
		 * 3. Convert regexp into frequency table.
		 */

		m = &(motifs[*num_motifs]);
		set_motif_id(motif_name, m);
		m->length = strlen(motif_regexp);
		/* Store the alphabet size in the motif. */
		m->alph_size = get_alph_size(ALPH_SIZE);
		m->ambigs = get_alph_size(AMBIG_SIZE);
		/* Allocate memory for the matrix. */
		m->freqs = allocate_matrix(m->length, get_alph_size(ALL_SIZE));

		//Set motif frequencies here.
		for (i=0;i<strlen(motif_regexp);i++) {
			switch(toupper(motif_regexp[i])) {
			case 'A':
				set_matrix_cell(i,alphabet_index('A',get_alphabet(TRUE)),1,m->freqs);
				break;
			case 'C':
				set_matrix_cell(i,alphabet_index('C',get_alphabet(TRUE)),1,m->freqs);
				break;
			case 'G':
				set_matrix_cell(i,alphabet_index('G',get_alphabet(TRUE)),1,m->freqs);
				break;
			case 'T':
				set_matrix_cell(i,alphabet_index('T',get_alphabet(TRUE)),1,m->freqs);
				break;
			case 'U':
				set_matrix_cell(i,alphabet_index('U',get_alphabet(TRUE)),1,m->freqs);
				break;
			case 'R': //purines
				set_matrix_cell(i,alphabet_index('G',get_alphabet(TRUE)),1,m->freqs);
				set_matrix_cell(i,alphabet_index('A',get_alphabet(TRUE)),1,m->freqs);
				break;
			case 'Y': //pyramidines
				set_matrix_cell(i,alphabet_index('T',get_alphabet(TRUE)),1,m->freqs);
				set_matrix_cell(i,alphabet_index('C',get_alphabet(TRUE)),1,m->freqs);
				break;
			case 'K': //keto
				set_matrix_cell(i,alphabet_index('G',get_alphabet(TRUE)),1,m->freqs);
				set_matrix_cell(i,alphabet_index('T',get_alphabet(TRUE)),1,m->freqs);
				break;
			case 'M': //amino
				set_matrix_cell(i,alphabet_index('A',get_alphabet(TRUE)),1,m->freqs);
				set_matrix_cell(i,alphabet_index('C',get_alphabet(TRUE)),1,m->freqs);
				break;
			case 'S': //strong
				set_matrix_cell(i,alphabet_index('G',get_alphabet(TRUE)),1,m->freqs);
				set_matrix_cell(i,alphabet_index('C',get_alphabet(TRUE)),1,m->freqs);
				break;
			case 'W': //weak
				set_matrix_cell(i,alphabet_index('A',get_alphabet(TRUE)),1,m->freqs);
				set_matrix_cell(i,alphabet_index('T',get_alphabet(TRUE)),1,m->freqs);
				break;
			case 'B':
				set_matrix_cell(i,alphabet_index('G',get_alphabet(TRUE)),1,m->freqs);
				set_matrix_cell(i,alphabet_index('T',get_alphabet(TRUE)),1,m->freqs);
				set_matrix_cell(i,alphabet_index('C',get_alphabet(TRUE)),1,m->freqs);
				break;
			case 'D':
				set_matrix_cell(i,alphabet_index('G',get_alphabet(TRUE)),1,m->freqs);
				set_matrix_cell(i,alphabet_index('A',get_alphabet(TRUE)),1,m->freqs);
				set_matrix_cell(i,alphabet_index('T',get_alphabet(TRUE)),1,m->freqs);
				break;
			case 'H':
				set_matrix_cell(i,alphabet_index('A',get_alphabet(TRUE)),1,m->freqs);
				set_matrix_cell(i,alphabet_index('C',get_alphabet(TRUE)),1,m->freqs);
				set_matrix_cell(i,alphabet_index('T',get_alphabet(TRUE)),1,m->freqs);
				break;
			case 'V':
				set_matrix_cell(i,alphabet_index('G',get_alphabet(TRUE)),1,m->freqs);
				set_matrix_cell(i,alphabet_index('C',get_alphabet(TRUE)),1,m->freqs);
				set_matrix_cell(i,alphabet_index('A',get_alphabet(TRUE)),1,m->freqs);
				break;
			case 'N':
				set_matrix_cell(i,alphabet_index('A',get_alphabet(TRUE)),1,m->freqs);
				set_matrix_cell(i,alphabet_index('C',get_alphabet(TRUE)),1,m->freqs);
				set_matrix_cell(i,alphabet_index('G',get_alphabet(TRUE)),1,m->freqs);
				set_matrix_cell(i,alphabet_index('T',get_alphabet(TRUE)),1,m->freqs);
				break;
			}
		}

	    /* Compute values for ambiguous characters. */
		for (i = 0; i < m->length; i++) {
		    these_freqs = get_matrix_row(i, m->freqs);
		    fill_in_ambiguous_chars(FALSE, these_freqs);
		}

		/* Compute and store the motif complexity. */
		m->complexity = compute_motif_complexity(m);

		//Move our pointer along to do the next motif.
		(*num_motifs)++;
	}
}