Ejemplo n.º 1
0
/***********************************************************************
 * Turn raw transition counts into transition probabilities, and turn
 * summed spacer lengths into average spacer lengths.
 *
 * Both matrices are square, sized by the row count of transp_freq, and
 * indexed 0 ... n+1 where n is the number of motifs.  Cell [i,j]
 * describes the transition from i to j; index 0 is treated as the start
 * state and the last index as the end state.  After normalization each
 * row of the transition matrix is meant to sum to 1.
 *
 * trans_pseudo  - amount added to every eligible transition count.
 * spacer_pseudo - amount added to every summed spacer length; when it
 *                 is positive, the transition count used for averaging
 *                 is also raised by one.
 * keep_unused   - if true, zero-count transitions receive trans_pseudo
 *                 too; otherwise only observed transitions do.
 * transp_freq   - counts on entry, normalized rows on exit.
 * spacer_ave    - summed spacer lengths on entry, averages on exit.
 ***********************************************************************/
static void normalize_spacer_counts(
  double    trans_pseudo,
  double    spacer_pseudo,    // Pseudocount for self-loop.
  BOOLEAN_T keep_unused,
  MATRIX_T* transp_freq,
  MATRIX_T* spacer_ave
) {
  const int n_states = get_num_rows(transp_freq);
  const int end_state = n_states - 1;
  int from;
  int to;

  // Pass 1: summed spacer length -> average spacer length.  This must
  // run before the counts are altered by the pseudocount pass below.
  for (from = 0; from < n_states; from++) {
    for (to = 0; to < n_states; to++) {
      double spacer_sum = get_matrix_cell(from, to, spacer_ave) + spacer_pseudo;
      double n_trans = get_matrix_cell(from, to, transp_freq);
      if (spacer_pseudo > 0) {
        n_trans++;
      }
      // A cell with no transitions keeps the value it already holds.
      if (n_trans == 0.0) {
        continue;
      }
      set_matrix_cell(from, to, spacer_sum / n_trans, spacer_ave);
    }
  }

  // Pass 2: clear the forbidden transitions, add pseudocounts elsewhere.
  for (from = 0; from < n_states; from++) {
    for (to = 0; to < n_states; to++) {
      const int forbidden =
        (to == 0)                                // nothing enters the start state
        || (from == end_state)                   // nothing leaves the end state
        || ((from == 0) && (to == end_state));   // start may not jump to end
      if (forbidden) {
        set_matrix_cell(from, to, 0.0, transp_freq);
        continue;
      }
      // Unused (zero-count) transitions are only touched on request.
      if (keep_unused || (get_matrix_cell(from, to, transp_freq) > 0.0)) {
        incr_matrix_cell(from, to, trans_pseudo, transp_freq);
      }
    }
  }

  // Pass 3: normalize every row except the end state's; rows whose
  // total is not positive are left alone.
  for (from = 0; from < end_state; from++) {
    if (array_total(get_matrix_row(from, transp_freq)) > 0.0) {
      normalize(SLOP, get_matrix_row(from, transp_freq));
    }
  }
}
Ejemplo n.º 2
0
/**************************************************************************
 * Generate a PAM matrix for the DNA or the protein alphabet.
 *
 * A single-step (PAM 1) conditional matrix is built from the trans[][]
 * (DNA) or dayhoff[][] (protein) table together with the matching
 * standard letter frequencies.  It is then multiplied by itself until it
 * reaches the requested distance and finally converted, cell by cell, to
 * either nint()-rounded log-odds scores or joint (target) frequencies.
 *
 * A dist of 1 or less performs no multiplication, so the PAM 1 matrix is
 * converted as is.
 *
 * Returns the matrix obtained from allocate_matrix(); the scratch copy
 * used for the multiplication is freed before returning.
 **************************************************************************/
extern MATRIX_T* gen_pam_matrix(
  ALPH_T alph,                  /* alphabet; must be DNA_ALPH or PROTEIN_ALPH */
  int dist,			/* PAM distance */
  BOOLEAN_T logodds		/* true: generate log-odds matrix 
				   false: generate target frequency matrix 
				*/
)
{
  assert(alph == DNA_ALPH || alph == PROTEIN_ALPH);
  BOOLEAN_T is_dna = (alph == DNA_ALPH);
  double *letter_freq = is_dna ? pam_dna_freq : pam_prot_freq;	// standard frequencies
  int n_letters = alph_size(alph, ALPH_SIZE);  // length of standard alphabet
  // Log-odds scale factor; same as in "pam" Version 1.0.6.
  double log_scale = (dist < 170 ? 2 : 3) / log(2);
  MATRIX_T *pam;
  MATRIX_T *pam1;
  int row, col, step;

  /* storage for the result and for the constant PAM 1 multiplier */
  pam = allocate_matrix(n_letters, n_letters);
  pam1 = allocate_matrix(n_letters, n_letters);

  /* PAM 1: only the lower triangle is visited; each visit fills the cell
     and its mirror image.  The two table estimates of the joint frequency
     of a letter pair are averaged to cancel roundoff, and the conditionals
     are then derived from that joint value.
  */
  for (row = 0; row < n_letters; row++) {
    for (col = 0; col <= row; col++) {
      double fwd = is_dna ? trans[row][col] : dayhoff[row][col];
      double rev = is_dna ? trans[col][row] : dayhoff[col][row];
      double joint = ((fwd * letter_freq[col]) + (rev * letter_freq[row]))/20000;
      set_matrix_cell(row, col, joint/letter_freq[col], pam);
      if (row != col) {
        set_matrix_cell(col, row, joint/letter_freq[row], pam);
      }
    }
  }

  /* raise the PAM 1 matrix to the requested power: dist-1 multiplications */
  copy_matrix(pam, pam1);
  for (step = 1; step < dist; step++) {
    MATRIX_T *product = matrix_multiply(pam, pam1);
    SWAP(MATRIX_T*, product, pam)
    free_matrix(product);	/* after the swap this is the previous power */
  }
  free_matrix(pam1);

  /* convert to joint or logodds matrix:
     target:  J_ij = Pr(i,j) = Mij pr(j)
     logodds: L_ij = log (Pr(i,j)/(Pr(i)Pr(j)) = log (Mij Pr(j)/Pr(i)Pr(j)) = log(Mij/pr(i))
  */
  for (row = 0; row < n_letters; row++) {
    for (col = 0; col < n_letters; col++) {
      double cell = get_matrix_cell(row, col, pam);
      if (logodds) {
        cell = nint(log_scale * log((cell+EPSILON)/letter_freq[row]));
      } else {
        cell = cell * letter_freq[col];
      }
      set_matrix_cell(row, col, cell, pam);
    }
  }

  return pam;
} /* gen_pam_matrix */
Ejemplo n.º 3
0
/**************************************************************************
 * Build a copy of a square matrix whose rows and columns follow a new
 * alphabet ordering.
 *
 * alpha1    - letters that index in_matrix, in its current order.
 * alpha2    - letters wanted in the result; each must occur in alpha1.
 * in_matrix - matrix to read from; it is only read here.
 *
 * Returns a matrix from allocate_matrix() of size
 * strlen(alpha2) x strlen(alpha2) in which cell [i,j] holds the in_matrix
 * cell for the letter pair (alpha2[i], alpha2[j]).
 *
 * Calls die() if alpha2 is longer than alpha1 or contains a letter that
 * alpha1 lacks.
 **************************************************************************/
MATRIX_T *reorder_matrix(
  const char *alpha1,				/* current alphabet */
  const char *alpha2,				/* new alphabet; must be subset */
  MATRIX_T *in_matrix			/* matrix to reorder */
)
{
  int i, j;
  int alen1 = strlen(alpha1);
  int alen2 = strlen(alpha2);
  MATRIX_T *out_matrix;

  if (alen2 > alen1) {
    die("The new alphabet %s must be a subset of the old alphabet %s.\n", alpha2, alpha1);
  }

  out_matrix = allocate_matrix(alen2, alen2);
  for (i=0; i<alen2; i++) {
    int ii;
    const char *row_hit = strchr(alpha1, alpha2[i]);
    /* Check before subtracting: pointer arithmetic on a null strchr()
       result is undefined behaviour. */
    if (!row_hit) {
      die("The new alphabet %s must be a subset of the old alphabet %s\n", alpha2, alpha1);
    }
    ii = row_hit - alpha1;			/* row index in the old matrix */
    for (j=0; j<alen2; j++) {
      int jj;
      const char *col_hit = strchr(alpha1, alpha2[j]);
      if (!col_hit) {
        die("The new alphabet %s must be a subset of the old alphabet %s\n", alpha2, alpha1);
      }
      jj = col_hit - alpha1;			/* column index in the old matrix */
      set_matrix_cell(i, j, get_matrix_cell(ii, jj, in_matrix), out_matrix);
    }
  }
  return(out_matrix);
} /* reorder_matrix */
Ejemplo n.º 4
0
/**************************************************************************
*
	hash_pssm_matrix_pos

	Recursively fill in the entries of one position of a hashed PSSM.

	Each recursion level consumes one column of the original PSSM and
	tries every letter plus one extra "blank" symbol.  A blank, or any
	column past the right edge of the motif, adds nothing to the
	score.  Once n columns have been consumed, the accumulated score
	is stored in row hashed_pos at the accumulated index, which is
	the chosen symbols read as a base-(alen+1) number.

*
**************************************************************************/
static void hash_pssm_matrix_pos(
  MATRIX_T *pssm, 		// pssm to hash
  MATRIX_T *hashed_pssm, 	// hashed pssm
  int  pos,			// position in pssm
  int  hashed_pos,		// position in hashed pssm
  int  n,			// number of columns to hash together
  double score,			// cumulative score; call with 0
  int index			// cumulative index; call with 0
)
{
  const int n_letters = get_num_cols(pssm);	// alphabet length
  const int width = get_num_rows(pssm);		// pssm width
  int symbol;

  // Nothing left to combine: record the finished entry and stop.
  if (n == 0) {
    set_matrix_cell(hashed_pos, index, score, hashed_pssm);
    return;
  }

  // Extend by one column; symbol == n_letters stands for the blank.
  for (symbol = 0; symbol <= n_letters; symbol++) {
    double gain = 0;
    if (pos < width && symbol != n_letters) {
      gain = get_matrix_cell(pos, symbol, pssm);
    }
    hash_pssm_matrix_pos(
      pssm,
      hashed_pssm,
      pos + 1,					// next position in old pssm
      hashed_pos,				// position working on
      n - 1,					// positions remaining to hash
      score + gain,				// score so far
      index * (n_letters + 1) + symbol);	// hashed alphabet index so far
  }
} // hash_pssm_matrix_pos
Ejemplo n.º 5
0
Archivo: motif.c Proyecto: CPFL/gmeme
/***********************************************************************
 * Blend a pseudocount into the frequency matrix of a motif, in place.
 *
 * Every frequency is turned back into a count using the motif's site
 * count (DEFAULT_SITE_COUNT when the motif has none), the pseudocount
 * weighted by the background frequency of the letter is added, and the
 * sum is divided by (sites + pseudocount).
 *
 * motif       - motif whose freqs matrix is updated.
 * background  - background letter frequencies; NULL selects a uniform
 *               background that is created and released locally.
 * pseudocount - total pseudocount; 0 leaves the motif untouched and a
 *               negative value trips the assertion.
 ***********************************************************************/
void apply_pseudocount_to_motif
  (MOTIF_T* motif, ARRAY_T *background, double pseudocount)
{
  ARRAY_T *uniform_bg = NULL;
  int n_letters, width, n_sites;
  int row, col;
  double denom;

  // A zero pseudocount changes nothing, so skip the work entirely.
  if (pseudocount == 0) {
    return;
  }
  assert(pseudocount > 0);

  n_letters = alph_size(motif->alph, ALPH_SIZE);
  width = motif->length;

  // Fall back to a uniform background when the caller supplies none.
  if (background == NULL) {
    uniform_bg = allocate_array(n_letters);
    get_uniform_frequencies(motif->alph, uniform_bg);
    background = uniform_bg;
  }

  if (motif->num_sites > 0) {
    n_sites = motif->num_sites;
  } else {
    n_sites = DEFAULT_SITE_COUNT;
  }
  denom = n_sites + pseudocount;

  for (row = 0; row < width; ++row) {
    for (col = 0; col < n_letters; ++col) {
      double old_prob = get_matrix_cell(row, col, motif->freqs);
      double new_count =
        (old_prob * n_sites) + (pseudocount * get_array_item(col, background));
      set_matrix_cell(row, col, new_count / denom, motif->freqs);
    }
  }

  // Only the locally created background belongs to this function.
  if (uniform_bg != NULL) {
    free_array(uniform_bg);
  }
}
Ejemplo n.º 6
0
/**************************************************************************
*	scale_pssm
*
*	Scale and round the scores in a PSSM, in place, so that the score
*	of a word is in the range [0..w*range].
*
*	The smallest and largest raw scores (widened by the extreme prior
*	log-odds when a prior distribution is given) determine an integer
*	offset and an integer scale.  Every cell is rewritten through
*	raw_to_scaled(), and the scale, offset and range are recorded in
*	the PSSM.  Nothing is returned.
*
**************************************************************************/
void scale_pssm(
  PSSM_T *pssm,		          // The PSSM. (IN/OUT)
  PRIOR_DIST_T *prior_dist, // Distribution of priors (IN); may be NULL
  double alpha,             // Fraction of all TFBS that are the TFBS of interest
  int range 			          // The desired range. (IN) 
)
{
  MATRIX_T* matrix = pssm->matrix;
  int n_rows = pssm->w;
  int n_cols = pssm->alphsize;
  int row, col;
  double lowest = BIG;
  double highest = -BIG;
  double scale, offset;

  // Scan the PSSM for its extreme scores.
  for (row = 0; row < n_rows; row++) {
    for (col = 0; col < n_cols; col++) {
      double raw = get_matrix_cell(row, col, matrix);
      lowest = MIN(lowest, raw);
      highest = MAX(highest, raw);
    }
  }

  // The prior log-odds, when present, can widen the interval further.
  if (prior_dist != NULL) {
    double prior_low = get_min_lo_prior(prior_dist, alpha);
    double prior_high = get_max_lo_prior(prior_dist, alpha);
    lowest = MIN(lowest, prior_low);
    highest = MAX(highest, prior_high);
  }

  // Choose offset and scale so that word scores land in [0..w*range].
  // For LO=0 to map back to 0, offset*scale has to be an integer, so
  // both are made integers. (TLB 31 May 2013)
  if (highest == lowest) {
    // All entries are identical: open up a unit-wide interval.
    lowest = highest - 1;
  }
  lowest = floor(lowest);
  offset = lowest;
  // Rounding the scale down keeps scaled scores <= range.
  scale = floor(range/(highest-lowest));

  // Rewrite every entry as its scaled, rounded value.
  for (row = 0; row < n_rows; row++) {
    for (col = 0; col < n_cols; col++) {
      double scaled = raw_to_scaled(get_matrix_cell(row, col, matrix), 1, scale, offset);
      set_matrix_cell(row, col, scaled, matrix);
    }
  }

  // Record how the scores were transformed.
  pssm->scale = scale;
  pssm->offset = offset;
  pssm->range = range;

} // scale_pssm
Ejemplo n.º 7
0
/*************************************************************************
 * Rebuild the transition matrix of an HMM from the outgoing-transition
 * lists stored in its states.
 *
 * The matrix is cleared first.  Then, for every state i and each of its
 * outgoing transitions k, cell [i, itrans_out[k]] receives the k-th
 * entry of that state's trans_out array.  Assertions reject a
 * transition into state 0 and check the finished matrix with
 * verify_trans_matrix().
 *************************************************************************/
static void build_transition_matrix
  (MHMM_T *the_hmm)
{
  int            from;     /* Row index: the state being left. */
  int            to;       /* Column index: the state being entered. */
  int            k;        /* Position in a state's outgoing list. */

  check_sq_matrix(the_hmm->trans, the_hmm->num_states);

  /* Start from an all-zero matrix. */
  for (from = 0; from < the_hmm->num_states; from++) {
    for (to = 0; to < the_hmm->num_states; to++) {
      set_matrix_cell(from, to, 0.0, the_hmm->trans);
    }
  }

  /* Copy each state's outgoing transitions into its matrix row. */
  for (from = 0; from < the_hmm->num_states; from++) {
    MHMM_STATE_T * source = &(the_hmm->states[from]);
    int n_out = source->ntrans_out;

    for (k = 0; k < n_out; k++) {
      /* The target state is looked up through the index list. */
      to = source->itrans_out[k];
      assert(to != 0);

      set_matrix_cell(from, to,
		      get_array_item(k, source->trans_out),
		      the_hmm->trans);
    }
  }
  assert(verify_trans_matrix(FALSE, the_hmm->num_states, the_hmm->trans));
}
Ejemplo n.º 8
0
Archivo: motif.c Proyecto: CPFL/gmeme
/***********************************************************************
 * Convert a matrix of MEME scores into a matrix of letter
 * probabilities.
 *
 * A score s for a letter with background frequency bg is first mapped
 * to a probability with
 *   p = (2 ^ (s / 100)) * bg
 * The pseudocount contribution is then taken back out,
 *   p = (p * (site_count + pseudo_count) - bg * pseudo_count) / site_count
 * the result is clamped to [0, 1], and each row is passed through
 * normalize_subarray().
 *
 * Returns a matrix from allocate_matrix() with one row per row of
 * scores and one column per letter of the alphabet.
 ***********************************************************************/
MATRIX_T* convert_scores_into_freqs
  (ALPH_T alph,
   MATRIX_T *scores,
   ARRAY_T *bg,
   int site_count,
   double pseudo_count)
{
  MATRIX_T *freqs;
  int n_rows, n_letters;
  int r, c;
  double all_counts;

  assert(alph != INVALID_ALPH);
  assert(scores != NULL);
  assert(bg != NULL);

  n_rows = get_num_rows(scores);
  n_letters = alph_size(alph, ALPH_SIZE);

  freqs = allocate_matrix(n_rows, n_letters);
  all_counts = site_count + pseudo_count;

  for (r = 0; r < n_rows; ++r) {
    for (c = 0; c < n_letters; ++c) {
      double bg_c = get_array_item(c, bg);
      double s = get_matrix_cell(r, c, scores);
      // score -> probability
      double p = pow(2.0, s / 100.0) * bg_c;
      // take the pseudo count back out
      p = ((p * all_counts) - (bg_c * pseudo_count)) / site_count;
      // keep the value inside [0, 1]
      if (p < 0) {
        p = 0;
      } else if (p > 1) {
        p = 1;
      }
      set_matrix_cell(r, c, p, freqs);
    }
    // The row is complete, so it can be normalized right away.
    normalize_subarray(0, n_letters, 0.0, get_matrix_row(r, freqs));
  }

  return freqs;
}
Ejemplo n.º 9
0
/***********************************************************************
 * Converts a TRANSFAC motif to a MEME motif.
 *
 * The counts of every motif position are turned into frequencies with
 *   freq = (count + pseudocount * bg[base]) / (row_total + pseudocount)
 * where row_total is the sum of the counts at that position, so
 * positions with different totals are handled independently.
 *
 * id          - identifier handed to allocate_motif() and used in the
 *               error message.
 * pseudocount - total pseudocount, spread according to bg.
 * bg          - background frequency of each base.
 * motif       - TRANSFAC motif supplying the counts matrix.
 *
 * Calls die() when the TRANSFAC motif has no counts matrix.
 * Caller is responsible for freeing the returned MOTIF_T.
 ***********************************************************************/
MOTIF_T *convert_transfac_motif_to_meme_motif(
    char *id,
    int pseudocount,
    ARRAY_T *bg,
    TRANSFAC_MOTIF_T *motif
) {
    // NOTE(review): counts and freqs are not released in this function;
    // confirm whether get_transfac_counts() returns a copy and whether
    // allocate_motif() takes ownership of freqs.
    MATRIX_T *counts = get_transfac_counts(motif);
    if (counts == NULL) {
        die(
            "Unable to convert TRANSFAC motif %s to MEME motif: "
            "missing counts matrix.",
            id
        );
    }

    // Convert the motif counts to frequencies.
    int num_bases = get_num_cols(counts);
    int motif_width = get_num_rows(counts);
    int motif_position = 0;
    MATRIX_T *freqs = allocate_matrix(motif_width, num_bases);
    for (motif_position = 0; motif_position < motif_width; ++motif_position) {
        int i_base = 0;
        // Motif columns may have different counts.  The total is kept as
        // a double: an int accumulator would truncate on every addition
        // and non-integral counts would then fail to normalize.
        double num_seqs = 0.0;
        for (i_base = 0; i_base < num_bases; i_base++) {
            num_seqs += get_matrix_cell(motif_position, i_base, counts);
        }
        for (i_base = 0; i_base < num_bases; i_base++) {
            double freq =
                (get_matrix_cell(motif_position, i_base, counts)
                 + (pseudocount * get_array_item(i_base, bg))) / (num_seqs + pseudocount);
            set_matrix_cell(motif_position, i_base, freq, freqs);
        }
    }

    MOTIF_T *meme_motif = allocate_motif(id, DNA_ALPH, NULL, freqs);
    calc_motif_ambigs(meme_motif);
    return meme_motif;
}
Ejemplo n.º 10
0
/**************************************************************************
 * Convert a score matrix into a matrix of target frequencies.
 *
 * karlin() is run on input prepared by make_karlin_input() to obtain
 * lambda.  Each target cell is then
 *   f_ij = p_i * p_j * exp(lambda * s_ij)
 * with p taken from prob and s from score.
 *
 * Returns a matrix from allocate_matrix() with as many rows and columns
 * as score has rows.  The karlin input is released before returning.
 **************************************************************************/
MATRIX_T *convert_score_to_target(
  MATRIX_T *score,			/* score matrix */
  ARRAY_T *prob				/* letter frequencies */
)
{
  const int n_letters = get_num_rows(score);	/* alphabet length */
  KARLIN_INPUT_T *kin;
  MATRIX_T *target;			/* target freq. matrix */
  double lambda, K, H;			/* only lambda is used afterwards */
  int row, col;

  /* prepare the input and let karlin() compute lambda */
  kin = make_karlin_input(score, prob);
  karlin(kin->low, kin->high, kin->prob->items, &lambda, &K, &H);

  /* fill in the target frequencies */
  target = allocate_matrix(n_letters, n_letters);
  for (row = 0; row < n_letters; row++) {
    /* the row letter's frequency is the same for the whole row */
    const double p_row = get_array_item(row, prob);
    for (col = 0; col < n_letters; col++) {
      const double p_col = get_array_item(col, prob);
      const double s = get_matrix_cell(row, col, score);
      set_matrix_cell(row, col, p_row * p_col * exp(lambda * s), target);
    }
  }

  /* release the karlin input: its probability array first, then the struct */
  free_array(kin->prob);
  myfree(kin);

  return(target);
} /* convert_score_to_target */
Ejemplo n.º 11
0
Archivo: motif.c Proyecto: CPFL/gmeme
/***********************************************************************
 * Convert a matrix of letter probabilities into a matrix of MEME
 * scores.
 *
 * Each probability p is first smoothed with the pseudocount,
 *   p = (pseudo_count * bg + p * site_count) / (site_count + pseudo_count)
 * and, provided it is positive, scored as
 *   s = log2(p / bg) * 100
 * A non-positive smoothed probability is replaced by 0.0000005 before
 * scoring.
 *
 * Returns a matrix from allocate_matrix() with one row per row of freqs
 * and one column per letter of the alphabet.
 ***********************************************************************/
MATRIX_T* convert_freqs_into_scores
  (ALPH_T alph,
   MATRIX_T *freqs,
   ARRAY_T *bg,
   int site_count,
   double pseudo_count) 
{
  MATRIX_T *scores;
  int n_rows, n_letters;
  int r, c;
  double all_counts;

  assert(alph != INVALID_ALPH);
  assert(freqs != NULL);
  assert(bg != NULL);

  n_rows = get_num_rows(freqs);
  n_letters = alph_size(alph, ALPH_SIZE);

  scores = allocate_matrix(n_rows, n_letters);
  all_counts = site_count + pseudo_count;

  for (r = 0; r < n_rows; ++r) {
    for (c = 0; c < n_letters; ++c) {
      double bg_c = get_array_item(c, bg);
      double p = get_matrix_cell(r, c, freqs);
      // smooth with the pseudo count
      p = ((pseudo_count * bg_c) + (p * site_count)) / all_counts;
      // with a sensible background this guard should never fire
      if (p <= 0) {
        p = 0.0000005;
      }
      // log2 of the ratio to background, times 100
      set_matrix_cell(r, c, (log(p / bg_c) / log(2)) * 100, scores);
    }
  }
  return scores;
}
/*****************************************************************************
 * MEME > motifs > motif > probabilities > alphabet_matrix > alphabet_array > /value
 * Store one probability in the frequency matrix of the motif being read.
 *
 * The letter identifier is resolved through the letter lookup table and
 * its symbol is converted to an alphabet index.  The value is stored at
 * the current motif position only if that cell still holds -1, the
 * value this function treats as "not yet set".
 *
 * An unknown identifier, a symbol without an alphabet index, or a cell
 * that was already filled is reported with local_error() and the matrix
 * is left unchanged.
 ****************************************************************************/
void mxml_probability_value(void *ctx, char *letter_id, double probability) {
  CTX_T *data = (CTX_T*)ctx;
  MATRIX_T *freqs = data->mscope.motif->freqs;
  char *symbol;
  int index;

  // Resolve the identifier to its symbol.
  symbol = (char*)rbtree_get(data->letter_lookup, letter_id);
  if (symbol == NULL) {
    local_error(data, "Probability is not allowed for unknown letter identifier \"%s\".\n", letter_id);
    return;
  }

  // Only letters with an alphabet index may carry a probability, and
  // each cell may be written just once.
  index = alph_indexc(data->alph, symbol[0]);
  if (index < 0) {
    local_error(data, "Probability is not allowed for non-core letter %c.\n", symbol[0]);
  } else if (get_matrix_cell(data->current_pos, index, freqs) != -1) {
    local_error(data, "Probability for letter %c in position %d has already been set.\n", symbol[0], data->current_pos + 1);
  } else {
    set_matrix_cell(data->current_pos, index, probability, freqs);
  }
}
Ejemplo n.º 13
0
/*
 * Map one upper-case regular-expression letter to the bases it stands
 * for.  Plain bases map to themselves, IUPAC ambiguity codes map to all
 * of their member bases, and any other character maps to the empty
 * string, so it contributes nothing.
 */
static const char *regexp_letter_bases(int letter) {
	switch (letter) {
	case 'A': return "A";
	case 'C': return "C";
	case 'G': return "G";
	case 'T': return "T";
	case 'U': return "U";
	case 'R': return "GA";   // purines
	case 'Y': return "TC";   // pyrimidines
	case 'K': return "GT";   // keto
	case 'M': return "AC";   // amino
	case 'S': return "GC";   // strong
	case 'W': return "AT";   // weak
	case 'B': return "GTC";
	case 'D': return "GAT";
	case 'H': return "ACT";
	case 'V': return "GCA";
	case 'N': return "ACGT";
	default:  return "";
	}
}

/*
 * Read motifs given as "name<whitespace>regexp" pairs and turn every
 * regexp into a frequency matrix: for each regexp position, the cell of
 * every base that the letter at that position allows is set to 1.
 *
 * Only DNA is supported; the alphabet is set to "ACGT".  Reading stops
 * at the first pair that cannot be parsed or at end of file, and the
 * file is closed before returning.  Tokens longer than the local
 * buffers are cut at the buffer size rather than overflowing them; the
 * remainder is then read as the next token.
 *
 * Calls exit(1) if the file cannot be opened.
 *
 * NOTE(review): cells that are not set to 1 are never written here, so
 * this relies on allocate_matrix() returning a zeroed matrix - confirm.
 * NOTE(review): the capacity of the motifs array is not known here, so
 * the caller must provide room for every motif in the file.
 */
void read_regexp_file(
   char*      filename,          // Name of the regexp motif file  IN
   int*       num_motifs,             // Number of motifs retrieved  OUT
   MOTIF_T*   motifs                 // The retrieved motifs - NOT ALLOCATED!
) {
	FILE*      motif_file;         // File containing the motifs.
	char motif_name[MAX_MOTIF_ID_LENGTH+1];
	char motif_regexp[MAX_MOTIF_WIDTH];
	char pair_format[64];          // fscanf format with explicit field widths
	ARRAY_T* these_freqs;
	MOTIF_T* m;
	const char* base;
	int regexp_len;
	int i;

	//Set things to the defaults.
	*num_motifs = 0;

	// Open the given file.
	if (open_file(filename, "r", TRUE, "motif", "motifs", &motif_file) == 0)
		exit(1);

	//Set alphabet - ONLY supports dna.
	set_alphabet(verbosity, "ACGT");

	// A bare %s can write past the end of its buffer, so each field is
	// limited to the buffer size minus the terminating NUL.
	snprintf(pair_format, sizeof pair_format, "%%%ds\t%%%ds",
	         (int)(sizeof motif_name - 1), (int)(sizeof motif_regexp - 1));

	while (fscanf(motif_file, pair_format, motif_name, motif_regexp) == 2) {
		/*
		 * Now we:
		 * 1. Fill in new motif (preallocated)
		 * 2. Assign name
		 * 3. Convert regexp into frequency table.
		 */

		m = &(motifs[*num_motifs]);
		set_motif_id(motif_name, m);
		regexp_len = strlen(motif_regexp);
		m->length = regexp_len;
		/* Store the alphabet size in the motif. */
		m->alph_size = get_alph_size(ALPH_SIZE);
		m->ambigs = get_alph_size(AMBIG_SIZE);
		/* Allocate memory for the matrix. */
		m->freqs = allocate_matrix(m->length, get_alph_size(ALL_SIZE));

		//Set motif frequencies here.
		for (i = 0; i < regexp_len; i++) {
			// toupper() needs a value representable as unsigned char.
			base = regexp_letter_bases(toupper((unsigned char)motif_regexp[i]));
			for (; *base != '\0'; base++) {
				set_matrix_cell(i,alphabet_index(*base,get_alphabet(TRUE)),1,m->freqs);
			}
		}

		/* Compute values for ambiguous characters. */
		for (i = 0; i < m->length; i++) {
			these_freqs = get_matrix_row(i, m->freqs);
			fill_in_ambiguous_chars(FALSE, these_freqs);
		}

		/* Compute and store the motif complexity. */
		m->complexity = compute_motif_complexity(m);

		//Move our pointer along to do the next motif.
		(*num_motifs)++;
	}

	// Release the file handle opened above.
	fclose(motif_file);
}
Ejemplo n.º 14
0
/***********************************************************************
 * Read TRANSFAC motifs from a TRANSFAC file.
 *
 * A record starts at an "AC" line.  The header lines up to the
 * "PO"/"P0" line are scanned for the ID, NA and DE text and for BF
 * (species) entries.  The count rows that follow are read twice: once
 * to size the counts matrix and, after rewinding with fsetpos(), once
 * more to fill it, so the stream has to support repositioning.
 *
 * Calls exit(1) if the file cannot be opened.  Calls die() when the PO
 * line is missing, when repositioning fails, when a count row has a row
 * number outside 1..number-of-rows or fewer than four counts, or when a
 * motif has more rows than the consensus buffer can hold.
 *
 * Returns an arraylist of pointers to TRANSFAC_MOTIF_T
 ***********************************************************************/
ARRAYLST_T *read_motifs_from_transfac_file (
    const char* transfac_filename  // Name of TRANSFAC file or '-' for stdin IN
) {

    // Create dynamic storage for motifs
    ARRAYLST_T *motif_list = arraylst_create();

    // Open the TRANSFAC file for reading.
    FILE *transfac_file = NULL;
    if (open_file(
                transfac_filename,
                "r",
                TRUE, // Allow '-' for stdin
                "transfac file",
                "",
                &transfac_file
            ) == FALSE) {
        exit(1);
    }

    // Read and parse the TRANSFAC file.
    int num_bases = 4;
    char *line = NULL;
    while ((line = getline2(transfac_file)) != NULL) {

        // Split the line into an initial tag and everything else.
        char *this_accession = split(line, ' ');
        char *tag = line;

        // Only an AC line starts a new matrix; skip every other line.
        if (strcmp(tag, "AC") != 0) {
            continue;
        }

        // NOTE(review): this_accession presumably points into the buffer
        // returned by getline2() and is still used after later getline2()
        // calls; confirm that each call hands out a separate buffer.
        trim(this_accession);

        char *this_id = NULL;
        char *this_name = NULL;
        char *this_descr = NULL;
        char this_consensus[MAX_CONSENSUS_LENGTH];
        STRING_LIST_T *species_list = new_string_list();

        // Old versions of TRANSFAC use pee-zero; new use pee-oh.
        while (strcmp(tag, "PO") != 0 && strcmp(tag, "P0") != 0) {

            line = getline2(transfac_file);
            if (line == NULL) {
                die ("Can't find PO line for TRANSFAC matrix %s.\n", this_accession);
            }
            char *data = split(line, ' ');
            tag = line;

            // A tag-only line carries nothing to store, and passing a
            // NULL string on to strdup() would be undefined behaviour.
            if (data == NULL) {
                continue;
            }
            trim(data);

            // Store the id line.
            if (strcmp(tag, "ID") == 0) {
                this_id = strdup(data);
            }
            // Store the species line.
            else if (strcmp(tag, "BF") == 0) {
                add_string(data, species_list);
            }
            // Store the name line.
            else if (strcmp(tag, "NA") == 0) {
                this_name = strdup(data);
            }
            // Store the description line.
            else if (strcmp(tag, "DE") == 0) {
                this_descr = strdup(data);
            }
        }

        // Check how many positions in the motif
        // Mark current position in file
        fpos_t file_position;
        errno = 0;
        int status = fgetpos(transfac_file, &file_position);
        if (status) {
            die("Error reading file %s: %s", transfac_filename, strerror(errno));
        }

        int num_motif_positions = 0;
        while (TRUE) {

            // Read till we reach the end of the counts or the end of the motif
            line = getline2(transfac_file);
            if (line == NULL) {
                break;
            }

            char *data = split(line, ' ');
            if (data != NULL) {
                trim(data);
            }
            tag = line;

            // Read till we reach the end of the counts or the end of the motif
            if ((strcmp(tag, "XX\n") == 0) || (strcmp(tag, "//\n") == 0)) {
                break;
            }

            ++num_motif_positions;
        }
        // Rewind file
        errno = 0;
        status = fsetpos(transfac_file, &file_position);
        if (status) {
            die("Error reading file %s: %s", transfac_filename, strerror(errno));
        }

        // The consensus needs one character per position plus the
        // terminating NUL; refuse motifs that would overrun the buffer.
        if (num_motif_positions >= (int)sizeof this_consensus) {
            die(
                "Motif %s in file %s has %d positions; at most %d are supported.",
                this_accession, transfac_filename,
                num_motif_positions, (int)sizeof this_consensus - 1
            );
        }
        // Clear every slot that may be used, so the consensus is always
        // terminated even if rows are missing or out of order.
        int i_pos;
        for (i_pos = 0; i_pos <= num_motif_positions; i_pos++) {
            this_consensus[i_pos] = 0;
        }

        // Read the motif counts.
        MATRIX_T *motif_counts = allocate_matrix(num_motif_positions, num_bases);
        int position = 0;
        while (TRUE) {

            line = getline2(transfac_file);
            if (line == NULL) {
                break;
            }

            char *data = split(line, ' ');
            if (data != NULL) {
                trim(data);
            }
            tag = line;

            // Look for the end of the motif.
            if ((strcmp(tag, "XX\n") == 0) || (strcmp(tag, "//\n") == 0)) {
                break;
            }

            // Row numbers are 1-based.  atoi() yields 0 for a tag that is
            // not a number, which would address row -1 below.
            position = atoi(tag);
            if (position < 1 || position > num_motif_positions) {
                die(
                    "Error reading motif counts at position %d of motif %s in file %s",
                    position,  this_accession, transfac_filename
                );
            }

            // Store the contents of this row.  All four counts are
            // required; a row without a consensus letter is recorded
            // as 'N' instead of an indeterminate character.
            int count[4];
            char consensus = 'N';
            if (data == NULL
                || sscanf(
                       data,
                       "%d %d %d %d %c",
                       &(count[0]),
                       &(count[1]),
                       &(count[2]),
                       &(count[3]),
                       &consensus
                   ) < num_bases) {
                die(
                    "Error reading motif counts at position %d of motif %s in file %s",
                    position,  this_accession, transfac_filename
                );
            }
            int i_base;
            for (i_base = 0; i_base < num_bases; i_base++) {
                set_matrix_cell(position - 1, i_base, count[i_base], motif_counts);
            }
            this_consensus[position - 1] = consensus;

        }

        TRANSFAC_MOTIF_T *motif = new_transfac_motif(
                                      this_accession,
                                      this_id,
                                      this_name,
                                      this_descr,
                                      this_consensus,
                                      species_list,
                                      motif_counts
                                  );
        arraylst_add(motif, motif_list);
    }

    fclose(transfac_file);
    return motif_list;

}