Exemple #1
0
/***********************************************************************
 * Turn summed spacer lengths into average spacer lengths and turn
 * transition counts into transition probabilities.
 *
 * Both matrices are treated as square, with the dimension reported by
 * get_num_rows(transp_freq).  Cell [i,j] describes the transition from
 * state i to state j.  Column 0 (start state), the last row (end state)
 * and the start-to-end cell are forced to zero; every row except the
 * last whose total is positive is then normalized.
 ***********************************************************************/
static void normalize_spacer_counts(
  double    trans_pseudo,
  double    spacer_pseudo,    // Pseudocount for self-loop.
  BOOLEAN_T keep_unused,
  MATRIX_T* transp_freq,
  MATRIX_T* spacer_ave
) {
  int from;
  int to;
  int dim = get_num_rows(transp_freq);
  int last = dim - 1;

  // Pass 1: average spacer length = summed length / number of transitions.
  for (from = 0; from < dim; from++) {
    for (to = 0; to < dim; to++) {
      double length_sum = get_matrix_cell(from, to, spacer_ave) + spacer_pseudo;
      double occurrences = get_matrix_cell(from, to, transp_freq);
      // A spacer pseudocount counts as one extra observed transition.
      if (spacer_pseudo > 0) occurrences++;
      // Cells with no transitions keep their existing value.
      if (occurrences == 0.0) continue;
      set_matrix_cell(from, to, length_sum / occurrences, spacer_ave);
    }
  }

  // Pass 2: zero the forbidden transitions, add pseudocounts to the rest.
  for (from = 0; from < dim; from++) {
    for (to = 0; to < dim; to++) {
      int into_start = (to == 0);
      int out_of_end = (from == last);
      int start_to_end = ((from == 0) && (to == last));

      if (into_start || out_of_end || start_to_end) {
        set_matrix_cell(from, to, 0.0, transp_freq);
        continue;
      }
      // Unused transitions only receive a pseudocount when requested.
      if ((keep_unused) ||
          (get_matrix_cell(from, to, transp_freq) > 0.0)) {
        incr_matrix_cell(from, to, trans_pseudo, transp_freq);
      }
    }
  }

  // Pass 3: normalize every row but the last, skipping all-zero rows.
  for (from = 0; from < last; from++) {
    if (!(array_total(get_matrix_row(from, transp_freq)) > 0.0)) continue;
    normalize(SLOP, get_matrix_row(from, transp_freq));
  }
}
Exemple #2
0
/**************************************************************************
*	scale_pssm
*
*	Scale and round the scores in a PSSM, in place, so that the score
*	of a word is in the range [0..w*range].
*
*	The PSSM matrix is overwritten with the scaled scores, and the
*	scale, offset and range that were used are stored in the PSSM.
*
**************************************************************************/
void scale_pssm(
  PSSM_T *pssm,		          // The PSSM. (IN/OUT)
  PRIOR_DIST_T *prior_dist, // Distribution of priors (IN)
  double alpha,             // Fraction of all TFBS that are the TFBS of interest
  int range 			          // The desired range. (IN) 
)
{
  MATRIX_T* scores = pssm->matrix;
  int width = pssm->w;
  int alen = pssm->alphsize;
  double lowest = BIG;
  double highest = -BIG;
  double scale, offset;
  int row, col;

  // Find the extreme scores over the whole PSSM.
  for (row = 0; row < width; row++) {
    for (col = 0; col < alen; col++) {
      double cell = get_matrix_cell(row, col, scores);
      lowest = MIN(lowest, cell);
      highest = MAX(highest, cell);
    }
  }

  // When priors are in use, widen the interval to cover the extreme
  // prior log-odds as well.
  if (prior_dist != NULL) {
    double min_lo_prior = get_min_lo_prior(prior_dist, alpha);
    double max_lo_prior = get_max_lo_prior(prior_dist, alpha);
    lowest = MIN(lowest, min_lo_prior);
    highest = MAX(highest, max_lo_prior);
  }

  // Choose offset and scale so that word scores land in [0..w*range].
  // For LO=0 to map back to 0, offset*scale must be an integer, so both
  // are made integers. (TLB 31 May 2013)
  if (highest == lowest) {
    lowest = highest - 1;			// All entries identical: avoid a zero interval.
  }
  lowest = floor(lowest);
  offset = lowest;				// Integer offset.
  scale = floor(range/(highest-lowest));	// Keeps scaled scores <= range.

  // Replace every entry by its scaled, rounded value.
  for (row = 0; row < width; row++) {
    for (col = 0; col < alen; col++) {
      double scaled = raw_to_scaled(get_matrix_cell(row, col, scores), 1, scale, offset);
      set_matrix_cell(row, col, scaled, scores);
    }
  }

  // Record the parameters so scores can be converted back later.
  pssm->scale = scale;
  pssm->offset = offset;
  pssm->range = range;

} // scale_pssm
Exemple #3
0
/**************************************************************************
 * make_karlin_input
 *
 * Build a KARLIN_INPUT_T from a square scoring matrix and a letter
 * frequency distribution.  The record holds the lowest and highest
 * score, the expected score, and an array giving the probability of each
 * score value (indexed by score - lowest).
 *
 * Scores are handled as integers: a non-integral matrix entry is
 * truncated when it is converted.  The function dies unless the lowest
 * score is negative and the highest score is positive.
 *
 * The returned record and its prob array are allocated here.
 **************************************************************************/
KARLIN_INPUT_T *make_karlin_input(
  MATRIX_T *matrix,			/* scoring matrix */
  ARRAY_T *probs			/* letter freq distribution */
)
{
  int i, j;
  double escore;
  long lowest, highest;
  ARRAY_T *score_probs; 
  int nscores;
  int alen = get_num_rows(matrix);	/* size of alphabet */
  KARLIN_INPUT_T *karlin_input;		/* data to return */

  /*  find the highest and lowest scores in the scoring matrix */
  /* The initial values make the sign checks below fail for a matrix
     that has no negative (or no positive) entry. */
  lowest = 1;
  highest = -1;
  for (i=0; i<alen; i++) {
    for (j=0; j<alen; j++) {
      double s = get_matrix_cell(i, j, matrix);
      if (s < lowest) lowest = s;
      if (s > highest) highest = s;
    }
  }
  if (lowest >= 0) die("Lowest score in scoring matrix must be negative, is %f.", (double)lowest);
  if (highest<= 0) die("Highest score in scoring matrix must be positive, is %f.", (double)highest);

  /* allocate the array of score probabilities and set to 0 */
  nscores = highest - lowest + 1;
  score_probs = allocate_array(nscores);
  init_array(0, score_probs);
  
  /* compute the probabilities of different scores */ 
  escore = 0;
  for (i=0; i<alen; i++) {
    for (j=0; j<alen; j++) {
      int s = get_matrix_cell(i, j, matrix);	/* truncated to an integer score */
      double pi = get_array_item(i, probs);
      double pj = get_array_item(j, probs);
      double sp = get_array_item(s-lowest, score_probs); 
      set_array_item(s-lowest, sp + pi*pj, score_probs);	/* cumulative prob. of score */
      escore += pi*pj*s;
    }
  }

  karlin_input = (KARLIN_INPUT_T *)mm_malloc(sizeof(KARLIN_INPUT_T));
  karlin_input->low = lowest;
  karlin_input->high = highest;
  karlin_input->escore = escore;
  karlin_input->prob = score_probs;

  return(karlin_input);
} /* make_karlin_input */
Exemple #4
0
/***********************************************************************
 * Sanity-check a transition matrix.
 *
 * For every state except the last one, the row total must be one of the
 * accepted values (see the inline comments), and the transition from the
 * last (end) state to that state must be zero.  The first violation is
 * reported on stderr and FALSE is returned; otherwise TRUE is returned.
 ***********************************************************************/
BOOLEAN_T verify_trans_matrix
  (BOOLEAN_T log_form,    /* Is the transition matrix in log form? */
   int       num_states,  /* Number of states in the (square) matrix. */
   MATRIX_T* trans)       /* The matrix. */
{
  int    end_state = num_states - 1;
  int    state;
  PROB_T row_sum;

  for (state = 0; state < end_state; state++) {
    int leaves_end;

    /* Cf. Rabiner, formula (43b), p. 265. */
    if (log_form) {
      row_sum = log_array_total(get_matrix_row(state, trans));
      if (!(almost_equal(row_sum, 0.0, SLOP) ||
            almost_equal(row_sum, 1.0, SLOP) ||        // Allow for FIMS.
            almost_equal(EXP2(row_sum), 0.0, SLOP))) {
        fprintf(stderr,
                "Warning: Row %d of transition matrix differs from 0.0 by %g.\n",
                state, EXP2(row_sum));
        return(FALSE);
      }
    } else {
      row_sum = array_total(get_matrix_row(state, trans));
      if (!(almost_equal(row_sum, 1.0, SLOP) ||
            almost_equal(row_sum, 2.0, SLOP) ||        // Allow FIMs.
            almost_equal(row_sum, 0.0, SLOP))) {       // Allow inaccessible motifs.
        fprintf(stderr,
                "Warning: Row %d of transition matrix differs from 1.0 by %g.\n",
                state, 1.0 - row_sum);
        return(FALSE);
      }
    }

    /* All transitions from the end state must be zero. */
    if (log_form) {
      leaves_end = (get_matrix_cell(end_state, state, trans) > LOG_SMALL);
    } else {
      leaves_end = !almost_equal(get_matrix_cell(end_state, state, trans),
                                 0.0, SLOP);
    }
    if (leaves_end) {
      fprintf(stderr,
              "Warning: Transition %d from end state is non-zero (%g).\n", 
              state, get_matrix_cell(end_state, state, trans));
      return(FALSE);
    }
  }
  return(TRUE);
}  
Exemple #5
0
/**************************************************************************
*	get_min_pvalue
*
*	Return the minimum p-value for a given pssm: the entry of pssm->pv
*	at the index given by the sum of the largest score in each row.
*
**************************************************************************/
static double get_min_pvalue(
  PSSM_T *pssm 			// The PSSM.
)
{
  int width = pssm->w;
  int alen = pssm->alphsize;
  int best_total = 0;		// Sum of per-row maxima, kept as an integer.
  int row, col;

  for (row = 0; row < width; row++) {
    double row_best = -BIG;
    for (col = 0; col < alen; col++) {
      double cell = get_matrix_cell(row, col, pssm->matrix);
      row_best = MAX(row_best, cell);
    }
    best_total += row_best;
  }

  // The best achievable score indexes the p-value table.
  return(get_array_item(best_total, pssm->pv));
} /* get_min_pvalue */
Exemple #6
0
/**************************************************************************
*
	hash_pssm_matrix_pos

	Recursively create a single position of a hashed PSSM.

	Each recursion level consumes one column of the original pssm and
	tries every letter plus a "blank"; the blank, and any column past
	the right edge of the motif, contributes a score of 0.  When no
	columns remain, the accumulated score is stored in the hashed pssm
	at [hashed_pos, index].

*
**************************************************************************/
static void hash_pssm_matrix_pos(
  MATRIX_T *pssm, 		// pssm to hash
  MATRIX_T *hashed_pssm, 	// hashed pssm
  int  pos,			// position in pssm
  int  hashed_pos,		// position in hashed pssm
  int  n,			// number of columns to hash together
  double score,			// cumulative score; call with 0
  int index			// cumulative index; call with 0
)
{
  int letter;
  int alen = get_num_cols(pssm);	// alphabet length
  int w = get_num_rows(pssm);		// pssm width

  // Base case: nothing left to combine, so store the result.
  if (n == 0) {
    set_matrix_cell(hashed_pos, index, score, hashed_pssm);
    return;
  }

  // Recursive case: letter == alen stands for the blank.
  for (letter = 0; letter <= alen; letter++) {
    double contribution = 0;
    if (pos < w && letter != alen) {
      contribution = get_matrix_cell(pos, letter, pssm);
    }
    hash_pssm_matrix_pos(pssm,
                         hashed_pssm,
                         pos + 1,			// next column of the old pssm
                         hashed_pos,			// position being built
                         n - 1,				// columns still to hash
                         score + contribution,		// score so far
                         index * (alen + 1) + letter);	// hashed index so far
  }
} // hash_pssm_matrix_pos
Exemple #7
0
/*************************************************************************
 * Calculate the log odds score for a single motif-sized window.
 *
 * Returns FALSE, leaving *score untouched, as soon as a character of the
 * window has no index in the core alphabet.  Otherwise the unscaled
 * score is stored in *score and TRUE is returned.  A scaled score at or
 * beyond the end of the p-value table is clamped to the last table
 * index before being converted back.
 *************************************************************************/
static inline BOOLEAN_T score_motif_site(
  ALPH_T alph,
  char *seq,
  PSSM_T *pssm,
  double *score // OUT
) {
  int core_size = alph_size(alph, ALPH_SIZE);
  MATRIX_T* matrix = pssm->matrix;
  double scaled_total = 0.0;
  int pos;

  for (pos = 0; pos < pssm->w; pos++) {
    int letter = alph_index(alph, seq[pos]);
    // Gaps and ambiguity codes cannot be scored.
    if (letter == -1 || letter >= core_size) {
      return FALSE;
    }
    scaled_total += get_matrix_cell(pos, letter, matrix);
  }

  *score = get_unscaled_pssm_score(scaled_total, pssm);

  // Out-of-range scores are clamped to the last p-value table entry.
  if ((int) scaled_total >= get_array_length(pssm->pv)) {
    scaled_total = (float)(get_array_length(pssm->pv) - 1);
    *score = scaled_to_raw(scaled_total, pssm->w, pssm->scale, pssm->offset);
  }
  return TRUE;
}
Exemple #8
0
/***********************************************************************
 * Apply a pseudocount to the motif pspm.
 *
 * Each frequency is converted to a count using the motif's site count
 * (DEFAULT_SITE_COUNT when the motif has none), the pseudocount is added
 * in proportion to the background, and the result is converted back to
 * a frequency.  A uniform background is used when none is supplied.
 * A pseudocount of zero leaves the motif untouched.
 ***********************************************************************/
void apply_pseudocount_to_motif
  (MOTIF_T* motif, ARRAY_T *background, double pseudocount)
{
  ARRAY_T *uniform = NULL;	// Only set when a background is built here.
  int width, alen, nsites;
  int row, col;
  double denominator;

  // Nothing to do for a zero pseudocount.
  if (pseudocount == 0) return;
  assert(pseudocount > 0);

  alen = alph_size(motif->alph, ALPH_SIZE);
  width = motif->length;

  // Fall back to a uniform background.
  if (background == NULL) {
    uniform = allocate_array(alen);
    get_uniform_frequencies(motif->alph, uniform);
    background = uniform;
  }

  if (motif->num_sites > 0) {
    nsites = motif->num_sites;
  } else {
    nsites = DEFAULT_SITE_COUNT;
  }
  denominator = nsites + pseudocount;

  for (row = 0; row < width; ++row) {
    for (col = 0; col < alen; ++col) {
      double old_freq = get_matrix_cell(row, col, motif->freqs);
      double smoothed =
        (old_freq * nsites) + (pseudocount * get_array_item(col, background));
      set_matrix_cell(row, col, smoothed / denominator, motif->freqs);
    }
  }

  // Release the background only if it was created here.
  if (uniform) free_array(uniform);
}
Exemple #9
0
/**************************************************************************
 * reorder_matrix
 *
 * Return a newly allocated square matrix whose rows and columns follow
 * the letter order of alpha2, filled from in_matrix, which is indexed in
 * the letter order of alpha1.
 *
 * Every letter of alpha2 must occur in alpha1; otherwise the function
 * dies.  All letters are validated before anything is allocated and
 * before any pointer arithmetic is done on a lookup result.
 **************************************************************************/
MATRIX_T *reorder_matrix(
  const char *alpha1,				/* current alphabet */
  const char *alpha2,				/* new alphabet; must be subset */
  MATRIX_T *in_matrix			/* matrix to reorder */
)
{
  int i, j;
  int alen1 = strlen(alpha1);
  int alen2 = strlen(alpha2);
  MATRIX_T *out_matrix;

  if (alen2 > alen1) 
    die("The new alphabet %s must be a subset of the old alphabet %s.\n", alpha2, alpha1);

  /* Check every new letter up front: subtracting alpha1 from a NULL
     strchr() result would be undefined behaviour. */
  for (i=0; i<alen2; i++) {
    if (!strchr(alpha1, alpha2[i]))
      die("The new alphabet %s must be a subset of the old alphabet %s\n", alpha2, alpha1);
  }

  out_matrix = allocate_matrix(alen2, alen2);
  for (i=0; i<alen2; i++) {
    /* index of the i-th new letter in the old alphabet */
    int ii = strchr(alpha1, alpha2[i]) - alpha1;
    for (j=0; j<alen2; j++) {
      int jj = strchr(alpha1, alpha2[j]) - alpha1;
      set_matrix_cell(i, j, get_matrix_cell(ii, jj, in_matrix), out_matrix);
    }
  }
  return(out_matrix);
} /* reorder_matrix */
Exemple #10
0
/**************************************************************************
 * Get pseudocount frequencies.
 *
 * For each core letter i the pseudocount is the sum over core letters j
 * of (f[j] / b[j]) * target_freq[i,j].
 *
 * The target_freq matrix only has values for the basic alphabet, so the
 * ambiguous character entries of the result are filled in afterwards by
 * calc_ambigs().  The returned array is newly allocated and sized for
 * the full alphabet.
 **************************************************************************/
ARRAY_T *get_pseudocount_freqs(
   ALPH_T alph,
   ARRAY_T *	  f,		/* Foreground distribution. */
   ARRAY_T *      b,		/* Background distribution. */
   MATRIX_T *     target_freq	/* Target frequency matrix. */
)
{
  int core_size = alph_size(alph, ALPH_SIZE);		// excludes ambigs
  ARRAY_T *pseudo = allocate_array(alph_size(alph, ALL_SIZE));	// includes ambigs
  int target, source;

  for (target = 0; target < core_size; target++) {
    double sum = 0;
    for (source = 0; source < core_size; source++) {
      double q = get_matrix_cell(target, source, target_freq);
      double fg = get_array_item(source, f);
      double bg = get_array_item(source, b);
      sum += (fg/bg) * q;
    }
    set_array_item(target, sum, pseudo);
    if (SUBST_MATRIX_DEBUG) {
      printf("%g %g, ", get_array_item(target, f), sum);
    }
  }

  /* Fill in the ambiguous letters (takes the average pseudocount). */
  calc_ambigs(alph, FALSE, pseudo);
  if (SUBST_MATRIX_DEBUG) {
    printf("\n");
  }

  return(pseudo);					/* return the pseudocounts */
} /* get_pseudocount_freqs */
Exemple #11
0
/**********************************************************************
 post_process()

 Adjust/normalize scores and p-values of every scanned sequence of every
 pattern in the CisML structure.

 Sequences scored more than once have their score averaged and their
 p-value corrected for multiple testing.  When normalize_scores is set,
 each score is additionally divided by the product of the per-position
 maximum entries of the pattern's positive-strand PSSM.
 **********************************************************************/
void post_process(CISML_T* cisml, ARRAYLST_T* motifs, BOOLEAN_T normalize_scores){
  int pat_idx, seq_idx;

  for (pat_idx = 0; pat_idx < get_cisml_num_patterns(cisml); ++pat_idx) {
    PATTERN_T* pattern = get_cisml_patterns(cisml)[pat_idx];
    double divisor = 1;

    // FIXME: This should be done to the PSSM, not the individual scores!!!
    // Normalize the scores to RMA format if necessary.
    if (normalize_scores) {
      MOTIF_AND_PSSM_T *entry = (MOTIF_AND_PSSM_T*)arraylst_get(pat_idx, motifs);
      PSSM_T* pssm = entry->pssm_pair->pos_pssm;
      int col;
      for (col = 0; col < pssm->w; col++) {
        double col_best = -BIG;    // A score, not a probability.
        int letter;
        for (letter = 0; letter < alph_size_core(pssm->alph); letter++) {
          double cell = get_matrix_cell(col, letter, pssm->matrix);
          if (col_best < cell) col_best = cell;
        }
        divisor *= col_best;
      }
    }

    // Adjust each scanned sequence.
    for (seq_idx = 0; seq_idx < get_pattern_num_scanned_sequences(pattern);
        ++seq_idx) {
      SCANNED_SEQUENCE_T* sseq =
        get_pattern_scanned_sequences(pattern)[seq_idx];

      // Only adjust when more than one copy was scored.
      // num_scanned_positions is (mis-)used in ama to indicate the number
      // of times a sequence identifier occurred in the set.
      if (get_scanned_sequence_num_scanned_positions(sseq) > 1L) {
        // Replace the summed score by its average.
        if (has_scanned_sequence_score(sseq)) {
          double mean_score = get_scanned_sequence_score(sseq) /
            get_scanned_sequence_num_scanned_positions(sseq);
          set_scanned_sequence_score(sseq, mean_score);
        }
        // Correct the minimum p-value for multiple hypothesis testing.
        if (has_scanned_sequence_pvalue(sseq)) {
          double adjusted = 1.0 - pow(
              1.0 - get_scanned_sequence_pvalue(sseq),
              get_scanned_sequence_num_scanned_positions(sseq)
              );
          set_scanned_sequence_pvalue(sseq, adjusted);
        }
      }

      // Normalize if requested.
      if (normalize_scores) {
        set_scanned_sequence_score(sseq,
            get_scanned_sequence_score(sseq) / divisor
            );
      }
    }
  }
}
Exemple #12
0
/**************************************************************************
 * gen_pam_matrix
 *
 * Generate a PAM matrix for the DNA or protein alphabet at the given PAM
 * distance.  A PAM 1 conditional matrix is built from the built-in
 * tables, raised to the requested power by repeated multiplication, and
 * finally converted either to a rounded log-odds matrix or to a target
 * (joint) frequency matrix.  The returned matrix is newly allocated.
 **************************************************************************/
extern MATRIX_T* gen_pam_matrix(
  ALPH_T alph,                  /* alphabet */
  int dist,			/* PAM distance */
  BOOLEAN_T logodds		/* true: generate log-odds matrix 
				   false: generate target frequency matrix 
				*/
)
{
  assert(alph == DNA_ALPH || alph == PROTEIN_ALPH);
  BOOLEAN_T dna = (alph == DNA_ALPH);
  double *pfreq = dna ? pam_dna_freq : pam_prot_freq;	// standard frequencies
  int alen = alph_size(alph, ALPH_SIZE);  // length of standard alphabet
  MATRIX_T *matrix, *mul;
  double factor;
  int row, col, remaining;

  // Same scaling factor as in "pam" Version 1.0.6.
  if (dist < 170) {
    factor = 2/log(2);
  } else {
    factor = 3/log(2);
  }

  /* working matrix plus a copy of PAM 1 to multiply by */
  matrix = allocate_matrix(alen, alen);
  mul = allocate_matrix(alen, alen);

  /* PAM 1: because of roundoff, average the two estimates of the joint
     frequency of a letter pair, then derive both conditionals from it.
     Only the lower triangle is visited; the mirror cell is set alongside. */
  for (row = 0; row < alen; row++) {
    for (col = 0; col <= row; col++) {
      double v_rc = dna ? trans[row][col] : dayhoff[row][col];
      double v_cr = dna ? trans[col][row] : dayhoff[col][row];
      double joint = ((v_rc * pfreq[col]) + (v_cr * pfreq[row]))/20000;
      set_matrix_cell(row, col, joint/pfreq[col], matrix);
      if (row != col) {
        set_matrix_cell(col, row, joint/pfreq[row], matrix);
      }
    }
  }

  /* raise PAM 1 to the requested power: dist - 1 multiplications */
  copy_matrix(matrix, mul);
  remaining = dist;
  while (remaining > 1) {
    MATRIX_T *product = matrix_multiply(matrix, mul);
    /* after the swap, product holds the previous matrix, which is freed */
    SWAP(MATRIX_T*, product, matrix)
    free_matrix(product);
    remaining--;
  }
  free_matrix(mul);

  /* convert to joint or logodds matrix:
     target:  J_ij = Pr(i,j) = Mij pr(j) 
     logodds: L_ij = log (Pr(i,j)/(Pr(i)Pr(j)) = log (Mij Pr(j)/Pr(i)Pr(j)) = log(Mij/pr(i)) 
  */
  for (row = 0; row < alen; row++) {
    for (col = 0; col < alen; col++) {
      double cell = get_matrix_cell(row, col, matrix);
      if (logodds) {
        cell = nint(factor * log((cell+EPSILON)/pfreq[row]));
      } else {
        cell = cell * pfreq[col];
      }
      set_matrix_cell(row, col, cell, matrix);
    }
  }

  return matrix;
} /* gen_pam_matrix */
Exemple #13
0
/***********************************************************************
 * Converts a TRANSFAC motif to a MEME motif.
 *
 * Each row of the TRANSFAC counts matrix is turned into frequencies:
 *   freq = (count + pseudocount * bg) / (row total + pseudocount)
 * The row total is computed per position, because motif columns may have
 * different counts, and it is accumulated in floating point so that
 * fractional counts are not truncated.
 *
 * Dies if the TRANSFAC motif has no counts matrix.
 * Caller is responsible for freeing the returned MOTIF_T.
 ***********************************************************************/
MOTIF_T *convert_transfac_motif_to_meme_motif(
    char *id,
    int pseudocount,
    ARRAY_T *bg,
    TRANSFAC_MOTIF_T *motif
) {
    MATRIX_T *counts = get_transfac_counts(motif);
    if (counts == NULL) {
        die(
            "Unable to convert TRANSFAC motif %s to MEME motif: "
            "missing counts matrix.",
            id
        );
    }

    // Convert the motif counts to frequencies.
    int num_bases = get_num_cols(counts);
    int motif_width = get_num_rows(counts);
    int motif_position = 0;
    MATRIX_T *freqs = allocate_matrix(motif_width, num_bases);
    for (motif_position = 0; motif_position < motif_width; ++motif_position) {
        int i_base = 0;
        // Kept as a double: an int accumulator would drop the fractional
        // part of every partial sum.
        double num_seqs = 0.0; // motif columns may have different counts
        for (i_base = 0; i_base < num_bases; i_base++) {
            num_seqs += get_matrix_cell(motif_position, i_base, counts);
        }
        for (i_base = 0; i_base < num_bases; i_base++) {
            double freq =
                (get_matrix_cell(motif_position, i_base, counts)
                 + (pseudocount * get_array_item(i_base, bg))) / (num_seqs + pseudocount);
            set_matrix_cell(motif_position, i_base, freq, freqs);
        }
    }

    MOTIF_T *meme_motif = allocate_motif(id, DNA_ALPH, NULL, freqs);
    calc_motif_ambigs(meme_motif);
    return meme_motif;
}
Exemple #14
0
/***********************************************************************
 * Return the counts at one motif position as a newly allocated array:
 * each entry is the stored frequency multiplied by the motif's number
 * of sites.
 ***********************************************************************/
ARRAY_T* get_motif_counts
  (int      position,
   MOTIF_T* motif)
{
  ARRAY_T* counts = allocate_array(motif->alph_size);
  int letter = 0;

  while (letter < motif->alph_size) {
    set_array_item(letter,
                   motif->num_sites * get_matrix_cell(position,
                                                      letter, motif->freqs),
                   counts);
    letter++;
  }
  return(counts);
}
Exemple #15
0
/*************************************************************************
 * Output JSON data for a motif
 *
 * Writes one JSON object holding the motif's identifiers, length,
 * E-value, site count, optional URL, its frequency matrix ("pwm"), the
 * statistics taken from stats, and a "sites" array sampled from
 * counts->sites.
 *************************************************************************/
static void output_motif_json(JSONWR_T* json, MOTIF_STATS_T* stats, 
    SITE_COUNTS_T* counts) {
  MOTIF_T *motif = stats->motif;
  MATRIX_T *freqs = get_motif_freqs(motif);
  int alen = alph_size(get_motif_alph(motif), ALPH_SIZE);
  int width, row, col, idx, stop;

  jsonwr_start_object_value(json);

  // Identification.
  jsonwr_lng_prop(json, "db", stats->db->id);
  jsonwr_str_prop(json, "id", get_motif_id(motif));
  if (*(get_motif_id2(motif))) {
    // The alternate id is only written when it is non-empty.
    jsonwr_str_prop(json, "alt", get_motif_id2(motif));
  }

  // General motif properties.
  width = get_motif_length(motif);
  jsonwr_lng_prop(json, "len", width);
  jsonwr_dbl_prop(json, "motif_evalue", get_motif_evalue(motif));
  jsonwr_dbl_prop(json, "motif_nsites", get_motif_nsites(motif));
  if (get_motif_url(motif) && *get_motif_url(motif)) {
    jsonwr_str_prop(json, "url", get_motif_url(motif));
  }

  // Frequency matrix: one inner array per motif position.
  jsonwr_property(json, "pwm");
  jsonwr_start_array_value(json);
  for (row = 0; row < width; row++) {
    jsonwr_start_array_value(json);
    for (col = 0; col < alen; col++) {
      jsonwr_dbl_value(json, get_matrix_cell(row, col, freqs));
    }
    jsonwr_end_array_value(json);
  }
  jsonwr_end_array_value(json);

  // Statistics.
  jsonwr_lng_prop(json, "bin_width", stats->central_window+1);
  jsonwr_dbl_prop(json, "bin_sites", stats->central_sites);
  jsonwr_lng_prop(json, "total_sites", counts->total_sites);
  jsonwr_dbl_prop(json, "log_pvalue", stats->log_adj_pvalue);
  jsonwr_dbl_prop(json, "max_prob", stats->max_prob);

  // Site counts: every second entry, skipping (width - 1) entries at
  // each end of the allocated range.
  jsonwr_property(json, "sites");
  jsonwr_start_array_value(json);
  stop = counts->allocated - (width - 1);
  idx = width - 1;
  while (idx < stop) {
    jsonwr_dbl_value(json, counts->sites[idx]);
    idx += 2;
  }
  jsonwr_end_array_value(json);

  jsonwr_end_object_value(json);
}
Exemple #16
0
/**************************************************************************
 * get_subst_target_matrix
 *
 * Obtain a score matrix via get_score_matrix(), convert it to a target
 * frequency matrix with convert_score_to_target(), free the score matrix
 * and return the target matrix.  When SUBST_MATRIX_DEBUG is set, the
 * score matrix and the sum of its entries are printed first.
 **************************************************************************/
MATRIX_T *get_subst_target_matrix(
  char *score_filename,		/* name of score file */
  ALPH_T alph,                  /* alphabet */
  int dist,			/* PAM distance (ignored if score_filename != NULL) */
  ARRAY_T *back			/* background frequencies of standard alphabet */
)
{
  MATRIX_T *score = get_score_matrix(score_filename, alph, dist);
  MATRIX_T *target = convert_score_to_target(score, back);

  if (SUBST_MATRIX_DEBUG)
  {
    int row, col;
    int alen = alph_size(alph, ALPH_SIZE);
    double total = 0;

    /* where the scores came from */
    if (score_filename) {
      printf("From file %s\n", score_filename);
    } else {
      printf("Generated PAM %d\n", dist);
    }

    /* column header */
    printf("%6c ", ' ');
    for (col = 0; col < alen; col++) {
      printf("%6c ", alph_char(alph, col));
    }
    printf("\n");

    /* one labelled line per row of the score matrix */
    for (row = 0; row < alen; row++) {
      printf("%6c ", alph_char(alph, row));
      for (col = 0; col < alen; col++) {
        double cell = get_matrix_cell(row, col, score);
        total += cell;
        printf("%6.4f ", cell);
      }
      printf("\n");
    }
    printf("sum of entries = %f\n", total);
  }

  free_matrix(score);

  return(target);
} /* get_subst_target_matrix */
Exemple #17
0
/***********************************************************************
 * Takes a matrix of meme scores and converts them into letter 
 * probabilities.
 *
 * The probability is recovered from a score by:
 * p = (2 ^ (s / 100)) * bg
 * after which the pseudo count is removed, the value is clamped to
 * [0, 1], and each row is normalized.
 *
 * Returns a newly allocated matrix with one row per row of scores and
 * one column per core alphabet letter.
 ***********************************************************************/
MATRIX_T* convert_scores_into_freqs
  (ALPH_T alph,
   MATRIX_T *scores,
   ARRAY_T *bg,
   int site_count,
   double pseudo_count)
{
  MATRIX_T *freqs;
  int width, alen;
  int pos, letter;
  double total_count;

  assert(alph != INVALID_ALPH);
  assert(scores != NULL);
  assert(bg != NULL);

  width = get_num_rows(scores);
  alen = alph_size(alph, ALPH_SIZE);

  freqs = allocate_matrix(width, alen);
  total_count = site_count + pseudo_count;

  for (pos = 0; pos < width; ++pos) {
    for (letter = 0; letter < alen; ++letter) {
      double bg_freq = get_array_item(letter, bg);
      double score = get_matrix_cell(pos, letter, scores);
      // score -> probability (pseudo count still included)
      double freq = pow(2.0, score / 100.0) * bg_freq;
      // take the pseudo count back out
      freq = ((freq * total_count) - (bg_freq * pseudo_count)) / site_count;
      // clamp to a valid probability
      if (freq < 0) {
        freq = 0;
      } else if (freq > 1) {
        freq = 1;
      }
      set_matrix_cell(pos, letter, freq, freqs);
    }
    // the row is complete, so it can be normalized right away
    normalize_subarray(0, alen, 0.0, get_matrix_row(pos, freqs));
  }

  return freqs;
}
/************************************************************************
 * Count the non-zero transitions into or out of one state.
 *
 * With TRANS_IN the column of state_num is examined; with TRANS_OUT the
 * row of state_num is examined.  Only that one row or column is visited,
 * since no other cell can contribute to the count.
 *
 * Returns 0 when state_num is outside [0, num_states) or when in_or_out
 * is neither TRANS_IN nor TRANS_OUT.
 ************************************************************************/
static int count_trans
  (MATRIX_T* trans,      /* The transition matrix. */
   BOOLEAN_T log_form,   /* Is the transition matrix in log form? */
   int       num_states, /* Number of states in the (square) matrix. */
   int       state_num,  /* Index of the state we're interested in. */
   int       in_or_out)  /* Incoming or outgoing transitions? */
{
  int i_other;
  int ntrans = 0;  /* The return value. */

  /* No cell of the matrix belongs to an out-of-range state. */
  if ((state_num < 0) || (state_num >= num_states)) {
    return(0);
  }

  if (in_or_out == TRANS_IN) {
    /* Incoming: walk down the state's column. */
    for (i_other = 0; i_other < num_states; i_other++) {
      if (!is_zero(get_matrix_cell(i_other, state_num, trans), log_form)) {
        ntrans++;
      }
    }
  } else if (in_or_out == TRANS_OUT) {
    /* Outgoing: walk along the state's row. */
    for (i_other = 0; i_other < num_states; i_other++) {
      if (!is_zero(get_matrix_cell(state_num, i_other, trans), log_form)) {
        ntrans++;
      }
    }
  }
  return(ntrans);
} // count_trans
Exemple #19
0
/**************************************************************************
 * convert_score_to_target
 *
 * Convert a square score matrix into a target frequency matrix.  Lambda
 * is obtained from karlin(), and each target entry is
 *   prob[i] * prob[j] * exp(lambda * score[i,j]).
 * The returned matrix is newly allocated.
 **************************************************************************/
MATRIX_T *convert_score_to_target(
  MATRIX_T *score,			/* score matrix */
  ARRAY_T *prob				/* letter frequencies */
)
{
  int alen = get_num_rows(score);	/* alphabet length */
  MATRIX_T *target;			/* target freq. matrix */
  KARLIN_INPUT_T *kin;
  double lambda, K, H;
  int row, col;

  /* build the input for karlin() and solve for lambda */
  kin = make_karlin_input(score, prob);
  karlin(kin->low, kin->high, kin->prob->items,
    &lambda, &K, &H);

  /* fill in the target frequencies */
  target = allocate_matrix(alen, alen);
  for (row = 0; row < alen; row++) {
    for (col = 0; col < alen; col++) {
      double p_row = get_array_item(row, prob);
      double p_col = get_array_item(col, prob);
      double s = get_matrix_cell(row, col, score);
      set_matrix_cell(row, col, p_row * p_col * exp(lambda * s), target);
    }
  }

  /* the karlin input is no longer needed */
  free_array(kin->prob);
  myfree(kin);

  return(target);
} /* convert_score_to_target */
/************************************************************************
 * Compute the indices and values of transitions to or from a state.
 ************************************************************************/
void compute_ins_and_outs
  (MHMM_T*   the_hmm,
   BOOLEAN_T log_form) /* Is the transition matrix in log form? */
{
  int i_row, i_col;
  int n = the_hmm->num_states;
  MATRIX_T *trans = the_hmm->trans;

  //
  // Visit the transition matrix cells just once each
  // to update ntrans, itrans and trans arrays.
  // This is quadratic in n. 
  //
  for (i_row = 0; i_row < n; i_row++) {
    for (i_col = 0; i_col < n; i_col++) {
      double p;                         // The transition probability.
      int old_n, new_n;                 // Number of transitions.
      if (!is_zero((p = get_matrix_cell(i_row, i_col, trans)), log_form)) {
        MHMM_STATE_T * out_state = &(the_hmm->states[i_row]);
        MHMM_STATE_T * in_state = &(the_hmm->states[i_col]);
        // out
        old_n = out_state->ntrans_out; 
        new_n = ++out_state->ntrans_out;
        mm_resize(out_state->itrans_out, new_n, int);
        out_state->trans_out = resize_array(out_state->trans_out, new_n);
        out_state->itrans_out[old_n] = i_col;
        set_array_item(old_n, p, out_state->trans_out);
        // in
        old_n = in_state->ntrans_in; 
        new_n = ++in_state->ntrans_in;
        mm_resize(in_state->itrans_in, new_n, int);
        in_state->trans_in = resize_array(in_state->trans_in, new_n);
        in_state->itrans_in[old_n] = i_row;
        set_array_item(old_n, p, in_state->trans_in);
      }
    } // col
  } // row
Exemple #21
0
/***********************************************************************
 * Takes a matrix of letter probabilities and converts them into meme
 * score.
 *
 * A pseudo count is applied to each probability first.  Assuming the
 * resulting probability is nonzero the score is just: 
 * s = log2(p / bg) * 100
 * A non-positive probability is replaced by a small positive value
 * before the logarithm is taken.
 *
 * Returns a newly allocated matrix with one row per row of freqs and
 * one column per core alphabet letter.
 ***********************************************************************/
MATRIX_T* convert_freqs_into_scores
  (ALPH_T alph,
   MATRIX_T *freqs,
   ARRAY_T *bg,
   int site_count,
   double pseudo_count) 
{
  MATRIX_T *scores;
  int width, alen;
  int pos, letter;
  double total_count;

  assert(alph != INVALID_ALPH);
  assert(freqs != NULL);
  assert(bg != NULL);

  width = get_num_rows(freqs);
  alen = alph_size(alph, ALPH_SIZE);

  scores = allocate_matrix(width, alen);
  total_count = site_count + pseudo_count;

  for (pos = 0; pos < width; ++pos) {
    for (letter = 0; letter < alen; ++letter) {
      double bg_freq = get_array_item(letter, bg);
      double freq = get_matrix_cell(pos, letter, freqs);
      // mix in the pseudo count
      freq = ((pseudo_count * bg_freq) + (freq * site_count)) / total_count;
      // if the background is correct this shouldn't happen
      if (freq <= 0) {
        freq = 0.0000005;
      }
      // log-odds in base 2, scaled by 100
      set_matrix_cell(pos, letter,
                      (log(freq / bg_freq) / log(2)) * 100,
                      scores);
    }
  }
  return scores;
}
/*****************************************************************************
 * MEME > motifs > motif > probabilities > alphabet_matrix > alphabet_array > /value
 * Look up the letter for an identifier, check that it is a core letter
 * whose probability at the current position is still unset (stored as
 * -1), and store the passed probability for it.  Any failed check is
 * reported through local_error() and nothing is stored.
 ****************************************************************************/
void mxml_probability_value(void *ctx, char *letter_id, double probability) {
  CTX_T *data = (CTX_T*)ctx;
  MATRIX_T *freqs = data->mscope.motif->freqs;
  char *symbol;
  int col;

  // Translate the identifier into the letter it stands for.
  symbol = (char*)rbtree_get(data->letter_lookup, letter_id);
  if (symbol == NULL) {
    local_error(data, "Probability is not allowed for unknown letter identifier \"%s\".\n", letter_id);
    return;
  }

  // The letter must have an index in the alphabet.
  col = alph_indexc(data->alph, symbol[0]);
  if (col < 0) {
    local_error(data, "Probability is not allowed for non-core letter %c.\n", symbol[0]);
    return;
  }

  // Any value other than -1 means the cell was already filled in.
  if (get_matrix_cell(data->current_pos, col, freqs) != -1) {
    local_error(data, "Probability for letter %c in position %d has already been set.\n", symbol[0], data->current_pos + 1);
    return;
  }

  set_matrix_cell(data->current_pos, col, probability, freqs);
}
Exemple #23
0
/*************************************************************************
 * Entry point for pmp_bf
 *
 * Parses the command line, reads a phylogenetic tree and a MEME motif
 * file, and for every (selected) motif prints one line per score index
 * with the false positive rate (p-value under the background model) and
 * the false negative rate (1 - p-value under the position-dependent
 * motif model).
 *
 * Exits with EXIT_FAILURE after printing the usage message when the two
 * positional arguments (<tree file> <MEME file>) are not both present.
 *************************************************************************/
int main(int argc, char *argv[]) {

  char* bg_filename = NULL;
  // NOTE(review): motif_name is assigned from --motif-name but never read
  // anywhere in this function.
  char* motif_name = "motif"; // Use this motif name in the output.
  STRING_LIST_T* selected_motifs = NULL;
  double fg_rate = 1.0;
  double bg_rate = 1.0;
  double purine_pyrimidine = 1.0; // r
  double transition_transversion = 0.5; // R
  double pseudocount = 0.1;
  GAP_SUPPORT_T gap_support = SKIP_GAPS;
  MODEL_TYPE_T model_type = F81_MODEL;
  BOOLEAN_T use_halpern_bruno = FALSE;
  char* ustar_label = NULL;	// TLB; create uniform star tree
  int i;

  program_name = "pmp_bf";

  /**********************************************
   * COMMAND LINE PROCESSING
   **********************************************/

  // Define command line options. (FIXME: Repeated code)
  cmdoption const pmp_options[] = {
    {"hb", NO_VALUE},
    {"ustar", REQUIRED_VALUE},
    {"model", REQUIRED_VALUE},
    {"pur-pyr", REQUIRED_VALUE},
    {"transition-transversion", REQUIRED_VALUE},
    {"bg", REQUIRED_VALUE},
    {"fg", REQUIRED_VALUE},
    {"motif", REQUIRED_VALUE},
    {"motif-name", REQUIRED_VALUE},
    {"bgfile", REQUIRED_VALUE},
    {"pseudocount", REQUIRED_VALUE},
    {"verbosity", REQUIRED_VALUE}
  };
  // Derived from the table so adding or removing an option cannot leave
  // the count out of date.
  int n_options = (int) (sizeof(pmp_options) / sizeof(pmp_options[0]));

  int option_index = 0;

  // Define the usage message.  A single constant string avoids the fixed
  // size buffer that a chain of strcat calls would need.  The output
  // format line matches the printf below, which reports FPR and FNR.
  static const char usage[] =
    "USAGE: pmp_bf [options] <tree file> <MEME file>\n"
    "\n"
    "   Options:\n"
    // Evolutionary model parameters.
    "     --hb\n"
    "     --model single|average|jc|k2|f81|f84|hky|tn"
    " (default=f81)\n"
    "     --pur-pyr <float> (default=1.0)\n"
    "     --transition-transversion <float> (default=0.5)\n"
    "     --bg <float> (default=1.0)\n"
    "     --fg <float> (default=1.0)\n"
    // Motif parameters.
    "     --motif <id> (default=all)\n"
    "     --motif-name <string> (default from motif file)\n"
    // Miscellaneous parameters
    "     --bgfile <background> (default from motif file)\n"
    "     --pseudocount <float> (default=0.1)\n"
    "     --ustar <label>\n"	// TLB; create uniform star tree
    "     --verbosity [1|2|3|4] (default 2)\n"
    "\n    Prints the FP and FN rate at each of 10000 score values.\n"
    "\n    Output format: [<motif_id> score <score> FPR <fpr> FNR <fnr>]+\n";

  // Parse the command line.
  if (simple_setopt(argc, argv, n_options, pmp_options) != NO_ERROR) {
    die("Error processing command line options: option name too long.\n");
  }

  while (TRUE) { 
    int c = 0;
    char* option_name = NULL;
    char* option_value = NULL;
    const char * message = NULL;

    // Read the next option, and break if we're done.
    c = simple_getopt(&option_name, &option_value, &option_index);
    if (c == 0) {
      break;
    } else if (c < 0) {
      (void) simple_getopterror(&message);
      die("Error processing command line options (%s)\n", message);
    }
    
    if (strcmp(option_name, "model") == 0) {
      if (strcmp(option_value, "jc") == 0) {
        model_type = JC_MODEL;
      } else if (strcmp(option_value, "k2") == 0) {
        model_type = K2_MODEL;
      } else if (strcmp(option_value, "f81") == 0) {
        model_type = F81_MODEL;
      } else if (strcmp(option_value, "f84") == 0) {
        model_type = F84_MODEL;
      } else if (strcmp(option_value, "hky") == 0) {
        model_type = HKY_MODEL;
      } else if (strcmp(option_value, "tn") == 0) {
        model_type = TAMURA_NEI_MODEL;
      } else if (strcmp(option_value, "single") == 0) {
        model_type = SINGLE_MODEL;
      } else if (strcmp(option_value, "average") == 0) {
        model_type = AVERAGE_MODEL;
      } else {
        die("Unknown model: %s\n", option_value);
      }
    } else if (strcmp(option_name, "hb") == 0){
        use_halpern_bruno = TRUE;
    } else if (strcmp(option_name, "ustar") == 0){	// TLB; create uniform star tree
        ustar_label = option_value;
    } else if (strcmp(option_name, "pur-pyr") == 0){
        purine_pyrimidine = atof(option_value);
    } else if (strcmp(option_name, "transition-transversion") == 0){
        transition_transversion = atof(option_value);
    } else if (strcmp(option_name, "bg") == 0){
      bg_rate = atof(option_value);
    } else if (strcmp(option_name, "fg") == 0){
      fg_rate = atof(option_value);
    } else if (strcmp(option_name, "motif") == 0){
        // The list is created lazily; NULL means "use every motif".
        if (selected_motifs == NULL) {
          selected_motifs = new_string_list();
        }
       add_string(option_value, selected_motifs);
    } else if (strcmp(option_name, "motif-name") == 0){
        motif_name = option_value;
    } else if (strcmp(option_name, "bgfile") == 0){
      bg_filename = option_value;
    } else if (strcmp(option_name, "pseudocount") == 0){
        pseudocount = atof(option_value);
    } else if (strcmp(option_name, "verbosity") == 0){
        verbosity = atoi(option_value);
    }
  }

  // Must have tree and motif file names
  if (argc != option_index + 2) {
    fprintf(stderr, "%s", usage);
    exit(EXIT_FAILURE);
  } 

  /**********************************************
   * Read the phylogenetic tree.
   **********************************************/
  char* tree_filename = NULL;
  TREE_T* tree = NULL;
  tree_filename = argv[option_index];
  option_index++;
  tree = read_tree_from_file(tree_filename);

  // get the species names
  STRING_LIST_T* alignment_species = make_leaf_list(tree);
  char *root_label = get_label(tree);	// in case target in center
  if (strlen(root_label)>0) add_string(root_label, alignment_species);
  //write_string_list(" ", alignment_species, stderr);

  // TLB; Convert the tree to a uniform star tree with
  // the target sequence at its center.
  if (ustar_label != NULL) {
    tree = convert_to_uniform_star_tree(tree, ustar_label);
    if (tree == NULL) 
      die("Tree or alignment missing target %s\n", ustar_label);
    if (verbosity >= NORMAL_VERBOSE) {
      fprintf(stderr, 
	"Target %s placed at center of uniform (d=%.3f) star tree:\n", 
          ustar_label, get_total_length(tree) / get_num_children(tree) 
      );
      write_tree(tree, stderr);
    }
  }

  /**********************************************
   * Read the motifs.
   **********************************************/
  char* meme_filename = argv[option_index];
  option_index++;
  int num_motifs = 0; 

  MREAD_T *mread;
  ALPH_T alph;
  ARRAYLST_T *motifs;
  ARRAY_T *bg_freqs;

  mread = mread_create(meme_filename, OPEN_MFILE);
  mread_set_bg_source(mread, bg_filename);
  mread_set_pseudocount(mread, pseudocount);
  // read motifs
  motifs = mread_load(mread, NULL);
  alph = mread_get_alphabet(mread);
  bg_freqs = mread_get_background(mread);
  // check
  if (arraylst_size(motifs) == 0) die("No motifs in %s.", meme_filename);

  

  // TLB; need to resize bg_freqs array to ALPH_SIZE items
  // or copy array breaks in HB mode.  This throws away
  // the freqs for the ambiguous characters;
  int asize = alph_size(alph, ALPH_SIZE);
  resize_array(bg_freqs, asize);

  /**************************************************************
  * Compute probability distributions for each of the selected motifs.
  **************************************************************/
  int motif_index;
  for (motif_index = 0; motif_index < arraylst_size(motifs); motif_index++) {

    MOTIF_T* motif = (MOTIF_T*)arraylst_get(motif_index, motifs);
    char* motif_id = get_motif_id(motif);
    char* bare_motif_id = motif_id;

    // We may have specified on the command line that
    // only certain motifs were to be used.
    if (selected_motifs != NULL) {
      if (*bare_motif_id == '+' || *bare_motif_id == '-') {
        // The selected motif id won't include a strand indicator.
        bare_motif_id++;
      }
      if (have_string(bare_motif_id, selected_motifs) == FALSE) {
        continue;
      }
    }

    if (verbosity >= NORMAL_VERBOSE) {
      fprintf(
        stderr, 
        "Using motif %s of width %d.\n",
        motif_id, get_motif_length(motif)
      );
    }

    // Build an array of evolutionary models for each position in the motif.
    EVOMODEL_T** models = make_motif_models(
      motif, 
      bg_freqs,
      model_type,
      fg_rate, 
      bg_rate, 
      purine_pyrimidine, 
      transition_transversion, 
      use_halpern_bruno
    );

    // Get the frequencies under the background model (row 0) 
    // and position-dependent scores (rows 1..w)
    // for each possible alignment column.
    MATRIX_T* pssm_matrix = build_alignment_pssm_matrix(
      alph,
      alignment_species,
      get_motif_length(motif) + 1, 
      models, 
      tree, 
      gap_support
    );
    ARRAY_T* alignment_col_freqs = allocate_array(get_num_cols(pssm_matrix)); 
    copy_array(get_matrix_row(0, pssm_matrix), alignment_col_freqs);
    remove_matrix_row(0, pssm_matrix);		// throw away first row
    //print_col_frequencies(alph, alignment_col_freqs);

    //
    // Get the position-dependent null model alignment column frequencies
    //
    int w = get_motif_length(motif);
    int ncols = get_num_cols(pssm_matrix); 
    MATRIX_T* pos_dep_bkg = allocate_matrix(w, ncols);
    for (i=0; i<w; i++) {
      // get the evo model corresponding to this column of the motif
      // and store it as the first evolutionary model.
      // NOTE(review): the previous models[0] is released with myfree here
      // but with free_model at the end of the motif loop -- confirm which
      // one matches how make_model allocates.
      myfree(models[0]);
      // Use motif PSFM for equilibrium freqs. for model.
      // NOTE(review): site_specific_freqs is never freed in this function;
      // confirm whether make_model takes ownership of it.
      ARRAY_T* site_specific_freqs = allocate_array(asize);
      int j = 0;
      for(j = 0; j < asize; j++) {
        double value = get_matrix_cell(i, j, get_motif_freqs(motif));
        set_array_item(j, value, site_specific_freqs);
      }
      if (use_halpern_bruno == FALSE) {
        models[0] = make_model(
          model_type,
          fg_rate,
          transition_transversion,
          purine_pyrimidine,
          site_specific_freqs,
          NULL
        );
      } else {
        models[0] = make_model(
          model_type,
          fg_rate,
          transition_transversion,
          purine_pyrimidine,
          bg_freqs,
          site_specific_freqs
        );
      }
      // get the alignment column frequencies using this model
      MATRIX_T* tmp_pssm_matrix = build_alignment_pssm_matrix(
        alph,
        alignment_species,
        2,				// only interested in freqs under bkg
        models, 
        tree, 
        gap_support
      );
      // assemble the position-dependent background alignment column freqs.
      set_matrix_row(i, get_matrix_row(0, tmp_pssm_matrix), pos_dep_bkg);
      // chuck the pssm (not his real name)
      free_matrix(tmp_pssm_matrix);
    }

    //
    // Compute and print the score distribution under the background model
    // and under the (position-dependent) motif model.
    //
    int range = 10000;	// 10^4 gives same result as 10^5, but 10^3 differs

    // under background model
    PSSM_T* pssm = build_matrix_pssm(alph, pssm_matrix, alignment_col_freqs, range);

    // under position-dependent background (motif) model
    PSSM_T* pssm_pos_dep = build_matrix_pssm(alph, pssm_matrix, alignment_col_freqs, range);
    get_pv_lookup_pos_dep(
      pssm_pos_dep, 
      pos_dep_bkg, 
      NULL // no priors used
    );

    // print FP and FN distributions
    int num_items = get_pssm_pv_length(pssm_pos_dep);
    for (i=0; i<num_items; i++) {
      double pvf = get_pssm_pv(i, pssm);
      double pvt = get_pssm_pv(i, pssm_pos_dep);
      double fpr = pvf;
      double fnr = 1 - pvt;
      // Skip score indices where the FPR is (nearly) 1 or the FNR is 0.
      if (fpr >= 0.99999 || fnr == 0) continue;
      printf("%s score %d FPR %.3g FNR %.3g\n", motif_id, i, fpr, fnr);
    }

    // free stuff
    // NOTE(review): pssm_matrix, alignment_col_freqs and pos_dep_bkg are
    // not released here; confirm whether the PSSM objects keep references
    // to them before adding frees.
    free_pssm(pssm);
    free_pssm(pssm_pos_dep);
    if (models != NULL) {
      int model_index;
      int num_models = get_motif_length(motif) + 1;
      for (model_index = 0; model_index < num_models; model_index++) {
        free_model(models[model_index]);
      }
      myfree(models);
    }

  } // motif

  arraylst_destroy(destroy_motif, motifs);

  /**********************************************
   * Clean up.
   **********************************************/
  // TLB may have encountered a memory corruption bug here
  // CEG has not been able to reproduce it. valgrind says all is well.
  free_array(bg_freqs);
  free_tree(TRUE, tree);
  free_string_list(selected_motifs);

  return(0);
} // main
Exemple #24
0
/*************************************************************************
 * Calculate the odds score for each motif-sized window at each
 * site in the sequence using the given nucleotide frequencies.
 *
 * This function is a lightweight version based on the one contained in
 * motiph-scoring. Several calculations that are unnecessary for gomo
 * have been removed in order to speed up the process
 *
 * Every window start 0 .. (seq_length - motif_length) is scored, i.e.
 * seq_length - motif_length + 1 windows; a sequence shorter than the
 * motif has no windows and yields 0.
 *
 * Returns, depending on method:
 *   AVG_ODDS   - mean of the per-window odds,
 *   MAX_ODDS   - largest per-window odds (never below 0.0),
 *   SUM_ODDS   - sum of the per-window odds,
 *   TOTAL_HITS - number of windows whose p-value is below threshold.
 * Any other method value returns 0.0.
 *
 * NOTE(review): a window that contains a gap or a non-core letter stops
 * accumulating at that position and keeps the partial product/sum, so it
 * still contributes to the result -- confirm this is intended.
 *************************************************************************/
static double score_sequence(
    SEQ_T *seq,         // sequence to scan (IN)
    MOTIF_T *motif,     // motif already converted to odds values (IN)
    PSSM_T *m_pssm,     // motif pssm (IN)
    MATRIX_T *m_odds,   // motif odds (IN)
    int method,         // method used for scoring (IN)
    double threshold,   // Threshold to use in TOTAL_HITS mode with a PWM
    ARRAY_T *bg_freqs   //background model (not used in this function)
    )
{

  assert(seq != NULL);
  assert(motif != NULL);
  assert((method == TOTAL_HITS && m_pssm) || (method != TOTAL_HITS && m_odds));

  char* raw_seq = get_raw_sequence(seq);
  const int seq_length = get_seq_length(seq);
  const int motif_length = get_motif_length(motif);

  // Get the pv lookup table
  ARRAY_T* pv_lookup = NULL;
  if (NULL != m_pssm) {
    pv_lookup = m_pssm->pv;
    assert(get_array_length(pv_lookup) > 0);
  }

  // Number of window start positions; the last valid start is
  // seq_length - motif_length, hence the + 1.
  int num_windows = seq_length - motif_length + 1;
  if (num_windows < 0) num_windows = 0;
  const int asize = alph_size(get_motif_alph(motif), ALPH_SIZE);
  double* odds =  (double*) mm_malloc(sizeof(double)*num_windows);
  double* scaled_log_odds =  (double*) mm_malloc(sizeof(double)*num_windows);

  // For each site in the sequence
  int seq_index;
  for (seq_index = 0; seq_index < num_windows; seq_index++) {
    double odd = 1.0;
    scaled_log_odds[seq_index] = 0;

    // For each site in the motif window
    int motif_position;
    for (motif_position = 0; motif_position < motif_length; motif_position++) {
      char c = raw_seq[seq_index + motif_position];

      // Check for gaps at this site
      if(c == '-' || c == '.') {
        break;
      }

      // Check for ambiguity codes at this site
      //TODO: This next call is very expensive - it takes up approx. 10% of a
      //      programme's running time. It should be fixed up somehow.
      int aindex = alph_index(get_motif_alph(motif), c);
      // Only indices 0 .. asize-1 are accepted; anything else
      // (including a negative index) ends the window.
      if (aindex < 0 || aindex >= asize) {
        break;
      }
      if (method == TOTAL_HITS) {
        //If we're in this mode, then we're using LOG ODDS.
        scaled_log_odds[seq_index] += get_matrix_cell(motif_position, aindex, m_pssm->matrix);
      } else {
        odd *= get_matrix_cell(motif_position, aindex, m_odds);
      }
    }
    odds[seq_index] = odd;
  }

  // return odds as requested (MAX or AVG scoring)
  double requested_odds = 0.0;
  if (method == AVG_ODDS){
    for (seq_index = 0; seq_index < num_windows; seq_index++) {
      requested_odds += odds[seq_index];
    }
    // No windows means nothing was summed; leave the result at 0
    // instead of dividing by zero.
    if (num_windows > 0) {
      requested_odds /= num_windows;
    }
  } else if (method == MAX_ODDS){
    for (seq_index = 0; seq_index < num_windows; seq_index++) {
      if (odds[seq_index] > requested_odds){
        requested_odds = odds[seq_index];
      }
    }
  } else if (method == SUM_ODDS) {
    for (seq_index = 0; seq_index < num_windows; seq_index++) {
      requested_odds += odds[seq_index];
    }
  } else if (method == TOTAL_HITS) {
    for (seq_index = 0; seq_index < num_windows; seq_index++) {

      // Clamp scores beyond the end of the lookup table to its last entry.
      if (scaled_log_odds[seq_index] >= (double)get_array_length(pv_lookup)) {
        scaled_log_odds[seq_index] = (double)(get_array_length(pv_lookup) - 1);
      } 
      double pvalue = get_array_item((int) scaled_log_odds[seq_index], pv_lookup);

      if (pvalue < threshold) {
        requested_odds++; //Add another hit.
      }

      if (verbosity > HIGHER_VERBOSE) {
        fprintf(stderr, "Window Data: %s\t%s\t%i\t%g\t%g\t%g\n",
            get_seq_name(seq), get_motif_id(motif), seq_index, scaled_log_odds[seq_index], pvalue, threshold);
      }
    }
  }

  myfree(odds);
  myfree(scaled_log_odds);
  return requested_odds;
}
Exemple #25
0
/*************************************************************************
 * Calculate the odds score for each motif-sized window at each
 * site in the sequence using the given nucleotide frequencies.
 *
 * This function is a lightweight version based on the one contained in
 * motiph-scoring. Several calculations that are unnecessary for gomo
 * have been removed in order to speed up the process.
 * Scores sequence with up to two motifs.
 *
 * Windows containing a gap ('-' or '.') or a letter outside the core
 * alphabet get odds 0 and are removed from the count of scored sites.
 *
 * Returns the mean (AVG_ODDS), maximum (MAX_ODDS) or sum (SUM_ODDS) of
 * the window odds; any other method returns 0.0.  When no site could be
 * scored, *isFeasible is set to FALSE and 0.0 is returned.  *isFeasible
 * is not written otherwise.
 *************************************************************************/
double score_sequence(
  SEQ_T*        seq,		// sequence to scan (IN)
  double *logcumback,		// cumulative bkg probability of sequence (IN)
  PSSM_PAIR_T*  pssm_pair,	// pos and neg pssms (IN)
  int method, 			// method used for scoring (IN)
  int last, 			//score only last <n> or
				//score all if <n> is zero (IN)
  BOOLEAN_T* isFeasible		// FLAG indicated if there is at least one position
				// where the motif could be matched against (OUT)
)
{
  assert(pssm_pair != NULL);
  assert(seq != NULL);

  PSSM_T* pos_pssm = pssm_pair->pos_pssm;
  assert(pos_pssm != NULL);
  PSSM_T* neg_pssm = pssm_pair->neg_pssm;
  int n_motifs = neg_pssm ? 2 : 1;

  char* raw_seq = get_raw_sequence(seq);
  int seq_length = get_seq_length(seq);
  int w = get_num_rows(pos_pssm->matrix);
  int n = seq_length - w + 1;	// number of window starts; <= 0 if seq too short

  if (verbosity >= DUMP_VERBOSE) {
    fprintf(stderr, "Debug n_motifs: %d seq_length: %d w: %d n: %d.\n", n_motifs, seq_length, w, n);
  }

  // Get alphabet;
  char* alphabet = get_alphabet(FALSE);
  int alph_size = get_alph_size(ALPH_SIZE);

  // Dependent on the "last" parameter, change the starting point
  int start;
  int N_scored;
  if (last > 0 && last < seq_length) {
    start = seq_length - last;
    N_scored  = n_motifs * (last - w + 1);	// number of sites scored
  } else {
    start = 0;
    N_scored  = n_motifs * n;			// number of sites scored
  }

  // For each motif (positive and reverse complement)
  double max_odds = 0.0;
  double sum_odds = 0.0;
  double requested_odds = 0.0;
  int i;

  if (verbosity >= HIGHER_VERBOSE) {
    fprintf(stderr, "Starting scan at position %d .\n", start);
  }

  for (i=0; i<n_motifs; i++) { 	// pos (and negative) motif
    PSSM_T* pssm = (i==0 ? pos_pssm : neg_pssm);	// choose +/- motif
    // For each site in the sequence
    int seq_index;
    for (seq_index = start; seq_index < n; seq_index++) {	// site 
      double odds = 1.0;
      // For each position in the motif window
      int motif_position;
      for (motif_position = 0; motif_position < w; motif_position++) { // column
        int i_site = seq_index + motif_position;
        char c = raw_seq[i_site];
        // Check for gaps at this site
        if (c == '-' || c == '.') { N_scored--; odds = 0; break; }
        // Check for ambiguity codes at this site
        int alph_index = alphabet_index(c, alphabet);
        if (alph_index >= alph_size || alph_index < 0) { N_scored--; odds = 0; break; }
        // multiple odds by value in appropriate motif cell
        odds *= get_matrix_cell(motif_position, alph_index, pssm->matrix);
      } // column
      //
      // Apply sequence-dependent background model.
      //
      if (logcumback) {
        int i_site = seq_index;
        double log_p = logcumback[i_site+w] - logcumback[i_site];	// log Pr(x | background)
        //printf("log_p:: %g motif_pos %d\n", log_p, motif_position);
        double adjust = exp(w*log(1/4.0) - log_p);	// Pr(x | uniform) / Pr(x | background)
        odds *= adjust;
      }
      // Add odds to growing sum.
      sum_odds += odds;				// sum of odds
      if (odds > max_odds) max_odds = odds;	// max of odds
    } // site
  } // motif

  if (verbosity >= HIGHER_VERBOSE) {
    fprintf(stderr, "Scored %d positions with the sum odds %f and the max odds %f.\n", N_scored, sum_odds, max_odds);
  }

  // has there been anything matched at all?
  // The initial count is negative when the sequence (or the "last"
  // region) is shorter than the motif, so test for <= 0: otherwise the
  // average below would be divided by a negative count and the caller
  // would not be told that the sequence is infeasible.
  if (N_scored <= 0){
      if (verbosity >= NORMAL_VERBOSE) {
	    fprintf(stderr,"Sequence \'%s\' offers no location to match the motif against (sequence length too short?)\n",get_seq_name(seq));
      }
	  *isFeasible = FALSE;
	  return 0.0;
    // return odds as requested (MAX or AVG scoring)
  } else if (method == AVG_ODDS) {
    requested_odds = sum_odds / N_scored;	// mean
  } else if (method == MAX_ODDS) {
    requested_odds = max_odds;			// maximum
  } else if (method == SUM_ODDS) {
	requested_odds = sum_odds ;	// sum
  }

  return(requested_odds);
} // score_sequence
Exemple #26
0
/*
 * Read every sequence from args.sequence_filename and score it with each
 * motif.
 *
 * Fills these file-level variables:
 *   seq_ids     - name of each sequence, in file order
 *   seq_fscores - atof() of each sequence's description field
 *   results     - results[i][j] is the AVG_ODDS score returned by
 *                 ramen_sequence_scan for motif i on sequence j; when
 *                 args.linreg_normalise is TRUE the score is divided by
 *                 the product over motif positions of the largest
 *                 A/C/G/T frequency.
 *
 * Motif i is taken from slots 2*i and 2*i+1 of motifs.motifs.
 * Calls ramen_terminate(1) if the file cannot be opened or memory for
 * the results cannot be allocated.
 *
 * NOTE(review): seq_list is not released here; confirm whether anything
 * else keeps using the sequences before freeing them.
 */
void ramen_scan_sequences() {
		FILE* seq_file = NULL;
		MOTIF_T* motif = NULL;
		SEQ_T* sequence = NULL;
		int i;
		int j;
		SEQ_T** seq_list;
		int num_seqs;

		//Open the file.
		if (open_file(args.sequence_filename, "r", FALSE, "FASTA", "sequences", &seq_file) == 0) {
				fprintf(stderr, "Couldn't open the file %s.\n", args.sequence_filename);
				ramen_terminate(1);
		}

		//Start reading in the sequences
		read_many_fastas(ramen_alph, seq_file, MAX_SEQ_LENGTH, &num_seqs, &seq_list);

		//All sequences are in memory now, so the file is no longer needed.
		if (seq_file != NULL && seq_file != stdin) {
				fclose(seq_file);
		}

		seq_ids = new_string_list();
		seq_fscores = allocate_array(num_seqs);

		//Allocate the required space for results
		results = malloc(sizeof(double*) * motifs.num);
		if (results == NULL && motifs.num > 0) {
				fprintf(stderr, "Out of memory allocating the results table.\n");
				ramen_terminate(1);
		}
		for (i=0;i<motifs.num;i++) {
				results[i] = malloc(sizeof(double)*num_seqs);
				if (results[i] == NULL && num_seqs > 0) {
						fprintf(stderr, "Out of memory allocating the results table.\n");
						ramen_terminate(1);
				}
		}

		for (j=0;j<num_seqs;j++) {

				fprintf(stderr, "\rScanning %i of %i sequences...", j+1, num_seqs);

				//copy the pointer into our current object for clarity
				sequence = seq_list[j];

				//Read the fluorescence data from the description field.
				add_string(get_seq_name(sequence),seq_ids);
				set_array_item(j,atof(get_seq_description(sequence)),seq_fscores);

				//Scan with each motif.
				for (i=0;i<motifs.num;i++) {
						int motifindex = i*2;

						results[i][j] = ramen_sequence_scan(sequence, motif_at(motifs.motifs, motifindex), 
											      motif_at(motifs.motifs, motifindex+1),
											      NULL, NULL, //No need to pass PSSM.
										              AVG_ODDS, 0, TRUE, 0, motifs.bg_freqs);

						if (TRUE == args.linreg_normalise) {
								//Divide by the best score the motif could reach:
								//the product of the largest A/C/G/T frequency
								//at each position.
								static const char bases[] = "ACGT";
								int k;
								int b;
								int motif_len;
								double maxscore = 1;
								motif = motif_at(motifs.motifs,motifindex); 
								motif_len = get_motif_length(motif);
								for (k=0;k<motif_len;k++) {
										double maxprob = 0;
										for (b=0;b<4;b++) {
												double prob = get_matrix_cell(k, alph_index(ramen_alph, bases[b]), get_motif_freqs(motif));
												if (maxprob < prob)
														maxprob = prob;
										}
										maxscore *= maxprob;
								}
								results[i][j] /= maxscore;
						}
				}
		}

}
Exemple #27
0
/*************************************************************************
 * Calculate the odds score for each motif-sized window at each
 * site in the sequence using the given nucleotide frequencies.
 *
 * This function is a lightweight version based on the one contained in
 * motiph-scoring. Several calculations that are unnecessary for gomo
 * have been removed in order to speed up the process.
 * Scores sequence with up to two motifs.
 *
 * A window containing a position encoded as -1 gets odds 0 and is
 * removed from the count of scored sites.
 *
 * Returns the mean (AVG_ODDS), maximum (MAX_ODDS) or sum (SUM_ODDS) of
 * the window odds and dies on any other method.  When no site could be
 * scored, *isFeasible is set to false and 0.0 is returned.  *isFeasible
 * is not written on the successful paths.
 *************************************************************************/
static double score_sequence(
  ALPH_T*       alph,         // alphabet (IN); not used by this function
  SEQ_T*        seq,          // sequence to scan (IN)
  double        *logcumback,  // cumulative bkg probability of sequence (IN)
  PSSM_PAIR_T   *pssm_pair,   // pos and neg pssms (IN)
  SCORING_EN    method,       // method used for scoring (IN)
  int           last,         // score only last <n> or score all if <n> 
                              //                                  is zero (IN)
  BOOLEAN_T* isFeasible       // FLAG indicated if there is at least one position
                              // where the motif could be matched against (OUT)
)
{
  PSSM_T *pos_pssm, *neg_pssm, *pssm;
  int strands, seq_length, w, n, strand, start, N_scored, s_pos, m_pos;
  double max_odds, sum_odds, odds, adjust, log_p;
  int8_t *isequence, *iseq;

  (void) alph; // kept for interface compatibility

  assert(pssm_pair != NULL);
  assert(seq != NULL);

  pos_pssm = pssm_pair->pos_pssm;
  assert(pos_pssm != NULL);
  neg_pssm = pssm_pair->neg_pssm;
  strands = neg_pssm ? 2 : 1;

  isequence = get_isequence(seq);
  seq_length = get_seq_length(seq);
  w = get_num_rows(pos_pssm->matrix);
  n = seq_length - w + 1; // number of window starts; <= 0 if seq too short

  if (verbosity >= DUMP_VERBOSE) {
    fprintf(stderr, "Debug strands: %d seq_length: %d w: %d n: %d.\n", 
        strands, seq_length, w, n);
  }
  // Dependent on the "last" parameter, change the starting point
  if (last > 0 && last < seq_length) {
    start = seq_length - last;
    N_scored  = strands * (last - w + 1); // number of sites scored
  } else {
    start = 0;
    N_scored  = strands * n; // number of sites scored
  }

  // For each motif (positive and reverse complement)
  max_odds = 0.0;
  sum_odds = 0.0;

  if (verbosity >= HIGHER_VERBOSE) {
    fprintf(stderr, "Starting scan at position %d .\n", start);
  }

  for (strand = 0; strand < strands; strand++) { // pos (and negative) motif
    pssm = (strand == 0 ? pos_pssm : neg_pssm); // choose +/- motif
    // For each site in the sequence
    for (s_pos = start; s_pos < n; s_pos++) {
      odds = 1.0;
      // For each position in the motif window
      for (m_pos = 0, iseq = isequence+s_pos; m_pos < w; m_pos++, iseq++) {
        // -1 marks a position that cannot be scored
        if (*iseq == -1) {
          N_scored--; 
          odds = 0; 
          break; 
        }
        // multiple odds by value in appropriate motif cell
        odds *= get_matrix_cell(m_pos, *iseq, pssm->matrix);
      }
      // Apply sequence-dependent background model.
      if (logcumback) {
        log_p = logcumback[s_pos+w] - logcumback[s_pos]; // log Pr(x | background)
        //printf("log_p:: %g motif_pos %d\n", log_p, m_pos);
        adjust = exp(w*log(1/4.0) - log_p); // Pr(x | uniform) / Pr(x | background)
        odds *= adjust;
      }
      // Add odds to growing sum.
      sum_odds += odds; // sum of odds
      if (odds > max_odds) max_odds = odds; // max of odds
    } // site
  } // strand

  if (verbosity >= HIGHER_VERBOSE) {
    fprintf(stderr, "Scored %d positions with the sum odds %f and the "
        "max odds %f.\n", N_scored, sum_odds, max_odds);
  }

  // has there been anything matched at all?
  // The initial count is negative when the sequence (or the "last"
  // region) is shorter than the motif, so test for <= 0: otherwise the
  // average below would be divided by a negative count and the caller
  // would not be told that the sequence is infeasible.
  if (N_scored <= 0) {
    if (verbosity >= NORMAL_VERBOSE) {
      fprintf(stderr,"Sequence \'%s\' offers no location to match "
          "the motif against (sequence length too short?)\n",
          get_seq_name(seq));
    }
    *isFeasible = false;
    return 0.0;
    // return odds as requested (MAX or AVG scoring)
  } else if (method == AVG_ODDS) {
    return sum_odds / N_scored;  // mean
  } else if (method == MAX_ODDS) {
    return max_odds;             // maximum
  } else if (method == SUM_ODDS) {
    return sum_odds;             // sum
  } else {
    die("Unknown scoring method");
    // should not get here... but the compiler will complain if I don't handle this case
    *isFeasible = false;
    return 0.0;
  }
} // score_sequence
Exemple #28
0
/*************************************************************************
 * Build a completely connected HMM.
 *
 * States are laid out in this order: the begin state, spacer_states
 * states for every allowed (from motif, to motif) spacer, one state per
 * motif position, and finally the end state.
 *
 * The model is allocated here and returned through the_hmm.  If fim is
 * set, the spacers are converted with convert_to_fims before the
 * transition matrix is filled in.
 *************************************************************************/
void build_complete_hmm
  (ARRAY_T* background,
   int spacer_states, 
   MOTIF_T *motifs,
   int nmotifs,
   MATRIX_T *transp_freq,
   MATRIX_T *spacer_ave,
   BOOLEAN_T fim,
   MHMM_T **the_hmm)
{
  ALPH_T    alph;
  int motif_states; // Total length of the motifs.
  int num_spacers;  // Total number of spacer states.
  int num_states;   // Total number of states in the model.
  int i_motif;      // Index of the current "from" motif.
  int j_motif;      // Index of the current "to" motif.
  int i_position;   // Index within the current motif or spacer.
  int i_state = 0;  // Index of the current state.

  assert(nmotifs > 0);
  alph = get_motif_alph(motifs);// get the alphabet from the first motif

  // Count the width of the motifs.
  for (motif_states = 0, i_motif = 0; i_motif < nmotifs; i_motif++)
    motif_states += get_motif_length(motif_at(motifs, i_motif));
  // Count the spacer states adjacent to begin and end.
  num_spacers = nmotifs * 2;
  // Add the spacer states between motifs.
  num_spacers += nmotifs * nmotifs;
  // Total states = motifs + spacer_states + begin/end
  num_states = motif_states + (num_spacers * spacer_states) + 2;

  // Allocate the model.
  *the_hmm = allocate_mhmm(alph, num_states);

  // Record that this is a completely connected model.
  (*the_hmm)->type = COMPLETE_HMM;

  // Record the number of motifs in the model.
  (*the_hmm)->num_motifs = nmotifs;

  // Record the number of states in the model.
  (*the_hmm)->num_states = num_states;
  // Same value as num_spacers above: (n+1)^2 - 1 == 2n + n^2.
  (*the_hmm)->num_spacers = ((nmotifs + 1) * (nmotifs + 1)) - 1;
  (*the_hmm)->spacer_states = spacer_states;

  // Put the background distribution into the model.
  copy_array(background, (*the_hmm)->background);

  // Build the begin state.
  build_complete_state(
      START_STATE, 
      i_state,
      alph,
      0, // expected length
      NULL, // Emissions.
      0, // Number of sites.
      NON_MOTIF_INDEX,
      NON_MOTIF_POSITION,
      nmotifs,
      0, // previous motif
      0, // next motif
      transp_freq,
      spacer_states,
      num_spacers,
      motifs,
      &((*the_hmm)->states[i_state]));
  i_state++;

  // Build the spacer states. No transitions from the end state.
  for (i_motif = 0; i_motif <= nmotifs; i_motif++) {
    // No transitions to the start state.
    for (j_motif = 1; j_motif <= nmotifs+1; j_motif++) {
      // No transitions from start to end.
      if ((i_motif == 0) && (j_motif == nmotifs+1))
        continue;
      // Allow multi-state spacers.
      for (i_position = 0; i_position < spacer_states; i_position++, i_state++) {
        build_complete_state(
            SPACER_STATE, 
            i_state, 
            alph,
            get_matrix_cell(i_motif, j_motif, spacer_ave),
            background,
            SPACER_NUMSITES,
            NON_MOTIF_INDEX,
            i_position,
            nmotifs,
            i_motif,
            j_motif,
            transp_freq,
            spacer_states,
            num_spacers,
            motifs,
            &((*the_hmm)->states[i_state]));
      }
    }
  }

  // Build the motif states.
  for (i_motif = 0; i_motif < nmotifs; i_motif++) {
    MOTIF_T *this_motif = motif_at(motifs, i_motif);
    int motif_length = get_motif_length(this_motif);
    STATE_T state;
    for (i_position = 0; i_position < motif_length; i_position++, i_state++) {
      // The first and last positions get their own state types.
      // A one-column motif is classified as a start state.
      if (i_position == 0) {
        state = START_MOTIF_STATE;
      } else if (i_position == (motif_length - 1)) {
        state = END_MOTIF_STATE;
      } else {
        state = MID_MOTIF_STATE;
      }
      // Pass the type chosen above; passing MID_MOTIF_STATE for every
      // position would discard the start/end distinction.
      build_complete_state(
          state, 
          i_state,
          alph,
          0, // Expected spacer length. 
          get_matrix_row(i_position, get_motif_freqs(this_motif)),
          get_motif_nsites(this_motif),
          i_motif,
          i_position, 
          nmotifs,
          0, // Previous motif index.
          0, // Next motif index.
          transp_freq,
          spacer_states,
          num_spacers,
          motifs,
          &((*the_hmm)->states[i_state]));
    }
  }

  // Build the end state.
  build_complete_state(
      END_STATE, 
      i_state,
      alph,
      0, // Expected spacer length.
      NULL, // Emissions
      0, // Number of sites.
      NON_MOTIF_INDEX,
      NON_MOTIF_POSITION,
      nmotifs,
      0, // Previous motif index.
      0, // Next motif index.
      transp_freq,
      spacer_states,
      num_spacers,
      motifs,
      &((*the_hmm)->states[i_state]));
  i_state++;

  // Convert spacers to FIMs if requested.
  if (fim) {
    convert_to_fims(*the_hmm);
  }

  // Fill in the transition matrix.
  build_transition_matrix(*the_hmm);
}
Exemple #29
0
/*************************************************************************
 * Set up one state in a complete HMM, given the appropriate data.
 *
 * Fills in the pre-allocated state `a_state` in this order:
 *   1. state type and motif width (1 for non-motif states),
 *   2. emission distribution (copied from `freqs` when it is non-NULL),
 *      site count, motif index and position,
 *   3. motif ID string and per-position ID character,
 *   4. incoming transitions (ntrans_in / itrans_in / trans_in),
 *   5. outgoing transitions (ntrans_out / itrans_out / trans_out).
 *
 * Index arrays are allocated with mm_malloc and probability arrays with
 * allocate_array; nothing allocated here is released by this function.
 *
 * Transition probabilities between motifs are read from `transp_freq`,
 * where motif i is addressed as row/column i + 1, row 0 is used for the
 * start state and column nmotifs + 1 for the end state.
 *
 * Dies with "Illegal state!" if `state_type` is not one of the handled
 * state types (and with "Invalid state!" from the progress report when
 * verbosity is at least NORMAL_VERBOSE).
 *************************************************************************/
static void build_complete_state
  (STATE_T state_type,    // Type of state (START, SPACER,..)
   int i_state,           // State index.
   ALPH_T alph,           // alphabet
   int expected_length,   // For spacers, the expected length of output.
   ARRAY_T *freqs,        // Emission probability distrib.
   double num_sites,      // Number of sites for this emission.
   int i_motif,           // Index of motif this state is in.
   int i_position,        // Position of this state within motif
   int nmotifs,           // Total number of motifs.
   int prev_motif,        // Index of previous motif.
   int next_motif,        // Index of next motif.
   MATRIX_T *transp_freq, // Transition freq matrix.
   int spacer_states,     // Number of HMM states per spacer.
   int num_spacers,       // Total number of spacers in HMM.
   MOTIF_T *motifs,       // Motifs.
   MHMM_STATE_T *a_state) // State to be filled in (pre-allocated).
{
  MOTIF_T *motif; // The motif (for motif state)
  int j_motif;    // Index of the current motif.

  // Non-motif states (start, end, spacer) carry no motif.
  if (i_motif != NON_MOTIF_INDEX) {
    motif = motif_at(motifs, i_motif);
  } else {
    motif = NULL;
  }

  // Tell the user what's up.
  if (verbosity >= NORMAL_VERBOSE) {
    switch (state_type) {
    case START_STATE :
      fprintf(stderr, "Building HMM: (0) ");
      break;
    case SPACER_STATE :
      fprintf(stderr, "%d ", i_state);
      break;
    case END_MOTIF_STATE :
      fprintf(stderr, "%d | ", i_state);
      break;
    case START_MOTIF_STATE :
    case MID_MOTIF_STATE :
      fprintf(stderr, "%d-", i_state);
      break;
    case END_STATE :
      fprintf(stderr, "(%d)\n", i_state);
      break;
    default:
      die("Invalid state!");
    }
  }

  // Record what type of state this is.
  a_state->type = state_type;

  // Record the motif width if this is a motif.
  if (state_type == START_MOTIF_STATE ||
      state_type == MID_MOTIF_STATE ||
      state_type == END_MOTIF_STATE) {
    a_state->w_motif = get_motif_length(motif);
  } else {
    a_state->w_motif = 1;
  }

  // Set up the emission distribution and a few other tidbits.
  // When freqs is NULL, a_state->emit is left exactly as the caller
  // provided it.
  // NOTE(review): presumably the caller zero-initializes the state so
  // emit is NULL for start/end states -- confirm against the allocator.
  if (freqs != NULL) { // Start and end states have no emissions.
    a_state->emit = allocate_array(alph_size(alph, ALL_SIZE));
    copy_array(freqs, a_state->emit);
  }
  a_state->num_sites = num_sites;
  a_state->i_motif = i_motif;
  a_state->i_position = i_position;

  // Record the motif ID character at this position.
  if ((state_type == START_STATE) ||
      (state_type == END_STATE) ||
      (state_type == SPACER_STATE)) {
    a_state->id_char = NON_MOTIF_ID_CHAR;
  } else { // motif state
    strncpy(a_state->motif_id, get_full_motif_id(motif), MAX_MOTIF_ID_LENGTH + 2);
    // strncpy does not terminate the destination when the source is at
    // least as long as the limit.  The copy above always writes exactly
    // MAX_MOTIF_ID_LENGTH + 2 bytes, so the last of those bytes is a safe
    // place to force the terminator.
    a_state->motif_id[MAX_MOTIF_ID_LENGTH + 1] = '\0';
    a_state->id_char = get_motif_id_char(i_position, motif);
  }
  assert(a_state->id_char != '\0');

  // First set up the transitions into this state.
  switch (state_type) {
  case START_STATE :
    // Nothing leads into the start state.
    a_state->ntrans_in = 0;
    a_state->itrans_in = NULL;
    a_state->trans_in = NULL;
    break;
  case START_MOTIF_STATE :
    // Transitions come from any motif or from the start state.
    a_state->ntrans_in = nmotifs + 1;
    a_state->itrans_in = (int *)mm_malloc(sizeof(int) * (nmotifs + 1));
    a_state->trans_in = allocate_array(nmotifs + 1);
    // j_motif == 0 is the start state; 1..nmotifs are the motifs.
    for (j_motif = 0; j_motif < nmotifs + 1; j_motif++) {
      a_state->itrans_in[j_motif]
        = spacer_index(j_motif, i_motif + 1, TRUE, nmotifs, spacer_states);
      set_array_item(j_motif,
                     get_matrix_cell(j_motif, i_motif + 1, transp_freq),
                     a_state->trans_in);
    }
    break;
  case END_STATE :
    // Transitions come from any motif.
    a_state->ntrans_in = nmotifs;
    a_state->itrans_in = (int *)mm_malloc(sizeof(int) * nmotifs);
    a_state->trans_in = allocate_array(nmotifs);
    for (j_motif = 0; j_motif < nmotifs; j_motif++) {
      a_state->itrans_in[j_motif] = spacer_index(j_motif + 1,
                                                 nmotifs + 1, TRUE,
                                                 nmotifs, spacer_states);
      set_array_item(j_motif,
                     get_matrix_cell(j_motif + 1, nmotifs + 1, transp_freq),
                     a_state->trans_in);
    }
    break;
  case MID_MOTIF_STATE :
  case END_MOTIF_STATE :
    // Inside a motif the only predecessor is the preceding state,
    // entered with probability 1.
    a_state->ntrans_in = 1;
    a_state->itrans_in = (int *)mm_malloc(sizeof(int));
    a_state->itrans_in[0] = i_state - 1;
    a_state->trans_in = allocate_array(1);
    set_array_item(0, 1.0, a_state->trans_in);
    break;
  case SPACER_STATE :
    a_state->ntrans_in = 2;
    a_state->itrans_in = (int *)mm_malloc(sizeof(int) * 2);
    a_state->trans_in = allocate_array(2);
    // For multi-state spacers, incoming transition from previous state.
    if (i_position != 0) {
      a_state->itrans_in[0] = i_state - 1;
    } else {
      a_state->itrans_in[0] = motif_index(prev_motif, TRUE, num_spacers,
                                          spacer_states, motifs, nmotifs);
    }
    // The other transition is a self-transition.
    a_state->itrans_in[1] = i_state;
    // NOTE(review): the incoming probabilities use the integer quotient
    // expected_length / spacer_states, whereas the outgoing ones below
    // use expected_length unscaled -- confirm which is intended.
    set_array_item(0, 1.0 - self_trans(expected_length / spacer_states),
                   a_state->trans_in);
    set_array_item(1, self_trans(expected_length / spacer_states),
                   a_state->trans_in);
    break;
  default:
    die("Illegal state!");
  }

  // Then set up the transitions out of this state.
  switch (state_type) {
  case START_STATE :
    // Transitions go to each motif.
    a_state->ntrans_out = nmotifs;
    a_state->itrans_out = (int *)mm_malloc(sizeof(int) * nmotifs);
    a_state->trans_out = allocate_array(nmotifs);
    for (j_motif = 0; j_motif < nmotifs; j_motif++) {
      a_state->itrans_out[j_motif] = spacer_index(0, j_motif + 1, FALSE,
                                                  nmotifs, spacer_states);
      set_array_item(j_motif,
                     get_matrix_cell(0, j_motif + 1, transp_freq),
                     a_state->trans_out);
    }
    break;
  case END_MOTIF_STATE :
    // Can go to any other motif or to the end state.
    a_state->ntrans_out = nmotifs + 1;
    a_state->itrans_out = (int *)mm_malloc(sizeof(int) * (nmotifs + 1));
    a_state->trans_out = allocate_array(nmotifs + 1);
    // j_motif == nmotifs addresses the end state (column nmotifs + 1).
    for (j_motif = 0; j_motif < nmotifs + 1; j_motif++) {
      a_state->itrans_out[j_motif] = spacer_index(i_motif + 1,
                                                  j_motif + 1, FALSE,
                                                  nmotifs, spacer_states);
      set_array_item(j_motif,
                     get_matrix_cell(i_motif + 1, j_motif + 1, transp_freq),
                     a_state->trans_out);
    }
    break;
  case START_MOTIF_STATE :
  case MID_MOTIF_STATE :
    // Inside a motif the only successor is the following state,
    // taken with probability 1.
    a_state->ntrans_out = 1;
    a_state->itrans_out = (int *)mm_malloc(sizeof(int));
    a_state->itrans_out[0] = i_state + 1;
    a_state->trans_out = allocate_array(1);
    set_array_item(0, 1.0, a_state->trans_out);
    break;
  case SPACER_STATE :
    a_state->ntrans_out = 2;
    a_state->itrans_out = (int *)mm_malloc(sizeof(int) * 2);
    a_state->trans_out = allocate_array(2);
    // The first transition is a self-transition.
    a_state->itrans_out[0] = i_state;
    // For multi-state spacers, outgoing transition to next state.
    if (i_position < spacer_states - 1) {
      a_state->itrans_out[1] = i_state + 1;
    } else {
      a_state->itrans_out[1] = motif_index(next_motif, FALSE, num_spacers,
                                           spacer_states, motifs, nmotifs);
    }
    set_array_item(0, self_trans(expected_length), a_state->trans_out);
    set_array_item(1, 1.0 - self_trans(expected_length), a_state->trans_out);
    break;
  case END_STATE :
    // Nothing leaves the end state.
    a_state->ntrans_out = 0;
    a_state->itrans_out = NULL;
    a_state->trans_out = NULL;
    break;
  default:
    die("Illegal state!");
  }
}
Exemple #30
0
/*************************************************************************
 * Driver for the substitution-matrix routines.
 *
 * Usage: subst_matrix [options] <score file>
 *   --dna          use the DNA alphabet instead of the protein alphabet
 *   --dist <n>     distance handed to the matrix routines
 *
 * When the distance is greater than 1, the matrix returned by
 * gen_pam_matrix is printed together with the sum of its entries.  The
 * target matrix for the score file is then computed with
 * get_subst_target_matrix.
 *
 * Returns 0 on success; prints the usage message and returns 1 when the
 * single required argument is missing.
 *************************************************************************/
int main(int argc, char **argv) {
  int i, j, alength;
  int dist = 0;
  ALPH_T alph = PROTEIN_ALPH;
  char *score_filename = NULL;
  MATRIX_T *matrix;
  ARRAY_T *probs;
  double *freqs;
  double sum;

  // The usage message.
  static const char usage[] =
    "USAGE: subst_matrix [options] <score file>\n"
    "\n"
    "   Options:\n"
    "     --dna\n"
    "     --dist <float>\n"
    "\n";

  // Parse the command line.
  while (1) {
    int c;
    int option_index = 0;
    const char* option_name;

    // Define command line options.  getopt_long requires the table to
    // end with an all-zero entry; without it the option scan runs past
    // the end of the array.
    static struct option long_options[] = {
      {"dna", 0, NULL, 0},
      {"dist", 1, NULL, 0},
      {NULL, 0, NULL, 0}
    };

    // Read the next option, and break if we're done.
    c = getopt_long_only(argc, argv, "+", long_options, &option_index);
    if (c == -1) {
      break;
    } else if (c != 0) {
      die("Invalid return from getopt (%d)\n", c);
    }

    // Get the option name (we only use long options).
    option_name = long_options[option_index].name;
    if (strcmp(option_name, "dna") == 0) {
      alph = DNA_ALPH;
    } else if (strcmp(option_name, "dist") == 0) {
      // The distance is stored as an int; atoi keeps accepting the same
      // inputs as before (a fractional value is truncated).
      dist = atoi(optarg);
    } else {
      die("Invalid option (%s).\n", option_name);
    }
  }

  // Read the single required argument.
  if (optind + 1 != argc) {
    // Never pass a non-literal string as the format argument.
    fputs(usage, stderr);
    return 1;
  }
  score_filename = argv[optind];

  alength = alph_size(alph, ALPH_SIZE);

  /* background frequencies */
  probs = allocate_array(alength);
  freqs = alph == DNA_ALPH ? pam_dna_freq : pam_prot_freq;
  fill_array(freqs, probs);      /* copy freqs into ARRAY_T */

  if (dist > 1) {
    printf("From gen_pam_matrix:\n");
    matrix = gen_pam_matrix(alph, dist, FALSE);

    // Column header: one alphabet character per column.
    printf("%6c ", ' ');
    for (i = 0; i < alength; i++) {
      printf("%6c ", alph_char(alph, i));
    }
    printf("\n");

    // One row per alphabet character, accumulating the grand total.
    sum = 0;
    for (i = 0; i < alength; i++) {
      printf("%6c ", alph_char(alph, i));
      for (j = 0; j < alength; j++) {
        double x = get_matrix_cell(i, j, matrix);
        sum += x;
        printf("%6.4f ", x);
      }
      printf("\n");
    }
    printf("sum of entries = %f\n", sum);
  }

  printf("From get_subst_target_matrix:\n");
  matrix = get_subst_target_matrix(score_filename, alph, dist, probs);
  (void)matrix; // Result is computed but not inspected further here.

  return 0;
} /* main */