Ejemplo n.º 1
0
/* Subtree version of score test */
void ff_score_tests_sub(TreeModel *mod, MSA *msa, GFF_Set *gff, mode_type mode,
                        double *feat_pvals, double *feat_null_scales,
                        double *feat_derivs, double *feat_sub_derivs,
                        double *feat_teststats, FILE *logf) {
    int i;
    FeatFitData *d, *d2;
    Vector *grad = vec_new(2);
    Matrix *fim = mat_new(2, 2);
    double lnl, teststat;
    FimGrid *grid;
    List *inside=NULL, *outside=NULL;
    TreeModel *modcpy = tm_create_copy(mod); /* need separate copy of tree model
                                              with different internal scaling
                                              data for supertree/subtree case */

    /* init FeatFitData -- one for null model, one for alt */
    d = ff_init_fit_data(modcpy, msa, ALL, NNEUT, FALSE);
    d2 = ff_init_fit_data(mod, msa, SUBTREE, NNEUT, FALSE);
    /* mod has the subtree info, modcpy
       does not */

    /* precompute Fisher information matrices for a grid of scale values */
    grid = col_fim_grid_sub(mod);

    /* prepare lists of leaves inside and outside root, for use in
       checking for informative substitutions */
    if (mod->subtree_root != NULL) {
        inside = lst_new_ptr(mod->tree->nnodes);
        outside = lst_new_ptr(mod->tree->nnodes);
        tr_partition_leaves(mod->tree, mod->subtree_root, inside, outside);
    }

    /* iterate through features  */
    for (i = 0; i < lst_size(gff->features); i++) {
        checkInterrupt();
        d->feat = lst_get_ptr(gff->features, i);

        /* first check for informative substitution data in feature; if none,
           don't waste time computing likelihoods */
        if (!ff_has_data_sub(mod, msa, d->feat, inside, outside)) {
            teststat = 0;
            vec_zero(grad);
        }

        else {
            vec_set(d->cdata->params, 0, d->cdata->init_scale);
            opt_newton_1d(ff_likelihood_wrapper_1d, &d->cdata->params->data[0], d,
                          &lnl, SIGFIGS, d->cdata->lb->data[0], d->cdata->ub->data[0],
                          logf, NULL, NULL);
            /* turns out to be faster to use numerical rather than exact
               derivatives (judging by col case) */

            d2->feat = d->feat;
            d2->cdata->mod->scale = d->cdata->params->data[0];
            d2->cdata->mod->scale_sub = 1;
            tm_set_subst_matrices(d2->cdata->mod);
            ff_scale_derivs_subtree(d2, grad, NULL, d2->cdata->fels_scratch);

            fim = col_get_fim_sub(grid, d2->cdata->mod->scale);
            mat_scale(fim, d->feat->end - d->feat->start + 1);
            /* scale column-by-column FIM by length of feature (expected
               values are additive) */

            teststat = grad->data[1]*grad->data[1] /
                       (fim->data[1][1] - fim->data[0][1]*fim->data[1][0]/fim->data[0][0]);

            if (teststat < 0) {
                fprintf(stderr, "WARNING: teststat < 0 (%f)\n", teststat);
                teststat = 0;
            }

            if ((mode == ACC && grad->data[1] < 0) ||
                    (mode == CON && grad->data[1] > 0))
                teststat = 0;             /* derivative points toward boundary;
                                     truncate at 0 */

            mat_free(fim);
        }

        if (feat_pvals != NULL) {
            if (mode == NNEUT || mode == CONACC)
                feat_pvals[i] = chisq_cdf(teststat, 1, FALSE);
            else
                feat_pvals[i] = half_chisq_cdf(teststat, 1, FALSE);
            /* assumes 50:50 mix of chisq and point mass at zero */

            if (feat_pvals[i] < 1e-20)
                feat_pvals[i] = 1e-20;
            /* approx limit of eval of tail prob; pvals of 0 cause problems */

            if (mode == CONACC && grad->data[1] > 0)
                feat_pvals[i] *= -1; /* mark as acceleration */
        }

        /* store scales and log likelihood ratios if necessary */
        if (feat_null_scales != NULL) feat_null_scales[i] = d->cdata->params->data[0];
        if (feat_derivs != NULL) feat_derivs[i] = grad->data[0];
        if (feat_sub_derivs != NULL) feat_sub_derivs[i] = grad->data[1];
        if (feat_teststats != NULL) feat_teststats[i] = teststat;
    }

    ff_free_fit_data(d);
    ff_free_fit_data(d2);
    vec_free(grad);
    modcpy->estimate_branchlens = TM_BRANCHLENS_ALL;
    /* have to revert for tm_free to work
       correctly */
    tm_free(modcpy);
    col_free_fim_grid(grid);
    if (inside != NULL) lst_free(inside);
    if (outside != NULL) lst_free(outside);
}
Matrix *mm_build_helper(MS *inputMS, int norder, int pseudoCount, int considerReverse) {
  int alph_size, i, j, ignore, tup_idx, l, alph_idx, seqLen;
  double sum = 0, val;
  Vector *freqs = vec_new(int_pow(4, norder+1));
  Matrix *mm; 
  char c;
	
  if(inputMS == NULL)
    die("ERROR: GC% group passed to mm_build_helper was null");
  if (inputMS->nseqs <= 0) //Must have at least one sequence
    die("ERROR: At least one sequence must be present to build a markov model");
  if (norder < 0)//Order of Markov matrix must be positive
    die("Order of markov model to create must be zero or greater");
  
  vec_zero(freqs);
  
  alph_size = (int)strlen(inputMS->alphabet);
  
  //Apply Pseudo-Counts
  vec_set_all(freqs, pseudoCount);
  
  //For each sequence
  for (j = 0; j < inputMS->nseqs; j++) { 
    seqLen = (int)strlen(inputMS->seqs[j]);
    //For each site
    for (i = 0; i < seqLen; i++) { 
      checkInterruptN(i, 10000);
      ignore = 0;
      tup_idx = 0;
      //For each base in the tuple
      for (l = 0; !ignore && l <= norder; l++) {
        
        c = inputMS->seqs[j][i + l];
        if ((alph_idx = inputMS->inv_alphabet[(int)c]) == -1) //If we get an unknown base
          ignore = 1;
        else
          tup_idx += alph_idx * int_pow(alph_size, (norder - l));
      }
      
      if (!ignore)
        vec_set(freqs, tup_idx,
                vec_get(freqs, tup_idx) + 1);
    }
  }
  
  //Take into account reverse complement frequencies
  if(considerReverse == 1)
	{
      //For each sequence
      for (j = 0; j < inputMS->nseqs; j++) { 
        
        //For each site
        for (i = (int)strlen(inputMS->seqs[j]); i >= 0 ; i--) { 
          checkInterruptN(i, 10000);
          ignore = 0;
          tup_idx = 0;
          //For each base in the tuple
          for (l = 0; !ignore && l <= norder; l++) {
            c = inputMS->seqs[j][i - l];
            switch(c)
              {
              case 'A': 
                c = 'T';
                break;
              case 'C':
                c = 'G';
                break;
              case 'G':
                c = 'C';
                break;
              case 'T':
                c = 'A';
                break;
              }
            if ((alph_idx = inputMS->inv_alphabet[(int)c]) == -1) //If we get an unknown base
              ignore = 1;
            else
              tup_idx += alph_idx * int_pow(alph_size, (norder - l));
          }
          
          if (!ignore)
            vec_set(freqs, tup_idx,
                    vec_get(freqs, tup_idx) + 1);
        }
      }
    }
  mm = mat_new(int_pow(alph_size, norder), alph_size);
  //Transform count vector into Markov Matrix of order norder
  for (i = 0; i < freqs->size; i = i + alph_size) {
    sum = 0;
    for (j = 0; j < alph_size; j++) //Calculate sum i.e. for AA = count(AAA) + count(AAC) + count(AAG) + count(AAT)
      sum += vec_get(freqs, i + j);
    for (j = 0; j < alph_size; j++) { //For each base in alphabet
      if (sum == 0) //Handle unknown <prefix, base> tuples
        val = 1.0/alph_size;
      else
        val = vec_get(freqs, i + j) / sum; //i.e. for AAT, AA identified by i, T defined by j; #AAT/#AA
      if((val < 0) || (val > 1)) //Value should be a probability between 0 and 1
        die("ERROR: Generating Markov Models, generated probability must be between 0 and 1");
      mat_set(mm, i / alph_size, j, val);
    }
  }
  
  return mm;
}
Ejemplo n.º 3
0
mat mat_copy(void *s, _dim)
{
	mat x = mat_new(n);
	for_ij x[i][j] = ((double (*)[n])s)[i][j];
	return x;
}
List *pwm_read(const char *filename) {
  List *result;
  Matrix *pwm = NULL;
  int i, currBase, nBases = 0;
  FILE * F;
  //  char *motifName;
  String *line = str_new(STR_MED_LEN);
  List *l = lst_new_ptr(3);
  List *probabilitiesStr = lst_new_ptr(4);
  List *probabilitiesDbl;
  Regex *pssm_re = NULL;
  Regex *motif_name_re = NULL;
  int alphabetLength;

  result = lst_new_ptr(1);
  //letter-probability matrix: alength= 4 w= 8 nsites= 2 E= 1.5e+004

  pssm_re = str_re_new("^letter-probability matrix: alength= ([0-9]+) w= ([0-9]+)");
  motif_name_re = str_re_new("^MOTIF[[:space:]]+(.+?)[[:space:]].*");
  //open PWM file
  F = phast_fopen(filename, "r");
  currBase = 0;
  nBases = -1;
  //For each line in the MEME file
  while ((str_readline(line, F)) != EOF) {
    //If line matches Motif name
    if (str_re_match(line, motif_name_re, l, 1) > 0) {
      //      motifName = copy_charstr(((String*)lst_get_ptr(l, 1))->chars);
      //printf("motifName=%s\n", motifName);
    }
    //If line matches beginning of a probability matrix
    else if (str_re_match(line, pssm_re, l, 2) > 0) {
      //Extract the alphabet size & number of bases in matrix

      if (str_as_int((String*)lst_get_ptr(l, 1), &alphabetLength) != 0)
        die("ERROR: Unable to parse 'alength=' from MEME file, expected integer, read %s", ((String*)lst_get_ptr(l, 1))->chars);
      if (str_as_int((String*)lst_get_ptr(l, 2), &nBases) != 0)
        die("ERROR: Unable to parse 'w=' from MEME file, expected integer, read %s ", ((String*)lst_get_ptr(l, 2))->chars);
      currBase = 0;
      if (nBases <= 0) //We must have at least one base in the PWM
        die("ERROR: No Position Weight Matrices were detected in the provided PWM file");
      if (alphabetLength <= 0) //We must have a positive alphabet length
        die("ERROR: Alphabet lengh specified in PWM file must be greater than zero");
      pwm = mat_new(nBases, alphabetLength);
      mat_set_all(pwm, -1);
      continue;
      //If this row contains matrix data
    } else if (currBase < nBases) {
      //Parse row of probabilities
      str_double_trim(line);
      str_split(line, NULL, probabilitiesStr);
      probabilitiesDbl = str_list_as_dbl(probabilitiesStr);
      for (i = 0; i < lst_size(probabilitiesDbl); i++)
        mat_set(pwm, currBase, i, log(lst_get_dbl(probabilitiesDbl, i)));
      currBase++;
    } else if ((currBase == nBases) && (pwm != NULL)) {
      //Push full matrix
      lst_push_ptr(result, pwm);
      pwm = NULL;
    }
  }
  if (currBase == nBases && pwm != NULL) 
    lst_push_ptr(result, pwm);
  else if (pwm != NULL) 
    die("Premature end of PWM file\n");
  str_re_free(motif_name_re);
  str_re_free(pssm_re);
  phast_fclose(F);
  return result;
}