/* Subtree version of score test */ void ff_score_tests_sub(TreeModel *mod, MSA *msa, GFF_Set *gff, mode_type mode, double *feat_pvals, double *feat_null_scales, double *feat_derivs, double *feat_sub_derivs, double *feat_teststats, FILE *logf) { int i; FeatFitData *d, *d2; Vector *grad = vec_new(2); Matrix *fim = mat_new(2, 2); double lnl, teststat; FimGrid *grid; List *inside=NULL, *outside=NULL; TreeModel *modcpy = tm_create_copy(mod); /* need separate copy of tree model with different internal scaling data for supertree/subtree case */ /* init FeatFitData -- one for null model, one for alt */ d = ff_init_fit_data(modcpy, msa, ALL, NNEUT, FALSE); d2 = ff_init_fit_data(mod, msa, SUBTREE, NNEUT, FALSE); /* mod has the subtree info, modcpy does not */ /* precompute Fisher information matrices for a grid of scale values */ grid = col_fim_grid_sub(mod); /* prepare lists of leaves inside and outside root, for use in checking for informative substitutions */ if (mod->subtree_root != NULL) { inside = lst_new_ptr(mod->tree->nnodes); outside = lst_new_ptr(mod->tree->nnodes); tr_partition_leaves(mod->tree, mod->subtree_root, inside, outside); } /* iterate through features */ for (i = 0; i < lst_size(gff->features); i++) { checkInterrupt(); d->feat = lst_get_ptr(gff->features, i); /* first check for informative substitution data in feature; if none, don't waste time computing likelihoods */ if (!ff_has_data_sub(mod, msa, d->feat, inside, outside)) { teststat = 0; vec_zero(grad); } else { vec_set(d->cdata->params, 0, d->cdata->init_scale); opt_newton_1d(ff_likelihood_wrapper_1d, &d->cdata->params->data[0], d, &lnl, SIGFIGS, d->cdata->lb->data[0], d->cdata->ub->data[0], logf, NULL, NULL); /* turns out to be faster to use numerical rather than exact derivatives (judging by col case) */ d2->feat = d->feat; d2->cdata->mod->scale = d->cdata->params->data[0]; d2->cdata->mod->scale_sub = 1; tm_set_subst_matrices(d2->cdata->mod); ff_scale_derivs_subtree(d2, grad, NULL, d2->cdata->fels_scratch); fim = col_get_fim_sub(grid, d2->cdata->mod->scale); mat_scale(fim, d->feat->end - d->feat->start + 1); /* scale column-by-column FIM by length of feature (expected values are additive) */ teststat = grad->data[1]*grad->data[1] / (fim->data[1][1] - fim->data[0][1]*fim->data[1][0]/fim->data[0][0]); if (teststat < 0) { fprintf(stderr, "WARNING: teststat < 0 (%f)\n", teststat); teststat = 0; } if ((mode == ACC && grad->data[1] < 0) || (mode == CON && grad->data[1] > 0)) teststat = 0; /* derivative points toward boundary; truncate at 0 */ mat_free(fim); } if (feat_pvals != NULL) { if (mode == NNEUT || mode == CONACC) feat_pvals[i] = chisq_cdf(teststat, 1, FALSE); else feat_pvals[i] = half_chisq_cdf(teststat, 1, FALSE); /* assumes 50:50 mix of chisq and point mass at zero */ if (feat_pvals[i] < 1e-20) feat_pvals[i] = 1e-20; /* approx limit of eval of tail prob; pvals of 0 cause problems */ if (mode == CONACC && grad->data[1] > 0) feat_pvals[i] *= -1; /* mark as acceleration */ } /* store scales and log likelihood ratios if necessary */ if (feat_null_scales != NULL) feat_null_scales[i] = d->cdata->params->data[0]; if (feat_derivs != NULL) feat_derivs[i] = grad->data[0]; if (feat_sub_derivs != NULL) feat_sub_derivs[i] = grad->data[1]; if (feat_teststats != NULL) feat_teststats[i] = teststat; } ff_free_fit_data(d); ff_free_fit_data(d2); vec_free(grad); modcpy->estimate_branchlens = TM_BRANCHLENS_ALL; /* have to revert for tm_free to work correctly */ tm_free(modcpy); col_free_fim_grid(grid); if (inside != NULL) lst_free(inside); if (outside != NULL) lst_free(outside); }
Matrix *mm_build_helper(MS *inputMS, int norder, int pseudoCount, int considerReverse) { int alph_size, i, j, ignore, tup_idx, l, alph_idx, seqLen; double sum = 0, val; Vector *freqs = vec_new(int_pow(4, norder+1)); Matrix *mm; char c; if(inputMS == NULL) die("ERROR: GC% group passed to mm_build_helper was null"); if (inputMS->nseqs <= 0) //Must have at least one sequence die("ERROR: At least one sequence must be present to build a markov model"); if (norder < 0)//Order of Markov matrix must be positive die("Order of markov model to create must be zero or greater"); vec_zero(freqs); alph_size = (int)strlen(inputMS->alphabet); //Apply Pseudo-Counts vec_set_all(freqs, pseudoCount); //For each sequence for (j = 0; j < inputMS->nseqs; j++) { seqLen = (int)strlen(inputMS->seqs[j]); //For each site for (i = 0; i < seqLen; i++) { checkInterruptN(i, 10000); ignore = 0; tup_idx = 0; //For each base in the tuple for (l = 0; !ignore && l <= norder; l++) { c = inputMS->seqs[j][i + l]; if ((alph_idx = inputMS->inv_alphabet[(int)c]) == -1) //If we get an unknown base ignore = 1; else tup_idx += alph_idx * int_pow(alph_size, (norder - l)); } if (!ignore) vec_set(freqs, tup_idx, vec_get(freqs, tup_idx) + 1); } } //Take into account reverse complement frequencies if(considerReverse == 1) { //For each sequence for (j = 0; j < inputMS->nseqs; j++) { //For each site for (i = (int)strlen(inputMS->seqs[j]); i >= 0 ; i--) { checkInterruptN(i, 10000); ignore = 0; tup_idx = 0; //For each base in the tuple for (l = 0; !ignore && l <= norder; l++) { c = inputMS->seqs[j][i - l]; switch(c) { case 'A': c = 'T'; break; case 'C': c = 'G'; break; case 'G': c = 'C'; break; case 'T': c = 'A'; break; } if ((alph_idx = inputMS->inv_alphabet[(int)c]) == -1) //If we get an unknown base ignore = 1; else tup_idx += alph_idx * int_pow(alph_size, (norder - l)); } if (!ignore) vec_set(freqs, tup_idx, vec_get(freqs, tup_idx) + 1); } } } mm = mat_new(int_pow(alph_size, norder), alph_size); //Transform count vector into Markov Matrix of order norder for (i = 0; i < freqs->size; i = i + alph_size) { sum = 0; for (j = 0; j < alph_size; j++) //Calculate sum i.e. for AA = count(AAA) + count(AAC) + count(AAG) + count(AAT) sum += vec_get(freqs, i + j); for (j = 0; j < alph_size; j++) { //For each base in alphabet if (sum == 0) //Handle unknown <prefix, base> tuples val = 1.0/alph_size; else val = vec_get(freqs, i + j) / sum; //i.e. for AAT, AA identified by i, T defined by j; #AAT/#AA if((val < 0) || (val > 1)) //Value should be a probability between 0 and 1 die("ERROR: Generating Markov Models, generated probability must be between 0 and 1"); mat_set(mm, i / alph_size, j, val); } } return mm; }
mat mat_copy(void *s, _dim) { mat x = mat_new(n); for_ij x[i][j] = ((double (*)[n])s)[i][j]; return x; }
List *pwm_read(const char *filename) { List *result; Matrix *pwm = NULL; int i, currBase, nBases = 0; FILE * F; // char *motifName; String *line = str_new(STR_MED_LEN); List *l = lst_new_ptr(3); List *probabilitiesStr = lst_new_ptr(4); List *probabilitiesDbl; Regex *pssm_re = NULL; Regex *motif_name_re = NULL; int alphabetLength; result = lst_new_ptr(1); //letter-probability matrix: alength= 4 w= 8 nsites= 2 E= 1.5e+004 pssm_re = str_re_new("^letter-probability matrix: alength= ([0-9]+) w= ([0-9]+)"); motif_name_re = str_re_new("^MOTIF[[:space:]]+(.+?)[[:space:]].*"); //open PWM file F = phast_fopen(filename, "r"); currBase = 0; nBases = -1; //For each line in the MEME file while ((str_readline(line, F)) != EOF) { //If line matches Motif name if (str_re_match(line, motif_name_re, l, 1) > 0) { // motifName = copy_charstr(((String*)lst_get_ptr(l, 1))->chars); //printf("motifName=%s\n", motifName); } //If line matches beginning of a probability matrix else if (str_re_match(line, pssm_re, l, 2) > 0) { //Extract the alphabet size & number of bases in matrix if (str_as_int((String*)lst_get_ptr(l, 1), &alphabetLength) != 0) die("ERROR: Unable to parse 'alength=' from MEME file, expected integer, read %s", ((String*)lst_get_ptr(l, 1))->chars); if (str_as_int((String*)lst_get_ptr(l, 2), &nBases) != 0) die("ERROR: Unable to parse 'w=' from MEME file, expected integer, read %s ", ((String*)lst_get_ptr(l, 2))->chars); currBase = 0; if (nBases <= 0) //We must have at least one base in the PWM die("ERROR: No Position Weight Matrices were detected in the provided PWM file"); if (alphabetLength <= 0) //We must have a positive alphabet length die("ERROR: Alphabet lengh specified in PWM file must be greater than zero"); pwm = mat_new(nBases, alphabetLength); mat_set_all(pwm, -1); continue; //If this row contains matrix data } else if (currBase < nBases) { //Parse row of probabilities str_double_trim(line); str_split(line, NULL, probabilitiesStr); probabilitiesDbl = str_list_as_dbl(probabilitiesStr); for (i = 0; i < lst_size(probabilitiesDbl); i++) mat_set(pwm, currBase, i, log(lst_get_dbl(probabilitiesDbl, i))); currBase++; } else if ((currBase == nBases) && (pwm != NULL)) { //Push full matrix lst_push_ptr(result, pwm); pwm = NULL; } } if (currBase == nBases && pwm != NULL) lst_push_ptr(result, pwm); else if (pwm != NULL) die("Premature end of PWM file\n"); str_re_free(motif_name_re); str_re_free(pssm_re); phast_fclose(F); return result; }