//gets MLE for time interval start_t through end_t inclusive- one parameter for // whole combined interval double mle_one_popsize(int start_t, int end_t, double init_popsize, void *data0) { double popsize, log_popsize = log(init_popsize); double likelihood; int sigfigs=4; struct popsize_data *data = (struct popsize_data*)data0; static double min_popsize = log(100); static double max_popsize = log(1e7); data->min_t = start_t; data->max_t = end_t; opt_newton_1d(one_popsize_neg_likelihood, &log_popsize, data0, &likelihood, sigfigs, min_popsize, max_popsize, NULL, NULL, NULL); popsize = exp(log_popsize); double dlike2=0.0; for (int t=start_t; t <= end_t; t++) { double tmp; one_popsize_like_and_dlike(t, log_popsize, data, NULL, NULL, &tmp); dlike2 += tmp; } double sd = sqrt(-1.0/dlike2); double popsize_min = exp(log_popsize - 2*sd); double popsize_max = exp(log_popsize + 2*sd); for (int t=start_t; t <= end_t; t++) { printLog(LOG_LOW, "mle_popsize %i\t%f\t%f\t%.1f\t%.1f\t%f\t%.1f\t%.1f\n", t, popsize, likelihood, data->coal_totals[t], data->nocoal_totals[t], sqrt(-1.0/dlike2), popsize_min, popsize_max); } return popsize; }
/* Perform a GERP-like computation for each feature. Computes expected number of subst. under neutrality (feat_nneut), expected number after rescaling by ML (feat_nobs), expected number of rejected substitutions (feat_nrejected), and number of species with data (feat_nspecies). If any arrays are NULL, values will not be retained. Gaps and missing data are handled by working with induced subtree. */ void ff_gerp(TreeModel *mod, MSA *msa, GFF_Set *gff, mode_type mode, double *feat_nneut, double *feat_nobs, double *feat_nrejected, double *feat_nspec, FILE *logf) { int i, j, nspec = 0; double nneut, scale, lnl; int *has_data = smalloc(mod->tree->nnodes * sizeof(int)); FeatFitData *d; /* init FeatFitData */ d = ff_init_fit_data(mod, msa, ALL, NNEUT, FALSE); /* iterate through features */ for (i = 0; i < lst_size(gff->features); i++) { GFF_Feature *f = lst_get_ptr(gff->features, i); checkInterrupt(); ff_find_missing_branches(mod, msa, f, has_data, &nspec); if (nspec < 3) nneut = scale = 0; else { vec_set(d->cdata->params, 0, d->cdata->init_scale); d->feat = f; opt_newton_1d(ff_likelihood_wrapper_1d, &d->cdata->params->data[0], d, &lnl, SIGFIGS, d->cdata->lb->data[0], d->cdata->ub->data[0], logf, NULL, NULL); /* turns out to be faster to use numerical rather than exact derivatives (judging by col case) */ scale = d->cdata->params->data[0]; for (j = 1, nneut = 0; j < mod->tree->nnodes; j++) /* node 0 is root */ if (has_data[j]) nneut += ((TreeNode*)lst_get_ptr(mod->tree->nodes, j))->dparent; } if (feat_nspec != NULL) feat_nspec[i] = (double)nspec; if (feat_nneut != NULL) feat_nneut[i] = nneut; if (feat_nobs != NULL) feat_nobs[i] = scale * nneut; if (feat_nrejected != NULL) { feat_nrejected[i] = nneut * (1 - scale); if (mode == ACC) feat_nrejected[i] *= -1; else if (mode == NNEUT) feat_nrejected[i] = fabs(feat_nrejected[i]); } } ff_free_fit_data(d); sfree(has_data); }
/* Subtree version of LRT */ void ff_lrts_sub(TreeModel *mod, MSA *msa, GFF_Set *gff, mode_type mode, double *feat_pvals, double *feat_null_scales, double *feat_scales, double *feat_sub_scales, double *feat_llrs, FILE *logf) { int i; FeatFitData *d, *d2; double null_lnl, alt_lnl, delta_lnl; TreeModel *modcpy; List *inside=NULL, *outside=NULL; modcpy = tm_create_copy(mod); /* need separate copy of tree model with different internal scaling data for supertree/subtree case */ /* init ColFitData -- one for null model, one for alt */ modcpy->estimate_branchlens = TM_BRANCHLENS_ALL; modcpy->subtree_root = NULL; d = ff_init_fit_data(modcpy, msa, ALL, NNEUT, FALSE); d2 = ff_init_fit_data(mod, msa, SUBTREE, mode, FALSE); /* mod has the subtree info, modcpy does not */ /* prepare lists of leaves inside and outside root, for use in checking for informative substitutions */ if (mod->subtree_root != NULL) { inside = lst_new_ptr(mod->tree->nnodes); outside = lst_new_ptr(mod->tree->nnodes); tr_partition_leaves(mod->tree, mod->subtree_root, inside, outside); } /* iterate through features */ for (i = 0; i < lst_size(gff->features); i++) { GFF_Feature *f = lst_get_ptr(gff->features, i); checkInterrupt(); /* first check for informative substitution data in feature; if none, don't waste time computing likelihoods */ if (!ff_has_data_sub(mod, msa, f, inside, outside)) { delta_lnl = 0; d->cdata->params->data[0] = d2->cdata->params->data[0] = d2->cdata->params->data[1] = 1; } else { /* compute log likelihoods under null and alt hypotheses */ d->feat = f; vec_set(d->cdata->params, 0, d->cdata->init_scale); opt_newton_1d(ff_likelihood_wrapper_1d, &d->cdata->params->data[0], d, &null_lnl, SIGFIGS, d->cdata->lb->data[0], d->cdata->ub->data[0], logf, NULL, NULL); null_lnl *= -1; d2->feat = f; vec_set(d2->cdata->params, 0, d->cdata->params->data[0]); /* init to previous estimate to save time */ vec_set(d2->cdata->params, 1, d2->cdata->init_scale_sub); // vec_set(d2->cdata->params, 1, 0.01); if (opt_bfgs(ff_likelihood_wrapper, d2->cdata->params, d2, &alt_lnl, d2->cdata->lb, d2->cdata->ub, logf, NULL, OPT_HIGH_PREC, NULL, NULL) != 0) ; /* do nothing; nonzero exit typically occurs when max iterations is reached; a warning is printed to the log */ alt_lnl *= -1; delta_lnl = alt_lnl - null_lnl; /* This is a hack, it would be better to figure out why the optimization sometimes fails here. If we get a significantly negative lnL, re-initialize params so that they are identical to null model params and re-start */ if (delta_lnl <= -0.05) { d2->feat = f; vec_set(d2->cdata->params, 0, d->cdata->params->data[0]); vec_set(d2->cdata->params, 1, 1.0); if (opt_bfgs(ff_likelihood_wrapper, d2->cdata->params, d2, &alt_lnl, d2->cdata->lb, d2->cdata->ub, logf, NULL, OPT_HIGH_PREC, NULL, NULL) != 0) if (delta_lnl <= -0.1) die("ERROR ff_lrts_sub: delta_lnl (%f) <= -0.1\n", delta_lnl); } if (delta_lnl < 0) delta_lnl = 0; } /* compute p-vals via chi-sq */ if (feat_pvals != NULL) { if (mode == NNEUT || mode == CONACC) feat_pvals[i] = chisq_cdf(2*delta_lnl, 1, FALSE); else feat_pvals[i] = half_chisq_cdf(2*delta_lnl, 1, FALSE); /* assumes 50:50 mix of chisq and point mass at zero, due to bounding of param */ if (feat_pvals[i] < 1e-20) feat_pvals[i] = 1e-20; /* approx limit of eval of tail prob; pvals of 0 cause problems */ if (mode == CONACC && d2->cdata->params->data[1] > 1) feat_pvals[i] *= -1; /* mark as acceleration */ } /* store scales and log likelihood ratios if necessary */ if (feat_null_scales != NULL) feat_null_scales[i] = d->cdata->params->data[0]; if (feat_scales != NULL) feat_scales[i] = d2->cdata->params->data[0]; if (feat_sub_scales != NULL) feat_sub_scales[i] = d2->cdata->params->data[1]; if (feat_llrs != NULL) feat_llrs[i] = delta_lnl; } ff_free_fit_data(d); ff_free_fit_data(d2); modcpy->estimate_branchlens = TM_BRANCHLENS_ALL; /* have to revert for tm_free to work correctly */ tm_free(modcpy); if (inside != NULL) lst_free(inside); if (outside != NULL) lst_free(outside); }
/* Subtree version of score test */ void ff_score_tests_sub(TreeModel *mod, MSA *msa, GFF_Set *gff, mode_type mode, double *feat_pvals, double *feat_null_scales, double *feat_derivs, double *feat_sub_derivs, double *feat_teststats, FILE *logf) { int i; FeatFitData *d, *d2; Vector *grad = vec_new(2); Matrix *fim = mat_new(2, 2); double lnl, teststat; FimGrid *grid; List *inside=NULL, *outside=NULL; TreeModel *modcpy = tm_create_copy(mod); /* need separate copy of tree model with different internal scaling data for supertree/subtree case */ /* init FeatFitData -- one for null model, one for alt */ d = ff_init_fit_data(modcpy, msa, ALL, NNEUT, FALSE); d2 = ff_init_fit_data(mod, msa, SUBTREE, NNEUT, FALSE); /* mod has the subtree info, modcpy does not */ /* precompute Fisher information matrices for a grid of scale values */ grid = col_fim_grid_sub(mod); /* prepare lists of leaves inside and outside root, for use in checking for informative substitutions */ if (mod->subtree_root != NULL) { inside = lst_new_ptr(mod->tree->nnodes); outside = lst_new_ptr(mod->tree->nnodes); tr_partition_leaves(mod->tree, mod->subtree_root, inside, outside); } /* iterate through features */ for (i = 0; i < lst_size(gff->features); i++) { checkInterrupt(); d->feat = lst_get_ptr(gff->features, i); /* first check for informative substitution data in feature; if none, don't waste time computing likelihoods */ if (!ff_has_data_sub(mod, msa, d->feat, inside, outside)) { teststat = 0; vec_zero(grad); } else { vec_set(d->cdata->params, 0, d->cdata->init_scale); opt_newton_1d(ff_likelihood_wrapper_1d, &d->cdata->params->data[0], d, &lnl, SIGFIGS, d->cdata->lb->data[0], d->cdata->ub->data[0], logf, NULL, NULL); /* turns out to be faster to use numerical rather than exact derivatives (judging by col case) */ d2->feat = d->feat; d2->cdata->mod->scale = d->cdata->params->data[0]; d2->cdata->mod->scale_sub = 1; tm_set_subst_matrices(d2->cdata->mod); ff_scale_derivs_subtree(d2, grad, NULL, d2->cdata->fels_scratch); fim = col_get_fim_sub(grid, d2->cdata->mod->scale); mat_scale(fim, d->feat->end - d->feat->start + 1); /* scale column-by-column FIM by length of feature (expected values are additive) */ teststat = grad->data[1]*grad->data[1] / (fim->data[1][1] - fim->data[0][1]*fim->data[1][0]/fim->data[0][0]); if (teststat < 0) { fprintf(stderr, "WARNING: teststat < 0 (%f)\n", teststat); teststat = 0; } if ((mode == ACC && grad->data[1] < 0) || (mode == CON && grad->data[1] > 0)) teststat = 0; /* derivative points toward boundary; truncate at 0 */ mat_free(fim); } if (feat_pvals != NULL) { if (mode == NNEUT || mode == CONACC) feat_pvals[i] = chisq_cdf(teststat, 1, FALSE); else feat_pvals[i] = half_chisq_cdf(teststat, 1, FALSE); /* assumes 50:50 mix of chisq and point mass at zero */ if (feat_pvals[i] < 1e-20) feat_pvals[i] = 1e-20; /* approx limit of eval of tail prob; pvals of 0 cause problems */ if (mode == CONACC && grad->data[1] > 0) feat_pvals[i] *= -1; /* mark as acceleration */ } /* store scales and log likelihood ratios if necessary */ if (feat_null_scales != NULL) feat_null_scales[i] = d->cdata->params->data[0]; if (feat_derivs != NULL) feat_derivs[i] = grad->data[0]; if (feat_sub_derivs != NULL) feat_sub_derivs[i] = grad->data[1]; if (feat_teststats != NULL) feat_teststats[i] = teststat; } ff_free_fit_data(d); ff_free_fit_data(d2); vec_free(grad); modcpy->estimate_branchlens = TM_BRANCHLENS_ALL; /* have to revert for tm_free to work correctly */ tm_free(modcpy); col_free_fim_grid(grid); if (inside != NULL) lst_free(inside); if (outside != NULL) lst_free(outside); }
/* Perform a likelihood ratio test for each feature in a GFF, comparing the given null model with an alternative model that has a free scaling parameter for all branches. Assumes a 0th order model, leaf-to-sequence mapping already available, prob matrices computed, sufficient stats available. Computes p-values based using the chi-sq distribution and stores them in feat_pvals. Will optionally store the individual scale factors in feat_scales and raw log likelihood ratios in feat_llrs if these variables are non-NULL. Must define mode as CON (for 0 <= scale <= 1), ACC (for 1 <= scale), NNEUT (0 <= scale), or CONACC (0 <= scale) */ void ff_lrts(TreeModel *mod, MSA *msa, GFF_Set *gff, mode_type mode, double *feat_pvals, double *feat_scales, double *feat_llrs, FILE *logf) { int i; FeatFitData *d; double null_lnl, alt_lnl, delta_lnl, this_scale = 1; /* init FeatFitData */ d = ff_init_fit_data(mod, msa, ALL, mode, FALSE); /* iterate through features */ for (i = 0; i < lst_size(gff->features); i++) { GFF_Feature *f = lst_get_ptr(gff->features, i); checkInterrupt(); /* first check for actual substitution data in feature; if none, don't waste time computing likelihoods */ if (!ff_has_data(mod, msa, f)) { delta_lnl = 0; this_scale = 1; } else { mod->scale = 1; tm_set_subst_matrices(mod); /* compute log likelihoods under null and alt hypotheses */ null_lnl = ff_compute_log_likelihood(mod, msa, f, d->cdata->fels_scratch[0]); vec_set(d->cdata->params, 0, d->cdata->init_scale); d->feat = f; opt_newton_1d(ff_likelihood_wrapper_1d, &d->cdata->params->data[0], d, &alt_lnl, SIGFIGS, d->cdata->lb->data[0], d->cdata->ub->data[0], logf, NULL, NULL); /* turns out to be faster to use numerical rather than exact derivatives (judging by col case) */ alt_lnl *= -1; this_scale = d->cdata->params->data[0]; delta_lnl = alt_lnl - null_lnl; if (delta_lnl <= -0.01) die("ERROR ff_lrts: delta_lnl (%f) <= -0.01\n", delta_lnl); if (delta_lnl < 0) delta_lnl = 0; } /* compute p-vals via chi-sq */ if (feat_pvals != NULL) { if (mode == NNEUT || mode == CONACC) feat_pvals[i] = chisq_cdf(2*delta_lnl, 1, FALSE); else feat_pvals[i] = half_chisq_cdf(2*delta_lnl, 1, FALSE); /* assumes 50:50 mix of chisq and point mass at zero, due to bounding of param */ if (feat_pvals[i] < 1e-20) feat_pvals[i] = 1e-20; /* approx limit of eval of tail prob; pvals of 0 cause problems */ if (mode == CONACC && this_scale > 1) feat_pvals[i] *= -1; /* mark as acceleration */ } /* store scales and log likelihood ratios if necessary */ if (feat_scales != NULL) feat_scales[i] = this_scale; if (feat_llrs != NULL) feat_llrs[i] = delta_lnl; } ff_free_fit_data(d); sfree(d); //above function doesn't actually free object }