Exemple #1
0
//gets MLE for time interval start_t through end_t inclusive- one parameter for
// whole combined interval
double mle_one_popsize(int start_t, int end_t, double init_popsize, void *data0) {
    double popsize, log_popsize = log(init_popsize);
    double likelihood;
    int sigfigs=4;
    struct popsize_data *data = (struct popsize_data*)data0;
    static double min_popsize = log(100);
    static double max_popsize = log(1e7);
    data->min_t = start_t;
    data->max_t = end_t;
    opt_newton_1d(one_popsize_neg_likelihood, &log_popsize, data0, &likelihood, sigfigs, min_popsize, max_popsize, NULL, NULL, NULL);
    popsize = exp(log_popsize);
    double dlike2=0.0;
    for (int t=start_t; t <= end_t; t++)  {
	double tmp;
	one_popsize_like_and_dlike(t, log_popsize, data, NULL, NULL, &tmp);
	dlike2 += tmp;
    }
    double sd = sqrt(-1.0/dlike2);
    double popsize_min = exp(log_popsize - 2*sd);
    double popsize_max = exp(log_popsize + 2*sd);
    for (int t=start_t; t <= end_t; t++) {
	printLog(LOG_LOW, "mle_popsize %i\t%f\t%f\t%.1f\t%.1f\t%f\t%.1f\t%.1f\n", t, popsize, likelihood, data->coal_totals[t], data->nocoal_totals[t], sqrt(-1.0/dlike2), popsize_min, popsize_max);
    }
   return popsize;
}
Exemple #2
0
/* Perform a GERP-like computation for each feature.  Computes expected
   number of subst. under neutrality (feat_nneut), expected number
   after rescaling by ML (feat_nobs), expected number of rejected
   substitutions (feat_nrejected), and number of species with data
   (feat_nspecies).  If any arrays are NULL, values will not be
   retained.  Gaps and missing data are handled by working with
   induced subtree.  */
void ff_gerp(TreeModel *mod, MSA *msa, GFF_Set *gff, mode_type mode,
             double *feat_nneut, double *feat_nobs, double *feat_nrejected,
             double *feat_nspec, FILE *logf) {
    int i, j, nspec = 0;
    double nneut, scale, lnl;
    int *has_data = smalloc(mod->tree->nnodes * sizeof(int));
    FeatFitData *d;

    /* init FeatFitData */
    d = ff_init_fit_data(mod, msa, ALL, NNEUT, FALSE);

    /* iterate through features  */
    for (i = 0; i < lst_size(gff->features); i++) {
        GFF_Feature *f = lst_get_ptr(gff->features, i);
        checkInterrupt();
        ff_find_missing_branches(mod, msa, f, has_data, &nspec);

        if (nspec < 3)
            nneut = scale = 0;
        else {
            vec_set(d->cdata->params, 0, d->cdata->init_scale);
            d->feat = f;

            opt_newton_1d(ff_likelihood_wrapper_1d, &d->cdata->params->data[0], d,
                          &lnl, SIGFIGS, d->cdata->lb->data[0], d->cdata->ub->data[0],
                          logf, NULL, NULL);
            /* turns out to be faster to use numerical rather than exact
               derivatives (judging by col case) */

            scale = d->cdata->params->data[0];
            for (j = 1, nneut = 0; j < mod->tree->nnodes; j++)  /* node 0 is root */
                if (has_data[j])
                    nneut += ((TreeNode*)lst_get_ptr(mod->tree->nodes, j))->dparent;
        }

        if (feat_nspec != NULL) feat_nspec[i] = (double)nspec;
        if (feat_nneut != NULL) feat_nneut[i] = nneut;
        if (feat_nobs != NULL) feat_nobs[i] = scale * nneut;
        if (feat_nrejected != NULL) {
            feat_nrejected[i] = nneut * (1 - scale);
            if (mode == ACC) feat_nrejected[i] *= -1;
            else if (mode == NNEUT) feat_nrejected[i] = fabs(feat_nrejected[i]);
        }
    }
    ff_free_fit_data(d);
    sfree(has_data);
}
Exemple #3
0
/* Subtree version of LRT */
void ff_lrts_sub(TreeModel *mod, MSA *msa, GFF_Set *gff, mode_type mode,
                 double *feat_pvals, double *feat_null_scales,
                 double *feat_scales, double *feat_sub_scales,
                 double *feat_llrs, FILE *logf) {
    int i;
    FeatFitData *d, *d2;
    double null_lnl, alt_lnl, delta_lnl;
    TreeModel *modcpy;
    List *inside=NULL, *outside=NULL;

    modcpy = tm_create_copy(mod);   /* need separate copy of tree model
                                     with different internal scaling
                                     data for supertree/subtree case */

    /* init ColFitData -- one for null model, one for alt */
    modcpy->estimate_branchlens = TM_BRANCHLENS_ALL;
    modcpy->subtree_root = NULL;
    d = ff_init_fit_data(modcpy, msa, ALL, NNEUT, FALSE);
    d2 = ff_init_fit_data(mod, msa, SUBTREE, mode, FALSE);
    /* mod has the subtree info, modcpy
       does not */

    /* prepare lists of leaves inside and outside root, for use in
       checking for informative substitutions */
    if (mod->subtree_root != NULL) {
        inside = lst_new_ptr(mod->tree->nnodes);
        outside = lst_new_ptr(mod->tree->nnodes);
        tr_partition_leaves(mod->tree, mod->subtree_root, inside, outside);
    }

    /* iterate through features  */
    for (i = 0; i < lst_size(gff->features); i++) {
        GFF_Feature *f = lst_get_ptr(gff->features, i);
        checkInterrupt();

        /* first check for informative substitution data in feature; if none,
           don't waste time computing likelihoods */
        if (!ff_has_data_sub(mod, msa, f, inside, outside)) {
            delta_lnl = 0;
            d->cdata->params->data[0] = d2->cdata->params->data[0] =
                                            d2->cdata->params->data[1] = 1;
        }

        else {
            /* compute log likelihoods under null and alt hypotheses */
            d->feat = f;
            vec_set(d->cdata->params, 0, d->cdata->init_scale);
            opt_newton_1d(ff_likelihood_wrapper_1d, &d->cdata->params->data[0], d,
                          &null_lnl, SIGFIGS, d->cdata->lb->data[0],
                          d->cdata->ub->data[0], logf, NULL, NULL);
            null_lnl *= -1;

            d2->feat = f;
            vec_set(d2->cdata->params, 0, d->cdata->params->data[0]);
            /* init to previous estimate to save time */
            vec_set(d2->cdata->params, 1, d2->cdata->init_scale_sub);
            //      vec_set(d2->cdata->params, 1, 0.01);
            if (opt_bfgs(ff_likelihood_wrapper, d2->cdata->params, d2, &alt_lnl,
                         d2->cdata->lb, d2->cdata->ub, logf, NULL,
                         OPT_HIGH_PREC, NULL, NULL) != 0)
                ;                         /* do nothing; nonzero exit typically
                                     occurs when max iterations is
                                     reached; a warning is printed to
                                     the log */
            alt_lnl *= -1;

            delta_lnl = alt_lnl - null_lnl;

            /* This is a hack, it would be better to figure out why the
            optimization sometimes fails here.
             If we get a significantly negative lnL, re-initialize params
             so that they are identical to null model params and re-start */
            if (delta_lnl <= -0.05) {
                d2->feat = f;
                vec_set(d2->cdata->params, 0, d->cdata->params->data[0]);
                vec_set(d2->cdata->params, 1, 1.0);
                if (opt_bfgs(ff_likelihood_wrapper, d2->cdata->params, d2, &alt_lnl,
                             d2->cdata->lb, d2->cdata->ub, logf, NULL,
                             OPT_HIGH_PREC, NULL, NULL) != 0)
                    if (delta_lnl <= -0.1)
                        die("ERROR ff_lrts_sub: delta_lnl (%f) <= -0.1\n", delta_lnl);
            }
            if (delta_lnl < 0) delta_lnl = 0;
        }

        /* compute p-vals via chi-sq */
        if (feat_pvals != NULL) {
            if (mode == NNEUT || mode == CONACC)
                feat_pvals[i] = chisq_cdf(2*delta_lnl, 1, FALSE);
            else
                feat_pvals[i] = half_chisq_cdf(2*delta_lnl, 1, FALSE);
            /* assumes 50:50 mix of chisq and point mass at zero, due to
               bounding of param */

            if (feat_pvals[i] < 1e-20)
                feat_pvals[i] = 1e-20;
            /* approx limit of eval of tail prob; pvals of 0 cause problems */

            if (mode == CONACC && d2->cdata->params->data[1] > 1)
                feat_pvals[i] *= -1;    /* mark as acceleration */
        }

        /* store scales and log likelihood ratios if necessary */
        if (feat_null_scales != NULL)
            feat_null_scales[i] = d->cdata->params->data[0];
        if (feat_scales != NULL)
            feat_scales[i] = d2->cdata->params->data[0];
        if (feat_sub_scales != NULL)
            feat_sub_scales[i] = d2->cdata->params->data[1];
        if (feat_llrs != NULL)
            feat_llrs[i] = delta_lnl;
    }

    ff_free_fit_data(d);
    ff_free_fit_data(d2);
    modcpy->estimate_branchlens = TM_BRANCHLENS_ALL;
    /* have to revert for tm_free to work
       correctly */
    tm_free(modcpy);
    if (inside != NULL) lst_free(inside);
    if (outside != NULL) lst_free(outside);
}
Exemple #4
0
/* Subtree version of score test */
void ff_score_tests_sub(TreeModel *mod, MSA *msa, GFF_Set *gff, mode_type mode,
                        double *feat_pvals, double *feat_null_scales,
                        double *feat_derivs, double *feat_sub_derivs,
                        double *feat_teststats, FILE *logf) {
    int i;
    FeatFitData *d, *d2;
    Vector *grad = vec_new(2);
    Matrix *fim = mat_new(2, 2);
    double lnl, teststat;
    FimGrid *grid;
    List *inside=NULL, *outside=NULL;
    TreeModel *modcpy = tm_create_copy(mod); /* need separate copy of tree model
                                              with different internal scaling
                                              data for supertree/subtree case */

    /* init FeatFitData -- one for null model, one for alt */
    d = ff_init_fit_data(modcpy, msa, ALL, NNEUT, FALSE);
    d2 = ff_init_fit_data(mod, msa, SUBTREE, NNEUT, FALSE);
    /* mod has the subtree info, modcpy
       does not */

    /* precompute Fisher information matrices for a grid of scale values */
    grid = col_fim_grid_sub(mod);

    /* prepare lists of leaves inside and outside root, for use in
       checking for informative substitutions */
    if (mod->subtree_root != NULL) {
        inside = lst_new_ptr(mod->tree->nnodes);
        outside = lst_new_ptr(mod->tree->nnodes);
        tr_partition_leaves(mod->tree, mod->subtree_root, inside, outside);
    }

    /* iterate through features  */
    for (i = 0; i < lst_size(gff->features); i++) {
        checkInterrupt();
        d->feat = lst_get_ptr(gff->features, i);

        /* first check for informative substitution data in feature; if none,
           don't waste time computing likelihoods */
        if (!ff_has_data_sub(mod, msa, d->feat, inside, outside)) {
            teststat = 0;
            vec_zero(grad);
        }

        else {
            vec_set(d->cdata->params, 0, d->cdata->init_scale);
            opt_newton_1d(ff_likelihood_wrapper_1d, &d->cdata->params->data[0], d,
                          &lnl, SIGFIGS, d->cdata->lb->data[0], d->cdata->ub->data[0],
                          logf, NULL, NULL);
            /* turns out to be faster to use numerical rather than exact
               derivatives (judging by col case) */

            d2->feat = d->feat;
            d2->cdata->mod->scale = d->cdata->params->data[0];
            d2->cdata->mod->scale_sub = 1;
            tm_set_subst_matrices(d2->cdata->mod);
            ff_scale_derivs_subtree(d2, grad, NULL, d2->cdata->fels_scratch);

            fim = col_get_fim_sub(grid, d2->cdata->mod->scale);
            mat_scale(fim, d->feat->end - d->feat->start + 1);
            /* scale column-by-column FIM by length of feature (expected
               values are additive) */

            teststat = grad->data[1]*grad->data[1] /
                       (fim->data[1][1] - fim->data[0][1]*fim->data[1][0]/fim->data[0][0]);

            if (teststat < 0) {
                fprintf(stderr, "WARNING: teststat < 0 (%f)\n", teststat);
                teststat = 0;
            }

            if ((mode == ACC && grad->data[1] < 0) ||
                    (mode == CON && grad->data[1] > 0))
                teststat = 0;             /* derivative points toward boundary;
                                     truncate at 0 */

            mat_free(fim);
        }

        if (feat_pvals != NULL) {
            if (mode == NNEUT || mode == CONACC)
                feat_pvals[i] = chisq_cdf(teststat, 1, FALSE);
            else
                feat_pvals[i] = half_chisq_cdf(teststat, 1, FALSE);
            /* assumes 50:50 mix of chisq and point mass at zero */

            if (feat_pvals[i] < 1e-20)
                feat_pvals[i] = 1e-20;
            /* approx limit of eval of tail prob; pvals of 0 cause problems */

            if (mode == CONACC && grad->data[1] > 0)
                feat_pvals[i] *= -1; /* mark as acceleration */
        }

        /* store scales and log likelihood ratios if necessary */
        if (feat_null_scales != NULL) feat_null_scales[i] = d->cdata->params->data[0];
        if (feat_derivs != NULL) feat_derivs[i] = grad->data[0];
        if (feat_sub_derivs != NULL) feat_sub_derivs[i] = grad->data[1];
        if (feat_teststats != NULL) feat_teststats[i] = teststat;
    }

    ff_free_fit_data(d);
    ff_free_fit_data(d2);
    vec_free(grad);
    modcpy->estimate_branchlens = TM_BRANCHLENS_ALL;
    /* have to revert for tm_free to work
       correctly */
    tm_free(modcpy);
    col_free_fim_grid(grid);
    if (inside != NULL) lst_free(inside);
    if (outside != NULL) lst_free(outside);
}
Exemple #5
0
/* Perform a likelihood ratio test for each feature in a GFF,
   comparing the given null model with an alternative model that has a
   free scaling parameter for all branches.  Assumes a 0th order
   model, leaf-to-sequence mapping already available, prob matrices
   computed, sufficient stats available.  Computes p-values based
   using the chi-sq distribution and stores them in feat_pvals.  Will
   optionally store the individual scale factors in feat_scales and
   raw log likelihood ratios in feat_llrs if these variables are
   non-NULL.  Must define mode as CON (for 0 <= scale <= 1), ACC (for
   1 <= scale), NNEUT (0 <= scale), or CONACC (0 <= scale) */
void ff_lrts(TreeModel *mod, MSA *msa, GFF_Set *gff, mode_type mode,
             double *feat_pvals, double *feat_scales, double *feat_llrs,
             FILE *logf) {
    int i;
    FeatFitData *d;
    double null_lnl, alt_lnl, delta_lnl, this_scale = 1;

    /* init FeatFitData */
    d = ff_init_fit_data(mod, msa, ALL, mode, FALSE);

    /* iterate through features  */
    for (i = 0; i < lst_size(gff->features); i++) {
        GFF_Feature *f = lst_get_ptr(gff->features, i);
        checkInterrupt();

        /* first check for actual substitution data in feature; if none,
           don't waste time computing likelihoods */
        if (!ff_has_data(mod, msa, f)) {
            delta_lnl = 0;
            this_scale = 1;
        }

        else {
            mod->scale = 1;
            tm_set_subst_matrices(mod);

            /* compute log likelihoods under null and alt hypotheses */
            null_lnl = ff_compute_log_likelihood(mod, msa, f,
                                                 d->cdata->fels_scratch[0]);

            vec_set(d->cdata->params, 0, d->cdata->init_scale);
            d->feat = f;

            opt_newton_1d(ff_likelihood_wrapper_1d, &d->cdata->params->data[0], d,
                          &alt_lnl, SIGFIGS, d->cdata->lb->data[0],
                          d->cdata->ub->data[0], logf, NULL, NULL);
            /* turns out to be faster to use numerical rather than exact
               derivatives (judging by col case) */

            alt_lnl *= -1;
            this_scale = d->cdata->params->data[0];

            delta_lnl = alt_lnl - null_lnl;
            if (delta_lnl <= -0.01)
                die("ERROR ff_lrts: delta_lnl (%f) <= -0.01\n", delta_lnl);
            if (delta_lnl < 0) delta_lnl = 0;
        }

        /* compute p-vals via chi-sq */
        if (feat_pvals != NULL) {
            if (mode == NNEUT || mode == CONACC)
                feat_pvals[i] = chisq_cdf(2*delta_lnl, 1, FALSE);
            else
                feat_pvals[i] = half_chisq_cdf(2*delta_lnl, 1, FALSE);
            /* assumes 50:50 mix of chisq and point mass at zero, due to
               bounding of param */

            if (feat_pvals[i] < 1e-20)
                feat_pvals[i] = 1e-20;
            /* approx limit of eval of tail prob; pvals of 0 cause problems */

            if (mode == CONACC && this_scale > 1)
                feat_pvals[i] *= -1; /* mark as acceleration */
        }

        /* store scales and log likelihood ratios if necessary */
        if (feat_scales != NULL) feat_scales[i] = this_scale;
        if (feat_llrs != NULL) feat_llrs[i] = delta_lnl;
    }

    ff_free_fit_data(d);
    sfree(d);  //above function doesn't actually free object
}