Exemple #1
0
int main(int argc, char *argv[]) {

    struct        RNAfold_args_info args_info;
    char          *string, *input_string, *structure=NULL, *cstruc=NULL;
    char          fname[80], ffname[80], gfname[80], *ParamFile=NULL;
    char          *ns_bases=NULL, *c;
    int           i, j, ii, jj, mu, length, l, sym, r, pf=0, noconv=0;
    unsigned int  input_type;
    double        energy, min_en, kT, sfact=1.07;
    int           doMEA=0, circular = 0, N;
    char *pf_struc;
    double dist;
    plist *pl;

    FILE * filehandle;
    FILE * statsfile;
    char* line;

    double tau   = 0.01; /* Variance of energy parameters */
    double sigma = 0.01; /* Variance of experimental constraints */
    double *gradient;           /* Gradient for steepest descent search
                                 epsilon[i+1]= epsilon[i] - gradient *
                                 step_size */
    double initial_step_size = 0.5;  /* Initial step size for steepest
                                      descent search */
    double step_size;
    double D;                  /* Discrepancy (i.e. value of objective
                                function) for the current
                                prediction */
    int iteration, max_iteration = 2000; /* Current and maximum number of
                                         iterations after which
                                         algorithm stops */

    double precision = 0.1; /* cutoff used for stop conditions */
    double tolerance = 0.1;   /* Parameter used by various GSL minimizers */
    int method_id = 1;        /* Method to use for minimization, 0 and 1
                               are custom steepest descent, the rest
                               are GSL implementations (see below)*/

    int initial_guess_method = 0;

    int sample_N = 1000;

    double *prev_epsilon;
    double *prev_gradient;
    double DD, prev_D, sum, norm;
    int status;
    double* gradient_numeric;
    double* gradient_numeric_gsl;

    /* Minimizer vars */
    const gsl_multimin_fdfminimizer_type *T;
    gsl_multimin_fdfminimizer *minimizer;
    gsl_vector *minimizer_x;
    gsl_vector *minimizer_g;
    gsl_multimin_function_fdf minimizer_func;
    minimizer_pars_struct minimizer_pars;

    char *constraints;
    char outfile[256];
    char constraints_file[256];
    char epsilon_file[256];
    FILE* fh;

    double last_non_nan_lnQ;

    pf_overflow = 0;
    pf_underflow = 0;

    dangles=2;

    do_backtrack  = 1;
    string        = NULL;

    noPS = 0;
    outfile[0]='\0';
    epsilon_file[0]='\0';
    strcpy(psDir, "dotplots");

    if(RNAfold_cmdline_parser (argc, argv, &args_info) != 0) exit(1);

    /* RNAbpfold specific options */

    if (args_info.tau_given) tau = args_info.tau_arg;
    if (args_info.sigma_given) sigma = args_info.sigma_arg;
    if (args_info.precision_given) precision = args_info.precision_arg;
    if (args_info.step_given) initial_step_size = args_info.step_arg;
    if (args_info.maxN_given) max_iteration = args_info.maxN_arg;
    if (args_info.minimization_given) method_id = args_info.minimization_arg;
    if (args_info.init_given) initial_guess_method = args_info.init_arg;
    if (args_info.tolerance_given) tolerance = args_info.tolerance_arg;
    if (args_info.outfile_given) strcpy(outfile, args_info.outfile_arg);
    if (args_info.constraints_given) strcpy(constraints_file, args_info.constraints_arg);
    if (args_info.epsilon_given) strcpy(epsilon_file, args_info.epsilon_arg);
    if (args_info.sampleGradient_given) sample_conditionals=1;
    if (args_info.hybridGradient_given) {
        sample_conditionals=1;
        hybrid_conditionals=1;
    }
    if (args_info.numericalGradient_given) numerical=1;
    if (args_info.sampleStructure_given) sample_structure=1;
    if (args_info.psDir_given) strcpy(psDir, args_info.psDir_arg);
    if (args_info.sparsePS_given) sparsePS=args_info.sparsePS_arg;
    if (args_info.gridSearch_given) grid_search = 1;


    /* Generic RNAfold options */

    if (args_info.temp_given)        temperature = args_info.temp_arg;
    if (args_info.reference_given)  fold_constrained=1;
    if (args_info.noTetra_given)     tetra_loop=0;
    if (args_info.dangles_given)     dangles = args_info.dangles_arg;
    if (args_info.noLP_given)        noLonelyPairs = 1;
    if (args_info.noGU_given)        noGU = 1;
    if (args_info.noClosingGU_given) no_closingGU = 1;
    if (args_info.noconv_given)      noconv = 1;
    if (args_info.energyModel_given) energy_set = args_info.energyModel_arg;
    if (args_info.paramFile_given)   ParamFile = strdup(args_info.paramFile_arg);
    if (args_info.nsp_given)         ns_bases = strdup(args_info.nsp_arg);
    if (args_info.pfScale_given)     sfact = args_info.pfScale_arg;
    if (args_info.noPS_given)        noPS=1;



    /* Create postscript directory */
    if (!noPS) {
        struct stat stat_p;
        if (stat (psDir, &stat_p) != 0) {
            if (mkdir(psDir, S_IRWXU|S_IROTH|S_IRGRP ) !=0) {
                fprintf(stderr, "WARNING: Could not create directory: %s", psDir);
            }
        }
    }

    if (ParamFile != NULL) {
        read_parameter_file(ParamFile);
    }

    if (ns_bases != NULL) {
        nonstandards = space(33);
        c=ns_bases;
        i=sym=0;
        if (*c=='-') {
            sym=1;
            c++;
        }
        while (*c!='\0') {
            if (*c!=',') {
                nonstandards[i++]=*c++;
                nonstandards[i++]=*c;
                if ((sym)&&(*c!=*(c-1))) {
                    nonstandards[i++]=*c;
                    nonstandards[i++]=*(c-1);
                }
            }
            c++;
        }
    }

    /*Read sequence*/
    fname[0] = '\0';
    while((input_type = get_input_line(&input_string, 0)) & VRNA_INPUT_FASTA_HEADER) {
        (void) sscanf(input_string, "%42s", fname);
        free(input_string);
    }

    length = (int)    strlen(input_string);
    string = strdup(input_string);
    free(input_string);
    structure = (char *) space((unsigned) length+1);

    /* For testing purpose pass dot bracket structure of reference structure via -C */
    if (fold_constrained) {
        input_type = get_input_line(&input_string, VRNA_INPUT_NOSKIP_COMMENTS);
        if(input_type & VRNA_INPUT_QUIT) {
            exit(1);
        }
        else if((input_type & VRNA_INPUT_MISC) && (strlen(input_string) > 0)) {
            cstruc = strdup(input_string);
            free(input_string);
        }
        else warn_user("-C was given but reference structure is missing");
    }

    if(noconv) {
        str_RNA2RNA(string);
    } else {
        str_DNA2RNA(string);
    }

    /* Allocating space */

    epsilon =     (double *) space(sizeof(double)*(length+1));

    exp_pert =  (double **)space(sizeof(double *)*(length+1));
    perturbations =  (double **)space(sizeof(double *)*(length+1));
    prev_epsilon = (double *) space(sizeof(double)*(length+1));
    gradient =    (double *) space(sizeof(double)*(length+1));
    gradient_numeric =    (double *) space(sizeof(double)*(length+1));
    gradient_numeric_gsl =    (double *) space(sizeof(double)*(length+1));
    prev_gradient = (double *) space(sizeof(double)*(length+1));

    q_unpaired = (double *) space(sizeof(double)*(length+1));
    p_unpaired_cond = (double **)space(sizeof(double *)*(length+1));
    p_unpaired_cond_sampled = (double **)space(sizeof(double *)*(length+1));
    p_pp =  (double **)space(sizeof(double *)*(length+1));
    p_unpaired =  (double *) space(sizeof(double)*(length+1));
    p_unpaired_tmp = (double *) space(sizeof(double)*(length+1));

    for (i=0; i <= length; i++) {
        epsilon[i] = gradient[i] = q_unpaired[i] = 0.0;
        p_unpaired_cond[i] = (double *) space(sizeof(double)*(length+1));
        p_unpaired_cond_sampled[i] = (double *) space(sizeof(double)*(length+1));
        p_pp[i] = (double *) space(sizeof(double)*(length+1));
        exp_pert[i] = (double *) space(sizeof(double)*(length+1));
        perturbations[i] = (double *) space(sizeof(double)*(length+1));
        for (j=0; j <= length; j++) {
            p_pp[i][j]=p_unpaired_cond[i][j] = 0.0;
            p_unpaired_cond_sampled[i][j] = 0.0;
        }
    }


    /*** If file with perturbation vector epsilon is given we fold using
         this epsilon and are done ***/

    if (args_info.epsilon_given) {
        plist *pl, *pl1,*pl2;

        filehandle = fopen (epsilon_file,"r");

        if (filehandle == NULL) {
            nrerror("Could not open file with perturbation vector.");
        }

        i=1;
        while (1) {
            double t;
            line = get_line(filehandle);
            if (line == NULL) break;
            if (i>length) nrerror("Too many values in perturbation vector file.");
            if (sscanf(line, "%lf", &epsilon[i]) !=1) {
                nrerror("Error while reading perturbation vector file.");
            }
            i++;
        }

        if (i-1 != length) {
            nrerror("Too few values in perturbation vector file.");
        }

        init_pf_fold(length);
        pf_fold_pb(string, NULL);

        sprintf(fname,"%s/dot.ps", psDir);
        pl1 = make_plist(length, 1e-5);

        (void) PS_dot_plot_list_epsilon(string, fname, NULL, pl1, epsilon, "");

        exit(0);
    }



    /*** Get constraints from reference structure or from external file ***/

    /* Structure was given by -C */
    if (fold_constrained) {
        for (i=0; i<length; i++) {
            if (cstruc[i] == '(' || cstruc[i] == ')') {
                q_unpaired[i+1] = 0.0;
            } else {
                q_unpaired[i+1] = 1.0;
            }
        }

        /*Read constraints from file*/
    } else {

        filehandle = fopen (constraints_file,"r");

        if (filehandle == NULL) {
            nrerror("No constraints given as dot bracket or wrong file name");
        }

        i=1;
        while (1) {
            double t;
            line = get_line(filehandle);
            if (line == NULL) break;
            if (i>length) nrerror("Too many values in constraints.dat");
            if (sscanf(line, "%lf", &q_unpaired[i]) !=1) {
                nrerror("Error while reading constraints.dat");
            }
            i++;
        }

        if (i-1 != length) {
            nrerror("Too few values in constraints.dat");
        }
    }

    /* Create file handle */
    if (outfile[0] !='\0') {
        statsfile = fopen (outfile,"w");
    } else {
        statsfile = fopen ("stats.dat","w");
    }

    setvbuf(statsfile, NULL, _IONBF, 0);

    if (!grid_search) {
        fprintf(statsfile, "Iteration\tDiscrepancy\tNorm\tdfCount\tMEA\tSampled_structure\tSampled_energy\tSampled_distance\tEpsilon\ttimestamp\n");
    } else {
        /* If we do a grid search we have a different output. */
        fprintf(statsfile, "Dummy\tm\tb\tdummy\tMEA\tepsilon\n");
    }

    if (statsfile == NULL) {
        nrerror("Could not open stats.dat for writing.");
    }

    fprintf(stderr, "tau^2 = %.4f; sigma^2 = %.4f; precision = %.4f; tolerance = %.4f; step-size: %.4f\n\n",
            tau, sigma, precision, tolerance, initial_step_size);

    st_back=1;
    min_en = fold(string, structure);

    (void) fflush(stdout);

    if (length>2000) free_arrays();

    pf_struc = (char *) space((unsigned) length+1);

    kT = (temperature+273.15)*1.98717/1000.; /* in Kcal */
    pf_scale = exp(-(sfact*min_en)/kT/length);

    /* Set up minimizer */

    minimizer_x = gsl_vector_alloc (length+1);
    minimizer_g = gsl_vector_alloc (length+1);

    for (i=0; i <= length; i++) {
        epsilon[i] = 0.0;
        gsl_vector_set (minimizer_g, i, 0.0);
        gsl_vector_set (minimizer_x, i, epsilon[i]);
    }

    minimizer_pars.length=length;
    minimizer_pars.seq = string;
    minimizer_pars.tau=tau;
    minimizer_pars.sigma=sigma;
    minimizer_pars.kT=kT;

    minimizer_func.n = length+1;
    minimizer_func.f = calculate_f;
    minimizer_func.df = numerical ? calculate_df_numerically: calculate_df;
    minimizer_func.fdf = calculate_fdf;
    minimizer_func.params = &minimizer_pars;


    //min_en = fold_pb(string, structure);
    //fprintf(stderr, "%f", min_en);
    //exit(0);

    /* Calling test functions for debugging */
    for (i=1; i <= length; i++) {
        if (i%2==0) {
            epsilon[i] = +0.2*i;
        } else {
            epsilon[i] = -0.2*i;
        }
    }

    //test_folding(string, length);
    /* //test_stochastic_backtracking(string, length); */
    /* //test_gradient(minimizer_func, minimizer_pars); */
    /* //test_gradient_sampling(minimizer_func, minimizer_pars); */
    //exit(1);


    count_df_evaluations=0;

    /* Initial guess for epsilon */

    if (initial_guess_method !=0 && initial_guess_method !=3) {

        /* Vars for inital guess methods */
        double m,b;
        double* curr_epsilon;
        double* best_epsilon;
        double best_m, best_b, best_scale;
        double curr_D;
        double min_D = 999999999.0;
        double inc = +0.25;
        double cut;

        if (initial_guess_method == 1) fprintf(stderr, "Mathew's constant perturbations\n");
        if (initial_guess_method == 2) fprintf(stderr, "Perturbations proportional to q-p\n");

        curr_epsilon = (double *) space(sizeof(double)*(length+1));
        best_epsilon = (double *) space(sizeof(double)*(length+1));

        last_non_nan_lnQ = min_en;

        // Calculate p_unpaired for unperturbed state which we need later
        // for the proportinal method
        if (initial_guess_method == 2) {

            init_pf_fold(length);

            for (i=0; i <= length; i++) {
                epsilon[i] = 0.0;
            }

            pf_fold_pb(string, NULL);
            for (i = 1; i < length; i++) {
                for (j = i+1; j<= length; j++) {
                    p_pp[i][j]=p_pp[j][i]=pr[iindx[i]-j];
                }
            }
            get_pair_prob_vector(p_pp, p_unpaired_tmp, length, 1);
            free_pf_arrays();
        }

        /* We do the same grid search as in the Mathews paper Fig. 4*/
        for (m=0.25; m <=7.0; m+=0.25) {

            // Weird way of writing this inner loop for the grid search. We
            // traverse the grid without big jumps in the parameters to make
            // sure that the updated scaling factor is accurate all the time.
            inc*=-1;

            for (b = inc < 0.0 ? 0.0 : -3.0; inc < 0.0 ? b >= -3.0 : b<= 0.0 ; b+=inc) {

                // calculate cut point with x-axis and skip parameter pairs
                // which give a cut point outside the range of
                // q_unpaired (0 to 1). They gave frequently overflows and the
                // idea is that we both want positive and negative perturbations
                cut = exp( (-1) * b / m ) - 1;

                fprintf(stderr, "\nm = %.2f, b = %.2f, cut=%.2f\n", m, b, cut);

                if (cut > 1.0 || cut < 0.01) {
                    fprintf(stderr, "\nSkipping m = %.2f, b = %.2f\n", m, b);
                    continue;
                }

                /* Mathew's constant perturbations */
                if (initial_guess_method == 1) {
                    for (i=0; i <= length; i++) {

                        /* We add epsilon to unpaired regions (as opposed to
                           paired regions as in the Mathews paper) so we multiply
                           by -1; if missing data we set it to 0.0 */

                        if (q_unpaired[i] < -0.5) {
                            curr_epsilon[i] = 0.0;
                        } else {
                            curr_epsilon[i] = (m *(log(q_unpaired[i]+1))+b) *(-1);
                        }

                        gsl_vector_set (minimizer_x, i, curr_epsilon[i]);
                    }
                    /* Perturbations proportional to q-p */
                } else {

                    for (i=0; i <= length; i++) {
                        curr_epsilon[i] = (m *(log(q_unpaired[i]+1)-log(p_unpaired_tmp[i]+1))+ b ) * (-1);
                        gsl_vector_set (minimizer_x, i, curr_epsilon[i]);
                    }
                }

                // Repeat and adjust scaling factor until we get result without over-/underflows
                do {

                    // First we use default scaling factor
                    if (pf_underflow == 0 && pf_overflow == 0) {
                        sfact = 1.070;
                    }

                    if (pf_underflow) {
                        sfact *= 0.8;
                        fprintf(stderr,"Underflow, adjusting sfact to %.4f\n", sfact );
                    }

                    if (pf_overflow) {
                        sfact *= 1.2;
                        fprintf(stderr,"Overflow, adjusting sfact to %.4f\n", sfact );
                    }

                    pf_scale = exp(-(sfact*last_non_nan_lnQ)/kT/length);

                    //fprintf(stderr,"Scaling factor is now: %.4e\n", pf_scale);

                    curr_D = calculate_f(minimizer_x, (void*)&minimizer_pars);

                    if (!isnan(last_lnQ)) last_non_nan_lnQ = last_lnQ;

                    // Give up when even extreme scaling does not give results
                    // (for some reason I could not get rid of overflows even with high scaling factors)
                    if (sfact < 0.1 || sfact > 2.0) break;

                } while (pf_underflow == 1 || pf_overflow == 1);

                // We have not given up so everything is ok now
                if (!(sfact < 0.1 || sfact > 2.0)) {

                    if (curr_D < min_D) {
                        min_D = curr_D;
                        for (i=0; i <= length; i++) {
                            best_epsilon[i] = curr_epsilon[i];
                        }
                        best_m = m;
                        best_b = b;
                        best_scale = pf_scale;
                    }

                    /*If we are interested in the grid search we misuse the
                      print_stats function and report m and b together with MEA*/
                    if (grid_search) {
                        for (i=0; i <= length; i++) {
                            epsilon[i] = curr_epsilon[i];
                        }
                        print_stats(statsfile, string, cstruc, length, 0, 0, m, 0.0, b, 0);
                    }

                    fprintf(stderr, "curr D: %.2f, minimum D: %.2f\n", curr_D, min_D);

                    // Adjust pf_scale with default scaling factor but lnQ from
                    // previous step
                    sfact = 1.070;
                    pf_scale = exp(-(sfact*last_lnQ)/kT/length);

                } else {
                    sfact = 1.070;
                    fprintf(stderr, "Skipping m = %.2f, b = %.2f; did not get stable result.\n", m, b);
                }
            } // for b
        } // for m

        fprintf(stderr, "Minimum found: m=%.2f, b=%.2f: %.2f\n", best_m, best_b, min_D);

        for (i=0; i <= length; i++) {
            epsilon[i] = best_epsilon[i];
            gsl_vector_set (minimizer_x, i, best_epsilon[i]);
        }
        pf_scale = best_scale;
    }

    if (initial_guess_method == 3) {
        srand((unsigned)time(0));
        for (i=0; i <= length; i++) {
            double r = (double)rand()/(double)RAND_MAX * 4 - 2;
            epsilon[i] = r;
            gsl_vector_set (minimizer_x, i, epsilon[i]);
        }
    }

    /* If we just wanted a grid search we are done now. */
    if (grid_search) {
        exit(0);
    }

    prev_D = calculate_f(minimizer_x, (void*)&minimizer_pars);

    print_stats(statsfile, string, cstruc, length, 0 , count_df_evaluations , prev_D, -1.0, 0.0,1);

    /* GSL minimization */

    if (method_id !=0) {

        if (method_id > 2) {
            char name[100];
            // Available algorithms
            //  3  gsl_multimin_fdfminimizer_conjugate_fr
            //  4  gsl_multimin_fdfminimizer_conjugate_pr
            //  5  gsl_multimin_fdfminimizer_vector_bfgs
            //  6  gsl_multimin_fdfminimizer_vector_bfgs2
            //  7  gsl_multimin_fdfminimizer_steepest_descent

            //   http://www.gnu.org/software/gsl/manual/html_node/Multimin-Algorithms-with-Derivatives.html

            switch (method_id) {
            case 2:
                minimizer = gsl_multimin_fdfminimizer_alloc (gsl_multimin_fdfminimizer_conjugate_fr, length+1);
                strcpy(name, "Fletcher-Reeves conjugate gradient");
                break;
            case 3:
                minimizer = gsl_multimin_fdfminimizer_alloc (gsl_multimin_fdfminimizer_conjugate_pr, length+1);
                strcpy(name, "Polak-Ribiere conjugate gradient");
                break;
            case 4:
                minimizer = gsl_multimin_fdfminimizer_alloc ( gsl_multimin_fdfminimizer_vector_bfgs, length+1);
                strcpy(name, "Broyden-Fletcher-Goldfarb-Shanno");
                break;
            case 5:
                minimizer = gsl_multimin_fdfminimizer_alloc ( gsl_multimin_fdfminimizer_vector_bfgs2, length+1);
                strcpy(name, "Broyden-Fletcher-Goldfarb-Shanno (improved version)");
                break;
            case 6:
                minimizer = gsl_multimin_fdfminimizer_alloc (gsl_multimin_fdfminimizer_steepest_descent, length+1);
                strcpy(name, "Gradient descent (GSL implmementation)");
                break;
            }

            fprintf(stderr, "Starting minimization via GSL implementation of %s...\n\n", name);

            // The last two parmeters are step size and tolerance (with
            // different meaning for different algorithms

            gsl_multimin_fdfminimizer_set (minimizer, &minimizer_func, minimizer_x, initial_step_size, tolerance);

            iteration = 1;

            do {

                status = gsl_multimin_fdfminimizer_iterate (minimizer);
                D = minimizer->f;
                norm = gsl_blas_dnrm2(minimizer->gradient);

                print_stats(statsfile, string, cstruc, length,iteration, count_df_evaluations, D, prev_D, norm, iteration%sparsePS == 0);

                prev_D = D;

                if (status) {
                    fprintf(stderr, "An unexpected error has occured in the iteration (status:%i)\n", status);
                    break;
                }

                status = gsl_multimin_test_gradient (minimizer->gradient, precision);
                if (status == GSL_SUCCESS) fprintf(stderr, "Minimum found stopping.\n");

                iteration++;

            } while (status == GSL_CONTINUE && iteration < max_iteration);

            gsl_multimin_fdfminimizer_free (minimizer);
            gsl_vector_free (minimizer_x);

            /* Custom implementation of steepest descent */
        } else {

            if (method_id == 1) {
                fprintf(stderr, "Starting custom implemented steepest descent search...\n\n");
            } else {
                fprintf(stderr, "Starting custom implemented steepest descent search with Barzilai Borwein step size...\n\n");
            }

            iteration = 0;
            D = 0.0;

            while (iteration++ < max_iteration) {

                for (i=1; i <= length; i++) {
                    gsl_vector_set (minimizer_x, i, epsilon[i]);
                }

                D = calculate_f(minimizer_x, (void*)&minimizer_pars);

                if (numerical) {
                    calculate_df_numerically(minimizer_x, (void*)&minimizer_pars, minimizer_g);
                } else {
                    calculate_df(minimizer_x, (void*)&minimizer_pars, minimizer_g);
                }

                for (i=1; i <= length; i++) {
                    gradient[i] = gsl_vector_get (minimizer_g, i);
                }

                // Do line search

                fprintf(stderr, "\nLine search:\n");

                // After the first iteration, use Barzilai-Borwain (1988) step size (currently turned off)
                if (iteration>1 && method_id==2) {

                    double denominator=0.0;
                    double numerator=0.0;

                    for (i=1; i <= length; i++) {
                        numerator += (epsilon[i]-prev_epsilon[i]) * (gradient[i]-prev_gradient[i]);
                        denominator+=(gradient[i]-prev_gradient[i]) * (gradient[i]-prev_gradient[i]);
                    }

                    step_size = numerator / denominator;

                    norm =1.0;
                } else {
                    // Use step sized given by the user (normalize it first)
                    step_size = initial_step_size / calculate_norm(gradient, length);
                }

                for (i=1; i <= length; i++) {
                    prev_epsilon[i] = epsilon[i];
                    prev_gradient[i] = gradient[i];
                }

                do {

                    for (mu=1; mu <= length; mu++) {
                        epsilon[mu] = prev_epsilon[mu] - step_size * gradient[mu];
                    }

                    for (i=1; i <= length; i++) {
                        gsl_vector_set (minimizer_x, i, epsilon[i]);
                    }

                    DD = calculate_f(minimizer_x, (void*)&minimizer_pars);

                    if (step_size > 0.0001) {
                        fprintf(stderr, "Old D: %.4f; New D: %.4f; Step size: %.4f\n", D, DD, step_size);
                    } else {
                        fprintf(stderr, "Old D: %.4f; New D: %.4f; Step size: %.4e\n", D, DD, step_size);
                    }

                    step_size /= 2;
                } while (step_size > 1e-12 && DD > D);

                norm = calculate_norm(gradient,length);

                if (DD > D) {
                    fprintf(stderr, "Line search did not improve D in iteration %i. Stop.\n", iteration);

                    if (hybrid_conditionals) {
                        sample_conditionals=0;
                    } else {
                        break;
                    }
                }

                print_stats(statsfile, string, cstruc, length,iteration, count_df_evaluations, DD, prev_D, norm, iteration%sparsePS == 0);

                if (norm<precision && iteration>1) {
                    fprintf(stderr, "Minimum found stopping.\n");
                    break;
                }

                prev_D = DD;

            }
        }

        /* Force last dotplot to be printed */
        print_stats(statsfile, string, cstruc, length,iteration, count_df_evaluations, DD, prev_D, norm, 1);
    }

    free(pf_struc);
    if (cstruc!=NULL) free(cstruc);
    (void) fflush(stdout);
    free(string);
    free(structure);
    RNAfold_cmdline_parser_free (&args_info);


    return 0;
}
Exemple #2
0
int main(int argc, char *argv[])
{
    char *string/*, *line*/;
    char *structure=NULL, *cstruc=NULL;
    /*char  fname[13], ffname[20], gfname[20];*/
    /*char  *ParamFile=NULL;*/
    char  *ns_bases=NULL, *c;
    int   i, length, l, sym/*, r*/;
    double energy, min_en;
    double kT, sfact=1.07;
    int   pf=0, noPS=0, istty;
    int noconv=0;
    int circ=0;

    AjPSeq  seq     = NULL;
    AjPFile confile = NULL;
    AjPFile paramfile = NULL;
    AjPFile outf = NULL;
    AjPFile essfile = NULL;
    AjPFile dotfilea = NULL;
    AjPFile dotfileb = NULL;
    

    AjPStr seqstring = NULL;
    AjPStr constring = NULL;
    AjPStr seqname   = NULL;
  
    float eT = 0.;
    AjBool eGU;
    AjBool ecirc = ajFalse;
  
    AjBool eclose;
    AjBool lonely;
    AjBool convert;
    AjPStr ensbases = NULL;
    AjBool etloop;
    AjPStr eenergy = NULL;
    char ewt = '\0';
    float escale = 0.;
    AjPStr edangles = NULL;
    char edangle = '\0';

    ajint len;



    embInitPV("vrnafold",argc,argv,"VIENNA",VERSION);
    
    
    seqstring = ajStrNew();
    constring = ajStrNew();
    seqname   = ajStrNew();
    
    
    seq       = ajAcdGetSeq("sequence");
    confile   = ajAcdGetInfile("constraintfile");
    paramfile = ajAcdGetInfile("paramfile");
    eT        = ajAcdGetFloat("temperature");
    ecirc     = ajAcdGetBoolean("circular");
    eGU       = ajAcdGetBoolean("gu");
    eclose    = ajAcdGetBoolean("closegu");
    lonely    = ajAcdGetBoolean("lp");
    convert   = ajAcdGetBoolean("convert");
    ensbases  = ajAcdGetString("nsbases");
    etloop    = ajAcdGetBoolean("tetraloop");
    eenergy   = ajAcdGetListSingle("energy");
    escale    = ajAcdGetFloat("scale");
    edangles  = ajAcdGetListSingle("dangles");
    outf      = ajAcdGetOutfile("outfile");
    essfile   = ajAcdGetOutfile("ssoutfile");
    /*
      dotfilea  = ajAcdGetOutfile("adotoutfile");
      dotfileb  = ajAcdGetOutfile("bdotoutfile");
    */
    
    do_backtrack = 2; 
    pf = 0;
    string = NULL;
    istty = 0;

    temperature   = (double) eT;
    circ          = !!ecirc;
    noGU          = (eGU) ? 0 : 1;
    no_closingGU  = (eclose) ? 0 : 1;
    noLonelyPairs = (lonely) ? 0 : 1;
    noconv        = (convert) ? 0 : 1;
    ns_bases      = (ajStrGetLen(ensbases)) ? MAJSTRGETPTR(ensbases) : NULL;
    tetra_loop    = !!etloop;
    
    ewt = *ajStrGetPtr(eenergy);
    if(ewt == '0')
	energy_set = 0;
    else if(ewt == '1')
	energy_set = 1;
    else if(ewt == '2')
	energy_set = 2;
    
    sfact = (double) escale;
    
    edangle = *ajStrGetPtr(edangles);
    if(edangle == '0')
	dangles = 0;
    else if(edangle == '1')
	dangles = 1;
    else if(edangle == '2')
	dangles = 2;
    else if(edangle == '3')
	dangles = 3;


    if(circ && noLonelyPairs)
    {

        ajWarn("Depending on the origin of the circular sequence\n"
               "some structures may be missed when using -noLP\nTry "
               "rotating your sequence a few times\n");        
    }


    if(paramfile)
	read_parameter_file(paramfile);
   
    if (ns_bases != NULL)
    {
	nonstandards = space(33);
	c=ns_bases;
	i=sym=0;

	if (*c=='-')
	{
	    sym=1; c++;
	}

	while (*c!='\0')
	{
	    if (*c!=',')
	    {
		nonstandards[i++]=*c++;
		nonstandards[i++]=*c;
		if ((sym)&&(*c!=*(c-1)))
		{
		    nonstandards[i++]=*c;
		    nonstandards[i++]=*(c-1);
		}
	    }
	    c++;
	}
    }


    if(confile)
	vienna_GetConstraints(confile,&constring);
    
    string = NULL;
    structure = NULL;

    length = ajSeqGetLen(seq);
    string = (char *) space(length+1);
    strcpy(string,ajSeqGetSeqC(seq));

    len = ajStrGetLen(constring);
    structure = (char *) space(length+1);
    if(len)
    {
	fold_constrained = 1;
	strcpy(structure,ajStrGetPtr(constring));
    }
    

    for (l = 0; l < length; l++) {
        string[l] = toupper(string[l]);
        if (!noconv && string[l] == 'T') string[l] = 'U';
    }

    /* initialize_fold(length); */
    if (circ)
        min_en = circfold(string, structure);
    else
        min_en = fold(string, structure);

    ajFmtPrintF(outf,"%s\n%s", string, structure);
    if (istty)
        printf("\n minimum free energy = %6.2f kcal/mol\n", min_en);
    else
        ajFmtPrintF(outf," (%6.2f)\n", min_en);

    if (!noPS)
    {
        if (length<2000)
            (void) PS_rna_plot(string, structure, essfile);
        else
            ajWarn("Structure too long, not doing xy_plot\n");
    }
    if (length>=2000) free_arrays(); 

    if (pf)
    {
        char *pf_struc;
        pf_struc = (char *) space((unsigned) length+1);
	if (dangles==1)
        {
            dangles=2;   /* recompute with dangles as in pf_fold() */
            min_en = (circ) ? energy_of_circ_struct(string, structure) :
                energy_of_struct(string, structure);
            dangles=1;
        }

        kT = (temperature+273.15)*1.98717/1000.; /* in Kcal */
        pf_scale = exp(-(sfact*min_en)/kT/length);

        if (length>2000)
            ajWarn("scaling factor %f\n", pf_scale);

        (circ) ? init_pf_circ_fold(length) : init_pf_fold(length);

        if (cstruc!=NULL)
            strncpy(pf_struc, cstruc, length+1);

        energy = (circ) ? pf_circ_fold(string, pf_struc) :
            pf_fold(string, pf_struc);

        if (do_backtrack)
        {
            ajFmtPrintF(outf,"%s", pf_struc);
            ajFmtPrintF(outf," [%6.2f]\n", energy);
        }

        if ((istty)||(!do_backtrack))
            ajFmtPrintF(outf," free energy of ensemble = %6.2f kcal/mol\n",
                        energy);

        if (do_backtrack)
        {
            plist *pl1,*pl2;
            char *cent;
            double dist, cent_en;
            cent = centroid(length, &dist);
            cent_en = (circ) ? energy_of_circ_struct(string, cent) :
                energy_of_struct(string, cent);
            ajFmtPrintF(outf,"%s {%6.2f d=%.2f}\n", cent, cent_en, dist);
            free(cent);

            pl1 = make_plist(length, 1e-5);
            pl2 = b2plist(structure);
            (void) PS_dot_plot_list(string, dotfilea, pl1, pl2, "");
            free(pl2);
            if (do_backtrack==2)
            {
                pl2 = stackProb(1e-5);
                PS_dot_plot_list(string, dotfileb, pl1, pl2,
                                 "Probabilities for stacked pairs (i,j)(i+1,j-1)");
                free(pl2);
            }
            free(pl1);
            free(pf_struc);
        }

        ajFmtPrintF(outf," frequency of mfe structure in ensemble %g; ",
                    exp((energy-min_en)/kT));

        if (do_backtrack)
            ajFmtPrintF(outf,"ensemble diversity %-6.2f", mean_bp_dist(length));

        ajFmtPrintF(outf,"\n");
        free_pf_arrays();

    }

    if (cstruc!=NULL)
        free(cstruc);

    free(string);
    free(structure);

    ajStrDel(&seqstring);
    ajStrDel(&constring);
    ajStrDel(&seqname);

    ajStrDel(&ensbases);
    ajStrDel(&eenergy);
    ajStrDel(&edangles);

    ajSeqDel(&seq);

    ajFileClose(&confile);
    ajFileClose(&paramfile);
    ajFileClose(&outf);
    ajFileClose(&essfile);

/*
  ajFileClose(&dotfilea);
  ajFileClose(&dotfileb);
*/  
    if (length<2000) free_arrays(); 
    embExit();
    
    return 0;
}
Exemple #3
0
void print_stats(FILE* statsfile, char* seq, char* struc, int length, int iteration, int count_df_evaluations, double D, double prev_D, double norm, int printPS) {

    plist *pl, *pl1,*pl2;
    char fname[100];
    char title[100];
    char* ss;
    double MEAgamma, mea, mea_en;
    char* output;
    int i,j;
    static char timestamp[40];
    const struct tm *tm;
    time_t now;

    ss = (char *) space((unsigned) length+1);
    memset(ss,'.',length);

    init_pf_fold(length);
    pf_fold_pb(seq, NULL);

    for (i = 1; i < length; i++) {
        for (j = i+1; j<= length; j++) {
            p_pp[i][j]=p_pp[j][i]=pr[iindx[i]-j];
        }
    }
    get_pair_prob_vector(p_pp, p_unpaired, length, 1);

    fprintf (stderr, "\nITERATION:   %i\n", iteration);
    fprintf(stderr,  "DISCREPANCY: %.4f\n", D);
    fprintf(stderr,  "NORM:        %.2f\n", norm);
    if (prev_D > -1.0) {
        fprintf(stderr,  "IMPROVEMENT: %.4f%%\n\n", (1-(D/prev_D))*100);
    }

    fprintf(statsfile, "%i\t%.4f\t%.4f\t%i\t", iteration, D, norm, count_df_evaluations);

    for (MEAgamma=1e-5; MEAgamma<1e+6; MEAgamma*=10 ) {
        pl = make_plist(length, 1e-4/(1+MEAgamma));
        mea = MEA(pl, ss, MEAgamma);
        mea_en = energy_of_struct(seq, ss);
        fprintf(statsfile,"%s,%.2e;", ss, MEAgamma);
        free(pl);
    }
    fprintf(statsfile, "\t");

    // Stochastic backtracking

    fprintf(stderr, "Sampling structures...\n");

    if (sample_structure) {

        char* best_structure;
        char* curr_structure;
        double x;

        double curr_energy = 0.0;
        double min_energy = +1.0;
        int curr_distance =  0;
        int min_distance = 999999;

        best_structure = (char *) space((unsigned) length+1);

        for (i=1; i<=10000; i++) {

            curr_structure = pbacktrack_pb(seq);
            curr_energy = energy_of_struct(seq, curr_structure);
            curr_distance = 0.0;

            //fprintf(stderr, "%s%.2f ", curr_structure, curr_energy);

            for (j = 1; j <= length; j++) {

                if (q_unpaired[j] > -0.5) {
                    x = (curr_structure[j-1] == '.') ? 1.0 : 0.0;
                    curr_distance += abs(x-q_unpaired[j]);
                }
            }

            if (curr_distance < min_distance) {
                min_distance = curr_distance;
                min_energy = curr_energy;
                strcpy(best_structure, curr_structure);
            }

            if (curr_distance == min_distance) {
                if (curr_energy < min_energy) {
                    min_energy = curr_energy;
                    strcpy(best_structure, curr_structure);
                }
            }

            //fprintf(stderr, "%i\n", curr_distance);
            free(curr_structure);
        }

        //fprintf(stderr, "\n%s %.2f %i\n", best_structure, min_energy, min_distance);
        fprintf(statsfile, "\t%s\t%.2f\t%i\t", best_structure, min_energy, min_distance);

    } else {
        fprintf(statsfile, "NA\tNA\tNA\t");
    }

    for (i = 1; i <= length; i++) {
        fprintf(statsfile, "%.4f", epsilon[i]);
        if (!(i==length)) {
            fprintf(statsfile, ",");
        }
    }

    now = time ( NULL );
    tm = localtime ( &now );

    strftime ( timestamp, 40, "%Y-%m-%d %X", tm );

    fprintf(statsfile, "\t%s\n", timestamp);

    /* Print dotplot only if not noPS is given and function call asks for it */
    if (!noPS && printPS) {

        /* Print dotplot */
        sprintf(fname,"%s/iteration%i.ps", psDir, iteration);
        pl1 = make_plist(length, 1e-5);

        if (struc != NULL) {
            pl2 = b2plist(struc);
        } else {
            pl2 = NULL;
        }
        sprintf(title,"Iteration %i, D = %.4f", iteration, D);
        (void) PS_dot_plot_list_epsilon(seq, fname, pl2, pl1, epsilon, title);
    }


    free_pf_arrays();


}
Exemple #4
0
int main(int argc, char *argv[])
{
  char *string, *line;
  char *structure=NULL, *cstruc=NULL;
  char  fname[13], ffname[20], gfname[20];
  char  *ParamFile=NULL;
  char  *ns_bases=NULL, *c;
  int   i, length, l, sym, r;
  double energy, min_en;
  double kT, sfact=1.07;
  int   pf=0, noPS=0, istty;
  int noconv=0;
  int circ=0;

  do_backtrack = 1;
  string=NULL;
  for (i=1; i<argc; i++) {
    if (argv[i][0]=='-')
      switch ( argv[i][1] )
	{
	case 'T':  if (argv[i][2]!='\0') usage();
	  if(i==argc-1) usage();
	  r=sscanf(argv[++i], "%lf", &temperature);
	  if (!r) usage();
	  break;
	case 'p':  pf=1;
	  if (argv[i][2]!='\0')
	    (void) sscanf(argv[i]+2, "%d", &do_backtrack);
	  break;
	case 'n':
	  if ( strcmp(argv[i], "-noGU")==0) noGU=1;
	  if ( strcmp(argv[i], "-noCloseGU")==0) no_closingGU=1;
	  if ( strcmp(argv[i], "-noLP")==0) noLonelyPairs=1;
	  if ( strcmp(argv[i], "-noPS")==0) noPS=1;
	  if ( strcmp(argv[i], "-nsp") ==0) {
	    if (i==argc-1) usage();
	    ns_bases = argv[++i];
	  }
	  if ( strcmp(argv[i], "-noconv")==0) noconv=1;
	  break;
	case '4':
	  tetra_loop=0;
	  break;
	case 'e':
	  if(i==argc-1) usage();
	  r=sscanf(argv[++i],"%d", &energy_set);
	  if (!r) usage();
	  break;
	case 'C':
	  fold_constrained=1;
	  break;
	case 'c':
	  if ( strcmp(argv[i], "-circ")==0) circ=1;
	  break;
	case 'S':
	  if(i==argc-1) usage();
	  r=sscanf(argv[++i],"%lf", &sfact);
	  if (!r) usage();
	  break;
	case 'd': dangles=0;
	  if (argv[i][2]!='\0') {
	    r=sscanf(argv[i]+2, "%d", &dangles);
	    if (r!=1) usage();
	  }
	  break;
	case 'P':
	  if (i==argc-1) usage();
	  ParamFile = argv[++i];
	  break;
	default: usage();
	}
  }

  if (circ && noLonelyPairs)
    fprintf(stderr, "warning, depending on the origin of the circular sequence, some structures may be missed when using -noLP\nTry rotating your sequence a few times\n");
  if (ParamFile != NULL)
    read_parameter_file(ParamFile);

  if (ns_bases != NULL) {
    nonstandards = space(33);
    c=ns_bases;
    i=sym=0;
    if (*c=='-') {
      sym=1; c++;
    }
    while (*c!='\0') {
      if (*c!=',') {
	nonstandards[i++]=*c++;
	nonstandards[i++]=*c;
	if ((sym)&&(*c!=*(c-1))) {
	  nonstandards[i++]=*c;
	  nonstandards[i++]=*(c-1);
	}
      }
      c++;
    }
  }
  istty = isatty(fileno(stdout))&&isatty(fileno(stdin));
  if ((fold_constrained)&&(istty)) {
    printf("Input constraints using the following notation:\n");
    printf("| : paired with another base\n");
    printf(". : no constraint at all\n");
    printf("x : base must not pair\n");
    printf("< : base i is paired with a base j<i\n");
    printf("> : base i is paired with a base j>i\n");
    printf("matching brackets ( ): base i pairs base j\n");
  }

  do {				/* main loop: continue until end of file */
    if (istty) {
      printf("\nInput string (upper or lower case); @ to quit\n");
      printf("%s%s\n", scale1, scale2);
    }
    fname[0]='\0';
    if ((line = get_line(stdin))==NULL) break;

    /* skip comment lines and get filenames */
    while ((*line=='*')||(*line=='\0')||(*line=='>')) {
      if (*line=='>')
	(void) sscanf(line, ">%12s", fname);
      printf("%s\n", line);
      free(line);
      if ((line = get_line(stdin))==NULL) break;
    }

    if ((line ==NULL) || (strcmp(line, "@") == 0)) break;

    string = (char *) space(strlen(line)+1);
    (void) sscanf(line,"%s",string);
    free(line);
    length = (int) strlen(string);

    structure = (char *) space((unsigned) length+1);
    if (fold_constrained) {
      cstruc = get_line(stdin);
      if (cstruc!=NULL)
	strncpy(structure, cstruc, length);
      else
	fprintf(stderr, "constraints missing\n");
    }
    for (l = 0; l < length; l++) {
      string[l] = toupper(string[l]);
      if (!noconv && string[l] == 'T') string[l] = 'U';
    }
    if (istty)
      printf("length = %d\n", length);

    /* initialize_fold(length); */
    if (circ)
      min_en = circfold(string, structure);
    else
      min_en = fold(string, structure);
    printf("%s\n%s", string, structure);
    if (istty)
      printf("\n minimum free energy = %6.2f kcal/mol\n", min_en);
    else
      printf(" (%6.2f)\n", min_en);

    (void) fflush(stdout);

    if (fname[0]!='\0') {
      strcpy(ffname, fname);
      strcat(ffname, "_ss.ps");
      strcpy(gfname, fname);
      strcat(gfname, "_ss.g");
    } else {
      strcpy(ffname, "rna.ps");
      strcpy(gfname, "rna.g");
    }
    if (!noPS) {
      if (length<2000)
	(void) PS_rna_plot(string, structure, ffname);
      else 
	fprintf(stderr,"INFO: structure too long, not doing xy_plot\n");
    }
    if (length>2000) free_arrays(); 
    if (pf) {
      char *pf_struc;
      pf_struc = (char *) space((unsigned) length+1);
	if (dangles==1) {
	  dangles=2;   /* recompute with dangles as in pf_fold() */
	  min_en = (circ) ? energy_of_circ_struct(string, structure) : energy_of_struct(string, structure);
	  dangles=1;
      }

      kT = (temperature+273.15)*1.98717/1000.; /* in Kcal */
      pf_scale = exp(-(sfact*min_en)/kT/length);
      if (length>2000) fprintf(stderr, "scaling factor %f\n", pf_scale);

      (circ) ? init_pf_circ_fold(length) : init_pf_fold(length);

      if (cstruc!=NULL)
	strncpy(pf_struc, cstruc, length+1);
      energy = (circ) ? pf_circ_fold(string, pf_struc) : pf_fold(string, pf_struc);

      if (do_backtrack) {
	printf("%s", pf_struc);
	if (!istty) printf(" [%6.2f]\n", energy);
	else printf("\n");
      }
      if ((istty)||(!do_backtrack))
	printf(" free energy of ensemble = %6.2f kcal/mol\n", energy);
      if (do_backtrack) {
	plist *pl1,*pl2;
	char *cent;
	double dist, cent_en;
	cent = centroid(length, &dist);
	cent_en = (circ) ? energy_of_circ_struct(string, cent) :energy_of_struct(string, cent);
	printf("%s {%6.2f d=%.2f}\n", cent, cent_en, dist);
	free(cent);
	if (fname[0]!='\0') {
	  strcpy(ffname, fname);
	  strcat(ffname, "_dp.ps");
	} else strcpy(ffname, "dot.ps");
	pl1 = make_plist(length, 1e-5);
	pl2 = b2plist(structure);
	(void) PS_dot_plot_list(string, ffname, pl1, pl2, "");
	free(pl2);
	if (do_backtrack==2) {
	  pl2 = stackProb(1e-5);
	  if (fname[0]!='\0') {
	    strcpy(ffname, fname);
	    strcat(ffname, "_dp2.ps");
	  } else strcpy(ffname, "dot2.ps");
	  PS_dot_plot_list(string, ffname, pl1, pl2,
			   "Probabilities for stacked pairs (i,j)(i+1,j-1)");
	  free(pl2);
	}
	free(pl1);
	free(pf_struc);
      }
      printf(" frequency of mfe structure in ensemble %g; ",
	     exp((energy-min_en)/kT));
      if (do_backtrack)
	printf("ensemble diversity %-6.2f", mean_bp_dist(length));

      printf("\n");
      free_pf_arrays();

    }
    if (cstruc!=NULL) free(cstruc);
    (void) fflush(stdout);
    free(string);
    free(structure);
  } while (1);
  return 0;
}