Пример #1
0
void EM::run_em(NaiveBayesClassifier* classifier, int** feature_vectors, int** labeled_docs, int** unlabeled_docs, int number_unique_words,int number_unlabeled_documents, int number_labeled_documents, int number_labels)
{
    
    /*Initialize arrays to store old parameters*/
    
    long double** old_likelihood = (long double**)malloc(sizeof(long double*)*number_labels);
    long double* old_prior = (long double*)malloc(sizeof(long double)*number_labels);
    
    for(int i = 0;i<number_labels;i++)
        old_likelihood[i] = (long double*)malloc(sizeof(long double)*number_unique_words);
    
    
	/*Initial Step*/
	//Construct classifier with labeled feature_vectors
    
    classifier->calculate_likelihood(labeled_docs, number_unique_words, number_labeled_documents, number_labels);

    classifier->calculate_prior(labeled_docs, number_labeled_documents, number_labels);
    
	for(;;)
	{
        break;
		/*E Step*/
        
        printf("Performing E Step\n");
        classify_all_unlabeled_documents(classifier, unlabeled_docs, number_unique_words, number_unlabeled_documents, number_labels);
        
        //ConsolePrint::print_2d_int(number_unique_words, number_unlabeled_documents, unlabeled_docs);
        //ConsolePrint::print_2d_int(number_unique_words, number_labeled_documents, labeled_docs);
        //ConsolePrint::print_2d_int(number_unique_words, number_labeled_documents+number_unlabeled_documents, feature_vectors);
		
        /*M Step*/
        
        printf("Performing M Step\n");
        copy_parameters(classifier->get_likelihood(), classifier->get_prior(), old_likelihood, old_prior, number_labels, number_unique_words);
        
		classifier->calculate_likelihood(feature_vectors, number_unique_words, number_labeled_documents+number_unlabeled_documents, number_labels);

		classifier->calculate_prior(feature_vectors,number_labeled_documents+number_unlabeled_documents,number_labels);

		/*Check for convergence*/
        
        if(check_if_converged(old_likelihood,old_prior,classifier->get_likelihood(),classifier->get_prior(),number_labels,number_unique_words))
            break;
        
		
	}
    
    printf("EM Process Done\n");
}
/**
 * A function to create a new load balancer.
 * @param[in] api The deltacloud_api structure representing the connection
 * @param[in] name The name to give to the new load balancer
 * @param[in] realm_id The realm ID to put the new load balancer in
 * @param[in] protocol The protocol to load balance
 * @param[in] balancer_port The port the load balancer listens on
 * @param[in] instance_port The port the load balancer balances to
 * @param[in] params An array of deltacloud_create_parameter structures that
 *                   represent any optional parameters to pass into the
 *                   create call
 * @param[in] params_length An integer describing the length of the params
 *                          array
 * @returns 0 on success, -1 on error
 */
int deltacloud_create_loadbalancer(struct deltacloud_api *api, const char *name,
				   const char *realm_id, const char *protocol,
				   int balancer_port, int instance_port,
				   struct deltacloud_create_parameter *params,
				   int params_length)
{
  struct deltacloud_create_parameter *internal_params;
  int ret = -1;
  int pos;

  if (!valid_api(api) || !valid_arg(name) || !valid_arg(realm_id)
      || !valid_arg(protocol))
    return -1;
  if (balancer_port < 0 || balancer_port > 65536) {
    invalid_argument_error("balancer_port must be between 0 and 65536");
    return -1;
  }
  if (instance_port < 0 || instance_port > 65536) {
    invalid_argument_error("instance_port must be between 0 and 65536");
    return -1;
  }

  if (params_length < 0) {
    invalid_argument_error("params_length must be >= 0");
    return -1;
  }

  internal_params = calloc(params_length + 5,
			   sizeof(struct deltacloud_create_parameter));
  if (internal_params == NULL) {
    oom_error();
    return -1;
  }

  pos = copy_parameters(internal_params, params, params_length);
  if (pos < 0)
    /* copy_parameters already set the error */
    goto cleanup;

  if (deltacloud_prepare_parameter(&internal_params[pos++], "name", name) < 0 ||
      deltacloud_prepare_parameter(&internal_params[pos++], "realm_id", realm_id) < 0 ||
      deltacloud_prepare_parameter(&internal_params[pos++], "listener_protocol", protocol) < 0)
    /* deltacloud_prepare_parameter already set the error */
    goto cleanup;

  if (prepare_int_parameter(&internal_params[pos++], "listener_balancer_port",
			    balancer_port) < 0)
    /* prepare_int_parameter already set the error */
    goto cleanup;

  if (prepare_int_parameter(&internal_params[pos++], "listener_instance_port",
			    instance_port) < 0)
    /* prepare_int_parameter already set the error */
    goto cleanup;

  if (internal_create(api, "load_balancers", internal_params, pos, NULL,
		      NULL) < 0)
    /* internal_create already set the error */
    goto cleanup;

  ret = 0;

 cleanup:
  free_parameters(internal_params, pos);
  SAFE_FREE(internal_params);

  return ret;
}
static int lb_register_unregister(struct deltacloud_api *api,
				  struct deltacloud_loadbalancer *balancer,
				  const char *instance_id,
				  struct deltacloud_create_parameter *params,
				  int params_length,
				  const char *link)
{
  struct deltacloud_link *thislink;
  struct deltacloud_create_parameter *internal_params;
  char *href;
  int ret = -1;
  int pos;
  int rc;

  if (!valid_api(api) || !valid_arg(balancer) || !valid_arg(instance_id))
    return -1;

  if (params_length < 0) {
    invalid_argument_error("params_length must be >= 0");
    return -1;
  }

  thislink = api_find_link(api, "load_balancers");
  if (thislink == NULL)
    /* api_find_link set the error */
    return -1;

  internal_params = calloc(params_length + 1,
			   sizeof(struct deltacloud_create_parameter));
  if (internal_params == NULL) {
    oom_error();
    return -1;
  }

  pos = copy_parameters(internal_params, params, params_length);
  if (pos < 0)
    /* copy_parameters already set the error */
    goto cleanup;

  if (deltacloud_prepare_parameter(&internal_params[pos++], "instance_id",
				   instance_id) < 0)
    /* deltacloud_prepare_parameter already set the error */
    goto cleanup;

  if (asprintf(&href, "%s/%s/%s", thislink->href, balancer->id, link) < 0) {
    oom_error();
    goto cleanup;
  }

  rc = internal_post(api, href, internal_params, pos, NULL, NULL);
  SAFE_FREE(href);
  if (rc < 0)
    /* internal_post already set the error */
    goto cleanup;

  ret = 0;

 cleanup:
  free_parameters(internal_params, pos);
  SAFE_FREE(internal_params);

  return ret;
}
void parameter_test(Tree &T, Model &Mod, long Nrep, long length, double eps, std::vector<double> &pvals, std::string data_prefix, bool save_mc_exact){

  long iter;
  long i, r;

  double df, C;
  double distance, KL;
	KL=0;
	distance=0;
  double likel;


  Parameters Parsim, Par, Par_noperm;
  Alignment align;
  Counts data;

  double eps_pseudo = 0.001;     // Amount added to compute the pseudo-counts.

  StateList sl;

  bool save_data = (data_prefix != "");


  std::string output_filename;
  std::stringstream output_index;
  std::ofstream logfile;
  std::ofstream logdistfile;

  std::ofstream out_chi2;
  std::ofstream out_br;
  std::ofstream out_brPerc;

  std::ofstream out_pvals;
  std::ofstream out_pvals_noperm;
  std::ofstream out_qvals;
  std::ofstream out_bound;
  std::ofstream out_variances;
  std::ofstream out_qvalsComb;
  std::ofstream out_qvalsCombzscore;
  std::ofstream out_covmatrix;

  std::ofstream out_parest;
  std::ofstream out_parsim;

  std::vector<double> KLe;
  std::vector<std::vector<double> > chi2_array; // an array of chi2 for every edge.
  std::vector<std::vector<double> > mult_array; // an array of mult for every edge.
  std::vector<std::vector<double> > br_array; // an array of br. length for every edge.
  std::vector<std::vector<double> > br_arrayPerc; // an array of br. length for every edge.

  std::vector<std::vector<double> > cota_array; // an array of upper bounds of the diff in lengths for every edge.
  std::vector<std::vector<double> > pval_array; // an array of pvals for every edge.
  std::vector<std::vector<double> > pval_noperm_array;
  std::vector<std::vector<double> > qval_array; // an array of qvalues for every edge.
  std::vector<std::vector<double> > variances_array; // an array of theoretical variances.
  std::vector<std::vector<double> > parest_array; // array of estimated parameters
  std::vector<std::vector<double> > parsim_array; // array of simulation parameters

	//  ci_binom ci_bin; // condfidence interval
  std::vector<std::vector<ci_binom> > CIbinomial ; //  	vector of CIs

  std::list<long> produced_nan;

  long npars = T.nedges*Mod.df + Mod.rdf;

  // Initializing pvals
  pvals.resize(T.nedges);

  // Initialize the parameters for simulation of K81 data for testing
  Par = create_parameters(T);
  Parsim = create_parameters(T);

  // Initializing data structures
  KLe.resize(T.nedges);

	pval_array.resize(T.nedges);
        pval_noperm_array.resize(T.nedges);
        qval_array.resize(T.nedges);
	chi2_array.resize(T.nedges);
	mult_array.resize(T.nedges);
	br_array.resize(T.nedges);
        br_arrayPerc.resize(T.nedges);
	cota_array.resize(T.nedges);
        variances_array.resize(npars);
        parest_array.resize(npars);
        parsim_array.resize(npars);

	// initialize to 0's
  for (i=0; i < T.nedges; i++) {
      pval_array[i].resize(Nrep, 0);
      pval_noperm_array[i].resize(Nrep, 0);
      qval_array[i].resize(Nrep, 0);
          chi2_array[i].resize(Nrep, 0);
	  mult_array[i].resize(Nrep, 0);
	  br_array[i].resize(Nrep, 0);
          br_arrayPerc[i].resize(Nrep, 0);
	  cota_array[i].resize(Nrep, 0);
  }

  for(i=0; i < npars; i++) {
    variances_array[i].resize(Nrep, 0);
    parest_array[i].resize(Nrep, 0);
    parsim_array[i].resize(Nrep, 0);
  }

  // Information about the chi^2.
  df = Mod.df;
  C = get_scale_constant(Mod);


  if (save_data) {
    logfile.open((data_prefix + ".log").c_str(), std::ios::out);
    logfile << "model:  " << Mod.name << std::endl;
    logfile << "length: " << length << std::endl;
    logfile << "eps:    " << eps << std::endl;
    logfile << "nalpha: " << T.nalpha << std::endl;
    logfile << "leaves: " << T.nleaves << std::endl;
    logfile << "tree:   " << T.tree_name << std::endl;
    logfile << std::endl;
    logdistfile.open((data_prefix + ".dist.log").c_str(), std::ios::out);

    out_chi2.open(("out_chi2-" + data_prefix + ".txt").c_str(), std::ios::out);
    out_br.open(("out_br-" + data_prefix + ".txt").c_str(), std::ios::out);
    out_brPerc.open(("out_brPerc-" + data_prefix + ".txt").c_str(), std::ios::out);
    out_pvals.open(("out_pvals-" + data_prefix + ".txt").c_str(), std::ios::out);
    out_pvals_noperm.open(("out_pvals_noperm-" + data_prefix + ".txt").c_str(), std::ios::out);
    out_qvals.open(("out_qvals-" + data_prefix + ".txt").c_str(), std::ios::out);
    out_variances.open(("out_variances-" + data_prefix + ".txt").c_str(), std::ios::out);
    out_parest.open(("out_params-est-" + data_prefix + ".txt").c_str(), std::ios::out);
    out_parsim.open(("out_params-sim-" + data_prefix + ".txt").c_str(), std::ios::out);
    out_bound.open(("out_bound-" + data_prefix + ".txt").c_str(), std::ios::out);
    out_qvalsComb.open(("out_qvalsComb-" + data_prefix + ".txt").c_str(), std::ios::out);
    out_qvalsCombzscore.open(("out_qvalsCombzscore-" + data_prefix + ".txt").c_str(), std::ios::out);

    out_parsim.precision(15);
    out_parest.precision(15);
    out_variances.precision(15);
  }


	// uncomment the 2 following lines if want to fix the parameters
	// random_parameters_length(T, Mod, Parsim);
	 //random_data(T, Mod, Parsim, length, align);

  for (iter=0; iter < Nrep; iter++) {
    std::cout << "iteration: " << iter << "             \n";

    // Produces an alignment from random parameters
    random_parameters_length(T, Mod, Parsim);


    random_data(T, Mod, Parsim, length, align);
    get_counts(align, data);
    add_pseudocounts(eps_pseudo, data);

    // Saving data
    if (save_data) {
      output_index.str("");
      output_index << iter;
      output_filename = data_prefix + "-" + output_index.str();
      save_alignment(align, output_filename + ".fa");
      save_parameters(Parsim, output_filename + ".sim.dat");
    }

    // Runs the EM
    std::tie(likel, iter) = EMalgorithm(T, Mod, Par, data, eps);

    // If algorithm returns NaN skip this iteration.
    if (boost::math::isnan(likel)) {
      produced_nan.push_back(iter);
      continue;
    }
    copy_parameters(Par, Par_noperm);

    // Chooses the best permutation.
    guess_permutation(T, Mod, Par);

    distance = parameters_distance(Parsim, Par);

      // estimated counts: Par ; original: Parsim
      std::vector<double> counts_est;
      counts_est.resize(T.nalpha, 0);


      // calculate the cov matrix
      std::vector<std::vector<double> > Cov;
      Array2 Cov_br;

      full_MLE_covariance_matrix(T, Mod, Parsim, length, Cov);

      if(save_data) {
          save_matrix(Cov, output_filename + ".cov.dat");
      }

      // Save the covariances in an array
      std::vector<double> param;
      std::vector<double> param_sim;

      param.resize(npars);
      param_sim.resize(npars);

      get_free_param_vector(T, Mod, Par, param);
      get_free_param_vector(T, Mod, Parsim, param_sim);

      for(i=0; i < npars; i++) {
	variances_array[i][iter] = Cov[i][i];
        parsim_array[i][iter] = param_sim[i];
        parest_array[i][iter] = param[i];
      }

      std::vector<double> xbranca, xbranca_noperm, mubranca;
      double chi2_noperm;
      xbranca.resize(Mod.df);
      xbranca_noperm.resize(Mod.df);
      mubranca.resize(Mod.df);
      for (i=0; i < T.nedges; i++) {
		  r = 0; // row to be fixed
		  // Extracts the covariance matrix, 1 edge

				  branch_inverted_covariance_matrix(Mod, Cov, i, Cov_br);
                  get_branch_free_param_vector(T, Mod, Parsim, i, mubranca);
                  get_branch_free_param_vector(T, Mod, Par, i, xbranca);
                  get_branch_free_param_vector(T, Mod, Par_noperm, i, xbranca_noperm);

			      chi2_array[i][iter] = chi2_mult(mubranca, xbranca, Cov_br);
                  chi2_noperm = chi2_mult(mubranca, xbranca_noperm, Cov_br);


                  pval_array[i][iter] =  pvalue_chi2(chi2_array[i][iter], Mod.df);
                  pval_noperm_array[i][iter] = pvalue_chi2(chi2_noperm, Mod.df);

			      br_array[i][iter] = T.edges[i].br - branch_length(Par.tm[i], T.nalpha);
				  br_arrayPerc[i][iter] = branch_length(Par.tm[i], T.nalpha)/T.edges[i].br;


		// Upper bound on the parameter distance using multinomial:
		//  cota_array[i][iter] = bound_mult(Parsim.tm[i], Xm, length);
	    // and using the  L2 bound
		  cota_array[i][iter] = branch_length_error_bound_mult(Parsim.tm[i], Par.tm[i]);
		  out_br <<  br_array[i][iter]  << " ";
		  out_brPerc <<  br_arrayPerc[i][iter]  << " ";

		  out_bound  <<  cota_array[i][iter] << " ";
		  out_chi2 << chi2_array[i][iter] << " ";

			}
         out_chi2 << std::endl;
		 out_bound  <<  std::endl;
		 out_br << std::endl;
		 out_brPerc << std::endl;




    // Saves more data.
    if (save_data) {
      logfile << iter << ": " << distance << "   " << KL << std::endl;
      save_parameters(Par, output_filename + ".est.dat");

      logdistfile << iter << ": ";
      logdistfile << parameters_distance_root(Par, Parsim) << " ";
      for(int j=0; j < T.nedges; j++) {
        logdistfile << parameters_distance_edge(Par, Parsim, j) << " ";
      }
      logdistfile << std::endl;
    }

} // close iter loop here

  // Correct the p-values
  for(i=0; i < T.nedges; i++) {
       BH(pval_array[i], qval_array[i]);
	//save them
  }

  if (save_mc_exact) {
    for(long iter=0; iter < Nrep; iter++) {
      for(long i=0; i < T.nedges; i++) {
        out_pvals << pval_array[i][iter] << "  ";
        out_pvals_noperm << pval_noperm_array[i][iter] << "  ";
        out_qvals << qval_array[i][iter] << "  ";
      }
      out_pvals  << std::endl;
      out_pvals_noperm << std::endl;
      out_qvals  << std::endl;

      for(long i=0; i < npars; i++) {
        out_variances << variances_array[i][iter] << "  ";
        out_parsim << parsim_array[i][iter] << "  ";
        out_parest << parest_array[i][iter] << "  ";
      }
      out_variances << std::endl;
      out_parsim << std::endl;
      out_parest << std::endl;
    }
  }

	// now combine the pvalues
   for(i=0; i < T.nedges; i++) {
	pvals[i] = Fisher_combined_pvalue(pval_array[i]);
    //using the Zscore it goes like this: pvals[i] = Zscore_combined_pvalue(pval_array[i]);
	if (save_mc_exact) {	   out_qvalsComb <<  pvals[i] << "  " ;
	out_qvalsCombzscore << Zscore_combined_pvalue(pval_array[i]) << " ";
	}
  }

  // Close files
  if (save_data) {
    logdistfile.close();
    logfile.close();
  }

if (save_mc_exact) {
	out_chi2.close();
	out_bound.close();
        out_variances.close();
        out_parest.close();
        out_parsim.close();
	out_br.close();
	out_brPerc.close();

	out_pvals.close();
        out_qvals.close();
	out_qvalsComb.close();
	out_qvalsCombzscore.close();
        out_covmatrix.close();
	}

  // Warn if some EM's produced NaN.
  if (produced_nan.size() > 0) {
    std::cout << std::endl;
    std::cout << "WARNING: Some iterations produced NaN." << std::endl;
    std::list<long>::iterator it;
    for (it = produced_nan.begin(); it != produced_nan.end(); it++) {
      std::cout << *it << ", ";
    }
    std::cout << std::endl;
  }
}
/**
 * A function to create a new storage snapshot.
 * @param[in] api The deltacloud_api structure representing the connection
 * @param[in] volume The volume to take the snapshot from
 * @param[in] params An array of deltacloud_create_parameter structures that
 *                   represent any optional parameters to pass into the
 *                   create call
 * @param[in] params_length An integer describing the length of the params
 *                          array
 * @param[out] snap_id The snapshot_id returned by the create call
 * @returns 0 on success, -1 on error
 */
int deltacloud_create_storage_snapshot(struct deltacloud_api *api,
				       struct deltacloud_storage_volume *volume,
				       struct deltacloud_create_parameter *params,
				       int params_length,
				       char **snap_id)
{
  struct deltacloud_create_parameter *internal_params;
  struct deltacloud_storage_snapshot snap;
  char *data = NULL;
  int ret = -1;
  int pos;

  if (!valid_api(api) || !valid_arg(volume))
    return -1;

  if (params_length < 0) {
    invalid_argument_error("params_length must be >= 0");
    return -1;
  }

  internal_params = calloc(params_length + 1,
			   sizeof(struct deltacloud_create_parameter));
  if (internal_params == NULL) {
    oom_error();
    return -1;
  }

  pos = copy_parameters(internal_params, params, params_length);
  if (pos < 0)
    /* copy_parameters already set the error */
    goto cleanup;

  if (deltacloud_prepare_parameter(&internal_params[pos++], "volume_id",
				   volume->id) < 0)
    /* deltacloud_create_parameter already set the error */
    goto cleanup;

  if (internal_create(api, "storage_snapshots", internal_params, pos, &data,
		      NULL) < 0)
    /* internal_create already set the error */
    goto cleanup;

  if (snap_id != NULL) {
    if (internal_xml_parse(data, "storage_snapshot", parse_one_storage_snapshot,
			   1, &snap) < 0)
      /* internal_xml_parse set the error */
      goto cleanup;

    *snap_id = strdup(snap.id);
    deltacloud_free_storage_snapshot(&snap);
    if (*snap_id == NULL) {
      oom_error();
      goto cleanup;
    }
  }

  ret = 0;

 cleanup:
  free_parameters(internal_params, pos);
  SAFE_FREE(internal_params);
  SAFE_FREE(data);

  return ret;
}