Пример #1
0
double CFS::symmetrical_uncertainity (unsigned int i, unsigned int j)
{
  // Symmetrical uncertainty between the variables indexed by i and j:
  //
  //   SU = 2 * ((H(a) + H(b) - H(a,b)) / (H(a) + H(b)))
  //
  // Marginal entropies are cached in H[]; the value -1 marks an entry that
  // has not been computed yet.
  //
  if (H[i] == -1)
    H[i] = compute_entropy (i);

  if (H[j] == -1)
    H[j] = compute_entropy (j);

  // Nothing to normalise by: report 0 rather than dividing by zero.
  //
  if ((H[i] + H[j]) == 0)
    return 0;

  double joint_entropy = compute_joint_entropy (i, j);

  // When both indices coincide the second marginal is taken from H[n]
  // (presumably the entropy of the class Y -- confirm against
  // compute_joint_entropy); otherwise it is H[j].
  //
  double second_entropy = (i == j) ? H[n] : H[j];

  return 2 * ((H[i] + second_entropy - joint_entropy) / (H[i] + second_entropy));
}
Пример #2
0
// Score a binary split of `labels` into `neg_labels` and `pos_labels`.
// The returned value is 2 * info_gain / (class_entropy + split_entropy).
double Dectree_class::compute_erf_entropy(const cv::Mat& labels, const cv::Mat& neg_labels, const cv::Mat& pos_labels)
{
	// Entropy of the labels before the split.
	const double entropy_before = compute_entropy(labels);

	// Share of the rows that lands on each side. The original author notes that
	// the base cases of the learning algorithm guarantee labels.rows > 0.
	const double neg_prob = ((double)neg_labels.rows/labels.rows);
	const double pos_prob = ((double)pos_labels.rows/labels.rows);

	// Information gain: entropy before minus the row-weighted entropy after.
	const double weighted_neg = neg_prob*compute_entropy(neg_labels);
	const double weighted_pos = pos_prob*compute_entropy(pos_labels);
	const double entropy_after = weighted_neg + weighted_pos;
	const double info_gain = entropy_before - entropy_after;

	// Entropy of the partition itself. A side whose share is within FLT_EPSILON
	// of 0 or of 1 contributes nothing.
	const bool neg_contributes = fabs(neg_prob-0.) > FLT_EPSILON && fabs(neg_prob-1.) > FLT_EPSILON;
	const bool pos_contributes = fabs(pos_prob-0.) > FLT_EPSILON && fabs(pos_prob-1.) > FLT_EPSILON;

	double split_entropy = 0;
	if( neg_contributes )
		split_entropy += neg_prob*log2(neg_prob);
	if( pos_contributes )
		split_entropy += pos_prob*log2(pos_prob);
	split_entropy *= -1;

	//for debugging
	/*
	std::cout << "Class entropy: " << entropy_before << std::endl;
	std::cout << "Info gain: " << info_gain << std::endl;
	std::cout << "Split entropy: " << split_entropy << std::endl;
	*/

	// NOTE(review): the denominator is zero when both entropies are zero.
	return (2*info_gain)/(entropy_before+split_entropy);
}
Пример #3
0
// Walk every k-mer window of `sequence` and report whether all of them pass
// the coverage / entropy / connectivity filters. One coverage value per
// window is appended to `coverage_counter` (0 for a window that contains a
// non-GATC character or is unknown to kcounter), even after a failure.
bool IRKE::sequence_path_exists(string &sequence, unsigned int min_coverage, float min_entropy, float min_connectivity,
                                vector<unsigned int> &coverage_counter)
{
    const unsigned int kmer_length = kcounter.get_kmer_length();

    // Shorter than one k-mer: there is no window to inspect.
    if (sequence.length() < kmer_length) {
        return (false);
    }

    bool all_windows_ok = true;
    string previous_kmer;

    for (unsigned int start = 0; start <= sequence.length() - kmer_length; start++) {

        string current_kmer = sequence.substr(start, kmer_length);

        if (contains_non_gatc(current_kmer) || !kcounter.kmer_exists(current_kmer)) {
            all_windows_ok = false;
            coverage_counter.push_back(0);
        }
        else {
            unsigned int count = kcounter.get_kmer_count(current_kmer);
            coverage_counter.push_back(count);

            float entropy = compute_entropy(current_kmer);

            if (count < min_coverage || entropy < min_entropy) {
                all_windows_ok = false;
            }
        }

        // The first window has no predecessor; from the second one on, the
        // link to the previous k-mer is checked as long as nothing failed yet.
        if (start > 0 && all_windows_ok
            && !exceeds_min_connectivity(kcounter, previous_kmer, current_kmer, min_connectivity)) {
            all_windows_ok = false;
        }

        previous_kmer = current_kmer;
    }

    return (all_windows_ok);
}
Пример #4
0
CFS::CFS (ElementSet * a_set)
{
  set = a_set;
  n = set->get_set_cardinality ();
  max_feature_value = 0;

  // With an empty set nothing is allocated: number_of_rows and every table
  // pointer below are left unassigned by this constructor.
  //
  if (!(n > 0))
    return;

  number_of_rows = set->get_element (0)->get_number_of_values ();

  // The largest value reported by any element sizes the probability tables.
  //
  for (unsigned int feature = 0; feature < n; feature++)
    if (set->get_element (feature)->get_max_value () > max_feature_value)
      max_feature_value = set->get_element (feature)->get_max_value ();

  // Pr_Y is allocated here but not zero-filled.
  //
  Pr_Y = new double [set->get_number_of_labels ()];
  Pr_X = new double [max_feature_value + 1];

  Pr_X1_X2 = new double * [max_feature_value + 1];
  Pr_X1_Y  = new double * [max_feature_value + 1];

  for (unsigned int value = 0; value <= max_feature_value; value++)
  {
    Pr_X [value] = 0;

    // Row of the (value x value) table, zero-filled.
    //
    Pr_X1_X2 [value] = new double [max_feature_value + 1];
    for (unsigned int other = 0; other <= max_feature_value; other++)
      Pr_X1_X2 [value][other] = 0;

    // Row of the (value x label) table, zero-filled.
    //
    Pr_X1_Y [value] = new double [set->get_number_of_labels ()];
    for (unsigned int label = 0; label < set->get_number_of_labels (); label++)
      Pr_X1_Y [value][label] = 0;
  }

  // -1 marks "not computed yet" in both the entropy cache H[0..n-1] and the
  // n x n correlation matrix.
  //
  correlation = new double * [n];
  H = new double [n+1];
  for (unsigned int row = 0; row < n; row++)
  {
    H[row] = -1;
    correlation[row] = new double [n];
    for (unsigned int col = 0; col < n; col++)
      correlation[row][col] = -1;
  }

  // The extra slot H[n] is filled eagerly.
  //
  H[n] = compute_entropy (n);
}
Пример #5
0
// Decide whether `kmer` may be used as an assembly seed. A seed is rejected
// when its count is zero, when it equals its own reverse complement, when its
// count is below MIN_SEED_COVERAGE, or when its entropy is below
// MIN_SEED_ENTROPY. The trailing float parameter is unnamed and unused.
bool IRKE::is_good_seed_kmer(kmer_int_type_t kmer, unsigned int kmer_count, unsigned int kmer_length,
                             float)
{
    if (kmer_count == 0) {
        return (false);
    }

    // Diagnostics are only printed at monitor level 2 and above.
    const bool verbose = (IRKE_COMMON::MONITOR >= 2);

    // Palindromic k-mers (identical to their reverse complement) are avoided as seeds.
    const bool is_palindrome = (kmer == revcomp_val(kmer, kmer_length));
    if (is_palindrome) {
        if (verbose) {
            cerr << "SEED kmer: " << kcounter.get_kmer_string(kmer) << " is palidnromic.  Skipping. " << endl;
        }
        return (false);
    }

    if (kmer_count < MIN_SEED_COVERAGE) {
        if (verbose) {
            cerr << "-seed has insufficient coverage, skipping" << endl;
        }
        return (false);
    }

    const float entropy = compute_entropy(kmer, kmer_length);
    if (entropy < MIN_SEED_ENTROPY) {
        if (verbose) {
            cerr << "-skipping seed due to low entropy: " << entropy << endl;
        }
        return (false);
    }

    // Every filter passed.
    return (true);
}
Пример #6
0
// Generate a fixed number of random candidate nodes, score each one with
// compute_entropy(candidate, pixels, img) and return a copy of the candidate
// with the lowest score. On ties the later candidate wins.
//
// Parameters:
//   img    - forwarded to compute_entropy().
//   mask   - not used by this function.
//   pixels - forwarded to compute_entropy().
//
// The previous implementation started the search from a hard-coded entropy of
// 1.0 and left the winning index uninitialized, so the returned element was
// read through an indeterminate index whenever no candidate scored <= 1.0.
// The search is now seeded from the first valid candidate; the result is
// unchanged whenever at least one candidate scores <= 1.0.
Node NField::generate_node(const std::vector<cv::Mat_<float> >& img,
			   const cv::Mat_<uchar>& mask,
			   std::vector<Pixel>& pixels) {

	const int kNumCandidates = 100;

	std::vector<Node> candidates;
	candidates.reserve(kNumCandidates);
	for (int i = 0; i < kNumCandidates; ++i) {
		candidates.push_back(generate_random_node());
	}

	size_t best_index = 0;       // falls back to the first candidate if none is valid
	double best_entropy = 0.0;   // only meaningful once have_best is true
	bool have_best = false;

	for (size_t j = 0; j < candidates.size(); ++j) {
		const double current_entropy = compute_entropy(candidates[j], pixels, img);

		// A NaN score can never win (x != x holds only for NaN); the original
		// comparison rejected NaN as well.
		if (current_entropy != current_entropy) {
			continue;
		}

		if (!have_best || best_entropy >= current_entropy) {
			best_entropy = current_entropy;
			best_index = j;
			have_best = true;
		}
	}

	return candidates[best_index];
}
double calculate_after_spliting_entropy(const std::vector<int> &feature_value, 
                                         const std::vector<int> &class_value){
    int feature_value_count[MAX_FEATURE_VALUE_NUMBER] = {0};
    int feature_value_class_count[MAX_FEATURE_VALUE_NUMBER][MAX_CLASS_NUMBER] = {0};
    int sample_number = feature_value.size();	
    for (int i = 0; i < sample_number; i++){
        feature_value_count[feature_value[i]]++;
        feature_value_class_count[feature_value[i]][class_value[i]]++;
    }	
    double result = 0;
    for (int i = 0; i < MAX_FEATURE_VALUE_NUMBER; i++){
        if (feature_value_count[i] == 0){
            continue;
        }
        double entropy = 0;
        for (int j = 0; j < MAX_CLASS_NUMBER; j++){
            double p = (double)feature_value_class_count[i][j] / feature_value_count[i];
            entropy += compute_entropy(p);
        }
        result += (double)feature_value_count[i] / sample_number * entropy;
    }	
    return result;
}
Пример #8
0
// Rebuild mixed_product from product by averaging runs of consecutive shells.
// A new averaged shell is emitted each time the mass accumulated in the
// current run exceeds product->star_mass / n.
//
// NOTE(review): shells accumulated after the last emitted bin are never added
// to mixed_product, because add_shell() is only called inside the
// `m_bin > dm_bin` branch -- confirm this is intended.
void mmas::mixing_product(int n) {
  // Drop any previous result and start from an empty model.
  delete mixed_product;
  mixed_product = new usm;

  // Mass threshold that closes a bin.
  real dm_bin = product->star_mass / n;
  
  // m_prev: mass value of the previous shell; m_bin: mass gathered in the open bin.
  real m_prev = 0, m_bin = 0;
  // Mtot is a running total over all shells (never reset); the others are per-bin sums.
  real Mtot = 0, Utot = 0, Vtot = 0, r_mean = 0;
  int  n_in_bin = 0;
  // j is only incremented below; its use as a shell index is commented out.
  int  j = 0;
  // Per-bin, mass-weighted composition sums.
  real H1tot = 0, He4tot = 0, O16tot = 0, N14tot = 0, C12tot = 0, Ne20tot = 0, Mg24tot = 0, Si28tot = 0, Fe56tot = 0;
  
  
  for (int i = 0; i < product->get_num_shells(); i++) {
    mass_shell &shell_i = product->get_shell(i);
    // Mass increment relative to the previous shell (presumably shell.mass is
    // the enclosed mass -- TODO confirm).
    real dm = shell_i.mass - m_prev;
    m_prev  = shell_i.mass;

    r_mean += shell_i.radius;
    n_in_bin += 1;
 
    m_bin += dm;
    Mtot  += dm;
    Vtot  += dm/shell_i.density;
    H1tot   += dm*shell_i.composition.H1;
    He4tot  += dm*shell_i.composition.He4;
    O16tot  += dm*shell_i.composition.O16;
    N14tot  += dm*shell_i.composition.N14;
    C12tot  += dm*shell_i.composition.C12;
    Ne20tot += dm*shell_i.composition.Ne20;
    Mg24tot += dm*shell_i.composition.Mg24;
    Si28tot += dm*shell_i.composition.Si28;
    Fe56tot += dm*shell_i.composition.Fe56;
    Utot  += compute_energy(shell_i.density, shell_i.temperature, shell_i.mean_mu) * dm;

    // Bin is full: turn the sums into one averaged shell.
    if (m_bin > dm_bin) {
//       PRC(j); PRC(n); PRC(Mtot); PRC(m_bin); PRC(dm_bin); PRL(n_in_bin);

      mass_shell shell_j; j++;
//       mass_shell &shell_j = mixed_product->get_shell(j++);
      // Radius is a plain (unweighted) mean; mass is the running total so far;
      // density is bin mass over summed dm/density; composition entries are
      // the summed dm*fraction divided by the bin mass.
      shell_j.radius      = r_mean/n_in_bin;
      shell_j.mass        = Mtot;
      shell_j.density     = m_bin/Vtot;
      shell_j.composition.H1   = H1tot/m_bin;
      shell_j.composition.He4  = He4tot/m_bin;
      shell_j.composition.O16  = O16tot/m_bin;
      shell_j.composition.N14  = N14tot/m_bin;
      shell_j.composition.C12  = C12tot/m_bin;
      shell_j.composition.Ne20 = Ne20tot/m_bin;
      shell_j.composition.Mg24 = Mg24tot/m_bin;
      shell_j.composition.Si28 = Si28tot/m_bin;
      shell_j.composition.Fe56 = Fe56tot/m_bin;

      // am(x) = (1 + A/2) / A for the x-th entry of Amass. The macro is not
      // #undef'd in this function, so it stays defined for the rest of the
      // translation unit.
#define am(x) (1.0+Amass[x]/2.0)/Amass[x]
      real Amass[] = {1, 4, 16, 14, 12, 20, 24, 28, 56};
      // mean_mu is the reciprocal of the weighted sum below (H1 uses a fixed
      // weight of 2, every other species uses am()).
      shell_j.mean_mu = 2 * shell_j.composition.H1 + 
	am(1) * shell_j.composition.He4 +
	am(2) * shell_j.composition.O16 +
	am(3) * shell_j.composition.N14 +
	am(4) * shell_j.composition.C12 + 
	am(5) * shell_j.composition.Ne20 + 
	am(6) * shell_j.composition.Mg24 +
	am(7) * shell_j.composition.Si28 + 
	am(8) * shell_j.composition.Fe56;
      shell_j.mean_mu = 1.0/shell_j.mean_mu;

      // Thermodynamic state is derived in a chain: energy -> pressure ->
      // temperature -> entropy.
      shell_j.e_thermal   = Utot/m_bin;
      shell_j.pressure    = compute_pressure(shell_j.density, shell_j.e_thermal, shell_j.mean_mu);
      shell_j.temperature = compute_temperature(shell_j.density, shell_j.pressure, shell_j.mean_mu);
      shell_j.entropy     = compute_entropy(shell_j.density, shell_j.temperature, shell_j.mean_mu);
      mixed_product->add_shell(shell_j);

      // NOTE(review): the subtraction on the next line has no effect because
      // m_bin is set to 0 by the statement right after it; any mass in excess
      // of dm_bin is therefore not carried into the next bin.
      m_bin -= dm_bin;
      m_bin = Utot = Vtot = r_mean = 0;
      H1tot = He4tot = O16tot = N14tot = C12tot = Ne20tot = Mg24tot = Si28tot = Fe56tot = 0;
      n_in_bin = 0;
    }
  }

  mixed_product->build_hashtable();
}
Пример #9
0
void mmas::smooth_product() {
  int n_shells = product->get_num_shells();   /* number of shells in the product */

  smoothing_params params;

  params.arr_x      = new double[n_shells];
  params.arr_y      = new double[n_shells];
  params.smoothed_y = new double[n_shells];

  /* composition */
  cerr << "Smoothing composition\n";
  for (int i = 0; i < n_shells; i++) {
    mass_shell &shell = product->get_shell(i);
    params.arr_x[i] = shell.radius;
    params.arr_x[i] = shell.mass;
    params.arr_y[i] = (4.0/shell.mean_mu - 3.0)/5.0;
  }
  smoothing_integrate(params, n_shells);
  for (int i = 0; i < n_shells; i++) {
    mass_shell &shell = product->get_shell(i);
    shell.mean_mu = 4.0/(5.0*params.smoothed_y[i] + 3);
  }

  /* thermal energy */
  cerr << "Smoothing thermal energy\n";
  for (int i = 0; i < n_shells; i++) {
    mass_shell &shell = product->get_shell(i);
    params.arr_y[i] = compute_energy(shell.density, shell.temperature, shell.mean_mu);
  }
  smoothing_integrate(params, n_shells);
  for (int i = 0; i < n_shells; i++) {
    mass_shell &shell = product->get_shell(i);
    shell.e_thermal =  params.smoothed_y[i];
//     real x = params.arr_x[i];
//     real y = params.arr_y[i];
//     real ys = params.smoothed_y[i];
//     PRC(x); PRC(y); PRL(ys);
  }

  /* density */
//   cerr << "Smoothing density\n";
//   for (int i = 0; i < n_shells; i++) {
//     mass_shell &shell = product->get_shell(i);
//     params.arr_y[i] = shell.density;
//   }
//   smoothing_integrate(params, n_shells);
//   for (int i = 0; i < n_shells; i++) {
//     mass_shell &shell = product->get_shell(i);
//     shell.density = params.smoothed_y[i];
//   }
  
  for (int i = 0; i < n_shells; i++) {
    mass_shell &shell = product->get_shell(i);
    shell.pressure    = compute_pressure(shell.density, shell.e_thermal, shell.mean_mu);
    shell.temperature = compute_temperature(shell.density, shell.pressure, shell.mean_mu);
    shell.entropy     = compute_entropy(shell.density, shell.temperature, shell.mean_mu);
  }

  delete[] params.arr_x;
  delete[] params.arr_y;
  delete[] params.smoothed_y;

}
Пример #10
0
// Write the dump columns for grid cell (i,j,k) into writebuf through myset().
//
// The columns are emitted in a fixed order: optional integer cell position,
// coordinates X and V, primitives, three equation-of-state values, conserved
// quantities, divb, the four 4-vectors of the fluid state, the characteristic
// speeds for directions 1..3, geom.g, and (when compiled in) currents and
// faraday components. As the comment further down says, changing the number
// of output values requires changing numcolumns as well.
//
// Returns 0. The FAILSTATEMENT macro is used on the failure paths; its
// expansion is not visible here.
//
// Several locals (r, th, b, ucon, U, m) are not referenced in the visible
// body, macro expansions aside.
int dump_content(int i, int j, int k, MPI_Datatype datatype,void *writebuf)
{
  int pl;
  FTYPE r, th, vmin[NDIM], vmax[NDIM];
  int ignorecourant;
  struct of_geom geom;
  struct of_state q;
  FTYPE X[NDIM],V[NDIM];
  FTYPE divb;
  FTYPE b[NDIM],ucon[NDIM];
  FTYPE U[NPR];
  FTYPE ftemp;
  FTYPE jcov[NDIM];
  FTYPE fcov[NUMFARADAY];
  FTYPE rho,u,pressure,cs2,Sden;
  int dir,l,m,n,o;


  //////////////
  //
  // some calculations
  //

  coord(i, j, k, CENT, X);
  bl_coord(X, V);
  // if failed, then data output for below invalid, but columns still must exist    

  get_geometry(i, j, k, CENT, &geom);

  // `failed` is not declared in this function (presumably a global run-state flag).
  if (!failed) {
    // Normal case: any error from get_state()/vchar() goes through FAILSTATEMENT.
    // Note that the message text reads "dir=1or2" for all three directions,
    // including dir 3; only the trailing numeric argument differs.
    if (get_state(pdump[i][j][k], &geom, &q) >= 1)
      FAILSTATEMENT("dump.c:dump()", "get_state() dir=0", 1);
    if (vchar(pdump[i][j][k], &q, 1, &geom, &vmax[1], &vmin[1],&ignorecourant) >= 1)
      FAILSTATEMENT("dump.c:dump()", "vchar() dir=1or2", 1);
    if (vchar(pdump[i][j][k], &q, 2, &geom, &vmax[2], &vmin[2],&ignorecourant) >= 1)
      FAILSTATEMENT("dump.c:dump()", "vchar() dir=1or2", 2);
    if (vchar(pdump[i][j][k], &q, 3, &geom, &vmax[3], &vmin[3],&ignorecourant) >= 1)
      FAILSTATEMENT("dump.c:dump()", "vchar() dir=1or2", 3);
  }
  else {// do a per zone check, otherwise set to 0
    // Already-failed case: errors are tolerated per cell and the affected
    // outputs are zeroed so that the columns still exist.
    whocalleducon=1; // force no failure mode, just return like failure, and don't return if failure, just set to 0 and continue
    if (get_state(pdump[i][j][k], &geom, &q) >= 1){
      for (pl = 0; pl < NDIM; pl++)
	q.ucon[pl]=0;
      for (pl = 0; pl < NDIM; pl++)
	q.ucov[pl]=0;
      for (pl = 0; pl < NDIM; pl++)
	q.bcon[pl]=0;
      for (pl = 0; pl < NDIM; pl++)
	q.bcov[pl]=0;
    }
    if (vchar(pdump[i][j][k], &q, 1, &geom, &vmax[1], &vmin[1],&ignorecourant) >= 1){
      vmax[1]=vmin[1]=0;
    }
    
    if (vchar(pdump[i][j][k], &q, 2, &geom, &vmax[2], &vmin[2],&ignorecourant) >= 1){
      vmax[2]=vmin[2]=0;
    }

    if (vchar(pdump[i][j][k], &q, 3, &geom, &vmax[3], &vmin[3],&ignorecourant) >= 1){
      vmax[3]=vmin[3]=0;
    }

    whocalleducon=0; // return to normal state
    
  }


  setfdivb(&divb, pdump, udump, i, j, k); // udump also set externally GODMARK

  //////////////////////////
  //
  // do the assignments
  //
  // if you change # of outputted vars, remember to change numcolumns


  //static
  // Cell position offset by startpos[], written as FTYPE; skipped when GAMMIEDUMP is set.
  if(!GAMMIEDUMP){
    ftemp=(FTYPE)(i+startpos[1]);
    myset(datatype,&ftemp,0,1,writebuf);
    ftemp=(FTYPE)(j+startpos[2]);
    myset(datatype,&ftemp,0,1,writebuf);
    ftemp=(FTYPE)(k+startpos[3]);
    myset(datatype,&ftemp,0,1,writebuf);
  }
  // Three entries of X and of V starting at index 1 (index 0 is skipped).
  myset(datatype,X,1,3,writebuf);
  myset(datatype,V,1,3,writebuf);
  // 9

  ////////////////////////
  //
  // rest dynamic

  // primitives
  // must use PDUMPLOOP() since may be any order unlike NPR loop
  PDUMPLOOP(pl) myset(datatype,&(pdump[i][j][k][pl]),0,1,writebuf); // NPRDUMP

  ////////////
  //
  // output some EOS stuff since in general not simple function of rho0,u
  rho = pdump[i][j][k][RHO];
  u = pdump[i][j][k][UU];


  // Derived from (rho,u) only: pressure, cs2 and Sden as returned by the
  // three helpers below.
  pressure = pressure_rho0_u(rho,u);
  cs2 = cs2_compute(rho,u);
  Sden = compute_entropy(rho,u);
  //  dUdtau = compute_qdot(rho,u);
  
  myset(datatype,&pressure,0,1,writebuf); // 1
  myset(datatype,&cs2,0,1,writebuf); // 1
  myset(datatype,&Sden,0,1,writebuf); // 1
  //  myset(datatype,&dUdtau,0,1,writebuf); // 1

  //////////////////////
  //
  // output the conserved quantities since not easily inverted and at higher order aren't invertable from point primitives
  PDUMPLOOP(pl) myset(datatype,&(udump[i][j][k][pl]),0,1,writebuf); // NPRDUMP
  myset(datatype,&divb,0,1,writebuf); // 1

  // Components of the state computed (or zeroed) above, one value at a time.
  for (pl = 0; pl < NDIM; pl++)
    myset(datatype,&(q.ucon[pl]),0,1,writebuf);
  for (pl = 0; pl < NDIM; pl++)
    myset(datatype,&(q.ucov[pl]),0,1,writebuf);
  for (pl = 0; pl < NDIM; pl++)
    myset(datatype,&(q.bcon[pl]),0,1,writebuf);
  for (pl = 0; pl < NDIM; pl++)
    myset(datatype,&(q.bcov[pl]),0,1,writebuf);
  // 4*4
    
  // vmin/vmax as filled in by vchar() for directions 1, 2 and 3.
  myset(datatype,&vmin[1],0,1,writebuf);
  myset(datatype,&vmax[1],0,1,writebuf);
  myset(datatype,&vmin[2],0,1,writebuf);
  myset(datatype,&vmax[2],0,1,writebuf);
  myset(datatype,&vmin[3],0,1,writebuf);
  myset(datatype,&vmax[3],0,1,writebuf);
  // 6

  // one static term
  myset(datatype,&geom.g,0,1,writebuf); // 1


#if(CALCFARADAYANDCURRENTS) // NIM*2+6*2 = 8+12=20
  // updated 11/16/2003
  // new 10/23/2003
  // current density 
  lower_vec(jcon[i][j][k],&geom,jcov); 
  myset(datatype,jcon[i][j][k],0,NDIM,writebuf); // (NDIM)
  myset(datatype,jcov,0,NDIM,writebuf);// (NDIM)
  // faraday (2*6)
  lowerf(fcon[i][j][k],&geom,fcov);
  myset(datatype,fcon[i][j][k],0,NUMFARADAY,writebuf); //  (6)
  myset(datatype,fcov,0,NUMFARADAY,writebuf); // (6)
#endif

  // The `&& 0` disables this branch unconditionally; it is debug output only.
  if(FLUXB==FLUXCTSTAG && 0){ // DEBUG (change corresponding code in dump.c)
    // uses jrdp3dudebug in gtwod.m that assumes CALCFARADAYANDCURRENTS==0
    for(l=1;l<=COMPDIM;l++) myset(datatype,gp_l[l][i][j][k],0,NPR2INTERP,writebuf); // 3*8 = 24
    for(l=1;l<=COMPDIM;l++) myset(datatype,gp_r[l][i][j][k],0,NPR2INTERP,writebuf); // 3*8 = 24
    myset(datatype,pstagscratch[i][j][k],0,NPR,writebuf); // 8
    for(dir=1;dir<=COMPDIM;dir++) for(pl=B1;pl<=B3;pl++) for(n=0;n<=1;n++) myset(datatype,&pbcorninterp[dir][pl][n][i][j][k],0,1,writebuf); // 3*3*2 = 18
    for(dir=1;dir<=COMPDIM;dir++) for(pl=U1;pl<=U3;pl++) for(n=0;n<=1;n++) for(o=0;o<=1;o++) myset(datatype,&pvcorninterp[dir][pl][n][o][i][j][k],0,1,writebuf); // 3*3*2*2 = 36
  }

  return (0);
}
Пример #11
0
// Choose the attribute whose binary split (attribute value equal to zero
// versus anything else) gives the largest information gain, then partition
// the samples and labels accordingly.
//
// inputs: list of remaining attributes (column indices into `samples`), the
// current examples and their labels.
// returns: a newly allocated dectree_split; releasing it is up to the caller.
// attr_name holds the attribute's column index, attr_idx its position inside
// `attr` (that position changes as the list shrinks).
// If `attr` is empty, attribute 0 / position 0 is used.
dectree_split* Dectree_class::best_split(std::vector<int> attr, const cv::Mat& samples, const cv::Mat& labels)
{
	int best_column = 0;       // column index of the winning attribute
	int best_position = 0;     // its position inside `attr`
	double best_gain = -1.;
	bool have_candidate = false;

	// Entropy of the labels before any split.
	const double entropy_before = compute_entropy(labels);

	int position = 0;
	for(std::vector<int>::iterator it = attr.begin(); it != attr.end(); ++it, ++position)
	{
		const int column = *it;

		// Labels of the examples on each side of the candidate split.
		cv::Mat zero_side_labels(0,1,CV_16SC1);
		cv::Mat nonzero_side_labels(0,1,CV_16SC1);
		for(int row = 0; row < samples.rows; row++)
		{
			const bool is_zero = fabs(samples.at<float>(row,column)-0.) <= FLT_EPSILON;
			if(is_zero)
				zero_side_labels.push_back(labels.at<int>(row));
			else
				nonzero_side_labels.push_back(labels.at<int>(row));
		}

		// Row-weighted entropy after the split, and the resulting gain.
		const double weighted_zero = ((double)zero_side_labels.rows/samples.rows)*compute_entropy(zero_side_labels);
		const double weighted_nonzero = ((double)nonzero_side_labels.rows/samples.rows)*compute_entropy(nonzero_side_labels);
		const double entropy_after = weighted_zero + weighted_nonzero;
		const double gain = entropy_before - entropy_after;

		// The first attribute is always accepted; later ones must be strictly better.
		if(!have_candidate || gain > best_gain)
		{
			have_candidate = true;
			best_column = column;
			best_position = position;
			best_gain = gain;
		}
	}

	// Partition once more with the winning attribute. Redoing the separation
	// here avoids keeping full data copies for every candidate above.
	cv::Mat zero_side_data(0,1,CV_32FC1);
	cv::Mat nonzero_side_data(0,1,CV_32FC1);
	cv::Mat zero_side_final_labels(0,1,CV_16SC1);
	cv::Mat nonzero_side_final_labels(0,1,CV_16SC1);
	for(int row = 0; row < samples.rows; row++)
	{
		const bool is_zero = fabs(samples.at<float>(row,best_column)-0.) <= FLT_EPSILON;
		if(is_zero)
		{
			zero_side_data.push_back(samples.row(row));
			zero_side_final_labels.push_back(labels.at<int>(row));
		}
		else
		{
			nonzero_side_data.push_back(samples.row(row));
			nonzero_side_final_labels.push_back(labels.at<int>(row));
		}
	}

	dectree_split* result = new dectree_split();
	result->attr_name = best_column;	//global idx of the attribute - to be used later for prediction
	result->attr_idx = best_position;	//local idx of the attribute
	result->neg_attr_data = zero_side_data;
	result->pos_attr_data = nonzero_side_data;
	result->neg_attr_labels = zero_side_final_labels;
	result->pos_attr_labels = nonzero_side_final_labels;

	return result;

}
Пример #12
0
// Greedy assembly driver. Every entry of sorted_kmers is tried as a seed in
// list order; an accepted seed is extended with inchworm() in the 'F' and
// then the 'R' direction, the joined path is turned into a sequence, the
// sequence is printed to stdout as FASTA when it is long and covered enough,
// and finally every k-mer of the path is cleared from kcounter.
//
// Throws a std::string (not an exception object) when the sorted k-mer list
// has not been populated or when kcounter has grown during the run.
// When WRITE_COVERAGE is set, per-base coverage is written to
// COVERAGE_OUTPUT_FILENAME, 30 values per line.
void IRKE::compute_sequence_assemblies(KmerCounter& kcounter, float min_connectivity,
									   unsigned int MIN_ASSEMBLY_LENGTH, unsigned int MIN_ASSEMBLY_COVERAGE,
									   bool WRITE_COVERAGE, string COVERAGE_OUTPUT_FILENAME) {

    if (! got_sorted_kmers_flag) {
        stringstream error;
        error << stacktrace() << " Error, must populate_sorted_kmers_list() before computing sequence assemblies" << endl;
        throw(error.str());
    }

    unsigned int kmer_length = kcounter.get_kmer_length();

    ofstream coverage_writer;
    if (WRITE_COVERAGE) {
        coverage_writer.open(COVERAGE_OUTPUT_FILENAME.c_str());
    }

    vector<Kmer_counter_map_iterator>& seeds = sorted_kmers;

    // Size of the counter before any seed is processed; it must never be exceeded.
    unsigned long initial_counter_size = kcounter.size();

    for (unsigned int seed_idx = 0; seed_idx < seeds.size(); seed_idx++) {

        // Sanity check: the counter is not expected to gain entries.
        unsigned long current_counter_size = kcounter.size();
        if (current_counter_size > initial_counter_size) {
            stringstream error;
            error << stacktrace() << "Error, Kcounter size has grown from " << initial_counter_size
                  << " to " << current_counter_size << endl;
            throw (error.str());
        }

        kmer_int_type_t seed = seeds[seed_idx]->first;
        unsigned int seed_count = seeds[seed_idx]->second;

        // A zero count is skipped silently (paths are cleared from kcounter
        // at the end of each round).
        if (seed_count == 0) {
            continue;
        }

        if (IRKE_COMMON::MONITOR >= 2) {
            cerr << "SEED kmer: " << kcounter.get_kmer_string(seed) << ", count: " << seed_count << endl;
        }

        // Palindromic k-mers (equal to their reverse complement) are avoided as seeds.
        if (seed == revcomp_val(seed, kmer_length)) {
            if (IRKE_COMMON::MONITOR >= 2) {
                cerr << "SEED kmer: " << kcounter.get_kmer_string(seed) << " is palidnromic.  Skipping. " << endl;
            }
            continue;
        }

        if (seed_count < MIN_SEED_COVERAGE) {
            if (IRKE_COMMON::MONITOR >= 2) {
                cerr << "-seed has insufficient coverage, skipping" << endl;
            }
            continue;
        }

        float entropy = compute_entropy(seed, kmer_length);
        if (entropy < MIN_SEED_ENTROPY) {
            if (IRKE_COMMON::MONITOR >= 2) {
                cerr << "-skipping seed due to low entropy: " << entropy << endl;
            }
            continue;
        }

        /* Extend to the right */

        Kmer_visitor visitor(kmer_length, DOUBLE_STRANDED_MODE);
        Path_n_count_pair forward_result = inchworm(kcounter, 'F', seed, visitor, min_connectivity);

        // Start over with a visitor that holds exactly the selected forward path.
        visitor.clear();

        vector<kmer_int_type_t>& forward_path = forward_result.first;
        if (IRKE_COMMON::MONITOR >= 2) {
            cerr << "Forward path contains: " << forward_path.size() << " kmers. " << endl;
        }

        for (unsigned int step = 0; step < forward_path.size(); step++) {
            kmer_int_type_t path_kmer = forward_path[step];
            visitor.add(path_kmer);

            if (IRKE_COMMON::MONITOR >= 2) {
                cerr << "\tForward path kmer: " << kcounter.get_kmer_string(path_kmer) << endl;
            }
        }

        /* Extend to the left */
        visitor.erase(seed); // reset the seed

        Path_n_count_pair reverse_result = inchworm(kcounter, 'R', seed, visitor, min_connectivity);
        vector<kmer_int_type_t>& reverse_path = reverse_result.first;

        if (IRKE_COMMON::MONITOR >= 2) {
            cerr << "Reverse path contains: " << reverse_path.size() << " kmers. " << endl;
            for (unsigned int step = 0; step < reverse_path.size(); step++) {
                cerr  << "\tReverse path kmer: " << kcounter.get_kmer_string(reverse_path[step]) << endl;
            }
        }

        // Counts reported for both extensions plus the seed's own count.
        unsigned int total_counts = forward_result.second + reverse_result.second + kcounter.get_kmer_count(seed);

        vector<kmer_int_type_t> joined_path = _join_forward_n_reverse_paths(reverse_path, seed, forward_path);

        // report sequence reconstructed from path.

        vector<unsigned int> assembly_base_coverage;
        string sequence = reconstruct_path_sequence(kcounter, joined_path, assembly_base_coverage);

        // Total count divided by the number of k-mer windows in the sequence,
        // with 0.5 added before truncation.
        unsigned int avg_cov =  static_cast<unsigned int> ( (float)total_counts/(sequence.length()-kcounter.get_kmer_length() +1) + 0.5);

        if (sequence.length() >= MIN_ASSEMBLY_LENGTH && avg_cov >= MIN_ASSEMBLY_COVERAGE) {

            INCHWORM_ASSEMBLY_COUNTER++;

            stringstream headerstream;
            headerstream << ">a" << INCHWORM_ASSEMBLY_COUNTER << ";" << avg_cov
                         << " K: " << kmer_length
                         << " length: " << sequence.length();

            string header = headerstream.str();

            // Note: the header above reports the length before line breaks are inserted.
            sequence = add_fasta_seq_line_breaks(sequence, 60);

            cout << header << endl << sequence << endl;

            if (WRITE_COVERAGE) {

                coverage_writer << header << endl;

                for (unsigned int base = 0; base < assembly_base_coverage.size(); base++) {
                    coverage_writer << assembly_base_coverage[base];
                    if ( (base+1) % 30 == 0) {
                        coverage_writer << endl;
                    }
                    else {
                        coverage_writer << " ";
                    }
                }
                coverage_writer << endl;
            }
        }

        // remove path: every k-mer used in this round is cleared, whether or
        // not the assembly was reported.
        for (unsigned int step = 0; step < joined_path.size(); step++) {
            kmer_int_type_t used_kmer = joined_path[step];
            kcounter.clear_kmer(used_kmer);
        }
    }

    if (IRKE_COMMON::MONITOR) {
        cerr << endl;
    }

    if (WRITE_COVERAGE) {
        coverage_writer.close();
    }

    // drop sorted kmer list as part of cleanup
    clear_sorted_kmers_list();

    return; // end of runIRKE

}