double CFS::symmetrical_uncertainity (unsigned int i, unsigned int j) { double result = 0; // SU(X1,X2) = 2 * ((H(Xi) + H(Xj) - H(Xi,Xj)) / (H(Xi) + H(Xj))), // // where X1 is a feature and X2 is either another feature or the class Y. // if (H[i] == -1) H[i] = compute_entropy (i); if (H[j] == -1) H[j] = compute_entropy (j); if ((H[i] + H[j]) != 0) { // Compute H(X1), H(X2) and H(X1,X2). // double H_Xi_Xj = compute_joint_entropy (i, j); if (i == j) result = 2 * ((H[i] + H[n] - H_Xi_Xj) / (H[i] + H[n])); else result = 2 * ((H[i] + H[j] - H_Xi_Xj) / (H[i] + H[j])); } return result; }
//compute the shannon entropy of a given split double Dectree_class::compute_erf_entropy(const cv::Mat& labels, const cv::Mat& neg_labels, const cv::Mat& pos_labels) { //class entropy or entropy before split double class_entropy = compute_entropy(labels); //mutual information (information gain) double imp_neg_labels = ((double)neg_labels.rows/labels.rows)*compute_entropy(neg_labels); double imp_pos_labels = ((double)pos_labels.rows/labels.rows)*compute_entropy(pos_labels); double imp_attr = imp_neg_labels + imp_pos_labels; double info_gain = class_entropy - imp_attr; //split entropy double split_entropy = 0; double neg_prob = ((double)neg_labels.rows/labels.rows); //because of the base cases in the learning algorithm, we are sure //labels.rows is greater than 0 double pos_prob = ((double)pos_labels.rows/labels.rows); if( fabs(neg_prob-0.) > FLT_EPSILON && fabs(neg_prob-1.) > FLT_EPSILON ) split_entropy += neg_prob*log2(neg_prob); if( fabs(pos_prob-0.) > FLT_EPSILON && fabs(pos_prob-1.) > FLT_EPSILON ) split_entropy += pos_prob*log2(pos_prob); split_entropy *= -1; //shannon entropy double shannon_entropy = (2*info_gain)/(class_entropy+split_entropy); //for debugging /* std::cout << "Class entropy: " << class_entropy << std::endl; std::cout << "Info gain: " << info_gain << std::endl; std::cout << "Split entropy: " << split_entropy << std::endl; std::cout << "Shannon entropy: " << shannon_entropy << std::endl; */ return shannon_entropy; }
bool IRKE::sequence_path_exists(string &sequence, unsigned int min_coverage, float min_entropy, float min_connectivity, vector<unsigned int> &coverage_counter) { unsigned int kmer_length = kcounter.get_kmer_length(); if (sequence.length() < kmer_length) { return (false); } bool path_exists = true; string prev_kmer = sequence.substr(0, kmer_length); if (contains_non_gatc(prev_kmer) || !kcounter.kmer_exists(prev_kmer)) { path_exists = false; coverage_counter.push_back(0); } else { unsigned int kmer_count = kcounter.get_kmer_count(prev_kmer); coverage_counter.push_back(kmer_count); float entropy = compute_entropy(prev_kmer); if (kmer_count < min_coverage || entropy < min_entropy) { path_exists = false; } } for (unsigned int i = 1; i <= sequence.length() - kmer_length; i++) { string kmer = sequence.substr(i, kmer_length); if (contains_non_gatc(kmer) || !kcounter.kmer_exists(kmer)) { path_exists = false; coverage_counter.push_back(0); } else { unsigned int kmer_count = kcounter.get_kmer_count(kmer); coverage_counter.push_back(kmer_count); float entropy = compute_entropy(kmer); if (kmer_count < min_coverage || entropy < min_entropy) { path_exists = false; } } if (path_exists && !exceeds_min_connectivity(kcounter, prev_kmer, kmer, min_connectivity)) { path_exists = false; } prev_kmer = kmer; } return (path_exists); }
// Builds the working tables for the set of features a_set.
//
// For a non-empty set it allocates:
//   Pr_Y        one entry per label,
//   Pr_X        one entry per feature value in [0, max_feature_value],
//   Pr_X1_X2    (max_feature_value + 1) x (max_feature_value + 1),
//   Pr_X1_Y     (max_feature_value + 1) x number of labels,
//   correlation n x n, every entry -1,
//   H           n + 1 entries; H[0..n-1] start at -1 and H[n] is set from
//               compute_entropy (n).
// The -1 entries are the "not computed yet" marker tested by
// symmetrical_uncertainity () for H; correlation presumably follows the
// same convention.
//
// For an empty set (n == 0) nothing is allocated; the pointers are left null
// and number_of_rows is 0, so the object is in a defined state instead of
// holding indeterminate values.
//
CFS::CFS (ElementSet * a_set)
{
  set = a_set;
  n = set->get_set_cardinality ();
  max_feature_value = 0;

  // Defined defaults for the empty-set case.
  //
  number_of_rows = 0;
  Pr_Y = 0;
  Pr_X = 0;
  Pr_X1_X2 = 0;
  Pr_X1_Y = 0;
  correlation = 0;
  H = 0;

  if (n == 0)
    return;

  number_of_rows = set->get_element (0)->get_number_of_values ();

  // Largest value taken by any feature; it sizes the probability tables.
  //
  for (unsigned int i = 0; i < n; i++)
    if (set->get_element (i)->get_max_value () > max_feature_value)
      max_feature_value = set->get_element (i)->get_max_value ();

  const unsigned int number_of_labels = set->get_number_of_labels ();

  // Value-initialised so that, like the other tables, it starts at zero.
  //
  Pr_Y = new double [number_of_labels] ();
  Pr_X = new double [max_feature_value + 1];
  Pr_X1_X2 = new double * [max_feature_value + 1];
  Pr_X1_Y = new double * [max_feature_value + 1];

  for (unsigned int i = 0; i <= max_feature_value; i++)
  {
    Pr_X [i] = 0;

    Pr_X1_X2 [i] = new double [max_feature_value + 1];
    for (unsigned int j = 0; j <= max_feature_value; j++)
      Pr_X1_X2 [i][j] = 0;

    Pr_X1_Y [i] = new double [number_of_labels];
    for (unsigned int j = 0; j < number_of_labels; j++)
      Pr_X1_Y [i][j] = 0;
  }

  correlation = new double * [n];
  H = new double [n+1];

  for (unsigned int i = 0; i < n; i++)
  {
    H[i] = -1;
    correlation[i] = new double [n];
    for (unsigned int j = 0; j < n; j++)
      correlation[i][j] = -1;
  }

  // Index n is the extra slot after the n features (presumably the class
  // variable Y); its entropy is computed eagerly.
  //
  H[n] = compute_entropy (n);
}
// Decide whether a k-mer may be used as a seed.
//
// A k-mer is rejected when, in this order,
//   1. its count is zero,
//   2. it equals its own reverse complement (palindrome),
//   3. its count is below MIN_SEED_COVERAGE,
//   4. its entropy is below MIN_SEED_ENTROPY.
// Rejections 2-4 are reported on cerr when IRKE_COMMON::MONITOR >= 2.
// The trailing float parameter is unnamed and ignored.
bool IRKE::is_good_seed_kmer(kmer_int_type_t kmer, unsigned int kmer_count, unsigned int kmer_length, float)
{
    const bool verbose = (IRKE_COMMON::MONITOR >= 2);

    if (kmer_count == 0) {
        return false;
    }

    // Palindromic k-mers are avoided as seeds.
    const bool is_palindrome = (kmer == revcomp_val(kmer, kmer_length));
    if (is_palindrome) {
        if (verbose) {
            cerr << "SEED kmer: " << kcounter.get_kmer_string(kmer) << " is palidnromic. Skipping. " << endl;
        }
        return false;
    }

    if (kmer_count < MIN_SEED_COVERAGE) {
        if (verbose) {
            cerr << "-seed has insufficient coverage, skipping" << endl;
        }
        return false;
    }

    float seed_entropy = compute_entropy(kmer, kmer_length);
    if (seed_entropy < MIN_SEED_ENTROPY) {
        if (verbose) {
            cerr << "-skipping seed due to low entropy: " << seed_entropy << endl;
        }
        return false;
    }

    // Passed every filter.
    return true;
}
// Draw a fixed number of random candidate nodes and return the one with the
// lowest entropy on the given pixels.
//
// All candidates are generated first (generate_random_node()), then scored
// with compute_entropy(node, pixels, img). A candidate replaces the current
// best when its score is <= the best score so far, starting from 1.0, so among
// equal scores the last one wins.
//
// If no candidate scores <= 1.0 (for instance when every score is NaN), the
// first candidate is returned. The selected index used to be left
// uninitialised in that case, which made the return value undefined.
//
// `mask` is not used by this function.
Node NField::generate_node(const std::vector<cv::Mat_<float> >& img,
                           const cv::Mat_<uchar>& mask,
                           std::vector<Pixel>& pixels)
{
  const int kNumCandidates = 100;

  std::vector<Node> candidates;
  candidates.reserve(kNumCandidates);
  for (int i = 0; i < kNumCandidates; ++i) {
    candidates.push_back(generate_random_node());
  }

  double best_entropy = 1.0;  // acceptance threshold for the first pick
  size_t best_index = 0;      // defined fallback when nothing is accepted

  for (size_t j = 0; j < candidates.size(); ++j) {
    const double current_entropy = compute_entropy(candidates[j], pixels, img);
    if (best_entropy >= current_entropy) {
      best_entropy = current_entropy;
      best_index = j;
    }
  }

  return candidates[best_index];
}
double calculate_after_spliting_entropy(const std::vector<int> &feature_value, const std::vector<int> &class_value){ int feature_value_count[MAX_FEATURE_VALUE_NUMBER] = {0}; int feature_value_class_count[MAX_FEATURE_VALUE_NUMBER][MAX_CLASS_NUMBER] = {0}; int sample_number = feature_value.size(); for (int i = 0; i < sample_number; i++){ feature_value_count[feature_value[i]]++; feature_value_class_count[feature_value[i]][class_value[i]]++; } double result = 0; for (int i = 0; i < MAX_FEATURE_VALUE_NUMBER; i++){ if (feature_value_count[i] == 0){ continue; } double entropy = 0; for (int j = 0; j < MAX_CLASS_NUMBER; j++){ double p = (double)feature_value_class_count[i][j] / feature_value_count[i]; entropy += compute_entropy(p); } result += (double)feature_value_count[i] / sample_number * entropy; } return result; }
void mmas::mixing_product(int n) { delete mixed_product; mixed_product = new usm; real dm_bin = product->star_mass / n; real m_prev = 0, m_bin = 0; real Mtot = 0, Utot = 0, Vtot = 0, r_mean = 0; int n_in_bin = 0; int j = 0; real H1tot = 0, He4tot = 0, O16tot = 0, N14tot = 0, C12tot = 0, Ne20tot = 0, Mg24tot = 0, Si28tot = 0, Fe56tot = 0; for (int i = 0; i < product->get_num_shells(); i++) { mass_shell &shell_i = product->get_shell(i); real dm = shell_i.mass - m_prev; m_prev = shell_i.mass; r_mean += shell_i.radius; n_in_bin += 1; m_bin += dm; Mtot += dm; Vtot += dm/shell_i.density; H1tot += dm*shell_i.composition.H1; He4tot += dm*shell_i.composition.He4; O16tot += dm*shell_i.composition.O16; N14tot += dm*shell_i.composition.N14; C12tot += dm*shell_i.composition.C12; Ne20tot += dm*shell_i.composition.Ne20; Mg24tot += dm*shell_i.composition.Mg24; Si28tot += dm*shell_i.composition.Si28; Fe56tot += dm*shell_i.composition.Fe56; Utot += compute_energy(shell_i.density, shell_i.temperature, shell_i.mean_mu) * dm; if (m_bin > dm_bin) { // PRC(j); PRC(n); PRC(Mtot); PRC(m_bin); PRC(dm_bin); PRL(n_in_bin); mass_shell shell_j; j++; // mass_shell &shell_j = mixed_product->get_shell(j++); shell_j.radius = r_mean/n_in_bin; shell_j.mass = Mtot; shell_j.density = m_bin/Vtot; shell_j.composition.H1 = H1tot/m_bin; shell_j.composition.He4 = He4tot/m_bin; shell_j.composition.O16 = O16tot/m_bin; shell_j.composition.N14 = N14tot/m_bin; shell_j.composition.C12 = C12tot/m_bin; shell_j.composition.Ne20 = Ne20tot/m_bin; shell_j.composition.Mg24 = Mg24tot/m_bin; shell_j.composition.Si28 = Si28tot/m_bin; shell_j.composition.Fe56 = Fe56tot/m_bin; #define am(x) (1.0+Amass[x]/2.0)/Amass[x] real Amass[] = {1, 4, 16, 14, 12, 20, 24, 28, 56}; shell_j.mean_mu = 2 * shell_j.composition.H1 + am(1) * shell_j.composition.He4 + am(2) * shell_j.composition.O16 + am(3) * shell_j.composition.N14 + am(4) * shell_j.composition.C12 + am(5) * shell_j.composition.Ne20 + am(6) * shell_j.composition.Mg24 + am(7) * 
shell_j.composition.Si28 + am(8) * shell_j.composition.Fe56; shell_j.mean_mu = 1.0/shell_j.mean_mu; shell_j.e_thermal = Utot/m_bin; shell_j.pressure = compute_pressure(shell_j.density, shell_j.e_thermal, shell_j.mean_mu); shell_j.temperature = compute_temperature(shell_j.density, shell_j.pressure, shell_j.mean_mu); shell_j.entropy = compute_entropy(shell_j.density, shell_j.temperature, shell_j.mean_mu); mixed_product->add_shell(shell_j); m_bin -= dm_bin; m_bin = Utot = Vtot = r_mean = 0; H1tot = He4tot = O16tot = N14tot = C12tot = Ne20tot = Mg24tot = Si28tot = Fe56tot = 0; n_in_bin = 0; } } mixed_product->build_hashtable(); }
void mmas::smooth_product() { int n_shells = product->get_num_shells(); /* number of shells in the product */ smoothing_params params; params.arr_x = new double[n_shells]; params.arr_y = new double[n_shells]; params.smoothed_y = new double[n_shells]; /* composition */ cerr << "Smoothing composition\n"; for (int i = 0; i < n_shells; i++) { mass_shell &shell = product->get_shell(i); params.arr_x[i] = shell.radius; params.arr_x[i] = shell.mass; params.arr_y[i] = (4.0/shell.mean_mu - 3.0)/5.0; } smoothing_integrate(params, n_shells); for (int i = 0; i < n_shells; i++) { mass_shell &shell = product->get_shell(i); shell.mean_mu = 4.0/(5.0*params.smoothed_y[i] + 3); } /* thermal energy */ cerr << "Smoothing thermal energy\n"; for (int i = 0; i < n_shells; i++) { mass_shell &shell = product->get_shell(i); params.arr_y[i] = compute_energy(shell.density, shell.temperature, shell.mean_mu); } smoothing_integrate(params, n_shells); for (int i = 0; i < n_shells; i++) { mass_shell &shell = product->get_shell(i); shell.e_thermal = params.smoothed_y[i]; // real x = params.arr_x[i]; // real y = params.arr_y[i]; // real ys = params.smoothed_y[i]; // PRC(x); PRC(y); PRL(ys); } /* density */ // cerr << "Smoothing density\n"; // for (int i = 0; i < n_shells; i++) { // mass_shell &shell = product->get_shell(i); // params.arr_y[i] = shell.density; // } // smoothing_integrate(params, n_shells); // for (int i = 0; i < n_shells; i++) { // mass_shell &shell = product->get_shell(i); // shell.density = params.smoothed_y[i]; // } for (int i = 0; i < n_shells; i++) { mass_shell &shell = product->get_shell(i); shell.pressure = compute_pressure(shell.density, shell.e_thermal, shell.mean_mu); shell.temperature = compute_temperature(shell.density, shell.pressure, shell.mean_mu); shell.entropy = compute_entropy(shell.density, shell.temperature, shell.mean_mu); } delete[] params.arr_x; delete[] params.arr_y; delete[] params.smoothed_y; }
// Write every dump column of zone (i,j,k) into writebuf through myset().
//
// Column order, exactly as emitted below:
//   - zone indices i,j,k shifted by startpos[1..3]         (only if !GAMMIEDUMP)
//   - X[1..3] and V[1..3] from coord()/bl_coord()
//   - the primitives pdump[i][j][k][pl]                    (PDUMPLOOP order)
//   - pressure, cs2 and entropy density from rho and u
//   - the conserved quantities udump[i][j][k][pl]          (PDUMPLOOP order)
//   - divb
//   - q.ucon, q.ucov, q.bcon, q.bcov                       (NDIM each)
//   - vmin/vmax for directions 1, 2, 3
//   - geom.g
//   - jcon, jcov, fcon, fcov                               (only if CALCFARADAYANDCURRENTS)
//   - a debug section that is compiled but disabled by "&& 0"
//
// The myset() calls appear to take (datatype, source, first index, count,
// writebuf) -- inferred from the call sites only; confirm against myset().
//
// If the global `failed` is not set, a get_state()/vchar() result >= 1 goes
// through FAILSTATEMENT() (behaviour defined by that macro, not visible here).
// If `failed` is set, failures are tolerated zone by zone: the affected state
// vectors or velocities are zeroed so the columns still exist.
//
// Returns 0 when the end of the function is reached.
//
// NOTE(review): r, th, b, ucon, U and m are declared but never used here.
int dump_content(int i, int j, int k, MPI_Datatype datatype,void *writebuf)
{
  int pl;
  FTYPE r, th, vmin[NDIM], vmax[NDIM];
  int ignorecourant;
  struct of_geom geom;
  struct of_state q;
  FTYPE X[NDIM],V[NDIM];
  FTYPE divb;
  FTYPE b[NDIM],ucon[NDIM];
  FTYPE U[NPR];
  FTYPE ftemp;
  FTYPE jcov[NDIM];
  FTYPE fcov[NUMFARADAY];
  FTYPE rho,u,pressure,cs2,Sden;
  int dir,l,m,n,o;


  //////////////
  //
  // some calculations
  //

  coord(i, j, k, CENT, X);
  bl_coord(X, V);
  // if failed, then data output for below invalid, but columns still must exist

  get_geometry(i, j, k, CENT, &geom);

  if (!failed) {
    // normal mode: any failure is handed to FAILSTATEMENT()
    // NOTE(review): the label reads "dir=1or2" for all three vchar() calls,
    // including direction 3; only the trailing number tells them apart.
    if (get_state(pdump[i][j][k], &geom, &q) >= 1)
      FAILSTATEMENT("dump.c:dump()", "get_state() dir=0", 1);
    if (vchar(pdump[i][j][k], &q, 1, &geom, &vmax[1], &vmin[1],&ignorecourant) >= 1)
      FAILSTATEMENT("dump.c:dump()", "vchar() dir=1or2", 1);
    if (vchar(pdump[i][j][k], &q, 2, &geom, &vmax[2], &vmin[2],&ignorecourant) >= 1)
      FAILSTATEMENT("dump.c:dump()", "vchar() dir=1or2", 2);
    if (vchar(pdump[i][j][k], &q, 3, &geom, &vmax[3], &vmin[3],&ignorecourant) >= 1)
      FAILSTATEMENT("dump.c:dump()", "vchar() dir=1or2", 3);
  }
  else {// do a per zone check, otherwise set to 0
    whocalleducon=1; // force no failure mode, just return like failure, and don't return if failure, just set to 0 and continue
    if (get_state(pdump[i][j][k], &geom, &q) >= 1){
      // state could not be computed: zero the four vectors that are dumped below
      for (pl = 0; pl < NDIM; pl++)
        q.ucon[pl]=0;
      for (pl = 0; pl < NDIM; pl++)
        q.ucov[pl]=0;
      for (pl = 0; pl < NDIM; pl++)
        q.bcon[pl]=0;
      for (pl = 0; pl < NDIM; pl++)
        q.bcov[pl]=0;
    }
    // characteristic velocities: zero the pair of any direction that fails
    if (vchar(pdump[i][j][k], &q, 1, &geom, &vmax[1], &vmin[1],&ignorecourant) >= 1){
      vmax[1]=vmin[1]=0;
    }

    if (vchar(pdump[i][j][k], &q, 2, &geom, &vmax[2], &vmin[2],&ignorecourant) >= 1){
      vmax[2]=vmin[2]=0;
    }

    if (vchar(pdump[i][j][k], &q, 3, &geom, &vmax[3], &vmin[3],&ignorecourant) >= 1){
      vmax[3]=vmin[3]=0;
    }

    whocalleducon=0; // return to normal state

  }


  setfdivb(&divb, pdump, udump, i, j, k); // udump also set externally GODMARK

  //////////////////////////
  //
  // do the assignments
  //
  // if you change # of outputted vars, remember to change numcolumns


  //static
  if(!GAMMIEDUMP){
    // zone indices, shifted by startpos[] and written as FTYPE
    ftemp=(FTYPE)(i+startpos[1]);
    myset(datatype,&ftemp,0,1,writebuf);
    ftemp=(FTYPE)(j+startpos[2]);
    myset(datatype,&ftemp,0,1,writebuf);
    ftemp=(FTYPE)(k+startpos[3]);
    myset(datatype,&ftemp,0,1,writebuf);
  }
  myset(datatype,X,1,3,writebuf);
  myset(datatype,V,1,3,writebuf);
  // 9

  ////////////////////////
  //
  // rest dynamic

  // primitives
  // must use PDUMPLOOP() since may be any order unlike NPR loop
  PDUMPLOOP(pl) myset(datatype,&(pdump[i][j][k][pl]),0,1,writebuf); // NPRDUMP

  ////////////
  //
  // output some EOS stuff since in general not simple function of rho0,u
  rho = pdump[i][j][k][RHO];
  u = pdump[i][j][k][UU];


  pressure = pressure_rho0_u(rho,u);
  cs2 = cs2_compute(rho,u);
  Sden = compute_entropy(rho,u);
  //  dUdtau = compute_qdot(rho,u);

  myset(datatype,&pressure,0,1,writebuf); // 1
  myset(datatype,&cs2,0,1,writebuf); // 1
  myset(datatype,&Sden,0,1,writebuf); // 1
  //  myset(datatype,&dUdtau,0,1,writebuf); // 1

  //////////////////////
  //
  // output the conserved quantities since not easily inverted and at higher order aren't invertable from point primitives
  PDUMPLOOP(pl) myset(datatype,&(udump[i][j][k][pl]),0,1,writebuf); // NPRDUMP
  myset(datatype,&divb,0,1,writebuf); // 1

  for (pl = 0; pl < NDIM; pl++)
    myset(datatype,&(q.ucon[pl]),0,1,writebuf);
  for (pl = 0; pl < NDIM; pl++)
    myset(datatype,&(q.ucov[pl]),0,1,writebuf);
  for (pl = 0; pl < NDIM; pl++)
    myset(datatype,&(q.bcon[pl]),0,1,writebuf);
  for (pl = 0; pl < NDIM; pl++)
    myset(datatype,&(q.bcov[pl]),0,1,writebuf);
  // 4*4

  // min/max characteristic velocity per direction, as filled in by vchar() above
  myset(datatype,&vmin[1],0,1,writebuf);
  myset(datatype,&vmax[1],0,1,writebuf);
  myset(datatype,&vmin[2],0,1,writebuf);
  myset(datatype,&vmax[2],0,1,writebuf);
  myset(datatype,&vmin[3],0,1,writebuf);
  myset(datatype,&vmax[3],0,1,writebuf);
  // 6

  // one static term
  myset(datatype,&geom.g,0,1,writebuf); // 1


#if(CALCFARADAYANDCURRENTS) // NIM*2+6*2 = 8+12=20
  // updated 11/16/2003
  // new 10/23/2003
  // current density
  lower_vec(jcon[i][j][k],&geom,jcov);
  myset(datatype,jcon[i][j][k],0,NDIM,writebuf); // (NDIM)
  myset(datatype,jcov,0,NDIM,writebuf);// (NDIM)
  // faraday (2*6)
  lowerf(fcon[i][j][k],&geom,fcov);
  myset(datatype,fcon[i][j][k],0,NUMFARADAY,writebuf); //  (6)
  myset(datatype,fcov,0,NUMFARADAY,writebuf); // (6)
#endif

  // the "&& 0" keeps this debug section compiled but never executed
  if(FLUXB==FLUXCTSTAG && 0){ // DEBUG (change corresponding code in dump.c)
    // uses jrdp3dudebug in gtwod.m that assumes CALCFARADAYANDCURRENTS==0
    for(l=1;l<=COMPDIM;l++) myset(datatype,gp_l[l][i][j][k],0,NPR2INTERP,writebuf); // 3*8 = 24
    for(l=1;l<=COMPDIM;l++) myset(datatype,gp_r[l][i][j][k],0,NPR2INTERP,writebuf); // 3*8 = 24
    myset(datatype,pstagscratch[i][j][k],0,NPR,writebuf); // 8
    for(dir=1;dir<=COMPDIM;dir++) for(pl=B1;pl<=B3;pl++) for(n=0;n<=1;n++) myset(datatype,&pbcorninterp[dir][pl][n][i][j][k],0,1,writebuf); // 3*3*2 = 18
    for(dir=1;dir<=COMPDIM;dir++) for(pl=U1;pl<=U3;pl++) for(n=0;n<=1;n++) for(o=0;o<=1;o++) myset(datatype,&pvcorninterp[dir][pl][n][o][i][j][k],0,1,writebuf); // 3*3*2*2 = 36
  }

  return (0);
}
//return a 2-tuple indicating the best atr id and the its position of the list of all attributes (this position keeps changing as the list change size) //inputs: list of remaining attriutes, current examples, entropy of the ancestor node dectree_split* Dectree_class::best_split(std::vector<int> attr, const cv::Mat& samples, const cv::Mat& labels) { std::vector<int> best_attr_info(2,0); int true_attr_pos = 0; int attr_idx; double max_info_gain = -1.; double attr_info_gain = 0.; bool flag_compare_info_gain = false; cv::Mat final_neg_attr_data(0,1,CV_32FC1); cv::Mat final_pos_attr_data(0,1,CV_32FC1); cv::Mat final_neg_attr_labels(0,1,CV_16SC1); cv::Mat final_pos_attr_labels(0,1,CV_16SC1); cv::Mat neg_attr_labels(0,1,CV_16SC1); cv::Mat pos_attr_labels(0,1,CV_16SC1); double imp_bef_split = compute_entropy(labels); for(std::vector<int>::iterator it_attr = attr.begin(); it_attr != attr.end(); ++it_attr) { cv::Mat neg_attr_labels(0,1,CV_16SC1); cv::Mat pos_attr_labels(0,1,CV_16SC1); attr_idx = *it_attr; for(int ex = 0; ex < samples.rows; ex++) { if( fabs(samples.at<float>(ex,attr_idx)-0.) 
<= FLT_EPSILON) neg_attr_labels.push_back(labels.at<int>(ex)); else pos_attr_labels.push_back(labels.at<int>(ex)); } double imp_neg_attr_labels = ((double)neg_attr_labels.rows/samples.rows)*compute_entropy(neg_attr_labels); double imp_pos_attr_labels = ((double)pos_attr_labels.rows/samples.rows)*compute_entropy(pos_attr_labels); double imp_attr = imp_neg_attr_labels + imp_pos_attr_labels; attr_info_gain = imp_bef_split - imp_attr; //std::cout << "h1: " << imp_neg_attr_labels << std::endl; //std::cout << "h2: " << imp_pos_attr_labels << std::endl; //std::cout << "*** " << attr_info_gain; if(flag_compare_info_gain) { if(attr_info_gain > max_info_gain) { best_attr_info.at(0) = attr_idx; best_attr_info.at(1) = true_attr_pos; max_info_gain = attr_info_gain; } } else { flag_compare_info_gain = true; best_attr_info.at(0) = attr_idx; best_attr_info.at(1) = true_attr_pos; max_info_gain = attr_info_gain; } true_attr_pos++; neg_attr_labels.release(); pos_attr_labels.release(); } //std::cout << std::endl; //std::cout << "max: " << max_info_gain << std::endl; //with the found best attribute; fill the split structure //it's prefere to separate the samples again because otherwise a lot of memory reallocations take place for(int ex = 0; ex < samples.rows; ex++) { if( fabs(samples.at<float>(ex,best_attr_info.at(0))-0.) <= FLT_EPSILON) { final_neg_attr_data.push_back(samples.row(ex)); final_neg_attr_labels.push_back(labels.at<int>(ex)); } else { final_pos_attr_data.push_back(samples.row(ex)); final_pos_attr_labels.push_back(labels.at<int>(ex)); } } dectree_split* split = new dectree_split(); split->attr_name = best_attr_info.at(0); //global idx of the attribute - to be use later for prediction split->attr_idx = best_attr_info.at(1); //local idx of the attribute split->neg_attr_data = final_neg_attr_data; split->pos_attr_data = final_pos_attr_data; split->neg_attr_labels = final_neg_attr_labels; split->pos_attr_labels = final_pos_attr_labels; return split; }
void IRKE::compute_sequence_assemblies(KmerCounter& kcounter, float min_connectivity, unsigned int MIN_ASSEMBLY_LENGTH, unsigned int MIN_ASSEMBLY_COVERAGE, bool WRITE_COVERAGE, string COVERAGE_OUTPUT_FILENAME) { if (! got_sorted_kmers_flag) { stringstream error; error << stacktrace() << " Error, must populate_sorted_kmers_list() before computing sequence assemblies" << endl; throw(error.str()); } unsigned int kmer_length = kcounter.get_kmer_length(); ofstream coverage_writer; if (WRITE_COVERAGE) { coverage_writer.open(COVERAGE_OUTPUT_FILENAME.c_str()); } vector<Kmer_counter_map_iterator>& kmers = sorted_kmers; //kcounter.get_kmers_sort_descending_counts(); unsigned long init_size = kcounter.size(); // string s = "before.kmers"; // kcounter.dump_kmers_to_file(s); for (unsigned int i = 0; i < kmers.size(); i++) { // cerr << "round: " << i << endl; unsigned long kmer_counter_size = kcounter.size(); if (kmer_counter_size > init_size) { // string s = "after.kmers"; // kcounter.dump_kmers_to_file(s); stringstream error; error << stacktrace() << "Error, Kcounter size has grown from " << init_size << " to " << kmer_counter_size << endl; throw (error.str()); } kmer_int_type_t kmer = kmers[i]->first; unsigned int kmer_count = kmers[i]->second; if (kmer_count == 0) { continue; } if (IRKE_COMMON::MONITOR >= 2) { cerr << "SEED kmer: " << kcounter.get_kmer_string(kmer) << ", count: " << kmer_count << endl; } if (kmer == revcomp_val(kmer, kmer_length)) { // palindromic kmer, avoid palindromes as seeds if (IRKE_COMMON::MONITOR >= 2) { cerr << "SEED kmer: " << kcounter.get_kmer_string(kmer) << " is palidnromic. Skipping. 
" << endl; } continue; } if (kmer_count < MIN_SEED_COVERAGE) { if (IRKE_COMMON::MONITOR >= 2) { cerr << "-seed has insufficient coverage, skipping" << endl; } continue; } float entropy = compute_entropy(kmer, kmer_length); if (entropy < MIN_SEED_ENTROPY) { if (IRKE_COMMON::MONITOR >= 2) { cerr << "-skipping seed due to low entropy: " << entropy << endl; } continue; } /* Extend to the right */ Kmer_visitor visitor(kmer_length, DOUBLE_STRANDED_MODE); Path_n_count_pair selected_path_n_pair_forward = inchworm(kcounter, 'F', kmer, visitor, min_connectivity); visitor.clear(); // add selected path to visitor vector<kmer_int_type_t>& forward_path = selected_path_n_pair_forward.first; if (IRKE_COMMON::MONITOR >= 2) { cerr << "Forward path contains: " << forward_path.size() << " kmers. " << endl; } for (unsigned int i = 0; i < forward_path.size(); i++) { kmer_int_type_t kmer = forward_path[i]; visitor.add(kmer); if (IRKE_COMMON::MONITOR >= 2) { cerr << "\tForward path kmer: " << kcounter.get_kmer_string(kmer) << endl; } } /* Extend to the left */ visitor.erase(kmer); // reset the seed Path_n_count_pair selected_path_n_pair_reverse = inchworm(kcounter, 'R', kmer, visitor, min_connectivity); if (IRKE_COMMON::MONITOR >= 2) { vector<kmer_int_type_t>& reverse_path = selected_path_n_pair_reverse.first; cerr << "Reverse path contains: " << reverse_path.size() << " kmers. " << endl; for (unsigned int i = 0; i < reverse_path.size(); i++) { cerr << "\tReverse path kmer: " << kcounter.get_kmer_string(reverse_path[i]) << endl; } } unsigned int total_counts = selected_path_n_pair_forward.second + selected_path_n_pair_reverse.second + kcounter.get_kmer_count(kmer); vector<kmer_int_type_t>& reverse_path = selected_path_n_pair_reverse.first; vector<kmer_int_type_t> joined_path = _join_forward_n_reverse_paths(reverse_path, kmer, forward_path); // report sequence reconstructed from path. 
vector<unsigned int> assembly_base_coverage; string sequence = reconstruct_path_sequence(kcounter, joined_path, assembly_base_coverage); unsigned int avg_cov = static_cast<unsigned int> ( (float)total_counts/(sequence.length()-kcounter.get_kmer_length() +1) + 0.5); /* cout << "Inchworm-reconstructed sequence, length: " << sequence.length() << ", avgCov: " << avg_cov << " " << sequence << endl; */ if (sequence.length() >= MIN_ASSEMBLY_LENGTH && avg_cov >= MIN_ASSEMBLY_COVERAGE) { INCHWORM_ASSEMBLY_COUNTER++; stringstream headerstream; headerstream << ">a" << INCHWORM_ASSEMBLY_COUNTER << ";" << avg_cov << " K: " << kmer_length << " length: " << sequence.length(); string header = headerstream.str(); sequence = add_fasta_seq_line_breaks(sequence, 60); cout << header << endl << sequence << endl; if (WRITE_COVERAGE) { coverage_writer << header << endl; for (unsigned int i = 0; i < assembly_base_coverage.size(); i++) { coverage_writer << assembly_base_coverage[i]; if ( (i+1) % 30 == 0) { coverage_writer << endl; } else { coverage_writer << " "; } } coverage_writer << endl; } } // remove path for (unsigned int i = 0; i < joined_path.size(); i++) { kmer_int_type_t kmer = joined_path[i]; /* if (DEBUG) { cout << "\tpruning kmer: " << kmer << endl; } */ kcounter.clear_kmer(kmer); } /* if (DEBUG) { cout << "done pruning kmers." << endl; } */ } if (IRKE_COMMON::MONITOR) { cerr << endl; } if (WRITE_COVERAGE) { coverage_writer.close(); } // drop sorted kmer list as part of cleanup clear_sorted_kmers_list(); return; // end of runIRKE }