// Gini coefficient of x, computed via Thrust (CUDA, OpenMP, or TBB backend).
// mu is the mean of x, computed by the caller; unbiased selects the
// n*(n-1) sample estimator. flouble is Rth's float/double typedef.
RcppExport SEXP rthgini(SEXP x_, SEXP mu_, SEXP unbiased_, SEXP nthreads)
{
  Rcpp::NumericVector x(x_);
  Rcpp::NumericVector mu(mu_);
  const int unbiased = INTEGER(unbiased_)[0];
  const int n = LENGTH(x);
  Rcpp::NumericVector gini(1);

  #if RTH_OMP
  omp_set_num_threads(INT(nthreads));
  #elif RTH_TBB
  tbb::task_scheduler_init init(INT(nthreads));
  #endif

  // copy to the device and sort; the estimator below assumes sorted input
  thrust::device_vector<double> dx(x.begin(), x.end());
  thrust::sort(dx.begin(), dx.end());

  thrust::counting_iterator<int> begin(0);
  thrust::counting_iterator<int> end = begin + n;
  thrust::plus<flouble> binop;
  gini[0] = (double) thrust::transform_reduce(begin, end,
      compute_gini(n, dx.begin()), (flouble) 0., binop);

  if (unbiased)
    gini[0] = gini[0]/n/(n-1)/mu[0];
  else
    gini[0] = gini[0]/n/n/mu[0];

  return gini;
}
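The compute_gini functor passed to thrust::transform_reduce is not shown in this excerpt. A minimal sketch of what it plausibly computes, assuming the standard sorted-data identity for the Gini numerator; a raw pointer is used here for portability across backends (e.g. via thrust::raw_pointer_cast), whereas the code above hands the functor an iterator. This is a sketch, not necessarily the actual Rth functor:

// Hypothetical sketch. For sorted values x_(0) <= ... <= x_(n-1), the
// Gini numerator is sum_i (2*(i+1) - n - 1) * x_(i), which rthgini()
// then scales by 1/(n^2 * mu), or 1/(n*(n-1)*mu) for the unbiased case.
struct compute_gini
{
  int n;
  const double *x;   // sorted input values
  compute_gini(int n_, const double *x_) : n(n_), x(x_) {}
  __host__ __device__
  flouble operator()(int i) const {
    return (flouble) ((2.0*(i+1) - n - 1) * x[i]);
  }
};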
double information(InternalMatrix& inMx)
{
  switch (information_measure) {
    case InfoType::gini    : return gini(inMx);
    case InfoType::entropy : return entropy(inMx);
  }
  // Unreachable for valid InfoType values; without it, control can fall
  // off the end of a non-void function, which is undefined behavior.
  return 0.0;
}
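For reference, the two measures the switch dispatches to are the standard impurity definitions. A self-contained sketch over a plain vector of per-class counts (the InternalMatrix accessors used by the real gini/entropy are not shown in this excerpt):

#include <cmath>
#include <vector>

// Gini impurity: 1 - sum_c p_c^2, with p_c = count_c / total.
double gini_impurity(const std::vector<double>& counts)
{
  double total = 0.0, sq = 0.0;
  for (double c : counts) total += c;
  if (total <= 0.0) return 0.0;
  for (double c : counts) sq += (c/total) * (c/total);
  return 1.0 - sq;
}

// Shannon entropy: -sum_c p_c * log2(p_c), taking 0*log(0) = 0.
double entropy_impurity(const std::vector<double>& counts)
{
  double total = 0.0, h = 0.0;
  for (double c : counts) total += c;
  if (total <= 0.0) return 0.0;
  for (double c : counts)
    if (c > 0.0) { double p = c/total; h -= p * std::log2(p); }
  return h;
}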
// perform actual computation: find the best (feature, threshold) pair
void forestFindThr( int H, int N, int F, const float *data, const uint32 *hs,
  const float *ws, const uint32 *order, const int split, uint32 &fid,
  float &thr, double &gain )
{
  double *Wl, *Wr, *W; float *data1; uint32 *order1;
  int i, j, j1, j2, h;
  double vBst, vInit, v, w, wl, wr, g, gl, gr;
  Wl = new double[H]; Wr = new double[H]; W = new double[H];

  // perform initialization
  vBst = vInit = 0; g = 0; w = 0; fid = 1; thr = 0;
  for( i=0; i<H; i++ ) W[i] = 0;
  for( j=0; j<N; j++ ) { w += ws[j]; W[hs[j]-1] += ws[j]; }
  if( split==0 ) { for( i=0; i<H; i++ ) g += gini(W[i]); vBst = vInit = (1-g/w/w); }
  if( split==1 ) { for( i=0; i<H; i++ ) g += entropy(W[i]); vBst = vInit = g/w; }

  // loop over features, then thresholds (data is sorted by feature value)
  for( i=0; i<F; i++ ) {
    order1 = (uint32*) order + i*size_t(N);
    data1  = (float*)  data  + i*size_t(N);
    for( j=0; j<H; j++ ) { Wl[j]=0; Wr[j]=W[j]; }
    gl = wl = 0; gr = g; wr = w;
    for( j=0; j<N-1; j++ ) {
      j1 = order1[j]; j2 = order1[j+1]; h = hs[j1]-1;
      if( split==0 ) {
        // gini = 1 - \sum_h p_h^2;  v = gini_l*pl + gini_r*pr
        wl += ws[j1]; gl -= gini(Wl[h]); Wl[h] += ws[j1]; gl += gini(Wl[h]);
        wr -= ws[j1]; gr -= gini(Wr[h]); Wr[h] -= ws[j1]; gr += gini(Wr[h]);
        v = (wl-gl/wl)/w + (wr-gr/wr)/w;
      } else if( split==1 ) {
        // entropy = -\sum_h p_h log(p_h);  v = entropy_l*pl + entropy_r*pr
        gl += entropy(wl); wl += ws[j1]; gl -= entropy(wl);
        gr += entropy(wr); wr -= ws[j1]; gr -= entropy(wr);
        gl -= entropy(Wl[h]); Wl[h] += ws[j1]; gl += entropy(Wl[h]);
        gr -= entropy(Wr[h]); Wr[h] -= ws[j1]; gr += entropy(Wr[h]);
        v = gl/w + gr/w;
      } else if( split==2 ) {
        // twoing: v = pl*pr*\sum_h(|p_h_left - p_h_right|)^2 [slow if H is large]
        wl += ws[j1]; Wl[h] += ws[j1];
        wr -= ws[j1]; Wr[h] -= ws[j1];
        g = 0; for( int h1=0; h1<H; h1++ ) g += fabs(Wl[h1]/wl - Wr[h1]/wr);
        v = -wl/w*wr/w*g*g;
      }
      // accept only if the two adjacent data values are distinguishable
      if( v<vBst && data1[j2]-data1[j1]>=1e-6f ) {
        vBst = v; fid = i+1; thr = 0.5f*(data1[j1]+data1[j2]);
      }
    }
  }
  delete [] Wl; delete [] Wr; delete [] W;
  gain = vInit - vBst;
}
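The single-argument gini()/entropy() helpers that forestFindThr calls are what make each threshold update O(1): the loop maintains running sums of per-class terms instead of recomputing the impurity from scratch. Definitions consistent with the algebra above (a sketch; the originals are likely macros or inlines in the same file):

#include <cmath>

// With per-class weight totals W[h] summing to w:
//   Gini impurity = 1 - sum_h (W[h]/w)^2, so maintaining g = sum_h W[h]^2
//   gives the weighted child impurity as (wl - gl/wl)/w, as used above.
static inline double gini(double p) { return p*p; }

// For entropy, maintaining g = sum_h -W[h]*log2(W[h]) together with the
// -w*log2(w) corrections applied in the loop yields the weighted child
// entropy up to a constant offset shared by every candidate split, so
// comparisons and the final gain are unaffected.
static inline double entropy(double p) { return p > 0.0 ? -p*std::log2(p) : 0.0; }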
/* ************************************************************************
 * Function that implements the Random Feature Selection process.
 * param :
 *   node      : the current node to be split
 *   sortedInd : an array of instance indices sorted by each attribute's values
 *
 * Returns the Rule object for the split procedure produced with the
 * selected criterion.
 */
Rule * RndTree::randomFeatSelection(Node * node, u_int ** sortedInd)
{
  DataHandler * data = node->getDataSet();
  long double bestGain = 0.0;
  double bestSplit = 0.0;
  u_int bestAtt = data->getClassInd();
  bool found = false;
  double w_size = data->w_size();

  // Compute the gini index or entropy value for the current node's subset of data.
  long double eval0;
  if(gin) eval0 = gini(data->getDistrib(), data->getNbClass(), w_size);
  else    eval0 = entropy(data->getDistrib(), data->getNbClass(), w_size);

  // Vector to memorize the attributes not yet evaluated.
  vector<u_int> attWindow;
  for(u_int i=0; i<data->dim(); i++)
    if(i != data->getClassInd()) attWindow.push_back(i);

  // Evaluate up to nbFeat randomly drawn candidate attributes,
  // stopping early once a positive gain has been found.
  int k = nbFeat;
  while((attWindow.size() > 0) && (k > 0) && (!found))
  {
    int r = 0;
    if(attWindow.size() > 1) r = Utils::randInt(attWindow.size());
    u_int attIndex = attWindow[r];

    double split;
    long double gain = evalAttribute(node, attIndex, sortedInd[attIndex], &split, eval0, w_size);
    if(gain > bestGain)
    {
      bestGain = gain;
      bestAtt = attIndex;
      bestSplit = split;
      found = true;
    }
    attWindow.erase(attWindow.begin() + r);
    k--;
  }

  if(!found) return NULL;

  u_int bestAttId = data->getAttribute(bestAtt)->getId();
  if(data->getAttribute(bestAtt)->is_nominal())
    return new Rule(bestAttId, data->getAttribute(bestAtt)->getNbModal());
  else
    return new Rule(bestAttId, bestSplit);
}
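The candidate-sampling pattern above (draw without replacement from attWindow until k attributes have been tried or a gain is found) can be isolated into a small helper. A minimal sketch in standard C++; all names here are hypothetical, not part of the original codebase:

#include <algorithm>
#include <random>
#include <vector>

// Draw up to k candidate attribute indices without replacement,
// skipping the class column -- the same effect as the attWindow
// erase loop above when no early exit occurs.
std::vector<unsigned> sampleCandidates(unsigned dim, unsigned classInd,
                                       unsigned k, std::mt19937& rng)
{
  std::vector<unsigned> window;
  for (unsigned i = 0; i < dim; ++i)
    if (i != classInd) window.push_back(i);
  std::shuffle(window.begin(), window.end(), rng);
  if (window.size() > k) window.resize(k);
  return window;
}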
/* ************************************************************************
 * Function that evaluates the quality of the current split.
 * param :
 *   n        : the weighted size of the current node's subset
 *   nbClass  : the number of possible class values
 *   distribs : a 2D array memorizing the class distribution of each child node to be created
 *   tots     : an array of the total (weighted) size of each child node's subset
 *   nbSplit  : the number of child nodes to be created
 */
long double Cart::eval(double n, u_int nbClass, double ** distribs, double * tots, u_int nbSplit)
{
  long double eval = 0.0;
  for(u_int i=0; i<nbSplit; i++)
  {
    if(tots[i] != 0.0)
    {
      // Impurity of child i, weighted by the fraction of instances it receives.
      long double i_t;
      if(gin) i_t = gini(distribs[i], nbClass, tots[i]);
      else    i_t = entropy(distribs[i], nbClass, tots[i]);
      eval += (tots[i]/n) * i_t;
    }
  }
  return eval;
}
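Combined with eval0 from the selection routines, the gain of a candidate split is the parent impurity minus this weighted child impurity. A standalone numeric check with hypothetical two-class counts:

#include <cstdio>

// Gini impurity of a two-class node with counts a and b.
static double gini_of(double a, double b)
{
  double n = a + b, pa = a/n, pb = b/n;
  return 1.0 - pa*pa - pb*pb;
}

int main()
{
  // Parent: 10 samples, classes {6,4} -> gini = 0.48.
  double eval0 = gini_of(6, 4);
  // Children: {6,1} (7 samples) and {0,3} (3 samples, pure).
  double eval = (7.0/10.0) * gini_of(6, 1) + (3.0/10.0) * gini_of(0, 3);
  std::printf("gain = %.4f\n", eval0 - eval);   // ~0.3086
  return 0;
}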
/* ************************************************************************
 * Function that implements the Feature Selection process.
 * param :
 *   node      : the current node to be split
 *   sortedInd : an array of instance indices sorted by each attribute's values
 *
 * Returns the Rule object for the split procedure produced with the
 * selected criterion.
 */
Rule * RndTree::featSelection(Node * node, u_int ** sortedInd)
{
  DataHandler * data = node->getDataSet();
  long double bestGain = 0.0;
  double bestSplit = 0.0;
  u_int bestAtt = data->getClassInd();   // sentinel: the class column is never a candidate
  bool found = false;
  double w_size = data->w_size();

  // Impurity of the current node's subset, under the selected criterion.
  long double eval0;
  if(gin) eval0 = gini(data->getDistrib(), data->getNbClass(), w_size);
  else    eval0 = entropy(data->getDistrib(), data->getNbClass(), w_size);

  // Exhaustively evaluate every attribute except the class column.
  for(u_int attIndex=0; attIndex<data->dim(); attIndex++)
  {
    if(attIndex == data->getClassInd()) continue;
    double split;
    long double gain = evalAttribute(node, attIndex, sortedInd[attIndex], &split, eval0, w_size);
    if(gain > bestGain)
    {
      bestGain = gain;
      bestAtt = attIndex;
      bestSplit = split;
      found = true;
    }
  }

  if(!found) return NULL;

  // data is owned by the node and must not be deleted here; the attribute
  // pointer below would otherwise dangle.
  Attribute * ah = data->getAttribute(bestAtt);
  u_int bestAttId = ah->getId();
  if(ah->is_nominal())
    return new Rule(bestAttId, ah->getNbModal());
  else
    return new Rule(bestAttId, bestSplit);
}