Example #1
RcppExport SEXP rthgini(SEXP x_, SEXP mu_, SEXP unbiased_, SEXP nthreads)
{
  Rcpp::NumericVector x(x_);
  Rcpp::NumericVector mu(mu_);   // precomputed mean of x
  const int unbiased = INTEGER(unbiased_)[0];
  const int n = LENGTH(x);
  Rcpp::NumericVector gini(1);
  
  // set the backend's thread count (Rth's INT macro extracts the scalar)
  #if RTH_OMP
  omp_set_num_threads(INT(nthreads));
  #elif RTH_TBB
  tbb::task_scheduler_init init(INT(nthreads));
  #endif
  
  // copy the data to the compute backend and sort ascending, since the
  // Gini formula below weights each value by its rank
  thrust::device_vector<double> dx(x.begin(), x.end());
  thrust::sort(dx.begin(), dx.end());
  
  // reduce over the index range [0, n): each index pulls the corresponding
  // sorted value and contributes its rank-weighted term
  thrust::counting_iterator<int> begin(0);
  thrust::counting_iterator<int> end = begin + n;
  
  thrust::plus<flouble> binop;   // flouble: Rth's float-or-double typedef
  gini[0] = (double) thrust::transform_reduce(begin, end, compute_gini(n, dx.begin()), (flouble) 0., binop);
  
  // normalize: the unbiased estimator divides by n(n-1), the biased one by n^2
  if (unbiased)
    gini[0] = gini[0]/n/(n-1)/mu[0];
  else
    gini[0] = gini[0]/n/n/mu[0];
  
  return gini;
}
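
The transform_reduce call relies on a compute_gini functor that is not shown in this listing. The sketch below is a plausible reconstruction, an assumption based on how the result is normalized rather than the package's verbatim source: with dx sorted ascending, index i contributes (2*(i+1) - n - 1) * dx[i] to the Gini numerator.

struct compute_gini
{
  const int n;
  const thrust::device_vector<double>::iterator x;   // sorted data
  compute_gini(int n_, thrust::device_vector<double>::iterator x_) : n(n_), x(x_) {}
  __host__ __device__
  flouble operator()(const int i) const
  {
    // rank-weighted term of the Gini numerator: the 0-indexed form of
    // sum_{i=1..n} (2i - n - 1) * x_(i)
    return (flouble) ((2. * (i + 1) - n - 1) * *(x + i));
  }
};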
Example #2
double information(InternalMatrix& inMx)
{
    switch(information_measure) {
        case InfoType::gini :
            return gini(inMx);
        case InfoType::entropy :
            return entropy(inMx);
    }
    // falling off the end of a value-returning function is undefined
    // behavior; default to the Gini measure for any unhandled enum value
    return gini(inMx);
}
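
InternalMatrix, information_measure, and the two helpers are defined elsewhere in that project. As a minimal sketch of the two measures being dispatched to, assuming the matrix reduces to per-class counts (hypothetical signatures, not the project's API):

#include <cmath>
#include <vector>

// Gini impurity: 1 - sum_c p_c^2, with p_c the class proportions
double gini(const std::vector<double>& counts)
{
    double total = 0.0, sumSq = 0.0;
    for (double c : counts) total += c;
    for (double c : counts) { double p = c / total; sumSq += p * p; }
    return 1.0 - sumSq;
}

// Shannon entropy: -sum_c p_c log2(p_c), skipping empty classes
double entropy(const std::vector<double>& counts)
{
    double total = 0.0, h = 0.0;
    for (double c : counts) total += c;
    for (double c : counts)
        if (c > 0.0) { double p = c / total; h -= p * std::log2(p); }
    return h;
}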
Example #3
// perform actual computation
void forestFindThr( int H, int N, int F, const float *data,
  const uint32 *hs, const float *ws, const uint32 *order, const int split,
  uint32 &fid, float &thr, double &gain )
{
  double *Wl, *Wr, *W; float *data1; uint32 *order1;
  int i, j, j1, j2, h; double vBst, vInit, v, w, wl, wr, g, gl, gr;
  Wl=new double[H]; Wr=new double[H]; W=new double[H];
  // perform initialization
  vBst = vInit = 0; g = 0; w = 0; fid = 1; thr = 0;
  for( i=0; i<H; i++ ) W[i] = 0;
  for( j=0; j<N; j++ ) { w+=ws[j]; W[hs[j]-1]+=ws[j]; }
  if( split==0 ) { for( i=0; i<H; i++ ) g+=gini(W[i]); vBst=vInit=(1-g/w/w); }
  if( split==1 ) { for( i=0; i<H; i++ ) g+=entropy(W[i]); vBst=vInit=g/w; }
  // loop over features, then thresholds (data is sorted by feature value)
  for( i=0; i<F; i++ ) {
    order1=(uint32*) order+i*N; data1=(float*) data+i*size_t(N);
    for( j=0; j<H; j++ ) { Wl[j]=0; Wr[j]=W[j]; } gl=wl=0; gr=g; wr=w;
    for( j=0; j<N-1; j++ ) {
      j1=order1[j]; j2=order1[j+1]; h=hs[j1]-1;
      if(split==0) {
        // gini = 1-\sum_h p_h^2; v = gini_l*pl + gini_r*pr
        wl+=ws[j1]; gl-=gini(Wl[h]); Wl[h]+=ws[j1]; gl+=gini(Wl[h]);
        wr-=ws[j1]; gr-=gini(Wr[h]); Wr[h]-=ws[j1]; gr+=gini(Wr[h]);
        v = (wl-gl/wl)/w + (wr-gr/wr)/w;
      } else if (split==1) {
        // entropy = -\sum_h p_h log(p_h); v = entropy_l*pl + entropy_r*pr
        gl+=entropy(wl); wl+=ws[j1]; gl-=entropy(wl);
        gr+=entropy(wr); wr-=ws[j1]; gr-=entropy(wr);
        gl-=entropy(Wl[h]); Wl[h]+=ws[j1]; gl+=entropy(Wl[h]);
        gr-=entropy(Wr[h]); Wr[h]-=ws[j1]; gr+=entropy(Wr[h]);
        v = gl/w + gr/w;
      } else if (split==2) {
        // twoing: v = pl*pr*\sum_h(|p_h_left - p_h_right|)^2 [slow if H>>0]
        wl+=ws[j1]; Wl[h]+=ws[j1]; wr-=ws[j1]; Wr[h]-=ws[j1];
        g=0; for( int h1=0; h1<H; h1++ ) g+=fabs(Wl[h1]/wl-Wr[h1]/wr);
        v = - wl/w*wr/w*g*g;
      }
      if( v<vBst && data1[j2]-data1[j1]>=1e-6f ) {
        vBst=v; fid=i+1; thr=0.5f*(data1[j1]+data1[j2]); }
    }
  }
  delete [] Wl; delete [] Wr; delete [] W; gain = vInit-vBst;
}
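
The gini and entropy helpers are not part of this listing. Unnormalized definitions along these lines would be consistent with the bookkeeping above (an assumption): with gini(p) = p^2, vInit = 1 - g/w/w is the node's Gini impurity, and with entropy(p) = -p*log(p) the incremental updates leave gl and gr equal to wl and wr times the left and right child entropies.

#include <cmath>

static inline double gini(double p)    { return p * p; }   // squared class weight
static inline double entropy(double p) { return p > 0.0 ? -p * std::log(p) : 0.0; }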
Example #4
/* ************************************************************************
 * Function that implements the Random Feature Selection process.
 * param :
 * 		node : the current node to be split
 * 		sortedInd : an array of instance indices sorted by each attribute's values.
 *
 * Returns the Rule object for the split procedure, produced with the selected criterion.
 */
Rule * 		RndTree::randomFeatSelection(Node * node, u_int ** sortedInd)
{
    DataHandler * data = node->getDataSet();
    long double bestGain = 0.0;
    double bestSplit = 0.0;
    u_int bestAtt = data->getClassInd();
    bool found = false;

    double w_size = data->w_size();

    // Compute the gini index or entropy value for the current node subset of data.
    long double eval0;
    if(gin) eval0 = gini(data->getDistrib(),data->getNbClass(),w_size);
    else eval0 = entropy(data->getDistrib(),data->getNbClass(),w_size);

    // keep a vector of the candidate attributes not yet evaluated (class index excluded).
    vector<u_int> attWindow;
    for(u_int i=0; i<data->dim(); i++)
        if(i != data->getClassInd()) attWindow.push_back(i);

    int k = nbFeat;
    // draw up to k candidate attributes at random, without replacement,
    // stopping early once a split with positive gain has been found
    while((attWindow.size() > 0) && (k > 0) && (!found))
    {
        int r = 0;
        if(attWindow.size() > 1) r = Utils::randInt(attWindow.size());
        u_int attIndex = attWindow[r];

        double split;

        long double gain = evalAttribute(node,attIndex,sortedInd[attIndex],&split,eval0,w_size);

        if(gain > bestGain)
        {
            bestGain = gain;
            bestAtt = attIndex;
            bestSplit = split;
            found = true;
        }

        attWindow.erase(attWindow.begin()+r);
        k--;
    }

    if(!found) return NULL;

    u_int bestAttId = data->getAttribute(bestAtt)->getId();

    if(data->getAttribute(bestAtt)->is_nominal()) return new Rule(bestAttId,data->getAttribute(bestAtt)->getNbModal());
    else return new Rule(bestAttId,bestSplit);
}
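
evalAttribute is not shown here, but the way eval0 and w_size are threaded through suggests the gain it returns is an impurity decrease. A minimal sketch of that relationship (an assumption, with hypothetical names splitGain/childEval/childWeight):

#include <cstddef>

// gain = i(t) - sum_k (n_k / n) * i(t_k): parent impurity minus the
// weighted impurity of the child subsets produced by the candidate split
long double splitGain(long double eval0,
                      const long double * childEval,
                      const double * childWeight,
                      double parentWeight, std::size_t nbSplit)
{
    long double weighted = 0.0;
    for (std::size_t k = 0; k < nbSplit; k++)
        weighted += (childWeight[k] / parentWeight) * childEval[k];
    return eval0 - weighted;   // larger gain means purer children
}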
Example #5
File: cart.cpp Project: ngoix/OCRF
/* ************************************************************************
 * Function that evaluates the quality of the current split
 * param :
 * 		n : the weighted size of the current node's subset
 * 		nbClass : the number of possible class values
 * 		distribs : a 2D array storing the class distribution of each child node to be created
 * 		tots : an array of the total size of each child node subset
 * 		nbSplit : the number of child nodes to be created
 */
long double 	Cart::eval(double n, u_int nbClass, double ** distribs, double * tots, u_int nbSplit)
{
    long double eval = 0.0;

    // weighted average of child impurities: sum_k (tots[k]/n) * i(t_k)
    for(u_int i=0; i<nbSplit; i++)
    {
        if(tots[i] != 0.0)
        {
            long double i_t;
            if(gin) i_t = gini(distribs[i],nbClass,tots[i]);
            else i_t = entropy(distribs[i],nbClass,tots[i]);
            eval += ((tots[i]/n) * i_t);
        }
    }

    return eval;
}
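
The gini helper called from eval is defined elsewhere in OCRF; below is a minimal sketch with the same signature (an assumption about the implementation, not the project's verbatim code):

typedef unsigned int u_int;   // matching the alias used in the listing (assumption)

long double gini(double * distrib, u_int nbClass, double tot)
{
    long double sumSq = 0.0;
    for (u_int c = 0; c < nbClass; c++) {
        long double p = distrib[c] / tot;
        sumSq += p * p;
    }
    return 1.0 - sumSq;   // Gini impurity i(t) of the subset
}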
Example #6
/* ************************************************************************
 * Function that implements the Feature Selection process.
 * param :
 * 		node : the current node to be split
 * 		sortedInd : an array of instance indices sorted by each attribute's values.
 *
 * Returns the Rule object for the split procedure, produced with the selected criterion.
 */
Rule * 		RndTree::featSelection(Node * node, u_int ** sortedInd)
{
    DataHandler * data = node->getDataSet();
    long double bestGain = 0.0;
    double bestSplit = 0.0;
    u_int bestAtt = data->getClassInd(); // sentinel: the class index is never a valid split attribute
    bool found = false;

    double w_size = data->w_size();

    long double eval0;
    if(gin) eval0 = gini(data->getDistrib(),data->getNbClass(),w_size);
    else eval0 = entropy(data->getDistrib(),data->getNbClass(),w_size);

    for(u_int attIndex=0; attIndex<data->dim(); attIndex++)
    {
        if(attIndex == data->getClassInd()) continue;

        double split;
        long double gain = evalAttribute(node,attIndex,sortedInd[attIndex],&split,eval0,w_size);
        if(gain > bestGain)
        {
            bestGain = gain;
            bestAtt = attIndex;
            bestSplit = split;
            found = true;
        }
    }

    if(!found) return NULL;

    u_int bestAttId = data->getAttribute(bestAtt)->getId();
    // cache the attribute before releasing the node's dataset; this assumes
    // the Attribute object outlives the DataHandler that exposed it
    Attribute* ah = data->getAttribute(bestAtt);
    delete data;
    if(ah->is_nominal()) return new Rule(bestAttId,ah->getNbModal());
    else return new Rule(bestAttId,bestSplit);
}
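
For completeness, the entropy counterpart to the gini sketch after Example #5, under the same assumptions about OCRF's actual implementation:

#include <cmath>

typedef unsigned int u_int;   // matching the alias used in the listing (assumption)

long double entropy(double * distrib, u_int nbClass, double tot)
{
    long double h = 0.0;
    for (u_int c = 0; c < nbClass; c++) {
        if (distrib[c] > 0.0) {
            long double p = distrib[c] / tot;
            h -= p * std::log2((double) p);
        }
    }
    return h;   // Shannon entropy of the weighted class distribution
}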