Example #1
0
File: dist.c Project: cran/mvpart
/*
 * dist -- search one predictor for the best binary split of a node, scoring
 *  candidate splits from per-pair values reached through y.
 *
 *  n        number of observations in the node
 *  y        array of pointers; the value for the pair (j,k), j<k, is read as
 *            *y[rp.n*j - j*(j+1)/2 + k-j-1].  That offset matches a strict
 *            upper triangle of an rp.n by rp.n matrix packed row by row.
 *            NOTE(review): j and k are positions inside this node while the
 *            stride is the global rp.n -- confirm how the caller lays out y.
 *  x        predictor values.  With nclass==0 neighbouring entries are
 *            compared and averaged (presumably x arrives sorted -- confirm
 *            against the caller); otherwise x[i]-1 is used as a category index.
 *  nclass   0 selects the ordered search, anything else is taken as the
 *            number of categories of x
 *  edge     smallest number of observations accepted on either side
 *  improve  output: best / myrisk
 *  split    output, ordered search only: midpoint of x[where] and x[where+1]
 *  csplit   output: csplit[0] = direction for the ordered search, or one
 *            entry per category for the categorical search
 *  myrisk   divisor applied to the best score
 *  wt       not referenced anywhere in this function
 *
 *  Uses names that are declared outside this function: rp, count, countn,
 *  dsts, tsplit, LEFT, RIGHT, FLOAT, graycode_init1 and graycode.
 */
void dist(int n,    double *y[],  FLOAT *x,     int nclass, 
       int edge, double *improve, FLOAT *split, int *csplit, double myrisk, double *wt)
    {
    int i, j, k, kj;
    double temp, sumdiffs_sq;
    double left_sum, right_sum;
/*    double left_wt, right_wt;  */
    int left_n, right_n;
    double best, total;
    int direction = LEFT;
    int where = 0;              /* only used by the nclass==0 path */

    /* every observation starts on the right-hand side */
    right_n = n;
        
    /* nclass==0: candidate cuts lie between consecutive observations */
    if (nclass==0) {
    
    left_n=0;
    best=0;
    
    /* total = (sum of the pair values over all j<k<n) / n */
    total=0;
    for (k=1; k<n; k++)
    for (j=0; j<k; j++) 
    total += *y[rp.n*j-j*(j+1)/2+k-j-1];
    total = total/n;    

    /*
    ** Each pass moves observation i to the left, so the left side is
    ** 0..i and the right side is i+1..n-1.  Both within-side sums are
    ** rebuilt from scratch on every pass.  The loop stops once the right
    ** side is down to edge observations.
    */
    for (i=0; right_n>edge; i++) {
	    /* sumdiffs_sq is assigned here but never read */
	    temp=0; sumdiffs_sq=0; left_n++;  right_n--;
		right_sum=0; left_sum=0;

		/* left_sum = (sum of pair values among 0..i) / (i+1) */
		if (i==0) left_sum=0;
		else {
			for (k=1; k<=i; k++)
			for (j=0; j<k; j++)  
			left_sum += *y[rp.n*j-j*(j+1)/2+k-j-1];
			left_sum = left_sum/(i+1);
		}
		
		/* right_sum = (sum of pair values among i+1..n-1) / (n-i-1) */
		if (i==(n-1)) right_sum=0;
		else {
			for (k=i+2; k<n; k++) 
			for (j=i+1; j<k; j++) 
			right_sum += *y[rp.n*j-j*(j+1)/2+k-j-1];
			right_sum = right_sum/(n-i-1);
		}

        /*
        ** A cut is scored only between two different x values and only
        ** when the left side already holds at least edge observations.
        */
        if (x[i+1] !=x[i] &&  left_n>=edge) {
	        temp = total-left_sum-right_sum;

        if (temp > best) {
            best = temp;
            where = i;
            /* direction records which side has the larger scaled sum */
            if (left_sum > right_sum) direction = LEFT;
                      else    direction = RIGHT;
            }
        }
    }

    /* the same assignment is repeated after the if/else, at the end of the function */
    *improve =  best/ myrisk;
    if (best>0) {   /* found something */
        csplit[0] = direction;
        *split = (x[where] + x[where+1]) /2;
        }
    }

    /* nclass!=0: x holds category codes, x[i]-1 is the category index */
    else {
    
/*
**  Do the easy coding for now - gd
**  Take countn and dsts as square matrices and fold them over
**  Fix it up later !!! 
*/
    /* clear the per-category and per-category-pair tallies */
    for (i=0; i<nclass; i++) {
        count[i] =0;
    for (j=0; j<nclass; j++) {
        countn[i+nclass*j] =0;
        dsts[i+nclass*j] =0;
    }
    }

    /*
    ** count[k] = number of observations in category k.
    ** For every pair j<i, with k the category of i and kj the category
    ** of j, countn[k+nclass*kj] counts the pair and dsts[k+nclass*kj]
    ** adds up its value.
    */
    k = x[0]-1;
    count[k]++;

    for (i=1; i<n; i++) {
    k = x[i]-1;
    count[k]++;
    for (j=0; j<i; j++) {
        kj = x[j]-1;    
        countn[k+nclass*kj]++;
            dsts[k+nclass*kj] += *y[rp.n*j-j*(j+1)/2+i-j-1];       
    }
    }

    /*
    ** Fold: for j<i add the mirrored cell into [i+nclass*j], so that the
    ** cells with j<=i hold the totals for the unordered category pair.
    ** Diagonal cells are left as they are.
    */
    for (i=0; i<nclass; i++) 
    for (j=0; j<=i; j++) {
    if (i!=j) {
        countn[i+nclass*j]=countn[i+nclass*j]+countn[j+nclass*i];
            dsts[i+nclass*j]=dsts[i+nclass*j]+dsts[j+nclass*i];    
    }
    }

        /* categories with no observations get 0, all others start on the RIGHT */
        for (i=0; i<nclass; i++) {
        if (count[i]==0) tsplit[i] = 0;
        else tsplit[i] = RIGHT;
    }

    /*
    ** total = sum of the folded dsts cells over all pairs of non-empty
    ** categories; it is divided by n later, when a candidate is scored.
    */
    total = 0;
    for (k=0; k<nclass; k++) 
        if (tsplit[k]!=0) {
        for (j=0; j<=k; j++) 
            if (tsplit[j]!=0) 
        total += dsts[k+nclass*j];
    }

    /*
    ** Now find the split that we want
    */

    best = 0;
    /*
    ** Insert gray code bit here
    */

/*  if (numclass==2) graycode_init2(nclass, count, rate);
**              else graycode_init1(nclass, count);
**
**     Just use graycode_init1 here -- gd
*/

    graycode_init1(nclass, count);

    /* graycode() yields the next category to switch; a value >= nclass ends the search */
    while((i=graycode()) < nclass) {

/* item i changes groups */

    left_n =0;  right_n = 0;
    left_sum = 0; right_sum = 0; 

    /*
    ** Anything that is not LEFT becomes LEFT, including a tsplit of 0.
    ** NOTE(review): presumably graycode() never returns an empty
    ** category -- confirm.
    */
    if (tsplit[i]==LEFT)  tsplit[i]=RIGHT;
    else tsplit[i]=LEFT;
        
    /*
    ** Re-accumulate, for each side, the pair count and the pair-value sum
    ** over every pair of categories that are both on that side.
    */
    for (k=0; k<nclass; k++) 
        if (tsplit[k]==LEFT) {
        for (j=0; j<=k; j++) 
            if (tsplit[j]==LEFT)   {        
            left_n += countn[k+nclass*j];
            left_sum += dsts[k+nclass*j]; 
            }
        }
        else if (tsplit[k]==RIGHT) {
        for (j=0; j<=k; j++) 
            if (tsplit[j]==RIGHT)   {       
            right_n += countn[k+nclass*j];
            right_sum += dsts[k+nclass*j];   
                }
        }

    /*
    ** left_n and right_n hold pair counts at this point.  A group of m
    ** observations has m(m-1)/2 pairs, and 2 * that + 0.25 equals
    ** (m - 0.5) squared, so the lines below turn each pair count back
    ** into the group size m.
    */
    left_n = (int) (sqrt(2*left_n+0.25)+0.5);   
    right_n = (int) (sqrt(2*right_n+0.25)+0.5); 
    
    /* NOTE(review): a side of size 0 would divide by zero below if edge<=0 -- confirm edge>=1 */
    if (left_n>=edge  &&  right_n>=edge) {
    temp = total/n - left_sum/left_n - right_sum/right_n;

        if (temp > best) {
                best=temp;
                /*
                ** Store the current assignment, negated when left_sum is
                ** not the larger sum (presumably LEFT == -RIGHT, so that
                ** negation swaps the sides -- confirm).
                */
                if (left_sum > right_sum)
                for (j=0; j<nclass; j++) csplit[j] = tsplit[j];
            else
                for (j=0; j<nclass; j++) csplit[j] = -tsplit[j];
                }
        }
        }
    }
    /* reached by both branches of the if/else above */
    *improve = best / myrisk;      /* % improvement */

  }
Example #2
0
File: gini.c Project: csilles/cxxr
/*
 * The gini splitting function.  Find that split point in x such that
 *  the rss within the two groups is decreased as much
 *  as possible.
 */
void
gini(int n, double *y[], double *x, int numcat,
     int edge, double *improve, double *split, int *csplit, double my_risk,
     double *wt)
{
    int i, j, k;
    double lwt, rwt;
    int rtot, ltot;
    int direction = LEFT, where = 0;
    double total_ss, best, temp, p;
    double lmean, rmean;        /* used to decide direction */

    for (i = 0; i < numclass; i++) {
	left[i] = 0;
	right[i] = 0;
    }
    lwt = 0;
    rwt = 0;
    rtot = 0;
    ltot = 0;
    for (i = 0; i < n; i++) {
	j = (int) *y[i] - 1;
	rwt += aprior[j] * wt[i];  /* altered weight = prior * case_weight */
	right[j] += wt[i];
	rtot++;
    }
    total_ss = 0;
    for (i = 0; i < numclass; i++) {
	temp = aprior[i] * right[i] / rwt;      /* p(class=i, given node A) */
	total_ss += rwt * (*impurity) (temp);   /* p(A) * I(A) */
    }
    best = total_ss;  /* total weight of right * impurity of right + 0 *0 */

    /*
     * at this point we split into 2 disjoint paths
     */
    if (numcat > 0)
	goto categorical;

    for (i = 0; rtot > edge; i++) {
	j = (int) *y[i] - 1;
	rwt -= aprior[j] * wt[i];
	lwt += aprior[j] * wt[i];
	rtot--;
	ltot++;
	right[j] -= wt[i];
	left[j] += wt[i];

	if (x[i + 1] != x[i] && (ltot >= edge)) {
	    temp = 0;
	    lmean = 0;
	    rmean = 0;
	    for (j = 0; j < numclass; j++) {
		p = aprior[j] * left[j] / lwt;  /* p(j | left) */
		temp += lwt * (*impurity) (p);  /* p(left) * I(left) */
		lmean += p * j;
		p = aprior[j] * right[j] / rwt; /* p(j | right) */
		temp += rwt * (*impurity) (p);  /* p(right) * I(right) */
		rmean += p * j;
	    }
	    if (temp < best) {
		best = temp;
		where = i;
		direction = lmean < rmean ? LEFT : RIGHT;
	    }
	}
    }

    *improve = total_ss - best;
    if (*improve > 0) {         /* found something */
	csplit[0] = direction;
	*split = (x[where] + x[where + 1]) / 2;
    }
    return;

categorical:;
    /*
     * First collapse the data into a numclass x numcat array
     *  ccnt[i][j] = number of class i obs, category j of the predictor
     */
    for (j = 0; j < numcat; j++) {
	awt[j] = 0;
	countn[j] = 0;
	for (i = 0; i < numclass; i++)
	    ccnt[i][j] = 0;
    }
    for (i = 0; i < n; i++) {
	j = (int) *y[i] - 1;
	k = (int) x[i] - 1;
	awt[k] += aprior[j] * wt[i];
	countn[k]++;
	ccnt[j][k] += wt[i];
    }

    for (i = 0; i < numcat; i++) {
	if (awt[i] == 0)
	    tsplit[i] = 0;
	else {
	    rate[i] = ccnt[0][i] / awt[i];      /* a scratch array */
	    tsplit[i] = RIGHT;
	}
    }

    if (numclass == 2)
	graycode_init2(numcat, countn, rate);
    else
	graycode_init1(numcat, countn);

    while ((i = graycode()) < numcat) {
       /* item i changes groups */
	if (tsplit[i] == LEFT) {
	    tsplit[i] = RIGHT;
	    rwt += awt[i];
	    lwt -= awt[i];
	    rtot += countn[i];
	    ltot -= countn[i];
	    for (j = 0; j < numclass; j++) {
		right[j] += ccnt[j][i];
		left[j] -= ccnt[j][i];
	    }
	} else {
	    tsplit[i] = LEFT;
	    rwt -= awt[i];
	    lwt += awt[i];
	    rtot -= countn[i];
	    ltot += countn[i];
	    for (j = 0; j < numclass; j++) {
		right[j] -= ccnt[j][i];
		left[j] += ccnt[j][i];
	    }
	}

	if (ltot >= edge && rtot >= edge) {
	    temp = 0;
	    lmean = 0;
	    rmean = 0;
	    for (j = 0; j < numclass; j++) {
		p = aprior[j] * left[j] / lwt;
		temp += lwt * (*impurity) (p);
		lmean += p * j;
		p = aprior[j] * right[j] / rwt; /* p(j | right) */
		temp += rwt * (*impurity) (p);  /* p(right) * I(right) */
		rmean += p * j;
	    }
	    if (temp < best) {
		best = temp;
		if (lmean < rmean)
		    for (j = 0; j < numcat; j++) csplit[j] = tsplit[j];
		else
		    for (j = 0; j < numcat; j++) csplit[j] = -tsplit[j];
	    }
	}
    }
    *improve = total_ss - best;
}