void dist(int n, double *y[], FLOAT *x, int nclass, int edge, double *improve, FLOAT *split, int *csplit, double myrisk, double *wt) { int i, j, k, kj; double temp, sumdiffs_sq; double left_sum, right_sum; /* double left_wt, right_wt; */ int left_n, right_n; double best, total; int direction = LEFT; int where = 0; right_n = n; if (nclass==0) { left_n=0; best=0; total=0; for (k=1; k<n; k++) for (j=0; j<k; j++) total += *y[rp.n*j-j*(j+1)/2+k-j-1]; total = total/n; for (i=0; right_n>edge; i++) { temp=0; sumdiffs_sq=0; left_n++; right_n--; right_sum=0; left_sum=0; if (i==0) left_sum=0; else { for (k=1; k<=i; k++) for (j=0; j<k; j++) left_sum += *y[rp.n*j-j*(j+1)/2+k-j-1]; left_sum = left_sum/(i+1); } if (i==(n-1)) right_sum=0; else { for (k=i+2; k<n; k++) for (j=i+1; j<k; j++) right_sum += *y[rp.n*j-j*(j+1)/2+k-j-1]; right_sum = right_sum/(n-i-1); } if (x[i+1] !=x[i] && left_n>=edge) { temp = total-left_sum-right_sum; if (temp > best) { best = temp; where = i; if (left_sum > right_sum) direction = LEFT; else direction = RIGHT; } } } *improve = best/ myrisk; if (best>0) { /* found something */ csplit[0] = direction; *split = (x[where] + x[where+1]) /2; } } else { /* ** Do the easy coding for now - gd ** Take countn and dsts as square matrices and fold them over ** Fix it up later !!! */ for (i=0; i<nclass; i++) { count[i] =0; for (j=0; j<nclass; j++) { countn[i+nclass*j] =0; dsts[i+nclass*j] =0; } } k = x[0]-1; count[k]++; for (i=1; i<n; i++) { k = x[i]-1; count[k]++; for (j=0; j<i; j++) { kj = x[j]-1; countn[k+nclass*kj]++; dsts[k+nclass*kj] += *y[rp.n*j-j*(j+1)/2+i-j-1]; } } for (i=0; i<nclass; i++) for (j=0; j<=i; j++) { if (i!=j) { countn[i+nclass*j]=countn[i+nclass*j]+countn[j+nclass*i]; dsts[i+nclass*j]=dsts[i+nclass*j]+dsts[j+nclass*i]; } } for (i=0; i<nclass; i++) { if (count[i]==0) tsplit[i] = 0; else tsplit[i] = RIGHT; } total = 0; for (k=0; k<nclass; k++) if (tsplit[k]!=0) { for (j=0; j<=k; j++) if (tsplit[j]!=0) total += dsts[k+nclass*j]; } /* ** Now find the split that we want */ best = 0; /* ** Insert gray code bit here */ /* if (numclass==2) graycode_init2(nclass, count, rate); ** else graycode_init1(nclass, count); ** ** Just use graycode_init1 here -- gd */ graycode_init1(nclass, count); while((i=graycode()) < nclass) { /* item i changes groups */ left_n =0; right_n = 0; left_sum = 0; right_sum = 0; if (tsplit[i]==LEFT) tsplit[i]=RIGHT; else tsplit[i]=LEFT; for (k=0; k<nclass; k++) if (tsplit[k]==LEFT) { for (j=0; j<=k; j++) if (tsplit[j]==LEFT) { left_n += countn[k+nclass*j]; left_sum += dsts[k+nclass*j]; } } else if (tsplit[k]==RIGHT) { for (j=0; j<=k; j++) if (tsplit[j]==RIGHT) { right_n += countn[k+nclass*j]; right_sum += dsts[k+nclass*j]; } } left_n = (int) (sqrt(2*left_n+0.25)+0.5); right_n = (int) (sqrt(2*right_n+0.25)+0.5); if (left_n>=edge && right_n>=edge) { temp = total/n - left_sum/left_n - right_sum/right_n; if (temp > best) { best=temp; if (left_sum > right_sum) for (j=0; j<nclass; j++) csplit[j] = tsplit[j]; else for (j=0; j<nclass; j++) csplit[j] = -tsplit[j]; } } } } *improve = best / myrisk; /* % improvement */ }
/* * The gini splitting function. Find that split point in x such that * the rss within the two groups is decreased as much * as possible. */ void gini(int n, double *y[], double *x, int numcat, int edge, double *improve, double *split, int *csplit, double my_risk, double *wt) { int i, j, k; double lwt, rwt; int rtot, ltot; int direction = LEFT, where = 0; double total_ss, best, temp, p; double lmean, rmean; /* used to decide direction */ for (i = 0; i < numclass; i++) { left[i] = 0; right[i] = 0; } lwt = 0; rwt = 0; rtot = 0; ltot = 0; for (i = 0; i < n; i++) { j = (int) *y[i] - 1; rwt += aprior[j] * wt[i]; /* altered weight = prior * case_weight */ right[j] += wt[i]; rtot++; } total_ss = 0; for (i = 0; i < numclass; i++) { temp = aprior[i] * right[i] / rwt; /* p(class=i, given node A) */ total_ss += rwt * (*impurity) (temp); /* p(A) * I(A) */ } best = total_ss; /* total weight of right * impurity of right + 0 *0 */ /* * at this point we split into 2 disjoint paths */ if (numcat > 0) goto categorical; for (i = 0; rtot > edge; i++) { j = (int) *y[i] - 1; rwt -= aprior[j] * wt[i]; lwt += aprior[j] * wt[i]; rtot--; ltot++; right[j] -= wt[i]; left[j] += wt[i]; if (x[i + 1] != x[i] && (ltot >= edge)) { temp = 0; lmean = 0; rmean = 0; for (j = 0; j < numclass; j++) { p = aprior[j] * left[j] / lwt; /* p(j | left) */ temp += lwt * (*impurity) (p); /* p(left) * I(left) */ lmean += p * j; p = aprior[j] * right[j] / rwt; /* p(j | right) */ temp += rwt * (*impurity) (p); /* p(right) * I(right) */ rmean += p * j; } if (temp < best) { best = temp; where = i; direction = lmean < rmean ? LEFT : RIGHT; } } } *improve = total_ss - best; if (*improve > 0) { /* found something */ csplit[0] = direction; *split = (x[where] + x[where + 1]) / 2; } return; categorical:; /* * First collapse the data into a numclass x numcat array * ccnt[i][j] = number of class i obs, category j of the predictor */ for (j = 0; j < numcat; j++) { awt[j] = 0; countn[j] = 0; for (i = 0; i < numclass; i++) ccnt[i][j] = 0; } for (i = 0; i < n; i++) { j = (int) *y[i] - 1; k = (int) x[i] - 1; awt[k] += aprior[j] * wt[i]; countn[k]++; ccnt[j][k] += wt[i]; } for (i = 0; i < numcat; i++) { if (awt[i] == 0) tsplit[i] = 0; else { rate[i] = ccnt[0][i] / awt[i]; /* a scratch array */ tsplit[i] = RIGHT; } } if (numclass == 2) graycode_init2(numcat, countn, rate); else graycode_init1(numcat, countn); while ((i = graycode()) < numcat) { /* item i changes groups */ if (tsplit[i] == LEFT) { tsplit[i] = RIGHT; rwt += awt[i]; lwt -= awt[i]; rtot += countn[i]; ltot -= countn[i]; for (j = 0; j < numclass; j++) { right[j] += ccnt[j][i]; left[j] -= ccnt[j][i]; } } else { tsplit[i] = LEFT; rwt -= awt[i]; lwt += awt[i]; rtot -= countn[i]; ltot += countn[i]; for (j = 0; j < numclass; j++) { right[j] -= ccnt[j][i]; left[j] += ccnt[j][i]; } } if (ltot >= edge && rtot >= edge) { temp = 0; lmean = 0; rmean = 0; for (j = 0; j < numclass; j++) { p = aprior[j] * left[j] / lwt; temp += lwt * (*impurity) (p); lmean += p * j; p = aprior[j] * right[j] / rwt; /* p(j | right) */ temp += rwt * (*impurity) (p); /* p(right) * I(right) */ rmean += p * j; } if (temp < best) { best = temp; if (lmean < rmean) for (j = 0; j < numcat; j++) csplit[j] = tsplit[j]; else for (j = 0; j < numcat; j++) csplit[j] = -tsplit[j]; } } } *improve = total_ss - best; }