/*----------------------------------------------------------------------*/
/*
 * regForest: predict with a stored regression forest.
 *
 * Every tree is evaluated on the *n cases in x and the per-tree
 * predictions are added into ypred, which is finally divided by *ntree.
 * ypred is accumulated into, not cleared, so the caller supplies the
 * starting values.
 *
 *   lDaughter, rDaughter, nodestatus, xsplit, avnodes, mbest:
 *              per-tree node arrays, stored back to back with a stride
 *              of *nrnodes entries per tree
 *   treeSize:  treeSize[t] is handed to predictRegTree for tree t
 *   keepPred:  if non-zero, allpred (*n by *ntree) receives the
 *              prediction of every individual tree
 *   doProx:    if non-zero, proxMat (*n by *n) is accumulated through
 *              computeProximity, divided by *ntree, mirrored across the
 *              diagonal, and its diagonal set to 1
 *   nodes:     if non-zero, nodex holds one block of *n entries per tree;
 *              otherwise a single block of *n entries is reused
 */
void regForest(double *x, double *ypred, int *mdim, int *n, int *ntree,
               int *lDaughter, int *rDaughter, SMALL_INT *nodestatus,
               int *nrnodes, double *xsplit, double *avnodes, int *mbest,
               int *treeSize, int *cat, int maxcat, int *keepPred,
               double *allpred, int doProx, double *proxMat, int *nodes,
               int *nodex) {
    const int nCases = *n;
    const int nTrees = *ntree;
    int *unused = NULL;      /* placeholder for the two pointer arguments of computeProximity */
    int treeOffset = 0;      /* start of the current tree inside the node arrays */
    int nodexOffset = 0;     /* start of the current tree inside nodex */
    int t, r, c;
    double *treePred = (double *) calloc(nCases, sizeof(double));

    /* Clear the output buffers that are filled below. */
    zeroInt(nodex, *nodes ? nCases * nTrees : nCases);
    if (doProx) {
        zeroDouble(proxMat, nCases * nCases);
    }
    if (*keepPred) {
        zeroDouble(allpred, nCases * nTrees);
    }

    for (t = 0; t < nTrees; ++t) {
        zeroDouble(treePred, nCases);
        predictRegTree(x, nCases, *mdim,
                       lDaughter + treeOffset, rDaughter + treeOffset,
                       nodestatus + treeOffset, treePred,
                       xsplit + treeOffset, avnodes + treeOffset,
                       mbest + treeOffset, treeSize[t], cat, maxcat,
                       nodex + nodexOffset);

        /* Add this tree's predictions to the running sum. */
        for (r = 0; r < nCases; ++r) {
            ypred[r] += treePred[r];
        }

        if (*keepPred) {
            for (r = 0; r < nCases; ++r) {
                allpred[r + t * nCases] = treePred[r];
            }
        }

        /* if desired, do proximities for this round */
        if (doProx) {
            computeProximity(proxMat, 0, nodex + nodexOffset, unused, unused,
                             nCases);
        }

        treeOffset += *nrnodes;
        if (*nodes) {
            nodexOffset += nCases;
        }
    }

    /* Turn the sum over trees into an average. */
    for (r = 0; r < nCases; ++r) {
        ypred[r] /= nTrees;
    }

    if (doProx) {
        for (r = 0; r < nCases; ++r) {
            for (c = r + 1; c < nCases; ++c) {
                proxMat[r + c * nCases] /= nTrees;
                proxMat[c + r * nCases] = proxMat[r + c * nCases];
            }
            proxMat[r + r * nCases] = 1.0;
        }
    }

    free(treePred);
}
/*
 * Modified by A. Liaw 1/10/2003 (Deal with cutoff)
 * Re-written in C by A. Liaw 3/08/2004
 */
/*
 * oob: out-of-bag class prediction and error rates.
 *
 *   nsample, nclass: number of cases and of classes
 *   jin, jtr:        not used by this routine (kept for the interface)
 *   cl:              class label of each case, 1-based
 *   jerr:            output; set to 1 for cases whose prediction differs
 *                    from cl, 0 otherwise
 *   counttr:         votes, counttr[j + n*nclass] for class j+1, case n
 *   out:             out[n] is the divisor applied to the votes of case n;
 *                    cases with out[n] == 0 are skipped
 *   errtr:           output of length nclass+1; errtr[0] is the overall
 *                    error rate, errtr[c] the rate for class c
 *   jest:            output; predicted class (1-based) of each processed
 *                    case
 *   cutoff:          per-class divisor applied to the vote fraction
 *
 * The predicted class maximises (votes / out[n]) / cutoff[j].  Exact ties
 * are broken uniformly at random: the k-th class that reaches the current
 * maximum replaces the incumbent with probability 1/k.  This is the same
 * scheme TestSetError uses.
 *
 * If no case is processed, or a class has no processed case, the
 * corresponding rate is the result of a division by zero.
 */
void oob(int nsample, int nclass, int *jin, int *cl, int *jtr, int *jerr,
         int *counttr, int *out, double *errtr, int *jest, double *cutoff) {
    int j, n, noob, *noobcl, ntie;
    double qq, smaxtr;

    noobcl = (int *) S_alloc_alt(nclass, sizeof(int));
    /* Do not depend on the allocator clearing the per-class counters. */
    zeroInt(noobcl, nclass);
    zeroInt(jerr, nsample);
    zeroDouble(errtr, nclass + 1);

    noob = 0;
    for (n = 0; n < nsample; ++n) {
        if (out[n]) {
            noob++;
            noobcl[cl[n] - 1]++;
            smaxtr = 0.0;
            ntie = 1;
            for (j = 0; j < nclass; ++j) {
                qq = (((double) counttr[j + n * nclass]) / out[n]) / cutoff[j];
                /* if vote / cutoff is larger than current max, re-set max
                 * and change predicted class to the current class */
                if (qq > smaxtr) {
                    smaxtr = qq;
                    jest[n] = j + 1;
                    ntie = 1;   /* a new maximum starts a new group of ties */
                }
                /* break tie at random */
                if (qq == smaxtr) {
                    if (unif_rand() < 1.0 / ntie) {
                        jest[n] = j + 1;
                    }
                    ntie++;
                }
            }
            if (jest[n] != cl[n]) {
                errtr[cl[n]] += 1.0;
                errtr[0] += 1.0;
                jerr[n] = 1;
            }
        }
    }

    errtr[0] /= noob;
    for (n = 1; n <= nclass; ++n) {
        errtr[n] /= noobcl[n - 1];
    }
    free(noobcl);
}
/*
 * TestSetError: add one tree's votes for the test set, refresh the
 * aggregated test-set prediction and, when labels exist, the error rates.
 *
 *   countts: votes, countts[j + n*nclass] for class j+1, case n; the
 *            class in jts[n] gets one more vote here
 *   jts:     class (1-based) assigned to each test case by the current
 *            tree
 *   clts:    true class of each test case (read only when labelts != 0)
 *   jet:     output; aggregated predicted class (1-based)
 *   nvote:   divisor applied to the votes
 *   errts:   output of length nclass+1 (written only when labelts != 0);
 *            errts[0] is the overall rate, errts[c] the rate for class c
 *   nclts:   per-class divisor for the class-wise rates
 *   cutoff:  per-class divisor applied to the vote fraction
 *
 * Exact ties in (votes / nvote) / cutoff are broken at random: the k-th
 * class reaching the current maximum replaces the incumbent with
 * probability 1/k.
 */
void TestSetError(double *countts, int *jts, int *clts, int *jet, int ntest,
                  int nclass, int nvote, double *errts, int labelts,
                  int *nclts, double *cutoff) {
    int cls, obs, tied;
    double best, score;

    /* Register the vote of the current tree for every test case. */
    for (obs = 0; obs < ntest; ++obs) {
        countts[jts[obs] - 1 + obs * nclass] += 1.0;
    }

    /* Prediction is the class with the maximum votes */
    for (obs = 0; obs < ntest; ++obs) {
        best = 0.0;
        tied = 1;
        for (cls = 0; cls < nclass; ++cls) {
            score = (countts[cls + obs * nclass] / nvote) / cutoff[cls];
            if (score > best) {
                jet[obs] = cls + 1;
                best = score;
                tied = 1;
            }
            /* Break ties at random: */
            if (score == best) {
                if (unif_rand() < 1.0 / tied) {
                    jet[obs] = cls + 1;
                    best = score;
                }
                tied++;
            }
        }
    }

    /* Without labels there is nothing to score. */
    if (!labelts) {
        return;
    }

    zeroDouble(errts, nclass + 1);
    for (obs = 0; obs < ntest; ++obs) {
        if (jet[obs] != clts[obs]) {
            errts[0] += 1.0;
            errts[clts[obs]] += 1.0;
        }
    }
    errts[0] /= ntest;
    for (cls = 1; cls <= nclass; ++cls) {
        errts[cls] /= nclts[cls - 1];
    }
}
/*
 * classForest: predict with a stored classification forest.
 *
 * Each of the *ntree trees votes for every one of the *ntest cases in x;
 * the votes go into countts (*nclass by *ntest, zeroed here) and the
 * aggregated prediction jet[n] is the class (1-based) that maximises
 * (votes / *ntree) / cutoff[j].
 *
 *   treemap, nodestatus, xbestsplit, bestvar, nodeclass:
 *             per-tree node arrays with a stride of *nrnodes entries per
 *             tree (2 * *nrnodes for treemap)
 *   treeSize: treeSize[j] is handed to predictClassTree for tree j
 *   pid:      not used by this routine
 *   jts:      per-tree predictions; one block of *ntest entries per tree
 *             when *keepPred is non-zero, otherwise one block is reused
 *   node:     per-tree node output of predictClassTree; one block per
 *             tree when *nodes is non-zero, otherwise one block is reused
 *   prox:     if non-zero, proxMat (*ntest by *ntest) is accumulated via
 *             computeProximity, divided by *ntree, mirrored across the
 *             diagonal, and its diagonal set to 1
 *
 * Exact ties are broken uniformly at random (the k-th class reaching the
 * current maximum replaces the incumbent with probability 1/k), matching
 * TestSetError.
 */
void classForest(int *mdim, int *ntest, int *nclass, int *maxcat,
                 int *nrnodes, int *ntree, double *x, double *xbestsplit,
                 double *pid, double *cutoff, double *countts, int *treemap,
                 int *nodestatus, int *cat, int *nodeclass, int *jts,
                 int *jet, int *bestvar, int *node, int *treeSize,
                 int *keepPred, int *prox, double *proxMat, int *nodes) {
    int j, n, n1, n2, idxNodes, offset1, offset2, *junk, ntie;
    double crit, cmax;

    zeroDouble(countts, *nclass * *ntest);
    idxNodes = 0;
    offset1 = 0;
    offset2 = 0;
    junk = NULL;

    for (j = 0; j < *ntree; ++j) {
        /* predict by the j-th tree */
        /* NOTE(review): this call passes nrnodes as the 4th argument while
         * the predictClassTree calls in classRF do not -- confirm against
         * the prototype. */
        predictClassTree(x, *ntest, *mdim, nrnodes, treemap + 2*idxNodes,
                         nodestatus + idxNodes, xbestsplit + idxNodes,
                         bestvar + idxNodes, nodeclass + idxNodes,
                         treeSize[j], cat, *nclass,
                         jts + offset1, node + offset2, *maxcat);

        /* accumulate votes: */
        for (n = 0; n < *ntest; ++n) {
            countts[jts[n + offset1] - 1 + n * *nclass] += 1.0;
        }

        /* if desired, do proximities for this round */
        if (*prox) {
            computeProximity(proxMat, 0, node + offset2, junk, junk, *ntest);
        }

        idxNodes += *nrnodes;
        if (*keepPred) offset1 += *ntest;
        if (*nodes) offset2 += *ntest;
    }

    /* Aggregated prediction is the class with the maximum votes/cutoff */
    for (n = 0; n < *ntest; ++n) {
        cmax = 0.0;
        ntie = 1;
        for (j = 0; j < *nclass; ++j) {
            crit = (countts[j + n * *nclass] / *ntree) / cutoff[j];
            if (crit > cmax) {
                jet[n] = j + 1;
                cmax = crit;
                ntie = 1;   /* a new maximum starts a new group of ties */
            }
            /* Break ties at random: */
            if (crit == cmax) {
                if (unif_rand() < 1.0 / ntie) jet[n] = j + 1;
                ntie++;
            }
        }
    }

    /* if proximities requested, do the final adjustment
     * (division by number of trees) */
    if (*prox) {
        for (n1 = 0; n1 < *ntest; ++n1) {
            for (n2 = n1 + 1; n2 < *ntest; ++n2) {
                proxMat[n1 + n2 * *ntest] /= *ntree;
                proxMat[n2 + n1 * *ntest] = proxMat[n1 + n2 * *ntest];
            }
            proxMat[n1 + n1 * *ntest] = 1.0;
        }
    }
}
void classRF(double *x, int *dimx, int *cl, int *ncl, int *cat, int *maxcat, int *sampsize, int *strata, int *Options, int *ntree, int *nvar, int *ipi, double *classwt, double *cut, int *nodesize, int *outcl, int *counttr, double *prox, double *imprt, double *impsd, double *impmat, int *nrnodes, int *ndbigtree, int *nodestatus, int *bestvar, int *treemap, int *nodeclass, double *xbestsplit, double *errtr, int *testdat, double *xts, int *clts, int *nts, double *countts, int *outclts, int labelts, double *proxts, double *errts, int *inbag) { /****************************************************************** * C wrapper for random forests: get input from R and drive * the Fortran routines. * * Input: * * x: matrix of predictors (transposed!) * dimx: two integers: number of variables and number of cases * cl: class labels of the data * ncl: number of classes in the responsema * cat: integer vector of number of classes in the predictor; * 1=continuous * maxcat: maximum of cat * Options: 7 integers: (0=no, 1=yes) * add a second class (for unsupervised RF)? * 1: sampling from product of marginals * 2: sampling from product of uniforms * assess variable importance? * calculate proximity? * calculate proximity based on OOB predictions? * calculate outlying measure? * how often to print output? * keep the forest for future prediction? * ntree: number of trees * nvar: number of predictors to use for each split * ipi: 0=use class proportion as prob.; 1=use supplied priors * pi: double vector of class priors * nodesize: minimum node size: no node with fewer than ndsize * cases will be split * * Output: * * outcl: class predicted by RF * counttr: matrix of votes (transposed!) 
* imprt: matrix of variable importance measures * impmat: matrix of local variable importance measures * prox: matrix of proximity (if iprox=1) ******************************************************************/ int nsample0, mdim, nclass, addClass, mtry, ntest, nsample, ndsize, mimp, nimp, near, nuse, noutall, nrightall, nrightimpall, keepInbag, nstrata; int jb, j, n, m, k, idxByNnode, idxByNsample, imp, localImp, iprox, oobprox, keepf, replace, stratify, trace, *nright, *nrightimp, *nout, *nclts, Ntree; int *out, *bestsplitnext, *bestsplit, *nodepop, *jin, *nodex, *nodexts, *nodestart, *ta, *ncase, *jerr, *varUsed, *jtr, *classFreq, *idmove, *jvr, *at, *a, *b, *mind, *nind, *jts, *oobpair; int **strata_idx, *strata_size, last, ktmp, anyEmpty, ntry; double av=0.0; double *tgini, *tx, *wl, *classpop, *tclasscat, *tclasspop, *win, *tp, *wr; //Do initialization for COKUS's Random generator seedMT(2*rand()+1); //works well with odd number so why don't use that addClass = Options[0]; imp = Options[1]; localImp = Options[2]; iprox = Options[3]; oobprox = Options[4]; trace = Options[5]; keepf = Options[6]; replace = Options[7]; stratify = Options[8]; keepInbag = Options[9]; mdim = dimx[0]; nsample0 = dimx[1]; nclass = (*ncl==1) ? 2 : *ncl; ndsize = *nodesize; Ntree = *ntree; mtry = *nvar; ntest = *nts; nsample = addClass ? (nsample0 + nsample0) : nsample0; mimp = imp ? mdim : 1; nimp = imp ? nsample : 1; near = iprox ? 
nsample0 : 1; if (trace == 0) trace = Ntree + 1; /*printf("\nmdim %d, nclass %d, nrnodes %d, nsample %d, ntest %d\n", mdim, nclass, *nrnodes, nsample, ntest); printf("\noobprox %d, mdim %d, nsample0 %d, Ntree %d, mtry %d, mimp %d", oobprox, mdim, nsample0, Ntree, mtry, mimp); printf("\nstratify %d, replace %d",stratify,replace); printf("\n");*/ tgini = (double *) S_alloc_alt(mdim, sizeof(double)); wl = (double *) S_alloc_alt(nclass, sizeof(double)); wr = (double *) S_alloc_alt(nclass, sizeof(double)); classpop = (double *) S_alloc_alt(nclass* *nrnodes, sizeof(double)); tclasscat = (double *) S_alloc_alt(nclass*32, sizeof(double)); tclasspop = (double *) S_alloc_alt(nclass, sizeof(double)); tx = (double *) S_alloc_alt(nsample, sizeof(double)); win = (double *) S_alloc_alt(nsample, sizeof(double)); tp = (double *) S_alloc_alt(nsample, sizeof(double)); out = (int *) S_alloc_alt(nsample, sizeof(int)); bestsplitnext = (int *) S_alloc_alt(*nrnodes, sizeof(int)); bestsplit = (int *) S_alloc_alt(*nrnodes, sizeof(int)); nodepop = (int *) S_alloc_alt(*nrnodes, sizeof(int)); nodestart = (int *) S_alloc_alt(*nrnodes, sizeof(int)); jin = (int *) S_alloc_alt(nsample, sizeof(int)); nodex = (int *) S_alloc_alt(nsample, sizeof(int)); nodexts = (int *) S_alloc_alt(ntest, sizeof(int)); ta = (int *) S_alloc_alt(nsample, sizeof(int)); ncase = (int *) S_alloc_alt(nsample, sizeof(int)); jerr = (int *) S_alloc_alt(nsample, sizeof(int)); varUsed = (int *) S_alloc_alt(mdim, sizeof(int)); jtr = (int *) S_alloc_alt(nsample, sizeof(int)); jvr = (int *) S_alloc_alt(nsample, sizeof(int)); classFreq = (int *) S_alloc_alt(nclass, sizeof(int)); jts = (int *) S_alloc_alt(ntest, sizeof(int)); idmove = (int *) S_alloc_alt(nsample, sizeof(int)); at = (int *) S_alloc_alt(mdim*nsample, sizeof(int)); a = (int *) S_alloc_alt(mdim*nsample, sizeof(int)); b = (int *) S_alloc_alt(mdim*nsample, sizeof(int)); mind = (int *) S_alloc_alt(mdim, sizeof(int)); nright = (int *) S_alloc_alt(nclass, sizeof(int)); 
nrightimp = (int *) S_alloc_alt(nclass, sizeof(int)); nout = (int *) S_alloc_alt(nclass, sizeof(int)); if (oobprox) { oobpair = (int *) S_alloc_alt(near*near, sizeof(int)); } //printf("nsample=%d\n", nsample); /* Count number of cases in each class. */ zeroInt(classFreq, nclass); for (n = 0; n < nsample; ++n) classFreq[cl[n] - 1] ++; /* Normalize class weights. */ //Rprintf("ipi %d ",*ipi); //for(n=0;n<nclass;n++) Rprintf("%d: %d, %f,",n,classFreq[n],classwt[n]); normClassWt(cl, nsample, nclass, *ipi, classwt, classFreq); //for(n=0;n<nclass;n++) Rprintf("%d: %d, %f,",n,classFreq[n],classwt[n]); if (stratify) { /* Count number of strata and frequency of each stratum. */ nstrata = 0; for (n = 0; n < nsample0; ++n) if (strata[n] > nstrata) nstrata = strata[n]; /* Create the array of pointers, each pointing to a vector * of indices of where data of each stratum is. */ strata_size = (int *) S_alloc_alt(nstrata, sizeof(int)); for (n = 0; n < nsample0; ++n) { strata_size[strata[n] - 1] ++; } strata_idx = (int **) S_alloc_alt(nstrata, sizeof(int *)); for (n = 0; n < nstrata; ++n) { strata_idx[n] = (int *) S_alloc_alt(strata_size[n], sizeof(int)); } zeroInt(strata_size, nstrata); for (n = 0; n < nsample0; ++n) { strata_size[strata[n] - 1] ++; strata_idx[strata[n] - 1][strata_size[strata[n] - 1] - 1] = n; } } else { nind = replace ? 
NULL : (int *) S_alloc_alt(nsample, sizeof(int)); } /* INITIALIZE FOR RUN */ if (*testdat) zeroDouble(countts, ntest * nclass); zeroInt(counttr, nclass * nsample); zeroInt(out, nsample); zeroDouble(tgini, mdim); zeroDouble(errtr, (nclass + 1) * Ntree); if (labelts) { nclts = (int *) S_alloc_alt(nclass, sizeof(int)); for (n = 0; n < ntest; ++n) nclts[clts[n]-1]++; zeroDouble(errts, (nclass + 1) * Ntree); } //printf("labelts %d\n",labelts);fflush(stdout); if (imp) { zeroDouble(imprt, (nclass+2) * mdim); zeroDouble(impsd, (nclass+1) * mdim); if (localImp) zeroDouble(impmat, nsample * mdim); } if (iprox) { zeroDouble(prox, nsample0 * nsample0); if (*testdat) zeroDouble(proxts, ntest * (ntest + nsample0)); } makeA(x, mdim, nsample, cat, at, b); //R_CheckUserInterrupt(); /* Starting the main loop over number of trees. */ GetRNGstate(); if (trace <= Ntree) { /* Print header for running output. */ Rprintf("ntree OOB"); for (n = 1; n <= nclass; ++n) Rprintf("%7i", n); if (labelts) { Rprintf("| Test"); for (n = 1; n <= nclass; ++n) Rprintf("%7i", n); } Rprintf("\n"); } idxByNnode = 0; idxByNsample = 0; //Rprintf("addclass %d, ntree %d, cl[300]=%d", addClass,Ntree,cl[299]); for(jb = 0; jb < Ntree; jb++) { //Rprintf("addclass %d, ntree %d, cl[300]=%d", addClass,Ntree,cl[299]); //printf("jb=%d,\n",jb); /* Do we need to simulate data for the second class? */ if (addClass) createClass(x, nsample0, nsample, mdim); do { zeroInt(nodestatus + idxByNnode, *nrnodes); zeroInt(treemap + 2*idxByNnode, 2 * *nrnodes); zeroDouble(xbestsplit + idxByNnode, *nrnodes); zeroInt(nodeclass + idxByNnode, *nrnodes); zeroInt(varUsed, mdim); /* TODO: Put all sampling code into a function. 
*/ /* drawSample(sampsize, nsample, ); */ if (stratify) { /* stratified sampling */ zeroInt(jin, nsample); zeroDouble(tclasspop, nclass); zeroDouble(win, nsample); if (replace) { /* with replacement */ for (n = 0; n < nstrata; ++n) { for (j = 0; j < sampsize[n]; ++j) { ktmp = (int) (unif_rand() * strata_size[n]); k = strata_idx[n][ktmp]; tclasspop[cl[k] - 1] += classwt[cl[k] - 1]; win[k] += classwt[cl[k] - 1]; jin[k] = 1; } } } else { /* stratified sampling w/o replacement */ /* re-initialize the index array */ zeroInt(strata_size, nstrata); for (j = 0; j < nsample; ++j) { strata_size[strata[j] - 1] ++; strata_idx[strata[j] - 1][strata_size[strata[j] - 1] - 1] = j; } /* sampling without replacement */ for (n = 0; n < nstrata; ++n) { last = strata_size[n] - 1; for (j = 0; j < sampsize[n]; ++j) { ktmp = (int) (unif_rand() * (last+1)); k = strata_idx[n][ktmp]; swapInt(strata_idx[n][last], strata_idx[n][ktmp]); last--; tclasspop[cl[k] - 1] += classwt[cl[k]-1]; win[k] += classwt[cl[k]-1]; jin[k] = 1; } } } } else { /* unstratified sampling */ anyEmpty = 0; ntry = 0; do { zeroInt(jin, nsample); zeroDouble(tclasspop, nclass); zeroDouble(win, nsample); if (replace) { for (n = 0; n < *sampsize; ++n) { k = unif_rand() * nsample; tclasspop[cl[k] - 1] += classwt[cl[k]-1]; win[k] += classwt[cl[k]-1]; jin[k] = 1; } } else { for (n = 0; n < nsample; ++n) nind[n] = n; last = nsample - 1; for (n = 0; n < *sampsize; ++n) { ktmp = (int) (unif_rand() * (last+1)); k = nind[ktmp]; swapInt(nind[ktmp], nind[last]); last--; tclasspop[cl[k] - 1] += classwt[cl[k]-1]; win[k] += classwt[cl[k]-1]; jin[k] = 1; } } /* check if any class is missing in the sample */ for (n = 0; n < nclass; ++n) { if (tclasspop[n] == 0) anyEmpty = 1; } ntry++; } while (anyEmpty && ntry <= 10); } /* If need to keep indices of inbag data, do that here. */ if (keepInbag) { for (n = 0; n < nsample0; ++n) { inbag[n + idxByNsample] = jin[n]; } } /* Copy the original a matrix back. 
*/ memcpy(a, at, sizeof(int) * mdim * nsample); modA(a, &nuse, nsample, mdim, cat, *maxcat, ncase, jin); #ifdef WIN64 F77_CALL(_buildtree) #endif #ifndef WIN64 F77_CALL(buildtree) #endif (a, b, cl, cat, maxcat, &mdim, &nsample, &nclass, treemap + 2*idxByNnode, bestvar + idxByNnode, bestsplit, bestsplitnext, tgini, nodestatus + idxByNnode, nodepop, nodestart, classpop, tclasspop, tclasscat, ta, nrnodes, idmove, &ndsize, ncase, &mtry, varUsed, nodeclass + idxByNnode, ndbigtree + jb, win, wr, wl, &mdim, &nuse, mind); /* if the "tree" has only the root node, start over */ } while (ndbigtree[jb] == 1); Xtranslate(x, mdim, *nrnodes, nsample, bestvar + idxByNnode, bestsplit, bestsplitnext, xbestsplit + idxByNnode, nodestatus + idxByNnode, cat, ndbigtree[jb]); /* Get test set error */ if (*testdat) { predictClassTree(xts, ntest, mdim, treemap + 2*idxByNnode, nodestatus + idxByNnode, xbestsplit + idxByNnode, bestvar + idxByNnode, nodeclass + idxByNnode, ndbigtree[jb], cat, nclass, jts, nodexts, *maxcat); TestSetError(countts, jts, clts, outclts, ntest, nclass, jb+1, errts + jb*(nclass+1), labelts, nclts, cut); } /* Get out-of-bag predictions and errors. */ predictClassTree(x, nsample, mdim, treemap + 2*idxByNnode, nodestatus + idxByNnode, xbestsplit + idxByNnode, bestvar + idxByNnode, nodeclass + idxByNnode, ndbigtree[jb], cat, nclass, jtr, nodex, *maxcat); zeroInt(nout, nclass); noutall = 0; for (n = 0; n < nsample; ++n) { if (jin[n] == 0) { /* increment the OOB votes */ counttr[n*nclass + jtr[n] - 1] ++; /* count number of times a case is OOB */ out[n]++; /* count number of OOB cases in the current iteration. * nout[n] is the number of OOB cases for the n-th class. * noutall is the number of OOB cases overall. */ nout[cl[n] - 1]++; noutall++; } } /* Compute out-of-bag error rate. 
*/ oob(nsample, nclass, jin, cl, jtr, jerr, counttr, out, errtr + jb*(nclass+1), outcl, cut); if ((jb+1) % trace == 0) { Rprintf("%5i: %6.2f%%", jb+1, 100.0*errtr[jb * (nclass+1)]); for (n = 1; n <= nclass; ++n) { Rprintf("%6.2f%%", 100.0 * errtr[n + jb * (nclass+1)]); } if (labelts) { Rprintf("| "); for (n = 0; n <= nclass; ++n) { Rprintf("%6.2f%%", 100.0 * errts[n + jb * (nclass+1)]); } } Rprintf("\n"); //R_CheckUserInterrupt(); } /* DO VARIABLE IMPORTANCE */ if (imp) { nrightall = 0; /* Count the number of correct prediction by the current tree * among the OOB samples, by class. */ zeroInt(nright, nclass); for (n = 0; n < nsample; ++n) { /* out-of-bag and predicted correctly: */ if (jin[n] == 0 && jtr[n] == cl[n]) { nright[cl[n] - 1]++; nrightall++; } } for (m = 0; m < mdim; ++m) { if (varUsed[m]) { nrightimpall = 0; zeroInt(nrightimp, nclass); for (n = 0; n < nsample; ++n) tx[n] = x[m + n*mdim]; /* Permute the m-th variable. */ permuteOOB(m, x, jin, nsample, mdim); /* Predict the modified data using the current tree. */ predictClassTree(x, nsample, mdim, treemap + 2*idxByNnode, nodestatus + idxByNnode, xbestsplit + idxByNnode, bestvar + idxByNnode, nodeclass + idxByNnode, ndbigtree[jb], cat, nclass, jvr, nodex, *maxcat); /* Count how often correct predictions are made with * the modified data. */ for (n = 0; n < nsample; n++) { if (jin[n] == 0) { if (jvr[n] == cl[n]) { nrightimp[cl[n] - 1]++; nrightimpall++; } if (localImp && jvr[n] != jtr[n]) { if (cl[n] == jvr[n]) { impmat[m + n*mdim] -= 1.0; } else { impmat[m + n*mdim] += 1.0; } } } /* Restore the original data for that variable. */ x[m + n*mdim] = tx[n]; } /* Accumulate decrease in proportions of correct * predictions. 
*/ for (n = 0; n < nclass; ++n) { if (nout[n] > 0) { imprt[m + n*mdim] += ((double) (nright[n] - nrightimp[n])) / nout[n]; impsd[m + n*mdim] += ((double) (nright[n] - nrightimp[n]) * (nright[n] - nrightimp[n])) / nout[n]; } } if (noutall > 0) { imprt[m + nclass*mdim] += ((double)(nrightall - nrightimpall)) / noutall; impsd[m + nclass*mdim] += ((double) (nrightall - nrightimpall) * (nrightall - nrightimpall)) / noutall; } } } } /* DO PROXIMITIES */ if (iprox) { computeProximity(prox, oobprox, nodex, jin, oobpair, near); /* proximity for test data */ if (*testdat) { computeProximity(proxts, 0, nodexts, jin, oobpair, ntest); /* Compute proximity between testset and training set. */ for (n = 0; n < ntest; ++n) { for (k = 0; k < near; ++k) { if (nodexts[n] == nodex[k]) proxts[n + ntest * (k+ntest)] += 1.0; } } } } if (keepf) idxByNnode += *nrnodes; if (keepInbag) idxByNsample += nsample0; } PutRNGstate(); /* Final processing of variable importance. */ for (m = 0; m < mdim; m++) tgini[m] /= Ntree; if (imp) { for (m = 0; m < mdim; ++m) { if (localImp) { /* casewise measures */ for (n = 0; n < nsample; ++n) impmat[m + n*mdim] /= out[n]; } /* class-specific measures */ for (k = 0; k < nclass; ++k) { av = imprt[m + k*mdim] / Ntree; impsd[m + k*mdim] = sqrt(((impsd[m + k*mdim] / Ntree) - av*av) / Ntree); imprt[m + k*mdim] = av; /* imprt[m + k*mdim] = (se <= 0.0) ? -1000.0 - av : av / se; */ } /* overall measures */ av = imprt[m + nclass*mdim] / Ntree; impsd[m + nclass*mdim] = sqrt(((impsd[m + nclass*mdim] / Ntree) - av*av) / Ntree); imprt[m + nclass*mdim] = av; imprt[m + (nclass+1)*mdim] = tgini[m]; } } else { for (m = 0; m < mdim; ++m) imprt[m] = tgini[m]; } /* PROXIMITY DATA ++++++++++++++++++++++++++++++++*/ if (iprox) { for (n = 0; n < near; ++n) { for (k = n + 1; k < near; ++k) { prox[near*k + n] /= oobprox ? (oobpair[near*k + n] > 0 ? 
oobpair[near*k + n] : 1) : Ntree; prox[near*n + k] = prox[near*k + n]; } prox[near*n + n] = 1.0; } if (*testdat) { for (n = 0; n < ntest; ++n) for (k = 0; k < ntest + nsample; ++k) proxts[ntest*k + n] /= Ntree; } } if (trace <= Ntree){ printf("\nmdim %d, nclass %d, nrnodes %d, nsample %d, ntest %d\n", mdim, nclass, *nrnodes, nsample, ntest); printf("\noobprox %d, mdim %d, nsample0 %d, Ntree %d, mtry %d, mimp %d", oobprox, mdim, nsample0, Ntree, mtry, mimp); printf("\nstratify %d, replace %d",stratify,replace); printf("\n"); } //frees up the memory free(tgini);free(wl);free(wr);free(classpop);free(tclasscat); free(tclasspop);free(tx);free(win);free(tp);free(out); free(bestsplitnext);free(bestsplit);free(nodepop);free(nodestart);free(jin); free(nodex);free(nodexts);free(ta);free(ncase);free(jerr); free(varUsed);free(jtr);free(jvr);free(classFreq);free(jts); free(idmove);free(at);free(a);free(b);free(mind); free(nright);free(nrightimp);free(nout); if (oobprox) { free(oobpair); } if (stratify) { free(strata_size); for (n = 0; n < nstrata; ++n) { free(strata_idx[n]); } free(strata_idx); } else { if (replace) free(nind); } //printf("labelts %d\n",labelts);fflush(stdout); if (labelts) { free(nclts); } //printf("stratify %d",stratify);fflush(stdout); }
void regRF(double *x, double *y, int *xdim, int *sampsize,
           int *nthsize, int *nrnodes, int *nTree, int *mtry, int *imp,
           int *cat, int *maxcat, int *jprint, int *doProx, int *oobprox,
           int *biasCorr, double *yptr, double *errimp, double *impmat,
           double *impSD, double *prox, int *treeSize, int *nodestatus,
           int *lDaughter, int *rDaughter, double *avnode, int *mbest,
           double *upper, double *mse, int *keepf, int *replace,
           int *testdat, double *xts, int *nts, double *yts, int *labelts,
           double *yTestPred, double *proxts, double *msets, double *coef,
           int *nout, int *inbag) {
    /*************************************************************************
     Grow a regression forest and collect OOB / test-set statistics.

     Input:
     xdim: xdim[0]=nsample=number of cases, xdim[1]=mdim=number of
       variables in data set
     sampsize=number of cases drawn for each tree
     nthsize=number of cases in a node below which the tree will not split,
       setting nthsize=5 generally gives good results.
     nTree=number of trees in run.  200-500 gives pretty good results
     mtry=number of variables to pick to split on at each node.  mdim/3
       seems to give generally good performance, but it can be altered up
       or down
     imp: imp[0]=1 turns on variable importance.  This is computed for the
       mth variable as the percent rise in the test set mean sum-of-
       squared errors when the mth variable is randomly permuted.
       imp[1]=1 additionally accumulates casewise importance in impmat;
       imp[2] is the number of permutations per variable.
     jprint=print running output every jprint trees; 0 means never (the
       value is overwritten with nTree+1 in that case)
     keepf: keepf[0]=1 stores every tree at its own offset in the node
       arrays (otherwise the slot is reused); keepf[1]=1 stores the in-bag
       indicators of every tree in inbag
     replace=1 samples with replacement, otherwise without
     testdat / xts / nts / yts / labelts: test-set flag, predictors, size,
       responses, and "responses available" flag

     Output:
     yptr=aggregated OOB prediction; nout=number of times each case was OOB
     mse / msets=per-tree OOB and test-set error
     errimp, impSD, impmat=importance measures
     prox, proxts=proximities
     yTestPred=aggregated test-set prediction
     coef=bias-correction coefficients (when biasCorr is set)
    *************************************************************************/

    double errts = 0.0, meanY, meanYts, varY, varYts, r, xrand,
        errb = 0.0, resid = 0.0, ooberr, ooberrperm, delta, *resOOB;

    double *yb, *xtmp, *xb, *ytr, *ytree, *tgini, *coeffs;

    int k, m, mr, n, nOOB, j, jout, idx, ntest, last, ktmp, nPerm,
        nsample, mdim, keepF, keepInbag;
    int *oobpair, varImp, localImp, *varUsed;

    int *in, *nind, *nodex, *nodexts, *probs;

    nsample = xdim[0];
    mdim = xdim[1];
    ntest = *nts;
    varImp = imp[0];
    localImp = imp[1];
    nPerm = imp[2];
    keepF = keepf[0];
    keepInbag = keepf[1];

    /* jprint == 0 means "never print": push the interval past the last tree */
    if (*jprint == 0) *jprint = *nTree + 1;

    yb         = (double *) S_alloc(*sampsize, sizeof(double));
    xb         = (double *) S_alloc(mdim * *sampsize, sizeof(double));
    ytr        = (double *) S_alloc(nsample, sizeof(double));
    xtmp       = (double *) S_alloc(nsample, sizeof(double));
    resOOB     = (double *) S_alloc(nsample, sizeof(double));
    coeffs     = (double *) S_alloc(*sampsize, sizeof(double));
    probs      = (int *) S_alloc(*sampsize, sizeof(int));
    in        = (int *) S_alloc(nsample, sizeof(int));
    nodex      = (int *) S_alloc(nsample, sizeof(int));
    varUsed    = (int *) S_alloc(mdim, sizeof(int));
    nind = *replace ? NULL : (int *) S_alloc(nsample, sizeof(int));

    /* Test-set scratch buffers exist only when test data is supplied. */
    ytree = NULL;
    nodexts = NULL;
    if (*testdat) {
        ytree      = (double *) S_alloc(ntest, sizeof(double));
        nodexts    = (int *) S_alloc(ntest, sizeof(int));
    }
    oobpair = (*doProx && *oobprox) ?
        (int *) S_alloc(nsample * nsample, sizeof(int)) : NULL;

    /* If variable importance is requested, tgini points to the second
       "column" of errimp, otherwise it's just the same as errimp. */
    tgini = varImp ? errimp + mdim : errimp;

    meanY = 0.0;
    varY = 0.0;

    zeroDouble(yptr, nsample);
    zeroInt(nout, nsample);
    /* One-pass running mean and variance of the response. */
    for (n = 0; n < nsample; ++n) {
        varY += n * (y[n] - meanY)*(y[n] - meanY) / (n + 1);
        meanY = (n * meanY + y[n]) / (n + 1);
    }
    varY /= nsample;

    varYts = 0.0;
    meanYts = 0.0;
    if (*testdat) {
        for (n = 0; n < ntest; ++n) {
            varYts += n * (yts[n] - meanYts)*(yts[n] - meanYts) / (n + 1);
            meanYts = (n * meanYts + yts[n]) / (n + 1);
        }
        varYts /= ntest;
    }

    if (*doProx) {
        zeroDouble(prox, nsample * nsample);
        if (*testdat) zeroDouble(proxts, ntest * (nsample + ntest));
    }

    if (varImp) {
        zeroDouble(errimp, mdim * 2);
        /* impSD is accumulated with += below, so it must start at zero. */
        zeroDouble(impSD, mdim);
        if (localImp) zeroDouble(impmat, nsample * mdim);
    } else {
        zeroDouble(errimp, mdim);
    }
    /* yTestPred is a running average that is updated whenever test data
     * is present, with or without labels, so clear it in both cases. */
    if (*testdat || *labelts) zeroDouble(yTestPred, ntest);

    /* print header for running output */
    if (*jprint <= *nTree) {
        Rprintf(" | Out-of-bag ");
        if (*testdat) Rprintf("| Test set ");
        Rprintf("|\n");
        Rprintf("Tree | MSE %%Var(y) ");
        if (*testdat) Rprintf("| MSE %%Var(y) ");
        Rprintf("|\n");
    }
    GetRNGstate();
    /*************************************
     * Start the loop over trees.
     *************************************/
    for (j = 0; j < *nTree; ++j) {
        /* multinomial */
        /* NOTE(review): probs is an int array and 1/(*sampsize) is an
         * integer division, so every entry is 0 unless *sampsize == 1.
         * The remarks previously left here ("unsigned int coeffs",
         * "double probs") suggest ran_multinomial expects double
         * probabilities and integer counts -- confirm against its
         * prototype and against the type regTree expects for coeffs. */
        for (k = 0; k < *sampsize; ++k) {
            probs[k] = 1/(*sampsize);
        }
        ran_multinomial(*sampsize,100,probs,coeffs);

        /* Without keepF every tree reuses the first slot. */
        idx = keepF ? j * *nrnodes : 0;
        zeroInt(in, nsample);
        zeroInt(varUsed, mdim);
        /* Draw a random sample for growing a tree. */
        if (*replace) { /* sampling with replacement */
            for (n = 0; n < *sampsize; ++n) {
                xrand = unif_rand();
                k = xrand * nsample;
                in[k] = 1;
                yb[n] = y[k];
                for (m = 0; m < mdim; ++m) {
                    xb[m + n * mdim] = x[m + k * mdim];
                }
            }
        } else { /* sampling w/o replacement */
            for (n = 0; n < nsample; ++n) nind[n] = n;
            last = nsample - 1;
            for (n = 0; n < *sampsize; ++n) {
                ktmp = (int) (unif_rand() * (last+1));
                k = nind[ktmp];
                swapInt(nind[ktmp], nind[last]);
                last--;
                in[k] = 1;
                yb[n] = y[k];
                for (m = 0; m < mdim; ++m) {
                    xb[m + n * mdim] = x[m + k * mdim];
                }
            }
        }
        if (keepInbag) {
            for (n = 0; n < nsample; ++n) inbag[n + j * nsample] = in[n];
        }
        /* grow the regression tree */
        regTree(xb, yb, mdim, *sampsize, lDaughter + idx, rDaughter + idx,
                upper + idx, avnode + idx, nodestatus + idx, *nrnodes,
                treeSize + j, *nthsize, *mtry, mbest + idx, cat, tgini,
                varUsed, coeffs);
        /* predict the OOB data with the current tree */
        /* ytr is the prediction on OOB data by the current tree */
        predictRegTree(x, nsample, mdim, lDaughter + idx,
                       rDaughter + idx, nodestatus + idx, ytr, upper + idx,
                       avnode + idx, mbest + idx, treeSize[j], cat, *maxcat,
                       nodex);
        /* yptr is the aggregated prediction by all trees grown so far */
        errb = 0.0;
        ooberr = 0.0;
        jout = 0; /* jout is the number of cases that has been OOB so far */
        nOOB = 0; /* nOOB is the number of OOB samples for this tree */
        for (n = 0; n < nsample; ++n) {
            if (in[n] == 0) {
                nout[n]++;
                nOOB++;
                yptr[n] = ((nout[n]-1) * yptr[n] + ytr[n]) / nout[n];
                resOOB[n] = ytr[n] - y[n];
                ooberr += resOOB[n] * resOOB[n];
            }
            if (nout[n]) {
                jout++;
                errb += (y[n] - yptr[n]) * (y[n] - yptr[n]);
            }
        }
        errb /= jout;
        /* Do simple linear regression of y on yhat for bias correction. */
        if (*biasCorr) simpleLinReg(nsample, yptr, y, coef, &errb, nout);

        /* predict testset data with the current tree */
        if (*testdat) {
            predictRegTree(xts, ntest, mdim, lDaughter + idx,
                           rDaughter + idx, nodestatus + idx, ytree,
                           upper + idx, avnode + idx,
                           mbest + idx, treeSize[j], cat, *maxcat, nodexts);
            /* ytree is the prediction for test data by the current tree */
            /* yTestPred is the average prediction by all trees grown so far */
            errts = 0.0;
            for (n = 0; n < ntest; ++n) {
                yTestPred[n] = (j * yTestPred[n] + ytree[n]) / (j + 1);
            }
            /* compute testset MSE */
            if (*labelts) {
                for (n = 0; n < ntest; ++n) {
                    resid = *biasCorr ?
                        yts[n] - (coef[0] + coef[1]*yTestPred[n]) :
                        yts[n] - yTestPred[n];
                    errts += resid * resid;
                }
                errts /= ntest;
            }
        }
        /* Print running output. */
        if ((j + 1) % *jprint == 0) {
            Rprintf("%4d |", j + 1);
            Rprintf(" %8.4g %8.2f ", errb, 100 * errb / varY);
            if (*labelts == 1) Rprintf("| %8.4g %8.2f ",
                                       errts, 100.0 * errts / varYts);
            Rprintf("|\n");
        }
        mse[j] = errb;
        if (*labelts) msets[j] = errts;

        /*  DO PROXIMITIES */
        if (*doProx) {
            computeProximity(prox, *oobprox, nodex, in, oobpair, nsample);
            /* proximity for test data */
            if (*testdat) {
                /* In the next call, in and oobpair are not used. */
                computeProximity(proxts, 0, nodexts, in, oobpair, ntest);
                for (n = 0; n < ntest; ++n) {
                    for (k = 0; k < nsample; ++k) {
                        if (nodexts[n] == nodex[k]) {
                            proxts[n + ntest * (k+ntest)] += 1.0;
                        }
                    }
                }
            }
        }

        /* Variable importance */
        if (varImp) {
            for (mr = 0; mr < mdim; ++mr) {
                if (varUsed[mr]) { /* Go ahead if the variable is used */
                    /* make a copy of the m-th variable into xtmp */
                    for (n = 0; n < nsample; ++n)
                        xtmp[n] = x[mr + n * mdim];
                    ooberrperm = 0.0;
                    for (k = 0; k < nPerm; ++k) {
                        permuteOOB(mr, x, in, nsample, mdim);
                        predictRegTree(x, nsample, mdim, lDaughter + idx,
                                       rDaughter + idx, nodestatus + idx, ytr,
                                       upper + idx, avnode + idx, mbest + idx,
                                       treeSize[j], cat, *maxcat, nodex);
                        for (n = 0; n < nsample; ++n) {
                            if (in[n] == 0) {
                                r = ytr[n] - y[n];
                                ooberrperm += r * r;
                                if (localImp) {
                                    impmat[mr + n * mdim] +=
                                        (r*r - resOOB[n]*resOOB[n]) / nPerm;
                                }
                            }
                        }
                    }
                    delta = (ooberrperm / nPerm - ooberr) / nOOB;
                    errimp[mr] += delta;
                    impSD[mr] += delta * delta;
                    /* copy original data back */
                    for (n = 0; n < nsample; ++n)
                        x[mr + n * mdim] = xtmp[n];
                }
            }
        }
    }
    PutRNGstate();
    /* end of tree iterations=======================================*/

    if (*biasCorr) {  /* bias correction for predicted values */
        for (n = 0; n < nsample; ++n) {
            if (nout[n]) yptr[n] = coef[0] + coef[1] * yptr[n];
        }
        if (*testdat) {
            for (n = 0; n < ntest; ++n) {
                yTestPred[n] = coef[0] + coef[1] * yTestPred[n];
            }
        }
    }

    if (*doProx) {
        for (n = 0; n < nsample; ++n) {
            for (k = n + 1; k < nsample; ++k) {
                prox[nsample*k + n] /= *oobprox ?
                    (oobpair[nsample*k + n] > 0 ? oobpair[nsample*k + n] : 1) :
                    *nTree;
                prox[nsample * n + k] = prox[nsample * k + n];
            }
            prox[nsample * n + n] = 1.0;
        }
        if (*testdat) {
            for (n = 0; n < ntest; ++n)
                for (k = 0; k < ntest + nsample; ++k)
                    proxts[ntest*k + n] /= *nTree;
        }
    }

    if (varImp) {
        for (m = 0; m < mdim; ++m) {
            errimp[m] = errimp[m] / *nTree;
            impSD[m] = sqrt( ((impSD[m] / *nTree) -
                              (errimp[m] * errimp[m])) / *nTree );
            if (localImp) {
                for (n = 0; n < nsample; ++n) {
                    impmat[m + n * mdim] /= nout[n];
                }
            }
        }
    }
    for (m = 0; m < mdim; ++m) tgini[m] /= *nTree;
}
/*
 * findBestSplit: search for the best split of one regression-tree node.
 *
 * The node consists of the cases listed in jdex[ndstart..ndend] (inclusive);
 * jdex holds 1-based case numbers.  Up to mtry variables are drawn at random
 * without replacement and, for each, every gap between distinct sorted
 * values is scored with
 *
 *     crit = suml^2 / npopl + sumr^2 / npopr - sumnode^2 / nodecnt
 *
 * Inputs:
 *   x        predictors, addressed as x[var + (case - 1) * mdim]
 *   jdex     1-based case numbers; on a successful split the entries in
 *            [ndstart, ndend] are reordered so left-daughter cases come first
 *   y        response, addressed as y[case - 1]
 *   mdim     number of predictor variables
 *   nsample  number of cases (size of the scratch buffers)
 *   mtry     number of variables to try
 *   sumnode  sum of y over the node; nodecnt is the node's case count
 *   cat      cat[var] == 1 marks a numeric variable; otherwise it is the
 *            number of categories (local arrays allow at most 32)
 *
 * Outputs:
 *   *msplit   1-based index of the chosen variable, or -1 if none was found
 *   *decsplit best criterion value found (0.0 if none)
 *   *ubest    split point: midpoint of the gap for a numeric variable, or
 *             the value returned by pack() for a categorical one
 *   *ndendl   last index in jdex belonging to the left daughter
 *   *jstat    set to 1 when no split was found (otherwise left untouched)
 *
 * Scratch memory is held in function-static buffers controlled by the global
 * in_findBestSplit: 0 means "allocate on this call", 1 means "already
 * allocated", -99 means "release the buffers and return".  The buffers are
 * sized from the nsample/mdim of the call that allocates them.
 */
void findBestSplit(double *x, int *jdex, double *y, int mdim, int nsample,
                   int ndstart, int ndend, int *msplit, double *decsplit,
                   double *ubest, int *ndendl, int *jstat, int mtry,
                   double sumnode, int nodecnt, int *cat) {
    int last, ncat[32], icat[32], lc, nl, nr, npopl, npopr;
    int i, j, kv, l;
    static int *mind, *ncase;
    static double *xt, *ut, *v, *yl;
    double sumcat[32], avcat[32], tavcat[32], ubestt;
    double crit, critmax, critvar, suml, sumr, d, critParent;

    if (in_findBestSplit == -99) {
        /* Release request: none of the other arguments are touched. */
        free(ncase);
        free(mind);
        free(v);
        free(yl);
        free(xt);
        free(ut);
        /* Clear the pointers and return to the "not allocated" state so a
         * repeated release is a harmless no-op and the next real call
         * allocates fresh buffers instead of freeing dangling ones. */
        ncase = NULL;
        mind = NULL;
        v = NULL;
        yl = NULL;
        xt = NULL;
        ut = NULL;
        in_findBestSplit = 0;
        return;
    }

    if (in_findBestSplit == 0) {
        in_findBestSplit = 1;
        ut = (double *) calloc(nsample, sizeof(double));
        xt = (double *) calloc(nsample, sizeof(double));
        v = (double *) calloc(nsample, sizeof(double));
        yl = (double *) calloc(nsample, sizeof(double));
        /* One spare slot is kept from the original code as a safety margin
         * against an over-range index. */
        mind = (int *) calloc(mdim + 1, sizeof(int));
        ncase = (int *) calloc(nsample, sizeof(int));
    }
    zeroDouble(ut, nsample);
    zeroDouble(xt, nsample);
    zeroDouble(v, nsample);
    zeroDouble(yl, nsample);
    zeroInt(mind, mdim);
    zeroInt(ncase, nsample);

    zeroDouble(avcat, 32);
    zeroDouble(tavcat, 32);

    /* START BIG LOOP */
    *msplit = -1;
    *decsplit = 0.0;
    critmax = 0.0;
    ubestt = 0.0;
    for (i = 0; i < mdim; ++i) mind[i] = i;

    last = mdim - 1;
    /* Stop once every variable has been tried: with mtry > mdim the
     * unguarded loop would swap with mind[-1]. */
    for (i = 0; i < mtry && last >= 0; ++i) {
        critvar = 0.0;
        j = (int) (unif_rand() * (last + 1));
        /* Guard against a generator that can return exactly 1.0. */
        if (j > last) j = last;
        kv = mind[j];
        /* Move the drawn variable out of the candidate range [0, last]. */
        swapInt(mind[j], mind[last]);
        last--;

        lc = cat[kv];
        if (lc == 1) {
            /* numeric variable */
            for (j = ndstart; j <= ndend; ++j) {
                xt[j] = x[kv + (jdex[j] - 1) * mdim];
                yl[j] = y[jdex[j] - 1];
            }
        } else {
            /* categorical variable */
            zeroInt(ncat, 32);
            zeroDouble(sumcat, 32);
            for (j = ndstart; j <= ndend; ++j) {
                l = (int) x[kv + (jdex[j] - 1) * mdim];
                sumcat[l - 1] += y[jdex[j] - 1];
                ncat[l - 1]++;
            }
            /* Compute means of Y by category. */
            for (j = 0; j < lc; ++j) {
                avcat[j] = ncat[j] ? sumcat[j] / ncat[j] : 0.0;
            }
            /* Make the category mean the `pseudo' X data. */
            for (j = 0; j < nsample; ++j) {
                xt[j] = avcat[(int) x[kv + (jdex[j] - 1) * mdim] - 1];
                yl[j] = y[jdex[j] - 1];
            }
        }

        /* copy the x data in this node. */
        for (j = ndstart; j <= ndend; ++j) v[j] = xt[j];
        for (j = 1; j <= nsample; ++j) ncase[j - 1] = j;
        /* Sort v over the node's range (bounds are 1-based), carrying the
         * index vector ncase along. */
        R_qsort_I(v, ncase, ndstart + 1, ndend + 1);
        /* All values equal in this node: nothing to split on. */
        if (v[ndstart] >= v[ndend]) continue;

        /* ncase(n)=case number of v nth from bottom */
        critParent = sumnode * sumnode / nodecnt;
        suml = 0.0;
        sumr = sumnode;
        npopl = 0;
        npopr = nodecnt;
        crit = 0.0;
        /* Search through the "gaps" in the x-variable, moving one case at
         * a time from the right part to the left part. */
        for (j = ndstart; j <= ndend - 1; ++j) {
            d = yl[ncase[j] - 1];
            suml += d;
            sumr -= d;
            npopl++;
            npopr--;
            if (v[j] < v[j+1]) {
                crit = (suml * suml / npopl) + (sumr * sumr / npopr) -
                    critParent;
                if (crit > critvar) {
                    ubestt = (v[j] + v[j+1]) / 2.0;
                    critvar = crit;
                }
            }
        }
        if (critvar > critmax) {
            /* Best variable so far: remember its split point and data. */
            *ubest = ubestt;
            *msplit = kv + 1;
            critmax = critvar;
            for (j = ndstart; j <= ndend; ++j) {
                ut[j] = xt[j];
            }
            if (cat[kv] > 1) {
                for (j = 0; j < cat[kv]; ++j) tavcat[j] = avcat[j];
            }
        }
    }
    *decsplit = critmax;

    /* If best split can not be found, set to terminal node and return. */
    if (*msplit != -1) {
        /* Partition the node's cases: left daughter first ... */
        nl = ndstart;
        for (j = ndstart; j <= ndend; ++j) {
            if (ut[j] <= *ubest) {
                nl++;
                ncase[nl-1] = jdex[j];
            }
        }
        *ndendl = imax2(nl - 1, ndstart);
        /* ... then the right daughter. */
        nr = *ndendl + 1;
        for (j = ndstart; j <= ndend; ++j) {
            if (ut[j] > *ubest) {
                if (nr >= nsample) break;
                nr++;
                ncase[nr - 1] = jdex[j];
            }
        }
        if (*ndendl >= ndend) *ndendl = ndend - 1;
        for (j = ndstart; j <= ndend; ++j) jdex[j] = ncase[j];

        lc = cat[*msplit - 1];
        if (lc > 1) {
            /* Categorical split: flag the categories whose mean lies below
             * the split point and encode the flags with pack(). */
            for (j = 0; j < lc; ++j) {
                icat[j] = (tavcat[j] < *ubest) ? 1 : 0;
            }
            *ubest = pack(lc, icat);
        }
    } else *jstat = 1;
}
/*
 * regTree: grow one regression tree.
 *
 * Inputs:
 *   x, y      training data; x is addressed as x[var + case * mdim] by the
 *             split search, y as y[case]
 *   mdim      number of predictor variables
 *   nsample   number of training cases
 *   nrnodes   capacity of the per-node arrays
 *   nthsize   a daughter node with this many cases or fewer is made terminal
 *   mtry      number of variables tried at each split
 *   cat       per-variable type/category count, passed to findBestSplit
 *
 * Outputs (per-node arrays, entry k describes node k):
 *   lDaughter, rDaughter  1-based node numbers of the daughters
 *   upper                 split point of the node
 *   avnode                mean of y over the node's cases
 *   nodestatus            NODE_TOSPLIT / NODE_INTERIOR / NODE_TERMINAL
 *   mbest                 1-based index of the split variable
 *   *treeSize             nrnodes minus the number of nodes with status 0
 *   tgini[var]            incremented by the split criterion at each split
 *   varUsed[var]          set to 1 for every variable used in a split
 *
 * Only nodestatus and avnode are cleared here; the other per-node arrays
 * are written only for the nodes that get split.
 *
 * Scratch memory is held in function-static buffers controlled by the global
 * in_regTree: 0 means "allocate on this call", 1 means "already allocated",
 * -99 means "release the buffers and return".
 */
void regTree(double *x, double *y, int mdim, int nsample, int *lDaughter,
             int *rDaughter, double *upper, double *avnode,
             SMALL_INT *nodestatus, int nrnodes, int *treeSize, int nthsize,
             int mtry, int *mbest, int *cat, double *tgini, int *varUsed) {
    int i, j, k, m, ncur;
    static int *jdex, *nodestart, *nodepop;
    int ndstart, ndend, ndendl, nodecnt, jstat, msplit;
    double d, av, decsplit, ubest, sumnode;

    if (in_regTree == -99) {
        /* Release request: none of the other arguments are touched. */
        free(nodestart);
        free(jdex);
        free(nodepop);
        /* Clear the pointers and return to the "not allocated" state so a
         * repeated release is a harmless no-op and the next real call
         * allocates fresh buffers instead of freeing dangling ones. */
        nodestart = NULL;
        jdex = NULL;
        nodepop = NULL;
        in_regTree = 0;
        return;
    }

    if (in_regTree == 0) {
        in_regTree = 1;
        nodestart = (int *) calloc(nrnodes, sizeof(int));
        nodepop = (int *) calloc(nrnodes, sizeof(int));
        jdex = (int *) calloc(nsample, sizeof(int));
    }

    /* initialize some arrays for the tree */
    zeroSMALLInt(nodestatus, nrnodes);
    zeroInt(nodestart, nrnodes);
    zeroInt(nodepop, nrnodes);
    zeroDouble(avnode, nrnodes);

    /* jdex holds 1-based case numbers; the root contains every case. */
    for (i = 1; i <= nsample; ++i) jdex[i-1] = i;

    ncur = 0;
    nodestart[0] = 0;
    nodepop[0] = nsample;
    nodestatus[0] = NODE_TOSPLIT;

    /* running mean of Y over the root node */
    av = 0.0;
    for (i = 0; i < nsample; ++i) {
        d = y[jdex[i] - 1];
        av = (i * av + d) / (i + 1);
    }
    avnode[0] = av;

    /* start main loop */
    for (k = 0; k < nrnodes - 2; ++k) {
        /* Stop when all existing nodes were visited or there is no room
         * left for two more daughters. */
        if (k > ncur || ncur >= nrnodes - 2) break;
        /* skip if the node is not to be split */
        if (nodestatus[k] != NODE_TOSPLIT) continue;

        /* initialize for next call to findbestsplit */
        ndstart = nodestart[k];
        ndend = ndstart + nodepop[k] - 1;
        nodecnt = nodepop[k];
        sumnode = nodecnt * avnode[k];
        jstat = 0;
        decsplit = 0.0;

        findBestSplit(x, jdex, y, mdim, nsample, ndstart, ndend, &msplit,
                      &decsplit, &ubest, &ndendl, &jstat, mtry, sumnode,
                      nodecnt, cat);
        if (jstat == 1) {
            /* Node is terminal: Mark it as such and move on to the next. */
            nodestatus[k] = NODE_TERMINAL;
            continue;
        }
        /* Found the best split. */
        mbest[k] = msplit;
        varUsed[msplit - 1] = 1;
        upper[k] = ubest;
        tgini[msplit - 1] += decsplit;
        nodestatus[k] = NODE_INTERIOR;

        /* leftnode no.= ncur+1, rightnode no. = ncur+2. */
        nodepop[ncur + 1] = ndendl - ndstart + 1;
        nodepop[ncur + 2] = ndend - ndendl;
        nodestart[ncur + 1] = ndstart;
        nodestart[ncur + 2] = ndendl + 1;

        /* running mean of Y over the left daughter node */
        av = 0.0;
        for (j = ndstart; j <= ndendl; ++j) {
            d = y[jdex[j]-1];
            m = j - ndstart;
            av = (m * av + d) / (m+1);
        }
        avnode[ncur+1] = av;
        nodestatus[ncur+1] = NODE_TOSPLIT;
        if (nodepop[ncur + 1] <= nthsize) {
            nodestatus[ncur + 1] = NODE_TERMINAL;
        }

        /* running mean of Y over the right daughter node */
        av = 0.0;
        for (j = ndendl + 1; j <= ndend; ++j) {
            d = y[jdex[j]-1];
            m = j - (ndendl + 1);
            av = (m * av + d) / (m + 1);
        }
        avnode[ncur + 2] = av;
        nodestatus[ncur + 2] = NODE_TOSPLIT;
        if (nodepop[ncur + 2] <= nthsize) {
            nodestatus[ncur + 2] = NODE_TERMINAL;
        }

        /* map the daughter nodes (stored as 1-based node numbers) */
        lDaughter[k] = ncur + 1 + 1;
        rDaughter[k] = ncur + 2 + 1;
        /* Augment the tree by two nodes. */
        ncur += 2;
    }

    /* Count the nodes actually used and close any node that was still
     * waiting to be split when the loop ended. */
    *treeSize = nrnodes;
    for (k = nrnodes - 1; k >= 0; --k) {
        if (nodestatus[k] == 0) (*treeSize)--;
        if (nodestatus[k] == NODE_TOSPLIT) {
            nodestatus[k] = NODE_TERMINAL;
        }
    }
}
/*
 * regRF: grow a regression random forest and accumulate its out-of-bag
 * (OOB) statistics.
 *
 * Inputs:
 *   x, y       training data; x is addressed as x[var + case * mdim]
 *   xdim       xdim[0] = nsample (cases), xdim[1] = mdim (variables)
 *   sampsize   number of cases drawn for each tree
 *   nthsize    node size at or below which regTree stops splitting
 *   nrnodes    per-tree capacity of the node arrays
 *   nTree      number of trees to grow
 *   mtry       number of variables tried at each split
 *   imp        imp[0] != 0 requests variable importance, imp[1] != 0
 *              requests per-case importance (impmat), imp[2] is the number
 *              of permutations per variable and tree
 *   cat, maxcat  variable types, passed through to the tree routines
 *   jprint     progress is printed every *jprint trees; 0 disables it (the
 *              value is then overwritten with *nTree + 1)
 *   doProx, oobprox  request proximities / use OOB-only pair counts
 *   biasCorr   apply a linear bias correction (coef[0] + coef[1] * yhat)
 *   keepf      keepf[0] != 0 stores tree j at offset j * *nrnodes in the
 *              node arrays (otherwise every tree reuses offset 0);
 *              keepf[1] != 0 records in-bag flags in inbag
 *   replace    *replace != 0 samples with replacement
 *   testdat, xts, nts, yts, labelts  optional test set: data, size,
 *              response, and whether the response is available
 *
 * Outputs:
 *   yptr       OOB prediction per case, averaged over the trees for which
 *              the case was OOB
 *   nout       number of trees for which each case was OOB
 *   mse[j]     OOB mean squared error after j + 1 trees
 *   msets[j]   test-set mean squared error after j + 1 trees (labelts only)
 *   yTestPred  test-set prediction averaged over all trees
 *   errimp     first mdim entries: mean increase in OOB squared error when
 *              a variable is permuted; when imp[0] is set the next mdim
 *              entries hold the accumulated split criterion per variable
 *              (otherwise that sum lives in the first mdim entries)
 *   impSD      standard error of the permutation importance
 *   impmat     per-case importance, addressed as impmat[var + case * mdim]
 *   prox, proxts  proximity matrices for training and test cases
 *   coef       bias-correction coefficients (presumably filled in by
 *              simpleLinReg -- confirm against its definition)
 *   treeSize and the node arrays (nodestatus, lDaughter, rDaughter,
 *   avnode, mbest, upper) describe the grown trees
 */
void regRF(double *x, double *y, int *xdim, int *sampsize,
           int *nthsize, int *nrnodes, int *nTree, int *mtry, int *imp,
           int *cat, int maxcat, int *jprint, int doProx, int oobprox,
           int biasCorr, double *yptr, double *errimp, double *impmat,
           double *impSD, double *prox, int *treeSize, SMALL_INT *nodestatus,
           int *lDaughter, int *rDaughter, double *avnode, int *mbest,
           double *upper, double *mse, const int *keepf, int *replace,
           int testdat, double *xts, int *nts, double *yts, int labelts,
           double *yTestPred, double *proxts, double *msets, double *coef,
           int *nout, int *inbag) {
    double errts = 0.0, meanY, meanYts, varY, varYts, r, xrand,
        errb = 0.0, resid = 0.0, ooberr, ooberrperm, delta, *resOOB;
    double *yb, *xtmp, *xb, *ytr, *ytree = NULL, *tgini;
    int k, m, mr, n, nOOB, j, jout, idx, ntest, last, ktmp, nPerm,
        nsample, mdim, keepF, keepInbag;
    int *oobpair, varImp, localImp, *varUsed;
    int *in, *nind, *nodex, *nodexts = NULL;

    /* Dummy arguments for the buffer-release calls at the end.  They are
     * initialised because tmp_i and tmp_d are passed by value there. */
    double tmp_d = 0;
    int tmp_i = 0;
    SMALL_INT tmp_c = 0;

    /* Seed the Mersenne Twister with an odd number.  The arithmetic is
     * done in unsigned so that 2 * rand() + 1 cannot overflow a signed int
     * when RAND_MAX equals INT_MAX. */
    seedMT(2U * (unsigned int) rand() + 1U);

    nsample = xdim[0];
    mdim = xdim[1];
    ntest = *nts;
    varImp = imp[0];
    localImp = imp[1];
    nPerm = imp[2];
    keepF = keepf[0];
    keepInbag = keepf[1];

    /* jprint == 0 means "never print": push the interval past nTree. */
    if (*jprint == 0) *jprint = *nTree + 1;

    yb = (double *) calloc(*sampsize, sizeof(double));
    xb = (double *) calloc((size_t) mdim * *sampsize, sizeof(double));
    ytr = (double *) calloc(nsample, sizeof(double));
    xtmp = (double *) calloc(nsample, sizeof(double));
    resOOB = (double *) calloc(nsample, sizeof(double));
    in = (int *) calloc(nsample, sizeof(int));
    nodex = (int *) calloc(nsample, sizeof(int));
    varUsed = (int *) calloc(mdim, sizeof(int));
    nind = *replace ? NULL : (int *) calloc(nsample, sizeof(int));

    if (testdat) {
        ytree = (double *) calloc(ntest, sizeof(double));
        nodexts = (int *) calloc(ntest, sizeof(int));
    }
    /* The element count is formed in size_t so nsample * nsample cannot
     * overflow int. */
    oobpair = (doProx && oobprox) ?
        (int *) calloc((size_t) nsample * nsample, sizeof(int)) : NULL;

    /* If variable importance is requested, tgini points to the second
       "column" of errimp, otherwise it's just the same as errimp. */
    tgini = varImp ? errimp + mdim : errimp;

    meanY = 0.0;
    varY = 0.0;

    zeroDouble(yptr, nsample);
    zeroInt(nout, nsample);
    /* One-pass running mean and variance of y. */
    for (n = 0; n < nsample; ++n) {
        varY += n * (y[n] - meanY)*(y[n] - meanY) / (n + 1);
        meanY = (n * meanY + y[n]) / (n + 1);
    }
    varY /= nsample;

    varYts = 0.0;
    meanYts = 0.0;
    if (testdat) {
        for (n = 0; n < ntest; ++n) {
            varYts += n * (yts[n] - meanYts)*(yts[n] - meanYts) / (n + 1);
            meanYts = (n * meanYts + yts[n]) / (n + 1);
        }
        varYts /= ntest;
    }

    if (doProx) {
        zeroDouble(prox, nsample * nsample);
        if (testdat) zeroDouble(proxts, ntest * (nsample + ntest));
    }

    if (varImp) {
        zeroDouble(errimp, mdim * 2);
        if (localImp) zeroDouble(impmat, nsample * mdim);
    } else {
        zeroDouble(errimp, mdim);
    }
    /* yTestPred is averaged across trees whenever a test set is supplied,
     * so it must start from zero in that case too: the first update
     * multiplies the old value by 0, which would keep a stray NaN/Inf. */
    if (testdat || labelts) zeroDouble(yTestPred, ntest);

    /* print header for running output */
    if (*jprint <= *nTree) {
        PRINTF(" | Out-of-bag ");
        if (testdat) PRINTF("| Test set ");
        PRINTF("|\n");
        PRINTF("Tree | MSE %%Var(y) ");
        if (testdat) PRINTF("| MSE %%Var(y) ");
        PRINTF("|\n");
    }
    GetRNGstate();

    /*************************************
     * Start the loop over trees.
     *************************************/
    for (j = 0; j < *nTree; ++j) {
        /* Offset of this tree in the node arrays. */
        idx = keepF ? j * *nrnodes : 0;
        zeroInt(in, nsample);
        zeroInt(varUsed, mdim);

        /* Draw a random sample for growing a tree. */
        if (*replace) { /* sampling with replacement */
            for (n = 0; n < *sampsize; ++n) {
                xrand = unif_rand();
                k = (int)(xrand * nsample);
                in[k] = 1;
                yb[n] = y[k];
                for (m = 0; m < mdim; ++m) {
                    xb[m + n * mdim] = x[m + k * mdim];
                }
            }
        } else { /* sampling w/o replacement */
            for (n = 0; n < nsample; ++n) nind[n] = n;
            last = nsample - 1;
            for (n = 0; n < *sampsize; ++n) {
                ktmp = (int) (unif_rand() * (last+1));
                k = nind[ktmp];
                /* Move the drawn case out of the candidate range. */
                swapInt(nind[ktmp], nind[last]);
                last--;
                in[k] = 1;
                yb[n] = y[k];
                for (m = 0; m < mdim; ++m) {
                    xb[m + n * mdim] = x[m + k * mdim];
                }
            }
        }
        if (keepInbag) {
            for (n = 0; n < nsample; ++n) inbag[n + j * nsample] = in[n];
        }

        /* grow the regression tree */
        regTree(xb, yb, mdim, *sampsize, lDaughter + idx, rDaughter + idx,
                upper + idx, avnode + idx, nodestatus + idx, *nrnodes,
                treeSize + j, *nthsize, *mtry, mbest + idx, cat, tgini,
                varUsed);

        /* predict the OOB data with the current tree */
        /* ytr is the prediction on OOB data by the current tree */
        predictRegTree(x, nsample, mdim, lDaughter + idx,
                       rDaughter + idx, nodestatus + idx, ytr, upper + idx,
                       avnode + idx, mbest + idx, treeSize[j], cat, maxcat,
                       nodex);

        /* yptr is the aggregated prediction by all trees grown so far */
        errb = 0.0;
        ooberr = 0.0;
        jout = 0; /* jout is the number of cases that has been OOB so far */
        nOOB = 0; /* nOOB is the number of OOB samples for this tree */
        for (n = 0; n < nsample; ++n) {
            if (in[n] == 0) {
                nout[n]++;
                nOOB++;
                yptr[n] = ((nout[n]-1) * yptr[n] + ytr[n]) / nout[n];
                resOOB[n] = ytr[n] - y[n];
                ooberr += resOOB[n] * resOOB[n];
            }
            if (nout[n]) {
                jout++;
                errb += (y[n] - yptr[n]) * (y[n] - yptr[n]);
            }
        }
        errb /= jout;

        /* Do simple linear regression of y on yhat for bias correction. */
        if (biasCorr) simpleLinReg(nsample, yptr, y, coef, &errb, nout);

        /* predict testset data with the current tree */
        if (testdat) {
            predictRegTree(xts, ntest, mdim, lDaughter + idx,
                           rDaughter + idx, nodestatus + idx, ytree,
                           upper + idx, avnode + idx,
                           mbest + idx, treeSize[j], cat, maxcat, nodexts);
            /* ytree is the prediction for test data by the current tree */
            /* yTestPred is the average prediction by all trees grown so far */
            errts = 0.0;
            for (n = 0; n < ntest; ++n) {
                yTestPred[n] = (j * yTestPred[n] + ytree[n]) / (j + 1);
            }
            /* compute testset MSE */
            if (labelts) {
                for (n = 0; n < ntest; ++n) {
                    resid = biasCorr ?
                        yts[n] - (coef[0] + coef[1]*yTestPred[n]) :
                        yts[n] - yTestPred[n];
                    errts += resid * resid;
                }
                errts /= ntest;
            }
        }

        /* Print running output. */
        if ((j + 1) % *jprint == 0) {
            PRINTF("%4d |", j + 1);
            PRINTF(" %8.4g %8.2f ", errb, 100 * errb / varY);
            if (labelts == 1) PRINTF("| %8.4g %8.2f ",
                                     errts, 100.0 * errts / varYts);
            PRINTF("|\n");
            fflush(stdout);
        }

        mse[j] = errb;
        if (labelts) msets[j] = errts;

        /* DO PROXIMITIES */
        if (doProx) {
            computeProximity(prox, oobprox, nodex, in, oobpair, nsample);
            /* proximity for test data */
            if (testdat) {
                /* In the next call, in and oobpair are not used. */
                computeProximity(proxts, 0, nodexts, in, oobpair, ntest);
                /* Test-vs-training block: count shared terminal nodes. */
                for (n = 0; n < ntest; ++n) {
                    for (k = 0; k < nsample; ++k) {
                        if (nodexts[n] == nodex[k]) {
                            proxts[n + ntest * (k+ntest)] += 1.0;
                        }
                    }
                }
            }
        }

        /* Variable importance */
        if (varImp) {
            for (mr = 0; mr < mdim; ++mr) {
                if (varUsed[mr]) { /* Go ahead if the variable is used */
                    /* make a copy of the m-th variable into xtmp */
                    for (n = 0; n < nsample; ++n)
                        xtmp[n] = x[mr + n * mdim];
                    ooberrperm = 0.0;
                    for (k = 0; k < nPerm; ++k) {
                        permuteOOB(mr, x, in, nsample, mdim);
                        /* Note: this overwrites ytr and nodex. */
                        predictRegTree(x, nsample, mdim, lDaughter + idx,
                                       rDaughter + idx, nodestatus + idx, ytr,
                                       upper + idx, avnode + idx, mbest + idx,
                                       treeSize[j], cat, maxcat, nodex);
                        for (n = 0; n < nsample; ++n) {
                            if (in[n] == 0) {
                                r = ytr[n] - y[n];
                                ooberrperm += r * r;
                                if (localImp) {
                                    impmat[mr + n * mdim] +=
                                        (r*r - resOOB[n]*resOOB[n]) / nPerm;
                                }
                            }
                        }
                    }
                    /* Mean increase in OOB squared error for this tree. */
                    delta = (ooberrperm / nPerm - ooberr) / nOOB;
                    errimp[mr] += delta;
                    impSD[mr] += delta * delta;
                    /* copy original data back */
                    for (n = 0; n < nsample; ++n)
                        x[mr + n * mdim] = xtmp[n];
                }
            }
        }
    }
    PutRNGstate();
    /* end of tree iterations=======================================*/

    if (biasCorr) { /* bias correction for predicted values */
        for (n = 0; n < nsample; ++n) {
            if (nout[n]) yptr[n] = coef[0] + coef[1] * yptr[n];
        }
        if (testdat) {
            for (n = 0; n < ntest; ++n) {
                yTestPred[n] = coef[0] + coef[1] * yTestPred[n];
            }
        }
    }

    if (doProx) {
        /* Normalise the accumulated counts and mirror them so the matrix
         * is symmetric with a unit diagonal. */
        for (n = 0; n < nsample; ++n) {
            for (k = n + 1; k < nsample; ++k) {
                prox[nsample*k + n] /= oobprox ?
                    (oobpair[nsample*k + n] > 0 ? oobpair[nsample*k + n] : 1) :
                    *nTree;
                prox[nsample * n + k] = prox[nsample * k + n];
            }
            prox[nsample * n + n] = 1.0;
        }
        if (testdat) {
            for (n = 0; n < ntest; ++n)
                for (k = 0; k < ntest + nsample; ++k)
                    proxts[ntest*k + n] /= *nTree;
        }
    }

    if (varImp) {
        for (m = 0; m < mdim; ++m) {
            errimp[m] = errimp[m] / *nTree;
            impSD[m] = sqrt( ((impSD[m] / *nTree) -
                              (errimp[m] * errimp[m])) / *nTree );
            if (localImp) {
                for (n = 0; n < nsample; ++n) {
                    impmat[m + n * mdim] /= nout[n];
                }
            }
        }
    }
    for (m = 0; m < mdim; ++m) tgini[m] /= *nTree;

    /* Release the static scratch buffers held by findBestSplit: setting
     * the state flag to -99 makes the next call free them and return
     * without looking at its other arguments. */
    in_findBestSplit = -99;
    findBestSplit(&tmp_d, &tmp_i, &tmp_d, tmp_i, tmp_i,
                  tmp_i, tmp_i, &tmp_i, &tmp_d,
                  &tmp_d, &tmp_i, &tmp_i, tmp_i,
                  tmp_d, tmp_i, &tmp_i);

    /* Same release protocol for the buffers held by regTree. */
    in_regTree = -99;
    regTree(&tmp_d, &tmp_d, tmp_i, tmp_i, &tmp_i,
            &tmp_i,
            &tmp_d, &tmp_d, &tmp_c, tmp_i,
            &tmp_i, tmp_i, tmp_i, &tmp_i, &tmp_i,
            &tmp_d, &tmp_i);

    /* Buffers that were not allocated are NULL, and free(NULL) is a
     * no-op, so no guards are needed. */
    free(yb);
    free(xb);
    free(ytr);
    free(xtmp);
    free(resOOB);
    free(in);
    free(nodex);
    free(varUsed);
    free(nind);
    free(ytree);
    free(nodexts);
    free(oobpair);
}