Пример #1
0
/*----------------------------------------------------------------------*/
void regForest(double *x, double *ypred, int *mdim, int *n,
        int *ntree, int *lDaughter, int *rDaughter,
        SMALL_INT *nodestatus, int *nrnodes, double *xsplit,
        double *avnodes, int *mbest, int *treeSize, int *cat,
        int maxcat, int *keepPred, double *allpred, int doProx,
        double *proxMat, int *nodes, int *nodex) {
    int i, j, idx1, idx2, *junk;
    double *ytree;
    
    junk = NULL;
    ytree = (double *) calloc(*n, sizeof(double));
    if (*nodes) {
        zeroInt(nodex, *n * *ntree);
    } else {
        zeroInt(nodex, *n);
    }
    if (doProx) zeroDouble(proxMat, *n * *n);
    if (*keepPred) zeroDouble(allpred, *n * *ntree);
    idx1 = 0;
    idx2 = 0;
    for (i = 0; i < *ntree; ++i) {
        zeroDouble(ytree, *n);
        predictRegTree(x, *n, *mdim, lDaughter + idx1, rDaughter + idx1,
                nodestatus + idx1, ytree, xsplit + idx1,
                avnodes + idx1, mbest + idx1, treeSize[i], cat, maxcat,
                nodex + idx2);
        
        for (j = 0; j < *n; ++j) ypred[j] += ytree[j];
        if (*keepPred) {
            for (j = 0; j < *n; ++j) allpred[j + i * *n] = ytree[j];
        }
        /* if desired, do proximities for this round */
        if (doProx) computeProximity(proxMat, 0, nodex + idx2, junk,
                junk, *n);
        idx1 += *nrnodes; /* increment the offset */
        if (*nodes) idx2 += *n;
    }
    for (i = 0; i < *n; ++i) ypred[i] /= *ntree;
    if (doProx) {
        for (i = 0; i < *n; ++i) {
            for (j = i + 1; j < *n; ++j) {
                proxMat[i + j * *n] /= *ntree;
                proxMat[j + i * *n] = proxMat[i + j * *n];
            }
            proxMat[i + i * *n] = 1.0;
        }
    }
    free(ytree);
}
Пример #2
0
void oldunpack(int l, int npack, int *icat) {
/*      
 * npack is a long integer.  The sub. returns icat, an integer of zeroes and
 * ones corresponding to the coefficients in the binary expansion of npack.
 */   
    int i;
    zeroInt(icat, 32);
    icat[0] = npack % 2; 
    for (i = 1; i < l; ++i) {
	npack = (npack - icat[i-1]) / 2;
	icat[i] = npack % 2;
    }
}
Пример #3
0
/*
 * Modified by A. Liaw 1/10/2003 (Deal with cutoff)
 * Re-written in C by A. Liaw 3/08/2004
 */
void oob(int nsample, int nclass, int *jin, int *cl, int *jtr, int *jerr,
        int *counttr, int *out, double *errtr, int *jest,
        double *cutoff) {
    int j, n, noob, *noobcl, ntie;
    double qq, smax, smaxtr;
    
    noobcl  = (int *) S_alloc_alt(nclass, sizeof(int));
    zeroInt(jerr, nsample);
    zeroDouble(errtr, nclass+1);
    
    noob = 0;
    for (n = 0; n < nsample; ++n) {
        if (out[n]) {
            noob++;
            noobcl[cl[n]-1]++;
            smax = 0.0;
            smaxtr = 0.0;
            ntie = 1;
            for (j = 0; j < nclass; ++j) {
                qq = (((double) counttr[j + n*nclass]) / out[n]) / cutoff[j];
                if (j+1 != cl[n]) smax = (qq > smax) ? qq : smax;
                /* if vote / cutoff is larger than current max, re-set max and
                 * change predicted class to the current class */
                if (qq > smaxtr) {
                    smaxtr = qq;
                    jest[n] = j+1;
                }
                /* break tie at random */
                if (qq == smaxtr) {
                    ntie++;
                    if (unif_rand() > 1.0 / ntie) {
                        smaxtr = qq;
                        jest[n] = j+1;
                    }
                }
            }
            if (jest[n] != cl[n]) {
                errtr[cl[n]] += 1.0;
                errtr[0] += 1.0;
                jerr[n] = 1;
            }
        }
    }
    errtr[0] /= noob;
    for (n = 1; n <= nclass; ++n) errtr[n] /= noobcl[n-1];
    free(noobcl);
}
Пример #4
0
/*====================================================================*/
void predictRegTree(double *x, int nsample, int mdim,
        int *lDaughter, int *rDaughter, SMALL_INT *nodestatus,
        double *ypred, double *split, double *nodepred,
        int *splitVar, int treeSize, int *cat, int maxcat,
        int *nodex) {
    int i, j, k, m, *cbestsplit = NULL;
    unsigned int npack;
    
    /* decode the categorical splits */
    if (maxcat > 1) {
        cbestsplit = (int *) calloc(maxcat * treeSize, sizeof(int));
        zeroInt(cbestsplit, maxcat * treeSize);
        for (i = 0; i < treeSize; ++i) {
            if (nodestatus[i] != NODE_TERMINAL && cat[splitVar[i] - 1] > 1) {
                npack = (unsigned int) split[i];
                /* unpack `npack' into bits */
                for (j = 0; npack; npack >>= 1, ++j) {
                    cbestsplit[j + i*maxcat] = npack & 1;
                }
            }
        }
Пример #5
0
void classRF(double *x, int *dimx, int *cl, int *ncl, int *cat, int *maxcat,
        int *sampsize, int *strata, int *Options, int *ntree, int *nvar,
        int *ipi, double *classwt, double *cut, int *nodesize,
        int *outcl, int *counttr, double *prox,
        double *imprt, double *impsd, double *impmat, int *nrnodes,
        int *ndbigtree, int *nodestatus, int *bestvar, int *treemap,
        int *nodeclass, double *xbestsplit, double *errtr,
        int *testdat, double *xts, int *clts, int *nts, double *countts,
        int *outclts, int labelts, double *proxts, double *errts,
        int *inbag) {
    /******************************************************************
     *  C wrapper for random forests:  get input from R and drive
     *  the Fortran routines.
     *
     *  Input:
     *
     *  x:        matrix of predictors (transposed!)
     *  dimx:     two integers: number of variables and number of cases
     *  cl:       class labels of the data
     *  ncl:      number of classes in the responsema
     *  cat:      integer vector of number of classes in the predictor;
     *            1=continuous
     * maxcat:    maximum of cat
     * Options:   7 integers: (0=no, 1=yes)
     *     add a second class (for unsupervised RF)?
     *         1: sampling from product of marginals
     *         2: sampling from product of uniforms
     *     assess variable importance?
     *     calculate proximity?
     *     calculate proximity based on OOB predictions?
     *     calculate outlying measure?
     *     how often to print output?
     *     keep the forest for future prediction?
     *  ntree:    number of trees
     *  nvar:     number of predictors to use for each split
     *  ipi:      0=use class proportion as prob.; 1=use supplied priors
     *  pi:       double vector of class priors
     *  nodesize: minimum node size: no node with fewer than ndsize
     *            cases will be split
     *
     *  Output:
     *
     *  outcl:    class predicted by RF
     *  counttr:  matrix of votes (transposed!)
     *  imprt:    matrix of variable importance measures
     *  impmat:   matrix of local variable importance measures
     *  prox:     matrix of proximity (if iprox=1)
     ******************************************************************/
    
    int nsample0, mdim, nclass, addClass, mtry, ntest, nsample, ndsize,
            mimp, nimp, near, nuse, noutall, nrightall, nrightimpall,
            keepInbag, nstrata;
    int jb, j, n, m, k, idxByNnode, idxByNsample, imp, localImp, iprox,
            oobprox, keepf, replace, stratify, trace, *nright,
            *nrightimp, *nout, *nclts, Ntree;
    
    int *out, *bestsplitnext, *bestsplit, *nodepop, *jin, *nodex,
            *nodexts, *nodestart, *ta, *ncase, *jerr, *varUsed,
            *jtr, *classFreq, *idmove, *jvr,
            *at, *a, *b, *mind, *nind, *jts, *oobpair;
    int **strata_idx, *strata_size, last, ktmp, anyEmpty, ntry;
    
    double av=0.0;
    
    double *tgini, *tx, *wl, *classpop, *tclasscat, *tclasspop, *win,
            *tp, *wr;
    
    //Do initialization for COKUS's Random generator
    seedMT(2*rand()+1);  //works well with odd number so why don't use that
    
    addClass = Options[0];
    imp      = Options[1];
    localImp = Options[2];
    iprox    = Options[3];
    oobprox  = Options[4];
    trace    = Options[5];
    keepf    = Options[6];
    replace  = Options[7];
    stratify = Options[8];
    keepInbag = Options[9];
    mdim     = dimx[0];
    nsample0 = dimx[1];
    nclass   = (*ncl==1) ? 2 : *ncl;
    ndsize   = *nodesize;
    Ntree    = *ntree;
    mtry     = *nvar;
    ntest    = *nts;
    nsample = addClass ? (nsample0 + nsample0) : nsample0;
    mimp = imp ? mdim : 1;
    nimp = imp ? nsample : 1;
    near = iprox ? nsample0 : 1;
    if (trace == 0) trace = Ntree + 1;
    
    /*printf("\nmdim %d, nclass %d, nrnodes %d, nsample %d, ntest %d\n", mdim, nclass, *nrnodes, nsample, ntest);
    printf("\noobprox %d, mdim %d, nsample0 %d, Ntree %d, mtry %d, mimp %d", oobprox, mdim, nsample0, Ntree, mtry, mimp);
    printf("\nstratify %d, replace %d",stratify,replace);
    printf("\n");*/
    tgini =      (double *) S_alloc_alt(mdim, sizeof(double));
    wl =         (double *) S_alloc_alt(nclass, sizeof(double));
    wr =         (double *) S_alloc_alt(nclass, sizeof(double));
    classpop =   (double *) S_alloc_alt(nclass* *nrnodes, sizeof(double));
    tclasscat =  (double *) S_alloc_alt(nclass*32, sizeof(double));
    tclasspop =  (double *) S_alloc_alt(nclass, sizeof(double));
    tx =         (double *) S_alloc_alt(nsample, sizeof(double));
    win =        (double *) S_alloc_alt(nsample, sizeof(double));
    tp =         (double *) S_alloc_alt(nsample, sizeof(double));
    
    out =           (int *) S_alloc_alt(nsample, sizeof(int));
    bestsplitnext = (int *) S_alloc_alt(*nrnodes, sizeof(int));
    bestsplit =     (int *) S_alloc_alt(*nrnodes, sizeof(int));
    nodepop =       (int *) S_alloc_alt(*nrnodes, sizeof(int));
    nodestart =     (int *) S_alloc_alt(*nrnodes, sizeof(int));
    jin =           (int *) S_alloc_alt(nsample, sizeof(int));
    nodex =         (int *) S_alloc_alt(nsample, sizeof(int));
    nodexts =       (int *) S_alloc_alt(ntest, sizeof(int));
    ta =            (int *) S_alloc_alt(nsample, sizeof(int));
    ncase =         (int *) S_alloc_alt(nsample, sizeof(int));
    jerr =          (int *) S_alloc_alt(nsample, sizeof(int));
    varUsed =       (int *) S_alloc_alt(mdim, sizeof(int));
    jtr =           (int *) S_alloc_alt(nsample, sizeof(int));
    jvr =           (int *) S_alloc_alt(nsample, sizeof(int));
    classFreq =     (int *) S_alloc_alt(nclass, sizeof(int));
    jts =           (int *) S_alloc_alt(ntest, sizeof(int));
    idmove =        (int *) S_alloc_alt(nsample, sizeof(int));
    at =            (int *) S_alloc_alt(mdim*nsample, sizeof(int));
    a =             (int *) S_alloc_alt(mdim*nsample, sizeof(int));
    b =             (int *) S_alloc_alt(mdim*nsample, sizeof(int));
    mind =          (int *) S_alloc_alt(mdim, sizeof(int));
    nright =        (int *) S_alloc_alt(nclass, sizeof(int));
    nrightimp =     (int *) S_alloc_alt(nclass, sizeof(int));
    nout =          (int *) S_alloc_alt(nclass, sizeof(int));
    if (oobprox) {
        oobpair = (int *) S_alloc_alt(near*near, sizeof(int));
    }
    //printf("nsample=%d\n", nsample);
    /* Count number of cases in each class. */
    zeroInt(classFreq, nclass);
    for (n = 0; n < nsample; ++n) classFreq[cl[n] - 1] ++;
    /* Normalize class weights. */
    //Rprintf("ipi %d ",*ipi);
    //for(n=0;n<nclass;n++) Rprintf("%d: %d, %f,",n,classFreq[n],classwt[n]);
    normClassWt(cl, nsample, nclass, *ipi, classwt, classFreq);
    //for(n=0;n<nclass;n++) Rprintf("%d: %d, %f,",n,classFreq[n],classwt[n]);
   
    if (stratify) {
        /* Count number of strata and frequency of each stratum. */
        nstrata = 0;
        for (n = 0; n < nsample0; ++n)
            if (strata[n] > nstrata) nstrata = strata[n];
        /* Create the array of pointers, each pointing to a vector
         * of indices of where data of each stratum is. */
        strata_size = (int  *) S_alloc_alt(nstrata, sizeof(int));
        for (n = 0; n < nsample0; ++n) {
            strata_size[strata[n] - 1] ++;
        }
        strata_idx =  (int **) S_alloc_alt(nstrata, sizeof(int *));
        for (n = 0; n < nstrata; ++n) {
            strata_idx[n] = (int *) S_alloc_alt(strata_size[n], sizeof(int));
        }
        zeroInt(strata_size, nstrata);
        for (n = 0; n < nsample0; ++n) {
            strata_size[strata[n] - 1] ++;
            strata_idx[strata[n] - 1][strata_size[strata[n] - 1] - 1] = n;
        }
    } else {
        nind = replace ? NULL : (int *) S_alloc_alt(nsample, sizeof(int));
    }
    
    /*    INITIALIZE FOR RUN */
    if (*testdat) zeroDouble(countts, ntest * nclass);
    zeroInt(counttr, nclass * nsample);
    zeroInt(out, nsample);
    zeroDouble(tgini, mdim);
    zeroDouble(errtr, (nclass + 1) * Ntree);
    
    if (labelts) {
        nclts  = (int *) S_alloc_alt(nclass, sizeof(int));
        for (n = 0; n < ntest; ++n) nclts[clts[n]-1]++;
        zeroDouble(errts, (nclass + 1) * Ntree);
    }
    //printf("labelts %d\n",labelts);fflush(stdout);
    if (imp) {
        zeroDouble(imprt, (nclass+2) * mdim);
        zeroDouble(impsd, (nclass+1) * mdim);
        if (localImp) zeroDouble(impmat, nsample * mdim);
    }
    if (iprox) {
        zeroDouble(prox, nsample0 * nsample0);
        if (*testdat) zeroDouble(proxts, ntest * (ntest + nsample0));
    }
    makeA(x, mdim, nsample, cat, at, b);
    
    //R_CheckUserInterrupt();
    
    
    /* Starting the main loop over number of trees. */
    GetRNGstate();
    if (trace <= Ntree) {
        /* Print header for running output. */
        Rprintf("ntree      OOB");
        for (n = 1; n <= nclass; ++n) Rprintf("%7i", n);
        if (labelts) {
            Rprintf("|    Test");
            for (n = 1; n <= nclass; ++n) Rprintf("%7i", n);
        }
        Rprintf("\n");
    }
    idxByNnode = 0;
    idxByNsample = 0;
    
    //Rprintf("addclass %d, ntree %d, cl[300]=%d", addClass,Ntree,cl[299]);
    for(jb = 0; jb < Ntree; jb++) {
		//Rprintf("addclass %d, ntree %d, cl[300]=%d", addClass,Ntree,cl[299]);
        //printf("jb=%d,\n",jb);
        /* Do we need to simulate data for the second class? */
        if (addClass) createClass(x, nsample0, nsample, mdim);
        do {
            zeroInt(nodestatus + idxByNnode, *nrnodes);
            zeroInt(treemap + 2*idxByNnode, 2 * *nrnodes);
            zeroDouble(xbestsplit + idxByNnode, *nrnodes);
            zeroInt(nodeclass + idxByNnode, *nrnodes);
            zeroInt(varUsed, mdim);
            /* TODO: Put all sampling code into a function. */
            /* drawSample(sampsize, nsample, ); */
            if (stratify) {  /* stratified sampling */
                zeroInt(jin, nsample);
                zeroDouble(tclasspop, nclass);
                zeroDouble(win, nsample);
                if (replace) {  /* with replacement */
                    for (n = 0; n < nstrata; ++n) {
                        for (j = 0; j < sampsize[n]; ++j) {
                            ktmp = (int) (unif_rand() * strata_size[n]);
                            k = strata_idx[n][ktmp];
                            tclasspop[cl[k] - 1] += classwt[cl[k] - 1];
                            win[k] += classwt[cl[k] - 1];
                            jin[k] = 1;
                        }
                    }
                } else { /* stratified sampling w/o replacement */
                    /* re-initialize the index array */
                    zeroInt(strata_size, nstrata);
                    for (j = 0; j < nsample; ++j) {
                        strata_size[strata[j] - 1] ++;
                        strata_idx[strata[j] - 1][strata_size[strata[j] - 1] - 1] = j;
                    }
                    /* sampling without replacement */
                    for (n = 0; n < nstrata; ++n) {
                        last = strata_size[n] - 1;
                        for (j = 0; j < sampsize[n]; ++j) {
                            ktmp = (int) (unif_rand() * (last+1));
                            k = strata_idx[n][ktmp];
                            swapInt(strata_idx[n][last], strata_idx[n][ktmp]);
                            last--;
                            tclasspop[cl[k] - 1] += classwt[cl[k]-1];
                            win[k] += classwt[cl[k]-1];
                            jin[k] = 1;
                        }
                    }
                }
            } else {  /* unstratified sampling */
                anyEmpty = 0;
                ntry = 0;
                do {
                    zeroInt(jin, nsample);
                    zeroDouble(tclasspop, nclass);
                    zeroDouble(win, nsample);
                    if (replace) {
                        for (n = 0; n < *sampsize; ++n) {
                            k = unif_rand() * nsample;
                            tclasspop[cl[k] - 1] += classwt[cl[k]-1];
                            win[k] += classwt[cl[k]-1];
                            jin[k] = 1;
                        }
                    } else {
                        for (n = 0; n < nsample; ++n) nind[n] = n;
                        last = nsample - 1;
                        for (n = 0; n < *sampsize; ++n) {
                            ktmp = (int) (unif_rand() * (last+1));
                            k = nind[ktmp];
                            swapInt(nind[ktmp], nind[last]);
                            last--;
                            tclasspop[cl[k] - 1] += classwt[cl[k]-1];
                            win[k] += classwt[cl[k]-1];
                            jin[k] = 1;
                        }
                    }
                    /* check if any class is missing in the sample */
                    for (n = 0; n < nclass; ++n) {
                        if (tclasspop[n] == 0) anyEmpty = 1;
                    }
                    ntry++;
                } while (anyEmpty && ntry <= 10);
            }
            
            /* If need to keep indices of inbag data, do that here. */
            if (keepInbag) {
                for (n = 0; n < nsample0; ++n) {
                    inbag[n + idxByNsample] = jin[n];
                }
            }
            
            /* Copy the original a matrix back. */
            memcpy(a, at, sizeof(int) * mdim * nsample);
            modA(a, &nuse, nsample, mdim, cat, *maxcat, ncase, jin);
            
            #ifdef WIN64
            F77_CALL(_buildtree)
            #endif
                    
            #ifndef WIN64
            F77_CALL(buildtree)
            #endif        
            (a, b, cl, cat, maxcat, &mdim, &nsample,
                    &nclass,
                    treemap + 2*idxByNnode, bestvar + idxByNnode,
                    bestsplit, bestsplitnext, tgini,
                    nodestatus + idxByNnode, nodepop,
                    nodestart, classpop, tclasspop, tclasscat,
                    ta, nrnodes, idmove, &ndsize, ncase,
                    &mtry, varUsed, nodeclass + idxByNnode,
                    ndbigtree + jb, win, wr, wl, &mdim,
                    &nuse, mind);
            /* if the "tree" has only the root node, start over */
        } while (ndbigtree[jb] == 1);
        
        Xtranslate(x, mdim, *nrnodes, nsample, bestvar + idxByNnode,
                bestsplit, bestsplitnext, xbestsplit + idxByNnode,
                nodestatus + idxByNnode, cat, ndbigtree[jb]);
        
        /*  Get test set error */
        if (*testdat) {
            predictClassTree(xts, ntest, mdim, treemap + 2*idxByNnode,
                    nodestatus + idxByNnode, xbestsplit + idxByNnode,
                    bestvar + idxByNnode,
                    nodeclass + idxByNnode, ndbigtree[jb],
                    cat, nclass, jts, nodexts, *maxcat);
            TestSetError(countts, jts, clts, outclts, ntest, nclass, jb+1,
                    errts + jb*(nclass+1), labelts, nclts, cut);
        }
        
        /*  Get out-of-bag predictions and errors. */
        predictClassTree(x, nsample, mdim, treemap + 2*idxByNnode,
                nodestatus + idxByNnode, xbestsplit + idxByNnode,
                bestvar + idxByNnode,
                nodeclass + idxByNnode, ndbigtree[jb],
                cat, nclass, jtr, nodex, *maxcat);
        
        zeroInt(nout, nclass);
        noutall = 0;
        for (n = 0; n < nsample; ++n) {
            if (jin[n] == 0) {
                /* increment the OOB votes */
                counttr[n*nclass + jtr[n] - 1] ++;
                /* count number of times a case is OOB */
                out[n]++;
                /* count number of OOB cases in the current iteration.
                 * nout[n] is the number of OOB cases for the n-th class.
                 * noutall is the number of OOB cases overall. */
                nout[cl[n] - 1]++;
                noutall++;
            }
        }
        
        /* Compute out-of-bag error rate. */
        oob(nsample, nclass, jin, cl, jtr, jerr, counttr, out,
                errtr + jb*(nclass+1), outcl, cut);
        
        if ((jb+1) % trace == 0) {
            Rprintf("%5i: %6.2f%%", jb+1, 100.0*errtr[jb * (nclass+1)]);
            for (n = 1; n <= nclass; ++n) {
                Rprintf("%6.2f%%", 100.0 * errtr[n + jb * (nclass+1)]);
            }
            if (labelts) {
                Rprintf("| ");
                for (n = 0; n <= nclass; ++n) {
                    Rprintf("%6.2f%%", 100.0 * errts[n + jb * (nclass+1)]);
                }
            }
            Rprintf("\n");
            
            //R_CheckUserInterrupt();
        }
        
        /*  DO VARIABLE IMPORTANCE  */
        if (imp) {
            nrightall = 0;
            /* Count the number of correct prediction by the current tree
             * among the OOB samples, by class. */
            zeroInt(nright, nclass);
            for (n = 0; n < nsample; ++n) {
                /* out-of-bag and predicted correctly: */
                if (jin[n] == 0 && jtr[n] == cl[n]) {
                    nright[cl[n] - 1]++;
                    nrightall++;
                }
            }
            for (m = 0; m < mdim; ++m) {
                if (varUsed[m]) {
                    nrightimpall = 0;
                    zeroInt(nrightimp, nclass);
                    for (n = 0; n < nsample; ++n) tx[n] = x[m + n*mdim];
                    /* Permute the m-th variable. */
                    permuteOOB(m, x, jin, nsample, mdim);
                    /* Predict the modified data using the current tree. */
                    predictClassTree(x, nsample, mdim, treemap + 2*idxByNnode,
                            nodestatus + idxByNnode,
                            xbestsplit + idxByNnode,
                            bestvar + idxByNnode,
                            nodeclass + idxByNnode, ndbigtree[jb],
                            cat, nclass, jvr, nodex, *maxcat);
                    /* Count how often correct predictions are made with
                     * the modified data. */
                    for (n = 0; n < nsample; n++) {
                        if (jin[n] == 0) {
                            if (jvr[n] == cl[n]) {
                                nrightimp[cl[n] - 1]++;
                                nrightimpall++;
                            }
                            if (localImp && jvr[n] != jtr[n]) {
                                if (cl[n] == jvr[n]) {
                                    impmat[m + n*mdim] -= 1.0;
                                } else {
                                    impmat[m + n*mdim] += 1.0;
                                }
                            }
                        }
                        /* Restore the original data for that variable. */
                        x[m + n*mdim] = tx[n];
                    }
                    /* Accumulate decrease in proportions of correct
                     * predictions. */
                    for (n = 0; n < nclass; ++n) {
                        if (nout[n] > 0) {
                            imprt[m + n*mdim] +=
                                    ((double) (nright[n] - nrightimp[n])) /
                                    nout[n];
                            impsd[m + n*mdim] +=
                                    ((double) (nright[n] - nrightimp[n]) *
                                    (nright[n] - nrightimp[n])) / nout[n];
                        }
                    }
                    if (noutall > 0) {
                        imprt[m + nclass*mdim] +=
                                ((double)(nrightall - nrightimpall)) / noutall;
                        impsd[m + nclass*mdim] +=
                                ((double) (nrightall - nrightimpall) *
                                (nrightall - nrightimpall)) / noutall;
                    }
                }
            }
        }
        
        /*  DO PROXIMITIES */
        if (iprox) {
            computeProximity(prox, oobprox, nodex, jin, oobpair, near);
            /* proximity for test data */
            if (*testdat) {
                computeProximity(proxts, 0, nodexts, jin, oobpair, ntest);
                /* Compute proximity between testset and training set. */
                for (n = 0; n < ntest; ++n) {
                    for (k = 0; k < near; ++k) {
                        if (nodexts[n] == nodex[k])
                            proxts[n + ntest * (k+ntest)] += 1.0;
                    }
                }
            }
        }
        
        if (keepf) idxByNnode += *nrnodes;
        if (keepInbag) idxByNsample += nsample0;
    }
    PutRNGstate();
   
    
    /*  Final processing of variable importance. */
    for (m = 0; m < mdim; m++) tgini[m] /= Ntree;
      
    if (imp) {
        for (m = 0; m < mdim; ++m) {
            if (localImp) { /* casewise measures */
                for (n = 0; n < nsample; ++n) impmat[m + n*mdim] /= out[n];
            }
            /* class-specific measures */
            for (k = 0; k < nclass; ++k) {
                av = imprt[m + k*mdim] / Ntree;
                impsd[m + k*mdim] =
                        sqrt(((impsd[m + k*mdim] / Ntree) - av*av) / Ntree);
                imprt[m + k*mdim] = av;
                /* imprt[m + k*mdim] = (se <= 0.0) ? -1000.0 - av : av / se; */
            }
            /* overall measures */
            av = imprt[m + nclass*mdim] / Ntree;
            impsd[m + nclass*mdim] =
                    sqrt(((impsd[m + nclass*mdim] / Ntree) - av*av) / Ntree);
            imprt[m + nclass*mdim] = av;
            imprt[m + (nclass+1)*mdim] = tgini[m];
        }
    } else {
        for (m = 0; m < mdim; ++m) imprt[m] = tgini[m];
    }
   
    /*  PROXIMITY DATA ++++++++++++++++++++++++++++++++*/
    if (iprox) {
        for (n = 0; n < near; ++n) {
            for (k = n + 1; k < near; ++k) {
                prox[near*k + n] /= oobprox ?
                    (oobpair[near*k + n] > 0 ? oobpair[near*k + n] : 1) :
                        Ntree;
                        prox[near*n + k] = prox[near*k + n];
            }
            prox[near*n + n] = 1.0;
        }
        if (*testdat) {
            for (n = 0; n < ntest; ++n)
                for (k = 0; k < ntest + nsample; ++k)
                    proxts[ntest*k + n] /= Ntree;
        }
    }
    if (trace <= Ntree){
        printf("\nmdim %d, nclass %d, nrnodes %d, nsample %d, ntest %d\n", mdim, nclass, *nrnodes, nsample, ntest);
        printf("\noobprox %d, mdim %d, nsample0 %d, Ntree %d, mtry %d, mimp %d", oobprox, mdim, nsample0, Ntree, mtry, mimp);
        printf("\nstratify %d, replace %d",stratify,replace);
        printf("\n");
    }
    
    //frees up the memory
    free(tgini);free(wl);free(wr);free(classpop);free(tclasscat);
    free(tclasspop);free(tx);free(win);free(tp);free(out);
    free(bestsplitnext);free(bestsplit);free(nodepop);free(nodestart);free(jin);
    free(nodex);free(nodexts);free(ta);free(ncase);free(jerr);
    free(varUsed);free(jtr);free(jvr);free(classFreq);free(jts);
    free(idmove);free(at);free(a);free(b);free(mind);
    free(nright);free(nrightimp);free(nout);
    
    if (oobprox) {
        free(oobpair);
    }
    
    if (stratify) {
        free(strata_size);
        for (n = 0; n < nstrata; ++n) {
            free(strata_idx[n]);
        }
        free(strata_idx);        
    } else {
        if (replace)
            free(nind);
    }
    //printf("labelts %d\n",labelts);fflush(stdout);
    
    if (labelts) {
        free(nclts);        
    }
    //printf("stratify %d",stratify);fflush(stdout);
}
Пример #6
0
void regRF(double *x, double *y, int *xdim, int *sampsize,
	   int *nthsize, int *nrnodes, int *nTree, int *mtry, int *imp,
	   int *cat, int *maxcat, int *jprint, int *doProx, int *oobprox,
           int *biasCorr, double *yptr, double *errimp, double *impmat,
           double *impSD, double *prox, int *treeSize, int *nodestatus,
           int *lDaughter, int *rDaughter, double *avnode, int *mbest,
           double *upper, double *mse, int *keepf, int *replace,
           int *testdat, double *xts, int *nts, double *yts, int *labelts,
           double *yTestPred, double *proxts, double *msets, double *coef,
           int *nout, int *inbag) {
    /*************************************************************************
   Input:
   mdim=number of variables in data set
   nsample=number of cases

   nthsize=number of cases in a node below which the tree will not split,
   setting nthsize=5 generally gives good results.

   nTree=number of trees in run.  200-500 gives pretty good results

   mtry=number of variables to pick to split on at each node.  mdim/3
   seems to give genrally good performance, but it can be
   altered up or down

   imp=1 turns on variable importance.  This is computed for the
   mth variable as the percent rise in the test set mean sum-of-
   squared errors when the mth variable is randomly permuted.

  *************************************************************************/

    double errts = 0.0, averrb, meanY, meanYts, varY, varYts, r, xrand,
	errb = 0.0, resid=0.0, ooberr, ooberrperm, delta, *resOOB;

    double *yb, *xtmp, *xb, *ytr, *ytree, *tgini, *coeffs;

    int k, m, mr, n, nOOB, j, jout, idx, ntest, last, ktmp, nPerm,
        nsample, mdim, keepF, keepInbag;
    int *oobpair, varImp, localImp, *varUsed;

    int *in, *nind, *nodex, *nodexts, *probs;

    nsample = xdim[0];
    mdim = xdim[1];
    ntest = *nts;
    varImp = imp[0];
    localImp = imp[1];
    nPerm = imp[2];
    keepF = keepf[0];
    keepInbag = keepf[1];

    if (*jprint == 0) *jprint = *nTree + 1;

    yb         = (double *) S_alloc(*sampsize, sizeof(double));
    xb         = (double *) S_alloc(mdim * *sampsize, sizeof(double));
    ytr        = (double *) S_alloc(nsample, sizeof(double));
    xtmp       = (double *) S_alloc(nsample, sizeof(double));
    resOOB     = (double *) S_alloc(nsample, sizeof(double));
    coeffs     = (double *) S_alloc(*sampsize, sizeof(double));
  
    probs      = (int *) S_alloc(*sampsize, sizeof(int));
    in         = (int *) S_alloc(nsample, sizeof(int));
    nodex      = (int *) S_alloc(nsample, sizeof(int));
    varUsed    = (int *) S_alloc(mdim, sizeof(int));
    nind = *replace ? NULL : (int *) S_alloc(nsample, sizeof(int));

    if (*testdat) {
	ytree      = (double *) S_alloc(ntest, sizeof(double));
	nodexts    = (int *) S_alloc(ntest, sizeof(int));
    }
    oobpair = (*doProx && *oobprox) ?
	(int *) S_alloc(nsample * nsample, sizeof(int)) : NULL;

    /* If variable importance is requested, tgini points to the second
       "column" of errimp, otherwise it's just the same as errimp. */
    tgini = varImp ? errimp + mdim : errimp;

    averrb = 0.0;
    meanY = 0.0;
    varY = 0.0;

    zeroDouble(yptr, nsample);
    zeroInt(nout, nsample);
    for (n = 0; n < nsample; ++n) {
	varY += n * (y[n] - meanY)*(y[n] - meanY) / (n + 1);
	meanY = (n * meanY + y[n]) / (n + 1);
    }
    varY /= nsample;

    varYts = 0.0;
    meanYts = 0.0;
    if (*testdat) {
	for (n = 0; n < ntest; ++n) {
	    varYts += n * (yts[n] - meanYts)*(yts[n] - meanYts) / (n + 1);
	    meanYts = (n * meanYts + yts[n]) / (n + 1);
	}
	varYts /= ntest;
    }

    if (*doProx) {
        zeroDouble(prox, nsample * nsample);
	if (*testdat) zeroDouble(proxts, ntest * (nsample + ntest));
    }

    if (varImp) {
        zeroDouble(errimp, mdim * 2);
	if (localImp) zeroDouble(impmat, nsample * mdim);
    } else {
        zeroDouble(errimp, mdim);
    }
    if (*labelts) zeroDouble(yTestPred, ntest);

    /* print header for running output */
    if (*jprint <= *nTree) {
	Rprintf("     |      Out-of-bag   ");
	if (*testdat) Rprintf("|       Test set    ");
	Rprintf("|\n");
	Rprintf("Tree |      MSE  %%Var(y) ");
	if (*testdat) Rprintf("|      MSE  %%Var(y) ");
	Rprintf("|\n");
    }
    GetRNGstate();
    /*************************************
     * Start the loop over trees.
     *************************************/
    for (j = 0; j < *nTree; ++j) {

    /* multinomial */
    /*unsigned int coeffs[*sampsize];*/
    /* for loop implementation */
    /*double probs[*sampsize];*/
    for (k = 0; k < *sampsize; ++k) {
        probs[k] = 1/(*sampsize);
    }

    ran_multinomial(*sampsize,100,probs,coeffs);

		idx = keepF ? j * *nrnodes : 0;
		zeroInt(in, nsample);
        zeroInt(varUsed, mdim);
        /* Draw a random sample for growing a tree. */
		if (*replace) { /* sampling with replacement */
			for (n = 0; n < *sampsize; ++n) {
				xrand = unif_rand();
				k = xrand * nsample;
				in[k] = 1;
				yb[n] = y[k];
				for(m = 0; m < mdim; ++m) {
					xb[m + n * mdim] = x[m + k * mdim];
				}
			}
		} else { /* sampling w/o replacement */
			for (n = 0; n < nsample; ++n) nind[n] = n;
			last = nsample - 1;
			for (n = 0; n < *sampsize; ++n) {
				ktmp = (int) (unif_rand() * (last+1));
                k = nind[ktmp];
                swapInt(nind[ktmp], nind[last]);
				last--;
				in[k] = 1;
				yb[n] = y[k];
				for(m = 0; m < mdim; ++m) {
					xb[m + n * mdim] = x[m + k * mdim];
				}
			}
		}
		if (keepInbag) {
			for (n = 0; n < nsample; ++n) inbag[n + j * nsample] = in[n];
		}
        /* grow the regression tree */
		regTree(xb, yb, mdim, *sampsize, lDaughter + idx, rDaughter + idx,
                upper + idx, avnode + idx, nodestatus + idx, *nrnodes,
                treeSize + j, *nthsize, *mtry, mbest + idx, cat, tgini,
                varUsed, coeffs);
        /* predict the OOB data with the current tree */
		/* ytr is the prediction on OOB data by the current tree */
		predictRegTree(x, nsample, mdim, lDaughter + idx,
                       rDaughter + idx, nodestatus + idx, ytr, upper + idx,
                       avnode + idx, mbest + idx, treeSize[j], cat, *maxcat,
                       nodex);
		/* yptr is the aggregated prediction by all trees grown so far */
		errb = 0.0;
		ooberr = 0.0;
		jout = 0; /* jout is the number of cases that has been OOB so far */
		nOOB = 0; /* nOOB is the number of OOB samples for this tree */
		for (n = 0; n < nsample; ++n) {
			if (in[n] == 0) {
				nout[n]++;
                nOOB++;
				yptr[n] = ((nout[n]-1) * yptr[n] + ytr[n]) / nout[n];
				resOOB[n] = ytr[n] - y[n];
                ooberr += resOOB[n] * resOOB[n];
			}
            if (nout[n]) {
				jout++;
				errb += (y[n] - yptr[n]) * (y[n] - yptr[n]);
			}
		}
		errb /= jout;
		/* Do simple linear regression of y on yhat for bias correction. */
		if (*biasCorr) simpleLinReg(nsample, yptr, y, coef, &errb, nout);

		/* predict testset data with the current tree */
		if (*testdat) {
			predictRegTree(xts, ntest, mdim, lDaughter + idx,
						   rDaughter + idx, nodestatus + idx, ytree,
                           upper + idx, avnode + idx,
						   mbest + idx, treeSize[j], cat, *maxcat, nodexts);
			/* ytree is the prediction for test data by the current tree */
			/* yTestPred is the average prediction by all trees grown so far */
			errts = 0.0;
			for (n = 0; n < ntest; ++n) {
				yTestPred[n] = (j * yTestPred[n] + ytree[n]) / (j + 1);
			}
            /* compute testset MSE */
			if (*labelts) {
				for (n = 0; n < ntest; ++n) {
					resid = *biasCorr ?
                        yts[n] - (coef[0] + coef[1]*yTestPred[n]) :
                        yts[n] - yTestPred[n];
					errts += resid * resid;
				}
				errts /= ntest;
			}
		}
        /* Print running output. */
		if ((j + 1) % *jprint == 0) {
			Rprintf("%4d |", j + 1);
			Rprintf(" %8.4g %8.2f ", errb, 100 * errb / varY);
			if(*labelts == 1) Rprintf("| %8.4g %8.2f ",
									  errts, 100.0 * errts / varYts);
			Rprintf("|\n");
		}
		mse[j] = errb;
		if (*labelts) msets[j] = errts;

		/*  DO PROXIMITIES */
		if (*doProx) {
			computeProximity(prox, *oobprox, nodex, in, oobpair, nsample);
			/* proximity for test data */
			if (*testdat) {
                /* In the next call, in and oobpair are not used. */
                computeProximity(proxts, 0, nodexts, in, oobpair, ntest);
				for (n = 0; n < ntest; ++n) {
					for (k = 0; k < nsample; ++k) {
						if (nodexts[n] == nodex[k]) {
							proxts[n + ntest * (k+ntest)] += 1.0;
						}
					}
				}
			}
		}

		/* Variable importance */
		if (varImp) {
			for (mr = 0; mr < mdim; ++mr) {
                if (varUsed[mr]) { /* Go ahead if the variable is used */
                    /* make a copy of the m-th variable into xtmp */
                    for (n = 0; n < nsample; ++n)
                        xtmp[n] = x[mr + n * mdim];
                    ooberrperm = 0.0;
                    for (k = 0; k < nPerm; ++k) {
                        permuteOOB(mr, x, in, nsample, mdim);
                        predictRegTree(x, nsample, mdim, lDaughter + idx,
                                       rDaughter + idx, nodestatus + idx, ytr,
                                       upper + idx, avnode + idx, mbest + idx,
                                       treeSize[j], cat, *maxcat, nodex);
                        for (n = 0; n < nsample; ++n) {
                            if (in[n] == 0) {
                                r = ytr[n] - y[n];
                                ooberrperm += r * r;
                                if (localImp) {
                                    impmat[mr + n * mdim] +=
                                        (r*r - resOOB[n]*resOOB[n]) / nPerm;
                                }
                            }
                        }
                    }
                    delta = (ooberrperm / nPerm - ooberr) / nOOB;
                    errimp[mr] += delta;
                    impSD[mr] += delta * delta;
                    /* copy original data back */
                    for (n = 0; n < nsample; ++n)
                        x[mr + n * mdim] = xtmp[n];
                }
            }
        }
    }
    PutRNGstate();
    /* end of tree iterations=======================================*/

    if (*biasCorr) {  /* bias correction for predicted values */
		for (n = 0; n < nsample; ++n) {
			if (nout[n]) yptr[n] = coef[0] + coef[1] * yptr[n];
		}
		if (*testdat) {
			for (n = 0; n < ntest; ++n) {
				yTestPred[n] = coef[0] + coef[1] * yTestPred[n];
			}
		}
    }

    if (*doProx) {
		for (n = 0; n < nsample; ++n) {
			for (k = n + 1; k < nsample; ++k) {
                prox[nsample*k + n] /= *oobprox ?
                    (oobpair[nsample*k + n] > 0 ? oobpair[nsample*k + n] : 1) :
                    *nTree;
                prox[nsample * n + k] = prox[nsample * k + n];
            }
			prox[nsample * n + n] = 1.0;
        }
		if (*testdat) {
			for (n = 0; n < ntest; ++n)
				for (k = 0; k < ntest + nsample; ++k)
					proxts[ntest*k + n] /= *nTree;
		}
    }

    if (varImp) {
		for (m = 0; m < mdim; ++m) {
			errimp[m] = errimp[m] / *nTree;
			impSD[m] = sqrt( ((impSD[m] / *nTree) -
							  (errimp[m] * errimp[m])) / *nTree );
			if (localImp) {
                for (n = 0; n < nsample; ++n) {
                    impmat[m + n * mdim] /= nout[n];
                }
			}
        }
    }
    for (m = 0; m < mdim; ++m) tgini[m] /= *nTree;
}
Пример #7
0
void findBestSplit(double *x, int *jdex, double *y, int mdim, int nsample,
        int ndstart, int ndend, int *msplit, double *decsplit,
        double *ubest, int *ndendl, int *jstat, int mtry,
        double sumnode, int nodecnt, int *cat) {
    int last, ncat[32], icat[32], lc, nl, nr, npopl, npopr;
    int i, j, kv, l;
    static int *mind, *ncase;
    static double *xt, *ut, *v, *yl;
    double sumcat[32], avcat[32], tavcat[32], ubestt;
    double crit, critmax, critvar, suml, sumr, d, critParent;
    
    
    if (in_findBestSplit==-99){
      free(ncase);
      free(mind); //had to remove this so that it wont crash for when mdim=0, strangely happened for replace=0
      free(v);
      free(yl);
      free(xt);
      free(ut);
     //	PRINTF("giving up mem in findBestSplit\n");
      return;
    }			
    
    if (in_findBestSplit==0){
    	in_findBestSplit=1;
		ut = (double *) calloc(nsample, sizeof(double));
		xt = (double *) calloc(nsample, sizeof(double));
		v  = (double *) calloc(nsample, sizeof(double));
		yl = (double *) calloc(nsample, sizeof(double));
		mind  = (int *) calloc(mdim+1, sizeof(int));   //seems that the sometimes i am asking for kv[10] and that causes problesmms
													   //so allocate 1 more. helps with not crashing in windows
		ncase = (int *) calloc(nsample, sizeof(int));
    }
    zeroDouble(ut, nsample);
    zeroDouble(xt, nsample);
    zeroDouble(v, nsample);
    zeroDouble(yl, nsample);
    zeroInt(mind, mdim);
    zeroInt(ncase, nsample);
    
    zeroDouble(avcat, 32);
    zeroDouble(tavcat, 32);
    
    /* START BIG LOOP */
    *msplit = -1;
    *decsplit = 0.0;
    critmax = 0.0;
    ubestt = 0.0;
    for (i=0; i < mdim; ++i) mind[i] = i;
    
    last = mdim - 1;
    for (i = 0; i < mtry; ++i) {
        critvar = 0.0;
        j = (int) (unif_rand() * (last+1));
        //PRINTF("j=%d, last=%d mind[j]=%d\n", j, last, mind[j]);fflush(stdout);
        kv = mind[j];
		//if(kv>100){
		//	1;
		//	getchar();
		//}
        swapInt(mind[j], mind[last]);
        /* mind[j] = mind[last];
         * mind[last] = kv; */
        last--;
        
        lc = cat[kv];
        if (lc == 1) {
            /* numeric variable */
            for (j = ndstart; j <= ndend; ++j) {
                xt[j] = x[kv + (jdex[j] - 1) * mdim];
                yl[j] = y[jdex[j] - 1];
            }
        } else {
            /* categorical variable */
            zeroInt(ncat, 32);
            zeroDouble(sumcat, 32);
            for (j = ndstart; j <= ndend; ++j) {
                l = (int) x[kv + (jdex[j] - 1) * mdim];
                sumcat[l - 1] += y[jdex[j] - 1];
                ncat[l - 1] ++;
            }
            /* Compute means of Y by category. */
            for (j = 0; j < lc; ++j) {
                avcat[j] = ncat[j] ? sumcat[j] / ncat[j] : 0.0;
            }
            /* Make the category mean the `pseudo' X data. */
            for (j = 0; j < nsample; ++j) {
                xt[j] = avcat[(int) x[kv + (jdex[j] - 1) * mdim] - 1];
                yl[j] = y[jdex[j] - 1];
            }
        }
        /* copy the x data in this node. */
        for (j = ndstart; j <= ndend; ++j) v[j] = xt[j];
        for (j = 1; j <= nsample; ++j) ncase[j - 1] = j;
        R_qsort_I(v, ncase, ndstart + 1, ndend + 1);
        if (v[ndstart] >= v[ndend]) continue;
        /* ncase(n)=case number of v nth from bottom */
        /* Start from the right and search to the left. */
        critParent = sumnode * sumnode / nodecnt;
        suml = 0.0;
        sumr = sumnode;
        npopl = 0;
        npopr = nodecnt;
        crit = 0.0;
        /* Search through the "gaps" in the x-variable. */
        for (j = ndstart; j <= ndend - 1; ++j) {
            d = yl[ncase[j] - 1];
            suml += d;
            sumr -= d;
            npopl++;
            npopr--;
            if (v[j] < v[j+1]) {
                crit = (suml * suml / npopl) + (sumr * sumr / npopr) -
                        critParent;
                if (crit > critvar) {
                    ubestt = (v[j] + v[j+1]) / 2.0;
                    critvar = crit;
                }
            }
        }
        if (critvar > critmax) {
            *ubest = ubestt;
            *msplit = kv + 1;
            critmax = critvar;
            for (j = ndstart; j <= ndend; ++j) {
                ut[j] = xt[j];
            }
            if (cat[kv] > 1) {
                for (j = 0; j < cat[kv]; ++j) tavcat[j] = avcat[j];
            }
        }
    }
    *decsplit = critmax;
    
    /* If best split can not be found, set to terminal node and return. */
    if (*msplit != -1) {
        nl = ndstart;
        for (j = ndstart; j <= ndend; ++j) {
            if (ut[j] <= *ubest) {
                nl++;
                ncase[nl-1] = jdex[j];
            }
        }
        *ndendl = imax2(nl - 1, ndstart);
        nr = *ndendl + 1;
        for (j = ndstart; j <= ndend; ++j) {
            if (ut[j] > *ubest) {
                if (nr >= nsample) break;
                nr++;
                ncase[nr - 1] = jdex[j];
            }
        }
        if (*ndendl >= ndend) *ndendl = ndend - 1;
        for (j = ndstart; j <= ndend; ++j) jdex[j] = ncase[j];
        
        lc = cat[*msplit - 1];
        if (lc > 1) {
            for (j = 0; j < lc; ++j) {
                icat[j] = (tavcat[j] < *ubest) ? 1 : 0;
            }
            *ubest = pack(lc, icat);
        }
    } else *jstat = 1;
    
}
Пример #8
0
void regTree(double *x, double *y, int mdim, int nsample, int *lDaughter,
        int *rDaughter,
        double *upper, double *avnode, SMALL_INT *nodestatus, int nrnodes,
        int *treeSize, int nthsize, int mtry, int *mbest, int *cat,
        double *tgini, int *varUsed) {
    int i, j, k, m, ncur;
    static int *jdex, *nodestart, *nodepop;
    int ndstart, ndend, ndendl, nodecnt, jstat, msplit;
    double d, ss, av, decsplit, ubest, sumnode;
    
    if (in_regTree==-99){
        free(nodestart);
        free(jdex);
        free(nodepop);
//	PRINTF("giving up mem in in_regTree\n");
        return;
    }
    
    if (in_regTree==0){
        in_regTree=1;
        nodestart = (int *) calloc(nrnodes, sizeof(int));
        nodepop   = (int *) calloc(nrnodes, sizeof(int));
        jdex = (int *) calloc(nsample, sizeof(int));
    }
    
    /* initialize some arrays for the tree */
    zeroSMALLInt(nodestatus, nrnodes);
    zeroInt(nodestart, nrnodes);
    zeroInt(nodepop, nrnodes);
    zeroDouble(avnode, nrnodes);
    
    for (i = 1; i <= nsample; ++i) jdex[i-1] = i;
    
    ncur = 0;
    nodestart[0] = 0;
    nodepop[0] = nsample;
    nodestatus[0] = NODE_TOSPLIT;
    
    /* compute mean and sum of squares for Y */
    av = 0.0;
    ss = 0.0;
    for (i = 0; i < nsample; ++i) {
        d = y[jdex[i] - 1];
        ss += i * (av - d) * (av - d) / (i + 1);
        av = (i * av + d) / (i + 1);
    }
    avnode[0] = av;
    
    /* start main loop */
    for (k = 0; k < nrnodes - 2; ++k) {
        if (k > ncur || ncur >= nrnodes - 2) break;
        /* skip if the node is not to be split */
        if (nodestatus[k] != NODE_TOSPLIT) continue;
        
        /* initialize for next call to findbestsplit */
        ndstart = nodestart[k];
        ndend = ndstart + nodepop[k] - 1;
        nodecnt = nodepop[k];
        sumnode = nodecnt * avnode[k];
        jstat = 0;
        decsplit = 0.0;
        
        findBestSplit(x, jdex, y, mdim, nsample, ndstart, ndend, &msplit,
                &decsplit, &ubest, &ndendl, &jstat, mtry, sumnode,
                nodecnt, cat);
        if (jstat == 1) {
            /* Node is terminal: Mark it as such and move on to the next. */
            nodestatus[k] = NODE_TERMINAL;
            continue;
        }
        /* Found the best split. */
        mbest[k] = msplit;
        varUsed[msplit - 1] = 1;
        upper[k] = ubest;
        tgini[msplit - 1] += decsplit;
        nodestatus[k] = NODE_INTERIOR;
        
        /* leftnode no.= ncur+1, rightnode no. = ncur+2. */
        nodepop[ncur + 1] = ndendl - ndstart + 1;
        nodepop[ncur + 2] = ndend - ndendl;
        nodestart[ncur + 1] = ndstart;
        nodestart[ncur + 2] = ndendl + 1;
        
        /* compute mean and sum of squares for the left daughter node */
        av = 0.0;
        ss = 0.0;
        for (j = ndstart; j <= ndendl; ++j) {
            d = y[jdex[j]-1];
            m = j - ndstart;
            ss += m * (av - d) * (av - d) / (m + 1);
            av = (m * av + d) / (m+1);
        }
        avnode[ncur+1] = av;
        nodestatus[ncur+1] = NODE_TOSPLIT;
        if (nodepop[ncur + 1] <= nthsize) {
            nodestatus[ncur + 1] = NODE_TERMINAL;
        }
        
        /* compute mean and sum of squares for the right daughter node */
        av = 0.0;
        ss = 0.0;
        for (j = ndendl + 1; j <= ndend; ++j) {
            d = y[jdex[j]-1];
            m = j - (ndendl + 1);
            ss += m * (av - d) * (av - d) / (m + 1);
            av = (m * av + d) / (m + 1);
        }
        avnode[ncur + 2] = av;
        nodestatus[ncur + 2] = NODE_TOSPLIT;
        if (nodepop[ncur + 2] <= nthsize) {
            nodestatus[ncur + 2] = NODE_TERMINAL;
        }
        
        /* map the daughter nodes */
        lDaughter[k] = ncur + 1 + 1;
        rDaughter[k] = ncur + 2 + 1;
        /* Augment the tree by two nodes. */
        ncur += 2;
    }
    *treeSize = nrnodes;
    for (k = nrnodes - 1; k >= 0; --k) {
        if (nodestatus[k] == 0) (*treeSize)--;
        if (nodestatus[k] == NODE_TOSPLIT) {
            nodestatus[k] = NODE_TERMINAL;
        }
    }
    
}
Пример #9
0
void regRF(double *x, double *y, int *xdim, int *sampsize,
        int *nthsize, int *nrnodes, int *nTree, int *mtry, int *imp,
        int *cat, int maxcat, int *jprint, int doProx, int oobprox,
        int biasCorr, double *yptr, double *errimp, double *impmat,
        double *impSD, double *prox, int *treeSize, SMALL_INT *nodestatus,
        int *lDaughter, int *rDaughter, double *avnode, int *mbest,
        double *upper, double *mse, const int *keepf, int *replace,
        int testdat, double *xts, int *nts, double *yts, int labelts,
        double *yTestPred, double *proxts, double *msets, double *coef,
        int *nout, int *inbag) {
    /*************************************************************************
     * Input:
     * mdim=number of variables in data set
     * nsample=number of cases
     *
     * nthsize=number of cases in a node below which the tree will not split,
     * setting nthsize=5 generally gives good results.
     *
     * nTree=number of trees in run.  200-500 gives pretty good results
     *
     * mtry=number of variables to pick to split on at each node.  mdim/3
     * seems to give genrally good performance, but it can be
     * altered up or down
     *
     * imp=1 turns on variable importance.  This is computed for the
     * mth variable as the percent rise in the test set mean sum-of-
     * squared errors when the mth variable is randomly permuted.
     *
     *************************************************************************/
    
    //PRINTF( "*jprint: %d\n", *jprint );
    //mexEvalString( "pause(0.0001)" );
    
    double errts = 0.0, averrb, meanY, meanYts, varY, varYts, r, xrand,
            errb = 0.0, resid=0.0, ooberr, ooberrperm, delta, *resOOB;
    
    double *yb, *xtmp, *xb, *ytr, *ytree = NULL, *tgini;
    
    int k, m, mr, n, nOOB, j, jout, idx, ntest, last, ktmp, nPerm,
            nsample, mdim, keepF, keepInbag;
    int *oobpair, varImp, localImp, *varUsed;
    
    int *in, *nind, *nodex, *nodexts = NULL;
    
    //Abhi:temp variable
    double tmp_d = 0;
    int tmp_i;
    SMALL_INT tmp_c;
    
    //Do initialization for COKUS's Random generator
    seedMT(2*rand()+1);  //works well with odd number so why don't use that
    
    nsample = xdim[0];
    mdim = xdim[1];
    ntest = *nts;
    varImp = imp[0];
    localImp = imp[1];
    nPerm = imp[2]; //PRINTF("nPerm %d\n",nPerm);
    keepF = keepf[0];
    keepInbag = keepf[1];
    
    if (*jprint == 0) *jprint = *nTree + 1;
    
    yb         = (double *) calloc(*sampsize, sizeof(double));
    xb         = (double *) calloc(mdim * *sampsize, sizeof(double));
    ytr        = (double *) calloc(nsample, sizeof(double));
    xtmp       = (double *) calloc(nsample, sizeof(double));
    resOOB     = (double *) calloc(nsample, sizeof(double));
    
    in        = (int *) calloc(nsample, sizeof(int));
    nodex      = (int *) calloc(nsample, sizeof(int));
    varUsed    = (int *) calloc(mdim, sizeof(int));
    nind = *replace ? NULL : (int *) calloc(nsample, sizeof(int));
    
    if (testdat) {
        ytree      = (double *) calloc(ntest, sizeof(double));
        nodexts    = (int *) calloc(ntest, sizeof(int));
    }
    oobpair = (doProx && oobprox) ?
        (int *) calloc(nsample * nsample, sizeof(int)) : NULL;
        
        /* If variable importance is requested, tgini points to the second
       "column" of errimp, otherwise it's just the same as errimp. */
        tgini = varImp ? errimp + mdim : errimp;
        
        averrb = 0.0;
        meanY = 0.0;
        varY = 0.0;
        
        zeroDouble(yptr, nsample);
        zeroInt(nout, nsample);
        for (n = 0; n < nsample; ++n) {
            varY += n * (y[n] - meanY)*(y[n] - meanY) / (n + 1);
            meanY = (n * meanY + y[n]) / (n + 1);
        }
        varY /= nsample;
        
        varYts = 0.0;
        meanYts = 0.0;
        if (testdat) {
            for (n = 0; n < ntest; ++n) {
                varYts += n * (yts[n] - meanYts)*(yts[n] - meanYts) / (n + 1);
                meanYts = (n * meanYts + yts[n]) / (n + 1);
            }
            varYts /= ntest;
        }
        
        if (doProx) {
            zeroDouble(prox, nsample * nsample);
            if (testdat) zeroDouble(proxts, ntest * (nsample + ntest));
        }
        
        if (varImp) {
            zeroDouble(errimp, mdim * 2);
            if (localImp) zeroDouble(impmat, nsample * mdim);
        } else {
            zeroDouble(errimp, mdim);
        }
        if (labelts) zeroDouble(yTestPred, ntest);
        
        /* print header for running output */
        if (*jprint <= *nTree) {
            PRINTF("     |      Out-of-bag   ");
            if (testdat) PRINTF("|       Test set    ");
            PRINTF("|\n");
            PRINTF("Tree |      MSE  %%Var(y) ");
            if (testdat) PRINTF("|      MSE  %%Var(y) ");
            PRINTF("|\n");
            // mexEvalString( "pause(0.001)" );
        }
        GetRNGstate();
        /*************************************
         * Start the loop over trees.
         *************************************/
        for (j = 0; j < *nTree; ++j) {
            //PRINTF("tree num %d\n",j);fflush(stdout);
            //PRINTF("1. maxcat %d, jprint %d, doProx %d, oobProx %d, biasCorr %d\n", *maxcat, *jprint, doProx, oobprox, biasCorr);
            
            idx = keepF ? j * *nrnodes : 0;
            zeroInt(in, nsample);
            zeroInt(varUsed, mdim);
            /* Draw a random sample for growing a tree. */
//		PRINTF("1.8. maxcat %d, jprint %d, doProx %d, oobProx %d, biasCorr %d testdat %d\n", maxcat, *jprint, doProx, oobprox, biasCorr,testdat);
            
            if (*replace) { /* sampling with replacement */
                for (n = 0; n < *sampsize; ++n) {
                    xrand = unif_rand();
                    k = (int)(xrand * nsample);
                    in[k] = 1;
                    yb[n] = y[k];
                    for(m = 0; m < mdim; ++m) {
                        xb[m + n * mdim] = x[m + k * mdim];
                    }
                }
            } else { /* sampling w/o replacement */
                for (n = 0; n < nsample; ++n) nind[n] = n;
                last = nsample - 1;
                for (n = 0; n < *sampsize; ++n) {
                    ktmp = (int) (unif_rand() * (last+1));
                    k = nind[ktmp];
                    swapInt(nind[ktmp], nind[last]);
                    last--;
                    in[k] = 1;
                    yb[n] = y[k];
                    for(m = 0; m < mdim; ++m) {
                        xb[m + n * mdim] = x[m + k * mdim];
                    }
                }
            }
            if (keepInbag) {
                for (n = 0; n < nsample; ++n) inbag[n + j * nsample] = in[n];
            }
//		PRINTF("1.9. maxcat %d, jprint %d, doProx %d, oobProx %d, biasCorr %d testdat %d\n", maxcat, *jprint, doProx, oobprox, biasCorr,testdat);
            
            /* grow the regression tree */
            regTree(xb, yb, mdim, *sampsize, lDaughter + idx, rDaughter + idx,
                    upper + idx, avnode + idx, nodestatus + idx, *nrnodes,
                    treeSize + j, *nthsize, *mtry, mbest + idx, cat, tgini,
                    varUsed);
            /* predict the OOB data with the current tree */
            /* ytr is the prediction on OOB data by the current tree */
            
//		PRINTF("2. maxcat %d, jprint %d, doProx %d, oobProx %d, biasCorr %d testdat %d\n", maxcat, *jprint, doProx, oobprox, biasCorr,testdat);
            
            predictRegTree(x, nsample, mdim, lDaughter + idx,
                    rDaughter + idx, nodestatus + idx, ytr, upper + idx,
                    avnode + idx, mbest + idx, treeSize[j], cat, maxcat,
                    nodex);
            /* yptr is the aggregated prediction by all trees grown so far */
            errb = 0.0;
            ooberr = 0.0;
            jout = 0; /* jout is the number of cases that has been OOB so far */
            nOOB = 0; /* nOOB is the number of OOB samples for this tree */
            for (n = 0; n < nsample; ++n) {
                if (in[n] == 0) {
                    nout[n]++;
                    nOOB++;
                    yptr[n] = ((nout[n]-1) * yptr[n] + ytr[n]) / nout[n];
                    resOOB[n] = ytr[n] - y[n];
                    ooberr += resOOB[n] * resOOB[n];
                }
                if (nout[n]) {
                    jout++;
                    errb += (y[n] - yptr[n]) * (y[n] - yptr[n]);
                }
            }
            errb /= jout;
            /* Do simple linear regression of y on yhat for bias correction. */
            if (biasCorr) simpleLinReg(nsample, yptr, y, coef, &errb, nout);
//PRINTF("2.5.maxcat %d, jprint %d, doProx %d, oobProx %d, biasCorr %d\n", maxcat, *jprint, doProx, oobprox, biasCorr);
            
            /* predict testset data with the current tree */
            if (testdat) {
                predictRegTree(xts, ntest, mdim, lDaughter + idx,
                        rDaughter + idx, nodestatus + idx, ytree,
                        upper + idx, avnode + idx,
                        mbest + idx, treeSize[j], cat, maxcat, nodexts);
                /* ytree is the prediction for test data by the current tree */
                /* yTestPred is the average prediction by all trees grown so far */
                errts = 0.0;
                for (n = 0; n < ntest; ++n) {
                    yTestPred[n] = (j * yTestPred[n] + ytree[n]) / (j + 1);
                }
                /* compute testset MSE */
                if (labelts) {
                    for (n = 0; n < ntest; ++n) {
                        resid = biasCorr ?
                            yts[n] - (coef[0] + coef[1]*yTestPred[n]) :
                            yts[n] - yTestPred[n];
                            errts += resid * resid;
                    }
                    errts /= ntest;
                }
            }
//PRINTF("2.6.maxcat %d, jprint %d, doProx %d, oobProx %d, biasCorr %d, testdat %d\n", maxcat, *jprint, doProx, oobprox, biasCorr,testdat);
            
            /* Print running output. */
            if ((j + 1) % *jprint == 0) {
                PRINTF("%4d |", j + 1);
                PRINTF(" %8.4g %8.2f ", errb, 100 * errb / varY);
                if(labelts == 1) PRINTF("| %8.4g %8.2f ",
                        errts, 100.0 * errts / varYts);
                PRINTF("|\n");
                fflush(stdout);
                // mexEvalString("pause(.001);"); // to dump string.
            }
            
//PRINTF("2.7.maxcat %d, jprint %d, doProx %d, oobProx %d, biasCorr %d, testdat %d\n", maxcat, *jprint, doProx, oobprox, biasCorr,testdat);
            
            mse[j] = errb;
            if (labelts) msets[j] = errts;
//PRINTF("2.701  j %d, nTree %d, errts %f errb %f \n", j, *nTree, errts,errb);
//PRINTF("2.71.maxcat %d, jprint %d, doProx %d, oobProx %d, biasCorr %d, testdat %d\n", maxcat, *jprint, doProx, oobprox, biasCorr,testdat);
            
            /*  DO PROXIMITIES */
            if (doProx) {
                computeProximity(prox, oobprox, nodex, in, oobpair, nsample);
                /* proximity for test data */
                if (testdat) {
                    /* In the next call, in and oobpair are not used. */
                    computeProximity(proxts, 0, nodexts, in, oobpair, ntest);
                    for (n = 0; n < ntest; ++n) {
                        for (k = 0; k < nsample; ++k) {
                            if (nodexts[n] == nodex[k]) {
                                proxts[n + ntest * (k+ntest)] += 1.0;
                            }
                        }
                    }
                }
            }
//PRINTF("2.8.maxcat %d, jprint %d, doProx %d, oobProx %d, biasCorr %d, testdat %d\n", maxcat, *jprint, doProx, oobprox, biasCorr,testdat);
            
            /* Variable importance */
            if (varImp) {
                for (mr = 0; mr < mdim; ++mr) {
                    if (varUsed[mr]) { /* Go ahead if the variable is used */
                        /* make a copy of the m-th variable into xtmp */
                        for (n = 0; n < nsample; ++n)
                            xtmp[n] = x[mr + n * mdim];
                        ooberrperm = 0.0;
                        for (k = 0; k < nPerm; ++k) {
                            permuteOOB(mr, x, in, nsample, mdim);
                            predictRegTree(x, nsample, mdim, lDaughter + idx,
                                    rDaughter + idx, nodestatus + idx, ytr,
                                    upper + idx, avnode + idx, mbest + idx,
                                    treeSize[j], cat, maxcat, nodex);
                            for (n = 0; n < nsample; ++n) {
                                if (in[n] == 0) {
                                    r = ytr[n] - y[n];
                                    ooberrperm += r * r;
                                    if (localImp) {
                                        impmat[mr + n * mdim] +=
                                                (r*r - resOOB[n]*resOOB[n]) / nPerm;
                                    }
                                }
                            }
                        }
                        delta = (ooberrperm / nPerm - ooberr) / nOOB;
                        errimp[mr] += delta;
                        impSD[mr] += delta * delta;
                        /* copy original data back */
                        for (n = 0; n < nsample; ++n)
                            x[mr + n * mdim] = xtmp[n];
                    }
                    
                }
                
            }
//	PRINTF("3. maxcat %d, jprint %d, doProx %d, oobProx %d, biasCorr %d testdat %d\n", maxcat, *jprint, doProx, oobprox, biasCorr,testdat);
            
        }
        PutRNGstate();
        /* end of tree iterations=======================================*/
        
        if (biasCorr) {  /* bias correction for predicted values */
            for (n = 0; n < nsample; ++n) {
                if (nout[n]) yptr[n] = coef[0] + coef[1] * yptr[n];
            }
            if (testdat) {
                for (n = 0; n < ntest; ++n) {
                    yTestPred[n] = coef[0] + coef[1] * yTestPred[n];
                }
            }
        }
        
        if (doProx) {
            for (n = 0; n < nsample; ++n) {
                for (k = n + 1; k < nsample; ++k) {
                    prox[nsample*k + n] /= oobprox ?
                        (oobpair[nsample*k + n] > 0 ? oobpair[nsample*k + n] : 1) :
                            *nTree;
                            prox[nsample * n + k] = prox[nsample * k + n];
                }
                prox[nsample * n + n] = 1.0;
            }
            if (testdat) {
                for (n = 0; n < ntest; ++n)
                    for (k = 0; k < ntest + nsample; ++k)
                        proxts[ntest*k + n] /= *nTree;
            }
        }
        
        if (varImp) {
            for (m = 0; m < mdim; ++m) {
                errimp[m] = errimp[m] / *nTree;
                impSD[m] = sqrt( ((impSD[m] / *nTree) -
                        (errimp[m] * errimp[m])) / *nTree );
                if (localImp) {
                    for (n = 0; n < nsample; ++n) {
                        impmat[m + n * mdim] /= nout[n];
                    }
                }
            }
        }
        for (m = 0; m < mdim; ++m) tgini[m] /= *nTree;
        
        
        //addition by abhi
        //in order to release the space stored by the variable in findBestSplit
        // call by setting
        in_findBestSplit=-99;
        findBestSplit(&tmp_d, &tmp_i, &tmp_d, tmp_i, tmp_i,
                tmp_i, tmp_i, &tmp_i, &tmp_d,
                &tmp_d, &tmp_i, &tmp_i, tmp_i,
                tmp_d, tmp_i, &tmp_i);
        
        //do the same freeing of space by calling with -99
        in_regTree=-99;
        regTree(&tmp_d, &tmp_d, tmp_i, tmp_i, &tmp_i,
                &tmp_i,
                &tmp_d, &tmp_d, &tmp_c, tmp_i,
                &tmp_i, tmp_i, tmp_i, &tmp_i, &tmp_i,
                &tmp_d, &tmp_i);
	
	
	free(yb);
        free(xb);
	free(ytr);
	free(xtmp);
	free(resOOB);
        free(in);
	free(nodex);
	free(varUsed);
    if (!(*replace)  )
        free(nind);
    
    if (testdat) {
		free(ytree);
		free(nodexts);
	}
	
	if (doProx && oobprox)
		free(oobpair) ;
}