Пример #1
0
SEXP R_TreeGrow(SEXP learnsample, SEXP weights, SEXP controls) {
            
     SEXP ans, tree, where, nweights, fitmem;
     double *dnweights, *dweights;
     int nobs, i, nodenum = 1, *iwhere;


     GetRNGstate();
     
     PROTECT(fitmem = ctree_memory(learnsample, PROTECT(ScalarLogical(1))));
     nobs = get_nobs(learnsample);
     PROTECT(ans = allocVector(VECSXP, 2));
     SET_VECTOR_ELT(ans, 0, where = allocVector(INTSXP, nobs));
     iwhere = INTEGER(where);
     for (int i = 0; i < nobs; i++) iwhere[i] = 0;
     SET_VECTOR_ELT(ans, 1, tree = allocVector(VECSXP, NODE_LENGTH));
     C_init_node(tree, nobs, get_ninputs(learnsample), get_maxsurrogate(get_splitctrl(controls)),
                 ncol(get_predict_trafo(GET_SLOT(learnsample, PL2_responsesSym))));

     nweights = S3get_nodeweights(tree);
     dnweights = REAL(nweights);
     dweights = REAL(weights);
     for (i = 0; i < nobs; i++) dnweights[i] = dweights[i];
     
     C_TreeGrow(tree, learnsample, fitmem, controls, iwhere, &nodenum, 1);

     if (LOGICAL(GET_SLOT(get_tgctrl(controls), PL2_remove_weightsSym))[0])
         C_remove_weights(tree, 0);
     
     PutRNGstate();
     
     UNPROTECT(3);
     return(ans);
}
Пример #2
0
void C_TreeGrow(SEXP node, SEXP learnsample, SEXP fitmem, 
                SEXP controls, int *where, int *nodenum, int depth) {

    SEXP weights;
    int nobs, i, stop;
    double *dweights;
    
    weights = S3get_nodeweights(node);
    
    /* stop if either stumps have been requested or 
       the maximum depth is exceeded */
    stop = (nodenum[0] == 2 || nodenum[0] == 3) && 
           get_stump(get_tgctrl(controls));
    stop = stop || !check_depth(get_tgctrl(controls), depth);
    
    if (stop)
        C_Node(node, learnsample, weights, fitmem, controls, 1, depth);
    else
        C_Node(node, learnsample, weights, fitmem, controls, 0, depth);
    
    S3set_nodeID(node, nodenum[0]);    
    
    if (!S3get_nodeterminal(node)) {

        C_splitnode(node, learnsample, controls);

        /* determine surrogate splits and split missing values */
        if (get_maxsurrogate(get_splitctrl(controls)) > 0) {
            C_surrogates(node, learnsample, weights, controls, fitmem);
            C_splitsurrogate(node, learnsample);
        }
            
        nodenum[0] += 1;
        C_TreeGrow(S3get_leftnode(node), learnsample, fitmem, 
                   controls, where, nodenum, depth + 1);

        nodenum[0] += 1;                                      
        C_TreeGrow(S3get_rightnode(node), learnsample, fitmem, 
                   controls, where, nodenum, depth + 1);
                   
    } else {
        dweights = REAL(weights);
        nobs = get_nobs(learnsample);
        for (i = 0; i < nobs; i++)
            if (dweights[i] > 0) where[i] = nodenum[0];
    } 
}
Пример #3
0
void C_Node(SEXP node, SEXP learnsample, SEXP weights,
            SEXP fitmem, SEXP controls, int TERMINAL, int depth) {

    int nobs, ninputs, jselect, q, j, k, i;
    double mincriterion, sweights, *dprediction;
    double *teststat, *pvalue, smax, cutpoint = 0.0, maxstat = 0.0;
    double *standstat, *splitstat;
    SEXP responses, inputs, x, expcovinf, linexpcov;
    SEXP varctrl, splitctrl, gtctrl, tgctrl, split, testy, predy;
    double *dxtransf, *thisweights;
    int *itable;

    nobs = get_nobs(learnsample);
    ninputs = get_ninputs(learnsample);
    varctrl = get_varctrl(controls);
    splitctrl = get_splitctrl(controls);
    gtctrl = get_gtctrl(controls);
    tgctrl = get_tgctrl(controls);
    mincriterion = get_mincriterion(gtctrl);
    responses = GET_SLOT(learnsample, PL2_responsesSym);
    inputs = GET_SLOT(learnsample, PL2_inputsSym);
    testy = get_test_trafo(responses);
    predy = get_predict_trafo(responses);
    q = ncol(testy);

    /* <FIXME> we compute C_GlobalTest even for TERMINAL nodes! </FIXME> */

    /* compute the test statistics and the node criteria for each input */
    C_GlobalTest(learnsample, weights, fitmem, varctrl,
                 gtctrl, get_minsplit(splitctrl),
                 REAL(S3get_teststat(node)), REAL(S3get_criterion(node)), depth);

    /* sum of weights: C_GlobalTest did nothing if sweights < mincriterion */
    sweights = REAL(GET_SLOT(GET_SLOT(fitmem, PL2_expcovinfSym),
                             PL2_sumweightsSym))[0];
    REAL(VECTOR_ELT(node, S3_SUMWEIGHTS))[0] = sweights;

    /* compute the prediction of this node */
    dprediction = REAL(S3get_prediction(node));

    /* <FIXME> feed raw numeric values OR dummy encoded factors as y
       Problem: what happens for survival times ? */
    C_prediction(REAL(predy), nobs, ncol(predy), REAL(weights),
                 sweights, dprediction);
    /* </FIXME> */

    teststat = REAL(S3get_teststat(node));
    pvalue = REAL(S3get_criterion(node));

    /* try the two out of ninputs best inputs variables */
    /* <FIXME> be more flexible and add a parameter controlling
               the number of inputs tried </FIXME> */
    for (j = 0; j < 2; j++) {

        smax = C_max(pvalue, ninputs);
        REAL(S3get_maxcriterion(node))[0] = smax;

        /* if the global null hypothesis was rejected */
        if (smax > mincriterion && !TERMINAL) {

            /* the input variable with largest association to the response */
            jselect = C_whichmax(pvalue, teststat, ninputs) + 1;

            /* get the raw numeric values or the codings of a factor */
            x = get_variable(inputs, jselect);
            if (has_missings(inputs, jselect)) {
                expcovinf = GET_SLOT(get_varmemory(fitmem, jselect),
                                     PL2_expcovinfSym);
                thisweights = C_tempweights(jselect, weights, fitmem, inputs);
            } else {
                expcovinf = GET_SLOT(fitmem, PL2_expcovinfSym);
                thisweights = REAL(weights);
            }

            /* <FIXME> handle ordered factors separatly??? </FIXME> */
            if (!is_nominal(inputs, jselect)) {

                /* search for a split in a ordered variable x */
                split = S3get_primarysplit(node);

                /* check if the n-vector of splitstatistics
                   should be returned for each primary split */
                if (get_savesplitstats(tgctrl)) {
                    C_init_orderedsplit(split, nobs);
                    splitstat = REAL(S3get_splitstatistics(split));
                } else {
                    C_init_orderedsplit(split, 0);
                    splitstat = REAL(get_splitstatistics(fitmem));
                }

                C_split(REAL(x), 1, REAL(testy), q, thisweights, nobs,
                        INTEGER(get_ordering(inputs, jselect)), splitctrl,
                        GET_SLOT(fitmem, PL2_linexpcov2sampleSym),
                        expcovinf, REAL(S3get_splitpoint(split)), &maxstat,
                        splitstat);
                S3set_variableID(split, jselect);
            } else {

                /* search of a set of levels (split) in a numeric variable x */
                split = S3get_primarysplit(node);

                /* check if the n-vector of splitstatistics
                   should be returned for each primary split */
                if (get_savesplitstats(tgctrl)) {
                    C_init_nominalsplit(split,
                                        LENGTH(get_levels(inputs, jselect)),
                                        nobs);
                    splitstat = REAL(S3get_splitstatistics(split));
                } else {
                    C_init_nominalsplit(split,
                                        LENGTH(get_levels(inputs, jselect)),
                                        0);
                    splitstat = REAL(get_splitstatistics(fitmem));
                }

                linexpcov = get_varmemory(fitmem, jselect);
                standstat = Calloc(get_dimension(linexpcov), double);
                C_standardize(REAL(GET_SLOT(linexpcov,
                                            PL2_linearstatisticSym)),
                              REAL(GET_SLOT(linexpcov, PL2_expectationSym)),
                              REAL(GET_SLOT(linexpcov, PL2_covarianceSym)),
                              get_dimension(linexpcov), get_tol(splitctrl),
                              standstat);

                C_splitcategorical(INTEGER(x),
                                   LENGTH(get_levels(inputs, jselect)),
                                   REAL(testy), q, thisweights,
                                   nobs, standstat, splitctrl,
                                   GET_SLOT(fitmem, PL2_linexpcov2sampleSym),
                                   expcovinf, &cutpoint,
                                   INTEGER(S3get_splitpoint(split)),
                                   &maxstat, splitstat);

                /* compute which levels of a factor are available in this node
                   (for printing) later on. A real `table' for this node would
                   induce too much overhead here. Maybe later. */

                itable = INTEGER(S3get_table(split));
                dxtransf = REAL(get_transformation(inputs, jselect));
                for (k = 0; k < LENGTH(get_levels(inputs, jselect)); k++) {
                    itable[k] = 0;
                    for (i = 0; i < nobs; i++) {
                        if (dxtransf[k * nobs + i] * thisweights[i] > 0) {
                            itable[k] = 1;
                            continue;
                        }
                    }
                }

                Free(standstat);
            }
            if (maxstat == 0) {
                if (j == 1) {
                    S3set_nodeterminal(node);
                } else {
                    /* do not look at jselect in next iteration */
                    pvalue[jselect - 1] = R_NegInf;
                }
            } else {
                S3set_variableID(split, jselect);
                break;
            }
        } else {