Beispiel #1
0
void C_surrogates(SEXP node, SEXP learnsample, SEXP weights, SEXP controls, 
                  SEXP fitmem) {

    SEXP x, y, expcovinf; 
    SEXP splitctrl, inputs; 
    SEXP split, thiswhichNA;
    int nobs, ninputs, i, j, k, jselect, maxsurr, *order, nvar = 0;
    double ms, cp, *thisweights, *cutpoint, *maxstat, 
           *splitstat, *dweights, *tweights, *dx, *dy;
    double cut, *twotab, *ytmp, sumw = 0.0;
    
    nobs = get_nobs(learnsample);
    ninputs = get_ninputs(learnsample);
    splitctrl = get_splitctrl(controls);
    maxsurr = get_maxsurrogate(splitctrl);
    inputs = GET_SLOT(learnsample, PL2_inputsSym);
    jselect = S3get_variableID(S3get_primarysplit(node));
    
    /* (weights > 0) in left node are the new `response' to be approximated */
    y = S3get_nodeweights(VECTOR_ELT(node, S3_LEFT));
    ytmp = Calloc(nobs, double);
    for (i = 0; i < nobs; i++) {
        ytmp[i] = REAL(y)[i];
        if (ytmp[i] > 1.0) ytmp[i] = 1.0;
    }

    for (j = 0; j < ninputs; j++) {
        if (is_nominal(inputs, j + 1)) continue;
        nvar++;
    }
    nvar--;

    if (maxsurr != LENGTH(S3get_surrogatesplits(node)))
        error("nodes does not have %d surrogate splits", maxsurr);
    if (maxsurr > nvar)
        error("cannot set up %d surrogate splits with only %d ordered input variable(s)", 
              maxsurr, nvar);

    tweights = Calloc(nobs, double);
    dweights = REAL(weights);
    for (i = 0; i < nobs; i++) tweights[i] = dweights[i];
    if (has_missings(inputs, jselect)) {
        thiswhichNA = get_missings(inputs, jselect);
        for (k = 0; k < LENGTH(thiswhichNA); k++)
            tweights[INTEGER(thiswhichNA)[k] - 1] = 0.0;
    }

    /* check if sum(weights) > 1 */
    sumw = 0.0;
    for (i = 0; i < nobs; i++) sumw += tweights[i];
    if (sumw < 2.0)
        error("can't implement surrogate splits, not enough observations available");

    expcovinf = GET_SLOT(fitmem, PL2_expcovinfssSym);
    C_ExpectCovarInfluence(ytmp, 1, tweights, nobs, expcovinf);
    
    splitstat = REAL(get_splitstatistics(fitmem));
    /* <FIXME> extend `TreeFitMemory' to those as well ... */
    maxstat = Calloc(ninputs, double);
    cutpoint = Calloc(ninputs, double);
    order = Calloc(ninputs, int);
    /* <FIXME> */
    
    /* this is essentially an exhaustive search */
    /* <FIXME>: we don't want to do this for random forest like trees 
       </FIXME>
     */
    for (j = 0; j < ninputs; j++) {
    
         order[j] = j + 1;
         maxstat[j] = 0.0;
         cutpoint[j] = 0.0;

         /* ordered input variables only (for the moment) */
         if ((j + 1) == jselect || is_nominal(inputs, j + 1))
             continue;

         x = get_variable(inputs, j + 1);

         if (has_missings(inputs, j + 1)) {

             /* update _tweights_ wrt missings in variable j + 1 */
             thisweights = C_tempweights(j + 1, tweights, fitmem, inputs);

             /* check if sum(weights) > 1 */
             sumw = 0.0;
             for (i = 0; i < nobs; i++) sumw += thisweights[i];
             if (sumw < 2.0) continue;
                 
             C_ExpectCovarInfluence(ytmp, 1, thisweights, nobs, expcovinf);
             
             C_split(REAL(x), 1, ytmp, 1, thisweights, nobs,
                     INTEGER(get_ordering(inputs, j + 1)), splitctrl,
                     GET_SLOT(fitmem, PL2_linexpcov2sampleSym),
                     expcovinf, 1, &cp, &ms, splitstat);
         } else {
         
             C_split(REAL(x), 1, ytmp, 1, tweights, nobs,
             INTEGER(get_ordering(inputs, j + 1)), splitctrl,
             GET_SLOT(fitmem, PL2_linexpcov2sampleSym),
             expcovinf, 1, &cp, &ms, splitstat);
         }

         maxstat[j] = -ms;
         cutpoint[j] = cp;
    }


    /* order with respect to maximal statistic */
    rsort_with_index(maxstat, order, ninputs);
    
    twotab = Calloc(4, double);
    
    /* the best `maxsurr' ones are implemented */
    for (j = 0; j < maxsurr; j++) {

        if (is_nominal(inputs, order[j])) continue;
        
        for (i = 0; i < 4; i++) twotab[i] = 0.0;
        cut = cutpoint[order[j] - 1];
        /* this might give warnings about split being
           UNPROTECTed but is is since node is PROTECTed */
        PROTECT(split = allocVector(VECSXP, SPLIT_LENGTH));
        SET_VECTOR_ELT(S3get_surrogatesplits(node), j, split);
        C_init_orderedsplit(split, 0);
        S3set_variableID(split, order[j]);
        REAL(S3get_splitpoint(split))[0] = cut;
        dx = REAL(get_variable(inputs, order[j]));
        dy = REAL(y);

        /* OK, this is a dirty hack: determine if the split 
           goes left or right by the Pearson residual of a 2x2 table.
           I don't want to use the big caliber here 
        */
        for (i = 0; i < nobs; i++) {
            twotab[0] += ((dy[i] == 1) && (dx[i] <= cut)) * tweights[i];
            twotab[1] += (dy[i] == 1) * tweights[i];
            twotab[2] += (dx[i] <= cut) * tweights[i];
            twotab[3] += tweights[i];
        }
        S3set_toleft(split, (int) (twotab[0] - twotab[1] * twotab[2] / 
                     twotab[3]) > 0);
        UNPROTECT(1);
    }
    
    Free(maxstat);
    Free(cutpoint);
    Free(order);
    Free(tweights);
    Free(twotab);
    Free(ytmp);
}
Beispiel #2
0
void C_Node(SEXP node, SEXP learnsample, SEXP weights,
            SEXP fitmem, SEXP controls, int TERMINAL, int depth) {

    int nobs, ninputs, jselect, q, j, k, i;
    double mincriterion, sweights, *dprediction;
    double *teststat, *pvalue, smax, cutpoint = 0.0, maxstat = 0.0;
    double *standstat, *splitstat;
    SEXP responses, inputs, x, expcovinf, linexpcov;
    SEXP varctrl, splitctrl, gtctrl, tgctrl, split, testy, predy;
    double *dxtransf, *thisweights;
    int *itable;

    nobs = get_nobs(learnsample);
    ninputs = get_ninputs(learnsample);
    varctrl = get_varctrl(controls);
    splitctrl = get_splitctrl(controls);
    gtctrl = get_gtctrl(controls);
    tgctrl = get_tgctrl(controls);
    mincriterion = get_mincriterion(gtctrl);
    responses = GET_SLOT(learnsample, PL2_responsesSym);
    inputs = GET_SLOT(learnsample, PL2_inputsSym);
    testy = get_test_trafo(responses);
    predy = get_predict_trafo(responses);
    q = ncol(testy);

    /* <FIXME> we compute C_GlobalTest even for TERMINAL nodes! </FIXME> */

    /* compute the test statistics and the node criteria for each input */
    C_GlobalTest(learnsample, weights, fitmem, varctrl,
                 gtctrl, get_minsplit(splitctrl),
                 REAL(S3get_teststat(node)), REAL(S3get_criterion(node)), depth);

    /* sum of weights: C_GlobalTest did nothing if sweights < mincriterion */
    sweights = REAL(GET_SLOT(GET_SLOT(fitmem, PL2_expcovinfSym),
                             PL2_sumweightsSym))[0];
    REAL(VECTOR_ELT(node, S3_SUMWEIGHTS))[0] = sweights;

    /* compute the prediction of this node */
    dprediction = REAL(S3get_prediction(node));

    /* <FIXME> feed raw numeric values OR dummy encoded factors as y
       Problem: what happens for survival times ? */
    C_prediction(REAL(predy), nobs, ncol(predy), REAL(weights),
                 sweights, dprediction);
    /* </FIXME> */

    teststat = REAL(S3get_teststat(node));
    pvalue = REAL(S3get_criterion(node));

    /* try the two out of ninputs best inputs variables */
    /* <FIXME> be more flexible and add a parameter controlling
               the number of inputs tried </FIXME> */
    for (j = 0; j < 2; j++) {

        smax = C_max(pvalue, ninputs);
        REAL(S3get_maxcriterion(node))[0] = smax;

        /* if the global null hypothesis was rejected */
        if (smax > mincriterion && !TERMINAL) {

            /* the input variable with largest association to the response */
            jselect = C_whichmax(pvalue, teststat, ninputs) + 1;

            /* get the raw numeric values or the codings of a factor */
            x = get_variable(inputs, jselect);
            if (has_missings(inputs, jselect)) {
                expcovinf = GET_SLOT(get_varmemory(fitmem, jselect),
                                     PL2_expcovinfSym);
                thisweights = C_tempweights(jselect, weights, fitmem, inputs);
            } else {
                expcovinf = GET_SLOT(fitmem, PL2_expcovinfSym);
                thisweights = REAL(weights);
            }

            /* <FIXME> handle ordered factors separatly??? </FIXME> */
            if (!is_nominal(inputs, jselect)) {

                /* search for a split in a ordered variable x */
                split = S3get_primarysplit(node);

                /* check if the n-vector of splitstatistics
                   should be returned for each primary split */
                if (get_savesplitstats(tgctrl)) {
                    C_init_orderedsplit(split, nobs);
                    splitstat = REAL(S3get_splitstatistics(split));
                } else {
                    C_init_orderedsplit(split, 0);
                    splitstat = REAL(get_splitstatistics(fitmem));
                }

                C_split(REAL(x), 1, REAL(testy), q, thisweights, nobs,
                        INTEGER(get_ordering(inputs, jselect)), splitctrl,
                        GET_SLOT(fitmem, PL2_linexpcov2sampleSym),
                        expcovinf, REAL(S3get_splitpoint(split)), &maxstat,
                        splitstat);
                S3set_variableID(split, jselect);
            } else {

                /* search of a set of levels (split) in a numeric variable x */
                split = S3get_primarysplit(node);

                /* check if the n-vector of splitstatistics
                   should be returned for each primary split */
                if (get_savesplitstats(tgctrl)) {
                    C_init_nominalsplit(split,
                                        LENGTH(get_levels(inputs, jselect)),
                                        nobs);
                    splitstat = REAL(S3get_splitstatistics(split));
                } else {
                    C_init_nominalsplit(split,
                                        LENGTH(get_levels(inputs, jselect)),
                                        0);
                    splitstat = REAL(get_splitstatistics(fitmem));
                }

                linexpcov = get_varmemory(fitmem, jselect);
                standstat = Calloc(get_dimension(linexpcov), double);
                C_standardize(REAL(GET_SLOT(linexpcov,
                                            PL2_linearstatisticSym)),
                              REAL(GET_SLOT(linexpcov, PL2_expectationSym)),
                              REAL(GET_SLOT(linexpcov, PL2_covarianceSym)),
                              get_dimension(linexpcov), get_tol(splitctrl),
                              standstat);

                C_splitcategorical(INTEGER(x),
                                   LENGTH(get_levels(inputs, jselect)),
                                   REAL(testy), q, thisweights,
                                   nobs, standstat, splitctrl,
                                   GET_SLOT(fitmem, PL2_linexpcov2sampleSym),
                                   expcovinf, &cutpoint,
                                   INTEGER(S3get_splitpoint(split)),
                                   &maxstat, splitstat);

                /* compute which levels of a factor are available in this node
                   (for printing) later on. A real `table' for this node would
                   induce too much overhead here. Maybe later. */

                itable = INTEGER(S3get_table(split));
                dxtransf = REAL(get_transformation(inputs, jselect));
                for (k = 0; k < LENGTH(get_levels(inputs, jselect)); k++) {
                    itable[k] = 0;
                    for (i = 0; i < nobs; i++) {
                        if (dxtransf[k * nobs + i] * thisweights[i] > 0) {
                            itable[k] = 1;
                            continue;
                        }
                    }
                }

                Free(standstat);
            }
            if (maxstat == 0) {
                if (j == 1) {
                    S3set_nodeterminal(node);
                } else {
                    /* do not look at jselect in next iteration */
                    pvalue[jselect - 1] = R_NegInf;
                }
            } else {
                S3set_variableID(split, jselect);
                break;
            }
        } else {