/**
    Send observations whose primary split variable is missing to the left
    or right daughter node by means of the node's surrogate splits.

    For each observation recorded as missing in the primary split variable
    and carrying a non-zero node weight, the surrogate splits are tried in
    the order they are stored. The first surrogate whose own variable is not
    missing for that observation decides the side: the node weight is
    written to the chosen daughter's weight vector and 0.0 to the other one.
    When no surrogate is usable, the daughters' weights stay untouched.

    *\param node the current node (primary split, surrogate splits and both
                 daughters are read from it; daughter weights are modified)
    *\param learnsample the learning sample providing the `inputs' slot
*/

void C_splitsurrogate(SEXP node, SEXP learnsample) {

    SEXP inputs, primary, surrogates, candidate;
    SEXP missing_primary, missing_candidate;
    double *nodew, *leftw, *rightw, *xvals, cut;
    int *missing_idx;
    int nmissing, m, obs, s, goes_left;

    nodew = REAL(S3get_nodeweights(node));
    inputs = GET_SLOT(learnsample, PL2_inputsSym);
    leftw = REAL(S3get_nodeweights(S3get_leftnode(node)));
    rightw = REAL(S3get_nodeweights(S3get_rightnode(node)));
    surrogates = S3get_surrogatesplits(node);
    primary = S3get_primarysplit(node);

    /* nothing to do unless the primary split variable has missings */
    if (!has_missings(inputs, S3get_variableID(primary)))
        return;

    /* 1-based indices of the observations with a missing primary value */
    missing_primary = get_missings(inputs, S3get_variableID(primary));
    missing_idx = INTEGER(missing_primary);
    nmissing = LENGTH(missing_primary);

    for (m = 0; m < nmissing; m++) {

        obs = missing_idx[m] - 1;

        /* observations without weight in this node are irrelevant */
        if (nodew[obs] == 0)
            continue;

        /* walk through the surrogates until one can be applied */
        for (s = 0; s < LENGTH(surrogates); s++) {

            candidate = VECTOR_ELT(surrogates, s);

            /* a surrogate that is itself missing here is of no use */
            if (has_missings(inputs, S3get_variableID(candidate))) {
                missing_candidate = get_missings(inputs,
                    S3get_variableID(candidate));
                if (C_i_in_set(obs + 1, missing_candidate))
                    continue;
            }

            cut = REAL(S3get_splitpoint(candidate))[0];
            xvals = REAL(get_variable(inputs, S3get_variableID(candidate)));

            /* `toleft' tells whether values <= cut go left or right */
            if (S3get_toleft(candidate))
                goes_left = (xvals[obs] <= cut);
            else
                goes_left = !(xvals[obs] <= cut);

            if (goes_left) {
                leftw[obs] = nodew[obs];
                rightw[obs] = 0.0;
            } else {
                rightw[obs] = nodew[obs];
                leftw[obs] = 0.0;
            }

            /* first applicable surrogate wins */
            break;
        }
    }
}
/**
    Search for surrogate splits of a node and store them in the node.

    The (0/1-truncated) weights of the left daughter node serve as the
    `response'; every ordered input variable other than the primary split
    variable is searched for its best cutpoint via C_split, and the
    `maxsurrogate' variables with the largest split statistic are stored as
    surrogate splits in `node'. The direction of each surrogate (`toleft')
    is derived from the sign of the residual of a weighted 2x2 table.

    Raises an R error when the node does not hold exactly `maxsurrogate'
    surrogate slots, when there are fewer ordered candidate variables than
    requested surrogates, or when the sum of the usable weights is below 2.
    Temporary buffers are released before the latter error is raised and the
    former two checks run before anything is allocated, so these error paths
    do not leak Calloc'ed memory.

    *\param node the current node with primary split and daughters set up
    *\param learnsample the learning sample
    *\param weights the case weights of this node
    *\param controls the tree controls (split controls are read from it)
    *\param fitmem the fit memory used for intermediate computations
*/

void C_surrogates(SEXP node, SEXP learnsample, SEXP weights, SEXP controls,
                  SEXP fitmem) {

    SEXP x, y, expcovinf;
    SEXP splitctrl, inputs;
    SEXP split, thiswhichNA;
    int nobs, ninputs, i, j, k, jselect, maxsurr, *order, nvar = 0;
    double ms, cp, *thisweights, *cutpoint, *maxstat,
           *splitstat, *dweights, *tweights, *dx, *dy;
    double cut, residual, twotab[4], *ytmp, sumw = 0.0;

    nobs = get_nobs(learnsample);
    ninputs = get_ninputs(learnsample);
    splitctrl = get_splitctrl(controls);
    maxsurr = get_maxsurrogate(splitctrl);
    inputs = GET_SLOT(learnsample, PL2_inputsSym);
    jselect = S3get_variableID(S3get_primarysplit(node));

    /* count the ordered inputs; one of them is not available as surrogate
       (the count is decremented once, as in the original implementation) */
    for (j = 0; j < ninputs; j++) {
        if (is_nominal(inputs, j + 1)) continue;
        nvar++;
    }
    nvar--;

    /* validate _before_ allocating anything: error() does not return and
       would leave Calloc'ed memory behind */
    if (maxsurr != LENGTH(S3get_surrogatesplits(node)))
        error("nodes does not have %d surrogate splits", maxsurr);
    if (maxsurr > nvar)
        error("cannot set up %d surrogate splits with only %d ordered input variable(s)",
              maxsurr, nvar);

    /* (weights > 0) in left node are the new `response' to be approximated */
    y = S3get_nodeweights(VECTOR_ELT(node, S3_LEFT));
    ytmp = Calloc(nobs, double);
    for (i = 0; i < nobs; i++) {
        ytmp[i] = REAL(y)[i];
        if (ytmp[i] > 1.0) ytmp[i] = 1.0;
    }

    /* working copy of the weights; observations with a missing value in
       the primary split variable get weight zero */
    tweights = Calloc(nobs, double);
    dweights = REAL(weights);
    for (i = 0; i < nobs; i++) tweights[i] = dweights[i];
    if (has_missings(inputs, jselect)) {
        thiswhichNA = get_missings(inputs, jselect);
        for (k = 0; k < LENGTH(thiswhichNA); k++)
            tweights[INTEGER(thiswhichNA)[k] - 1] = 0.0;
    }

    /* check if sum(weights) > 1 */
    sumw = 0.0;
    for (i = 0; i < nobs; i++) sumw += tweights[i];
    if (sumw < 2.0) {
        /* release the buffers first, error() performs a non-local exit */
        Free(tweights);
        Free(ytmp);
        error("can't implement surrogate splits, not enough observations available");
    }

    expcovinf = GET_SLOT(fitmem, PL2_expcovinfssSym);
    C_ExpectCovarInfluence(ytmp, 1, tweights, nobs, expcovinf);

    splitstat = REAL(get_splitstatistics(fitmem));
    /* <FIXME> extend `TreeFitMemory' to those as well ... */
    maxstat = Calloc(ninputs, double);
    cutpoint = Calloc(ninputs, double);
    order = Calloc(ninputs, int);
    /* <FIXME> */

    /* this is essentially an exhaustive search */
    /* <FIXME>: we don't want to do this for random forest like trees
       </FIXME>
     */
    for (j = 0; j < ninputs; j++) {

        order[j] = j + 1;
        maxstat[j] = 0.0;
        cutpoint[j] = 0.0;

        /* ordered input variables only (for the moment) */
        if ((j + 1) == jselect || is_nominal(inputs, j + 1))
            continue;

        x = get_variable(inputs, j + 1);

        if (has_missings(inputs, j + 1)) {

            /* update _tweights_ wrt missings in variable j + 1 */
            thisweights = C_tempweights(j + 1, tweights, fitmem, inputs);

            /* check if sum(weights) > 1 */
            sumw = 0.0;
            for (i = 0; i < nobs; i++) sumw += thisweights[i];
            if (sumw < 2.0) continue;

            /* NOTE(review): expcovinf is presumably overwritten here with
               values based on `thisweights'; the branch below for variables
               without missings reuses expcovinf as is -- confirm that this
               is intended */
            C_ExpectCovarInfluence(ytmp, 1, thisweights, nobs, expcovinf);

            C_split(REAL(x), 1, ytmp, 1, thisweights, nobs,
                    INTEGER(get_ordering(inputs, j + 1)), splitctrl,
                    GET_SLOT(fitmem, PL2_linexpcov2sampleSym),
                    expcovinf, 1, &cp, &ms, splitstat);
        } else {

            C_split(REAL(x), 1, ytmp, 1, tweights, nobs,
                    INTEGER(get_ordering(inputs, j + 1)), splitctrl,
                    GET_SLOT(fitmem, PL2_linexpcov2sampleSym),
                    expcovinf, 1, &cp, &ms, splitstat);
        }

        /* negated, so that the ascending sort below puts the largest
           statistics first */
        maxstat[j] = -ms;
        cutpoint[j] = cp;
    }

    /* order with respect to maximal statistic */
    rsort_with_index(maxstat, order, ninputs);

    /* the best `maxsurr' ones are implemented */
    for (j = 0; j < maxsurr; j++) {

        if (is_nominal(inputs, order[j])) continue;

        for (i = 0; i < 4; i++) twotab[i] = 0.0;
        cut = cutpoint[order[j] - 1];

        /* this might give warnings about split being UNPROTECTed
           but it is since node is PROTECTed */
        PROTECT(split = allocVector(VECSXP, SPLIT_LENGTH));
        SET_VECTOR_ELT(S3get_surrogatesplits(node), j, split);
        C_init_orderedsplit(split, 0);
        S3set_variableID(split, order[j]);
        REAL(S3get_splitpoint(split))[0] = cut;
        dx = REAL(get_variable(inputs, order[j]));
        dy = REAL(y);

        /* OK, this is a dirty hack: determine if the split
           goes left or right by the Pearson residual of a 2x2 table.
           I don't want to use the big caliber here
         */
        for (i = 0; i < nobs; i++) {
            twotab[0] += ((dy[i] == 1) && (dx[i] <= cut)) * tweights[i];
            twotab[1] += (dy[i] == 1) * tweights[i];
            twotab[2] += (dx[i] <= cut) * tweights[i];
            twotab[3] += tweights[i];
        }

        /* observed minus expected count of the (left, <= cut) cell.
           Compare the double itself: casting it to int before the
           comparison truncated values in (0, 1) to zero and thereby
           flipped the direction of the split */
        residual = twotab[0] - twotab[1] * twotab[2] / twotab[3];
        S3set_toleft(split, residual > 0);
        UNPROTECT(1);
    }

    Free(maxstat);
    Free(cutpoint);
    Free(order);
    Free(tweights);
    Free(ytmp);
}
void C_Node(SEXP node, SEXP learnsample, SEXP weights, SEXP fitmem, SEXP controls, int TERMINAL, int depth) { int nobs, ninputs, jselect, q, j, k, i; double mincriterion, sweights, *dprediction; double *teststat, *pvalue, smax, cutpoint = 0.0, maxstat = 0.0; double *standstat, *splitstat; SEXP responses, inputs, x, expcovinf, linexpcov; SEXP varctrl, splitctrl, gtctrl, tgctrl, split, testy, predy; double *dxtransf, *thisweights; int *itable; nobs = get_nobs(learnsample); ninputs = get_ninputs(learnsample); varctrl = get_varctrl(controls); splitctrl = get_splitctrl(controls); gtctrl = get_gtctrl(controls); tgctrl = get_tgctrl(controls); mincriterion = get_mincriterion(gtctrl); responses = GET_SLOT(learnsample, PL2_responsesSym); inputs = GET_SLOT(learnsample, PL2_inputsSym); testy = get_test_trafo(responses); predy = get_predict_trafo(responses); q = ncol(testy); /* <FIXME> we compute C_GlobalTest even for TERMINAL nodes! </FIXME> */ /* compute the test statistics and the node criteria for each input */ C_GlobalTest(learnsample, weights, fitmem, varctrl, gtctrl, get_minsplit(splitctrl), REAL(S3get_teststat(node)), REAL(S3get_criterion(node)), depth); /* sum of weights: C_GlobalTest did nothing if sweights < mincriterion */ sweights = REAL(GET_SLOT(GET_SLOT(fitmem, PL2_expcovinfSym), PL2_sumweightsSym))[0]; REAL(VECTOR_ELT(node, S3_SUMWEIGHTS))[0] = sweights; /* compute the prediction of this node */ dprediction = REAL(S3get_prediction(node)); /* <FIXME> feed raw numeric values OR dummy encoded factors as y Problem: what happens for survival times ? 
*/ C_prediction(REAL(predy), nobs, ncol(predy), REAL(weights), sweights, dprediction); /* </FIXME> */ teststat = REAL(S3get_teststat(node)); pvalue = REAL(S3get_criterion(node)); /* try the two out of ninputs best inputs variables */ /* <FIXME> be more flexible and add a parameter controlling the number of inputs tried </FIXME> */ for (j = 0; j < 2; j++) { smax = C_max(pvalue, ninputs); REAL(S3get_maxcriterion(node))[0] = smax; /* if the global null hypothesis was rejected */ if (smax > mincriterion && !TERMINAL) { /* the input variable with largest association to the response */ jselect = C_whichmax(pvalue, teststat, ninputs) + 1; /* get the raw numeric values or the codings of a factor */ x = get_variable(inputs, jselect); if (has_missings(inputs, jselect)) { expcovinf = GET_SLOT(get_varmemory(fitmem, jselect), PL2_expcovinfSym); thisweights = C_tempweights(jselect, weights, fitmem, inputs); } else { expcovinf = GET_SLOT(fitmem, PL2_expcovinfSym); thisweights = REAL(weights); } /* <FIXME> handle ordered factors separatly??? 
</FIXME> */ if (!is_nominal(inputs, jselect)) { /* search for a split in a ordered variable x */ split = S3get_primarysplit(node); /* check if the n-vector of splitstatistics should be returned for each primary split */ if (get_savesplitstats(tgctrl)) { C_init_orderedsplit(split, nobs); splitstat = REAL(S3get_splitstatistics(split)); } else { C_init_orderedsplit(split, 0); splitstat = REAL(get_splitstatistics(fitmem)); } C_split(REAL(x), 1, REAL(testy), q, thisweights, nobs, INTEGER(get_ordering(inputs, jselect)), splitctrl, GET_SLOT(fitmem, PL2_linexpcov2sampleSym), expcovinf, REAL(S3get_splitpoint(split)), &maxstat, splitstat); S3set_variableID(split, jselect); } else { /* search of a set of levels (split) in a numeric variable x */ split = S3get_primarysplit(node); /* check if the n-vector of splitstatistics should be returned for each primary split */ if (get_savesplitstats(tgctrl)) { C_init_nominalsplit(split, LENGTH(get_levels(inputs, jselect)), nobs); splitstat = REAL(S3get_splitstatistics(split)); } else { C_init_nominalsplit(split, LENGTH(get_levels(inputs, jselect)), 0); splitstat = REAL(get_splitstatistics(fitmem)); } linexpcov = get_varmemory(fitmem, jselect); standstat = Calloc(get_dimension(linexpcov), double); C_standardize(REAL(GET_SLOT(linexpcov, PL2_linearstatisticSym)), REAL(GET_SLOT(linexpcov, PL2_expectationSym)), REAL(GET_SLOT(linexpcov, PL2_covarianceSym)), get_dimension(linexpcov), get_tol(splitctrl), standstat); C_splitcategorical(INTEGER(x), LENGTH(get_levels(inputs, jselect)), REAL(testy), q, thisweights, nobs, standstat, splitctrl, GET_SLOT(fitmem, PL2_linexpcov2sampleSym), expcovinf, &cutpoint, INTEGER(S3get_splitpoint(split)), &maxstat, splitstat); /* compute which levels of a factor are available in this node (for printing) later on. A real `table' for this node would induce too much overhead here. Maybe later. 
*/ itable = INTEGER(S3get_table(split)); dxtransf = REAL(get_transformation(inputs, jselect)); for (k = 0; k < LENGTH(get_levels(inputs, jselect)); k++) { itable[k] = 0; for (i = 0; i < nobs; i++) { if (dxtransf[k * nobs + i] * thisweights[i] > 0) { itable[k] = 1; continue; } } } Free(standstat); } if (maxstat == 0) { if (j == 1) { S3set_nodeterminal(node); } else { /* do not look at jselect in next iteration */ pvalue[jselect - 1] = R_NegInf; } } else { S3set_variableID(split, jselect); break; } } else {