// Thin wrapper around CGBM::TransferTreeToRList.
//
//   pGBM            object whose TransferTreeToRList member is invoked
//   vecSplitCodes   forwarded by reference
//   aiSplitVar ... adPred
//                   caller-owned output arrays, forwarded untouched
//   cCatSplitsOld   forwarded unchanged
//
// Returns the status produced by TransferTreeToRList, whether it signals
// success or failure.
GBMRESULT gbm_transfer_to_R
(
    CGBM *pGBM,
    VEC_VEC_CATEGORIES &vecSplitCodes,
    int *aiSplitVar,
    double *adSplitPoint,
    int *aiLeftNode,
    int *aiRightNode,
    int *aiMissingNode,
    double *adErrorReduction,
    double *adWeight,
    double *adPred,
    int cCatSplitsOld
)
{
    // Nothing needs releasing here, so the callee's status is handed straight
    // back to the caller.
    const GBMRESULT status = pGBM->TransferTreeToRList(aiSplitVar,
                                                       adSplitPoint,
                                                       aiLeftNode,
                                                       aiRightNode,
                                                       aiMissingNode,
                                                       adErrorReduction,
                                                       adWeight,
                                                       adPred,
                                                       vecSplitCodes,
                                                       cCatSplitsOld);
    return status;
}
// Adjusts the tree starting at the root node and then writes, for every
// training observation, the prediction of the terminal node it is assigned to.
//
//   aiNodeAssign    for each observation, an index into vecpTermNodes
//   adFadj          output; adFadj[k] receives the assigned node's dPrediction
//   cTrain          number of observations to fill in
//   vecpTermNodes   terminal nodes, indexed by the values in aiNodeAssign
//   cMinObsInNode   forwarded to the root node's Adjust()
//
// Returns the root node's Adjust() status; adFadj is left untouched when that
// status indicates failure.
GBMRESULT CCARTTree::Adjust
(
    unsigned long *aiNodeAssign,
    double *adFadj,
    unsigned long cTrain,
    VEC_P_NODETERMINAL &vecpTermNodes,
    unsigned long cMinObsInNode
)
{
    unsigned long status = pRootNode->Adjust(cMinObsInNode);
    if(GBM_FAILED(status))
    {
        return status;
    }

    // predict for the training observations
    for(unsigned long k = 0; k < cTrain; k++)
    {
        const unsigned long iTerminal = aiNodeAssign[k];
        adFadj[k] = vecpTermNodes[iTerminal]->dPrediction;
    }

    return status;
}
// Discards the current tree (if any) and returns the tree-growing state to
// its initial values.
//
// Returns the status of the root node's RecycleSelf() when a tree exists,
// GBM_OK otherwise. When recycling fails, none of the members are reset.
GBMRESULT CCARTTree::Reset()
{
    GBMRESULT hr = GBM_OK;

    if(pRootNode != NULL)
    {
        // delete the old tree and start over
        hr = pRootNode->RecycleSelf(pNodeFactory);
        if(GBM_FAILED(hr))
        {
            goto Error;
        }

        // The old tree has been handed back, so the root pointer must not keep
        // referring to it: a second Reset() before the next grow() would
        // otherwise recycle the same nodes again.
        pRootNode = NULL;
    }

    iBestNode = 0;
    dBestNodeImprovement = 0.0;

    schWhichNode = 0;

    pNewSplitNode    = NULL;
    pNewLeftNode     = NULL;
    pNewRightNode    = NULL;
    pNewMissingNode  = NULL;
    pInitialRootNode = NULL;

Cleanup:
    return hr;
Error:
    goto Cleanup;
}
// Prints the tree held in ptreeTemp by delegating to its Print() member.
// Returns the status reported by Print().
GBMRESULT CGBM::PrintTree()
{
    const GBMRESULT status = ptreeTemp->Print();
    return status;
}
// Asks the root node to fill in adRelInf.
//
//   adRelInf   caller-owned array, forwarded to the root node
//
// Returns GBM_OK without touching adRelInf when there is no tree; otherwise
// returns the root node's status.
GBMRESULT CCARTTree::GetVarRelativeInfluence
(
    double *adRelInf
)
{
    // No tree yet: nothing to report, and that is not an error.
    if(pRootNode == NULL)
    {
        return GBM_OK;
    }

    return pRootNode->GetVarRelativeInfluence(adRelInf);
}
// Writes this split node into slot iNodeID of the output arrays and then
// recurses into the left, right and missing children, in that order.
//
//   iNodeID         in/out; on entry the slot for this node. It is advanced
//                   past this node here and is passed by reference to each
//                   child's TransferTreeToRList.
//   pData ... dShrinkage
//                   forwarded unchanged to every child
//   adPred          receives dShrinkage*dPrediction for this node
//
// The slot recorded for a child in aiLeftNode/aiRightNode/aiMissingNode is the
// value of iNodeID just before that child is visited.
// Returns the first failing child status, or the status of the last child.
GBMRESULT CNodeContinuous::TransferTreeToRList
(
    int &iNodeID,
    CDataset *pData,
    int *aiSplitVar,
    double *adSplitPoint,
    int *aiLeftNode,
    int *aiRightNode,
    int *aiMissingNode,
    double *adErrorReduction,
    double *adWeight,
    double *adPred,
    VEC_VEC_CATEGORIES &vecSplitCodes,
    int cCatSplitsOld,
    double dShrinkage
)
{
    // Claim the current slot for this node; children are numbered after it.
    const int iSelf = iNodeID++;

    aiSplitVar[iSelf]       = iSplitVar;
    adSplitPoint[iSelf]     = dSplitValue;
    adErrorReduction[iSelf] = dImprovement;
    adWeight[iSelf]         = dTrainW;
    adPred[iSelf]           = dShrinkage*dPrediction;

    GBMRESULT status = GBM_OK;

    // left subtree
    aiLeftNode[iSelf] = iNodeID;
    status = pLeftNode->TransferTreeToRList(iNodeID, pData,
                                            aiSplitVar, adSplitPoint,
                                            aiLeftNode, aiRightNode,
                                            aiMissingNode, adErrorReduction,
                                            adWeight, adPred,
                                            vecSplitCodes, cCatSplitsOld,
                                            dShrinkage);
    if(GBM_FAILED(status))
    {
        return status;
    }

    // right subtree
    aiRightNode[iSelf] = iNodeID;
    status = pRightNode->TransferTreeToRList(iNodeID, pData,
                                             aiSplitVar, adSplitPoint,
                                             aiLeftNode, aiRightNode,
                                             aiMissingNode, adErrorReduction,
                                             adWeight, adPred,
                                             vecSplitCodes, cCatSplitsOld,
                                             dShrinkage);
    if(GBM_FAILED(status))
    {
        return status;
    }

    // missing-value subtree
    aiMissingNode[iSelf] = iNodeID;
    status = pMissingNode->TransferTreeToRList(iNodeID, pData,
                                               aiSplitVar, adSplitPoint,
                                               aiLeftNode, aiRightNode,
                                               aiMissingNode, adErrorReduction,
                                               adWeight, adPred,
                                               vecSplitCodes, cCatSplitsOld,
                                               dShrinkage);
    return status;
}
// R-callable entry point.
//
// Builds the dataset and distribution with gbm_setup(), initializes a CGBM,
// runs cTrees boosting iterations (one tree per class per iteration) and
// returns an R list with cResultComponents (7) elements:
//   0 rdInitF, 1 radF, 2 radTrainError, 3 radValidError, 4 radOOBagImprove,
//   5 rSetOfTrees, 6 rSetSplitCodes
// Each entry of rSetOfTrees is itself a list of cTreeComponents (8) vectors:
//   riSplitVar, rdSplitPoint, riLeftNode, riRightNode, riMissingNode,
//   rdErrorReduction, rdWeight, rdPred
//
// Error handling: any failing status jumps to Cleanup, which releases the
// C++ objects, writes the RNG state back if it was loaded and unprotects rAns
// only if it was protected. On failure the (possibly partially filled) list
// is still returned when it exists; R_NilValue is returned when the failure
// happened before the list was allocated.
SEXP gbm
(
    SEXP radY,           // outcome or response
    SEXP radOffset,      // offset for f(x), NA for no offset
    SEXP radX,
    SEXP raiXOrder,
    SEXP radWeight,
    SEXP radMisc,        // other row specific data (eg failure time), NA=no Misc
    SEXP rcRows,
    SEXP rcCols,
    SEXP racVarClasses,
    SEXP ralMonotoneVar,
    SEXP rszFamily,
    SEXP rcTrees,
    SEXP rcDepth,        // interaction depth
    SEXP rcMinObsInNode,
    SEXP rcNumClasses,
    SEXP rdShrinkage,
    SEXP rdBagFraction,
    SEXP rcTrain,
    SEXP radFOld,
    SEXP rcCatSplitsOld,
    SEXP rcTreesOld,
    SEXP rfVerbose
)
{
    // All locals are declared up front because the error paths below use goto
    // and must not jump across an initialization.
    unsigned long hr = 0;

    SEXP rAns = NULL;
    SEXP rNewTree = NULL;
    SEXP riSplitVar = NULL;
    SEXP rdSplitPoint = NULL;
    SEXP riLeftNode = NULL;
    SEXP riRightNode = NULL;
    SEXP riMissingNode = NULL;
    SEXP rdErrorReduction = NULL;
    SEXP rdWeight = NULL;
    SEXP rdPred = NULL;

    SEXP rdInitF = NULL;
    SEXP radF = NULL;
    SEXP radTrainError = NULL;
    SEXP radValidError = NULL;
    SEXP radOOBagImprove = NULL;

    SEXP rSetOfTrees = NULL;
    SEXP rSetSplitCodes = NULL;
    SEXP rSplitCode = NULL;

    VEC_VEC_CATEGORIES vecSplitCodes;

    int i = 0;
    int iT = 0;
    int iK = 0;
    int iTreeNumber = 0;
    int cTrees = INTEGER(rcTrees)[0];
    const int cResultComponents = 7;
    // rdInitF, radF, radTrainError, radValidError, radOOBagImprove
    // rSetOfTrees, rSetSplitCodes
    const int cTreeComponents = 8;
    // riSplitVar, rdSplitPoint, riLeftNode,
    // riRightNode, riMissingNode, rdErrorReduction, rdWeight, rdPred
    int cNodes = 0;
    int cTrain = INTEGER(rcTrain)[0];
    int cNumClasses = INTEGER(rcNumClasses)[0];

    double dTrainError = 0.0;
    double dValidError = 0.0;
    double dOOBagImprove = 0.0;

    CGBM *pGBM = NULL;
    CDataset *pData = NULL;
    CDistribution *pDist = NULL;

    // Track what Cleanup has to undo, so that an early failure does not
    // unprotect something that was never protected or skip PutRNGstate().
    bool fAnsProtected = false;
    bool fRNGStateLoaded = false;

    // set up the dataset
    pData = new CDataset();
    if(pData==NULL)
    {
        hr = GBM_OUTOFMEMORY;
        goto Error;
    }

    // initialize R's random number generator
    GetRNGstate();
    fRNGStateLoaded = true;

    // initialize some things
    hr = gbm_setup(REAL(radY),
                   REAL(radOffset),
                   REAL(radX),
                   INTEGER(raiXOrder),
                   REAL(radWeight),
                   REAL(radMisc),
                   INTEGER(rcRows)[0],
                   INTEGER(rcCols)[0],
                   INTEGER(racVarClasses),
                   INTEGER(ralMonotoneVar),
                   CHAR(STRING_ELT(rszFamily,0)),
                   INTEGER(rcTrees)[0],
                   INTEGER(rcDepth)[0],
                   INTEGER(rcMinObsInNode)[0],
                   INTEGER(rcNumClasses)[0],
                   REAL(rdShrinkage)[0],
                   REAL(rdBagFraction)[0],
                   INTEGER(rcTrain)[0],
                   pData,
                   pDist);
    if(GBM_FAILED(hr))
    {
        goto Error;
    }

    // allocate the GBM
    pGBM = new CGBM();
    if(pGBM==NULL)
    {
        hr = GBM_OUTOFMEMORY;
        goto Error;
    }

    // initialize the GBM
    hr = pGBM->Initialize(pData,
                          pDist,
                          REAL(rdShrinkage)[0],
                          cTrain,
                          REAL(rdBagFraction)[0],
                          INTEGER(rcDepth)[0],
                          INTEGER(rcMinObsInNode)[0],
                          INTEGER(rcNumClasses)[0]);
    if(GBM_FAILED(hr))
    {
        goto Error;
    }

    // allocate the main return object
    PROTECT(rAns = allocVector(VECSXP, cResultComponents));
    fAnsProtected = true;

    // allocate the initial value
    PROTECT(rdInitF = allocVector(REALSXP, 1));
    SET_VECTOR_ELT(rAns,0,rdInitF);
    UNPROTECT(1); // rdInitF

    // allocate the predictions
    PROTECT(radF = allocVector(REALSXP, (pData->cRows) * cNumClasses));
    SET_VECTOR_ELT(rAns,1,radF);
    UNPROTECT(1); // radF

    if(ISNA(REAL(radFOld)[0])) // check for old predictions
    {
        // set the initial value of F as a constant
        hr = pDist->InitF(pData->adY,
                          pData->adMisc,
                          pData->adOffset,
                          pData->adWeight,
                          REAL(rdInitF)[0],
                          cTrain);
        if(GBM_FAILED(hr))
        {
            goto Error;
        }

        for(i=0; i < (pData->cRows) * cNumClasses; i++)
        {
            REAL(radF)[i] = REAL(rdInitF)[0];
        }
    }
    else
    {
        for(i=0; i < (pData->cRows) * cNumClasses; i++)
        {
            REAL(radF)[i] = REAL(radFOld)[i];
        }
    }

    // allocate space for the performance measures
    PROTECT(radTrainError = allocVector(REALSXP, cTrees));
    PROTECT(radValidError = allocVector(REALSXP, cTrees));
    PROTECT(radOOBagImprove = allocVector(REALSXP, cTrees));
    SET_VECTOR_ELT(rAns,2,radTrainError);
    SET_VECTOR_ELT(rAns,3,radValidError);
    SET_VECTOR_ELT(rAns,4,radOOBagImprove);
    UNPROTECT(3); // radTrainError , radValidError, radOOBagImprove

    // allocate the component for the tree structures
    PROTECT(rSetOfTrees = allocVector(VECSXP, cTrees * cNumClasses));
    SET_VECTOR_ELT(rAns,5,rSetOfTrees);
    UNPROTECT(1); // rSetOfTrees

    if(INTEGER(rfVerbose)[0])
    {
        Rprintf("Iter TrainDeviance ValidDeviance StepSize Improve\n");
    }
    for(iT=0; iT<cTrees; iT++)
    {
        // Update the parameters
        hr = pDist->UpdateParams(REAL(radF),
                                 pData->adOffset,
                                 pData->adWeight,
                                 cTrain);
        if(GBM_FAILED(hr))
        {
            goto Error;
        }

        REAL(radTrainError)[iT] = 0.0;
        REAL(radValidError)[iT] = 0.0;
        REAL(radOOBagImprove)[iT] = 0.0;
        for (iK = 0; iK < cNumClasses; iK++)
        {
            hr = pGBM->iterate(REAL(radF),
                               dTrainError,dValidError,dOOBagImprove,
                               cNodes, cNumClasses, iK);
            if(GBM_FAILED(hr))
            {
                goto Error;
            }

            // store the performance measures
            REAL(radTrainError)[iT] += dTrainError;
            REAL(radValidError)[iT] += dValidError;
            REAL(radOOBagImprove)[iT] += dOOBagImprove;

            // allocate the new tree component for the R list structure
            PROTECT(rNewTree = allocVector(VECSXP, cTreeComponents));
            // riSplitVar,rdSplitPoint,riLeftNode,riRightNode,
            // riMissingNode,rdErrorReduction,rdWeight,rdPred
            PROTECT(riSplitVar = allocVector(INTSXP, cNodes));
            PROTECT(rdSplitPoint = allocVector(REALSXP, cNodes));
            PROTECT(riLeftNode = allocVector(INTSXP, cNodes));
            PROTECT(riRightNode = allocVector(INTSXP, cNodes));
            PROTECT(riMissingNode = allocVector(INTSXP, cNodes));
            PROTECT(rdErrorReduction = allocVector(REALSXP, cNodes));
            PROTECT(rdWeight = allocVector(REALSXP, cNodes));
            PROTECT(rdPred = allocVector(REALSXP, cNodes));
            SET_VECTOR_ELT(rNewTree,0,riSplitVar);
            SET_VECTOR_ELT(rNewTree,1,rdSplitPoint);
            SET_VECTOR_ELT(rNewTree,2,riLeftNode);
            SET_VECTOR_ELT(rNewTree,3,riRightNode);
            SET_VECTOR_ELT(rNewTree,4,riMissingNode);
            SET_VECTOR_ELT(rNewTree,5,rdErrorReduction);
            SET_VECTOR_ELT(rNewTree,6,rdWeight);
            SET_VECTOR_ELT(rNewTree,7,rdPred);
            UNPROTECT(cTreeComponents);
            SET_VECTOR_ELT(rSetOfTrees,(iK + iT * cNumClasses),rNewTree);
            UNPROTECT(1); // rNewTree

            // From here on the tree vectors are reachable through rAns, which
            // is the only object still on the protect stack.
            hr = gbm_transfer_to_R(pGBM,
                                   vecSplitCodes,
                                   INTEGER(riSplitVar),
                                   REAL(rdSplitPoint),
                                   INTEGER(riLeftNode),
                                   INTEGER(riRightNode),
                                   INTEGER(riMissingNode),
                                   REAL(rdErrorReduction),
                                   REAL(rdWeight),
                                   REAL(rdPred),
                                   INTEGER(rcCatSplitsOld)[0]);
            if(GBM_FAILED(hr))
            {
                goto Error;
            }
        } // Close for iK

        // print the information: first ten iterations, every iteration whose
        // overall tree number is a multiple of 100, and the last one
        iTreeNumber = iT + 1 + INTEGER(rcTreesOld)[0];
        if((iT <= 9) ||
           (iTreeNumber % 100 == 0) ||
           (iT==cTrees-1))
        {
            // NOTE(review): if the user interrupts, R_CheckUserInterrupt()
            // does not return here, so Cleanup below is skipped and pGBM,
            // pDist and pData are not released -- confirm and consider
            // handling interrupts explicitly.
            R_CheckUserInterrupt();
            if(INTEGER(rfVerbose)[0])
            {
                Rprintf("%6d %13.4f %15.4f %10.4f %9.4f\n",
                        iTreeNumber,
                        REAL(radTrainError)[iT],
                        REAL(radValidError)[iT],
                        REAL(rdShrinkage)[0],
                        REAL(radOOBagImprove)[iT]);
            }
        }
    }

    if(INTEGER(rfVerbose)[0]) Rprintf("\n");

    // transfer categorical splits to R
    PROTECT(rSetSplitCodes = allocVector(VECSXP, vecSplitCodes.size()));
    SET_VECTOR_ELT(rAns,6,rSetSplitCodes);
    UNPROTECT(1); // rSetSplitCodes

    for(i=0; i<(int)vecSplitCodes.size(); i++)
    {
        PROTECT(rSplitCode =
                    allocVector(INTSXP, size_of_vector(vecSplitCodes,i)));
        SET_VECTOR_ELT(rSetSplitCodes,i,rSplitCode);
        UNPROTECT(1); // rSplitCode

        hr = gbm_transfer_catsplits_to_R(i,
                                         vecSplitCodes,
                                         INTEGER(rSplitCode));
        if(GBM_FAILED(hr))
        {
            goto Error;
        }
    }

Cleanup:
    // dump random number generator seed; done before rAns is unprotected,
    // matching the order used on the success path
    if(fRNGStateLoaded)
    {
        #ifdef NOISY_DEBUG
        Rprintf("PutRNGstate\n");
        #endif
        PutRNGstate();
        fRNGStateLoaded = false;
    }

    if(fAnsProtected)
    {
        UNPROTECT(1); // rAns
        fAnsProtected = false;
    }

    #ifdef NOISY_DEBUG
    Rprintf("destructing\n");
    #endif

    if(pGBM != NULL)
    {
        delete pGBM;
        pGBM = NULL;
    }
    if(pDist != NULL)
    {
        delete pDist;
        pDist = NULL;
    }
    if(pData != NULL)
    {
        delete pData;
        pData = NULL;
    }

    // Never hand a C NULL back to R as a SEXP.
    return (rAns != NULL) ? rAns : R_NilValue;
Error:
    goto Cleanup;
}
// Stores the fitting parameters on this object and allocates the working
// structures used by iterate().
//
//   pData, pDist    must both be non-NULL, otherwise GBM_INVALIDARG is
//                   returned and nothing is modified
//   dLambda, cTrain, dBagFraction, cDepth, cMinObsInNode, cGroups
//                   copied into the members of the same name
//   cNumClasses     used only to size adZ and adFadj
//                   (pData->cRows * cNumClasses entries, zero-filled)
//
// Returns the node factory's Initialize() status when that fails (in which
// case fInitialized is not set); GBM_OK otherwise.
GBMRESULT CGBM::Initialize
(
    CDataset *pData,
    CDistribution *pDist,
    double dLambda,
    unsigned long cTrain,
    double dBagFraction,
    unsigned long cDepth,
    unsigned long cMinObsInNode,
    unsigned long cNumClasses,
    int cGroups
)
{
    if((pData == NULL) || (pDist == NULL))
    {
        return GBM_INVALIDARG;
    }

    // The search array and the terminal-node vector are both sized 2*cDepth+1.
    const unsigned long cNodeSlots = 2*cDepth+1;

    this->pData         = pData;
    this->pDist         = pDist;
    this->dLambda       = dLambda;
    this->cTrain        = cTrain;
    this->dBagFraction  = dBagFraction;
    this->cDepth        = cDepth;
    this->cMinObsInNode = cMinObsInNode;
    this->cGroups       = cGroups;

    // allocate the tree structure
    ptreeTemp = new CCARTTree;

    // rows beyond the first cTrain are counted as validation rows
    cValid = pData->cRows - cTrain;
    cTotalInBag = (unsigned long)(dBagFraction*cTrain);

    adZ.assign((pData->cRows) * cNumClasses, 0);
    adFadj.assign((pData->cRows) * cNumClasses, 0);

    pNodeFactory = new CNodeFactory();
    GBMRESULT status = pNodeFactory->Initialize(cDepth);
    if(GBM_FAILED(status))
    {
        return status;
    }
    ptreeTemp->Initialize(pNodeFactory);

    // array for flagging those observations in the bag
    afInBag = new bool[cTrain];

    // aiNodeAssign tracks to which node each training obs belongs
    aiNodeAssign.resize(cTrain);

    // NodeSearch objects help decide which nodes to split
    aNodeSearch = new CNodeSearch[cNodeSlots];
    for(unsigned long iSlot = 0; iSlot < cNodeSlots; iSlot++)
    {
        aNodeSearch[iSlot].Initialize(cMinObsInNode);
    }
    vecpTermNodes.resize(cNodeSlots,NULL);

    fInitialized = true;
    return status;
}
// Performs one boosting step for class cClassIdx.
//
//   adF            in/out predictions; the entries for this class start at
//                  offset cClassIdx*(cTrain+cValid)
//   dTrainError    out; deviance over the cTrain training rows after the update
//   dValidError    out; deviance over the cValid validation rows
//   dOOBagImprove  out; only computed when cClassIdx == cNumClasses-1,
//                  0.0 otherwise
//   cNodes         out; node count of the tree grown in this step
//   cNumClasses    number of classes
//   cClassIdx      class being updated; the bag is re-drawn only for class 0
//
// Steps: draw the bag, compute the working response, reset and grow the tree,
// fit the best constant per terminal node, adjust the tree, update the
// training predictions by dLambda*adFadj, then predict and update the
// validation rows.
//
// Returns GBM_FAIL if Initialize() has not completed, otherwise the first
// failing status from the steps above, or GBM_OK.
GBMRESULT CGBM::iterate
(
    double *adF,
    double &dTrainError,
    double &dValidError,
    double &dOOBagImprove,
    int &cNodes,
    int cNumClasses,
    int cClassIdx
)
{
    GBMRESULT hr = GBM_OK;
    unsigned long i = 0;
    unsigned long cBagged = 0;
    int cIdxOff = cClassIdx * (cTrain + cValid);

    if(!fInitialized)
    {
        hr = GBM_FAIL;
        goto Error;
    }

    dTrainError = 0.0;
    dValidError = 0.0;
    dOOBagImprove = 0.0;

    vecpTermNodes.assign(2*cDepth+1,NULL);

    // randomly assign observations to the Bag
    if (cClassIdx == 0)
    {
        if (!IsPairwise())
        {
            // regular instance based training: every index in [0, cTrain) is
            // visited, so nothing is left to clear afterwards
            for(i=0; i<cTrain; i++)
            {
                if(unif_rand()*(cTrain-i) < cTotalInBag-cBagged)
                {
                    afInBag[i] = true;
                    cBagged++;
                }
                else
                {
                    afInBag[i] = false;
                }
            }
        }
        else
        {
            // for pairwise training, sampling is per group
            // therefore, we will not have exactly cTotalInBag instances
            double dLastGroup = -1;
            bool chosen = false;
            unsigned int cBaggedGroups = 0;
            unsigned int cSeenGroups = 0;
            unsigned int cTotalGroupsInBag =
                (unsigned long)(dBagFraction * cGroups);
            if (cTotalGroupsInBag == 0)
            {
                // always keep at least one group in the bag
                cTotalGroupsInBag = 1;
            }

            for(i=0; i<cTrain; i++)
            {
                // the group id of each row is read from adMisc
                const double dGroup = pData->adMisc[i];
                if (dGroup != dLastGroup)
                {
                    if (cBaggedGroups >= cTotalGroupsInBag)
                    {
                        break;
                    }

                    // Group changed, make a new decision
                    chosen = (unif_rand()*(cGroups - cSeenGroups) <
                              cTotalGroupsInBag - cBaggedGroups);
                    if (chosen)
                    {
                        cBaggedGroups++;
                    }
                    dLastGroup = dGroup;
                    cSeenGroups++;
                }
                if (chosen)
                {
                    afInBag[i] = true;
                    cBagged++;
                }
                else
                {
                    afInBag[i] = false;
                }
            }
            // the remainder is not in the bag
            std::fill(afInBag + i, afInBag + cTrain, false);
        }
    }

#ifdef NOISY_DEBUG
    Rprintf("Compute working response\n");
#endif
    hr = pDist->ComputeWorkingResponse(pData->adY,
                                       pData->adMisc,
                                       pData->adOffset,
                                       adF,
                                       &adZ[0],
                                       pData->adWeight,
                                       afInBag,
                                       cTrain,
                                       cIdxOff);
    if(GBM_FAILED(hr))
    {
        goto Error;
    }

#ifdef NOISY_DEBUG
    Rprintf("Reset tree\n");
#endif
    hr = ptreeTemp->Reset();
    // A failed reset must not be overwritten by the status of grow().
    if(GBM_FAILED(hr))
    {
        goto Error;
    }

#ifdef NOISY_DEBUG
    Rprintf("grow tree\n");
#endif
    hr = ptreeTemp->grow(&(adZ[cIdxOff]), pData,
                         &(pData->adWeight[cIdxOff]),
                         &(adFadj[cIdxOff]),
                         cTrain, cTotalInBag, dLambda, cDepth,
                         cMinObsInNode,
                         afInBag,
                         aiNodeAssign, aNodeSearch, vecpTermNodes);
    if(GBM_FAILED(hr))
    {
        goto Error;
    }

#ifdef NOISY_DEBUG
    Rprintf("get node count\n");
#endif
    hr = ptreeTemp->GetNodeCount(cNodes);
    if(GBM_FAILED(hr))
    {
        goto Error;
    }

    // Now I have adF, adZ, and vecpTermNodes (new node assignments)
    // Fit the best constant within each terminal node
#ifdef NOISY_DEBUG
    Rprintf("fit best constant\n");
#endif
    hr = pDist->FitBestConstant(pData->adY,
                                pData->adMisc,
                                pData->adOffset,
                                pData->adWeight,
                                &adF[0],
                                &adZ[0],
                                aiNodeAssign,
                                cTrain,
                                vecpTermNodes,
                                (2*cNodes+1)/3, // number of terminal nodes
                                cMinObsInNode,
                                afInBag,
                                &adFadj[0],
                                cIdxOff);
    if(GBM_FAILED(hr))
    {
        goto Error;
    }

    // update training predictions
    // fill in missing nodes where N < cMinObsInNode
    hr = ptreeTemp->Adjust(aiNodeAssign,&(adFadj[cIdxOff]),cTrain,
                           vecpTermNodes,cMinObsInNode);
    if(GBM_FAILED(hr))
    {
        goto Error;
    }
    ptreeTemp->SetShrinkage(dLambda);

    if (cClassIdx == (cNumClasses - 1))
    {
        dOOBagImprove = pDist->BagImprovement(pData->adY,
                                              pData->adMisc,
                                              pData->adOffset,
                                              pData->adWeight,
                                              &adF[0],
                                              &adFadj[0],
                                              afInBag,
                                              dLambda,
                                              cTrain);
    }

    // update the training predictions
    for(i=0; i < cTrain; i++)
    {
        int iIdx = i + cIdxOff;
        adF[iIdx] += dLambda * adFadj[iIdx];
    }

    dTrainError = pDist->Deviance(pData->adY,
                                  pData->adMisc,
                                  pData->adOffset,
                                  pData->adWeight,
                                  adF,
                                  cTrain,
                                  cIdxOff);

    // update the validation predictions
    hr = ptreeTemp->PredictValid(pData,cValid,&(adFadj[cIdxOff]));
    // Do not fold the contents of adFadj into adF when the prediction failed.
    if(GBM_FAILED(hr))
    {
        goto Error;
    }

    for(i=cTrain; i < cTrain+cValid; i++)
    {
        adF[i + cIdxOff] += adFadj[i + cIdxOff];
    }

    if(pData->fHasOffset)
    {
        dValidError = pDist->Deviance(pData->adY,
                                      pData->adMisc,
                                      pData->adOffset,
                                      pData->adWeight,
                                      adF,
                                      cValid,
                                      cIdxOff + cTrain);
    }
    else
    {
        dValidError = pDist->Deviance(pData->adY,
                                      pData->adMisc,
                                      NULL,
                                      pData->adWeight,
                                      adF,
                                      cValid,
                                      cIdxOff + cTrain);
    }

Cleanup:
    return hr;
Error:
    goto Cleanup;
}
unsigned long gbm_setup ( double *adY, double *adOffset, double *adX, int *aiXOrder, double *adWeight, double *adMisc, int cRows, int cCols, int *acVarClasses, int *alMonotoneVar, const char *pszFamily, int cTrees, int cDepth, int cMinObsInNode, int cNumClasses, double dShrinkage, double dBagFraction, int cTrain, CDataset *pData, PCDistribution &pDist ) { unsigned long hr = 0; hr = pData->SetData(adX,aiXOrder,adY,adOffset,adWeight,adMisc, cRows,cCols,acVarClasses,alMonotoneVar); if(GBM_FAILED(hr)) { goto Error; } // set the distribution if(strncmp(pszFamily,"bernoulli",2) == 0) { pDist = new CBernoulli(); if(pDist==NULL) { hr = GBM_OUTOFMEMORY; goto Error; } } else if(strncmp(pszFamily,"gaussian",2) == 0) { pDist = new CGaussian(); if(pDist==NULL) { hr = GBM_OUTOFMEMORY; goto Error; } } else if(strncmp(pszFamily,"poisson",2) == 0) { pDist = new CPoisson(); if(pDist==NULL) { hr = GBM_OUTOFMEMORY; goto Error; } } else if(strncmp(pszFamily,"adaboost",2) == 0) { pDist = new CAdaBoost(); if(pDist==NULL) { hr = GBM_OUTOFMEMORY; goto Error; } } else if(strncmp(pszFamily,"coxph",2) == 0) { pDist = new CCoxPH(); if(pDist==NULL) { hr = GBM_OUTOFMEMORY; goto Error; } } else if(strncmp(pszFamily,"laplace",2) == 0) { pDist = new CLaplace(); if(pDist==NULL) { hr = GBM_OUTOFMEMORY; goto Error; } } else if(strncmp(pszFamily,"quantile",2) == 0) { pDist = new CQuantile(adMisc[0]); if(pDist==NULL) { hr = GBM_OUTOFMEMORY; goto Error; } } else if(strncmp(pszFamily,"tdist",2) == 0) { pDist = new CTDist(adMisc[0]); if(pDist==NULL) { hr = GBM_OUTOFMEMORY; goto Error; } } else if(strncmp(pszFamily,"multinomial",2) == 0) { pDist = new CMultinomial(cNumClasses, cRows); if(pDist==NULL) { hr = GBM_OUTOFMEMORY; goto Error; } } else if(strncmp(pszFamily,"huberized",2) == 0) { pDist = new CHuberized(); if(pDist==NULL) { hr = GBM_OUTOFMEMORY; goto Error; } } if(pDist==NULL) { hr = GBM_INVALIDARG; goto Error; } Cleanup: return hr; Error: goto Cleanup; }
//------------------------------------------------------------------------------ // Grows a regression tree //------------------------------------------------------------------------------ GBMRESULT CCARTTree::grow ( double *adZ, CDataset *pData, double *adW, double *adF, unsigned long nTrain, unsigned long nBagged, double dLambda, unsigned long cMaxDepth, unsigned long cMinObsInNode, bool *afInBag, unsigned long *aiNodeAssign, CNodeSearch *aNodeSearch, VEC_P_NODETERMINAL &vecpTermNodes ) { GBMRESULT hr = GBM_OK; #ifdef NOISY_DEBUG Rprintf("Growing tree\n"); #endif if((adZ==NULL) || (pData==NULL) || (adW==NULL) || (adF==NULL) || (cMaxDepth < 1)) { hr = GBM_INVALIDARG; goto Error; } dSumZ = 0.0; dSumZ2 = 0.0; dTotalW = 0.0; #ifdef NOISY_DEBUG Rprintf("initial tree calcs\n"); #endif for(iObs=0; iObs<nTrain; iObs++) { // aiNodeAssign tracks to which node each training obs belongs aiNodeAssign[iObs] = 0; if(afInBag[iObs]) { // get the initial sums and sum of squares and total weight dSumZ += adW[iObs]*adZ[iObs]; dSumZ2 += adW[iObs]*adZ[iObs]*adZ[iObs]; dTotalW += adW[iObs]; } } dError = dSumZ2-dSumZ*dSumZ/dTotalW; pInitialRootNode = pNodeFactory->GetNewNodeTerminal(); pInitialRootNode->dPrediction = dSumZ/dTotalW; pInitialRootNode->dTrainW = dTotalW; vecpTermNodes.resize(2*cMaxDepth + 1,NULL); // accounts for missing nodes vecpTermNodes[0] = pInitialRootNode; pRootNode = pInitialRootNode; aNodeSearch[0].Set(dSumZ,dTotalW,nBagged, pInitialRootNode, &pRootNode, pNodeFactory); // build the tree structure #ifdef NOISY_DEBUG Rprintf("Building tree 1 "); #endif cTotalNodeCount = 1; cTerminalNodes = 1; for(cDepth=0; cDepth<cMaxDepth; cDepth++) { #ifdef NOISY_DEBUG Rprintf("%d ",cDepth); #endif hr = GetBestSplit(pData, nTrain, aNodeSearch, cTerminalNodes, aiNodeAssign, afInBag, adZ, adW, iBestNode, dBestNodeImprovement); if(GBM_FAILED(hr)) { goto Error; } if(dBestNodeImprovement == 0.0) { break; } // setup the new nodes and add them to the tree hr = 
aNodeSearch[iBestNode].SetupNewNodes(pNewSplitNode, pNewLeftNode, pNewRightNode, pNewMissingNode); cTotalNodeCount += 3; cTerminalNodes += 2; vecpTermNodes[iBestNode] = pNewLeftNode; vecpTermNodes[cTerminalNodes-2] = pNewRightNode; vecpTermNodes[cTerminalNodes-1] = pNewMissingNode; // assign observations to the correct node for(iObs=0; iObs < nTrain; iObs++) { iWhichNode = aiNodeAssign[iObs]; if(iWhichNode==iBestNode) { schWhichNode = pNewSplitNode->WhichNode(pData,iObs); if(schWhichNode == 1) // goes right { aiNodeAssign[iObs] = cTerminalNodes-2; } else if(schWhichNode == 0) // is missing { aiNodeAssign[iObs] = cTerminalNodes-1; } // those to the left stay with the same node assignment } } // set up the node search for the new right node aNodeSearch[cTerminalNodes-2].Set(aNodeSearch[iBestNode].dBestRightSumZ, aNodeSearch[iBestNode].dBestRightTotalW, aNodeSearch[iBestNode].cBestRightN, pNewRightNode, &(pNewSplitNode->pRightNode), pNodeFactory); // set up the node search for the new missing node aNodeSearch[cTerminalNodes-1].Set(aNodeSearch[iBestNode].dBestMissingSumZ, aNodeSearch[iBestNode].dBestMissingTotalW, aNodeSearch[iBestNode].cBestMissingN, pNewMissingNode, &(pNewSplitNode->pMissingNode), pNodeFactory); // set up the node search for the new left node // must be done second since we need info for right node first aNodeSearch[iBestNode].Set(aNodeSearch[iBestNode].dBestLeftSumZ, aNodeSearch[iBestNode].dBestLeftTotalW, aNodeSearch[iBestNode].cBestLeftN, pNewLeftNode, &(pNewSplitNode->pLeftNode), pNodeFactory); } // end tree growing // DEBUG // Print(); Cleanup: return hr; Error: goto Cleanup; }
// Evaluates every predictor for every current terminal node and reports the
// node with the largest improvement.
//
//   pData                 supplies cCols, acVarClasses, aiXOrder, adX and
//                         alMonotoneVar
//   nTrain                number of training observations
//   aNodeSearch           one search object per terminal node
//   cTerminalNodes        number of entries of aNodeSearch in use
//   aiNodeAssign          terminal-node index of each observation
//   afInBag               only observations flagged true are incorporated
//   adZ, adW              response and weight per observation
//   iBestNode             out; index of the node with the best improvement
//                         (0 when no node improves)
//   dBestNodeImprovement  out; that improvement (0.0 when no node improves)
//
// Returns the first failing status from ResetForNewVar(), IncorporateObs() or
// EvaluateCategoricalSplit(); GBM_OK otherwise.
GBMRESULT CCARTTree::GetBestSplit
(
    CDataset *pData,
    unsigned long nTrain,
    CNodeSearch *aNodeSearch,
    unsigned long cTerminalNodes,
    unsigned long *aiNodeAssign,
    bool *afInBag,
    double *adZ,
    double *adW,
    unsigned long &iBestNode,
    double &dBestNodeImprovement
)
{
    GBMRESULT hr = GBM_OK;

    int iVar = 0;
    unsigned long iNode = 0;
    unsigned long iOrderObs = 0;
    unsigned long iWhichObs = 0;
    unsigned long cVarClasses = 0;
    double dX = 0.0;

    for(iVar=0; iVar < pData->cCols; iVar++)
    {
        cVarClasses = pData->acVarClasses[iVar];

        for(iNode=0; iNode < cTerminalNodes; iNode++)
        {
            hr = aNodeSearch[iNode].ResetForNewVar(iVar,cVarClasses);
            // previously this status was overwritten without being looked at
            if(GBM_FAILED(hr))
            {
                goto Error;
            }
        }

        // distribute the observations in order to the correct node search
        for(iOrderObs=0; iOrderObs < nTrain; iOrderObs++)
        {
            // aiXOrder holds, per variable, the observation indices in the
            // order they are to be visited
            iWhichObs = pData->aiXOrder[iVar*nTrain + iOrderObs];
            if(afInBag[iWhichObs])
            {
                iNode = aiNodeAssign[iWhichObs];
                dX = pData->adX[iVar*(pData->cRows) + iWhichObs];
                hr = aNodeSearch[iNode].IncorporateObs
                     (dX,
                      adZ[iWhichObs],
                      adW[iWhichObs],
                      pData->alMonotoneVar[iVar]);
                if(GBM_FAILED(hr))
                {
                    goto Error;
                }
            }
        }
        for(iNode=0; iNode<cTerminalNodes; iNode++)
        {
            if(cVarClasses != 0) // evaluate if categorical split
            {
                hr = aNodeSearch[iNode].EvaluateCategoricalSplit();
                // previously only the very last call's status could reach
                // the caller, and only by accident
                if(GBM_FAILED(hr))
                {
                    goto Error;
                }
            }
            aNodeSearch[iNode].WrapUpCurrentVariable();
        }
    }

    // search for the best split
    iBestNode = 0;
    dBestNodeImprovement = 0.0;
    for(iNode=0; iNode<cTerminalNodes; iNode++)
    {
        aNodeSearch[iNode].SetToSplit();
        if(aNodeSearch[iNode].BestImprovement() > dBestNodeImprovement)
        {
            iBestNode = iNode;
            dBestNodeImprovement = aNodeSearch[iNode].BestImprovement();
        }
    }

Cleanup:
    return hr;
Error:
    goto Cleanup;
}
// Performs one boosting step (single-output variant).
//
//   adF            in/out predictions for the cTrain training rows followed
//                  by the cValid validation rows
//   dTrainError    out; deviance over the training rows after the update
//   dValidError    out; deviance over the validation rows
//   dOOBagImprove  out; value returned by the distribution's BagImprovement()
//   cNodes         out; node count of the tree grown in this step
//
// Steps: draw the bag, compute the working response, reset and grow the tree,
// fit the best constant per terminal node, adjust the tree, update the
// training predictions by dLambda*adFadj, then predict and update the
// validation rows.
//
// Returns GBM_FAIL if Initialize() has not completed, otherwise the first
// failing status from the steps above, or GBM_OK.
GBMRESULT CGBM::iterate
(
    double *adF,
    double &dTrainError,
    double &dValidError,
    double &dOOBagImprove,
    int &cNodes
)
{
    GBMRESULT hr = GBM_OK;
    unsigned long i = 0;
    unsigned long cBagged = 0;

    if(!fInitialized)
    {
        hr = GBM_FAIL;
        goto Error;
    }

    dTrainError = 0.0;
    dValidError = 0.0;
    dOOBagImprove = 0.0;

    vecpTermNodes.assign(2*cDepth+1,NULL);

    // randomly assign observations to the Bag
    for(i=0; i<cTrain; i++)
    {
        // the acceptance probability is (slots left)/(observations left)
        if(unif_rand()*(cTrain-i) < cTotalInBag-cBagged)
        {
            afInBag[i] = true;
            cBagged++;
        }
        else
        {
            afInBag[i] = false;
        }
    }

#ifdef NOISY_DEBUG
    Rprintf("Compute working response\n");
#endif
    hr = pDist->ComputeWorkingResponse(pData->adY,
                                       pData->adMisc,
                                       pData->adOffset,
                                       adF,
                                       adZ,
                                       pData->adWeight,
                                       afInBag,
                                       cTrain);
    if(GBM_FAILED(hr))
    {
        goto Error;
    }

#ifdef NOISY_DEBUG
    Rprintf("Reset tree\n");
#endif
    hr = ptreeTemp->Reset();
    // A failed reset must not be overwritten by the status of grow().
    if(GBM_FAILED(hr))
    {
        goto Error;
    }

#ifdef NOISY_DEBUG
    Rprintf("grow tree\n");
#endif
    hr = ptreeTemp->grow(adZ,pData,pData->adWeight,adFadj,
                         cTrain,cTotalInBag,dLambda,cDepth,
                         cMinObsInNode,
                         afInBag,
                         aiNodeAssign,aNodeSearch,vecpTermNodes);
    if(GBM_FAILED(hr))
    {
        goto Error;
    }

#ifdef NOISY_DEBUG
    Rprintf("get node count\n");
#endif
    hr = ptreeTemp->GetNodeCount(cNodes);
    if(GBM_FAILED(hr))
    {
        goto Error;
    }

    // Now I have adF, adZ, and vecpTermNodes (new node assignments)
    // Fit the best constant within each terminal node
#ifdef NOISY_DEBUG
    Rprintf("fit best constant\n");
#endif
    hr = pDist->FitBestConstant(pData->adY,
                                pData->adMisc,
                                pData->adOffset,
                                pData->adWeight,
                                adF,
                                adZ,
                                aiNodeAssign,
                                cTrain,
                                vecpTermNodes,
                                (2*cNodes+1)/3, // number of terminal nodes
                                cMinObsInNode,
                                afInBag,
                                adFadj);
    if(GBM_FAILED(hr))
    {
        goto Error;
    }

    // update training predictions
    // fill in missing nodes where N < cMinObsInNode
    hr = ptreeTemp->Adjust(aiNodeAssign,adFadj,cTrain,
                           vecpTermNodes,cMinObsInNode);
    if(GBM_FAILED(hr))
    {
        goto Error;
    }
    ptreeTemp->SetShrinkage(dLambda);

    dOOBagImprove = pDist->BagImprovement(pData->adY,
                                          pData->adMisc,
                                          pData->adOffset,
                                          pData->adWeight,
                                          adF,
                                          adFadj,
                                          afInBag,
                                          dLambda,
                                          cTrain);

    // update the training predictions
    for(i=0; i < cTrain; i++)
    {
        adF[i] += dLambda*adFadj[i];
    }
    dTrainError = pDist->Deviance(pData->adY,
                                  pData->adMisc,
                                  pData->adOffset,
                                  pData->adWeight,
                                  adF,
                                  cTrain);

    // update the validation predictions
    hr = ptreeTemp->PredictValid(pData,cValid,adFadj);
    // Do not fold the contents of adFadj into adF when the prediction failed.
    if(GBM_FAILED(hr))
    {
        goto Error;
    }

    for(i=cTrain; i < cTrain+cValid; i++)
    {
        adF[i] += adFadj[i];
    }

    // the validation rows start at index cTrain of every per-row array
    if(pData->fHasOffset)
    {
        dValidError =
            pDist->Deviance(&(pData->adY[cTrain]),
                            &(pData->adMisc[cTrain]),
                            &(pData->adOffset[cTrain]),
                            &(pData->adWeight[cTrain]),
                            &(adF[cTrain]),
                            cValid);
    }
    else
    {
        dValidError =
            pDist->Deviance(&(pData->adY[cTrain]),
                            &(pData->adMisc[cTrain]),
                            NULL,
                            &(pData->adWeight[cTrain]),
                            &(adF[cTrain]),
                            cValid);
    }

Cleanup:
    return hr;
Error:
    goto Cleanup;
}