Example #1
0
File: gbm.cpp Project: rforge/gbm
//------------------------------------------------------------------------------
// Forwards the caller's output arrays to pGBM->TransferTreeToRList and returns
// that call's result code unchanged.
//
// Note the argument order differs from this function's own signature:
// vecSplitCodes is passed to TransferTreeToRList after adPred, immediately
// before cCatSplitsOld.
//------------------------------------------------------------------------------
GBMRESULT gbm_transfer_to_R
(
    CGBM *pGBM,
    VEC_VEC_CATEGORIES &vecSplitCodes,
    int *aiSplitVar,
    double *adSplitPoint,
    int *aiLeftNode,
    int *aiRightNode,
    int *aiMissingNode,
    double *adErrorReduction,
    double *adWeight,
    double *adPred,
    int cCatSplitsOld
)
{
    // nothing to clean up here, so success and failure share one exit
    const GBMRESULT hrTransfer =
        pGBM->TransferTreeToRList(aiSplitVar,
                                  adSplitPoint,
                                  aiLeftNode,
                                  aiRightNode,
                                  aiMissingNode,
                                  adErrorReduction,
                                  adWeight,
                                  adPred,
                                  vecSplitCodes,
                                  cCatSplitsOld);
    return hrTransfer;
}
Example #2
0
//------------------------------------------------------------------------------
// Runs pRootNode->Adjust(cMinObsInNode) and then copies, for every training
// observation, the prediction of its assigned terminal node into adFadj.
//
//   aiNodeAssign   for each observation, an index into vecpTermNodes
//   adFadj         output; adFadj[i] receives the dPrediction of the node
//                  that observation i is assigned to
//   cTrain         number of observations (entries of aiNodeAssign / adFadj)
//   vecpTermNodes  terminal nodes, indexed by the values in aiNodeAssign
//   cMinObsInNode  forwarded unchanged to pRootNode->Adjust
//
// Returns the result code of pRootNode->Adjust. When that call fails adFadj
// is left untouched.
//------------------------------------------------------------------------------
GBMRESULT CCARTTree::Adjust
(
    unsigned long *aiNodeAssign,
    double *adFadj,
    unsigned long cTrain,
    VEC_P_NODETERMINAL &vecpTermNodes,
    unsigned long cMinObsInNode
)
{
    // the result code is held as GBMRESULT, matching the declared return type
    GBMRESULT hr = pRootNode->Adjust(cMinObsInNode);
    if(GBM_FAILED(hr))
    {
        return hr;
    }

    // predict for the training observations
    for(unsigned long iObs = 0; iObs < cTrain; iObs++)
    {
        adFadj[iObs] = vecpTermNodes[aiNodeAssign[iObs]]->dPrediction;
    }

    return hr;
}
Example #3
0
//------------------------------------------------------------------------------
// Recycles the current tree (if there is one) through pNodeFactory and clears
// the bookkeeping members used while growing a tree.
//
// Returns the result code of pRootNode->RecycleSelf, or GBM_OK when there is
// no root node. If recycling fails, the bookkeeping members are left as they
// were.
//------------------------------------------------------------------------------
GBMRESULT CCARTTree::Reset()
{
    GBMRESULT hr = GBM_OK;

    // give the old tree back to the factory before starting over
    if(pRootNode != NULL)
    {
        hr = pRootNode->RecycleSelf(pNodeFactory);
    }
    if(GBM_FAILED(hr))
    {
        return hr;
    }

    // forget every node handed out for the previous tree
    pInitialRootNode = NULL;
    pNewSplitNode    = NULL;
    pNewLeftNode     = NULL;
    pNewRightNode    = NULL;
    pNewMissingNode  = NULL;

    // clear the split-search state
    schWhichNode = 0;
    iBestNode = 0;
    dBestNodeImprovement = 0.0;

    return hr;
}
Example #4
0
//------------------------------------------------------------------------------
// Prints the working tree by delegating to ptreeTemp->Print() and returns
// that call's result code.
//------------------------------------------------------------------------------
GBMRESULT CGBM::PrintTree()
{
    return ptreeTemp->Print();
}
Example #5
0
//------------------------------------------------------------------------------
// Asks the root node to report variable relative influence into adRelInf.
//
// Returns GBM_OK without touching adRelInf when the tree has no root node;
// otherwise returns the root node's result code.
//------------------------------------------------------------------------------
GBMRESULT CCARTTree::GetVarRelativeInfluence
(
    double *adRelInf
)
{
    // an empty tree contributes nothing
    if(pRootNode == NULL)
    {
        return GBM_OK;
    }

    return pRootNode->GetVarRelativeInfluence(adRelInf);
}
Example #6
0
//------------------------------------------------------------------------------
// Writes this node into the flat per-node output arrays and then recurses
// into the left, right and missing children, in that order.
//
// iNodeID is a running slot counter shared with the recursive calls. This
// node uses the slot iNodeID holds on entry and then advances the counter.
// The index stored for each child in aiLeftNode / aiRightNode / aiMissingNode
// is the counter's value immediately before that child's call.
//
// adPred receives dShrinkage * dPrediction. pData, vecSplitCodes,
// cCatSplitsOld and dShrinkage are passed through to the children unchanged.
//
// Returns the first failing child's result code, otherwise the result code of
// the missing child's call.
//------------------------------------------------------------------------------
GBMRESULT CNodeContinuous::TransferTreeToRList
(
    int &iNodeID,
    CDataset *pData,
    int *aiSplitVar,
    double *adSplitPoint,
    int *aiLeftNode,
    int *aiRightNode,
    int *aiMissingNode,
    double *adErrorReduction,
    double *adWeight,
    double *adPred,
    VEC_VEC_CATEGORIES &vecSplitCodes,
    int cCatSplitsOld,
    double dShrinkage
)
{
    // remember our own slot; iNodeID keeps moving as the children are written
    const int iSelf = iNodeID;

    // this node's own record
    aiSplitVar[iSelf] = iSplitVar;
    adSplitPoint[iSelf] = dSplitValue;
    adErrorReduction[iSelf] = dImprovement;
    adWeight[iSelf] = dTrainW;
    adPred[iSelf] = dShrinkage*dPrediction;

    iNodeID++;

    // left child
    aiLeftNode[iSelf] = iNodeID;
    GBMRESULT hr = pLeftNode->TransferTreeToRList(iNodeID,
                                                  pData,
                                                  aiSplitVar,
                                                  adSplitPoint,
                                                  aiLeftNode,
                                                  aiRightNode,
                                                  aiMissingNode,
                                                  adErrorReduction,
                                                  adWeight,
                                                  adPred,
                                                  vecSplitCodes,
                                                  cCatSplitsOld,
                                                  dShrinkage);
    if(GBM_FAILED(hr))
    {
        return hr;
    }

    // right child
    aiRightNode[iSelf] = iNodeID;
    hr = pRightNode->TransferTreeToRList(iNodeID,
                                         pData,
                                         aiSplitVar,
                                         adSplitPoint,
                                         aiLeftNode,
                                         aiRightNode,
                                         aiMissingNode,
                                         adErrorReduction,
                                         adWeight,
                                         adPred,
                                         vecSplitCodes,
                                         cCatSplitsOld,
                                         dShrinkage);
    if(GBM_FAILED(hr))
    {
        return hr;
    }

    // missing-value child; its result is the overall result
    aiMissingNode[iSelf] = iNodeID;
    return pMissingNode->TransferTreeToRList(iNodeID,
                                             pData,
                                             aiSplitVar,
                                             adSplitPoint,
                                             aiLeftNode,
                                             aiRightNode,
                                             aiMissingNode,
                                             adErrorReduction,
                                             adWeight,
                                             adPred,
                                             vecSplitCodes,
                                             cCatSplitsOld,
                                             dShrinkage);
}
Example #7
0
//------------------------------------------------------------------------------
// Entry point called from R: runs cTrees boosting iterations (one tree per
// class per iteration) and returns the results as an R list.
//
// The returned list (rAns) has cResultComponents = 7 elements:
//   [0] rdInitF          initial value of F (length 1)
//   [1] radF             predictions, length cRows * cNumClasses
//   [2] radTrainError    per-iteration dTrainError, summed over classes
//   [3] radValidError    per-iteration dValidError, summed over classes
//   [4] radOOBagImprove  per-iteration dOOBagImprove, summed over classes
//   [5] rSetOfTrees      cTrees * cNumClasses trees; tree (iT, iK) is stored
//                        at index iK + iT * cNumClasses and is itself a list
//                        of cTreeComponents = 8 vectors of length cNodes
//   [6] rSetSplitCodes   one integer vector per entry of vecSplitCodes
//
// If radFOld[0] is NA, F is initialised to the constant computed by
// pDist->InitF; otherwise radF is copied from radFOld.
//
// Error handling: any failing step jumps to Cleanup, which restores R's RNG
// state (if it was loaded), unprotects rAns (if it was protected) and frees
// pGBM, pDist and pData. The function then returns whatever rAns holds at
// that point: NULL when the failure happened before the list was allocated,
// otherwise the partially filled list. hr itself is not reported to R.
// NOTE(review): returning a C NULL as a SEXP is presumably not something the
// R side can handle -- confirm how the R wrapper treats a failed fit.
//------------------------------------------------------------------------------
SEXP gbm
(
    SEXP radY,       // outcome or response
    SEXP radOffset,  // offset for f(x), NA for no offset
    SEXP radX,
    SEXP raiXOrder,
    SEXP radWeight,
    SEXP radMisc,   // other row specific data (eg failure time), NA=no Misc
    SEXP rcRows,
    SEXP rcCols,
    SEXP racVarClasses,
    SEXP ralMonotoneVar,
    SEXP rszFamily,
    SEXP rcTrees,
    SEXP rcDepth,       // interaction depth
    SEXP rcMinObsInNode,
    SEXP rcNumClasses,
    SEXP rdShrinkage,
    SEXP rdBagFraction,
    SEXP rcTrain,
    SEXP radFOld,
    SEXP rcCatSplitsOld,
    SEXP rcTreesOld,
    SEXP rfVerbose
)
{
    unsigned long hr = 0;

    SEXP rAns = NULL;
    SEXP rNewTree = NULL;
    SEXP riSplitVar = NULL;
    SEXP rdSplitPoint = NULL;
    SEXP riLeftNode = NULL;
    SEXP riRightNode = NULL;
    SEXP riMissingNode = NULL;
    SEXP rdErrorReduction = NULL;
    SEXP rdWeight = NULL;
    SEXP rdPred = NULL;

    SEXP rdInitF = NULL;
    SEXP radF = NULL;
    SEXP radTrainError = NULL;
    SEXP radValidError = NULL;
    SEXP radOOBagImprove = NULL;

    SEXP rSetOfTrees = NULL;
    SEXP rSetSplitCodes = NULL;
    SEXP rSplitCode = NULL;

    VEC_VEC_CATEGORIES vecSplitCodes;

    int i = 0;
    int iT = 0;
    int iK = 0;
    int cTrees = INTEGER(rcTrees)[0];
    const int cResultComponents = 7;
    // rdInitF, radF, radTrainError, radValidError, radOOBagImprove
    // rSetOfTrees, rSetSplitCodes
    const int cTreeComponents = 8;
    // riSplitVar, rdSplitPoint, riLeftNode,
    // riRightNode, riMissingNode, rdErrorReduction, rdWeight, rdPred
    int cNodes = 0;
    int cTrain = INTEGER(rcTrain)[0];
    int cNumClasses = INTEGER(rcNumClasses)[0];

    double dTrainError = 0.0;
    double dValidError = 0.0;
    double dOOBagImprove = 0.0;

    CGBM *pGBM = NULL;
    CDataset *pData = NULL;
    CDistribution *pDist = NULL;

    // Cleanup bookkeeping. Both flags are declared up here, ahead of the
    // first goto, so that no jump to Error/Cleanup crosses an initialisation.
    bool fRNGStateLoaded = false; // GetRNGstate() was called; PutRNGstate() is owed
    bool fAnsProtected = false;   // rAns is on R's protection stack

    // set up the dataset
    pData = new CDataset();
    if(pData==NULL)
    {
        hr = GBM_OUTOFMEMORY;
        goto Error;
    }

    // initialize R's random number generator
    GetRNGstate();
    fRNGStateLoaded = true;

    // initialize some things
    hr = gbm_setup(REAL(radY),
                   REAL(radOffset),
                   REAL(radX),
                   INTEGER(raiXOrder),
                   REAL(radWeight),
                   REAL(radMisc),
                   INTEGER(rcRows)[0],
                   INTEGER(rcCols)[0],
                   INTEGER(racVarClasses),
                   INTEGER(ralMonotoneVar),
                   CHAR(STRING_ELT(rszFamily,0)),
                   INTEGER(rcTrees)[0],
                   INTEGER(rcDepth)[0],
                   INTEGER(rcMinObsInNode)[0],
                   INTEGER(rcNumClasses)[0],
                   REAL(rdShrinkage)[0],
                   REAL(rdBagFraction)[0],
                   INTEGER(rcTrain)[0],
                   pData,
                   pDist);
    if(GBM_FAILED(hr))
    {
        goto Error;
    }

    // allocate the GBM
    pGBM = new CGBM();
    if(pGBM==NULL)
    {
        hr = GBM_OUTOFMEMORY;
        goto Error;
    }

    // initialize the GBM
    hr = pGBM->Initialize(pData,
                          pDist,
                          REAL(rdShrinkage)[0],
                          cTrain,
                          REAL(rdBagFraction)[0],
                          INTEGER(rcDepth)[0],
                          INTEGER(rcMinObsInNode)[0],
                          INTEGER(rcNumClasses)[0]);

    if(GBM_FAILED(hr))
    {
        goto Error;
    }

    // allocate the main return object; it stays protected until Cleanup and
    // keeps every vector stored inside it alive
    PROTECT(rAns = allocVector(VECSXP, cResultComponents));
    fAnsProtected = true;

    // allocate the initial value
    PROTECT(rdInitF = allocVector(REALSXP, 1));
    SET_VECTOR_ELT(rAns,0,rdInitF);
    UNPROTECT(1); // rdInitF

    // allocate the predictions
    PROTECT(radF = allocVector(REALSXP, (pData->cRows) * cNumClasses));
    SET_VECTOR_ELT(rAns,1,radF);
    UNPROTECT(1); // radF

    if(ISNA(REAL(radFOld)[0])) // check for old predictions
    {
        // set the initial value of F as a constant
        hr = pDist->InitF(pData->adY,
                          pData->adMisc,
                          pData->adOffset,
                          pData->adWeight,
                          REAL(rdInitF)[0],
                          cTrain);
        if(GBM_FAILED(hr))
        {
            // do not spread an unusable initial value over radF
            goto Error;
        }

        for(i=0; i < (pData->cRows) * cNumClasses; i++)
        {
            REAL(radF)[i] = REAL(rdInitF)[0];
        }
    }
    else
    {
        for(i=0; i < (pData->cRows) * cNumClasses; i++)
        {
            REAL(radF)[i] = REAL(radFOld)[i];
        }
    }

    // allocate space for the performance measures
    PROTECT(radTrainError = allocVector(REALSXP, cTrees));
    PROTECT(radValidError = allocVector(REALSXP, cTrees));
    PROTECT(radOOBagImprove = allocVector(REALSXP, cTrees));
    SET_VECTOR_ELT(rAns,2,radTrainError);
    SET_VECTOR_ELT(rAns,3,radValidError);
    SET_VECTOR_ELT(rAns,4,radOOBagImprove);
    UNPROTECT(3); // radTrainError , radValidError, radOOBagImprove

    // allocate the component for the tree structures
    PROTECT(rSetOfTrees = allocVector(VECSXP, cTrees * cNumClasses));
    SET_VECTOR_ELT(rAns,5,rSetOfTrees);
    UNPROTECT(1); // rSetOfTrees

    if(INTEGER(rfVerbose)[0])
    {
       Rprintf("Iter   TrainDeviance   ValidDeviance   StepSize   Improve\n");
    }
    for(iT=0; iT<cTrees; iT++)
    {
        // Update the parameters
        hr = pDist->UpdateParams(REAL(radF), pData->adOffset, pData->adWeight, cTrain);

        if(GBM_FAILED(hr))
        {
           goto Error;
        }
        REAL(radTrainError)[iT] = 0.0;
        REAL(radValidError)[iT] = 0.0;
        REAL(radOOBagImprove)[iT] = 0.0;
        for (iK = 0; iK < cNumClasses; iK++)
        {
            hr = pGBM->iterate(REAL(radF),
                               dTrainError,dValidError,dOOBagImprove,
                               cNodes, cNumClasses, iK);

            if(GBM_FAILED(hr))
            {
                goto Error;
            }

            // store the performance measures
            REAL(radTrainError)[iT] += dTrainError;
            REAL(radValidError)[iT] += dValidError;
            REAL(radOOBagImprove)[iT] += dOOBagImprove;

            // allocate the new tree component for the R list structure
            PROTECT(rNewTree = allocVector(VECSXP, cTreeComponents));
            // riSplitVar,rdSplitPoint,riLeftNode,riRightNode,
            // riMissingNode,rdErrorReduction,rdWeight,rdPred
            PROTECT(riSplitVar = allocVector(INTSXP, cNodes));
            PROTECT(rdSplitPoint = allocVector(REALSXP, cNodes));
            PROTECT(riLeftNode = allocVector(INTSXP, cNodes));
            PROTECT(riRightNode = allocVector(INTSXP, cNodes));
            PROTECT(riMissingNode = allocVector(INTSXP, cNodes));
            PROTECT(rdErrorReduction = allocVector(REALSXP, cNodes));
            PROTECT(rdWeight = allocVector(REALSXP, cNodes));
            PROTECT(rdPred = allocVector(REALSXP, cNodes));
            SET_VECTOR_ELT(rNewTree,0,riSplitVar);
            SET_VECTOR_ELT(rNewTree,1,rdSplitPoint);
            SET_VECTOR_ELT(rNewTree,2,riLeftNode);
            SET_VECTOR_ELT(rNewTree,3,riRightNode);
            SET_VECTOR_ELT(rNewTree,4,riMissingNode);
            SET_VECTOR_ELT(rNewTree,5,rdErrorReduction);
            SET_VECTOR_ELT(rNewTree,6,rdWeight);
            SET_VECTOR_ELT(rNewTree,7,rdPred);
            UNPROTECT(cTreeComponents);
            SET_VECTOR_ELT(rSetOfTrees,(iK + iT * cNumClasses),rNewTree);
            UNPROTECT(1); // rNewTree

            // the vectors are now reachable from rAns, so only rAns is still
            // on the protection stack when the tree is copied across
            hr = gbm_transfer_to_R(pGBM,
                                   vecSplitCodes,
                                   INTEGER(riSplitVar),
                                   REAL(rdSplitPoint),
                                   INTEGER(riLeftNode),
                                   INTEGER(riRightNode),
                                   INTEGER(riMissingNode),
                                   REAL(rdErrorReduction),
                                   REAL(rdWeight),
                                   REAL(rdPred),
                                   INTEGER(rcCatSplitsOld)[0]);
            if(GBM_FAILED(hr))
            {
                goto Error;
            }
        } // Close for iK

        // print the information: the first ten iterations, every iteration
        // whose overall number (including rcTreesOld) is a multiple of 100
        // (integer quotient equals floating-point quotient), and the last one
        if((iT <= 9) ||
           ((iT+1+INTEGER(rcTreesOld)[0])/100 ==
            (iT+1+INTEGER(rcTreesOld)[0])/100.0) ||
            (iT==cTrees-1))
        {
            // NOTE(review): if R_CheckUserInterrupt() does not return on an
            // interrupt, the Cleanup section below is skipped -- confirm
            // against the R API documentation.
            R_CheckUserInterrupt();
            if(INTEGER(rfVerbose)[0])
            {
               Rprintf("%6d %13.4f %15.4f %10.4f %9.4f\n",
                       iT+1+INTEGER(rcTreesOld)[0],
                       REAL(radTrainError)[iT],
                       REAL(radValidError)[iT],
                       REAL(rdShrinkage)[0],
                       REAL(radOOBagImprove)[iT]);
            }
        }
    }

    if(INTEGER(rfVerbose)[0]) Rprintf("\n");

    // transfer categorical splits to R
    PROTECT(rSetSplitCodes = allocVector(VECSXP, vecSplitCodes.size()));
    SET_VECTOR_ELT(rAns,6,rSetSplitCodes);
    UNPROTECT(1); // rSetSplitCodes

    for(i=0; i<(int)vecSplitCodes.size(); i++)
    {
        PROTECT(rSplitCode =
                    allocVector(INTSXP, size_of_vector(vecSplitCodes,i)));
        SET_VECTOR_ELT(rSetSplitCodes,i,rSplitCode);
        UNPROTECT(1); // rSplitCode

        hr = gbm_transfer_catsplits_to_R(i,
                                         vecSplitCodes,
                                         INTEGER(rSplitCode));
        if(GBM_FAILED(hr))
        {
            goto Error;
        }
    }

Cleanup:
    // dump random number generator seed; done here so that error paths that
    // loaded the RNG state also write it back
    if(fRNGStateLoaded)
    {
        #ifdef NOISY_DEBUG
        Rprintf("PutRNGstate\n");
        #endif
        PutRNGstate();
    }

    // only pop the protection stack if rAns was actually pushed onto it
    if(fAnsProtected)
    {
        UNPROTECT(1); // rAns
    }
    #ifdef NOISY_DEBUG
    Rprintf("destructing\n");
    #endif

    if(pGBM != NULL)
    {
        delete pGBM;
        pGBM = NULL;
    }
    if(pDist != NULL)
    {
        delete pDist;
        pDist = NULL;
    }
    if(pData != NULL)
    {
        delete pData;
        pData = NULL;
    }

    return rAns;
Error:
    goto Cleanup;
}
Example #8
0
//------------------------------------------------------------------------------
// Stores the fitting configuration on this object and allocates the working
// structures used by the boosting iterations.
//
//   pData          dataset; must not be NULL. pData->cRows is read here.
//   pDist          distribution object; must not be NULL
//   dLambda        stored in this->dLambda
//   cTrain         number of training rows; must not exceed pData->cRows.
//                  The remaining rows are counted as validation (cValid).
//   dBagFraction   cTotalInBag is set to dBagFraction*cTrain, truncated
//   cDepth         2*cDepth+1 is the size used for the node-search array
//                  and for vecpTermNodes
//   cMinObsInNode  stored, and passed to every CNodeSearch::Initialize
//   cNumClasses    adZ and adFadj are sized cRows * cNumClasses
//   cGroups        stored in this->cGroups
//
// Returns GBM_INVALIDARG for a NULL pData/pDist or for cTrain > cRows,
// the node factory's result code if its Initialize fails, and GBM_OK
// otherwise. fInitialized is set to true only on success.
//------------------------------------------------------------------------------
GBMRESULT CGBM::Initialize
(
    CDataset *pData,
    CDistribution *pDist,
    double dLambda,
    unsigned long cTrain,
    double dBagFraction,
    unsigned long cDepth,
    unsigned long cMinObsInNode,
    unsigned long cNumClasses,
    int cGroups
)
{
    GBMRESULT hr = GBM_OK;
    unsigned long i=0;
    // number of slots needed for the node-search and terminal-node arrays
    const unsigned long cNodeSlots = 2*cDepth+1;

    if(pData == NULL)
    {
        hr = GBM_INVALIDARG;
        goto Error;
    }
    if(pDist == NULL)
    {
        hr = GBM_INVALIDARG;
        goto Error;
    }
    if(cTrain > static_cast<unsigned long>(pData->cRows))
    {
        // cValid below is cRows - cTrain evaluated in unsigned arithmetic;
        // more training rows than rows would wrap it around instead of
        // going negative, so reject the request up front
        hr = GBM_INVALIDARG;
        goto Error;
    }

    this->pData = pData;
    this->pDist = pDist;
    this->dLambda = dLambda;
    this->cTrain = cTrain;
    this->dBagFraction = dBagFraction;
    this->cDepth = cDepth;
    this->cMinObsInNode = cMinObsInNode;
    this->cGroups = cGroups;

    // allocate the tree structure
    ptreeTemp = new CCARTTree;

    cValid = pData->cRows - cTrain;
    cTotalInBag = (unsigned long)(dBagFraction*cTrain);
    adZ.assign((pData->cRows) * cNumClasses, 0);
    adFadj.assign((pData->cRows) * cNumClasses, 0);

    pNodeFactory = new CNodeFactory();
    hr = pNodeFactory->Initialize(cDepth);
    if(GBM_FAILED(hr))
    {
        goto Error;
    }
    ptreeTemp->Initialize(pNodeFactory);

    // array for flagging those observations in the bag
    afInBag = new bool[cTrain];

    // aiNodeAssign tracks to which node each training obs belongs
    aiNodeAssign.resize(cTrain);
    // NodeSearch objects help decide which nodes to split
    aNodeSearch = new CNodeSearch[cNodeSlots];

    for(i=0; i<cNodeSlots; i++)
    {
        aNodeSearch[i].Initialize(cMinObsInNode);
    }
    vecpTermNodes.resize(cNodeSlots,NULL);

    fInitialized = true;

Cleanup:
    return hr;
Error:
    goto Cleanup;
}
Example #9
0
//------------------------------------------------------------------------------
// Runs one boosting step for class cClassIdx: fits a new tree to the working
// response and adds its contribution to adF.
//
//   adF            current predictions; the slice belonging to this class
//                  starts at cIdxOff = cClassIdx * (cTrain + cValid) and is
//                  updated in place
//   dTrainError    out: pDist->Deviance over the cTrain training rows
//   dValidError    out: pDist->Deviance over the cValid validation rows
//   dOOBagImprove  out: pDist->BagImprovement, computed only when cClassIdx
//                  is the last class (cNumClasses - 1); 0.0 otherwise
//   cNodes         out: node count reported by ptreeTemp->GetNodeCount
//   cNumClasses    number of classes
//   cClassIdx      index of the class handled by this call
//
// The bag (afInBag) is redrawn only when cClassIdx == 0; calls for the other
// classes reuse the flags already stored in afInBag.
//
// Returns GBM_FAIL if Initialize has not completed, otherwise the result code
// of the first failing step, or the last step's code when all succeed. The
// three error outputs are zeroed before any fitting work begins.
//------------------------------------------------------------------------------
GBMRESULT CGBM::iterate
(
    double *adF,
    double &dTrainError,
    double &dValidError,
    double &dOOBagImprove,
    int &cNodes,
    int cNumClasses,
    int cClassIdx
)
{
    GBMRESULT hr = GBM_OK;
    unsigned long i = 0;
    unsigned long cBagged = 0;
    // offset of this class's slice within the per-class stacked arrays
    int cIdxOff = cClassIdx * (cTrain + cValid);

    if(!fInitialized)
    {
        hr = GBM_FAIL;
        goto Error;
    }

    dTrainError = 0.0;
    dValidError = 0.0;
    dOOBagImprove = 0.0;

    vecpTermNodes.assign(2*cDepth+1,NULL);

    // randomly assign observations to the Bag

    if (cClassIdx == 0)
    {
        if (!IsPairwise())
        {
            // regular instance based training: observation i is selected
            // with probability (slots still to fill)/(observations left)
            for(i=0; i<cTrain; i++)
            {
                if(unif_rand()*(cTrain-i) < cTotalInBag-cBagged)
                {
                    afInBag[i] = true;
                    cBagged++;
                }
                else
                {
                    afInBag[i] = false;
                }
            }
            // the loop above has no early exit, so i == cTrain here and
            // this fill covers an empty range
            std::fill(afInBag + i, afInBag + cTrain, false);
        }
        else
        {
            // for pairwise training, sampling is per group
            // therefore, we will not have exactly cTotalInBag instances

            double dLastGroup = -1;
            bool chosen = false;
            unsigned int cBaggedGroups = 0;
            unsigned int cSeenGroups   = 0;
            unsigned int cTotalGroupsInBag = (unsigned long)(dBagFraction * cGroups);
            if (cTotalGroupsInBag <= 0)
            {
                // always bag at least one group
                cTotalGroupsInBag = 1;
            }
            for(i=0; i<cTrain; i++)
            {
                // adMisc[i] is read as the group of observation i; a change
                // of value between consecutive rows starts a new group
                const double dGroup = pData->adMisc[i];
                if (dGroup != dLastGroup)
                {
                    if (cBaggedGroups >= cTotalGroupsInBag)
                    {
                        break;
                    }

                    // Group changed, make a new decision
                    chosen = (unif_rand()*(cGroups - cSeenGroups) < cTotalGroupsInBag - cBaggedGroups);
                    if (chosen)
                    {
                        cBaggedGroups++;
                    }
                    dLastGroup = dGroup;
                    cSeenGroups++;
                }
                if (chosen)
                {
                    afInBag[i] = true;
                    cBagged++;
                }
                else
                {
                    afInBag[i] = false;
                }
            }
            // the remainder is not in the bag
            std::fill(afInBag + i, afInBag + cTrain, false);
        }
    }

#ifdef NOISY_DEBUG
    Rprintf("Compute working response\n");
#endif

    hr = pDist->ComputeWorkingResponse(pData->adY,
                                       pData->adMisc,
                                       pData->adOffset,
                                       adF,
                                       &adZ[0],
                                       pData->adWeight,
                                       afInBag,
                                       cTrain,
                                       cIdxOff);

    if(GBM_FAILED(hr))
    {
        goto Error;
    }

#ifdef NOISY_DEBUG
    Rprintf("Reset tree\n");
#endif
    hr = ptreeTemp->Reset();
    if(GBM_FAILED(hr))
    {
        // do not grow a new tree on top of one that failed to reset
        goto Error;
    }
#ifdef NOISY_DEBUG
    Rprintf("grow tree\n");
#endif

    hr = ptreeTemp->grow(&(adZ[cIdxOff]), pData, &(pData->adWeight[cIdxOff]),
                         &(adFadj[cIdxOff]), cTrain, cTotalInBag, dLambda, cDepth,
                         cMinObsInNode, afInBag, aiNodeAssign, aNodeSearch,
                         vecpTermNodes);

    if(GBM_FAILED(hr))
    {
        goto Error;
    }

#ifdef NOISY_DEBUG
    Rprintf("get node count\n");
#endif
    hr = ptreeTemp->GetNodeCount(cNodes);
    if(GBM_FAILED(hr))
    {
        goto Error;
    }

    // Now I have adF, adZ, and vecpTermNodes (new node assignments)
    // Fit the best constant within each terminal node
#ifdef NOISY_DEBUG
    Rprintf("fit best constant\n");
#endif

    hr = pDist->FitBestConstant(pData->adY,
                                pData->adMisc,
                                pData->adOffset,
                                pData->adWeight,
                                &adF[0],
                                &adZ[0],
                                aiNodeAssign,
                                cTrain,
                                vecpTermNodes,
                                (2*cNodes+1)/3, // number of terminal nodes
                                cMinObsInNode,
                                afInBag,
                                &adFadj[0],
                                cIdxOff);

    if(GBM_FAILED(hr))
    {
        goto Error;
    }

    // update training predictions
    // fill in missing nodes where N < cMinObsInNode
    hr = ptreeTemp->Adjust(aiNodeAssign,&(adFadj[cIdxOff]),cTrain,
                           vecpTermNodes,cMinObsInNode);
    if(GBM_FAILED(hr))
    {
        goto Error;
    }
    ptreeTemp->SetShrinkage(dLambda);

    if (cClassIdx == (cNumClasses - 1))
    {
        dOOBagImprove = pDist->BagImprovement(pData->adY,
                                              pData->adMisc,
                                              pData->adOffset,
                                              pData->adWeight,
                                              &adF[0],
                                              &adFadj[0],
                                              afInBag,
                                              dLambda,
                                              cTrain);
    }

    // update the training predictions
    for(i=0; i < cTrain; i++)
    {
        int iIdx = i + cIdxOff;
        adF[iIdx] += dLambda * adFadj[iIdx];
    }

    dTrainError = pDist->Deviance(pData->adY,
                                  pData->adMisc,
                                  pData->adOffset,
                                  pData->adWeight,
                                  adF,
                                  cTrain,
                                  cIdxOff);

    // update the validation predictions
    hr = ptreeTemp->PredictValid(pData,cValid,&(adFadj[cIdxOff]));
    if(GBM_FAILED(hr))
    {
        // adFadj may not hold usable validation adjustments; leave the
        // validation part of adF and dValidError as they are
        goto Error;
    }

    // unlike the training rows, no dLambda factor is applied here
    // NOTE(review): presumably PredictValid already applies the shrinkage
    // set via SetShrinkage above -- confirm in CCARTTree::PredictValid
    for(i=cTrain; i < cTrain+cValid; i++)
    {
        adF[i + cIdxOff] += adFadj[i + cIdxOff];
    }

    if(pData->fHasOffset)
    {
        dValidError =
            pDist->Deviance(pData->adY,
                            pData->adMisc,
                            pData->adOffset,
                            pData->adWeight,
                            adF,
                            cValid,
                            cIdxOff + cTrain);
    }
    else
    {
        dValidError = pDist->Deviance(pData->adY,
                                      pData->adMisc,
                                      NULL,
                                      pData->adWeight,
                                      adF,
                                      cValid,
                                      cIdxOff + cTrain);
    }

Cleanup:
    return hr;
Error:
    goto Cleanup;
}
Example #10
0
File: gbm.cpp Project: rforge/gbm
//------------------------------------------------------------------------------
// Loads the raw arrays into pData and creates the distribution object that
// matches pszFamily, handing it back through pDist.
//
// Family matching uses strncmp with a length of 2, so only the first two
// characters of pszFamily are compared against each family name. The names
// are tried in this order: bernoulli, gaussian, poisson, adaboost, coxph,
// laplace, quantile, tdist, multinomial, huberized.
//
// quantile and tdist are constructed from adMisc[0]; multinomial from
// cNumClasses and cRows. cTrees, cDepth, cMinObsInNode, dShrinkage,
// dBagFraction and cTrain are not used in this function.
//
// Returns:
//   the code from pData->SetData if that call fails (pDist is not touched)
//   GBM_OUTOFMEMORY  a family matched but pDist is NULL afterwards
//   GBM_INVALIDARG   no family matched and pDist is NULL
//   otherwise the (non-failing) code from pData->SetData
// pDist is only assigned when a family matches, so the GBM_INVALIDARG case
// depends on the value pDist had on entry.
//------------------------------------------------------------------------------
unsigned long gbm_setup
(
    double *adY,
    double *adOffset,
    double *adX,
    int *aiXOrder,
    double *adWeight,
    double *adMisc,
    int cRows,
    int cCols,
    int *acVarClasses,
    int *alMonotoneVar,
    const char *pszFamily,
    int cTrees,
    int cDepth,
    int cMinObsInNode,
    int cNumClasses,
    double dShrinkage,
    double dBagFraction,
    int cTrain,
    CDataset *pData,
    PCDistribution &pDist
)
{
    unsigned long hr = pData->SetData(adX,aiXOrder,adY,adOffset,adWeight,adMisc,
                                      cRows,cCols,acVarClasses,alMonotoneVar);
    if(GBM_FAILED(hr))
    {
        return hr;
    }

    // set the distribution; the flag records whether any branch matched so
    // the two failure modes can be told apart afterwards
    bool fFamilyRecognised = true;
    if(strncmp(pszFamily,"bernoulli",2) == 0)
    {
        pDist = new CBernoulli();
    }
    else if(strncmp(pszFamily,"gaussian",2) == 0)
    {
        pDist = new CGaussian();
    }
    else if(strncmp(pszFamily,"poisson",2) == 0)
    {
        pDist = new CPoisson();
    }
    else if(strncmp(pszFamily,"adaboost",2) == 0)
    {
        pDist = new CAdaBoost();
    }
    else if(strncmp(pszFamily,"coxph",2) == 0)
    {
        pDist = new CCoxPH();
    }
    else if(strncmp(pszFamily,"laplace",2) == 0)
    {
        pDist = new CLaplace();
    }
    else if(strncmp(pszFamily,"quantile",2) == 0)
    {
        pDist = new CQuantile(adMisc[0]);
    }
    else if(strncmp(pszFamily,"tdist",2) == 0)
    {
        pDist = new CTDist(adMisc[0]);
    }
    else if(strncmp(pszFamily,"multinomial",2) == 0)
    {
        pDist = new CMultinomial(cNumClasses, cRows);
    }
    else if(strncmp(pszFamily,"huberized",2) == 0)
    {
        pDist = new CHuberized();
    }
    else
    {
        fFamilyRecognised = false;
    }

    if(pDist==NULL)
    {
        if(fFamilyRecognised)
        {
            // a family matched but no object came back
            hr = GBM_OUTOFMEMORY;
        }
        else
        {
            // unknown family name
            hr = GBM_INVALIDARG;
        }
    }

    return hr;
}
Example #11
0
//------------------------------------------------------------------------------
// Grows a regression tree
//
// Starts from a single terminal root node fit to the in-bag observations and
// then performs up to cMaxDepth splits. Each pass asks GetBestSplit for the
// terminal node with the largest improvement, replaces it by a split node with
// left/right/missing children and reassigns the affected observations.
//
// adZ           - per-observation values being fit; weighted sums of adZ and
//                 adZ^2 over the in-bag observations seed the root node
// pData         - dataset handed to GetBestSplit and to WhichNode
// adW           - per-observation weights
// adF           - only checked for NULL here
// nTrain        - number of observations scanned
// nBagged       - observation count handed to the root node search
// dLambda       - not used by this function
// cMaxDepth     - maximum number of splits to perform; must be >= 1
// cMinObsInNode - not used by this function
// afInBag       - afInBag[i] is true when observation i contributes to the fit
// aiNodeAssign  - output; terminal node index of each of the nTrain observations
// aNodeSearch   - node search workspace indexed by terminal node; entries up to
//                 index 2*cMaxDepth may be touched
// vecpTermNodes - resized to 2*cMaxDepth+1 and filled with the terminal nodes
//
// Returns GBM_OK on success, GBM_INVALIDARG for a NULL array or cMaxDepth < 1,
// GBM_OUTOFMEMORY if no root node could be obtained from the node factory, or
// the failure code reported by GetBestSplit / SetupNewNodes.
//------------------------------------------------------------------------------
GBMRESULT CCARTTree::grow
(
    double *adZ,
    CDataset *pData,
    double *adW,
    double *adF,
    unsigned long nTrain,
    unsigned long nBagged,
    double dLambda,
    unsigned long cMaxDepth,
    unsigned long cMinObsInNode,
    bool *afInBag,
    unsigned long *aiNodeAssign,
    CNodeSearch *aNodeSearch,
    VEC_P_NODETERMINAL &vecpTermNodes
)
{
    GBMRESULT hr = GBM_OK;

    #ifdef NOISY_DEBUG
    Rprintf("Growing tree\n");
    #endif

    // every array below is dereferenced unconditionally, so reject NULLs early
    if((adZ==NULL) || (pData==NULL) || (adW==NULL) || (adF==NULL) ||
       (afInBag==NULL) || (aiNodeAssign==NULL) || (aNodeSearch==NULL) ||
       (cMaxDepth < 1))
    {
        hr = GBM_INVALIDARG;
        goto Error;
    }

    dSumZ = 0.0;
    dSumZ2 = 0.0;
    dTotalW = 0.0;

    #ifdef NOISY_DEBUG
    Rprintf("initial tree calcs\n");
    #endif
    for(iObs=0; iObs<nTrain; iObs++)
    {
        // aiNodeAssign tracks to which node each training obs belongs
        aiNodeAssign[iObs] = 0;
        if(afInBag[iObs])
        {
            // get the initial sums and sum of squares and total weight
            dSumZ += adW[iObs]*adZ[iObs];
            dSumZ2 += adW[iObs]*adZ[iObs]*adZ[iObs];
            dTotalW += adW[iObs];
        }
    }
    // weighted sum of squared deviations of adZ around its weighted mean
    dError = dSumZ2-dSumZ*dSumZ/dTotalW;

    pInitialRootNode = pNodeFactory->GetNewNodeTerminal();
    if(pInitialRootNode == NULL)
    {
        // defensive: never dereference a node the factory could not supply
        hr = GBM_OUTOFMEMORY;
        goto Error;
    }
    pInitialRootNode->dPrediction = dSumZ/dTotalW;
    pInitialRootNode->dTrainW = dTotalW;
    vecpTermNodes.resize(2*cMaxDepth + 1,NULL); // accounts for missing nodes
    vecpTermNodes[0] = pInitialRootNode;
    pRootNode = pInitialRootNode;

    aNodeSearch[0].Set(dSumZ,dTotalW,nBagged,
                       pInitialRootNode,
                       &pRootNode,
                       pNodeFactory);

    // build the tree structure
    #ifdef NOISY_DEBUG
    Rprintf("Building tree 1 ");
    #endif
    cTotalNodeCount = 1;
    cTerminalNodes = 1;
    for(cDepth=0; cDepth<cMaxDepth; cDepth++)
    {
        #ifdef NOISY_DEBUG
        Rprintf("%d ",cDepth);
        #endif
        hr = GetBestSplit(pData,
                          nTrain,
                          aNodeSearch,
                          cTerminalNodes,
                          aiNodeAssign,
                          afInBag,
                          adZ,
                          adW,
                          iBestNode,
                          dBestNodeImprovement);
        if(GBM_FAILED(hr))
        {
            goto Error;
        }

        // no terminal node can be improved any further: stop growing
        if(dBestNodeImprovement == 0.0)
        {
            break;
        }

        // setup the new nodes and add them to the tree
        hr = aNodeSearch[iBestNode].SetupNewNodes(pNewSplitNode,
                                                  pNewLeftNode,
                                                  pNewRightNode,
                                                  pNewMissingNode);
        if(GBM_FAILED(hr))
        {
            // the new node pointers cannot be trusted; leave the counts and
            // the terminal node table untouched
            goto Error;
        }

        // one terminal node becomes a split node with three new children
        cTotalNodeCount += 3;
        cTerminalNodes += 2;
        // the left child reuses the slot of the node that was split; the
        // right and missing children take the two newly added slots
        vecpTermNodes[iBestNode] = pNewLeftNode;
        vecpTermNodes[cTerminalNodes-2] = pNewRightNode;
        vecpTermNodes[cTerminalNodes-1] = pNewMissingNode;

        // assign observations to the correct node
        for(iObs=0; iObs < nTrain; iObs++)
        {
            iWhichNode = aiNodeAssign[iObs];
            if(iWhichNode==iBestNode)
            {
                schWhichNode = pNewSplitNode->WhichNode(pData,iObs);
                if(schWhichNode == 1) // goes right
                {
                    aiNodeAssign[iObs] = cTerminalNodes-2;
                }
                else if(schWhichNode == 0) // is missing
                {
                    aiNodeAssign[iObs] = cTerminalNodes-1;
                }
                // those to the left stay with the same node assignment
            }
        }

        // set up the node search for the new right node
        aNodeSearch[cTerminalNodes-2].Set(aNodeSearch[iBestNode].dBestRightSumZ,
                                          aNodeSearch[iBestNode].dBestRightTotalW,
                                          aNodeSearch[iBestNode].cBestRightN,
                                          pNewRightNode,
                                          &(pNewSplitNode->pRightNode),
                                          pNodeFactory);
        // set up the node search for the new missing node
        aNodeSearch[cTerminalNodes-1].Set(aNodeSearch[iBestNode].dBestMissingSumZ,
                                          aNodeSearch[iBestNode].dBestMissingTotalW,
                                          aNodeSearch[iBestNode].cBestMissingN,
                                          pNewMissingNode,
                                          &(pNewSplitNode->pMissingNode),
                                          pNodeFactory);
        // set up the node search for the new left node
        // must be done last: it overwrites aNodeSearch[iBestNode], whose
        // right/missing statistics were still needed by the two calls above
        aNodeSearch[iBestNode].Set(aNodeSearch[iBestNode].dBestLeftSumZ,
                                   aNodeSearch[iBestNode].dBestLeftTotalW,
                                   aNodeSearch[iBestNode].cBestLeftN,
                                   pNewLeftNode,
                                   &(pNewSplitNode->pLeftNode),
                                   pNodeFactory);

    } // end tree growing

Cleanup:
    return hr;
Error:
    goto Cleanup;
}
Example #12
0
//------------------------------------------------------------------------------
// Finds the terminal node whose best split gives the largest improvement
//
// For every variable the node searches of all current terminal nodes are
// reset, the in-bag observations are fed to the search of the node they are
// assigned to (in the order given by pData->aiXOrder), and the variable is
// wrapped up. Afterwards the node with the largest BestImprovement() wins.
//
// pData                - dataset supplying cCols, acVarClasses, aiXOrder, adX,
//                        cRows and alMonotoneVar
// nTrain               - number of entries of aiXOrder scanned per variable
// aNodeSearch          - node searches, one per terminal node
// cTerminalNodes       - number of terminal nodes currently in the tree
// aiNodeAssign         - terminal node index of each observation
// afInBag              - only observations flagged true are incorporated
// adZ, adW             - per-observation values and weights
// iBestNode            - output; index of the node with the best improvement
//                        (0 if no node improves on 0.0)
// dBestNodeImprovement - output; that improvement, 0.0 if none was positive
//
// Returns GBM_OK, or the first failure code reported by a node search. On
// failure the two output arguments are left unmodified.
//------------------------------------------------------------------------------
GBMRESULT CCARTTree::GetBestSplit
(
    CDataset *pData,
    unsigned long nTrain,
    CNodeSearch *aNodeSearch,
    unsigned long cTerminalNodes,
    unsigned long *aiNodeAssign,
    bool *afInBag,
    double *adZ,
    double *adW,
    unsigned long &iBestNode,
    double &dBestNodeImprovement
)
{
    GBMRESULT hr = GBM_OK;

    int iVar = 0;
    unsigned long iNode = 0;
    unsigned long iOrderObs = 0;
    unsigned long iWhichObs = 0;
    unsigned long cVarClasses = 0;
    double dX = 0.0;

    for(iVar=0; iVar < pData->cCols; iVar++)
    {
        // a non-zero class count marks the variable as categorical (see below)
        cVarClasses = pData->acVarClasses[iVar];

        for(iNode=0; iNode < cTerminalNodes; iNode++)
        {
            hr = aNodeSearch[iNode].ResetForNewVar(iVar,cVarClasses);
            if(GBM_FAILED(hr))
            {
                // do not let a later successful call mask this failure
                goto Error;
            }
        }

        // distribute the observations in order to the correct node search
        for(iOrderObs=0; iOrderObs < nTrain; iOrderObs++)
        {
            iWhichObs = pData->aiXOrder[iVar*nTrain + iOrderObs];
            if(afInBag[iWhichObs])
            {
                iNode = aiNodeAssign[iWhichObs];
                dX = pData->adX[iVar*(pData->cRows) + iWhichObs];
                hr = aNodeSearch[iNode].IncorporateObs
                     (dX,
                      adZ[iWhichObs],
                      adW[iWhichObs],
                      pData->alMonotoneVar[iVar]);
                if(GBM_FAILED(hr))
                {
                    goto Error;
                }
            }
        }
        for(iNode=0; iNode<cTerminalNodes; iNode++)
        {
            if(cVarClasses != 0) // evaluate if categorical split
            {
                hr = aNodeSearch[iNode].EvaluateCategoricalSplit();
                if(GBM_FAILED(hr))
                {
                    goto Error;
                }
            }
            aNodeSearch[iNode].WrapUpCurrentVariable();
        }
    }

    // search for the best split
    iBestNode = 0;
    dBestNodeImprovement = 0.0;
    for(iNode=0; iNode<cTerminalNodes; iNode++)
    {
        aNodeSearch[iNode].SetToSplit();
        // strict '>' keeps the lowest-index node on ties and leaves the
        // improvement at 0.0 when no node offers a positive gain
        if(aNodeSearch[iNode].BestImprovement() > dBestNodeImprovement)
        {
            iBestNode = iNode;
            dBestNodeImprovement = aNodeSearch[iNode].BestImprovement();
        }
    }

Cleanup:
    return hr;
Error:
    goto Cleanup;
}
Example #13
0
//------------------------------------------------------------------------------
// Performs one boosting iteration
//
// Steps: draw the bag, compute the working response adZ, reset and grow the
// tree on the bagged observations, fit the best constant in each terminal
// node, adjust the tree, then update the training and validation predictions
// in adF and report the resulting error measures.
//
// adF           - in/out; current predictions. Entries [0,cTrain) are the
//                 training observations, [cTrain,cTrain+cValid) the validation
//                 observations
// dTrainError   - output; deviance on the training entries after the update
// dValidError   - output; deviance on the validation entries after the update
// dOOBagImprove - output; value returned by pDist->BagImprovement
// cNodes        - output; node count reported by the tree
//
// Returns GBM_FAIL if the object has not been initialized, otherwise GBM_OK or
// the first failure code reported by one of the steps. The three error outputs
// are zeroed before any work is done, so they read 0.0 if a step fails before
// they are computed.
//------------------------------------------------------------------------------
GBMRESULT CGBM::iterate
(
    double *adF,
    double &dTrainError,
    double &dValidError,
    double &dOOBagImprove,
    int &cNodes
)
{
    GBMRESULT hr = GBM_OK;
    unsigned long i = 0;
    unsigned long cBagged = 0;

    if(!fInitialized)
    {
        hr = GBM_FAIL;
        goto Error;
    }

    dTrainError = 0.0;
    dValidError = 0.0;
    dOOBagImprove = 0.0;

    vecpTermNodes.assign(2*cDepth+1,NULL);

    // randomly assign observations to the Bag
    // observation i is taken when unif_rand()*(remaining observations) falls
    // below the number of bag slots still open, so cBagged can never exceed
    // cTotalInBag
    cBagged = 0;
    for(i=0; i<cTrain; i++)
    {
        if(unif_rand()*(cTrain-i) < cTotalInBag-cBagged)
        {
            afInBag[i] = true;
            cBagged++;
        }
        else
        {
            afInBag[i] = false;
        }
    }

    #ifdef NOISY_DEBUG
    Rprintf("Compute working response\n");
    #endif

    hr = pDist->ComputeWorkingResponse(pData->adY,
                                       pData->adMisc,
                                       pData->adOffset,
                                       adF,
                                       adZ,
                                       pData->adWeight,
                                       afInBag,
                                       cTrain);
    if(GBM_FAILED(hr))
    {
        goto Error;
    }

    #ifdef NOISY_DEBUG
    Rprintf("Reset tree\n");
    #endif
    hr = ptreeTemp->Reset();
    if(GBM_FAILED(hr))
    {
        // do not grow on top of a tree that could not be reset
        goto Error;
    }

    #ifdef NOISY_DEBUG
    Rprintf("grow tree\n");
    #endif
    hr = ptreeTemp->grow(adZ,pData,pData->adWeight,adFadj,
                         cTrain,cTotalInBag,dLambda,cDepth,
                         cMinObsInNode,
                         afInBag,
                         aiNodeAssign,aNodeSearch,vecpTermNodes);
    if(GBM_FAILED(hr))
    {
        goto Error;
    }

    #ifdef NOISY_DEBUG
    Rprintf("get node count\n");
    #endif
    hr = ptreeTemp->GetNodeCount(cNodes);
    if(GBM_FAILED(hr))
    {
        goto Error;
    }

    // Now I have adF, adZ, and vecpTermNodes (new node assignments)
    // Fit the best constant within each terminal node
    #ifdef NOISY_DEBUG
    Rprintf("fit best constant\n");
    #endif
    hr = pDist->FitBestConstant(pData->adY,
                                pData->adMisc,
                                pData->adOffset,
                                pData->adWeight,
                                adF,
                                adZ,
                                aiNodeAssign,
                                cTrain,
                                vecpTermNodes,
                                // number of terminal nodes: every split adds
                                // 3 nodes of which 2 are net new terminals,
                                // so N nodes hold (2N+1)/3 terminals
                                // (presumably cNodes is that total node
                                // count; confirm against GetNodeCount)
                                (2*cNodes+1)/3,
                                cMinObsInNode,
                                afInBag,
                                adFadj);
    if(GBM_FAILED(hr))
    {
        goto Error;
    }

    // update training predictions
    // fill in missing nodes where N < cMinObsInNode
    hr = ptreeTemp->Adjust(aiNodeAssign,adFadj,cTrain,
                           vecpTermNodes,cMinObsInNode);
    if(GBM_FAILED(hr))
    {
        goto Error;
    }
    ptreeTemp->SetShrinkage(dLambda);

    // must be computed before adF is updated below
    dOOBagImprove = pDist->BagImprovement(pData->adY,
                                          pData->adMisc,
                                          pData->adOffset,
                                          pData->adWeight,
                                          adF,
                                          adFadj,
                                          afInBag,
                                          dLambda,
                                          cTrain);

    // update the training predictions
    for(i=0; i < cTrain; i++)
    {
        adF[i] += dLambda*adFadj[i];
    }
    dTrainError = pDist->Deviance(pData->adY,
                                  pData->adMisc,
                                  pData->adOffset,
                                  pData->adWeight,
                                  adF,
                                  cTrain);

    // update the validation predictions
    hr = ptreeTemp->PredictValid(pData,cValid,adFadj);
    if(GBM_FAILED(hr))
    {
        // adFadj cannot be trusted; leave the validation predictions alone
        goto Error;
    }
    // NOTE(review): unlike the training update, adFadj is added here without
    // dLambda -- presumably PredictValid already applies the shrinkage set via
    // SetShrinkage above; confirm against its implementation
    for(i=cTrain; i < cTrain+cValid; i++)
    {
        adF[i] += adFadj[i];
    }
    if(pData->fHasOffset)
    {
        dValidError =
            pDist->Deviance(&(pData->adY[cTrain]),
                            &(pData->adMisc[cTrain]),
                            &(pData->adOffset[cTrain]),
                            &(pData->adWeight[cTrain]),
                            &(adF[cTrain]),
                            cValid);
    }
    else
    {
        // no offset supplied: pass NULL rather than indexing adOffset
        dValidError = pDist->Deviance(&(pData->adY[cTrain]),
                                      &(pData->adMisc[cTrain]),
                                      NULL,
                                      &(pData->adWeight[cTrain]),
                                      &(adF[cTrain]),
                                      cValid);
    }

Cleanup:
    return hr;
Error:
    goto Cleanup;
}