Example #1
0
  // Recursively grows the subtree rooted at `node` from the rows listed in `dataSet`.
  //
  // data     - data frame queried for purity, labels, candidate factors and
  //            per-factor statistics.
  // dataSet  - row indices belonging to this node. It is handed to
  //            data.sortIndicesOnFactorValue() when a split is made, so the
  //            caller's vector may be reordered.
  // node     - node to fill in; must be non-NULL. Its child pointers are reset
  //            first. On a split, both children are allocated with `new`; this
  //            function never frees them (presumably the tree/TreeNode owns
  //            them -- confirm against TreeNode's destructor).
  // nodeSize - a data set with at most this many rows becomes a leaf without
  //            attempting a split.
  //
  // Leaves get isPure = true, purityDelta = 0, a zeroed range, a copy of
  // dataSet in dataList and the next value of a running id counter. Split
  // nodes get nodeId = 0.
  //
  // NOTE(review): the id counter is a function-local static, so it is shared by
  // every call made in the process and is incremented without synchronization
  // here.
  void RandomTree::_build(DataFrame & data, std::vector<unsigned int> & dataSet, TreeNode * node, unsigned int nodeSize)
  {
    static unsigned int idCtr = 0;
    node->leftChild = NULL;
    node->rightChild = NULL;

    if(data.isDataSetPure(dataSet) || dataSet.size() <= nodeSize)
    {
      // Leaf: the set is either pure or small enough to stop splitting.
      // The label is taken from the first row. An empty set (which always
      // satisfies the size test) has no first row, so the label is left at
      // whatever value the node already holds instead of reading out of bounds.
      // NOTE(review): when the set is small but not pure, the first row's label
      // is used rather than a majority vote -- confirm this is intended.
      if(!dataSet.empty())
      {
        node->classLabel = data.getTrainingLabel(dataSet[0]);
      }
      node->isPure = true;
      node->purityDelta = 0;
      node->rangeMin = node->rangeMax = 0;

      node->dataList = dataSet;
      node->nodeId = ++idCtr;
      return;
    }

    // Data is not pure: look for a split among the selected candidate factors.
    std::vector<unsigned int> factors;
    unsigned int splitIdx = 0;
    unsigned int fIdx = 0;
    double splitVal = 0.0;
    double purityDelta = 0.0;

    data.selectRandomFactors(_factPerNode, factors);

    const bool splitPossible = _igc.findDataSplit(data, factors, dataSet, splitIdx, fIdx, splitVal, purityDelta);

    if(!splitPossible)
    {
      // No split was found, so label the node by majority vote and treat it
      // as a leaf. The range is zeroed here as well so that both kinds of
      // leaf leave the node with the same fields set.
      node->classLabel = data.getMajorityTrainingLabel(dataSet);
      node->isPure = true;
      node->purityDelta = 0;
      node->rangeMin = node->rangeMax = 0;

      node->dataList = dataSet;
      node->nodeId = ++idCtr;
      return;
    }

    node->isPure = false;
    node->leftChild = new TreeNode();
    node->rightChild = new TreeNode();

    node->splitValue = splitVal;
    node->factorIndex = fIdx;
    node->purityDelta = purityDelta;
    node->nodeId = 0;

    // Only the quartiles are used below; the returned bandwidth and the other
    // outputs are deliberately ignored.
    double minVal = 0.0;
    double maxVal = 0.0;
    double mean = 0.0;
    double q1 = 0.0;
    double q3 = 0.0;
    static_cast<void>(data.computeBandwidthByFactor(fIdx, dataSet, minVal,
      maxVal, mean, q1, q3));

    // Node range: the quartiles widened by three interquartile ranges on each side.
    const double iqr = q3 - q1;
    node->rangeMin = q1 - (3 * iqr);
    node->rangeMax = q3 + (3 * iqr);

    // Order the rows by the split factor so that positions [0, splitIdx) form
    // the left partition and [splitIdx, size) the right one. This relies on
    // findDataSplit() returning splitIdx <= dataSet.size().
    data.sortIndicesOnFactorValue(dataSet, fIdx);

    {
      // Scoped so the left partition's storage is released before descending
      // into the right subtree.
      std::vector<unsigned int> leftSplit(dataSet.begin(), dataSet.begin() + splitIdx);
      _build(data, leftSplit, node->leftChild, nodeSize);
    }

    std::vector<unsigned int> rightSplit(dataSet.begin() + splitIdx, dataSet.end());
    _build(data, rightSplit, node->rightChild, nodeSize);
  }