PClassifier
TSimpleTreeLearner::operator()(PExampleGenerator ogen, const int &weight)
{
	struct Example *examples, *ex;
	struct SimpleTreeNode *tree;
	struct Args args;
	int cls_vals;

	if (!ogen->domain->classVar)
		raiseError("class-less domain");

	if (!(ogen->numberOfExamples() > 0))
		raiseError("no examples");

	/* create a table with pointers to the examples */
	ASSERT(examples = (struct Example *)calloc(ogen->numberOfExamples(), sizeof *examples));
	ex = examples;
	PEITERATE(ei, ogen) {
		ex->example = &(*ei);
		ex->weight = 1.0;
		ex++;
	}
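
The parenthesized guard above is easy to get wrong: ! binds tighter than >, so a test written as !n > 0 parses as (!n) > 0 and fires only when the count is exactly zero. A minimal standalone sketch of the pitfall, with hypothetical values and nothing Orange-specific:

#include <cstdio>

int main()
{
    int n = 5;
    std::printf("%d\n", !n > 0);    // 0: parses as (!5) > 0, i.e. 0 > 0
    std::printf("%d\n", !(n > 0));  // 0: the intended "no examples" test
    n = 0;
    std::printf("%d\n", !n > 0);    // 1: only the n == 0 case ever fires
    return 0;
}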
Example 2
bool TTreeStopCriteria::operator()(PExampleGenerator gen, const int &, PDomainContingency ocont)
{ int nor = gen->numberOfExamples();
  if ((nor==0) || (nor==1))
    return true;      // example set is too small

  char vt = gen->domain->classVar->varType;
  if (vt!=TValue::INTVAR)
    return false;  // class is continuous, may continue

  // is there more than one class left?
  if (ocont) {
    char ndcf = 0;
    TDiscDistribution const &dva=CAST_TO_DISCDISTRIBUTION(ocont->classes);
    const_ITERATE(TDiscDistribution, ci, dva)
      if ((*ci>0) && (++ndcf==2))
        return false; // at least two classes, may continue
  }
  
  else {  
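
For a discrete class, the criterion above stops the induction as soon as at most one class value still has positive frequency; the loop bails out the moment a second non-empty class turns up. A minimal sketch of the same early-exit idiom, with a plain std::vector<float> standing in for the TDiscDistribution:

#include <vector>

// True when zero or one class has any mass left, i.e. the node is pure.
bool isPure(const std::vector<float> &classFreqs)
{
    int nonEmpty = 0;
    for (float f : classFreqs)
        if ((f > 0) && (++nonEmpty == 2))
            return false;   // a second class exists: keep splitting
    return true;            // pure (or empty) node: stop
}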
Example 3
PTreeNode TTreeLearner::operator()(PExampleGenerator examples, const int &weightID, PDistribution apriorClass, vector<bool> &candidates, const int &depth)
{
  PDomainContingency contingency;
  PDomainDistributions domainDistributions;
  PDistribution classDistribution;

  if (!examples->numberOfExamples())
    return PTreeNode();

  if (contingencyComputer)
    contingency = contingencyComputer->call(examples, weightID);

  if (storeContingencies)
    contingency = mlnew TDomainContingency(examples, weightID);

  if (contingency)
    classDistribution = contingency->classes;
  else if (storeDistributions)
    classDistribution = getClassDistribution(examples, weightID);

  if (classDistribution) {
    if (!classDistribution->abs)
      return PTreeNode();
  }
  else
    if (examples->weightOfExamples() < 1e-10)
      return PTreeNode();
  
  TTreeNode *utreeNode = mlnew TTreeNode();
  PTreeNode treeNode = utreeNode;

  utreeNode->weightID = weightID;

  bool isLeaf = ((maxDepth>=0) && (depth == maxDepth)) || stop->call(examples, weightID, contingency);

  if (isLeaf || storeNodeClassifier) {
    utreeNode->nodeClassifier = nodeLearner
                              ? nodeLearner->smartLearn(examples, weightID, contingency, domainDistributions, classDistribution)
                              : TMajorityLearner().smartLearn(examples, weightID, contingency, domainDistributions, classDistribution);

    if (isLeaf) {
      if (storeContingencies)
        utreeNode->contingency = contingency;
      if (storeDistributions)
        utreeNode->distribution = classDistribution;

      return treeNode;
    }
  }

  utreeNode->contingency = contingency;
  utreeNode->distribution = classDistribution;

  float quality;
  int spentAttribute;
  utreeNode->branchSelector = split->call(utreeNode->branchDescriptions, utreeNode->branchSizes, quality, spentAttribute,
                                          examples, weightID, contingency,
                                          apriorClass ? apriorClass : classDistribution,
                                          candidates, utreeNode->nodeClassifier);

  isLeaf = !utreeNode->branchSelector;
  bool hasNullNodes = false;

  if (!isLeaf) {
    if (spentAttribute>=0)
      if (candidates[spentAttribute])
        candidates[spentAttribute] = false;
      else
        spentAttribute = -1;
    /* BEWARE: If you add an additional 'return' in the code below,
               do not forget to restore the candidate's flag. */

    utreeNode->branches = mlnew TTreeNodeList();

    vector<int> newWeights;

    PExampleGeneratorList subsets = exampleSplitter->call(treeNode, examples, weightID, newWeights);

    if (!utreeNode->branchSizes)
      utreeNode->branchSizes = branchSizesFromSubsets(subsets, weightID, newWeights);

    if (!storeContingencies)
      utreeNode->contingency = PDomainContingency();
    if (!storeDistributions)
      utreeNode->distribution = PDistribution();

    vector<int>::iterator nwi = newWeights.begin(), nwe = newWeights.end();
    PITERATE(TExampleGeneratorList, gi, subsets) {
      if ((*gi)->numberOfExamples()) {
        utreeNode->branches->push_back(call(*gi, (nwi!=nwe) ? *nwi : weightID, apriorClass, candidates, depth+1));
        // Can store pointers to examples: the original is stored in the root
        if (storeExamples && utreeNode->branches->back())
          utreeNode->branches->back()->examples = *gi;
        else if ((nwi!=nwe) && *nwi && (*nwi != weightID))
            examples->removeMetaAttribute(*nwi);
      }
      else {
        utreeNode->branches->push_back(PTreeNode());
        hasNullNodes = true;
      }

      if (nwi!=nwe)
        nwi++;
    }

    /* If I set it to false, it must have been true before
       (otherwise, my TreeSplitConstructor wouldn't be allowed to spend it).
       Hence, I can simply set it back to true... */
    if (spentAttribute>=0)
      candidates[spentAttribute] = true;
  }

  else {  // no split constructed
    if (!utreeNode->contingency)
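
The method above is the classic recursive induction scheme: test the stop criterion, learn a node classifier for leaves, construct a split, partition the examples and recurse once per branch, pushing a null node for an empty subset. A condensed, self-contained sketch of that control flow, using toy stand-ins (one binary attribute, classes 0/1) rather than the Orange types:

#include <memory>
#include <utility>
#include <vector>

using Example = std::pair<int, int>;              // (attribute value in {0,1}, class in {0,1})

struct Node {
    int majorityClass = -1;                       // meaningful for leaves only
    std::vector<std::unique_ptr<Node>> branches;  // empty for leaves
};

static bool isPureSet(const std::vector<Example> &ex)
{
    for (const Example &e : ex)
        if (e.second != ex.front().second)
            return false;
    return true;
}

std::unique_ptr<Node> induce(const std::vector<Example> &ex, int depth, int maxDepth)
{
    if (ex.empty())
        return nullptr;                           // null branch, like PTreeNode() above
    auto node = std::make_unique<Node>();
    if (isPureSet(ex) || depth == maxDepth) {     // stop criterion: make a leaf
        int ones = 0;
        for (const Example &e : ex)
            ones += (e.second == 1);
        node->majorityClass = (2 * ones > (int)ex.size()) ? 1 : 0;
        return node;
    }
    std::vector<Example> left, right;             // stand-in for the example splitter
    for (const Example &e : ex)
        (e.first ? right : left).push_back(e);
    node->branches.push_back(induce(left, depth + 1, maxDepth));
    node->branches.push_back(induce(right, depth + 1, maxDepth));
    return node;
}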
Example 4
PClassifier TTreeSplitConstructor_Combined::operator()(
                             PStringList &descriptions, PDiscDistribution &subsetSizes, float &quality, int &spentAttribute,

                             PExampleGenerator gen, const int &weightID,
                             PDomainContingency dcont, PDistribution apriorClass,
                             const vector<bool> &candidates,
                             PClassifier nodeClassifier
                            )
{ checkProperty(discreteSplitConstructor);
  checkProperty(continuousSplitConstructor);

  vector<bool> discrete, continuous;
 
  bool cse = candidates.size()==0;
  vector<bool>::const_iterator ci(candidates.begin()), ce(candidates.end());
  TVarList::const_iterator vi(gen->domain->attributes->begin()), ve(gen->domain->attributes->end());

  for(; (cse || (ci!=ce)) && (vi!=ve); vi++) {
    if (cse || *(ci++))
      if ((*vi)->varType == TValue::INTVAR) {
        discrete.push_back(true);
        continuous.push_back(false);
        continue;
      }
      else if ((*vi)->varType == TValue::FLOATVAR) {
        discrete.push_back(false);
        continuous.push_back(true);
        continue;
      }
    discrete.push_back(false);
    continuous.push_back(false);
  }

  float discQuality;
  PStringList discDescriptions;
  PDiscDistribution discSizes;
  int discSpent;
  PClassifier discSplit = discreteSplitConstructor->call(discDescriptions, discSizes, discQuality, discSpent,
                                                               gen, weightID, dcont, apriorClass, discrete, nodeClassifier);

  float contQuality;
  PStringList contDescriptions;
  PDiscDistribution contSizes;
  int contSpent;
  PClassifier contSplit = continuousSplitConstructor->call(contDescriptions, contSizes, contQuality, contSpent,
                                                                 gen, weightID, dcont, apriorClass, continuous, nodeClassifier);

  int N = gen ? gen->numberOfExamples() : -1;
  if (N<0)
    N = dcont->classes->cases;

  if (   discSplit
      && (   !contSplit
          || (discQuality>contQuality)
          || ((discQuality==contQuality) && (N%2>0)))) {
    quality = discQuality;
    descriptions = discDescriptions;
    subsetSizes = discSizes;
    spentAttribute = discSpent;
    return discSplit;
  }
  else if (contSplit) {
    quality = contQuality;
    descriptions = contDescriptions;
    subsetSizes = contSizes;
    spentAttribute = contSpent;
    return contSplit;
  }
  else 
    return returnNothing(descriptions, subsetSizes, quality, spentAttribute);
}
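
The combined constructor asks both delegates for their best split and keeps the better one; a quality tie between the discrete and the continuous proposal is broken by the parity of the example count (N%2>0 favors the discrete split on odd N). The same selection rule in isolation, as a small sketch with a hypothetical Candidate type:

struct Candidate { bool valid; float quality; };

// 0 = take the discrete split, 1 = the continuous one, -1 = neither exists.
int pickSplit(Candidate disc, Candidate cont, int n)
{
    if (disc.valid && (!cont.valid
                       || disc.quality > cont.quality
                       || (disc.quality == cont.quality && n % 2 > 0)))
        return 0;
    return cont.valid ? 1 : -1;
}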
Example 5
PClassifier TTreeSplitConstructor_ExhaustiveBinary::operator()(
                             PStringList &descriptions, PDiscDistribution &subsetSizes, float &quality, int &spentAttribute,

                             PExampleGenerator gen, const int &weightID,
                             PDomainContingency dcont, PDistribution apriorClass,
                             const vector<bool> &candidates,
                             PClassifier
                            )
{ 
  checkProperty(measure);
  measure->checkClassTypeExc(gen->domain->classVar->varType);

  PIntList bestMapping;
  int wins, bestAttr;
  PVariable bvar;

  if (measure->needs==TMeasureAttribute::Generator) {
    bool cse = candidates.size()==0;
    bool haveCandidates = false;
    vector<bool> myCandidates;
    myCandidates.reserve(gen->domain->attributes->size());
    vector<bool>::const_iterator ci(candidates.begin()), ce(candidates.end());
    TVarList::const_iterator vi, ve(gen->domain->attributes->end());
    for(vi = gen->domain->attributes->begin(); vi != ve; vi++) {
      bool isCandidate = cse || ((ci != ce) && *(ci++));
      bool co = isCandidate && ((*vi)->varType == TValue::INTVAR);
      myCandidates.push_back(co);
      haveCandidates = haveCandidates || co;
    }
    if (!haveCandidates)
      return returnNothing(descriptions, subsetSizes, quality, spentAttribute);

    PDistribution thisSubsets;
    float thisQuality;
    wins = 0;
    int thisAttr = 0;

    int N = gen->numberOfExamples();
    TSimpleRandomGenerator rgen(N);
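    // (rgen is seeded with the example count, so the random tie-breaking below
    // is reproducible for a given data set size)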

    ci = myCandidates.begin();
    for(vi = gen->domain->attributes->begin(); vi != ve; ci++, vi++, thisAttr++) {
      if (*ci) {
        thisSubsets = NULL;
        PIntList thisMapping =
           /*throughCont ? measure->bestBinarization(thisSubsets, thisQuality, *dci, dcont->classes, apriorClass, minSubset)
                       : */measure->bestBinarization(thisSubsets, thisQuality, *vi, gen, apriorClass, weightID, minSubset);
        if (thisMapping
              && (   ((!wins || (thisQuality>quality)) && ((wins=1)==1))
                  || ((thisQuality==quality) && rgen.randbool(++wins)))) {
          bestAttr = thisAttr;
          quality = thisQuality;
          subsetSizes = thisSubsets;
          bestMapping = thisMapping;
        }
      }
      /*if (throughCont)
        dci++; */
    }
  
    if (!wins)
      return returnNothing(descriptions, subsetSizes, quality, spentAttribute);

    if (quality<worstAcceptable)
      return returnNothing(descriptions, subsetSizes, spentAttribute);

    if (subsetSizes && subsetSizes->variable)
      bvar = subsetSizes->variable;
    else {
      TEnumVariable *evar = mlnew TEnumVariable("");
      evar->addValue("0");
      evar->addValue("1");
      bvar = evar;
    }
  }
  
  else {
    bool cse = candidates.size()==0;
    if (!cse && noCandidates(candidates))
      return returnNothing(descriptions, subsetSizes, quality, spentAttribute);

    if (!dcont || dcont->classIsOuter) {
      dcont = PDomainContingency(mlnew TDomainContingency(gen, weightID));
//      raiseWarningWho("TreeSplitConstructor_ExhaustiveBinary", "this class is not optimized for 'candidates' list and can be very slow");
    }

    int N = gen ? gen->numberOfExamples() : -1;
    if (N<0)
      N = dcont->classes->cases;
    TSimpleRandomGenerator rgen(N);

    PDistribution classDistribution = dcont->classes;

    vector<bool>::const_iterator ci(candidates.begin()), ce(candidates.end());

    TDiscDistribution *dis0, *dis1;
    TContDistribution *con0, *con1;

    int thisAttr = 0;
    bestAttr = -1;
    wins = 0;
    quality = 0.0;
    float leftExamples, rightExamples;

    TDomainContingency::iterator dci(dcont->begin()), dce(dcont->end());
    for(; (cse || (ci!=ce)) && (dci!=dce); dci++, thisAttr++) {

      // We consider the attribute only if it is a candidate, discrete and has at least two values
      if ((cse || *(ci++)) && ((*dci)->outerVariable->varType==TValue::INTVAR) && ((*dci)->discrete->size()>=2)) {

        const TDistributionVector &distr = *(*dci)->discrete;

        if (distr.size()>16)
          raiseError("'%s' has more than 16 values, cannot exhaustively binarize", gen->domain->attributes->at(thisAttr)->get_name().c_str());

        // If the attribute is binary, we check subsetSizes and assess the quality if they are OK
        if (distr.size()==2) {
          if ((distr.front()->abs<minSubset) || (distr.back()->abs<minSubset))
            continue; // next attribute
          else {
            float thisMeas = measure->call(thisAttr, dcont, apriorClass);
            if (   ((!wins || (thisMeas>quality)) && ((wins=1)==1))
                || ((thisMeas==quality) && rgen.randbool(++wins))) {
              bestAttr = thisAttr;
              quality = thisMeas;
              leftExamples = distr.front()->abs;
              rightExamples = distr.back()->abs;
              bestMapping = mlnew TIntList(2, 0);
              bestMapping->at(1) = 1;
            }
            continue;
          }
        }

        vector<int> valueIndices;
        int ind = 0;
        for(TDistributionVector::const_iterator dvi(distr.begin()), dve(distr.end()); (dvi!=dve); dvi++, ind++)
          if ((*dvi)->abs>0)
            valueIndices.push_back(ind);

        if (valueIndices.size()<2)
          continue;

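        // prepareBinaryCheat presumably builds a two-valued contingency whose
        // halves are exposed through dis0/dis1 (discrete class) or con0/con1
        // (continuous class), so each candidate binarization below can be
        // scored by updating those distributions in place.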
        PContingency cont = prepareBinaryCheat(classDistribution, *dci, bvar, dis0, dis1, con0, con1);

        // A real job: go through all splits
        int binWins = 0;
        float binQuality = -1.0;
        float binLeftExamples = -1.0, binRightExamples = -1.0;
        // Selection: each element corresponds to a value of the original attribute and is 1 if the value goes right.
        // The first value always goes left (and has no corresponding bit in selection).
        TBoolCount selection(valueIndices.size()-1), bestSelection(0);

        // First for discrete classes
        if (dis0) {
          do {
            *dis0 = CAST_TO_DISCDISTRIBUTION(distr[valueIndices[0]]);
            *dis1 *= 0;
            vector<int>::const_iterator ii(valueIndices.begin());
            ii++;
            for(TBoolCount::const_iterator bi(selection.begin()), be(selection.end()); bi!=be; bi++, ii++)
               *(*bi ? dis1 : dis0) += distr[*ii];
            cont->outerDistribution->setint(0, dis0->abs);
            cont->outerDistribution->setint(1, dis1->abs);

            if ((dis0->abs < minSubset) || (dis1->abs < minSubset))
              continue; // cannot split like that, too few examples in one of the branches

            float thisMeas = measure->operator()(cont, classDistribution, apriorClass);
            if (   (((!binWins) || (thisMeas>binQuality)) && ((binWins=1)==1))
                || ((thisMeas==binQuality) && rgen.randbool(++binWins))) {
              bestSelection = selection; 
              binQuality = thisMeas;
              binLeftExamples = dis0->abs;
              binRightExamples = dis1->abs;
            }
          } while (selection.next());
        }

        // And then exactly the same for continuous classes
        else {
          do {
            *con0 = CAST_TO_CONTDISTRIBUTION(distr[valueIndices[0]]);
            *con1 = TContDistribution();
            vector<int>::const_iterator ii(valueIndices.begin());
            ii++;
            for(TBoolCount::const_iterator bi(selection.begin()), be(selection.end()); bi!=be; bi++, ii++)
               *(*bi ? con1 : con0) += distr[*ii];

            if ((con0->abs<minSubset) || (con1->abs<minSubset))
              continue; // cannot split like that, too few examples in one of the branches

            float thisMeas = measure->operator()(cont, classDistribution, apriorClass);
            if (   (((!binWins) || (thisMeas>binQuality)) && ((binWins=1)==1))
                || ((thisMeas==binQuality) && rgen.randbool(++binWins))) {
              bestSelection = selection; 
              binQuality = thisMeas;
              binLeftExamples = con0->abs;
              binRightExamples = con1->abs;
            }
          } while (selection.next());
        }

        if (       binWins
            && (   ((!wins || (binQuality>quality)) && ((wins=1)==1))
                || ((binQuality==quality) && rgen.randbool(++wins)))) {
          bestAttr = thisAttr;
          quality = binQuality;
          leftExamples = binLeftExamples;
          rightExamples = binRightExamples;
          bestMapping = mlnew TIntList(distr.size(), -1);
          vector<int>::const_iterator ii = valueIndices.begin();
          bestMapping->at(*(ii++)) = 0;
          ITERATE(TBoolCount, bi, bestSelection)
            bestMapping->at(*(ii++)) = *bi ? 1 : 0;
        }
      }
    }
 

    if (!wins)
      return returnNothing(descriptions, subsetSizes, quality, spentAttribute);

    subsetSizes = mlnew TDiscDistribution();
    subsetSizes->addint(0, leftExamples);
    subsetSizes->addint(1, rightExamples);
  }

  PVariable attribute = gen->domain->attributes->at(bestAttr);

  if (attribute->noOfValues() == 2) {
    spentAttribute = bestAttr;
    descriptions = mlnew TStringList(attribute.AS(TEnumVariable)->values.getReference());
    TClassifierFromVarFD *cfv = mlnew TClassifierFromVarFD(attribute, gen->domain, bestAttr, subsetSizes);
    cfv->transformUnknowns = false;
    return cfv;
  }

  string s0, s1;
  int ns0 = 0, ns1 = 0;
  TValue ev;
  attribute->firstValue(ev);
  PITERATE(TIntList, mi, bestMapping) {
    string str;
    attribute->val2str(ev, str);
    if (*mi==1) {
      s1 += string(ns1 ? ", " : "") + str;
      ns1++;
    }
    else if (*mi==0) {
      s0 += string(ns0 ? ", " : "") + str;
      ns0++;
    }

    attribute->nextValue(ev);
  }
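
TBoolCount selection(valueIndices.size()-1) together with selection.next() drives the exhaustive part: the first non-empty value is pinned to the left branch and the remaining k-1 bits are counted through, so every proper two-way partition of the values is visited exactly once (2^(k-1) - 1 candidate splits, which is why attributes with more than 16 values are rejected above). A self-contained sketch of the same enumeration done with a plain bitmask, for a hypothetical k = 4 values:

#include <cstdio>

int main()
{
    const int k = 4;                                 // number of attribute values
    for (unsigned mask = 1; mask < (1u << (k - 1)); mask++) {
        std::printf("left: 0");                      // value 0 always goes left
        for (int v = 1; v < k; v++)
            if (!((mask >> (v - 1)) & 1))
                std::printf(" %d", v);
        std::printf("   right:");
        for (int v = 1; v < k; v++)
            if ((mask >> (v - 1)) & 1)
                std::printf(" %d", v);
        std::printf("\n");
    }
    return 0;
}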
Example 6
PClassifier TTreeSplitConstructor_Attribute::operator()(
                             PStringList &descriptions, PDiscDistribution &subsetSizes, float &quality, int &spentAttribute,

                             PExampleGenerator gen, const int &weightID,
                             PDomainContingency dcont, PDistribution apriorClass,
                             const vector<bool> &candidates,
                             PClassifier nodeClassifier
                            )
{ checkProperty(measure);

  measure->checkClassTypeExc(gen->domain->classVar->varType);

  bool cse = candidates.size()==0;
  vector<bool>::const_iterator ci(candidates.begin()), ce(candidates.end());
  if (!cse) {
    if (noCandidates(candidates))
      return returnNothing(descriptions, subsetSizes, quality, spentAttribute);

    ci = candidates.begin();
  }

  int N = gen ? gen->numberOfExamples() : -1;
  if (N<0)
    N = dcont->classes->cases;
  TSimpleRandomGenerator rgen(N);

  int thisAttr = 0, bestAttr = -1, wins = 0;
  quality = 0.0;

  if (measure->needs == TMeasureAttribute::Contingency_Class) {
    vector<bool> myCandidates;
    if (cse) {
      myCandidates.reserve(gen->domain->attributes->size());
      PITERATE(TVarList, vi, gen->domain->attributes)
        myCandidates.push_back((*vi)->varType == TValue::INTVAR);
    }
    else {
      myCandidates.reserve(candidates.size());
      TVarList::const_iterator vi(gen->domain->attributes->begin());
      for(; ci != ce; ci++, vi++)
        myCandidates.push_back(*ci && ((*vi)->varType == TValue::INTVAR));
    }

    if (!dcont || dcont->classIsOuter)
      dcont = PDomainContingency(mlnew TDomainContingency(gen, weightID, myCandidates));

    ci = myCandidates.begin();
    ce = myCandidates.end();
    TDomainContingency::iterator dci(dcont->begin()), dce(dcont->end());
    for(; (ci != ce) && (dci!=dce); dci++, ci++, thisAttr++)
      if (*ci && checkDistribution((const TDiscDistribution &)((*dci)->outerDistribution.getReference()), minSubset)) {
        float thisMeas = measure->call(thisAttr, dcont, apriorClass);

        if (   ((!wins || (thisMeas>quality)) && ((wins=1)==1))
            || ((thisMeas==quality) && rgen.randbool(++wins))) {
          quality = thisMeas;
          subsetSizes = (*dci)->outerDistribution;
          bestAttr = thisAttr;
        }
      }
  }

  else if (measure->needs == TMeasureAttribute::DomainContingency) {
    if (!dcont || dcont->classIsOuter)
      dcont = PDomainContingency(mlnew TDomainContingency(gen, weightID));

    TDomainContingency::iterator dci(dcont->begin()), dce(dcont->end());
    for(; (cse || (ci!=ce)) && (dci!=dce); dci++, thisAttr++)
      if (    (cse || *(ci++))
           && ((*dci)->outerVariable->varType==TValue::INTVAR)
           && checkDistribution((const TDiscDistribution &)((*dci)->outerDistribution.getReference()), minSubset)) {
        float thisMeas = measure->call(thisAttr, dcont, apriorClass);

        if (   ((!wins || (thisMeas>quality)) && ((wins=1)==1))
            || ((thisMeas==quality) && rgen.randbool(++wins))) {
          quality = thisMeas;
          subsetSizes = (*dci)->outerDistribution;
          bestAttr = thisAttr;
        }
      }
  }

  else {
    TDomainDistributions ddist(gen, weightID);

    TDomainDistributions::iterator ddi(ddist.begin()), dde(ddist.end()-1);  // the last distribution belongs to the class variable; skip it
    for(; (cse || (ci!=ce)) && (ddi!=dde); ddi++, thisAttr++)
      if (cse || *(ci++)) {
        TDiscDistribution *discdist = (*ddi).AS(TDiscDistribution);
        if (discdist && checkDistribution(*discdist, minSubset)) {
          float thisMeas = measure->call(thisAttr, gen, apriorClass, weightID);

          if (   ((!wins || (thisMeas>quality)) && ((wins=1)==1))
              || ((thisMeas==quality) && rgen.randbool(++wins))) {
            quality = thisMeas;
            subsetSizes = PDiscDistribution(*ddi); // not discdist - this would be double wrapping!
            bestAttr = thisAttr;
          }
        }
      }
    
  }

  if (!wins)
    return returnNothing(descriptions, subsetSizes, quality, spentAttribute);

  if (quality<worstAcceptable)
    return returnNothing(descriptions, subsetSizes, spentAttribute);

  PVariable attribute = gen->domain->attributes->at(bestAttr);
  TEnumVariable *evar = attribute.AS(TEnumVariable);
  if (evar)
    descriptions = mlnew TStringList(evar->values.getReference());
  else
    descriptions = mlnew TStringList(subsetSizes->size(), "");

  spentAttribute = bestAttr;

  TClassifierFromVarFD *cfv = mlnew TClassifierFromVarFD(attribute, gen->domain, bestAttr, subsetSizes);
  cfv->transformUnknowns = false;
  return cfv;
}
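
All of the scoring loops in these examples share one tie-breaking idiom: the first strict improvement resets wins to 1, and every later candidate that exactly ties the best quality takes over with probability 1/wins (assuming randbool(n) returns true with probability 1/n), so the final choice is uniform over all tied attributes without ever storing the tied set. The idiom in isolation, a minimal sketch using std::rand in place of TSimpleRandomGenerator:

#include <cstdlib>
#include <vector>

// Index of the maximum score, chosen uniformly at random among ties.
int pickBest(const std::vector<float> &scores)
{
    int best = -1, wins = 0;
    float quality = 0.0f;
    for (int i = 0; i < (int)scores.size(); i++) {
        if (!wins || scores[i] > quality) {   // first candidate, or strictly better
            wins = 1;
            best = i;
            quality = scores[i];
        }
        else if (scores[i] == quality && std::rand() % ++wins == 0) {
            best = i;                         // a tie: replace with probability 1/wins
        }
    }
    return best;
}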