PProbabilityEstimator TProbabilityEstimatorConstructor_loess::operator()(PDistribution frequencies, PDistribution, PExampleGenerator, const long &weightID, const int &attrNo) const
{ TContDistribution *cdist = frequencies.AS(TContDistribution);
  if (!cdist)
    if (frequencies && frequencies->variable)
      raiseError("attribute '%s' is not continuous", frequencies->variable->get_name().c_str());
    else
      raiseError("continuous distribution expected");
  if (!cdist->size())
    raiseError("empty distribution");

  map<float, float> loesscurve;
  loess(cdist->distribution, nPoints, windowProportion, loesscurve, distributionMethod);
  return mlnew TProbabilityEstimator_FromDistribution(mlnew TContDistribution(loesscurve));
}
示例#2
0
PClassifier TTreeSplitConstructor_ExhaustiveBinary::operator()(
                             PStringList &descriptions, PDiscDistribution &subsetSizes, float &quality, int &spentAttribute,

                             PExampleGenerator gen, const int &weightID ,
                             PDomainContingency dcont, PDistribution apriorClass,
                             const vector<bool> &candidates,
                             PClassifier
                            )
{ 
  checkProperty(measure);
  measure->checkClassTypeExc(gen->domain->classVar->varType);

  PIntList bestMapping;
  int wins, bestAttr;
  PVariable bvar;

  if (measure->needs==TMeasureAttribute::Generator) {
    bool cse = candidates.size()==0;
    bool haveCandidates = false;
    vector<bool> myCandidates;
    myCandidates.reserve(gen->domain->attributes->size());
    vector<bool>::const_iterator ci(candidates.begin()), ce(candidates.end());
    TVarList::const_iterator vi, ve(gen->domain->attributes->end());
    for(vi = gen->domain->attributes->begin(); vi != ve; vi++) {
      bool co = (*vi)->varType == TValue::INTVAR && (!cse || (ci!=ce) && *ci);
      myCandidates.push_back(co);
      haveCandidates = haveCandidates || co;
    }
    if (!haveCandidates)
      return returnNothing(descriptions, subsetSizes, quality, spentAttribute);

    PDistribution thisSubsets;
    float thisQuality;
    wins = 0;
    int thisAttr = 0;

    int N = gen->numberOfExamples();
    TSimpleRandomGenerator rgen(N);

    ci = myCandidates.begin();
    for(vi = gen->domain->attributes->begin(); vi != ve; ci++, vi++, thisAttr++) {
      if (*ci) {
        thisSubsets = NULL;
        PIntList thisMapping =
           /*throughCont ? measure->bestBinarization(thisSubsets, thisQuality, *dci, dcont->classes, apriorClass, minSubset)
                       : */measure->bestBinarization(thisSubsets, thisQuality, *vi, gen, apriorClass, weightID, minSubset);
          if (thisMapping
                && (   (!wins || (thisQuality>quality)) && ((wins=1)==1)
                    || (thisQuality==quality) && rgen.randbool(++wins))) {
            bestAttr = thisAttr;
            quality = thisQuality;
            subsetSizes = thisSubsets;
            bestMapping = thisMapping;
          }
      }
      /*if (thoughCont)
        dci++; */
    }
  
    if (!wins)
      return returnNothing(descriptions, subsetSizes, quality, spentAttribute);

    if (quality<worstAcceptable)
      return returnNothing(descriptions, subsetSizes, spentAttribute);

    if (subsetSizes && subsetSizes->variable)
      bvar = subsetSizes->variable;
    else {
      TEnumVariable *evar = mlnew TEnumVariable("");
      evar->addValue("0");
      evar->addValue("1");
      bvar = evar;
    }
  }
  
  else {
    bool cse = candidates.size()==0;
    if (!cse && noCandidates(candidates))
      return returnNothing(descriptions, subsetSizes, quality, spentAttribute);

    if (!dcont || dcont->classIsOuter) {
      dcont = PDomainContingency(mlnew TDomainContingency(gen, weightID));
//      raiseWarningWho("TreeSplitConstructor_ExhaustiveBinary", "this class is not optimized for 'candidates' list and can be very slow");
    }

    int N = gen ? gen->numberOfExamples() : -1;
    if (N<0)
      N = dcont->classes->cases;
    TSimpleRandomGenerator rgen(N);

    PDistribution classDistribution = dcont->classes;

    vector<bool>::const_iterator ci(candidates.begin()), ce(candidates.end());

    TDiscDistribution *dis0, *dis1;
    TContDistribution *con0, *con1;

    int thisAttr = 0;
    bestAttr = -1;
    wins = 0;
    quality = 0.0;
    float leftExamples, rightExamples;

    TDomainContingency::iterator dci(dcont->begin()), dce(dcont->end());
    for(; (cse || (ci!=ce)) && (dci!=dce); dci++, thisAttr++) {

      // We consider the attribute only if it is a candidate, discrete and has at least two values
      if ((cse || *(ci++)) && ((*dci)->outerVariable->varType==TValue::INTVAR) && ((*dci)->discrete->size()>=2)) {

        const TDistributionVector &distr = *(*dci)->discrete;

        if (distr.size()>16)
          raiseError("'%s' has more than 16 values, cannot exhaustively binarize", gen->domain->attributes->at(thisAttr)->get_name().c_str());

        // If the attribute is binary, we check subsetSizes and assess the quality if they are OK
        if (distr.size()==2) {
          if ((distr.front()->abs<minSubset) || (distr.back()->abs<minSubset))
            continue; // next attribute
          else {
            float thisMeas = measure->call(thisAttr, dcont, apriorClass);
            if (   ((!wins || (thisMeas>quality)) && ((wins=1)==1))
                || ((thisMeas==quality) && rgen.randbool(++wins))) {
              bestAttr = thisAttr;
              quality = thisMeas;
              leftExamples = distr.front()->abs;
              rightExamples = distr.back()->abs;
              bestMapping = mlnew TIntList(2, 0);
              bestMapping->at(1) = 1;
            }
            continue;
          }
        }

        vector<int> valueIndices;
        int ind = 0;
        for(TDistributionVector::const_iterator dvi(distr.begin()), dve(distr.end()); (dvi!=dve); dvi++, ind++)
          if ((*dvi)->abs>0)
            valueIndices.push_back(ind);

        if (valueIndices.size()<2)
          continue;

        PContingency cont = prepareBinaryCheat(classDistribution, *dci, bvar, dis0, dis1, con0, con1);

        // A real job: go through all splits
        int binWins = 0;
        float binQuality = -1.0;
        float binLeftExamples = -1.0, binRightExamples = -1.0;
        // Selection: each element correspons to a value of the original attribute and is 1, if the value goes right
        // The first value always goes left (and has no corresponding bit in selection.
        TBoolCount selection(valueIndices.size()-1), bestSelection(0);

        // First for discrete classes
        if (dis0) {
          do {
            *dis0 = CAST_TO_DISCDISTRIBUTION(distr[valueIndices[0]]);
            *dis1 *= 0;
            vector<int>::const_iterator ii(valueIndices.begin());
            ii++;
            for(TBoolCount::const_iterator bi(selection.begin()), be(selection.end()); bi!=be; bi++, ii++)
               *(*bi ? dis1 : dis0) += distr[*ii];
            cont->outerDistribution->setint(0, dis0->abs);
            cont->outerDistribution->setint(1, dis1->abs);

            if ((dis0->abs < minSubset) || (dis1->abs < minSubset))
              continue; // cannot split like that, to few examples in one of the branches

            float thisMeas = measure->operator()(cont, classDistribution, apriorClass);
            if (   ((!binWins) || (thisMeas>binQuality)) && ((binWins=1) ==1)
                || (thisMeas==binQuality) && rgen.randbool(++binWins)) {
              bestSelection = selection; 
              binQuality = thisMeas;
              binLeftExamples = dis0->abs;
              binRightExamples = dis1->abs;
            }
          } while (selection.next());
        }

        // And then exactly the same for continuous classes
        else {
          do {
            *con0 = CAST_TO_CONTDISTRIBUTION(distr[0]);
            *con1 = TContDistribution();
            vector<int>::const_iterator ii(valueIndices.begin());
            for(TBoolCount::const_iterator bi(selection.begin()), be(selection.end()); bi!=be; bi++, ii++)
               *(*bi ? con1 : con0) += distr[*ii];

            if ((con0->abs<minSubset) || (con1->abs<minSubset))
              continue; // cannot split like that, to few examples in one of the branches

            float thisMeas = measure->operator()(cont, classDistribution, apriorClass);
            if (   ((!binWins) || (thisMeas>binQuality)) && ((binWins=1) ==1)
                || (thisMeas==binQuality) && rgen.randbool(++binWins)) {
              bestSelection = selection; 
              binQuality = thisMeas;
              binLeftExamples = con0->abs;
              binRightExamples = con1->abs;
            }
          } while (selection.next());
        }

        if (       binWins
            && (   (!wins || (binQuality>quality)) && ((wins=1)==1)
                || (binQuality==quality) && rgen.randbool(++wins))) {
          bestAttr = thisAttr;
          quality = binQuality;
          leftExamples = binLeftExamples;
          rightExamples = binRightExamples;
          bestMapping = mlnew TIntList(distr.size(), -1);
          vector<int>::const_iterator ii = valueIndices.begin();
          bestMapping->at(*(ii++)) = 0;
          ITERATE(TBoolCount, bi, bestSelection)
            bestMapping->at(*(ii++)) = *bi ? 1 : 0;
        }
      }
    }
 

    if (!wins)
      return returnNothing(descriptions, subsetSizes, quality, spentAttribute);

    subsetSizes = mlnew TDiscDistribution();
    subsetSizes->addint(0, leftExamples);
    subsetSizes->addint(1, rightExamples);
  }

  PVariable attribute = gen->domain->attributes->at(bestAttr);

  if (attribute->noOfValues() == 2) {
    spentAttribute = bestAttr;
    descriptions = mlnew TStringList(attribute.AS(TEnumVariable)->values.getReference());
    TClassifierFromVarFD *cfv = mlnew TClassifierFromVarFD(attribute, gen->domain, bestAttr, subsetSizes);
    cfv->transformUnknowns = false;
    return cfv;
  }

  string s0, s1;
  int ns0 = 0, ns1 = 0;
  TValue ev;
  attribute->firstValue(ev);
  PITERATE(TIntList, mi, bestMapping) {
    string str;
    attribute->val2str(ev, str);
    if (*mi==1) {
      s1 += string(ns1 ? ", " : "") + str;
      ns1++;
    }
    else if (*mi==0) {
      s0 += string(ns0 ? ", " : "") + str;
      ns0++;
    }

    attribute->nextValue(ev);
  }
PProbabilityEstimator TProbabilityEstimatorConstructor_kernel::operator()(PDistribution frequencies, PDistribution apriori, PExampleGenerator, const long &weightID, const int &) const
{ TContDistribution *cdist = frequencies.AS(TContDistribution);
  if (!cdist)
    raiseError("continuous distribution expected");
  if (!cdist->size())
    raiseError("empty distribution");
  if ((minImpact<0.0) || (minImpact>1.0))
    raiseError("'minImpact' should be between 0.0 and 1.0 (not %5.3f)", minImpact);

  vector<float> points;
  distributePoints(cdist->distribution, nPoints, points);

  TContDistribution *curve = mlnew TContDistribution(frequencies->variable);
  PDistribution wcurve = curve;

  /* Bandwidth suggested by Chad Shaw. Also found in http://www.stat.lsa.umich.edu/~kshedden/Courses/Stat606/Notes/interpolate.pdf */
  const float h = smoothing * sqrt(cdist->error()) * exp(- 1.0/5.0 * log(cdist->abs)); // 1.144
  const float hsqrt2pi = h * 2.5066282746310002;
  float t;

  if (minImpact>0) {
    t = -2 * log(minImpact*hsqrt2pi); // 2.5066... == sqrt(2*pi)
    if (t<=0) {
      // minImpact too high, but that's user's problem... 
      ITERATE(vector<float>, pi, points)
        curve->setfloat(*pi, 0.0);
        return wcurve;
    }
    else
      t = h * sqrt(t);
  }
      
      
  ITERATE(vector<float>, pi, points) {
    const float &x = *pi;
    TContDistribution::const_iterator from, to;

    if (minImpact>0) {
      from = cdist->lower_bound(x-t);
      to = cdist->lower_bound(x+t);
      if ((from==cdist->end()) || (to==cdist->begin()) || (from==to)) {
        curve->setfloat(x, 0.0);
        continue;
      }
    }
    else {
      from = cdist->begin();
      to = cdist->end();
    }

    float p = 0.0, n = 0.0;
    for(; from != to; from++) {
      n += (*from).second;
      p += (*from).second * exp( - 0.5 * sqr( (x - (*from).first)/h ) );
    }

    curve->setfloat(x, p/hsqrt2pi/(n*h)); // hsqrt2pi is from the inside (errf), n*h is for the sum average
  }


  return mlnew TProbabilityEstimator_FromDistribution(curve);
}
示例#4
0
PDistribution TLogRegClassifier::classDistribution(const TExample &origexam)
{   
  checkProperty(domain);
  TExample cexample(domain, origexam);

  TExample *example2;

  if (imputer)
    example2 = imputer->call(cexample);
  else {
    if (dataDescription)
      for(TExample::const_iterator ei(cexample.begin()), ee(cexample.end()-1); ei!=ee; ei++)
        if ((*ei).isSpecial())
          return TClassifier::classDistribution(cexample, dataDescription);

    example2 = &cexample;
  }

  TExample *example = continuizedDomain ? mlnew TExample(continuizedDomain, *example2) : example2;

  float prob1;
  try {
    // multiply example with beta
    TAttributedFloatList::const_iterator b(beta->begin()), be(beta->end());

    // get beta 0
    prob1 = *b;
    b++;
    // multiply beta with example
    TVarList::const_iterator vi(example->domain->attributes->begin());
    TExample::const_iterator ei(example->begin()), ee(example->end());
    for (; (b!=be) && (ei!=ee); ei++, b++, vi++) {
      if ((*ei).isSpecial())
        raiseError("unknown value in attribute '%s'", (*vi)->get_name().c_str());
      prob1 += (*ei).floatV * (*b); 
    }

    prob1 = exp(prob1)/(1+exp(prob1));
  }
  catch (...) {
    if (imputer)
      mldelete example2;
    if (continuizedDomain)
      mldelete example;
    throw;
  }

  if (imputer)
    mldelete example2;
  if (continuizedDomain)
    mldelete example;

  if (classVar->varType == TValue::INTVAR) {
      TDiscDistribution *dist = mlnew TDiscDistribution(classVar);
      PDistribution res = dist;
      dist->addint(0, 1-prob1);
      dist->addint(1, prob1);
      return res;
  }
  else {
      TContDistribution *dist = mlnew TContDistribution(classVar);
      PDistribution res = dist;
      dist->addfloat(prob1, 1.0);
      return res;
  }
}