Ejemplo n.º 1
0
// Builds a probability estimator by applying Laplace (additive) smoothing to
// the given frequency distribution. The unnamed arguments (prior distribution,
// example generator, weight id, attribute id) are unused by this constructor.
PProbabilityEstimator TProbabilityEstimatorConstructor_Laplace::operator()(PDistribution frequencies, PDistribution, PExampleGenerator, const long &, const int &) const
{ TProbabilityEstimator_FromDistribution *pefd = mlnew TProbabilityEstimator_FromDistribution(CLONE(TDistribution, frequencies));
  PProbabilityEstimator estimator = pefd;   // wrap immediately so pefd is owned by a smart pointer
  
  // Laplace smoothing is only applicable to discrete distributions
  TDiscDistribution *ddist = pefd->probabilities.AS(TDiscDistribution);
  if (ddist) {
    const float &abs = ddist->abs;     // sum of (possibly weighted) frequencies
    const float &cases = ddist->cases; // number of cases -- presumably the unweighted count; TODO confirm
    // denominator: cases + l*k, where l is the smoothing constant (a member of
    // this constructor, not a local) and k the number of distinct values
    const float div = cases + l * ddist->noOfElements();
    int i = 0;
    if (div) {
      // If weighted and unweighted totals agree, renormalization is disabled,
      // or abs is effectively zero, smooth the stored frequencies directly ...
      if ((cases == abs) || !renormalize || (abs<1e-20))
        PITERATE(TDiscDistribution, di, ddist)
          ddist->setint(i++, (*di + l) / div);
      else
        // ... otherwise rescale the weighted frequencies to the 'cases' scale first
        PITERATE(TDiscDistribution, di, ddist)
          ddist->setint(i++, (*di / abs * cases + l) / div);
    }
    else
      // degenerate denominator (no cases and l*k == 0): plain normalization
      pefd->probabilities->normalize();
  }
  else
    // non-discrete distribution: Laplace correction undefined, just normalize
    pefd->probabilities->normalize();
  
  return estimator;
}
Ejemplo n.º 2
0
// Fits a logistic regression model on `gen`. Returns the classifier; on
// return, `error` holds the fitter's status code and `errorAt` (when the
// fitter sets it) names the attribute that caused the problem.
PClassifier TLogRegLearner::fitModel(PExampleGenerator gen, const int &weight, int &error, PVariable &errorAt)
{ 
  // optionally impute missing values before fitting
  PImputer imputer = imputerConstructor ? imputerConstructor->call(gen, weight) : PImputer();
  PExampleGenerator imputed = imputer ? imputer->call(gen, weight) : gen;

  // construct classifier	
  TLogRegClassifier *lrc = mlnew TLogRegClassifier(imputed->domain);
  // NOTE(review): dataDescription is built from the original gen->domain, not
  // the imputed domain -- looks intentional (describes raw data), but confirm
  lrc->dataDescription = mlnew TEFMDataDescription(gen->domain, mlnew TDomainDistributions(gen), 0, getMetaID());
  PClassifier cl = lrc;
  lrc->imputer = imputer;

  // continuize the domain and re-map the examples into it
  // (the discrete-attribute check is disabled: continuization always runs)
  //if (imputed->domain->hasDiscreteAttributes(false)) {
    lrc->continuizedDomain = domainContinuizer ? domainContinuizer->call(imputed, weight) : (*logisticRegressionDomainContinuizer)(imputed, weight);
    imputed = mlnew TExampleTable(lrc->continuizedDomain, imputed);
  //}

    // copy class value

  // construct a LR fitter (lazily; note this caches the fitter on the learner)
  fitter = fitter ? fitter : PLogRegFitter(mlnew TLogRegFitter_Cholesky());

  PAttributedFloatList temp_beta, temp_beta_se;
  // fit logistic regression 

  temp_beta = fitter->call(imputed, weight, temp_beta_se, lrc->likelihood, error, errorAt);
  lrc->fit_status = error;

  // transform beta to AttributedList
  // attribute list is headed by the class variable (intercept slot),
  // followed by the continuized attributes
  PVarList enum_attributes = mlnew TVarList(); 
  enum_attributes->push_back(imputed->domain->classVar);
  PITERATE(TVarList, vl, imputed->domain->attributes) 
    enum_attributes->push_back(*vl);
  // transform *beta into a PFloatList
  lrc->beta=mlnew TAttributedFloatList(enum_attributes);
  lrc->beta_se=mlnew TAttributedFloatList(enum_attributes);

  PITERATE(TAttributedFloatList, fi, temp_beta)
    lrc->beta->push_back(*fi);

  PITERATE(TAttributedFloatList, fi_se, temp_beta_se)
    lrc->beta_se->push_back(*fi_se);

  // on serious fitter errors skip the Wald statistics; betas are still returned
  if (error >= TLogRegFitter::Constant) 
    return cl;

  // Wald Z statistics and their p-values for the fitted coefficients
  lrc->wald_Z = computeWaldZ(lrc->beta, lrc->beta_se);
  lrc->P = computeP(lrc->wald_Z);

  // return classifier with domain, beta and standard errors of beta 
  return cl;
}
Ejemplo n.º 3
0
void THierarchicalCluster::recursiveMove(const int &offset)
{
  first += offset;
  last += offset;
  if (branches)
    PITERATE(THierarchicalClusterList, bi, branches)
      (*bi)->recursiveMove(offset);
}
Ejemplo n.º 4
0
// Selects the discrete attribute with the highest score according to `measure`
// and returns a TClassifierFromVarFD that routes examples by that attribute's
// value. Outputs: `descriptions` (branch labels), `subsetSizes` (distribution
// of examples over branches), `quality` (winning score) and `spentAttribute`
// (index of the chosen attribute, which the split consumes). Returns nothing
// (via returnNothing) when no candidate wins or the best score is below
// `worstAcceptable`.
PClassifier TTreeSplitConstructor_Attribute::operator()(
                             PStringList &descriptions, PDiscDistribution &subsetSizes, float &quality, int &spentAttribute,

                             PExampleGenerator gen, const int &weightID,
                             PDomainContingency dcont, PDistribution apriorClass,
                             const vector<bool> &candidates,
                             PClassifier nodeClassifier
                            )
{ checkProperty(measure);

  measure->checkClassTypeExc(gen->domain->classVar->varType);

  // an empty candidate list means "consider every attribute"
  bool cse = candidates.size()==0;
  vector<bool>::const_iterator ci(candidates.begin()), ce(candidates.end());
  if (!cse) {
    if (noCandidates(candidates))
      return returnNothing(descriptions, subsetSizes, quality, spentAttribute);

    ci = candidates.begin();
  }

  // example count seeds the RNG used for random tie-breaking below;
  // fall back to the contingency's case count when no generator is given
  int N = gen ? gen->numberOfExamples() : -1;
  if (N<0)
    N = dcont->classes->cases;
  TSimpleRandomGenerator rgen(N);

  int thisAttr = 0, bestAttr = -1, wins = 0;
  quality = 0.0;

  // Three evaluation paths depending on what the measure needs:
  // class contingencies, full domain contingencies, or raw examples.
  if (measure->needs == TMeasureAttribute::Contingency_Class) {
    // restrict candidates to discrete (INTVAR) attributes
    vector<bool> myCandidates;
    if (cse) {
      myCandidates.reserve(gen->domain->attributes->size());
      PITERATE(TVarList, vi, gen->domain->attributes)
        myCandidates.push_back((*vi)->varType == TValue::INTVAR);
    }
    else {
      myCandidates.reserve(candidates.size());
      TVarList::const_iterator vi(gen->domain->attributes->begin());
      for(; ci != ce; ci++, vi++)
        myCandidates.push_back(*ci && ((*vi)->varType == TValue::INTVAR));
    }

    // (re)compute contingencies if absent or oriented the wrong way
    if (!dcont || dcont->classIsOuter)
      dcont = PDomainContingency(mlnew TDomainContingency(gen, weightID, myCandidates));

    ci = myCandidates.begin();
    ce = myCandidates.end();
    TDomainContingency::iterator dci(dcont->begin()), dce(dcont->end());
    for(; (ci != ce) && (dci!=dce); dci++, ci++, thisAttr++)
      if (*ci && checkDistribution((const TDiscDistribution &)((*dci)->outerDistribution.getReference()), minSubset)) {
        float thisMeas = measure->call(thisAttr, dcont, apriorClass);

        // Keep the best score; exact ties are broken uniformly at random via
        // rgen.randbool(++wins) (reservoir sampling over equally good
        // attributes). (wins=1) is an intentional assignment resetting the
        // tie counter on a strict improvement.
        if (   ((!wins || (thisMeas>quality)) && ((wins=1)==1))
            || ((thisMeas==quality) && rgen.randbool(++wins))) {
          quality = thisMeas;
          subsetSizes = (*dci)->outerDistribution;
          bestAttr = thisAttr;
        }
      }
  }

  else if (measure->needs == TMeasureAttribute::DomainContingency) {
    if (!dcont || dcont->classIsOuter)
      dcont = PDomainContingency(mlnew TDomainContingency(gen, weightID));

    TDomainContingency::iterator dci(dcont->begin()), dce(dcont->end());
    // here discreteness is checked per-attribute inside the loop instead of
    // prefiltering a candidate vector as above
    for(; (cse || (ci!=ce)) && (dci!=dce); dci++, thisAttr++)
      if (    (cse || *(ci++))
           && ((*dci)->outerVariable->varType==TValue::INTVAR)
           && checkDistribution((const TDiscDistribution &)((*dci)->outerDistribution.getReference()), minSubset)) {
        float thisMeas = measure->call(thisAttr, dcont, apriorClass);

        // same best-with-random-tie-break bookkeeping as above
        if (   ((!wins || (thisMeas>quality)) && ((wins=1)==1))
            || ((thisMeas==quality) && rgen.randbool(++wins))) {
          quality = thisMeas;
          subsetSizes = (*dci)->outerDistribution;
          bestAttr = thisAttr;
        }
      }
  }

  else {
    // measure works from raw examples; build per-attribute value distributions
    TDomainDistributions ddist(gen, weightID);

    // end()-1: the last distribution presumably belongs to the class variable,
    // so it is excluded from the scan -- confirm against TDomainDistributions
    TDomainDistributions::iterator ddi(ddist.begin()), dde(ddist.end()-1);
    for(; (cse || (ci!=ce)) && (ddi!=dde); ddi++, thisAttr++)
      if (cse || *(ci++)) {
        // only discrete attributes yield a TDiscDistribution; others are skipped
        TDiscDistribution *discdist = (*ddi).AS(TDiscDistribution);
        if (discdist && checkDistribution(*discdist, minSubset)) {
          float thisMeas = measure->call(thisAttr, gen, apriorClass, weightID);

          // same best-with-random-tie-break bookkeeping as above
          if (   ((!wins || (thisMeas>quality)) && ((wins=1)==1))
              || ((thisMeas==quality) && rgen.randbool(++wins))) {
            quality = thisMeas;
            subsetSizes = PDiscDistribution(*ddi); // not discdist - this would be double wrapping!
            bestAttr = thisAttr;
          }
        }
      }
    
  }

  if (!wins)
    return returnNothing(descriptions, subsetSizes, quality, spentAttribute);

  // 3-argument overload: `quality` keeps the (too low) winning score so the
  // caller can still inspect it -- presumably why this call differs from the
  // two above
  if (quality<worstAcceptable)
    return returnNothing(descriptions, subsetSizes, spentAttribute);

  // branch labels: the attribute's value names when available, else blanks
  PVariable attribute = gen->domain->attributes->at(bestAttr);
  TEnumVariable *evar = attribute.AS(TEnumVariable);
  if (evar)
    descriptions = mlnew TStringList(evar->values.getReference());
  else
    descriptions = mlnew TStringList(subsetSizes->size(), "");

  spentAttribute = bestAttr;

  // the split classifier maps an example to its value of the chosen attribute
  TClassifierFromVarFD *cfv = mlnew TClassifierFromVarFD(attribute, gen->domain, bestAttr, subsetSizes);
  cfv->transformUnknowns = false;
  return cfv;
}