// Builds a TProbabilityEstimator_FromDistribution from a clone of `frequencies`.
//
// For a discrete distribution every element x is replaced by
//     (x + l) / (cases + l * noOfElements())
// where `l` is a member of this constructor (declared outside this view).
// When the `renormalize` member is set, `cases` differs from `abs` and `abs`
// is not (nearly) zero, x is first rescaled by cases/abs.
//
// If the distribution is not discrete, or the denominator is zero, the cloned
// distribution is simply normalize()d instead.
//
// The remaining (unnamed) parameters are ignored.
PProbabilityEstimator TProbabilityEstimatorConstructor_Laplace::operator()(PDistribution frequencies, PDistribution, PExampleGenerator, const long &, const int &) const
{
  TProbabilityEstimator_FromDistribution *fromDist = mlnew TProbabilityEstimator_FromDistribution(CLONE(TDistribution, frequencies));
  PProbabilityEstimator wrapped = fromDist;

  TDiscDistribution *disc = fromDist->probabilities.AS(TDiscDistribution);
  if (!disc) {
    // Not a discrete distribution: no correction is applied, only normalization.
    fromDist->probabilities->normalize();
    return wrapped;
  }

  // NOTE(review): these are references into the very distribution that is
  // rewritten below. If setint() updates abs/cases, the rescaling loop reads
  // the running totals rather than the initial ones - confirm this is intended.
  const float &absTotal = disc->abs;
  const float &caseCount = disc->cases;
  const float denominator = caseCount + l * disc->noOfElements();

  if (!denominator) {
    // Division would be by zero; fall back to plain normalization.
    fromDist->probabilities->normalize();
    return wrapped;
  }

  // Rescale counts by cases/abs only when requested, when the two totals
  // actually differ, and when abs is large enough to divide by.
  const bool rescale = renormalize && (caseCount != absTotal) && !(absTotal < 1e-20);

  int position = 0;
  if (rescale) {
    PITERATE(TDiscDistribution, di, disc)
      disc->setint(position++, (*di / absTotal * caseCount + l) / denominator);
  }
  else {
    PITERATE(TDiscDistribution, di, disc)
      disc->setint(position++, (*di + l) / denominator);
  }

  return wrapped;
}
PClassifier TLogRegLearner::fitModel(PExampleGenerator gen, const int &weight, int &error, PVariable &errorAt) { PImputer imputer = imputerConstructor ? imputerConstructor->call(gen, weight) : PImputer(); PExampleGenerator imputed = imputer ? imputer->call(gen, weight) : gen; // construct classifier TLogRegClassifier *lrc = mlnew TLogRegClassifier(imputed->domain); lrc->dataDescription = mlnew TEFMDataDescription(gen->domain, mlnew TDomainDistributions(gen), 0, getMetaID()); PClassifier cl = lrc; lrc->imputer = imputer; //if (imputed->domain->hasDiscreteAttributes(false)) { lrc->continuizedDomain = domainContinuizer ? domainContinuizer->call(imputed, weight) : (*logisticRegressionDomainContinuizer)(imputed, weight); imputed = mlnew TExampleTable(lrc->continuizedDomain, imputed); //} // copy class value // construct a LR fitter fitter = fitter ? fitter : PLogRegFitter(mlnew TLogRegFitter_Cholesky()); PAttributedFloatList temp_beta, temp_beta_se; // fit logistic regression temp_beta = fitter->call(imputed, weight, temp_beta_se, lrc->likelihood, error, errorAt); lrc->fit_status = error; // transform beta to AttributedList PVarList enum_attributes = mlnew TVarList(); enum_attributes->push_back(imputed->domain->classVar); PITERATE(TVarList, vl, imputed->domain->attributes) enum_attributes->push_back(*vl); // tranfsorm *beta into a PFloatList lrc->beta=mlnew TAttributedFloatList(enum_attributes); lrc->beta_se=mlnew TAttributedFloatList(enum_attributes); PITERATE(TAttributedFloatList, fi, temp_beta) lrc->beta->push_back(*fi); PITERATE(TAttributedFloatList, fi_se, temp_beta_se) lrc->beta_se->push_back(*fi_se); if (error >= TLogRegFitter::Constant) return cl; lrc->wald_Z = computeWaldZ(lrc->beta, lrc->beta_se); lrc->P = computeP(lrc->wald_Z); // return classifier with domain, beta and standard errors of beta return cl; }
void THierarchicalCluster::recursiveMove(const int &offset) { first += offset; last += offset; if (branches) PITERATE(THierarchicalClusterList, bi, branches) (*bi)->recursiveMove(offset); }
PClassifier TTreeSplitConstructor_Attribute::operator()( PStringList &descriptions, PDiscDistribution &subsetSizes, float &quality, int &spentAttribute, PExampleGenerator gen, const int &weightID, PDomainContingency dcont, PDistribution apriorClass, const vector<bool> &candidates, PClassifier nodeClassifier ) { checkProperty(measure); measure->checkClassTypeExc(gen->domain->classVar->varType); bool cse = candidates.size()==0; vector<bool>::const_iterator ci(candidates.begin()), ce(candidates.end()); if (!cse) { if (noCandidates(candidates)) return returnNothing(descriptions, subsetSizes, quality, spentAttribute); ci = candidates.begin(); } int N = gen ? gen->numberOfExamples() : -1; if (N<0) N = dcont->classes->cases; TSimpleRandomGenerator rgen(N); int thisAttr = 0, bestAttr = -1, wins = 0; quality = 0.0; if (measure->needs == TMeasureAttribute::Contingency_Class) { vector<bool> myCandidates; if (cse) { myCandidates.reserve(gen->domain->attributes->size()); PITERATE(TVarList, vi, gen->domain->attributes) myCandidates.push_back((*vi)->varType == TValue::INTVAR); } else { myCandidates.reserve(candidates.size()); TVarList::const_iterator vi(gen->domain->attributes->begin()); for(; ci != ce; ci++, vi++) myCandidates.push_back(*ci && ((*vi)->varType == TValue::INTVAR)); } if (!dcont || dcont->classIsOuter) dcont = PDomainContingency(mlnew TDomainContingency(gen, weightID, myCandidates)); ci = myCandidates.begin(); ce = myCandidates.end(); TDomainContingency::iterator dci(dcont->begin()), dce(dcont->end()); for(; (ci != ce) && (dci!=dce); dci++, ci++, thisAttr++) if (*ci && checkDistribution((const TDiscDistribution &)((*dci)->outerDistribution.getReference()), minSubset)) { float thisMeas = measure->call(thisAttr, dcont, apriorClass); if ( ((!wins || (thisMeas>quality)) && ((wins=1)==1)) || ((thisMeas==quality) && rgen.randbool(++wins))) { quality = thisMeas; subsetSizes = (*dci)->outerDistribution; bestAttr = thisAttr; } } } else if (measure->needs == 
TMeasureAttribute::DomainContingency) { if (!dcont || dcont->classIsOuter) dcont = PDomainContingency(mlnew TDomainContingency(gen, weightID)); TDomainContingency::iterator dci(dcont->begin()), dce(dcont->end()); for(; (cse || (ci!=ce)) && (dci!=dce); dci++, thisAttr++) if ( (cse || *(ci++)) && ((*dci)->outerVariable->varType==TValue::INTVAR) && checkDistribution((const TDiscDistribution &)((*dci)->outerDistribution.getReference()), minSubset)) { float thisMeas = measure->call(thisAttr, dcont, apriorClass); if ( ((!wins || (thisMeas>quality)) && ((wins=1)==1)) || ((thisMeas==quality) && rgen.randbool(++wins))) { quality = thisMeas; subsetSizes = (*dci)->outerDistribution; bestAttr = thisAttr; } } } else { TDomainDistributions ddist(gen, weightID); TDomainDistributions::iterator ddi(ddist.begin()), dde(ddist.end()-1); for(; (cse || (ci!=ce)) && (ddi!=dde); ddi++, thisAttr++) if (cse || *(ci++)) { TDiscDistribution *discdist = (*ddi).AS(TDiscDistribution); if (discdist && checkDistribution(*discdist, minSubset)) { float thisMeas = measure->call(thisAttr, gen, apriorClass, weightID); if ( ((!wins || (thisMeas>quality)) && ((wins=1)==1)) || ((thisMeas==quality) && rgen.randbool(++wins))) { quality = thisMeas; subsetSizes = PDiscDistribution(*ddi); // not discdist - this would be double wrapping! bestAttr = thisAttr; } } } } if (!wins) return returnNothing(descriptions, subsetSizes, quality, spentAttribute); if (quality<worstAcceptable) return returnNothing(descriptions, subsetSizes, spentAttribute); PVariable attribute = gen->domain->attributes->at(bestAttr); TEnumVariable *evar = attribute.AS(TEnumVariable); if (evar) descriptions = mlnew TStringList(evar->values.getReference()); else descriptions = mlnew TStringList(subsetSizes->size(), ""); spentAttribute = bestAttr; TClassifierFromVarFD *cfv = mlnew TClassifierFromVarFD(attribute, gen->domain, bestAttr, subsetSizes); cfv->transformUnknowns = false; return cfv; }