void TMultiClassifier::predictionAndDistribution(const TExample &ex, PValueList &classValues, PDistributionList &classDists) { if (computesProbabilities) { classDists = classDistribution(ex); PValueList classValues = new TValueList(); TValue value; PVariable classVar; for (int i = 0; i < classVars->size(); i++) { classVar = classVars->at(i); value = classVar->varType == TValue::FLOATVAR ? TValue(classDists->at(i)->average()) : classDists->at(i)->highestProbValue(ex); classValues->push_back(value); } } else { classValues = operator()(ex); PDistributionList classDist = new TDistributionList(); PDistribution dist; PVariable classVar; for (int i = 0; i < classVars->size(); i++) { classVar = classVars->at(i); dist = TDistribution::create(classVar); dist->add(classValues->at(i)); classDist->push_back(dist); } } }
/* Fallback implementation of classDistribution.

   Raises an error when computesProbabilities is set. Otherwise it creates a
   distribution for classVar, adds the value predicted by operator()(exam)
   to it and returns that distribution. */
PDistribution TClassifier::classDistribution(const TExample &exam)
{
  if (computesProbabilities)
    raiseError("invalid setting of 'computesProbabilities'");

  PDistribution predictedDist = TDistribution::create(classVar);
  predictedDist->add(this->operator()(exam));
  return predictedDist;
}
/* Returns both the predicted value and the class distribution for 'ex'
   through the out-parameters 'val' and 'classDist'.

   Without probability support the value is predicted first and the
   distribution is created for classVar with that value added to it.
   With probability support the distribution is obtained first and the value
   is read from it: the average for FLOATVAR class variables, the
   highest-probability value otherwise. */
void TClassifier::predictionAndDistribution(const TExample &ex, TValue &val, PDistribution &classDist)
{
  if (!computesProbabilities) {
    val = operator()(ex);
    classDist = TDistribution::create(classVar);
    classDist->add(val);
    return;
  }

  classDist = classDistribution(ex);
  if (classVar->varType == TValue::FLOATVAR)
    val = TValue(classDist->average());
  else
    val = classDist->highestProbValue(ex);
}
/* Builds an estimator from a clone of 'frequencies', blending it with the
   'apriori' distribution using the member 'm'.

   When the cloned distribution is discrete, has cases > 1e-20 and an apriori
   distribution is given, each element i is replaced by
       (f_i + a_i * m / apriori.abs) / (cases + m)
   where f_i is the stored frequency; if 'renormalize' is set and abs differs
   from cases, f_i is first rescaled as f_i / abs * cases.
   In every other situation the cloned distribution is simply normalized.

   Raises an error if 'apriori' is not a discrete distribution or its abs is
   below 1e-20. */
PProbabilityEstimator TProbabilityEstimatorConstructor_m::operator()(PDistribution frequencies, PDistribution apriori, PExampleGenerator, const long &weightID, const int &) const
{
  TProbabilityEstimator_FromDistribution *fromDist = mlnew TProbabilityEstimator_FromDistribution(CLONE(TDistribution, frequencies));
  PProbabilityEstimator estimator = fromDist;

  TDiscDistribution *freqs = fromDist->probabilities.AS(TDiscDistribution);

  if (freqs && (freqs->cases > 1e-20) && apriori) {
    TDiscDistribution *prior = apriori.AS(TDiscDistribution);
    if (!prior || (prior->abs < 1e-20))
      raiseError("invalid apriori distribution");

    float priorScale = m/prior->abs;

    // NOTE(review): these stay references on purpose - they are re-read on
    // every iteration, so if setint() updates the fields the loop sees the
    // updated values. Confirm whether that is intended before changing it.
    const float &absTotal = freqs->abs;
    const float &caseCount = freqs->cases;
    const float denominator = caseCount + m;

    // Decided once, before any element is modified.
    const bool useRawCounts = (absTotal==caseCount) || !renormalize;

    TDiscDistribution::iterator fi(freqs->begin()), fe(freqs->end()), pi(prior->begin());
    for (int idx = 0; fi != fe; fi++, pi++, idx++) {
      if (useRawCounts)
        freqs->setint(idx, (*fi+*pi*priorScale)/denominator);
      else
        freqs->setint(idx, (*fi / absTotal * caseCount + *pi*priorScale)/denominator);
    }
  }
  else {
    fromDist->probabilities->normalize();
  }

  return estimator;
}
/* Fallback implementation of classDistribution for multi-target classifiers.

   Raises an error when computesProbabilities is set. Otherwise it predicts
   the class values with operator()(exam) and, for every variable in
   classVars, creates a distribution for that variable with the corresponding
   predicted value added to it.

   Returns the list of these distributions, one per class variable. */
PDistributionList TMultiClassifier::classDistribution(const TExample &exam)
{
  if (computesProbabilities)
    raiseError("invalid setting of 'computesProbabilities'");

  PDistributionList classDists = new TDistributionList();
  PDistribution dist;
  PVariable classVar;

  PValueList classValues = operator()(exam);

  for (int i = 0; i < classVars->size(); i++) {
    classVar = classVars->at(i);
    dist = TDistribution::create(classVar);
    dist->add(classValues->at(i));
    classDists->push_back(dist);
  }

  // Return the whole list, not 'dist', which only holds the distribution
  // created in the last iteration.
  return classDists;
}
/* Builds an estimator from a continuous frequency distribution by passing it
   through loess() with the members nPoints, windowProportion and
   distributionMethod, and wrapping the resulting curve in a
   TContDistribution.

   Raises an error if 'frequencies' is not a continuous distribution (naming
   the attribute when the distribution has a variable) or if it is empty.
   The apriori distribution, example generator, weightID and attrNo are not
   used here. */
PProbabilityEstimator TProbabilityEstimatorConstructor_loess::operator()(PDistribution frequencies, PDistribution, PExampleGenerator, const long &weightID, const int &attrNo) const
{
  TContDistribution *contFreqs = frequencies.AS(TContDistribution);

  if (!contFreqs) {
    // Prefer the message that names the offending attribute when we know it.
    if (frequencies && frequencies->variable) {
      raiseError("attribute '%s' is not continuous", frequencies->variable->get_name().c_str());
    }
    else {
      raiseError("continuous distribution expected");
    }
  }

  if (!contFreqs->size()) {
    raiseError("empty distribution");
  }

  map<float, float> smoothedCurve;
  loess(contFreqs->distribution, nPoints, windowProportion, smoothedCurve, distributionMethod);

  return mlnew TProbabilityEstimator_FromDistribution(mlnew TContDistribution(smoothedCurve));
}
/* Builds an estimator whose curve is a kernel-smoothed version of a
   continuous frequency distribution.

   The curve is evaluated at the points chosen by distributePoints() (nPoints
   of them). At each point x the value is the sum over the contributing
   entries of  weight * exp(-0.5 * ((x - value)/h)^2),  divided by
   h*sqrt(2*pi) and by (sum of contributing weights) * h.

   If minImpact > 0, only entries within t of x contribute, where
   t = h * sqrt(-2 * log(minImpact * h * sqrt(2*pi))); points with no entries
   in range get 0. If that logarithm term is not positive (minImpact too
   high), the whole curve is set to 0.

   Raises an error if 'frequencies' is not a continuous distribution, if it
   is empty, or if minImpact is outside [0, 1].
   The apriori distribution, example generator and weightID are not used. */
PProbabilityEstimator TProbabilityEstimatorConstructor_kernel::operator()(PDistribution frequencies, PDistribution apriori, PExampleGenerator, const long &weightID, const int &) const
{
  TContDistribution *cdist = frequencies.AS(TContDistribution);
  if (!cdist)
    raiseError("continuous distribution expected");
  if (!cdist->size())
    raiseError("empty distribution");
  if ((minImpact<0.0) || (minImpact>1.0))
    raiseError("'minImpact' should be between 0.0 and 1.0 (not %5.3f)", minImpact);

  vector<float> points;
  distributePoints(cdist->distribution, nPoints, points);

  TContDistribution *curve = mlnew TContDistribution(frequencies->variable);
  PDistribution wcurve = curve;

  /* Bandwidth suggested by Chad Shaw. Also found in http://www.stat.lsa.umich.edu/~kshedden/Courses/Stat606/Notes/interpolate.pdf */
  const float h = smoothing * sqrt(cdist->error()) * exp(- 1.0/5.0 * log(cdist->abs)); // 1.144
  const float hsqrt2pi = h * 2.5066282746310002; // 2.5066... == sqrt(2*pi)

  // Only assigned and only read when minImpact > 0.
  float t;
  if (minImpact>0) {
    t = -2 * log(minImpact*hsqrt2pi);
    if (t<=0) {
      // minImpact too high, but that's user's problem...
      ITERATE(vector<float>, pi, points)
        curve->setfloat(*pi, 0.0);
      // Wrap the all-zero curve in an estimator, exactly as the regular exit
      // below does; returning the bare distribution would hand the caller an
      // object of the wrong type.
      return mlnew TProbabilityEstimator_FromDistribution(curve);
    }
    else
      t = h * sqrt(t);
  }

  ITERATE(vector<float>, pi, points) {
    const float &x = *pi;
    TContDistribution::const_iterator from, to;

    if (minImpact>0) {
      // Restrict the sum to entries in the window around x.
      from = cdist->lower_bound(x-t);
      to = cdist->lower_bound(x+t);
      if ((from==cdist->end()) || (to==cdist->begin()) || (from==to)) {
        curve->setfloat(x, 0.0);
        continue;
      }
    }
    else {
      from = cdist->begin();
      to = cdist->end();
    }

    float p = 0.0, n = 0.0;
    for(; from != to; from++) {
      n += (*from).second;
      p += (*from).second * exp( - 0.5 * sqr( (x - (*from).first)/h ) );
    }

    curve->setfloat(x, p/hsqrt2pi/(n*h)); // hsqrt2pi is from the inside (errf), n*h is for the sum average
  }

  return mlnew TProbabilityEstimator_FromDistribution(curve);
}