PProbabilityEstimator TProbabilityEstimatorConstructor_m::operator()(PDistribution frequencies, PDistribution apriori, PExampleGenerator, const long &weightID, const int &) const
{
  TProbabilityEstimator_FromDistribution *pefd = mlnew TProbabilityEstimator_FromDistribution(CLONE(TDistribution, frequencies));
  PProbabilityEstimator estimator = pefd;

  TDiscDistribution *ddist = pefd->probabilities.AS(TDiscDistribution);
  if (ddist && (ddist->cases > 1e-20) && apriori) {
    TDiscDistribution *dapriori = apriori.AS(TDiscDistribution);
    if (!dapriori || (dapriori->abs < 1e-20))
      raiseError("invalid apriori distribution");

    float mabs = m / dapriori->abs;
    const float &abs = ddist->abs;
    const float &cases = ddist->cases;
    const float div = cases + m;

    if ((abs == cases) || !renormalize) {
      int i = 0;
      for (TDiscDistribution::iterator di(ddist->begin()), de(ddist->end()), ai(dapriori->begin()); di != de; di++, ai++, i++)
        ddist->setint(i, (*di + *ai * mabs) / div);
    }
    else {
      int i = 0;
      for (TDiscDistribution::iterator di(ddist->begin()), de(ddist->end()), ai(dapriori->begin()); di != de; di++, ai++, i++)
        ddist->setint(i, (*di / abs * cases + *ai * mabs) / div);
    }
  }
  else
    pefd->probabilities->normalize();

  return estimator;
}
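/* Summary of the m-estimate computed above: for each class value i, the estimate is
       p(i) = (f_i + m * apriori_i / apriori.abs) / (cases + m),
   i.e. the observed frequencies are pulled towards the normalized apriori distribution
   by m "virtual" examples. Illustrative numbers (not from the library): with
   frequencies (3, 1), apriori (0.5, 0.5) and m = 2, the estimates are
   (3 + 1) / (4 + 2) = 0.667 and (1 + 1) / (4 + 2) = 0.333. When 'renormalize' is set
   and the examples are weighted (abs != cases), the frequencies are first rescaled by
   cases/abs so that they sum to the number of cases. */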
PProbabilityEstimator TProbabilityEstimatorConstructor_Laplace::operator()(PDistribution frequencies, PDistribution, PExampleGenerator, const long &, const int &) const
{
  TProbabilityEstimator_FromDistribution *pefd = mlnew TProbabilityEstimator_FromDistribution(CLONE(TDistribution, frequencies));
  PProbabilityEstimator estimator = pefd;

  TDiscDistribution *ddist = pefd->probabilities.AS(TDiscDistribution);
  if (ddist) {
    const float &abs = ddist->abs;
    const float &cases = ddist->cases;
    const float div = cases + l * ddist->noOfElements();
    int i = 0;
    if (div) {
      if ((cases == abs) || !renormalize || (abs < 1e-20))
        PITERATE(TDiscDistribution, di, ddist)
          ddist->setint(i++, (*di + l) / div);
      else
        PITERATE(TDiscDistribution, di, ddist)
          ddist->setint(i++, (*di / abs * cases + l) / div);
    }
    else
      pefd->probabilities->normalize();
  }
  else
    pefd->probabilities->normalize();

  return estimator;
}
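/* Summary of the Laplace estimate computed above: with k = ddist->noOfElements()
   class values and smoothing parameter l, each probability is
       p(i) = (f_i + l) / (cases + l * k).
   Illustrative numbers (not from the library): with l = 1 and frequencies (3, 1),
   the estimates are 4/6 and 2/6. As in the m-estimate, weighted frequencies are
   rescaled to sum to the number of cases when 'renormalize' is set; if the
   denominator is zero, the distribution is simply normalized instead. */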
// Rejects the split if there are fewer than two non-empty branches
// or if there is a non-empty branch with fewer than minSubset examples.
bool checkDistribution(const TDiscDistribution &dist, const float &minSubset)
{
  int nonzero = 0;
  for (TDiscDistribution::const_iterator dvi(dist.begin()), dve(dist.end()); dvi != dve; dvi++)
    if (*dvi > 0) {
      if (*dvi < minSubset)
        return false;
      nonzero++;
    }
  return nonzero >= 2;
}
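/* Illustrative (hypothetical) values: with minSubset = 5, branch sizes (12, 0, 7)
   pass the check (two non-empty branches, each with at least 5 examples), while
   (12, 3) fails because the second non-empty branch holds fewer than minSubset examples. */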
PConditionalProbabilityEstimator TConditionalProbabilityEstimatorConstructor_loess::operator()(PContingency frequencies, PDistribution, PExampleGenerator, const long &, const int &) const
{
  if (frequencies->varType != TValue::FLOATVAR) {
    if (frequencies->outerVariable)
      raiseError("attribute '%s' is not continuous", frequencies->outerVariable->get_name().c_str());
    else
      raiseError("continuous attribute expected for condition");
  }

  if (!frequencies->continuous->size())
    // This is ugly, but: if you change this, you should also change the code which
    // catches it in the Bayesian learner.
    raiseError("distribution (of attribute values, probably) is empty or has only a single value");

  PContingency cont = CLONE(TContingency, frequencies);
  const TDistributionMap &points = *frequencies->continuous;

/*
  if (frequencies->continuous->size() == 1) {
    TDiscDistribution *f = (TDiscDistribution *)(points.begin()->second.getUnwrappedPtr());
    f->normalize();
    f->variances = mlnew TFloatList(f->size(), 0.0);
    return mlnew TConditionalProbabilityEstimator_FromDistribution(cont);
  }
*/

  cont->continuous->clear();

  vector<float> xpoints;
  distributePoints(points, nPoints, xpoints, distributionMethod);

  if (!xpoints.size())
    raiseError("no points for the curve (check 'nPoints')");

  if (frequencies->continuous->size() == 1) {
    TDiscDistribution *f = (TDiscDistribution *)(points.begin()->second.getUnwrappedPtr());
    f->normalize();
    f->variances = mlnew TFloatList(f->size(), 0.0);
    const_ITERATE(vector<float>, pi, xpoints)
      (*cont->continuous)[*pi] = f;
    return mlnew TConditionalProbabilityEstimator_FromDistribution(cont);
  }

  TDistributionMap::const_iterator lowedge = points.begin();
  TDistributionMap::const_iterator highedge = points.end();

  bool needAll;
  map<float, PDistribution>::const_iterator from, to;

  vector<float>::const_iterator pi(xpoints.begin()), pe(xpoints.end());
  float refx = *pi;

  from = lowedge;
  to = highedge;

  int totalNumOfPoints = frequencies->outerDistribution->abs;
  int needpoints = int(ceil(totalNumOfPoints * windowProportion));
  if (needpoints < 3)
    needpoints = 3;

  TSimpleRandomGenerator rgen(frequencies->outerDistribution->cases);

  if ((needpoints <= 0) || (needpoints >= totalNumOfPoints)) { // points.size()
    needAll = true;
    from = lowedge;
    to = highedge;
  }
  else {
    needAll = false;

    /* Find the window */
    from = points.lower_bound(refx);
    to = points.upper_bound(refx);
    if (from == to) {
      if (to != highedge)
        to++;
      else
        from--;
    }

    /* Extend the interval; we set 'from' to highedge when it would go beyond lowedge,
       to indicate that only 'to' can be modified now */
    while (needpoints > 0) {
      if ((to == highedge) || ((from != highedge) && (refx - (*from).first < (*to).first - refx))) {
        if (from == lowedge)
          from = highedge;
        else {
          from--;
          needpoints -= (*from).second->cases;
        }
      }
      else {
        to++;
        if (to != highedge)
          needpoints -= (*to).second->cases;
        else
          needpoints = 0;
      }
    }

    if (from == highedge)
      from = lowedge;
    /* else
      from++; */
  }

  int numOfOverflowing = 0;

  // This follows http://www-2.cs.cmu.edu/afs/cs/project/jair/pub/volume4/cohn96a-html/node7.html
  for (;;) {
    TDistributionMap::const_iterator tt = to;
    --tt;
    if (tt == from) {
      TDistribution *Sy = CLONE(TDistribution, (*tt).second);
      PDistribution wSy = Sy;
      Sy->normalize();
      (*cont->continuous)[refx] = (wSy);
      ((TDiscDistribution *)(Sy))->variances = mlnew TFloatList(Sy->variable->noOfValues(), 0.0);
    }
    else {
      float h = (refx - (*from).first);
      if ((*tt).first - refx > h)
        h = ((*tt).first - refx);

      /* Iterate through the window */
      tt = from;
      const float &x = (*tt).first;
      const PDistribution &y = (*tt).second;

      float cases = y->abs;

      float w = fabs(refx - x) / h;
      w = 1 - w*w*w;
      w = w*w*w;

      const float num = y->abs; // number of instances with this x value
      float n = w * num;
      float Sww = w * w * num;
      float Sx = w * x * num;
      float Swwx = w * w * x * num;
      float Swwxx = w * w * x * x * num;

      TDistribution *Sy = CLONE(TDistribution, y);
      PDistribution wSy = Sy;
      *Sy *= w;

      float Sxx = w * x * x * num;

      TDistribution *Syy = CLONE(TDistribution, y);
      PDistribution wSyy = Syy;
      *Syy *= w;

      TDistribution *Sxy = CLONE(TDistribution, y);
      PDistribution wSxy = Sxy;
      *Sxy *= w * x;

      if (tt != to)
        while (++tt != to) {
          const float &x = (*tt).first;
          const PDistribution &y = (*tt).second;
          cases += y->abs;

          w = fabs(refx - x) / h;
          w = 1 - w*w*w;
          w = w*w*w;

          const float num = y->abs;
          n += w * num;
          Sww += w * w * num;
          Sx += w * x * num;
          Swwx += w * w * x * num;
          Swwxx += w * w * x * x * num;
          Sxx += w * x * x * num;

          TDistribution *ty = CLONE(TDistribution, y);
          PDistribution wty = ty;
          *ty *= w;
          *Sy += wty;
          *Syy += wty;
          *ty *= x;
          *Sxy += wty;
          //*ty *= PDistribution(y);
        }

      float sigma_x2 = n < 1e-6 ? 0.0 : (Sxx - Sx * Sx / n) / n;

      if (sigma_x2 < 1e-10) {
        *Sy *= 0;
        Sy->cases = cases;
        (*cont->continuous)[refx] = (wSy);
      }

      TDistribution *sigma_y2 = CLONE(TDistribution, Sy);
      PDistribution wsigma_y2 = sigma_y2;
      *sigma_y2 *= wsigma_y2;
      *sigma_y2 *= -1/n;
      *sigma_y2 += wSyy;
      *sigma_y2 *= 1/n;

      TDistribution *sigma_xy = CLONE(TDistribution, Sy);
      PDistribution wsigma_xy = sigma_xy;
      *sigma_xy *= -Sx/n;
      *sigma_xy += wSxy;
      *sigma_xy *= 1/n;

      // This will be sigma_xy / sigma_x2, but we'll multiply it by whatever we need
      TDistribution *sigma_tmp = CLONE(TDistribution, sigma_xy);
      PDistribution wsigma_tmp = sigma_tmp;
      //*sigma_tmp *= wsigma_tmp;
      if (sigma_x2 > 1e-10)
        *sigma_tmp *= 1/sigma_x2;

      const float difx = refx - Sx/n;

      // computation of y
      *sigma_tmp *= difx;
      *Sy *= 1/n;
      *Sy += *sigma_tmp;

      // Probabilities higher than 0.9 are squashed with a logistic function, which has two
      // positive effects: it prevents overfitting and avoids probabilities above 1.0. On the
      // other hand, this solution is rather unmathematical. The same is done for
      // probabilities below 0.1.
      vector<float>::iterator syi(((TDiscDistribution *)(Sy))->distribution.begin());
      vector<float>::iterator sye(((TDiscDistribution *)(Sy))->distribution.end());
      for (; syi != sye; syi++) {
        if (*syi > 0.9) {
          Sy->abs -= *syi;
          *syi = 1/(1+exp(-10*((*syi)-0.9)*log(9.0)-log(9.0)));
          Sy->abs += *syi;
        }
        if (*syi < 0.1) {
          Sy->abs -= *syi;
          *syi = 1/(1+exp(10*(0.1-(*syi))*log(9.0)+log(9.0)));
          Sy->abs += *syi;
        }
      }

      Sy->cases = cases;
      Sy->normalize();
      (*cont->continuous)[refx] = (wSy);

      // Now for the variance:
      // restore sigma_tmp and compute the conditional sigma
      if ((fabs(difx) > 1e-10) && (sigma_x2 > 1e-10)) {
        *sigma_tmp *= (1/difx);
        *sigma_tmp *= wsigma_xy;
        *sigma_tmp *= -1;
        *sigma_tmp += wsigma_y2;

        // fct corresponds to the part of (10) in the brackets (see URL above)
        // float fct = Sww + difx*difx/sigma_x2/sigma_x2 * (Swwxx - 2/n * Sx*Swwx + 2/n/n * Sx*Sx*Sww);
        float fct = 1 + difx*difx/sigma_x2;
        // n + difx*difx/sigma_x2+n*n --- add this product to the overall fct sum if you are
        // estimating the error for a single user and not for the line.
        *sigma_tmp *= fct/n; // fct/n/n;
      }

      ((TDiscDistribution *)(Sy))->variances = mlnew TFloatList(((TDiscDistribution *)(sigma_tmp))->distribution);
    }

    // On to the next point
    pi++;
    if (pi == pe)
      break;

    refx = *pi;

    // Adjust the window
    while (to != highedge) {
      float dif = (refx - (*from).first) - ((*to).first - refx);
      if ((dif > 0) || ((dif == 0) && rgen.randbool())) {
        if (numOfOverflowing > 0) {
          from++;
          numOfOverflowing -= (*from).second->cases;
        }
        else {
          to++;
          if (to != highedge)
            numOfOverflowing += (*to).second->cases;
        }
      }
      else
        break;
    }
  }

  return mlnew TConditionalProbabilityEstimator_FromDistribution(cont);
}
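/* Summary of the loess fit computed in the loop above (see the cited URL): for each
   reference point refx, the points in the window receive tricube weights
       w = (1 - (|refx - x| / h)^3)^3,
   where h is the distance to the farthest point in the window. From the weighted sums
   n, Sx, Sxx, Sy, Syy and Sxy, the local linear estimate is
       y(refx) = Sy/n + (sigma_xy / sigma_x2) * (refx - Sx/n),
   with sigma_x2 = (Sxx - Sx*Sx/n)/n and sigma_xy = (Sxy - Sx*Sy/n)/n. In the
   non-degenerate case the stored variance is
   (sigma_y2 - sigma_xy^2/sigma_x2) * fct / n, where fct corresponds to the bracketed
   part of equation (10) on the cited page. Estimates above 0.9 or below 0.1 are
   squashed with the logistic function so they stay within [0, 1]. */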
PDistribution TLogRegClassifier::classDistribution(const TExample &origexam)
{
  checkProperty(domain);
  TExample cexample(domain, origexam);

  TExample *example2;
  if (imputer)
    example2 = imputer->call(cexample);
  else {
    if (dataDescription)
      for (TExample::const_iterator ei(cexample.begin()), ee(cexample.end()-1); ei != ee; ei++)
        if ((*ei).isSpecial())
          return TClassifier::classDistribution(cexample, dataDescription);

    example2 = &cexample;
  }

  TExample *example = continuizedDomain ? mlnew TExample(continuizedDomain, *example2) : example2;

  float prob1;
  try {
    // multiply example with beta
    TAttributedFloatList::const_iterator b(beta->begin()), be(beta->end());

    // get beta 0
    prob1 = *b;
    b++;

    // multiply beta with example
    TVarList::const_iterator vi(example->domain->attributes->begin());
    TExample::const_iterator ei(example->begin()), ee(example->end());
    for (; (b != be) && (ei != ee); ei++, b++, vi++) {
      if ((*ei).isSpecial())
        raiseError("unknown value in attribute '%s'", (*vi)->get_name().c_str());
      prob1 += (*ei).floatV * (*b);
    }

    prob1 = exp(prob1) / (1 + exp(prob1));
  }
  catch (...) {
    if (imputer)
      mldelete example2;
    if (continuizedDomain)
      mldelete example;
    throw;
  }

  if (imputer)
    mldelete example2;
  if (continuizedDomain)
    mldelete example;

  if (classVar->varType == TValue::INTVAR) {
    TDiscDistribution *dist = mlnew TDiscDistribution(classVar);
    PDistribution res = dist;
    dist->addint(0, 1 - prob1);
    dist->addint(1, prob1);
    return res;
  }
  else {
    TContDistribution *dist = mlnew TContDistribution(classVar);
    PDistribution res = dist;
    dist->addfloat(prob1, 1.0);
    return res;
  }
}
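/* The loop above accumulates the linear term z = beta_0 + sum_i beta_i * x_i over the
   (continuized) attribute values; prob1 = exp(z) / (1 + exp(z)) = 1 / (1 + exp(-z)) is
   the logistic transformation of that term. For a discrete class, prob1 is returned as
   P(class = 1) and 1 - prob1 as P(class = 0); for a continuous class, a distribution
   concentrated at prob1 is returned. */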