void TMultiClassifier::predictionAndDistribution(const TExample &ex,
		PValueList &classValues, PDistributionList &classDists) {
	if (computesProbabilities) {
		classDists = classDistribution(ex);
		PValueList classValues = new TValueList();
		TValue value;
		PVariable classVar;
		for (int i = 0; i < classVars->size(); i++) {
			classVar = classVars->at(i);
			value = classVar->varType == TValue::FLOATVAR ?
					TValue(classDists->at(i)->average()) :
					classDists->at(i)->highestProbValue(ex);
			classValues->push_back(value);
		}
	} else {
		classValues = operator()(ex);
		PDistributionList classDist = new TDistributionList();
		PDistribution dist;
		PVariable classVar;

		for (int i = 0; i < classVars->size(); i++) {
			classVar = classVars->at(i);
			dist = TDistribution::create(classVar);
			dist->add(classValues->at(i));
			classDist->push_back(dist);
		}

	}
}
// Example no. 2
// 0
/* Default class distribution for a classifier that only predicts a value.

   Creates a distribution for `classVar` and adds the value returned by
   operator() for `exam` to it. Reaching this implementation while
   `computesProbabilities` is set is reported through raiseError. */
PDistribution TClassifier::classDistribution(const TExample &exam)
{
  if (computesProbabilities) {
    raiseError("invalid setting of 'computesProbabilities'");
  }

  PDistribution predicted = TDistribution::create(classVar);
  predicted->add((*this)(exam));
  return predicted;
}
// Example no. 3
// 0
void TClassifier::predictionAndDistribution(const TExample &ex, TValue &val, PDistribution &classDist)
{ if (computesProbabilities) {
    classDist = classDistribution(ex);
    val = classVar->varType==TValue::FLOATVAR ? TValue(classDist->average()) : classDist->highestProbValue(ex);
  }
  else {
    val = operator()(ex);
    classDist = TDistribution::create(classVar);
    classDist->add(val);
  }
}
/* Builds a TProbabilityEstimator_FromDistribution from a clone of `frequencies`.

   If the cloned distribution is a TDiscDistribution with more than 1e-20
   cases and `apriori` is given, every entry i is overwritten with

       (freq_i + m * apriori_i / apriori_abs) / (cases + m)

   where, when `renormalize` is set and abs != cases, freq_i is first
   rescaled as freq_i / abs * cases. In all other situations the cloned
   distribution is simply normalized.

   Raises an error ("invalid apriori distribution") if `apriori` is not a
   TDiscDistribution or its `abs` is below 1e-20.

   The example generator, `weightID` and the last int argument are not used
   in this function. */
PProbabilityEstimator TProbabilityEstimatorConstructor_m::operator()(PDistribution frequencies, PDistribution apriori, PExampleGenerator, const long &weightID, const int &) const
{ TProbabilityEstimator_FromDistribution *pefd = mlnew TProbabilityEstimator_FromDistribution(CLONE(TDistribution, frequencies));
  // Wrap the raw pointer right away; `estimator` is what gets returned.
  PProbabilityEstimator estimator = pefd;
  
  TDiscDistribution *ddist = pefd->probabilities.AS(TDiscDistribution);  
  if (ddist && (ddist->cases > 1e-20) && apriori) {
    TDiscDistribution *dapriori = apriori.AS(TDiscDistribution);
    if (!dapriori || (dapriori->abs < 1e-20))
      raiseError("invalid apriori distribution");
    
    // m divided by the apriori total, so that *ai*mabs == m * (apriori_i / apriori_abs)
    float mabs = m/dapriori->abs;
    // NOTE(review): `abs` and `cases` are references to members of `ddist`, which
    // is modified through setint() in the loops below. If setint() updates these
    // members, the values read in later iterations change - confirm this is intended.
    const float &abs = ddist->abs;
    const float &cases = ddist->cases;
    // The denominator, in contrast, is copied once before any entry is changed.
    const float div = cases + m;
    // NOTE(review): both loops advance `ai` in step with `di` without checking it
    // against dapriori->end(); this assumes `dapriori` has at least as many
    // entries as `ddist` - confirm against callers.
    if ((abs==cases) || !renormalize) {
      // Use the stored frequencies as they are.
      int i = 0;
      for(TDiscDistribution::iterator di(ddist->begin()), de(ddist->end()), ai(dapriori->begin());
          di != de;
          di++, ai++, i++)
         ddist->setint(i, (*di+*ai*mabs)/div);
    }
    else {
      // Rescale each frequency by cases/abs before applying the m-estimate.
      int i = 0;
      for(TDiscDistribution::iterator di(ddist->begin()), de(ddist->end()), ai(dapriori->begin());
          di != de;
          di++, ai++, i++)
         ddist->setint(i, (*di / abs * cases + *ai*mabs)/div);
    }
  }
  else
    // Not a usable discrete distribution or no apriori given: plain normalization.
    pefd->probabilities->normalize();
    
  return estimator;
}
/* Default class distributions for a multi-target classifier that only
   predicts values.

   For every class variable in `classVars` a distribution is created and
   the corresponding value returned by operator() for `exam` is added to
   it. The result holds one distribution per class variable, in the order
   of `classVars`.

   Reaching this implementation while `computesProbabilities` is set is
   reported through raiseError.

   Returns the list of distributions (the previous code returned only the
   last single distribution instead of the list). */
PDistributionList TMultiClassifier::classDistribution(const TExample &exam) {
	if (computesProbabilities)
		raiseError("invalid setting of 'computesProbabilities'");

	PDistributionList classDists = new TDistributionList();
	PValueList classValues = operator()(exam);

	for (int i = 0; i < classVars->size(); i++) {
		PVariable classVar = classVars->at(i);
		PDistribution dist = TDistribution::create(classVar);
		dist->add(classValues->at(i));
		classDists->push_back(dist);
	}
	return classDists;
}
/* Builds a probability estimator from a loess curve fitted to a
   continuous distribution.

   frequencies - must be a non-empty TContDistribution; otherwise an error
                 is raised (naming the attribute when it is known)

   The remaining arguments are not used in this function. The curve is
   computed by loess() from the distribution's points using `nPoints`,
   `windowProportion` and `distributionMethod`. */
PProbabilityEstimator TProbabilityEstimatorConstructor_loess::operator()(PDistribution frequencies, PDistribution, PExampleGenerator, const long &weightID, const int &attrNo) const
{
  TContDistribution *contFreqs = frequencies.AS(TContDistribution);

  if (!contFreqs) {
    // Prefer the message that names the offending attribute when available.
    if (frequencies && frequencies->variable) {
      raiseError("attribute '%s' is not continuous", frequencies->variable->get_name().c_str());
    }
    else {
      raiseError("continuous distribution expected");
    }
  }

  if (!contFreqs->size()) {
    raiseError("empty distribution");
  }

  map<float, float> smoothed;
  loess(contFreqs->distribution, nPoints, windowProportion, smoothed, distributionMethod);

  TContDistribution *fitted = mlnew TContDistribution(smoothed);
  return mlnew TProbabilityEstimator_FromDistribution(fitted);
}
/* Builds a probability estimator from a Gaussian-kernel density curve
   computed over a continuous distribution.

   frequencies - must be a non-empty TContDistribution, otherwise an error
                 is raised
   `apriori`, the example generator, `weightID` and the last int argument
   are not used in this function.

   The curve is evaluated at the points chosen by distributePoints()
   (controlled by `nPoints`). The bandwidth is
       h = smoothing * sqrt(error) * abs^(-1/5).
   When `minImpact` is positive, only data points closer than `t` to the
   evaluation point are summed, where `t` is the distance at which the
   kernel value exp(-d^2/(2h^2)) / (h*sqrt(2*pi)) drops to `minImpact`.

   Raises an error if `minImpact` lies outside [0, 1].

   Every exit returns a TProbabilityEstimator_FromDistribution wrapping the
   curve. The early exit for a too-high `minImpact` previously returned the
   bare distribution handle from a function returning an estimator. */
PProbabilityEstimator TProbabilityEstimatorConstructor_kernel::operator()(PDistribution frequencies, PDistribution apriori, PExampleGenerator, const long &weightID, const int &) const
{ TContDistribution *cdist = frequencies.AS(TContDistribution);
  if (!cdist)
    raiseError("continuous distribution expected");
  if (!cdist->size())
    raiseError("empty distribution");
  if ((minImpact<0.0) || (minImpact>1.0))
    raiseError("'minImpact' should be between 0.0 and 1.0 (not %5.3f)", minImpact);

  // Points at which the density curve is evaluated.
  vector<float> points;
  distributePoints(cdist->distribution, nPoints, points);

  TContDistribution *curve = mlnew TContDistribution(frequencies->variable);
  // Handle to the curve, held for the whole function.
  // NOTE(review): presumably this keeps the curve owned while it is being filled - confirm.
  PDistribution wcurve = curve;

  /* Bandwidth suggested by Chad Shaw. Also found in http://www.stat.lsa.umich.edu/~kshedden/Courses/Stat606/Notes/interpolate.pdf */
  const float h = smoothing * sqrt(cdist->error()) * exp(- 1.0/5.0 * log(cdist->abs)); // 1.144
  const float hsqrt2pi = h * 2.5066282746310002;
  // Cut-off distance; only assigned and only read when minImpact>0.
  float t;

  if (minImpact>0) {
    t = -2 * log(minImpact*hsqrt2pi); // 2.5066... == sqrt(2*pi)
    if (t<=0) {
      // minImpact too high, but that's user's problem...
      // Even the kernel's peak is not above minImpact, so the whole curve is zero.
      ITERATE(vector<float>, pi, points) {
        curve->setfloat(*pi, 0.0);
      }
      return mlnew TProbabilityEstimator_FromDistribution(curve);
    }
    else
      t = h * sqrt(t);
  }


  ITERATE(vector<float>, pi, points) {
    const float &x = *pi;
    TContDistribution::const_iterator from, to;

    if (minImpact>0) {
      // Restrict the sum to data points within distance t of x.
      from = cdist->lower_bound(x-t);
      to = cdist->lower_bound(x+t);
      if ((from==cdist->end()) || (to==cdist->begin()) || (from==to)) {
        // No data point inside the window.
        curve->setfloat(x, 0.0);
        continue;
      }
    }
    else {
      from = cdist->begin();
      to = cdist->end();
    }

    // p: sum of weight * kernel value; n: sum of the weights in the window.
    float p = 0.0, n = 0.0;
    for(; from != to; from++) {
      n += (*from).second;
      p += (*from).second * exp( - 0.5 * sqr( (x - (*from).first)/h ) );
    }

    curve->setfloat(x, p/hsqrt2pi/(n*h)); // hsqrt2pi is from the inside (errf), n*h is for the sum average
  }


  return mlnew TProbabilityEstimator_FromDistribution(curve);
}