// Builds a probability estimator that smooths the observed class frequencies
// towards an apriori distribution using the m-estimate:
//
//     p_i = (n_i + m * apriori_i / apriori_total) / (cases + m)
//
// frequencies  observed distribution; it is cloned, the argument is not modified.
// apriori      prior distribution; when it is missing, when the observed
//              distribution is not discrete, or when it holds (almost) no
//              cases, the clone is simply normalized instead.
// The remaining arguments are not used by this estimator.
//
// Raises an error ("invalid apriori distribution") when apriori is given but
// is not a discrete distribution or its total (abs) is below 1e-20.
//
// When `renormalize` is set and the distribution's abs differs from cases,
// each frequency is first rescaled by cases/abs before smoothing.
PProbabilityEstimator TProbabilityEstimatorConstructor_m::operator()(PDistribution frequencies, PDistribution apriori, PExampleGenerator, const long &weightID, const int &) const
{ TProbabilityEstimator_FromDistribution *pefd = mlnew TProbabilityEstimator_FromDistribution(CLONE(TDistribution, frequencies));
  PProbabilityEstimator estimator = pefd;

  TDiscDistribution *ddist = pefd->probabilities.AS(TDiscDistribution);
  if (ddist && (ddist->cases > 1e-20) && apriori) {
    TDiscDistribution *dapriori = apriori.AS(TDiscDistribution);
    if (!dapriori || (dapriori->abs < 1e-20))
      raiseError("invalid apriori distribution");

    // Snapshot the totals BY VALUE. The loop below rewrites ddist through
    // setint(); if setint keeps the distribution's running totals up to date
    // (NOTE(review): presumably it does - confirm), references to ddist->abs
    // and ddist->cases would change under our feet while we are still using
    // them in the formula.
    const float priorScale = m/dapriori->abs;
    const float absTotal = ddist->abs;
    const float caseTotal = ddist->cases;
    const float div = caseTotal + m;

    // Rescale only when asked to, when it would make a difference, and when
    // the division is safe (same 1e-20 guard as the Laplace estimator uses).
    const bool rescale = renormalize && (absTotal != caseTotal) && !(absTotal < 1e-20);

    TDiscDistribution::iterator ai(dapriori->begin()), ae(dapriori->end());
    int i = 0;
    for(TDiscDistribution::iterator di(ddist->begin()), de(ddist->end()); di != de; ++di, ++i) {
      // A value that the apriori distribution does not cover contributes no
      // prior mass; this also keeps us from reading past the end of apriori.
      float prior = 0.0f;
      if (ai != ae) {
        prior = *ai;
        ++ai;
      }

      const float observed = rescale ? (*di / absTotal * caseTotal) : *di;
      ddist->setint(i, (observed + prior*priorScale)/div);
    }
  }
  else
    pefd->probabilities->normalize();

  return estimator;
}
// Builds a probability estimator using a Laplace-style correction:
//
//     p_i = (n_i + l) / (cases + l * noOfElements)
//
// frequencies  observed distribution; it is cloned, the argument is not modified.
// The remaining arguments are not used by this estimator.
//
// When the distribution is not discrete, or the denominator is zero, the
// clone is simply normalized. When `renormalize` is set, abs differs from
// cases and abs is not (almost) zero, each frequency is first rescaled by
// cases/abs.
PProbabilityEstimator TProbabilityEstimatorConstructor_Laplace::operator()(PDistribution frequencies, PDistribution, PExampleGenerator, const long &, const int &) const
{ TProbabilityEstimator_FromDistribution *pefd = mlnew TProbabilityEstimator_FromDistribution(CLONE(TDistribution, frequencies));
  PProbabilityEstimator estimator = pefd;

  TDiscDistribution *ddist = pefd->probabilities.AS(TDiscDistribution);
  if (ddist) {
    // Snapshot the totals BY VALUE. The loop below rewrites ddist through
    // setint(); if setint keeps the distribution's running totals up to date
    // (NOTE(review): presumably it does - confirm), references to ddist->abs
    // and ddist->cases would drift while the formula is still using them.
    const float absTotal = ddist->abs;
    const float caseTotal = ddist->cases;
    const float div = caseTotal + l * ddist->noOfElements();

    if (div) {
      // Decided once, from the original totals.
      const bool keepRaw = (caseTotal == absTotal) || !renormalize || (absTotal<1e-20);

      int i = 0;
      for(TDiscDistribution::iterator di(ddist->begin()), de(ddist->end()); di != de; ++di, ++i) {
        const float observed = keepRaw ? *di : (*di / absTotal * caseTotal);
        ddist->setint(i, (observed + l) / div);
      }
    }
    else
      pefd->probabilities->normalize();
  }
  else
    pefd->probabilities->normalize();

  return estimator;
}
Exemple #3
0
// rejects the split if there are less than two non-empty branches
// or there is a non-empty branch with less then minSubset examples
bool checkDistribution(const TDiscDistribution &dist, const float &minSubset)
{
  int nonzero = 0;
  for(TDiscDistribution::const_iterator dvi(dist.begin()), dve(dist.end()); dvi!=dve; dvi++)
    if (*dvi > 0) {
      if  (*dvi < minSubset)
        return false;
      nonzero++;
    }

  return nonzero >= 2;
}
// Builds a conditional probability estimator for a continuous attribute by
// fitting a locally weighted linear regression at a set of sample points.
//
// frequencies  contingency whose `continuous` map holds, for each attribute
//              value x, the distribution observed at that x. It is cloned; the
//              clone's map is cleared and refilled with one fitted distribution
//              (plus per-value variances) for every sample point.
// The remaining (unnamed) arguments are not used.
//
// Outline:
//   1. validate the input and choose the sample points (distributePoints);
//   2. pick an initial window [from, to) of neighbouring x values around the
//      first sample point, large enough to cover `windowProportion` of the
//      data (at least 3);
//   3. for every sample point: accumulate weighted moments over the window
//      (weights are (1 - (|refx - x| / h)^3)^3), derive the fitted value and
//      its variance, store them, then slide the window to the next point.
//
// Raises an error when the attribute is not continuous, when there are no
// attribute values, or when no sample points were produced.
PConditionalProbabilityEstimator TConditionalProbabilityEstimatorConstructor_loess::operator()(PContingency frequencies, PDistribution, PExampleGenerator, const long &, const int &) const
{ if (frequencies->varType != TValue::FLOATVAR)
    // (the else below belongs to this inner if: it only selects the message)
    if (frequencies->outerVariable)
      raiseError("attribute '%s' is not continuous", frequencies->outerVariable->get_name().c_str());
    else
      raiseError("continuous attribute expected for condition");

    // NOTE: despite the indentation, this check is NOT nested in the if above; it always runs.
    if (!frequencies->continuous->size())
      // This is ugly, but: if you change this, you should also change the code which catches it in
      // Bayesian learner
      raiseError("distribution (of attribute values, probably) is empty or has only a single value");

  // `cont` is the result container (a clone); `points` still refers to the
  // ORIGINAL map inside `frequencies`, so clearing the clone below does not
  // affect the data we iterate over.
  PContingency cont = CLONE(TContingency, frequencies);
  const TDistributionMap &points = *frequencies->continuous;

/*  if (frequencies->continuous->size() == 1) {
    TDiscDistribution *f = (TDiscDistribution *)(points.begin()->second.getUnwrappedPtr());
    f->normalize();
    f->variances = mlnew TFloatList(f->size(), 0.0);
    return mlnew TConditionalProbabilityEstimator_FromDistribution(cont);
  }
*/
  cont->continuous->clear();

  // x positions at which the curve will be estimated
  vector<float> xpoints;
  distributePoints(points, nPoints, xpoints, distributionMethod);

  if (!xpoints.size())
    raiseError("no points for the curve (check 'nPoints')");
    
  // Degenerate case: a single attribute value. Every sample point gets the
  // same normalized distribution with zero variances.
  // NOTE(review): `f` is taken from `points`, i.e. from `frequencies` rather
  // than from the clone, so normalize() and the variances assignment appear to
  // modify the caller's distribution in place - confirm this is intended.
  if (frequencies->continuous->size() == 1) {
    TDiscDistribution *f = (TDiscDistribution *)(points.begin()->second.getUnwrappedPtr());
    f->normalize();
    f->variances = mlnew TFloatList(f->size(), 0.0);
    const_ITERATE(vector<float>, pi, xpoints)
      (*cont->continuous)[*pi] = f;
    return mlnew TConditionalProbabilityEstimator_FromDistribution(cont);
  }    

  TDistributionMap::const_iterator lowedge = points.begin();
  TDistributionMap::const_iterator highedge = points.end();

  // needAll is assigned below but not read anywhere in this function.
  bool needAll;
  // The current window is the half-open range [from, to) in `points`.
  map<float, PDistribution>::const_iterator from, to;

  // refx is the sample point currently being estimated.
  vector<float>::const_iterator pi(xpoints.begin()), pe(xpoints.end());
  float refx = *pi;

  from = lowedge;
  to = highedge; 
  // NOTE(review): abs is presumably a float total; it is truncated to int here - confirm.
  int totalNumOfPoints = frequencies->outerDistribution->abs;

  // Number of cases the window should cover; never fewer than 3.
  int needpoints = int(ceil(totalNumOfPoints * windowProportion));
  if (needpoints<3)
    needpoints = 3;


  // Used only to break ties when sliding the window (see "Adjust the window").
  TSimpleRandomGenerator rgen(frequencies->outerDistribution->cases);

  // (needpoints<=0 cannot hold after the clamp above; the effective test is
  // whether the window would have to cover everything.)
  if ((needpoints<=0) || (needpoints>=totalNumOfPoints)) {  //points.size()
    needAll = true;
    from = lowedge;
    to = highedge;
  }
  else {
    needAll = false;

    /* Find the window */
    from = points.lower_bound(refx);
    to = points.upper_bound(refx);
    // refx is not itself a key: make the window non-empty by taking one
    // neighbour (the else belongs to the inner if).
    if (from==to)
      if (to != highedge)
        to++;
      else
        from --;

    /* Extend the interval; we set from to highedge when it would go beyond lowedge, to indicate that only to can be modified now */
    while (needpoints > 0) {
      // Grow towards whichever side is closer to refx (or to the left when
      // the right side is exhausted).
      if ((to == highedge) || ((from != highedge) && (refx - (*from).first < (*to).first - refx))) {
        if (from == lowedge)
          from = highedge;
        else {
          from--;
          needpoints -= (*from).second->cases;
        }
      }
      else {
        to++;
        if (to!=highedge)
          needpoints -= (*to).second->cases;
        else
          needpoints = 0;
      }

    }
    
    // Undo the "left side exhausted" marker.
    if (from == highedge)
      from = lowedge;
/*    else
      from++;*/
  }

  // Running count maintained by the window adjustment at the end of the loop:
  // increased when `to` advances, decreased when `from` advances.
  int numOfOverflowing = 0;
  // This follows http://www-2.cs.cmu.edu/afs/cs/project/jair/pub/volume4/cohn96a-html/node7.html
  for(;;) {
    // tt: last element inside the window
    TDistributionMap::const_iterator tt = to;
    --tt;
    
    if (tt == from) {
      // Window holds a single x value: no regression possible, use its
      // normalized distribution with zero variances.
      TDistribution *Sy = CLONE(TDistribution, (*tt).second);
      PDistribution wSy = Sy;
      Sy->normalize();
      (*cont->continuous)[refx] = (wSy);
      ((TDiscDistribution *)(Sy)) ->variances = mlnew TFloatList(Sy->variable->noOfValues(), 0.0);
    }
    else {
  
      // h: kernel bandwidth = distance from refx to the farther window edge
      float h = (refx - (*from).first);
      if ((*tt).first - refx  >  h)
        h = ((*tt).first - refx);

      /* Iterate through the window */

      // The first element of the window initialises all the accumulators;
      // the while loop below adds the remaining elements.
      tt = from;
      const float &x = (*tt).first;
      const PDistribution &y = (*tt).second;
      float cases = y->abs;

      // weight: (1 - (|refx - x| / h)^3)^3
      float w = fabs(refx - x) / h;
      w = 1 - w*w*w;
      w = w*w*w;

      const float num = y->abs; // number of instances with this x - value
      float n = w * num;
      // Sww, Swwx and Swwxx are accumulated but only referenced by the
      // commented-out `fct` formula further down.
      float Sww = w * w * num;

      float Sx = w * x * num;
      float Swwx  = w * w * x * num;
      float Swwxx = w * w * x * x * num;
      // Each accumulator distribution is kept alive by the wrapped pointer
      // (w...) declared right after it.
      TDistribution *Sy = CLONE(TDistribution, y);
      PDistribution wSy = Sy;
      *Sy *= w;

      float Sxx = w * x * x * num;
      // NOTE(review): Syy is built from exactly the same terms as Sy (w*y),
      // here and in the loop below; a sum of squares may have been intended - confirm.
      TDistribution *Syy = CLONE(TDistribution, y);
      PDistribution wSyy = Syy;
      *Syy *= w;

      TDistribution *Sxy = CLONE(TDistribution, y);
      PDistribution wSxy = Sxy;
      *Sxy *= w * x;

      if (tt!=to)
        while (++tt != to) {
          // these deliberately shadow the outer x, y and num for this element
          const float &x = (*tt).first;
          const PDistribution &y = (*tt).second;
          cases += y->abs;

          w = fabs(refx - x) / h;
          w = 1 - w*w*w;
          w = w*w*w;

          const float num = y->abs;
          n   += w * num;
          Sww += w * w * num;
          Sx  += w * x * num;
          Swwx += w * w * x * num;
          Swwxx += w * w * x * x * num;
          Sxx += w * x * x * num;

          // ty = w*y is added to Sy and Syy, then turned into w*x*y for Sxy
          TDistribution *ty = CLONE(TDistribution, y);
          PDistribution wty = ty;
          *ty *= w;
          *Sy  += wty;
          *Syy += wty;
          *ty *= x;
          *Sxy += wty;

          //*ty *= PDistribution(y);
        }

      // weighted variance of x within the window (0 when the total weight is negligible)
      float sigma_x2 = n<1e-6 ? 0.0 : (Sxx - Sx * Sx / n)/n;
      // NOTE(review): there is no early exit here. After zeroing and storing
      // Sy, execution continues with the code below, which divides by n and
      // keeps modifying the same Sy object - confirm this is intended.
      if (sigma_x2<1e-10) {
        *Sy *= 0;
        Sy->cases = cases;
        (*cont->continuous)[refx] = (wSy);
      }

      // sigma_y2 = (Syy - Sy*Sy/n) / n
      // (assumes `*=` with a distribution operand multiplies element-wise - TODO confirm)
      TDistribution *sigma_y2 = CLONE(TDistribution, Sy);
      PDistribution wsigma_y2 = sigma_y2;
      *sigma_y2 *= wsigma_y2;
      *sigma_y2 *= -1/n;
      *sigma_y2 += wSyy;
      *sigma_y2 *= 1/n;

      // sigma_xy = (Sxy - Sx*Sy/n) / n
      TDistribution *sigma_xy = CLONE(TDistribution, Sy);
      PDistribution wsigma_xy = sigma_xy;
      *sigma_xy *= -Sx/n;
      *sigma_xy += wSxy; 
      *sigma_xy *= 1/n;

      // This will be sigma_xy / sigma_x2, but we'll multiply it by whatever we need
      TDistribution *sigma_tmp = CLONE(TDistribution, sigma_xy);
      PDistribution wsigma_tmp = sigma_tmp;
      //*sigma_tmp *= wsigma_tmp;
      if (sigma_x2 > 1e-10)
        *sigma_tmp *= 1/sigma_x2;

      // distance of refx from the weighted mean of x
      const float difx = refx - Sx/n;

      // computation of y
      // Sy = Sy/n + (sigma_xy / sigma_x2) * difx
      *sigma_tmp *= difx;
      *Sy *= 1/n;
      *Sy += *sigma_tmp;

      // probabilities that are higher than 0.9 normalize with a logistic function, which produces two positive 
      // effects: prevents overfitting and avoids probabilities that are higher than 1.0. But, on the other hand, this 
      // solution is rather unmathematical. Do the same for probabilities that are lower than 0.1.

      // abs is adjusted by hand around each in-place change because the
      // underlying vector is written directly.
      vector<float>::iterator syi(((TDiscDistribution *)(Sy))->distribution.begin()); 
      vector<float>::iterator sye(((TDiscDistribution *)(Sy))->distribution.end()); 
      for (; syi!=sye; syi++) {
        if (*syi > 0.9) {
          Sy->abs -= *syi;
          *syi = 1/(1+exp(-10*((*syi)-0.9)*log(9.0)-log(9.0)));
          Sy->abs += *syi;
        }
        if (*syi < 0.1) {
          Sy->abs -= *syi;
          *syi = 1/(1+exp(10*(0.1-(*syi))*log(9.0)+log(9.0)));
          Sy->abs += *syi;
        }
      }

      Sy->cases = cases;
      Sy->normalize();
      (*cont->continuous)[refx] = (wSy);
   
      // now for the variance
      // restore sigma_tmp and compute the conditional sigma
      // (when the condition fails, sigma_tmp is used as it stands for the variances below)
      if ((fabs(difx) > 1e-10) && (sigma_x2 > 1e-10)) {
        // undo the multiplication by difx, then:
        // sigma_tmp = (sigma_y2 - sigma_xy^2 / sigma_x2) * fct / n
        *sigma_tmp *= (1/difx);
        *sigma_tmp *= wsigma_xy;
        *sigma_tmp *= -1; 
        *sigma_tmp += wsigma_y2;
        // fct corresponds to part of (10) in the brackets (see URL above)
     //   float fct = Sww + difx*difx/sigma_x2/sigma_x2 * (Swwxx   - 2/n * Sx*Swwx   +  2/n/n * Sx*Sx*Sww);
        float fct = 1 + difx*difx/sigma_x2; //n + difx*difx/sigma_x2+n*n --- add this product to the overall fct sum if you are estimating error for a single user and not for the line.  
        *sigma_tmp *= fct/n; // fct/n/n;
      }
      ((TDiscDistribution *)(Sy)) ->variances = mlnew TFloatList(((TDiscDistribution *)(sigma_tmp))->distribution);
    }      

    // on to the next point
    pi++;
    if (pi==pe)
      break; 

    refx = *pi;

    // Adjust the window
    // While the new refx is closer to the element just beyond the right edge
    // than to the left edge (ties broken at random), either drop the left
    // edge (if numOfOverflowing is positive) or take in one more element on
    // the right.
    while (to!=highedge) {
      float dif = (refx - (*from).first) - ((*to).first - refx);
      if ((dif>0) || (dif==0) && rgen.randbool()) {
        if (numOfOverflowing > 0) {
          from++;
          numOfOverflowing -= (*from).second->cases;
        }
        else {
          to++;
          if (to!=highedge) 
            numOfOverflowing += (*to).second->cases;
        }
      }
	    else
		    break;
    }
  }

  return mlnew TConditionalProbabilityEstimator_FromDistribution(cont);
}
// Predicts the class distribution of an example with the fitted logistic
// regression model.
//
// origexam  example to classify; it is first converted to the classifier's
//           `domain`.
//
// Steps:
//   - unknown values: if an `imputer` is set it produces the example to use;
//     otherwise, if `dataDescription` is available and any value except the
//     last one in the example is unknown, the prediction is delegated to
//     TClassifier::classDistribution;
//   - if `continuizedDomain` is set, the example is converted into it;
//   - the linear predictor beta[0] + sum(beta[i] * value[i-1]) is computed and
//     passed through the logistic function to obtain prob1.
//
// Returns, for a discrete (INTVAR) class, a distribution with weight 1-prob1
// on value 0 and prob1 on value 1; otherwise a continuous distribution with
// weight 1.0 at prob1.
//
// Raises an error if an unknown value is met while computing the predictor.
PDistribution TLogRegClassifier::classDistribution(const TExample &origexam)
{   
  checkProperty(domain);
  TExample cexample(domain, origexam);

  TExample *example2;

  if (imputer)
    example2 = imputer->call(cexample);
  else {
    if (dataDescription)
      // end()-1: the last value of the example is not inspected here
      for(TExample::const_iterator ei(cexample.begin()), ee(cexample.end()-1); ei!=ee; ei++)
        if ((*ei).isSpecial())
          return TClassifier::classDistribution(cexample, dataDescription);

    example2 = &cexample;
  }

  // Ownership: example2 is heap-allocated only when it came from the imputer;
  // example is heap-allocated only when continuizedDomain is set. Both the
  // normal path and the catch block below release exactly those.
  TExample *example = continuizedDomain ? mlnew TExample(continuizedDomain, *example2) : example2;

  float prob1;
  try {
    // multiply example with beta
    TAttributedFloatList::const_iterator b(beta->begin()), be(beta->end());

    // get beta 0
    prob1 = *b;
    b++;
    // multiply beta with example
    TVarList::const_iterator vi(example->domain->attributes->begin());
    TExample::const_iterator ei(example->begin()), ee(example->end());
    for (; (b!=be) && (ei!=ee); ei++, b++, vi++) {
      if ((*ei).isSpecial())
        raiseError("unknown value in attribute '%s'", (*vi)->get_name().c_str());
      prob1 += (*ei).floatV * (*b); 
    }

    // Logistic function in its overflow-safe form. The textbook form
    // exp(z)/(1+exp(z)) evaluates to inf/inf = NaN once exp(z) overflows;
    // 1/(1+exp(-z)) is algebraically identical but saturates cleanly to 1
    // for large positive z and to 0 for large negative z.
    prob1 = 1/(1+exp(-prob1));
  }
  catch (...) {
    if (imputer)
      mldelete example2;
    if (continuizedDomain)
      mldelete example;
    throw;
  }

  if (imputer)
    mldelete example2;
  if (continuizedDomain)
    mldelete example;

  if (classVar->varType == TValue::INTVAR) {
      TDiscDistribution *dist = mlnew TDiscDistribution(classVar);
      PDistribution res = dist;
      dist->addint(0, 1-prob1);
      dist->addint(1, prob1);
      return res;
  }
  else {
      TContDistribution *dist = mlnew TContDistribution(classVar);
      PDistribution res = dist;
      dist->addfloat(prob1, 1.0);
      return res;
  }
}