double kl_distance(const histogram<double>& sample1,const histogram<double>& sample2) { //let's include the whole range double _min=std::min(sample1.min(),sample2.min()); double _max=std::max(sample1.max(),sample2.max()); double _range=_max-_min; //now we are going to iterate through samples double distance=0.0; int _size=std::max(sample1.size(),sample2.size()); double _bin=_range/_size; double v=_min+_bin/2.0; for(int i=0;i<_size;i++,v+=_bin) { double _P=0.0; if(v>=sample1.min() && v<=sample1.max()) _P=sample1[v]; double _Q=0.0; if(v>=sample2.min() && v<=sample2.max()) _Q=sample2[v]; if(_P>0.0 && _Q>0.0)//TODO: epsilon? distance+=_P*log(_P/_Q); } return distance; }
//! apply k-means classify to histogram void apply_k_means_classify(const histogram<double>& input, const std::vector<double>& mu, std::vector<int>& cls ) { int number_of_classes=mu.size(); int i,j,k; //calculate all the classes for(j=0;j<input.size();j++) { int best_k=0; double best_dist=0; for(k=0;k<number_of_classes;k++) { double dist=fabs(input.value(j)-mu[k]); if(dist<best_dist|| best_k==0) { best_dist=dist; best_k=k+1; } } cls[j]=best_k; } }
//! Simple k-means clustering of a histogram.
//! Alternates nearest-mean classification with mean re-estimation.
//! @param hist    input histogram (bin values are the samples)
//! @param mu      in/out class means; (re)initialized uniformly between the
//!                0.1% and 99.9% percentiles when its size differs from k_means
//! @param k_means number of classes
//! @param maxiter number of classify/re-estimate iterations
void simple_k_means( histogram<double>& hist, std::vector<double>& mu, int k_means, int maxiter)
{
  std::vector<int> cls(hist.size(), 0);

  if (static_cast<int>(mu.size()) != k_means)
  {
    mu.resize(k_means);
    // initialize means uniformly across the bulk of the distribution,
    // clipping extreme outliers via percentiles
    double vol_min = hist.find_percentile(0.001);
    double vol_max = hist.find_percentile(0.999);
    if (k_means == 1)
    {
      // single class: the uniform formula below would divide by (k_means-1)==0
      // and produce 0.0/0.0 == NaN; use the midpoint instead
      mu[0] = (vol_min + vol_max) / 2.0;
    }
    else
    {
      for (int j = 0; j < k_means; j++)
        mu[j] = vol_min + (vol_max - vol_min) * j / (double)(k_means - 1);
    }
  }

  for (int iter = 0; iter < maxiter; iter++)
  {
    apply_k_means_classify(hist, mu, cls);
    estimate_mu(hist, cls, mu); // re-estimate means from the new labels
  }
}