Example #1
int KDE::CalcKDE(DataSet_double& Out, DataSet_1D const& Pdata) const {
  if (Pdata.Size() < 2) {
    mprinterr("Error: Not enough data for KDE.\n");
    return 1;
  }
  // Automatically determine min, max, step, and bin values.
//  std::vector<double> data;
//  data.reserve( Pdata.Size() );
  // One-pass (Welford) accumulation of the mean and sum of squared deviations.
  double N = 0.0;
  double mean = 0.0;
  double M2 = 0.0;
  double min = Pdata.Dval(0);
  double max = min; 
  for (unsigned int i = 0; i != Pdata.Size(); i++) {
    double x = Pdata.Dval(i);
    min = std::min(min, x);
    max = std::max(max, x);
    N++;
    double delta = x - mean;
    mean += delta / N;
    M2 += delta * (x - mean);
//    data.push_back( x );
  }
  M2 /= (N - 1.0);
  double stdev = sqrt(M2);
  double step = 0.0;
  // Square-root choice for the number of bins.
  int bins = (int)sqrt((double)Pdata.Size());
/*
  std::sort(data.begin(), data.end());
  double min = data.front();
  double max = data.back();
  unsigned int upperidx, loweridx;
  if ( (data.size() % 2) == 0 ) {
    // Even number of points. Get Q1 as median of lower and Q3 as median of upper.
    unsigned int halfsize = data.size() / 2;
    loweridx = ((halfsize - 1) / 2);
    upperidx = loweridx + halfsize;
  } else {
    // Odd number of points. Include the median in both halves
    unsigned int lsize = (data.size() + 1) / 2;
    loweridx = ((lsize - 1) / 2);
    unsigned int usize = (data.size() - 1) / 2;
    upperidx = loweridx + usize;
  }
  double Q1 = data[loweridx];
  double Q3 = data[upperidx];
  double step = 2 * ((Q3 - Q1) / pow(data.size(), 1.0/3.0));
  int bins = 0;
  mprintf("DEBUG: Q1= %g, Q3= %g, step= %g, min= %g, max= %g, mean= %g, stdev= %g\n",
          Q1, Q3, step, min, max, mean, stdev);
  if (max - min < step) {
    // Would only be 1 bin. Probably noisy.
    mprintf("Warning: Data set is very sparse.\n");
    bins = (int)Pdata.Size() / 10;
    step = 0;
  }
*/
  mprintf("DEBUG: mean= %g, stdev= %g\n", mean, stdev);
  HistBin Xdim;
  if (Xdim.CalcBinsOrStep(min, max, step, bins, Pdata.Meta().Legend()))
    return 1;
  Xdim.PrintHistBin();

  // Automatically determine bandwidth from the normal distribution approximation.
  double bandwidth = 1.06 * stdev * BandwidthFactor(Pdata.Size());
  mprintf("\tBandwidth: %f\n", bandwidth);

  std::vector<double> Increments(Pdata.Size(), 1.0);

  return CalcKDE(Out, Pdata, Increments, Xdim, bandwidth);
}
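The loop above is Welford's one-pass computation of the mean and sum of squared deviations, and the bandwidth line applies the 1.06 * stdev * n^(-1/5) normal-reference rule of thumb; BandwidthFactor() presumably supplies the n^(-1/5) factor, matching the explicit pow(inSize, -1.0/5.0) in Example #2. A minimal standalone sketch of the same estimate, with estimateBandwidth as a hypothetical helper name (not part of the KDE class):

#include <cmath>
#include <cstdio>
#include <vector>

// Sketch only: Welford one-pass statistics plus the rule-of-thumb bandwidth,
// assumed equivalent to the automatic estimate in the code above.
static double estimateBandwidth(std::vector<double> const& data) {
  double n = 0.0, mean = 0.0, M2 = 0.0;
  for (double x : data) {
    n += 1.0;
    double delta = x - mean;
    mean += delta / n;
    M2 += delta * (x - mean);               // running sum of squared deviations
  }
  double stdev = std::sqrt(M2 / (n - 1.0)); // sample standard deviation
  return 1.06 * stdev * std::pow(n, -1.0 / 5.0);
}

int main() {
  std::vector<double> data = {1.0, 2.0, 2.5, 3.0, 4.5};
  std::printf("bandwidth = %g\n", estimateBandwidth(data));
  return 0;
}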
Example #2
// Analysis_KDE::Analyze()
Analysis::RetType Analysis_KDE::Analyze() {
  DataSet_1D const& Pdata = static_cast<DataSet_1D const&>( *data_ );
  int inSize = (int)Pdata.Size();
  // Set output set dimensions from input set if necessary.
  if (!minArgSet_) {
    mprintf("\tNo minimum specified, determining from input data.\n");
    if (q_data_ != 0)
      default_min_ = std::max(((DataSet_1D*)q_data_)->Min(), Pdata.Min());
    else
      default_min_ = Pdata.Min();
  }
  if (!maxArgSet_) {
    mprintf("\tNo maximum specified, determining from input data.\n");
    if (q_data_ != 0)
      default_max_ = std::min(((DataSet_1D*)q_data_)->Max(), Pdata.Max());
    else
      default_max_ = Pdata.Max();
  }
  HistBin Xdim;
  if (Xdim.CalcBinsOrStep(default_min_, default_max_, default_step_,
                          default_bins_, Pdata.Meta().Legend()))
    return Analysis::ERR;
  Xdim.PrintHistBin();
  output_->SetDim( Dimension::X, Xdim );

  // Allocate output set
  DataSet_double& P_hist = static_cast<DataSet_double&>( *output_ );
  P_hist.Resize( Xdim.Bins() );
  int outSize = (int)P_hist.Size();

  // Estimate bandwidth from normal distribution approximation if necessary.
  if (bandwidth_ < 0.0) {
    double stdev;
    Pdata.Avg( stdev ); // Avg() also sets stdev to the standard deviation of the input set.
    double N_to_1_over_5 = pow( (double)inSize, (-1.0/5.0) );
    bandwidth_ = 1.06 * stdev * N_to_1_over_5;
    mprintf("\tDetermined bandwidth from normal distribution approximation: %f\n", bandwidth_);
  }

  // Set up increments
  std::vector<double> Increments(inSize, 1.0);
  if (amddata_ != 0) {
    DataSet_1D& AMD = static_cast<DataSet_1D&>( *amddata_ );
    if ((int)AMD.Size() != inSize) {
      if ((int)AMD.Size() < inSize) {
        mprinterr("Error: Size of AMD data set %zu < input data set iu\n",
                  AMD.Size(), inSize);
        return Analysis::ERR;
      } else {
        mprintf("Warning: Size of AMD data set %zu > input data set %i\n",
                AMD.Size(), inSize);
      }
    }
    for (int i = 0; i < inSize; i++)
      Increments[i] = exp( AMD.Dval(i) );
  }
  int frame, bin;
  double increment;
  double total = 0.0;
# ifdef _OPENMP
  int numthreads;
# pragma omp parallel
  {
#   pragma omp master
    {
      numthreads = omp_get_num_threads();
      mprintf("\tParallelizing calculation with %i threads\n", numthreads);
    }
  }
# endif
  if (q_data_ == 0) {
    double val;
    // Calculate KDE, loop over input data
#   ifdef _OPENMP
    int mythread;
    double **P_thread;
#   pragma omp parallel private(frame, bin, val, increment, mythread) reduction(+:total)
    {
      mythread = omp_get_thread_num();
      // Prevent race conditions by giving each thread its own histogram
#     pragma omp master
      {
        P_thread = new double*[ numthreads ];
        for (int nt = 0; nt < numthreads; nt++) {
          P_thread[nt] = new double[ outSize ];
          std::fill(P_thread[nt], P_thread[nt] + outSize, 0.0);
        }
      }
#     pragma omp barrier
#     pragma omp for
#   endif
      for (frame = 0; frame < inSize; frame++) {
        val = Pdata.Dval(frame);
        increment = Increments[frame];
        total += increment;
        // Apply kernel across histogram
        for (bin = 0; bin < outSize; bin++)
#         ifdef _OPENMP
          P_thread[mythread][bin] +=
#         else
          P_hist[bin] += 
#         endif
            (increment * (this->*Kernel_)( (Xdim.Coord(bin) - val) / bandwidth_ ));
      }
#   ifdef _OPENMP
    } // END parallel block
    // Combine results from each thread histogram into P_hist
    for (int i = 0; i < numthreads; i++) {
      for (int j = 0; j < outSize; j++)
        P_hist[j] += P_thread[i][j];
      delete[] P_thread[i];
    }
    delete[] P_thread;
#   endif
  } else {
    // Calculate Kullback-Leibler divergence vs time
    DataSet_1D const& Qdata = static_cast<DataSet_1D const&>( *q_data_ );
    if (inSize != (int)Qdata.Size()) {
      mprintf("Warning: Size of %s (%zu) != size of %s (%zu)\n",
                Pdata.legend(), Pdata.Size(), Qdata.legend(), Qdata.Size());
      inSize = std::min( inSize, (int)Qdata.Size() );
      mprintf("Warning:  Only using %i data points.\n", inSize);
    }
    DataSet_double& klOut = static_cast<DataSet_double&>( *kldiv_ );
    std::vector<double> Q_hist( Xdim.Bins(), 0.0 ); // Raw Q histogram.
    klOut.Resize( inSize ); // Hold KL div vs time
    double val_p, val_q, KL, xcrd, Pnorm, Qnorm, normP, normQ;
    bool Pzero, Qzero;
    // Loop over input P and Q data
    unsigned int nInvalid = 0, validPoint;
    for (frame = 0; frame < inSize; frame++) {
      //mprintf("DEBUG: Frame=%i Outsize=%i\n", frame, outSize);
      increment = Increments[frame];
      total += increment;
      // Apply kernel across P and Q, calculate KL divergence as we go. 
      val_p = Pdata.Dval(frame);
      val_q = Qdata.Dval(frame);
      normP = 0.0;
      normQ = 0.0;
      validPoint = 0; // Stays 0 while KL is defined; incremented for any bin where only one of P/Q is zero.
#     ifdef _OPENMP
#     pragma omp parallel private(bin, xcrd) reduction(+:normP, normQ)
      {
#       pragma omp for
#       endif
        for (bin = 0; bin < outSize; bin++) {
          xcrd = Xdim.Coord(bin);
          P_hist[bin] += (increment * (this->*Kernel_)( (xcrd - val_p) / bandwidth_ ));
          normP += P_hist[bin];
          Q_hist[bin] += (increment * (this->*Kernel_)( (xcrd - val_q) / bandwidth_ ));
          normQ += Q_hist[bin];
        }
#     ifdef _OPENMP
      } // End first parallel block
#     endif
      if (normP > std::numeric_limits<double>::min())
        normP = 1.0 / normP;
      if (normQ > std::numeric_limits<double>::min())
        normQ = 1.0 / normQ;
      KL = 0.0;
#     ifdef _OPENMP
#     pragma omp parallel private(bin, Pnorm, Qnorm, Pzero, Qzero) reduction(+:KL, validPoint)
      {
#       pragma omp for
#       endif
        for (bin = 0; bin < outSize; bin++) {
          // KL only defined when Q and P are non-zero, or both zero.
          if (validPoint == 0) {
            // Normalize for this frame
            Pnorm = P_hist[bin] * normP;
            Qnorm = Q_hist[bin] * normQ;
            //mprintf("Frame %8i Bin %8i P=%g Q=%g Pnorm=%g Qnorm=%g\n",frame,bin,P_hist[bin],Q_hist[bin],normP,normQ);
            Pzero = (Pnorm <= std::numeric_limits<double>::min());
            Qzero = (Qnorm <= std::numeric_limits<double>::min());
            if (!Pzero && !Qzero)
              KL += ( log( Pnorm / Qnorm ) * Pnorm );
            else if ( Pzero != Qzero )
              validPoint++;
          }
        }
#       ifdef _OPENMP
      } // End second parallel block
#     endif
      if (validPoint == 0) {
        klOut[frame] = KL;
      } else {
        //mprintf("Warning:\tKullback-Leibler divergence is undefined for frame %i\n", frame+1);
        nInvalid++;
      }
    } // END KL divergence calc loop over frames
    if (nInvalid > 0)
      mprintf("Warning:\tKullback-Leibler divergence was undefined for %u frames.\n", nInvalid);
  }

  // Normalize
  for (unsigned int j = 0; j < P_hist.Size(); j++)
    P_hist[j] /= (total * bandwidth_);

  // Calculate free energy as -kT * ln(P), shifted so the minimum free energy is zero.
  if (calcFreeE_) {
    double KT = (-Constants::GASK_KCAL * Temp_); // -kT in kcal/mol
    double minFreeE = 0.0;
    for (unsigned int j = 0; j < P_hist.Size(); j++) {
      P_hist[j] = log( P_hist[j] ) * KT;
      if (j == 0)
        minFreeE = P_hist[j];
      else if (P_hist[j] < minFreeE)
        minFreeE = P_hist[j];
    }
    for (unsigned int j = 0; j < P_hist.Size(); j++)
      P_hist[j] -= minFreeE;
  }

  return Analysis::OK;
}
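The Kullback-Leibler branch above normalizes the running P and Q histograms every frame and accumulates P * ln(P/Q), counting the frame as undefined when exactly one of the two normalized bins is zero. A standalone sketch of that per-frame step, with plain vectors standing in for the histograms (illustration only, not the class method):

#include <cmath>
#include <cstdio>
#include <limits>
#include <vector>

int main() {
  std::vector<double> P = {0.1, 0.4, 0.3, 0.2, 0.0};
  std::vector<double> Q = {0.2, 0.3, 0.3, 0.2, 0.0};
  double normP = 0.0, normQ = 0.0;
  for (size_t i = 0; i < P.size(); i++) { normP += P[i]; normQ += Q[i]; }
  // Invert the norms so each bin can be normalized by multiplication, as in the code above.
  if (normP > std::numeric_limits<double>::min()) normP = 1.0 / normP;
  if (normQ > std::numeric_limits<double>::min()) normQ = 1.0 / normQ;
  double KL = 0.0;
  bool valid = true;
  for (size_t bin = 0; bin < P.size() && valid; bin++) {
    double p = P[bin] * normP;
    double q = Q[bin] * normQ;
    bool pZero = (p <= std::numeric_limits<double>::min());
    bool qZero = (q <= std::numeric_limits<double>::min());
    if (!pZero && !qZero)
      KL += p * std::log(p / q); // accumulate P * ln(P/Q)
    else if (pZero != qZero)
      valid = false;             // KL undefined when exactly one distribution is zero here
  }
  if (valid) std::printf("KL = %g\n", KL);
  else       std::printf("KL undefined for this frame\n");
  return 0;
}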
Example #3
int KDE::CalcKDE(DataSet_double& Out, DataSet_1D const& Pdata,
                 HistBin const& Xdim, double bandwidth) const
{
  std::vector<double> Increments(Pdata.Size(), 1.0);
  return CalcKDE(Out, Pdata, Increments, Xdim, bandwidth);
}
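The unit increments built here are the unweighted case of the per-point weights used in Example #2, where each kernel contribution is scaled by the increment and the histogram is normalized by the weighted total times the bandwidth. A standalone sketch of that weighted evaluation, assuming a Gaussian kernel and a bin-center convention for the grid coordinates (both assumptions; the real code selects Kernel_ and Xdim elsewhere):

#include <cmath>
#include <cstdio>
#include <vector>

// Sketch only: weighted KDE over a fixed grid.
static double gaussianKernel(double u) {
  return 0.3989422804014327 * std::exp(-0.5 * u * u); // (1/sqrt(2*pi)) * exp(-u^2/2)
}

int main() {
  std::vector<double> data       = {1.0, 1.2, 2.1, 2.3, 3.7};
  std::vector<double> increments = {1.0, 1.0, 2.0, 2.0, 1.0}; // all 1.0 reproduces the overload above
  double bandwidth = 0.5, min = 0.0, step = 0.5;
  int bins = 10;
  std::vector<double> hist(bins, 0.0);
  double total = 0.0;
  for (size_t i = 0; i < data.size(); i++) {
    total += increments[i];
    for (int bin = 0; bin < bins; bin++) {
      double xcrd = min + (bin + 0.5) * step; // assumed bin-center coordinate
      hist[bin] += increments[i] * gaussianKernel((xcrd - data[i]) / bandwidth);
    }
  }
  for (int bin = 0; bin < bins; bin++)        // normalize as in Example #2
    std::printf("%8.3f %g\n", min + (bin + 0.5) * step, hist[bin] / (total * bandwidth));
  return 0;
}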