// KDE::CalcKDE()
/** Calculate a kernel density estimate for the given data set, choosing the
  * histogram dimensions and kernel bandwidth automatically.
  * Min/max are taken directly from the data, the bin count follows the
  * square-root rule (bins = sqrt(N), step left at 0 so CalcBinsOrStep
  * derives it), and the bandwidth uses the normal-distribution
  * "rule of thumb" 1.06 * sigma * BandwidthFactor(N).
  * \param Out Output set that will hold the KDE histogram.
  * \param Pdata Input 1D data set (must contain at least 2 points).
  * \return 0 on success, 1 on error.
  */
int KDE::CalcKDE(DataSet_double& Out, DataSet_1D const& Pdata) const {
  if (Pdata.Size() < 2) {
    mprinterr("Error: Not enough data for KDE.\n");
    return 1;
  }
  // Single pass over the data: track min/max and accumulate mean/variance
  // with Welford's online algorithm.
  // (An earlier Freedman-Diaconis / quartile-based step determination was
  // abandoned; see repository history.)
  double count   = 0.0;
  double runMean = 0.0;
  double sumSq   = 0.0;
  double vmin = Pdata.Dval(0);
  double vmax = vmin;
  for (unsigned int idx = 0; idx != Pdata.Size(); idx++) {
    double dval = Pdata.Dval(idx);
    if (dval < vmin) vmin = dval;
    if (dval > vmax) vmax = dval;
    count += 1.0;
    double diff = dval - runMean;
    runMean += diff / count;
    sumSq   += diff * (dval - runMean);
  }
  // Convert sum of squared deviations to sample variance, then std. dev.
  sumSq /= (count - 1.0);
  double sigma = sqrt( sumSq );
  // Square-root rule for bin count; step of 0 tells CalcBinsOrStep to
  // derive the step from min/max/bins.
  double histStep = 0.0;
  int histBins = (int)sqrt( (double)Pdata.Size() );
  mprintf("DEBUG: mean= %g, stdev= %g\n", runMean, sigma);
  HistBin Xdim;
  if (Xdim.CalcBinsOrStep(vmin, vmax, histStep, histBins, Pdata.Meta().Legend()))
    return 1;
  Xdim.PrintHistBin();
  // Automatically determine bandwidth via the normal-distribution
  // approximation (cf. Analyze(): 1.06 * stdev * N^(-1/5)).
  double bw = 1.06 * sigma * BandwidthFactor(Pdata.Size());
  mprintf("\tBandwidth: %f\n", bw);
  // All points weighted equally.
  std::vector<double> wts(Pdata.Size(), 1.0);
  return CalcKDE(Out, Pdata, wts, Xdim, bw);
}
// Analysis_KDE::Analyze() Analysis::RetType Analysis_KDE::Analyze() { DataSet_1D const& Pdata = static_cast<DataSet_1D const&>( *data_ ); int inSize = (int)Pdata.Size(); // Set output set dimensions from input set if necessary. if (!minArgSet_) { mprintf("\tNo minimum specified, determining from input data.\n"); if (q_data_ != 0) default_min_ = std::max(((DataSet_1D*)q_data_)->Min(), Pdata.Min()); else default_min_ = Pdata.Min(); } if (!maxArgSet_) { mprintf("\tNo maximum specified, determining from input data.\n"); if (q_data_ != 0) default_max_ = std::min(((DataSet_1D*)q_data_)->Max(), Pdata.Max()); else default_max_ = Pdata.Max(); } HistBin Xdim; if (Xdim.CalcBinsOrStep(default_min_, default_max_, default_step_, default_bins_, Pdata.Meta().Legend())) return Analysis::ERR; Xdim.PrintHistBin(); output_->SetDim( Dimension::X, Xdim ); // Allocate output set DataSet_double& P_hist = static_cast<DataSet_double&>( *output_ ); P_hist.Resize( Xdim.Bins() ); int outSize = (int)P_hist.Size(); // Estimate bandwidth from normal distribution approximation if necessary. 
if (bandwidth_ < 0.0) { double stdev; Pdata.Avg( stdev ); double N_to_1_over_5 = pow( (double)inSize, (-1.0/5.0) ); bandwidth_ = 1.06 * stdev * N_to_1_over_5; mprintf("\tDetermined bandwidth from normal distribution approximation: %f\n", bandwidth_); } // Set up increments std::vector<double> Increments(inSize, 1.0); if (amddata_ != 0) { DataSet_1D& AMD = static_cast<DataSet_1D&>( *amddata_ ); if ((int)AMD.Size() != inSize) { if ((int)AMD.Size() < inSize) { mprinterr("Error: Size of AMD data set %zu < input data set iu\n", AMD.Size(), inSize); return Analysis::ERR; } else { mprintf("Warning: Size of AMD data set %zu > input data set %i\n", AMD.Size(), inSize); } } for (int i = 0; i < inSize; i++) Increments[i] = exp( AMD.Dval(i) ); } int frame, bin; double increment; double total = 0.0; # ifdef _OPENMP int numthreads; # pragma omp parallel { # pragma omp master { numthreads = omp_get_num_threads(); mprintf("\tParallelizing calculation with %i threads\n", numthreads); } } # endif if (q_data_ == 0) { double val; // Calculate KDE, loop over input data # ifdef _OPENMP int mythread; double **P_thread; # pragma omp parallel private(frame, bin, val, increment, mythread) reduction(+:total) { mythread = omp_get_thread_num(); // Prevent race conditions by giving each thread its own histogram # pragma omp master { P_thread = new double*[ numthreads ]; for (int nt = 0; nt < numthreads; nt++) { P_thread[nt] = new double[ outSize ]; std::fill(P_thread[nt], P_thread[nt] + outSize, 0.0); } } # pragma omp barrier # pragma omp for # endif for (frame = 0; frame < inSize; frame++) { val = Pdata.Dval(frame); increment = Increments[frame]; total += increment; // Apply kernel across histogram for (bin = 0; bin < outSize; bin++) # ifdef _OPENMP P_thread[mythread][bin] += # else P_hist[bin] += # endif (increment * (this->*Kernel_)( (Xdim.Coord(bin) - val) / bandwidth_ )); } # ifdef _OPENMP } // END parallel block // Combine results from each thread histogram into P_hist for (int i = 0; i < 
numthreads; i++) { for (int j = 0; j < outSize; j++) P_hist[j] += P_thread[i][j]; delete[] P_thread[i]; } delete[] P_thread; # endif } else { // Calculate Kullback-Leibler divergence vs time DataSet_1D const& Qdata = static_cast<DataSet_1D const&>( *q_data_ ); if (inSize != (int)Qdata.Size()) { mprintf("Warning: Size of %s (%zu) != size of %s (%zu)\n", Pdata.legend(), Pdata.Size(), Qdata.legend(), Qdata.Size()); inSize = std::min( inSize, (int)Qdata.Size() ); mprintf("Warning: Only using %i data points.\n", inSize); } DataSet_double& klOut = static_cast<DataSet_double&>( *kldiv_ ); std::vector<double> Q_hist( Xdim.Bins(), 0.0 ); // Raw Q histogram. klOut.Resize( inSize ); // Hold KL div vs time double val_p, val_q, KL, xcrd, Pnorm, Qnorm, normP, normQ; bool Pzero, Qzero; // Loop over input P and Q data unsigned int nInvalid = 0, validPoint; for (frame = 0; frame < inSize; frame++) { //mprintf("DEBUG: Frame=%i Outsize=%i\n", frame, outSize); increment = Increments[frame]; total += increment; // Apply kernel across P and Q, calculate KL divergence as we go. 
val_p = Pdata.Dval(frame); val_q = Qdata.Dval(frame); normP = 0.0; normQ = 0.0; validPoint = 0; // 0 in this context means true # ifdef _OPENMP # pragma omp parallel private(bin, xcrd) reduction(+:normP, normQ) { # pragma omp for # endif for (bin = 0; bin < outSize; bin++) { xcrd = Xdim.Coord(bin); P_hist[bin] += (increment * (this->*Kernel_)( (xcrd - val_p) / bandwidth_ )); normP += P_hist[bin]; Q_hist[bin] += (increment * (this->*Kernel_)( (xcrd - val_q) / bandwidth_ )); normQ += Q_hist[bin]; } # ifdef _OPENMP } // End first parallel block # endif if (normP > std::numeric_limits<double>::min()) normP = 1.0 / normP; if (normQ > std::numeric_limits<double>::min()) normQ = 1.0 / normQ; KL = 0.0; # ifdef _OPENMP # pragma omp parallel private(bin, Pnorm, Qnorm, Pzero, Qzero) reduction(+:KL, validPoint) { # pragma omp for # endif for (bin = 0; bin < outSize; bin++) { // KL only defined when Q and P are non-zero, or both zero. if (validPoint == 0) { // Normalize for this frame Pnorm = P_hist[bin] * normP; Qnorm = Q_hist[bin] * normQ; //mprintf("Frame %8i Bin %8i P=%g Q=%g Pnorm=%g Qnorm=%g\n",frame,bin,P_hist[bin],Q_hist[bin],normP,normQ); Pzero = (Pnorm <= std::numeric_limits<double>::min()); Qzero = (Qnorm <= std::numeric_limits<double>::min()); if (!Pzero && !Qzero) KL += ( log( Pnorm / Qnorm ) * Pnorm ); else if ( Pzero != Qzero ) validPoint++; } } # ifdef _OPENMP } // End second parallel block # endif if (validPoint == 0) { klOut[frame] = KL; } else { //mprintf("Warning:\tKullback-Leibler divergence is undefined for frame %i\n", frame+1); nInvalid++; } } // END KL divergence calc loop over frames if (nInvalid > 0) mprintf("Warning:\tKullback-Leibler divergence was undefined for %u frames.\n", nInvalid); } // Normalize for (unsigned int j = 0; j < P_hist.Size(); j++) P_hist[j] /= (total * bandwidth_); // Calc free E if (calcFreeE_) { double KT = (-Constants::GASK_KCAL * Temp_); double minFreeE = 0.0; for (unsigned int j = 0; j < P_hist.Size(); j++) { P_hist[j] = 
log( P_hist[j] ) * KT; if (j == 0) minFreeE = P_hist[j]; else if (P_hist[j] < minFreeE) minFreeE = P_hist[j]; } for (unsigned int j = 0; j < P_hist.Size(); j++) P_hist[j] -= minFreeE; } return Analysis::OK; }
// KDE::CalcKDE()
/** Convenience overload: perform the KDE with the given histogram dimensions
  * and bandwidth, weighting every data point equally.
  */
int KDE::CalcKDE(DataSet_double& Out, DataSet_1D const& Pdata,
                 HistBin const& Xdim, double bandwidth) const
{
  // Uniform weight of 1.0 for every point.
  std::vector<double> uniformWeights( Pdata.Size(), 1.0 );
  return CalcKDE( Out, Pdata, uniformWeights, Xdim, bandwidth );
}