/** Given an ArgList containing name,[min,max,step,bins,col,N], set up a * coordinate with that name and parameters min, max, step, bins. * If '*' or not specified, a default value will be set. * \return 1 if error occurs, 0 otherwise. */ int Analysis_Hist::setupDimension(ArgList &arglist, DataSet_1D const& dset, size_t& offset) { bool minArg = false; bool maxArg = false; bool stepArg = false; bool binsArg = false; if (debug_>1) arglist.PrintList(); // Set up dimension name // NOTE: arglist[0] should be same as dset name from CheckDimension std::string const& dLabel = arglist[0]; // Cycle through coordinate arguments. Any argument left blank will be // assigned a default value later. double dMin = 0.0; double dMax = 0.0; double dStep = 0.0; int dBins = -1; for (int i = 1; i < arglist.Nargs(); i++) { if (debug_>1) mprintf("DEBUG: setupCoord: Token %i (%s)\n", i, arglist[i].c_str()); // '*' means default explicitly requested if (arglist[i] == "*") continue; switch (i) { case 1 : dMin = convertToDouble( arglist[i]); minArg = true; break; case 2 : dMax = convertToDouble( arglist[i]); maxArg = true; break; case 3 : dStep = convertToDouble( arglist[i]); stepArg = true; break; case 4 : dBins = convertToInteger(arglist[i]); binsArg = true; break; } } // If no min arg and no default min arg, get min from dataset if (!minArg) { if (!minArgSet_) dMin = dset.Min(); else dMin = default_min_; } // If no max arg and no default max arg, get max from dataset if (!maxArg) { if (!maxArgSet_) dMax = dset.Max(); else dMax = default_max_; } // If bins/step not specified, use default if (!binsArg) dBins = default_bins_; if (!stepArg) dStep = default_step_; // Calculate dimension from given args. HistBin dim; if (dim.CalcBinsOrStep( dMin, dMax, dStep, dBins, dLabel )) { mprinterr("Error: Could not set up histogram dimension '%s'\n", dLabel.c_str()); return 1; } dim.PrintHistBin(); dimensions_.push_back( dim ); // Recalculate offsets for all dimensions starting at farthest coord. 
This // follows row major ordering. size_t last_offset = 1UL; // For checking overflow. offset = 1UL; binOffsets_.resize( dimensions_.size() ); OffType::iterator bOff = binOffsets_.begin(); for (HdimType::const_iterator rd = dimensions_.begin(); rd != dimensions_.end(); ++rd, ++bOff) { if (debug_>0) mprintf("\tHistogram: %s offset is %zu\n", rd->label(), offset); *bOff = (long int)offset; offset *= rd->Bins(); // Check for overflow. if ( offset < last_offset ) { mprinterr("Error: Too many bins for histogram. Try reducing the number of bins and/or\n" "Error: the number of dimensions.\n"); return 1; } last_offset = offset; } // offset should now be equal to the total number of bins across all dimensions if (debug_>0) mprintf("\tHistogram: Total Bins = %zu\n",offset); return 0; }
/** Calculate a KDE for the given data, choosing the histogram range, the
  * number of bins, and the bandwidth from the data itself.
  * The range is the observed [min, max], the number of bins is the integer
  * part of sqrt(number of points), and the bandwidth is
  * 1.06 * stdev * BandwidthFactor(number of points). Every point is given
  * an increment of 1.0.
  * \param Out Output set, filled by the 5-argument CalcKDE overload.
  * \param Pdata Input data; must contain at least 2 points.
  * \return 1 if there is not enough data or the histogram dimension could
  *         not be set up; otherwise the result of the 5-argument overload.
  */
int KDE::CalcKDE(DataSet_double& Out, DataSet_1D const& Pdata) const {
  if (Pdata.Size() < 2) {
    mprinterr("Error: Not enough data for KDE.\n");
    return 1;
  }

  // One pass over the data: track the extremes and accumulate a running
  // mean plus the sum of squared deviations from it.
  // NOTE: A previous, now disabled, approach sorted the data and derived
  //       the step size from the first and third quartiles.
  double lowest = Pdata.Dval(0);
  double highest = lowest;
  double count = 0.0;
  double runningMean = 0.0;
  double sumSqDev = 0.0;
  for (unsigned int idx = 0; idx != Pdata.Size(); idx++) {
    double const xval = Pdata.Dval(idx);
    if (xval < lowest)  lowest = xval;
    if (highest < xval) highest = xval;
    count += 1.0;
    double const diff = xval - runningMean;
    runningMean += diff / count;
    sumSqDev += diff * (xval - runningMean);
  }
  // Sample variance (N - 1 in the denominator), then standard deviation.
  sumSqDev /= (count - 1.0);
  double sigma = sqrt(sumSqDev);

  // A step of zero is passed together with an explicit bin count.
  double binStep = 0.0;
  int nBins = (int)sqrt((double)Pdata.Size());
  mprintf("DEBUG: mean= %g, stdev= %g\n", runningMean, sigma);

  HistBin Xdim;
  int const dimErr = Xdim.CalcBinsOrStep(lowest, highest, binStep, nBins, Pdata.Meta().Legend());
  if (dimErr)
    return 1;
  Xdim.PrintHistBin();

  // Automatically determine bandwidth
  double bandwidth = 1.06 * sigma * BandwidthFactor(Pdata.Size());
  mprintf("\tBandwidth: %f\n", bandwidth);

  // Every data point contributes with unit weight.
  std::vector<double> unitIncrements(Pdata.Size(), 1.0);
  return CalcKDE(Out, Pdata, unitIncrements, Xdim, bandwidth);
}
// Analysis_KDE::Analyze() Analysis::RetType Analysis_KDE::Analyze() { DataSet_1D const& Pdata = static_cast<DataSet_1D const&>( *data_ ); int inSize = (int)Pdata.Size(); // Set output set dimensions from input set if necessary. if (!minArgSet_) { mprintf("\tNo minimum specified, determining from input data.\n"); if (q_data_ != 0) default_min_ = std::max(((DataSet_1D*)q_data_)->Min(), Pdata.Min()); else default_min_ = Pdata.Min(); } if (!maxArgSet_) { mprintf("\tNo maximum specified, determining from input data.\n"); if (q_data_ != 0) default_max_ = std::min(((DataSet_1D*)q_data_)->Max(), Pdata.Max()); else default_max_ = Pdata.Max(); } HistBin Xdim; if (Xdim.CalcBinsOrStep(default_min_, default_max_, default_step_, default_bins_, Pdata.Meta().Legend())) return Analysis::ERR; Xdim.PrintHistBin(); output_->SetDim( Dimension::X, Xdim ); // Allocate output set DataSet_double& P_hist = static_cast<DataSet_double&>( *output_ ); P_hist.Resize( Xdim.Bins() ); int outSize = (int)P_hist.Size(); // Estimate bandwidth from normal distribution approximation if necessary. 
if (bandwidth_ < 0.0) { double stdev; Pdata.Avg( stdev ); double N_to_1_over_5 = pow( (double)inSize, (-1.0/5.0) ); bandwidth_ = 1.06 * stdev * N_to_1_over_5; mprintf("\tDetermined bandwidth from normal distribution approximation: %f\n", bandwidth_); } // Set up increments std::vector<double> Increments(inSize, 1.0); if (amddata_ != 0) { DataSet_1D& AMD = static_cast<DataSet_1D&>( *amddata_ ); if ((int)AMD.Size() != inSize) { if ((int)AMD.Size() < inSize) { mprinterr("Error: Size of AMD data set %zu < input data set iu\n", AMD.Size(), inSize); return Analysis::ERR; } else { mprintf("Warning: Size of AMD data set %zu > input data set %i\n", AMD.Size(), inSize); } } for (int i = 0; i < inSize; i++) Increments[i] = exp( AMD.Dval(i) ); } int frame, bin; double increment; double total = 0.0; # ifdef _OPENMP int numthreads; # pragma omp parallel { # pragma omp master { numthreads = omp_get_num_threads(); mprintf("\tParallelizing calculation with %i threads\n", numthreads); } } # endif if (q_data_ == 0) { double val; // Calculate KDE, loop over input data # ifdef _OPENMP int mythread; double **P_thread; # pragma omp parallel private(frame, bin, val, increment, mythread) reduction(+:total) { mythread = omp_get_thread_num(); // Prevent race conditions by giving each thread its own histogram # pragma omp master { P_thread = new double*[ numthreads ]; for (int nt = 0; nt < numthreads; nt++) { P_thread[nt] = new double[ outSize ]; std::fill(P_thread[nt], P_thread[nt] + outSize, 0.0); } } # pragma omp barrier # pragma omp for # endif for (frame = 0; frame < inSize; frame++) { val = Pdata.Dval(frame); increment = Increments[frame]; total += increment; // Apply kernel across histogram for (bin = 0; bin < outSize; bin++) # ifdef _OPENMP P_thread[mythread][bin] += # else P_hist[bin] += # endif (increment * (this->*Kernel_)( (Xdim.Coord(bin) - val) / bandwidth_ )); } # ifdef _OPENMP } // END parallel block // Combine results from each thread histogram into P_hist for (int i = 0; i < 
numthreads; i++) { for (int j = 0; j < outSize; j++) P_hist[j] += P_thread[i][j]; delete[] P_thread[i]; } delete[] P_thread; # endif } else { // Calculate Kullback-Leibler divergence vs time DataSet_1D const& Qdata = static_cast<DataSet_1D const&>( *q_data_ ); if (inSize != (int)Qdata.Size()) { mprintf("Warning: Size of %s (%zu) != size of %s (%zu)\n", Pdata.legend(), Pdata.Size(), Qdata.legend(), Qdata.Size()); inSize = std::min( inSize, (int)Qdata.Size() ); mprintf("Warning: Only using %i data points.\n", inSize); } DataSet_double& klOut = static_cast<DataSet_double&>( *kldiv_ ); std::vector<double> Q_hist( Xdim.Bins(), 0.0 ); // Raw Q histogram. klOut.Resize( inSize ); // Hold KL div vs time double val_p, val_q, KL, xcrd, Pnorm, Qnorm, normP, normQ; bool Pzero, Qzero; // Loop over input P and Q data unsigned int nInvalid = 0, validPoint; for (frame = 0; frame < inSize; frame++) { //mprintf("DEBUG: Frame=%i Outsize=%i\n", frame, outSize); increment = Increments[frame]; total += increment; // Apply kernel across P and Q, calculate KL divergence as we go. 
val_p = Pdata.Dval(frame); val_q = Qdata.Dval(frame); normP = 0.0; normQ = 0.0; validPoint = 0; // 0 in this context means true # ifdef _OPENMP # pragma omp parallel private(bin, xcrd) reduction(+:normP, normQ) { # pragma omp for # endif for (bin = 0; bin < outSize; bin++) { xcrd = Xdim.Coord(bin); P_hist[bin] += (increment * (this->*Kernel_)( (xcrd - val_p) / bandwidth_ )); normP += P_hist[bin]; Q_hist[bin] += (increment * (this->*Kernel_)( (xcrd - val_q) / bandwidth_ )); normQ += Q_hist[bin]; } # ifdef _OPENMP } // End first parallel block # endif if (normP > std::numeric_limits<double>::min()) normP = 1.0 / normP; if (normQ > std::numeric_limits<double>::min()) normQ = 1.0 / normQ; KL = 0.0; # ifdef _OPENMP # pragma omp parallel private(bin, Pnorm, Qnorm, Pzero, Qzero) reduction(+:KL, validPoint) { # pragma omp for # endif for (bin = 0; bin < outSize; bin++) { // KL only defined when Q and P are non-zero, or both zero. if (validPoint == 0) { // Normalize for this frame Pnorm = P_hist[bin] * normP; Qnorm = Q_hist[bin] * normQ; //mprintf("Frame %8i Bin %8i P=%g Q=%g Pnorm=%g Qnorm=%g\n",frame,bin,P_hist[bin],Q_hist[bin],normP,normQ); Pzero = (Pnorm <= std::numeric_limits<double>::min()); Qzero = (Qnorm <= std::numeric_limits<double>::min()); if (!Pzero && !Qzero) KL += ( log( Pnorm / Qnorm ) * Pnorm ); else if ( Pzero != Qzero ) validPoint++; } } # ifdef _OPENMP } // End second parallel block # endif if (validPoint == 0) { klOut[frame] = KL; } else { //mprintf("Warning:\tKullback-Leibler divergence is undefined for frame %i\n", frame+1); nInvalid++; } } // END KL divergence calc loop over frames if (nInvalid > 0) mprintf("Warning:\tKullback-Leibler divergence was undefined for %u frames.\n", nInvalid); } // Normalize for (unsigned int j = 0; j < P_hist.Size(); j++) P_hist[j] /= (total * bandwidth_); // Calc free E if (calcFreeE_) { double KT = (-Constants::GASK_KCAL * Temp_); double minFreeE = 0.0; for (unsigned int j = 0; j < P_hist.Size(); j++) { P_hist[j] = 
log( P_hist[j] ) * KT; if (j == 0) minFreeE = P_hist[j]; else if (P_hist[j] < minFreeE) minFreeE = P_hist[j]; } for (unsigned int j = 0; j < P_hist.Size(); j++) P_hist[j] -= minFreeE; } return Analysis::OK; }
int KDE::CalcKDE(DataSet_double& Out, DataSet_1D const& Pdata, std::vector<double> const& Increments, HistBin const& Xdim, double bandwidth) const { int inSize = (int)Pdata.Size(); // Allocate output set, set all to zero. Out.Zero( Xdim.Bins() ); Out.SetDim( Dimension::X, Xdim ); int outSize = (int)Out.Size(); int frame, bin; double increment, val; double total = 0.0; # ifdef _OPENMP int original_num_threads; # pragma omp parallel { # pragma omp master { original_num_threads = omp_get_num_threads(); } } // Ensure we only execute with the desired number of threads if (numthreads_ < original_num_threads) omp_set_num_threads( numthreads_ ); # endif // Calculate KDE, loop over input data # ifdef _OPENMP int mythread; double **P_thread; # pragma omp parallel private(frame, bin, val, increment, mythread) reduction(+:total) { mythread = omp_get_thread_num(); // Prevent race conditions by giving each thread its own histogram # pragma omp master { P_thread = new double*[ numthreads_ ]; for (int nt = 0; nt < numthreads_; nt++) { P_thread[nt] = new double[ outSize ]; std::fill(P_thread[nt], P_thread[nt] + outSize, 0.0); } } # pragma omp barrier # pragma omp for # endif for (frame = 0; frame < inSize; frame++) { val = Pdata.Dval(frame); increment = Increments[frame]; total += increment; // Apply kernel across histogram for (bin = 0; bin < outSize; bin++) # ifdef _OPENMP P_thread[mythread][bin] += # else Out[bin] += # endif (increment * (this->*Kernel_)( (Xdim.Coord(bin) - val) / bandwidth )); } # ifdef _OPENMP } // END parallel block // Combine results from each thread histogram into Out for (int i = 0; i < numthreads_; i++) { for (int j = 0; j < outSize; j++) Out[j] += P_thread[i][j]; delete[] P_thread[i]; } delete[] P_thread; // Restore original number of threads if (original_num_threads != numthreads_) omp_set_num_threads( original_num_threads ); # endif // Normalize for (unsigned int j = 0; j < Out.Size(); j++) Out[j] /= (total * bandwidth); return 0; }