void CMT::WhiteningTransform::initialize(const ArrayXXd& input, int dimOut) { if(input.cols() < input.rows()) throw Exception("Too few inputs to compute whitening transform."); mMeanIn = input.rowwise().mean(); // compute covariances MatrixXd covXX = covariance(input); // input whitening SelfAdjointEigenSolver<MatrixXd> eigenSolver; eigenSolver.compute(covXX); Array<double, 1, Dynamic> eigenvalues = eigenSolver.eigenvalues(); MatrixXd eigenvectors = eigenSolver.eigenvectors(); // don't whiten directions with near-zero variance for(int i = 0; i < eigenvalues.size(); ++i) if(eigenvalues[i] < 1e-7) eigenvalues[i] = 1.; mPreIn = (eigenvectors.array().rowwise() * eigenvalues.sqrt().cwiseInverse()).matrix() * eigenvectors.transpose(); mPreInInv = (eigenvectors.array().rowwise() * eigenvalues.sqrt()).matrix() * eigenvectors.transpose(); mMeanOut = VectorXd::Zero(dimOut); mPreOut = MatrixXd::Identity(dimOut, dimOut); mPreOutInv = MatrixXd::Identity(dimOut, dimOut); mPredictor = MatrixXd::Zero(dimOut, input.rows()); mGradTransform = MatrixXd::Zero(dimOut, input.rows()); mLogJacobian = 1.; }
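// For reference, the input whitening above is the symmetric (ZCA-style) construction C^(-1/2) = V diag(1/sqrt(lambda)) V^T.
// Below is a minimal standalone sketch of that step, assuming only Eigen: the library's covariance() helper is replaced by an
// explicit sample covariance, and near-zero eigenvalues are simply floored here rather than left unwhitened as in the code above.
#include <Eigen/Dense>
#include <iostream>

using Eigen::MatrixXd;
using Eigen::VectorXd;
using Eigen::SelfAdjointEigenSolver;

int main() {
	// toy data: 3 dimensions, 1000 samples (one column per sample)
	MatrixXd input = MatrixXd::Random(3, 1000);

	// sample covariance of the columns
	MatrixXd centered = input.colwise() - input.rowwise().mean();
	MatrixXd cov = centered * centered.transpose() / (input.cols() - 1.);

	// symmetric whitening matrix C^(-1/2) = V * diag(1/sqrt(lambda)) * V^T
	SelfAdjointEigenSolver<MatrixXd> solver(cov);
	VectorXd invSqrt = solver.eigenvalues().cwiseMax(1e-7).cwiseSqrt().cwiseInverse();
	MatrixXd whitening = solver.eigenvectors() * invSqrt.asDiagonal() * solver.eigenvectors().transpose();

	// whitened data should have (approximately) identity covariance
	MatrixXd white = whitening * centered;
	std::cout << white * white.transpose() / (input.cols() - 1.) << std::endl;

	return 0;
}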
void NestedSampler::setPosteriorSample(ArrayXXd newPosteriorSample) { Ndimensions = newPosteriorSample.rows(); int Nsamples = newPosteriorSample.cols(); posteriorSample.resize(Ndimensions, Nsamples); posteriorSample = newPosteriorSample; }
/** * \brief Calculate the loglikelihood of a linear regression contained * in a linear_reg object. * * @param X The design matrix. */ void linear_reg::logLikelihood(const mematrix<double>& X) { loglik = 0.; double halfrecsig2 = .5 / sigma2; // residuals = Y - intercept - X[, 1:] * beta[1:] double intercept = beta.get(0, 0); residuals.data = reg_data.Y.data.array() - intercept; ArrayXXd betacol = beta.data.block(1, 0, beta.data.rows() - 1, 1).array().transpose(); ArrayXXd resid_sub = (X.data.block(0, 1, X.data.rows(), X.data.cols() - 1) * betacol.matrix().asDiagonal()).rowwise().sum(); residuals.data -= resid_sub.matrix(); // Gaussian log-likelihood: -sum(residuals^2) / (2 sigma^2) - n * log(sigma) loglik -= (residuals.data.array().square() * halfrecsig2).sum(); loglik -= static_cast<double>(reg_data.nids) * log(sqrt(sigma2)); }
Array<int, 1, Dynamic> CMT::MCBM::samplePrior(const MatrixXd& input) const { if(input.rows() != dimIn()) throw Exception("Inputs have wrong dimensionality."); ArrayXXd featureEnergy = mWeights * (mFeatures.transpose() * input).array().square().matrix(); ArrayXXd biasEnergy = mInputBias.transpose() * input; ArrayXXd predictorEnergy = mPredictors * input; ArrayXXd tmp0 = (featureEnergy + biasEnergy).colwise() + mPriors.array(); ArrayXXd tmp1 = (tmp0 + predictorEnergy).colwise() + mOutputBias.array(); ArrayXXd logPrior = tmp0 + tmp1; logPrior.rowwise() -= logSumExp(logPrior); ArrayXXd prior = logPrior.exp(); Array<int, 1, Dynamic> labels(input.cols()); #pragma omp parallel for for(int j = 0; j < input.cols(); ++j) { int i = 0; double urand = static_cast<double>(rand()) / (static_cast<long>(RAND_MAX) + 1l); double cdf; // compute index for(cdf = prior(0, j); cdf < urand; cdf += prior(i, j)) ++i; labels[j] = i; } return labels; }
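// The per-column loop above is inverse-CDF sampling from the normalized prior over components. As a point of comparison,
// here is a standalone sketch of the same idea using the standard library's std::discrete_distribution; this is not the
// library's code, and note that the shared rand() state used above is not guaranteed to be thread-safe under OpenMP.
#include <random>
#include <vector>
#include <iostream>

int main() {
	// unnormalized component weights for one column (example values)
	std::vector<double> prior = {0.1, 0.3, 0.4, 0.2};

	std::mt19937 gen(0);
	std::discrete_distribution<int> dist(prior.begin(), prior.end());

	// draw a few component labels
	for(int i = 0; i < 5; ++i)
		std::cout << dist(gen) << " ";
	std::cout << std::endl;

	return 0;
}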
/* Multiply each row of u by temp */ MatrixXd arrayMultiplierRowWise(MatrixXd u,ArrayXXd temp,int n){ ArrayXXd uArray = u.array(); int i; for(i=0;i<n;i++){ uArray.row(i) *= temp; } return uArray.matrix(); }
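// If temp is a single row (which the row-wise *= above requires), Eigen's broadcasting expresses the same loop in one line.
// A small sketch, assuming temp holds one row of scaling factors:
#include <Eigen/Dense>
#include <iostream>

using Eigen::MatrixXd;
using Eigen::ArrayXXd;

int main() {
	MatrixXd u = MatrixXd::Random(4, 3);
	ArrayXXd temp = ArrayXXd::Random(1, 3);

	// multiply every row of u element-wise by the row vector temp
	MatrixXd scaled = (u.array().rowwise() * temp.row(0)).matrix();

	std::cout << scaled << std::endl;
	return 0;
}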
ArrayXXd CMT::tanh(const ArrayXXd& arr) { ArrayXXd result(arr.rows(), arr.cols()); #pragma omp parallel for for(int i = 0; i < arr.size(); ++i) result(i) = std::tanh(arr(i)); return result; }
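// The same element-wise map can be written with Eigen's unaryExpr (recent Eigen versions also provide arr.tanh() directly).
// A small sketch:
#include <Eigen/Dense>
#include <cmath>
#include <iostream>

using Eigen::ArrayXXd;

int main() {
	ArrayXXd arr = ArrayXXd::Random(2, 4);

	// apply std::tanh to every coefficient
	ArrayXXd result = arr.unaryExpr([](double x) { return std::tanh(x); });

	std::cout << result << std::endl;
	return 0;
}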
ArrayXXd CMT::HistogramNonlinearity::operator()(const ArrayXXd& inputs) const { ArrayXXd outputs(inputs.rows(), inputs.cols()); for(int i = 0; i < inputs.rows(); ++i) for(int j = 0; j < inputs.cols(); ++j) outputs(i, j) = mHistogram[bin(inputs(i, j))] + mEpsilon; return outputs; }
ArrayXXd CMT::HistogramNonlinearity::gradient(const ArrayXXd& inputs) const { if(inputs.rows() != 1) throw Exception("Data has to be stored in one row."); ArrayXXd gradient = ArrayXXd::Zero(mHistogram.size(), inputs.cols()); for(int i = 0; i < inputs.rows(); ++i) for(int j = 0; j < inputs.cols(); ++j) gradient(bin(inputs(i, j)), j) = 1; return gradient; }
MatrixXd CMT::MLR::predict(const MatrixXd& input) const { if(input.rows() != mDimIn) throw Exception("Inputs have wrong dimensionality."); // distribution over outputs ArrayXXd prob = (mWeights * input).colwise() + mBiases; prob.rowwise() -= logSumExp(prob); prob = prob.exp(); return prob; }
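// predict returns the column-wise softmax of the affine responses; subtracting the column-wise log-sum-exp before
// exponentiating keeps the computation stable for large inputs. A standalone sketch of that pattern; logSumExpCols is a
// local stand-in for the library's logSumExp helper, not its actual implementation.
#include <Eigen/Dense>
#include <iostream>

using Eigen::ArrayXXd;
using Eigen::MatrixXd;
using Eigen::VectorXd;

// log(sum(exp(column))) for each column, computed stably
Eigen::Array<double, 1, Eigen::Dynamic> logSumExpCols(const ArrayXXd& a) {
	Eigen::Array<double, 1, Eigen::Dynamic> m = a.colwise().maxCoeff();
	return m + (a.rowwise() - m).exp().colwise().sum().log();
}

int main() {
	MatrixXd weights = MatrixXd::Random(3, 5); // 3 classes, 5 input dimensions
	VectorXd biases = VectorXd::Random(3);
	MatrixXd input = MatrixXd::Random(5, 10);  // 10 samples, one per column

	ArrayXXd logProb = ((weights * input).colwise() + biases).array();
	logProb.rowwise() -= logSumExpCols(logProb);
	ArrayXXd prob = logProb.exp();

	std::cout << prob.colwise().sum() << std::endl; // each column sums to 1
	return 0;
}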
ArrayXXd CMT::BlobNonlinearity::operator()(const ArrayXXd& inputs) const { if(inputs.rows() != 1) throw Exception("Data has to be stored in one row."); ArrayXXd diff = ArrayXXd::Zero(mNumComponents, inputs.cols()); diff.rowwise() += inputs.row(0); diff.colwise() -= mMeans; ArrayXXd negEnergy = diff.square().colwise() * (-mLogPrecisions.exp() / 2.); return (mLogWeights.exp().transpose().matrix() * negEnergy.exp().matrix()).array() + mEpsilon; }
ArrayXXi CMT::sampleBinomial(const ArrayXXi& n, const ArrayXXd& p) { if(n.rows() != p.rows() || n.cols() != p.cols()) throw Exception("n and p must be of the same size."); ArrayXXi samples = ArrayXXi::Zero(n.rows(), n.cols()); #pragma omp parallel for for(int i = 0; i < samples.size(); ++i) { // very naive algorithm for generating binomial samples for(int k = 0; k < n(i); ++k) if(rand() / static_cast<double>(RAND_MAX) < p(i)) samples(i) += 1; } return samples; }
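// The per-trial loop above costs O(n) per entry; for large counts the standard library's std::binomial_distribution is an
// alternative. A standalone sketch (not the library's API), using a seeded Mersenne Twister instead of rand():
#include <Eigen/Dense>
#include <random>
#include <iostream>

using Eigen::ArrayXXi;
using Eigen::ArrayXXd;

int main() {
	ArrayXXi n(2, 2);
	n << 5, 10, 20, 50;
	ArrayXXd p(2, 2);
	p << 0.1, 0.5, 0.9, 0.25;

	std::mt19937 gen(0);
	ArrayXXi samples(n.rows(), n.cols());

	for(int i = 0; i < samples.size(); ++i) {
		std::binomial_distribution<int> binom(n(i), p(i));
		samples(i) = binom(gen);
	}

	std::cout << samples << std::endl;
	return 0;
}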
void operator()(std::size_t begin, std::size_t end) { for(std::size_t j = begin; j < end; j++){ ArrayXd d = s2*a_prec(j) + s1*e_prec(j); ArrayXd mlam = b.col(j).array() / d; effects.col(j) = U * (mlam + randn_draws.col(j) / sqrt(d)).matrix(); } }
Array<double, 1, Dynamic> CMT::MLR::logLikelihood( const MatrixXd& input, const MatrixXd& output) const { if(input.cols() != output.cols()) throw Exception("Number of inputs and outputs have to be the same."); if(input.rows() != mDimIn) throw Exception("Inputs have wrong dimensionality."); if(output.rows() != mDimOut) throw Exception("Output has wrong dimensionality."); // distribution over outputs ArrayXXd logProb = (mWeights * input).colwise() + mBiases; logProb.rowwise() -= logSumExp(logProb); return (logProb * output.array()).colwise().sum(); }
void CMT::HistogramNonlinearity::initialize( const ArrayXXd& inputs, const ArrayXXd& outputs, int numBins) { double max = inputs.maxCoeff(); double min = inputs.minCoeff(); mBinEdges = vector<double>(numBins + 1); double binWidth = (max - min) / numBins; for(int k = 0; k < mBinEdges.size(); ++k) mBinEdges[k] = min + k * binWidth; initialize(inputs, outputs); }
ArrayXXd CMT::BlobNonlinearity::derivative(const ArrayXXd& inputs) const { if(inputs.rows() != 1) throw Exception("Data has to be stored in one row."); ArrayXXd diff = ArrayXXd::Zero(mNumComponents, inputs.cols()); diff.rowwise() -= inputs.row(0); diff.colwise() += mMeans; ArrayXd precisions = mLogPrecisions.exp(); ArrayXXd negEnergy = diff.square().colwise() * (-precisions / 2.); return (mLogWeights.exp() * precisions).transpose().matrix() * (diff * negEnergy.exp()).matrix(); }
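// For reference, the nonlinearity is f(x) = sum_k w_k exp(-p_k (x - mu_k)^2 / 2) (plus epsilon in operator()), so the
// derivative returned above is sum_k w_k p_k (mu_k - x) exp(-p_k (x - mu_k)^2 / 2). A scalar sketch with toy parameters
// (not the class's members) that checks this formula against a central finite difference:
#include <cmath>
#include <vector>
#include <iostream>

// f(x) = sum_k w_k * exp(-p_k * (x - mu_k)^2 / 2)
double blob(double x, const std::vector<double>& w,
		const std::vector<double>& mu, const std::vector<double>& p) {
	double sum = 0.;
	for(size_t k = 0; k < w.size(); ++k)
		sum += w[k] * std::exp(-p[k] * (x - mu[k]) * (x - mu[k]) / 2.);
	return sum;
}

// analytic derivative: sum_k w_k * p_k * (mu_k - x) * exp(-p_k * (x - mu_k)^2 / 2)
double blobDerivative(double x, const std::vector<double>& w,
		const std::vector<double>& mu, const std::vector<double>& p) {
	double sum = 0.;
	for(size_t k = 0; k < w.size(); ++k)
		sum += w[k] * p[k] * (mu[k] - x) * std::exp(-p[k] * (x - mu[k]) * (x - mu[k]) / 2.);
	return sum;
}

int main() {
	std::vector<double> w = {0.5, 1.5}, mu = {-1., 2.}, p = {1., 0.25};
	double x = 0.3, h = 1e-6;

	double numeric = (blob(x + h, w, mu, p) - blob(x - h, w, mu, p)) / (2. * h);
	std::cout << blobDerivative(x, w, mu, p) << " vs " << numeric << std::endl;

	return 0;
}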
/** * Algorithm due to Knuth, 1969. */ ArrayXXi CMT::samplePoisson(const ArrayXXd& lambda) { ArrayXXi samples(lambda.rows(), lambda.cols()); ArrayXXd threshold = (-lambda).exp(); #pragma omp parallel for for(int i = 0; i < samples.size(); ++i) { double p = rand() / static_cast<double>(RAND_MAX); int k = 0; while(p > threshold(i)) { k += 1; p *= rand() / static_cast<double>(RAND_MAX); } samples(i) = k; } return samples; }
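// A standalone scalar version of the same multiplication-of-uniforms scheme, using the C++11 <random> facilities.
// Like the code above it relies on exp(-lambda), which underflows for large rates, so this sketch is only suitable
// for moderate lambda.
#include <cmath>
#include <random>
#include <iostream>

// Knuth's Poisson sampler: count how many uniform draws can be
// multiplied together before the product drops below exp(-lambda)
int samplePoissonKnuth(double lambda, std::mt19937& gen) {
	std::uniform_real_distribution<double> uniform(0.0, 1.0);
	const double threshold = std::exp(-lambda);

	int k = 0;
	double p = uniform(gen);
	while(p > threshold) {
		++k;
		p *= uniform(gen);
	}
	return k;
}

int main() {
	std::mt19937 gen(0);
	double mean = 0.0;
	const int numSamples = 100000;

	for(int i = 0; i < numSamples; ++i)
		mean += samplePoissonKnuth(3.5, gen);

	std::cout << "sample mean: " << mean / numSamples << std::endl; // close to 3.5
	return 0;
}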
void CMT::HistogramNonlinearity::initialize( const ArrayXXd& inputs, const ArrayXXd& outputs) { if(inputs.rows() != outputs.rows() || inputs.cols() != outputs.cols()) throw Exception("Inputs and outputs have to have same size."); mHistogram = vector<double>(mBinEdges.size() - 1); vector<int> counter(mBinEdges.size() - 1); for(int k = 0; k < mHistogram.size(); ++k) { mHistogram[k] = 0.; counter[k] = 0; } for(int i = 0; i < inputs.rows(); ++i) for(int j = 0; j < inputs.cols(); ++j) { // find bin int k = bin(inputs(i, j)); // update histogram counter[k] += 1; mHistogram[k] += outputs(i, j); } for(int k = 0; k < mHistogram.size(); ++k) if(mHistogram[k] > 0.) // average output observed in bin k mHistogram[k] /= counter[k]; }
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { int N = mxGetScalar(prhs[0]); double d = mxGetScalar(prhs[1]); double h = mxGetScalar(prhs[2]); int Njacv = mxGetScalar(prhs[3]); double b = mxGetScalar(prhs[4]); double c = mxGetScalar(prhs[5]); double dr = mxGetScalar(prhs[6]); double di = mxGetScalar(prhs[7]); int threadNum = mxGetScalar(prhs[8]); double *a0 = mxGetPr(prhs[9]); double *v = mxGetPr(prhs[10]); double th = mxGetScalar(prhs[11]); double phi = mxGetScalar(prhs[12]); int nstp = mxGetScalar(prhs[13]); // mwSize isJ = mxGetScalar(prhs[14]); ArrayXXd av = gintgv(N, d, h, Njacv, b, c, dr, di, threadNum, a0, v, th, phi, nstp); plhs[0] = mxCreateDoubleMatrix(av.rows(), av.cols(), mxREAL); memcpy(mxGetPr(plhs[0]), av.data(), av.cols()*av.rows()*sizeof(double)); }
double CMT::MLR::parameterGradient( const MatrixXd& input, const MatrixXd& output, const lbfgsfloatval_t* x, lbfgsfloatval_t* g, const Trainable::Parameters& params_) const { const Parameters& params = dynamic_cast<const Parameters&>(params_); MatrixXd weights = mWeights; VectorXd biases = mBiases; // copy parameters int k = 0; if(params.trainWeights) for(int i = 1; i < weights.rows(); ++i) for(int j = 0; j < weights.cols(); ++j, ++k) weights(i, j) = x[k]; if(params.trainBiases) for(int i = 1; i < mBiases.rows(); ++i, ++k) biases[i] = x[k]; // compute distribution over outputs ArrayXXd logProb = (weights * input).colwise() + biases; logProb.rowwise() -= logSumExp(logProb); // difference between prediction and actual output MatrixXd diff = (logProb.exp().matrix() - output); // compute gradients double normConst = output.cols() * log(2.); if(g) { int offset = 0; if(params.trainWeights) { Map<Matrix<double, Dynamic, Dynamic, RowMajor> > weightsGrad(g, mDimOut - 1, mDimIn); weightsGrad = (diff * input.transpose() / normConst).bottomRows(mDimOut - 1); offset += weightsGrad.size(); weightsGrad += params.regularizeWeights.gradient( weights.bottomRows(mDimOut - 1).transpose()).transpose(); } if(params.trainBiases) { VectorLBFGS biasesGrad(g + offset, mDimOut - 1); biasesGrad = diff.rowwise().sum().bottomRows(mDimOut - 1) / normConst; biasesGrad += params.regularizeBiases.gradient(biases); } } // return negative average log-likelihood in bits double value = -(logProb * output.array()).sum() / normConst; if(params.trainWeights) value += params.regularizeWeights.evaluate(weights.bottomRows(mDimOut - 1).transpose()); if(params.trainBiases) value += params.regularizeBiases.evaluate(biases); return value; }
MatrixXd cube(MatrixXd xin){ ArrayXXd x = xin.array(); //convert to Array x*=(x*x); return x.matrix(); }
Array<double, 1, Dynamic> CMT::logMeanExp(const ArrayXXd& array) { Array<double, 1, Dynamic> arrayMax = array.colwise().maxCoeff() - 1.; return arrayMax + (array.rowwise() - arrayMax).exp().colwise().mean().log(); }
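// The identity used here is log(mean(exp(x))) = m + log(mean(exp(x - m))) for any shift m; taking m near the column
// maximum keeps the exponentials in a safe range. A small sketch of the same trick on a single column:
#include <Eigen/Dense>
#include <cmath>
#include <iostream>

using Eigen::ArrayXd;

// numerically stable log(mean(exp(x)))
double logMeanExp(const ArrayXd& x) {
	double m = x.maxCoeff();
	return m + std::log((x - m).exp().mean());
}

int main() {
	ArrayXd x(3);
	x << -1000.0, -1001.0, -1002.0;

	// the naive version underflows to log(0) = -inf; the shifted version does not
	std::cout << std::log(x.exp().mean()) << std::endl;  // -inf
	std::cout << logMeanExp(x) << std::endl;             // about -1000.69

	return 0;
}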
bool CMT::Mixture::train( const MatrixXd& data, const MatrixXd& dataValid, const Parameters& parameters, const Component::Parameters& componentParameters) { if(parameters.initialize && !initialized()) initialize(data, parameters, componentParameters); ArrayXXd logJoint(numComponents(), data.cols()); Array<double, Dynamic, 1> postSum; Array<double, 1, Dynamic> logLik; ArrayXXd post; ArrayXXd weights; // training and validation log-loss for checking convergence double avgLogLoss = numeric_limits<double>::infinity(); double avgLogLossNew; double avgLogLossValid = evaluate(dataValid); double avgLogLossValidNew = avgLogLossValid; int counter = 0; // backup model parameters VectorXd priors = mPriors; vector<Component*> components; for(int k = 0; k < numComponents(); ++k) components.push_back(mComponents[k]->copy()); for(int i = 0; i < parameters.maxIter; ++i) { // compute joint probability of data and assignments (E) #pragma omp parallel for for(int k = 0; k < numComponents(); ++k) logJoint.row(k) = mComponents[k]->logLikelihood(data) + log(mPriors[k]); // compute normalized posterior (E) logLik = logSumExp(logJoint); // average negative log-likelihood in bits per component avgLogLossNew = -logLik.mean() / log(2.) / dim(); if(parameters.verbosity > 0) { if(i % parameters.valIter == 0) { // print training and validation error cout << setw(6) << i; cout << setw(14) << setprecision(7) << avgLogLossNew; cout << setw(14) << setprecision(7) << avgLogLossValidNew << endl; } else { // print training error cout << setw(6) << i << setw(14) << setprecision(7) << avgLogLossNew << endl; } } // test for convergence if(avgLogLoss - avgLogLossNew < parameters.threshold) return true; avgLogLoss = avgLogLossNew; // compute normalized posterior (E) post = (logJoint.rowwise() - logLik).exp(); postSum = post.rowwise().sum(); weights = post.colwise() / postSum; // optimize prior weights (M) if(parameters.trainPriors) { mPriors = postSum / data.cols() + parameters.regularizePriors; mPriors /= mPriors.sum(); } // optimize components (M) if(parameters.trainComponents) { #pragma omp parallel for for(int k = 0; k < numComponents(); ++k) mComponents[k]->train(data, weights.row(k), componentParameters); } else { return true; } if((i + 1) % parameters.valIter == 0) { // check validation error avgLogLossValidNew = evaluate(dataValid); if(avgLogLossValidNew < avgLogLossValid) { // backup new found model parameters priors = mPriors; for(int k = 0; k < numComponents(); ++k) *components[k] = *mComponents[k]; avgLogLossValid = avgLogLossValidNew; } else { counter++; if(parameters.valLookAhead > 0 && counter >= parameters.valLookAhead) { // set parameters to best parameters found during training mPriors = priors; for(int k = 0; k < numComponents(); ++k) { *mComponents[k] = *components[k]; delete components[k]; } return true; } } } } if(parameters.verbosity > 0) cout << setw(6) << parameters.maxIter << setw(11) << setprecision(5) << evaluate(data) << endl; return false; }
bool KmeansClusterer::updateClusterCentersUntilConverged(RefArrayXXd sample, RefArrayXXd centers, RefArrayXd clusterSizes, vector<int> &clusterIndices, double &sumOfDistancesToClosestCenter, double relTolerance) { unsigned int Npoints = sample.cols(); unsigned int Ndimensions = sample.rows(); unsigned int Nclusters = centers.cols(); ArrayXXd updatedCenters = ArrayXXd::Zero(Ndimensions, Nclusters); // coordinates of each of the new cluster centers // Perform the k-means clustering iteration, each time improving the cluster centers, // and redetermining which points belongs to which cluster bool stopIterations = false; bool convergenceReached; unsigned int indexOfClosestCenter; double oldSumOfDistances = 0.0; double newSumOfDistances = 0.0; double distanceToClosestCenter; double distance; while (!stopIterations) { // Find for each point the closest cluster center. // At the same time recompute/update the new cluster centers, which is simply // the barycenter of all points belonging to the cluster. clusterSizes.setZero(); updatedCenters.setZero(); for (int n = 0; n < Npoints; ++n) { distanceToClosestCenter = numeric_limits<double>::max(); for (int i = 0; i < Nclusters; ++i) { distance = metric.distance(sample.col(n), centers.col(i)); if (distance < distanceToClosestCenter) { indexOfClosestCenter = i; distanceToClosestCenter = distance; } } newSumOfDistances += distanceToClosestCenter; updatedCenters.col(indexOfClosestCenter) += sample.col(n); clusterSizes(indexOfClosestCenter) += 1; clusterIndices[n] = indexOfClosestCenter; } // Assert that all clusters contain at least 2 points. If not we probably started // with an unfortunate set of initial cluster centers. Flag this by immediately // returning false. if (!(clusterSizes > 1).all()) { convergenceReached = false; return convergenceReached; } // Finish computing the new updated centers. Given the check above, we are sure // that none of the clusters is empty. updatedCenters.rowwise() /= clusterSizes.transpose(); centers = updatedCenters; // A new set of clusters has been determined. // Decide whether the algorithm has converged. Convergence occurs when // the sum of all distances of all points to their cluster center does // not change significantly anymore. // Note: in order for this criterion to work properly, the coordinate // space should be normalized, so that one particular coordinate // cannot numerically dominate all other coordinates. if (oldSumOfDistances == 0.0) { // This is the first center-updating iteration, so there is nothing to compare yet. // Simply set the variables. oldSumOfDistances = newSumOfDistances; newSumOfDistances = 0.0; } else { // If the relative change in sumOfDistances between old and new was smaller than // the threshold set by the user, stop the iteration loop. if (fabs(newSumOfDistances - oldSumOfDistances) / oldSumOfDistances < relTolerance) { sumOfDistancesToClosestCenter = newSumOfDistances; // will be returned to user stopIterations = true; } else { oldSumOfDistances = newSumOfDistances; newSumOfDistances = 0.0; } } } // end k-means center-updating loop // Convergence was properly reached, so return convergenceReached = true; return convergenceReached; }
ArrayXXd CMT::ExponentialFunction::derivative(const ArrayXXd& data) const { return data.exp(); }
ArrayXXd CMT::ExponentialFunction::operator()(const ArrayXXd& data) const { return data.exp() + mEpsilon; }
ArrayXXd CMT::BlobNonlinearity::gradient(const ArrayXXd& inputs) const { if(inputs.rows() != 1) throw Exception("Data has to be stored in one row."); ArrayXXd diff = ArrayXXd::Zero(mNumComponents, inputs.cols()); diff.rowwise() += inputs.row(0); diff.colwise() -= mMeans; ArrayXXd diffSq = diff.square(); ArrayXd precisions = mLogPrecisions.exp(); ArrayXd weights = mLogWeights.exp(); ArrayXXd negEnergy = diffSq.colwise() * (-precisions / 2.); ArrayXXd negEnergyExp = negEnergy.exp(); ArrayXXd gradient(3 * mNumComponents, inputs.cols()); // gradient of mean gradient.topRows(mNumComponents) = (diff * negEnergyExp).colwise() * (weights * precisions); // gradient of log-precisions gradient.middleRows(mNumComponents, mNumComponents) = (diffSq / 2. * negEnergyExp).colwise() * (-weights * precisions); // gradient of log-weights gradient.bottomRows(mNumComponents) = negEnergyExp.colwise() * weights; return gradient; }
CMT::WhiteningTransform::WhiteningTransform(const ArrayXXd& input, const ArrayXXd& output) { initialize(input, output.rows()); }
double CMT::MCBM::parameterGradient( const MatrixXd& inputCompl, const MatrixXd& outputCompl, const lbfgsfloatval_t* x, lbfgsfloatval_t* g, const Trainable::Parameters& params_) const { const Parameters& params = dynamic_cast<const Parameters&>(params_); // average log-likelihood double logLik = 0.; // interpret memory for parameters and gradients lbfgsfloatval_t* y = const_cast<lbfgsfloatval_t*>(x); int offset = 0; VectorLBFGS priors(params.trainPriors ? y : const_cast<double*>(mPriors.data()), mNumComponents); VectorLBFGS priorsGrad(g, mNumComponents); if(params.trainPriors) offset += priors.size(); MatrixLBFGS weights(params.trainWeights ? y + offset : const_cast<double*>(mWeights.data()), mNumComponents, mNumFeatures); MatrixLBFGS weightsGrad(g + offset, mNumComponents, mNumFeatures); if(params.trainWeights) offset += weights.size(); MatrixLBFGS features(params.trainFeatures ? y + offset : const_cast<double*>(mFeatures.data()), mDimIn, mNumFeatures); MatrixLBFGS featuresGrad(g + offset, mDimIn, mNumFeatures); if(params.trainFeatures) offset += features.size(); MatrixLBFGS predictors(params.trainPredictors ? y + offset : const_cast<double*>(mPredictors.data()), mNumComponents, mDimIn); MatrixLBFGS predictorsGrad(g + offset, mNumComponents, mDimIn); if(params.trainPredictors) offset += predictors.size(); MatrixLBFGS inputBias(params.trainInputBias ? y + offset : const_cast<double*>(mInputBias.data()), mDimIn, mNumComponents); MatrixLBFGS inputBiasGrad(g + offset, mDimIn, mNumComponents); if(params.trainInputBias) offset += inputBias.size(); VectorLBFGS outputBias(params.trainOutputBias ? y + offset : const_cast<double*>(mOutputBias.data()), mNumComponents); VectorLBFGS outputBiasGrad(g + offset, mNumComponents); if(params.trainOutputBias) offset += outputBias.size(); if(g) { // initialize gradients if(params.trainPriors) priorsGrad.setZero(); if(params.trainWeights) weightsGrad.setZero(); if(params.trainFeatures) featuresGrad.setZero(); if(params.trainPredictors) predictorsGrad.setZero(); if(params.trainInputBias) inputBiasGrad.setZero(); if(params.trainOutputBias) outputBiasGrad.setZero(); } // split data into batches for better performance int numData = static_cast<int>(inputCompl.cols()); int batchSize = min(max(params.batchSize, 10), numData); #pragma omp parallel for for(int b = 0; b < inputCompl.cols(); b += batchSize) { const MatrixXd& input = inputCompl.middleCols(b, min(batchSize, numData - b)); const MatrixXd& output = outputCompl.middleCols(b, min(batchSize, numData - b)); ArrayXXd featureOutput = features.transpose() * input; MatrixXd featureOutputSq = featureOutput.square(); MatrixXd weightsOutput = weights * featureOutputSq; ArrayXXd predictorOutput = predictors * input; // unnormalized posteriors over components for both possible outputs ArrayXXd logPost0 = (weightsOutput + inputBias.transpose() * input).colwise() + priors; ArrayXXd logPost1 = (logPost0 + predictorOutput).colwise() + outputBias.array(); // sum over components to get unnormalized probabilities of outputs Array<double, 1, Dynamic> logProb0 = logSumExp(logPost0); Array<double, 1, Dynamic> logProb1 = logSumExp(logPost1); // normalize posteriors over components logPost0.rowwise() -= logProb0; logPost1.rowwise() -= logProb1; // stack row vectors ArrayXXd logProb01(2, input.cols()); logProb01 << logProb0, logProb1; // normalize log-probabilities Array<double, 1, Dynamic> logNorm = logSumExp(logProb01); logProb1 -= logNorm; logProb0 -= logNorm; double logLikBatch = (output.array() * logProb1 + (1. 
- output.array()) * logProb0).sum(); #pragma omp critical logLik += logLikBatch; if(!g) // don't compute gradients continue; Array<double, 1, Dynamic> tmp = output.array() * logProb0.exp() - (1. - output.array()) * logProb1.exp(); ArrayXXd post0Tmp = logPost0.exp().rowwise() * tmp; ArrayXXd post1Tmp = logPost1.exp().rowwise() * tmp; ArrayXXd postDiffTmp = post1Tmp - post0Tmp; // update gradients if(params.trainPriors) #pragma omp critical priorsGrad -= postDiffTmp.rowwise().sum().matrix(); if(params.trainWeights) #pragma omp critical weightsGrad -= postDiffTmp.matrix() * featureOutputSq.transpose(); if(params.trainFeatures) { ArrayXXd tmp2 = weights.transpose() * postDiffTmp.matrix() * 2.; MatrixXd tmp3 = featureOutput * tmp2; #pragma omp critical featuresGrad -= input * tmp3.transpose(); } if(params.trainPredictors) #pragma omp critical predictorsGrad -= post1Tmp.matrix() * input.transpose(); if(params.trainInputBias) #pragma omp critical inputBiasGrad -= input * postDiffTmp.matrix().transpose(); if(params.trainOutputBias) #pragma omp critical outputBiasGrad -= post1Tmp.rowwise().sum().matrix(); } double normConst = inputCompl.cols() * log(2.) * dimOut(); if(g) { for(int i = 0; i < offset; ++i) g[i] /= normConst; if(params.trainFeatures) featuresGrad += params.regularizeFeatures.gradient(features); if(params.trainPredictors) predictorsGrad += params.regularizePredictors.gradient(predictors.transpose()).transpose(); if(params.trainWeights) weightsGrad += params.regularizeWeights.gradient(weights); } double value = -logLik / normConst; if(params.trainFeatures) value += params.regularizeFeatures.evaluate(features); if(params.trainPredictors) value += params.regularizePredictors.evaluate(predictors.transpose()); if(params.trainWeights) value += params.regularizeWeights.evaluate(weights); return value; }
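// The MatrixLBFGS/VectorLBFGS views used above address one flat L-BFGS parameter buffer as several matrices without
// copying. A minimal sketch of that pattern with plain Eigen::Map; the buffer layout and names here are illustrative
// assumptions, not the library's definitions (the MLR gradient above maps its weights in row-major order, while this
// sketch uses Eigen's default column-major layout).
#include <Eigen/Dense>
#include <vector>
#include <iostream>

using Eigen::Map;
using Eigen::MatrixXd;
using Eigen::VectorXd;

int main() {
	const int numComponents = 4, numFeatures = 3;

	// one contiguous parameter buffer, as an optimizer like L-BFGS would provide
	std::vector<double> x(numComponents + numComponents * numFeatures, 0.1);

	int offset = 0;
	Map<VectorXd> priors(x.data(), numComponents);
	offset += priors.size();
	Map<MatrixXd> weights(x.data() + offset, numComponents, numFeatures);

	// writes through the maps modify the underlying buffer directly
	priors.setZero();
	weights.setConstant(2.0);

	std::cout << x[0] << " " << x[numComponents] << std::endl; // prints 0 and 2
	return 0;
}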
void NestedSampler::run(LivePointsReducer &livePointsReducer, const int NinitialIterationsWithoutClustering, const int NiterationsWithSameClustering, const int maxNdrawAttempts, const double maxRatioOfRemainderToCurrentEvidence, string pathPrefix) { int startTime = time(0); double logMeanLiveEvidence; terminationFactor = maxRatioOfRemainderToCurrentEvidence; outputPathPrefix = pathPrefix; if (printOnTheScreen) { cerr << "------------------------------------------------" << endl; cerr << " Bayesian Inference problem has " << Ndimensions << " dimensions." << endl; cerr << "------------------------------------------------" << endl; cerr << endl; } // Save configuring parameters to an output ASCII file string fileName = "configuringParameters.txt"; string fullPath = outputPathPrefix + fileName; File::openOutputFile(outputFile, fullPath); outputFile << "# List of configuring parameters used for the NSMC." << endl; outputFile << "# Row #1: Ndimensions" << endl; outputFile << "# Row #2: Initial(Maximum) NlivePoints" << endl; outputFile << "# Row #3: Minimum NlivePoints" << endl; outputFile << "# Row #4: NinitialIterationsWithoutClustering" << endl; outputFile << "# Row #5: NiterationsWithSameClustering" << endl; outputFile << "# Row #6: maxNdrawAttempts" << endl; outputFile << "# Row #7: terminationFactor" << endl; outputFile << "# Row #8: Niterations" << endl; outputFile << "# Row #9: Optimal Niterations" << endl; outputFile << "# Row #10: Final Nclusters" << endl; outputFile << "# Row #11: Final NlivePoints" << endl; outputFile << "# Row #12: Computational Time (seconds)" << endl; outputFile << Ndimensions << endl; outputFile << initialNlivePoints << endl; outputFile << minNlivePoints << endl; outputFile << NinitialIterationsWithoutClustering << endl; outputFile << NiterationsWithSameClustering << endl; outputFile << maxNdrawAttempts << endl; outputFile << terminationFactor << endl; // Set up the random number generator. It generates integer random numbers // between 0 and NlivePoints-1, inclusive. uniform_int_distribution<int> discreteUniform(0, NlivePoints-1); // Draw the initial sample from the prior PDF. Different coordinates of a point // can have different priors, so these have to be sampled individually. if (printOnTheScreen) { cerr << "------------------------------------------------" << endl; cerr << " Doing initial sampling of parameter space..." << endl; cerr << "------------------------------------------------" << endl; cerr << endl; } nestedSample.resize(Ndimensions, NlivePoints); int beginIndex = 0; int NdimensionsOfCurrentPrior; ArrayXXd priorSample; for (int i = 0; i < ptrPriors.size(); i++) { // Some priors cover one particalar coordinate, others may cover two or more coordinates // Find out how many dimensions the current prior covers. 
NdimensionsOfCurrentPrior = ptrPriors[i]->getNdimensions(); // Draw the subset of coordinates randomly from the current prior priorSample.resize(NdimensionsOfCurrentPrior, NlivePoints); ptrPriors[i]->draw(priorSample); // Insert this random subset of coordinates into the total sample of coordinates of points nestedSample.block(beginIndex, 0, NdimensionsOfCurrentPrior, NlivePoints) = priorSample; // Move index to the beginning of the coordinate set of the next prior beginIndex += NdimensionsOfCurrentPrior; } // Compute the log(Likelihood) for each of our points in the live sample logLikelihood.resize(NlivePoints); for (int i = 0; i < NlivePoints; ++i) { logLikelihood(i) = likelihood.logValue(nestedSample.col(i)); } // Initialize the prior mass interval and cumulate it double logWidthInPriorMass = log(1.0 - exp(-1.0/NlivePoints)); // X_0 - X_1 First width in prior mass logCumulatedPriorMass = Functions::logExpSum(logCumulatedPriorMass, logWidthInPriorMass); // 1 - X_1 logRemainingPriorMass = Functions::logExpDifference(logRemainingPriorMass, logWidthInPriorMass); // X_1 // Initialize first part of width in prior mass for trapezoidal rule // X_0 = (2 - X_1), right-side boundary condition for trapezoidal rule double logRemainingPriorMassRightBound = Functions::logExpDifference(log(2), logRemainingPriorMass); double logWidthInPriorMassRight = Functions::logExpDifference(logRemainingPriorMassRightBound,logRemainingPriorMass); // Find maximum log(Likelihood) value in the initial sample of live points. // This information can be useful when reducing the number of live points adopted within the nesting process. logMaxLikelihoodOfLivePoints = logLikelihood.maxCoeff(); // The nested sampling will involve finding clusters in the sample. // This will require the containers clusterIndices and clusterSizes. unsigned int Nclusters = 0; vector<int> clusterIndices(NlivePoints); // clusterIndices must have the same number of elements as the number of live points vector<int> clusterSizes; // The number of live points counted in each cluster is updated everytime one live point // is removed from the sample. // Start the nested sampling loop. Each iteration, we'll replace the point with the worst likelihood. // New points are drawn from the prior, but with the constraint that they should have a likelihood // that is better than the currently worst one. if (printOnTheScreen) { cerr << "-------------------------------" << endl; cerr << " Starting nested sampling... " << endl; cerr << "-------------------------------" << endl; cerr << endl; } bool nestedSamplingShouldContinue = true; bool livePointsShouldBeReduced = (initialNlivePoints > minNlivePoints); // Update live points only if required Niterations = 0; do { // Resize the arrays to make room for an additional point. // Do so without destroying the original contents. posteriorSample.conservativeResize(Ndimensions, Niterations + 1); logLikelihoodOfPosteriorSample.conservativeResize(Niterations + 1); logWeightOfPosteriorSample.conservativeResize(Niterations + 1); // Find the point with the worst likelihood. This likelihood value will set a constraint // when drawing new points later on. int indexOfLivePointWithWorstLikelihood; worstLiveLogLikelihood = logLikelihood.minCoeff(&indexOfLivePointWithWorstLikelihood); // Although we will replace the point with the worst likelihood in the live sample, we will save // it in our collection of posterior sample. Also save its likelihood value. The weight is // computed and collected at the end of each iteration. 
posteriorSample.col(Niterations) = nestedSample.col(indexOfLivePointWithWorstLikelihood); logLikelihoodOfPosteriorSample(Niterations) = worstLiveLogLikelihood; // Compute the (logarithm of) the mean likelihood of the set of live points. // Note that we are not computing mean(log(likelihood)) but log(mean(likelhood)). // Since we are only storing the log(likelihood) values, this results in a peculiar // way of computing the mean. This will be used for computing the mean live evidence // at the end of the iteration. logMeanLikelihoodOfLivePoints = logLikelihood(0); for (int m = 1; m < NlivePoints; m++) { logMeanLikelihoodOfLivePoints = Functions::logExpSum(logMeanLikelihoodOfLivePoints, logLikelihood(m)); } logMeanLikelihoodOfLivePoints -= log(NlivePoints); // Find clusters in our live sample of points. Don't do this every iteration but only // every x iterations, where x is given by 'NiterationsWithSameClustering'. if ((Niterations % NiterationsWithSameClustering) == 0) { // Don't do clustering the first N iterations, where N is user-specified. That is, // the first N iterations we assume that there is only 1 cluster containing all the points. // This is often useful because initially the points may be sampled from a uniform prior, // and we therefore don't expect any clustering _before_ the algorithm is able to tune in on // the island(s) of high likelihood. Clusters found in the first N initial iterations are // therefore likely purely noise. if (Niterations < NinitialIterationsWithoutClustering) { // There is only 1 cluster, containing all objects. All points have the same cluster // index, namely 0. Nclusters = 1; clusterSizes.resize(1); clusterSizes[0] = NlivePoints; fill(clusterIndices.begin(), clusterIndices.end(), 0); } else { // After the first N initial iterations, we do a proper clustering. Nclusters = clusterer.cluster(nestedSample, clusterIndices, clusterSizes); } } // Draw a new point, which should replace the point with the worst likelihood. // This new point should be drawn from the prior, but with a likelihood greater // than the current worst likelihood. The drawing algorithm may need a starting point, // for which we will take a randomly chosen point of the live sample (excluding the // worst point). int indexOfRandomlyChosenLivePoint = 0; if (NlivePoints > 1) { // Select randomly an index of a sample point, but not the one of the worst point do { // 0 <= indexOfRandomlyChosenLivePoint < NlivePoints indexOfRandomlyChosenLivePoint = discreteUniform(engine); } while (indexOfRandomlyChosenLivePoint == indexOfLivePointWithWorstLikelihood); } // drawnPoint will be a starting point as input, and will contain the newly drawn point as output ArrayXd drawnPoint = nestedSample.col(indexOfRandomlyChosenLivePoint); double logLikelihoodOfDrawnPoint = 0.0; bool newPointIsFound = drawWithConstraint(nestedSample, Nclusters, clusterIndices, clusterSizes, drawnPoint, logLikelihoodOfDrawnPoint, maxNdrawAttempts); // If the adopted sampler produces an error (e.g. in the case of the ellipsoidal sampler a failure // in the ellipsoid matrix decomposition), then we can stop right here. nestedSamplingShouldContinue = verifySamplerStatus(); if (!nestedSamplingShouldContinue) break; // If we didn't find a point with a better likelihood, then we can stop right here. if (!newPointIsFound) { nestedSamplingShouldContinue = false; cerr << "Can't find point with a better Likelihood." << endl; cerr << "Stopping the nested sampling loop prematurely." 
<< endl; break; } // Replace the point having the worst likelihood with our newly drawn one. nestedSample.col(indexOfLivePointWithWorstLikelihood) = drawnPoint; logLikelihood(indexOfLivePointWithWorstLikelihood) = logLikelihoodOfDrawnPoint; // If we got till here this is not the last iteration possible, hence // update all the information for the next iteration. // Check if the number of live points has not reached the minimum allowed, // and update it for the next iteration. if (livePointsShouldBeReduced) { // Update the number of live points for the current iteration based on the previous number. // If the number of live points reaches the minimum allowed // then do not update the number anymore. updatedNlivePoints = livePointsReducer.updateNlivePoints(); if (updatedNlivePoints > NlivePoints) { // Terminate program if new number of live points is greater than previous one cerr << "Something went wrong in the reduction of the live points." << endl; cerr << "The new number of live points is greater than the previous one." << endl; cerr << "Quitting program. " << endl; break; } // If the lower bound for the number of live points has not been reached yet, // the process should be repeated at the next iteration. // Otherwise the minimun number allowed is reached right now. In this case // stop the reduction process starting from the next iteration. livePointsShouldBeReduced = (updatedNlivePoints > minNlivePoints); if (updatedNlivePoints != NlivePoints) { // Resize all eigen arrays and vectors of dimensions NlivePoints according to // new number of live points evaluated. In case previos and new number // of live points coincide, no resizing is done. vector<int> indicesOfLivePointsToRemove = livePointsReducer.findIndicesOfLivePointsToRemove(engine); // At least one live point has to be removed, hence update the sample removeLivePointsFromSample(indicesOfLivePointsToRemove, clusterIndices, clusterSizes); // Since everything is fine update discreteUniform with the corresponding new upper bound uniform_int_distribution<int> discreteUniform2(0, updatedNlivePoints-1); discreteUniform = discreteUniform2; } } // Store the new number of live points in the vector containing this information. // This is done even if the new number is the same as the previous one. NlivePointsPerIteration.push_back(NlivePoints); // Compute the mean live evidence given the previous set of live points (see Keeton 2011, MNRAS) logMeanLiveEvidence = logMeanLikelihoodOfLivePoints + Niterations * (log(NlivePoints) - log(NlivePoints + 1)); // Compute the ratio of the evidence of the live sample to the current Skilling's evidence. // Only when we gathered enough evidence, this ratio will be sufficiently small so that we can stop the iterations. ratioOfRemainderToCurrentEvidence = exp(logMeanLiveEvidence - logEvidence); // Re-evaluate the stopping criterion, using the condition suggested by Keeton (2011) nestedSamplingShouldContinue = (ratioOfRemainderToCurrentEvidence > maxRatioOfRemainderToCurrentEvidence); // Shrink prior mass interval according to proper number of live points // (see documentation by Enrico Corsaro October 2013). When reducing the number of live points // the equation is a generalized version of that used by Skilling 2004. The equation // reduces to the standard case when the new number of live points is the same // as the previous one. 
// ---- Use the line below for simple rectangular rule ---- // double logWeight = logWidthInPriorMass; // -------------------------------------------------------- double logStretchingFactor = Niterations*((1.0/NlivePoints) - (1.0/updatedNlivePoints)); logWidthInPriorMass = logRemainingPriorMass + Functions::logExpDifference(0.0, logStretchingFactor - 1.0/updatedNlivePoints); // X_i - X_(i+1) // Compute the logWeight according to the trapezoidal rule 0.5*(X_(i-1) - X_(i+1)) // and new contribution of evidence to be cumulated to the total evidence. // This is done in logarithmic scale by summing the right (X_(i-1) - X_i) and left part (X_i - X_(i+1)) // of the total width in prior mass required for the trapezoidal rule. We do this computation at the end // of the nested iteration because we need to know the new remaining prior mass of the next iteration. double logWidthInPriorMassLeft = logWidthInPriorMass; // ---- Use the line below for trapezoidal rule ---- double logWeight = log(0.5) + Functions::logExpSum(logWidthInPriorMassLeft, logWidthInPriorMassRight); double logEvidenceContributionNew = logWeight + worstLiveLogLikelihood; // Save log(Weight) of the current iteration logWeightOfPosteriorSample(Niterations) = logWeight; // Update the right part of the width in prior mass interval by replacing it with the left part logWidthInPriorMassRight = logWidthInPriorMass; // Update the evidence and the information Gain double logEvidenceNew = Functions::logExpSum(logEvidence, logEvidenceContributionNew); informationGain = exp(logEvidenceContributionNew - logEvidenceNew) * worstLiveLogLikelihood + exp(logEvidence - logEvidenceNew) * (informationGain + logEvidence) - logEvidenceNew; logEvidence = logEvidenceNew; // Print current information on the screen, if required if (printOnTheScreen) { if ((Niterations % 50) == 0) { cerr << "Nit: " << Niterations << " Ncl: " << Nclusters << " Nlive: " << NlivePoints << " CPM: " << exp(logCumulatedPriorMass) << " Ratio: " << ratioOfRemainderToCurrentEvidence << " log(E): " << logEvidence << " IG: " << informationGain << endl; } } // Update total width in prior mass and remaining width in prior mass from beginning to current iteration // and use this information for the next iteration (if any) logCumulatedPriorMass = Functions::logExpSum(logCumulatedPriorMass, logWidthInPriorMass); logRemainingPriorMass = logStretchingFactor + logRemainingPriorMass - 1.0/updatedNlivePoints; // Update new number of live points in NestedSampler class NlivePoints = updatedNlivePoints; // Increase nested loop counter Niterations++; } while (nestedSamplingShouldContinue); // Add the remaining live sample of points to our collection of posterior points // (i.e parameter coordinates, likelihood values and weights) unsigned int oldNpointsInPosterior = posteriorSample.cols(); posteriorSample.conservativeResize(Ndimensions, oldNpointsInPosterior + NlivePoints); // First make enough room posteriorSample.block(0, oldNpointsInPosterior, Ndimensions, NlivePoints) = nestedSample; // Then copy the live sample to the posterior array logWeightOfPosteriorSample.conservativeResize(oldNpointsInPosterior + NlivePoints); logWeightOfPosteriorSample.segment(oldNpointsInPosterior, NlivePoints).fill(logRemainingPriorMass - log(NlivePoints)); // Check if the best condition to impose logLikelihoodOfPosteriorSample.conservativeResize(oldNpointsInPosterior + NlivePoints); logLikelihoodOfPosteriorSample.segment(oldNpointsInPosterior, NlivePoints) = logLikelihood; // Compute Skilling's error on the 
log(Evidence) logEvidenceError = sqrt(fabs(informationGain)/NlivePoints); // Add Mean Live Evidence of the remaining live sample of points to the total log(Evidence) collected logEvidence = Functions::logExpSum(logMeanLiveEvidence, logEvidence); if (printOnTheScreen) { cerr << "------------------------------------------------" << endl; cerr << " Final log(E): " << logEvidence << " +/- " << logEvidenceError << endl; cerr << "------------------------------------------------" << endl; } // Print total computational time printComputationalTime(startTime); // Append information to existing output file and close stream afterwards outputFile << Niterations << endl; outputFile << static_cast<int>((NlivePoints*informationGain) + (NlivePoints*sqrt(Ndimensions*1.0))) << endl; outputFile << Nclusters << endl; outputFile << NlivePoints << endl; outputFile << computationalTime << endl; }
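// Most of the bookkeeping above happens in log space via Functions::logExpSum. A toy standalone sketch of that
// accumulation, log E = log sum_i w_i L_i, built from repeated log-sum-exp; the prior-mass widths here follow the simple
// rectangular rule mentioned in the comments, not the trapezoidal rule actually used above, and the likelihood sequence
// is made up for illustration.
#include <cmath>
#include <vector>
#include <iostream>
#include <limits>

// log(exp(a) + exp(b)) computed without overflow
double logExpSum(double a, double b) {
	double m = std::max(a, b);
	return m + std::log(std::exp(a - m) + std::exp(b - m));
}

int main() {
	const int NlivePoints = 100;
	// toy sequence of worst log-likelihoods, one per nested-sampling iteration
	std::vector<double> worstLogLikelihood = {-50.0, -45.0, -42.0, -40.5, -40.0};

	double logEvidence = -std::numeric_limits<double>::infinity();
	double logRemainingPriorMass = 0.0; // log(1)

	for(size_t i = 0; i < worstLogLikelihood.size(); ++i) {
		// rectangular-rule width X_i - X_(i+1), with X_i = exp(-i/N)
		double logWidth = logRemainingPriorMass + std::log(1.0 - std::exp(-1.0 / NlivePoints));
		logEvidence = logExpSum(logEvidence, logWidth + worstLogLikelihood[i]);
		logRemainingPriorMass -= 1.0 / NlivePoints;
	}

	std::cout << "log(E) after " << worstLogLikelihood.size()
			<< " iterations: " << logEvidence << std::endl;
	return 0;
}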
bool CMT::Mixture::train( const MatrixXd& data, const Parameters& parameters, const Component::Parameters& componentParameters) { if(data.rows() != dim()) throw Exception("Data has wrong dimensionality."); if(parameters.initialize && !initialized()) initialize(data, parameters, componentParameters); ArrayXXd logJoint(numComponents(), data.cols()); Array<double, Dynamic, 1> postSum; Array<double, 1, Dynamic> logLik; ArrayXXd post; ArrayXXd weights; double avgLogLoss = numeric_limits<double>::infinity(); double avgLogLossNew; for(int i = 0; i < parameters.maxIter; ++i) { // compute joint probability of data and assignments (E) #pragma omp parallel for for(int k = 0; k < numComponents(); ++k) logJoint.row(k) = mComponents[k]->logLikelihood(data) + log(mPriors[k]); // compute normalized posterior (E) logLik = logSumExp(logJoint); // average negative log-likelihood in bits per component avgLogLossNew = -logLik.mean() / log(2.) / dim(); if(parameters.verbosity > 0) cout << setw(6) << i << setw(14) << setprecision(7) << avgLogLossNew << endl; // test for convergence if(avgLogLoss - avgLogLossNew < parameters.threshold) return true; avgLogLoss = avgLogLossNew; // compute normalized posterior (E) post = (logJoint.rowwise() - logLik).exp(); postSum = post.rowwise().sum(); weights = post.colwise() / postSum; // optimize prior weights (M) if(parameters.trainPriors) { mPriors = postSum / data.cols() + parameters.regularizePriors; mPriors /= mPriors.sum(); } // optimize components (M) if(parameters.trainComponents) { #pragma omp parallel for for(int k = 0; k < numComponents(); ++k) mComponents[k]->train(data, weights.row(k), componentParameters); } else { return true; } } if(parameters.verbosity > 0) cout << setw(6) << parameters.maxIter << setw(14) << setprecision(7) << evaluate(data) << endl; return false; }
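// Both Mixture::train overloads share the same E step: form the log-joint over components and data points, normalize each
// column with log-sum-exp to obtain posterior responsibilities, and reduce those to per-component sums for the M step.
// A compact standalone sketch of that step on a made-up log-joint (without the regularization term used above):
#include <Eigen/Dense>
#include <iostream>

using Eigen::ArrayXXd;
using Eigen::ArrayXd;
using Eigen::Array;
using Eigen::Dynamic;

int main() {
	// toy log-joint: 3 components (rows), 5 data points (columns)
	ArrayXXd logJoint = ArrayXXd::Random(3, 5);

	// per-column log-likelihood via a stable log-sum-exp
	Array<double, 1, Dynamic> colMax = logJoint.colwise().maxCoeff();
	Array<double, 1, Dynamic> logLik =
		colMax + (logJoint.rowwise() - colMax).exp().colwise().sum().log();

	// posterior responsibilities: each column sums to one
	ArrayXXd post = (logJoint.rowwise() - logLik).exp();

	// per-component effective counts, used to update the mixture priors
	ArrayXd postSum = post.rowwise().sum();
	ArrayXd priors = postSum / postSum.sum();

	std::cout << post.colwise().sum() << std::endl; // all ones
	std::cout << priors.transpose() << std::endl;
	return 0;
}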