Beispiel #1
0
void CMT::WhiteningTransform::initialize(const ArrayXXd& input, int dimOut) {
	if(input.cols() < input.rows())
		throw Exception("Too few inputs to compute whitening transform."); 

	mMeanIn = input.rowwise().mean();

	// compute covariances
	MatrixXd covXX = covariance(input);

	// input whitening
	SelfAdjointEigenSolver<MatrixXd> eigenSolver;

	eigenSolver.compute(covXX);

	Array<double, 1, Dynamic> eigenvalues = eigenSolver.eigenvalues();
	MatrixXd eigenvectors = eigenSolver.eigenvectors();

	// don't whiten directions with near-zero variance
	for(int i = 0; i < eigenvalues.size(); ++i)
		if(eigenvalues[i] < 1e-7)
			eigenvalues[i] = 1.;

	mPreIn = (eigenvectors.array().rowwise() * eigenvalues.sqrt().cwiseInverse()).matrix()
		* eigenvectors.transpose();
	mPreInInv = (eigenvectors.array().rowwise() * eigenvalues.sqrt()).matrix()
		* eigenvectors.transpose();

	mMeanOut = VectorXd::Zero(dimOut);
	mPreOut = MatrixXd::Identity(dimOut, dimOut);
	mPreOutInv = MatrixXd::Identity(dimOut, dimOut);
	mPredictor = MatrixXd::Zero(dimOut, input.rows());
	mGradTransform = MatrixXd::Zero(dimOut, input.rows());
	mLogJacobian = 1.;
}
void NestedSampler::setPosteriorSample(ArrayXXd newPosteriorSample)
{
    Ndimensions = newPosteriorSample.rows();
    int Nsamples = newPosteriorSample.cols();
    posteriorSample.resize(Ndimensions, Nsamples);
    posteriorSample = newPosteriorSample;
}
Beispiel #3
0
/**
 * \brief Calculate the loglikelihood of a linear regression contained
 * in a linear_reg object.
 *
 * @param X The design matrix.
 */
void linear_reg::logLikelihood(const mematrix<double>& X) {
    /*
     loglik = 0.;
     double ss=0;
     for (int i=0;i<rdata.nids;i++) {
     double resid = rdata.Y[i] - beta.get(0,0); // intercept
     for (int j=1;j<beta.nrow;j++) resid -= beta.get(j,0)*X.get(i,j);
     // residuals[i] = resid;
     ss += resid*resid;
     }
     sigma2 = ss/N;
     */
    //cout << "estimate " << rdata.nids << "\n";
    //(rdata.X).print();
    //for (int i=0;i<rdata.nids;i++) cout << rdata.masked_data[i] << " ";
    //cout << endl;
    loglik = 0.;
    double halfrecsig2 = .5 / sigma2;
    //loglik -= halfrecsig2 * residuals[i] * residuals[i];

    double intercept = beta.get(0, 0);
    residuals.data = reg_data.Y.data.array() - intercept;
    //matrix.
    ArrayXXd betacol =
            beta.data.block(1, 0, beta.data.rows() - 1, 1).array().transpose();
    ArrayXXd resid_sub = (X.data.block(0, 1, X.data.rows(), X.data.cols() - 1)
            * betacol.matrix().asDiagonal()).rowwise().sum();
    //std::cout << resid_sub << std::endl;
    residuals.data -= resid_sub.matrix();
    //residuals[i] -= resid_sub;
    loglik -= (residuals.data.array().square() * halfrecsig2).sum();
    loglik -= static_cast<double>(reg_data.nids) * log(sqrt(sigma2));
}
Beispiel #4
0
Array<int, 1, Dynamic> CMT::MCBM::samplePrior(const MatrixXd& input) const {
	if(input.rows() != dimIn())
		throw Exception("Inputs have wrong dimensionality.");

	ArrayXXd featureEnergy = mWeights * (mFeatures.transpose() * input).array().square().matrix();
	ArrayXXd biasEnergy = mInputBias.transpose() * input;

	ArrayXXd predictorEnergy = mPredictors * input;

	ArrayXXd tmp0 = (featureEnergy + biasEnergy).colwise() + mPriors.array();
	ArrayXXd tmp1 = (tmp0 + predictorEnergy).colwise() + mOutputBias.array();

	ArrayXXd logPrior = tmp0 + tmp1;
	logPrior.rowwise() -= logSumExp(logPrior);

	ArrayXXd prior = logPrior.exp();

	Array<int, 1, Dynamic> labels(input.cols());

	#pragma omp parallel for
	for(int j = 0; j < input.cols(); ++j) {
		int i = 0;
		double urand = static_cast<double>(rand()) / (static_cast<long>(RAND_MAX) + 1l);
		double cdf;

		// compute index
		for(cdf = prior(0, j); cdf < urand; cdf += prior(i, j))
			++i;

		labels[j] = i;
	}

	return labels;
}
Beispiel #5
0
/*
Multiply each row of u by temp
*/
MatrixXd arrayMultiplierRowWise(MatrixXd u,ArrayXXd temp,int n){
	ArrayXXd uArray = u.array();
	int i;
	for(i=0;i<n;i++){
		uArray.row(i) *= temp;
	}
	return uArray.matrix();
}
Beispiel #6
0
ArrayXXd CMT::tanh(const ArrayXXd& arr) {
	ArrayXXd result(arr.rows(), arr.cols());

	#pragma omp parallel for
	for(int i = 0; i < arr.size(); ++i)
		result(i) = std::tanh(arr(i));

	return result;
}
Beispiel #7
0
ArrayXXd CMT::HistogramNonlinearity::operator()(const ArrayXXd& inputs) const {
	ArrayXXd outputs(inputs.rows(), inputs.cols());

	for(int i = 0; i < inputs.rows(); ++i)
		for(int j = 0; j < inputs.cols(); ++j)
			outputs(i, j) = mHistogram[bin(inputs(i, j))] + mEpsilon;

	return outputs;
}
Beispiel #8
0
ArrayXXd CMT::HistogramNonlinearity::gradient(const ArrayXXd& inputs) const {
	if(inputs.rows() != 1)
		throw Exception("Data has to be stored in one row.");

	ArrayXXd gradient = ArrayXXd::Zero(mHistogram.size(), inputs.cols());

	for(int i = 0; i < inputs.rows(); ++i)
		for(int j = 0; j < inputs.rows(); ++j)
			gradient(bin(inputs(i, j)), j) = 1;

	return gradient;
}
Beispiel #9
0
MatrixXd CMT::MLR::predict(const MatrixXd& input) const {
	if(input.rows() != mDimIn)
		throw Exception("Inputs have wrong dimensionality.");

	MatrixXd output = MatrixXd::Zero(mDimOut, input.cols());

	// distribution over outputs
	ArrayXXd prob = (mWeights * input).colwise() + mBiases;
	prob.rowwise() -= logSumExp(prob);
	prob = prob.exp();

	return prob;
}
Beispiel #10
0
ArrayXXd CMT::BlobNonlinearity::operator()(const ArrayXXd& inputs) const {
	if(inputs.rows() != 1)
		throw Exception("Data has to be stored in one row.");

	ArrayXXd diff = ArrayXXd::Zero(mNumComponents, inputs.cols());
	diff.rowwise() += inputs.row(0);
	diff.colwise() -= mMeans;

	ArrayXXd negEnergy = diff.square().colwise() * (-mLogPrecisions.exp() / 2.);
	return (mLogWeights.exp().transpose().matrix() * negEnergy.exp().matrix()).array() + mEpsilon;
}
Beispiel #11
0
ArrayXXi CMT::sampleBinomial(const ArrayXXi& n, const ArrayXXd& p) {
	if(n.rows() != p.rows() || n.cols() != p.cols())
		throw Exception("n and p must be of the same size.");

	ArrayXXi samples = ArrayXXi::Zero(n.rows(), n.cols());

	#pragma omp parallel for
	for(int i = 0; i < samples.size(); ++i) {
		// very naive algorithm for generating binomial samples
		for(int k = 0; k < n(i); ++k)
			if(rand() / static_cast<double>(RAND_MAX) < p(i))
				samples(i) += 1; 
	}

	return samples;
}
 void operator()(std::size_t begin, std::size_t end) {
   for(std::size_t j = begin; j < end; j++){
     ArrayXd d = s2*a_prec(j) + s1*e_prec(j);
     ArrayXd mlam = b.col(j).array() / d;
     effects.col(j) = U * (mlam + randn_draws.col(j) / sqrt(d)).matrix();
   }
 }
Beispiel #13
0
Array<double, 1, Dynamic> CMT::MLR::logLikelihood(
	const MatrixXd& input,
	const MatrixXd& output) const
{
	if(input.cols() != output.cols())
		throw Exception("Number of inputs and outputs have to be the same.");
	if(input.rows() != mDimIn)
		throw Exception("Inputs have wrong dimensionality.");
	if(output.rows() != mDimOut)
		throw Exception("Output has wrong dimensionality.");

	// distribution over outputs
	ArrayXXd logProb = (mWeights * input).colwise() + mBiases;
	logProb.rowwise() -= logSumExp(logProb);

	return (logProb * output.array()).colwise().sum();
}
Beispiel #14
0
void CMT::HistogramNonlinearity::initialize(
	const ArrayXXd& inputs,
	const ArrayXXd& outputs,
	int numBins)
{
	double max = inputs.maxCoeff();
	double min = inputs.minCoeff();

	mBinEdges = vector<double>(numBins + 1);

	double binWidth = (max - min) / numBins;

	for(int k = 0; k < mBinEdges.size(); ++k)
		mBinEdges[k] = min + k * binWidth;

	initialize(inputs, outputs);
}
Beispiel #15
0
ArrayXXd CMT::BlobNonlinearity::derivative(const ArrayXXd& inputs) const {
	if(inputs.rows() != 1)
		throw Exception("Data has to be stored in one row.");

	ArrayXXd diff = ArrayXXd::Zero(mNumComponents, inputs.cols());
	diff.rowwise() -= inputs.row(0);
	diff.colwise() += mMeans;

	ArrayXd precisions = mLogPrecisions.exp();

	ArrayXXd negEnergy = diff.square().colwise() * (-precisions / 2.);

	return (mLogWeights.exp() * precisions).transpose().matrix() * (diff * negEnergy.exp()).matrix();
}
Beispiel #16
0
/**
 * Algorithm due to Knuth, 1969.
 */
ArrayXXi CMT::samplePoisson(const ArrayXXd& lambda) {
	ArrayXXi samples(lambda.rows(), lambda.cols());
	ArrayXXd threshold = (-lambda).exp();

	#pragma omp parallel for
	for(int i = 0; i < samples.size(); ++i) {
		double p = rand() / static_cast<double>(RAND_MAX);
		int k = 0;

		while(p > threshold(i)) {
			k += 1;
			p *= rand() / static_cast<double>(RAND_MAX);
		}

		samples(i) = k;
	}

	return samples;
}
Beispiel #17
0
void CMT::HistogramNonlinearity::initialize(
	const ArrayXXd& inputs,
	const ArrayXXd& outputs)
{
	if(inputs.rows() != outputs.rows() || inputs.cols() != outputs.cols())
		throw Exception("Inputs and outputs have to have same size.");

	mHistogram = vector<double>(mBinEdges.size() - 1);
	vector<int> counter(mBinEdges.size() - 1);

	for(int k = 0; k < mHistogram.size(); ++k) {
		mHistogram[k] = 0.;
		counter[k] = 0;
	}

	for(int i = 0; i < inputs.rows(); ++i)
		for(int j = 0; j < inputs.cols(); ++j) {
			// find bin
			int k = bin(inputs(i, j));

			// update histogram
			counter[k] += 1;
			mHistogram[k] += outputs(i, j);
		}

	for(int k = 0; k < mHistogram.size(); ++k)
		if(mHistogram[k] > 0.)
			// average output observed in bin k
			mHistogram[k] /= counter[k];
}
Beispiel #18
0
void
mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
    int N = mxGetScalar(prhs[0]);
    double d = mxGetScalar(prhs[1]);
    double h = mxGetScalar(prhs[2]);
    int Njacv = mxGetScalar(prhs[3]);
    double b = mxGetScalar(prhs[4]);
    double c = mxGetScalar(prhs[5]);
    double dr = mxGetScalar(prhs[6]);
    double di = mxGetScalar(prhs[7]);
    int threadNum = mxGetScalar(prhs[8]);

    double *a0 = mxGetPr(prhs[9]);
    double *v = mxGetPr(prhs[10]);
    double th = mxGetScalar(prhs[11]);
    double phi = mxGetScalar(prhs[12]);
    int nstp = mxGetScalar(prhs[13]);
    // mwSize isJ = mxGetScalar(prhs[14]);

    ArrayXXd av = gintgv(N, d, h, Njacv, b, c, dr, di, threadNum, a0, v, th, phi, nstp);
    plhs[0] = mxCreateDoubleMatrix(av.rows(), av.cols(), mxREAL);
    memcpy(mxGetPr(plhs[0]), av.data(), av.cols()*av.rows()*sizeof(double));
}
Beispiel #19
0
double CMT::MLR::parameterGradient(
	const MatrixXd& input,
	const MatrixXd& output,
	const lbfgsfloatval_t* x,
	lbfgsfloatval_t* g,
	const Trainable::Parameters& params_) const
{
	const Parameters& params = dynamic_cast<const Parameters&>(params_);

	MatrixXd weights = mWeights;
	VectorXd biases = mBiases;

	// copy parameters
	int k = 0;
	if(params.trainWeights)
		for(int i = 1; i < weights.rows(); ++i)
			for(int j = 0; j < weights.cols(); ++j, ++k)
				weights(i, j) = x[k];
	if(params.trainBiases)
		for(int i = 1; i < mBiases.rows(); ++i, ++k)
			biases[i] = x[k];

	// compute distribution over outputs
	ArrayXXd logProb = (weights * input).colwise() + biases;
	logProb.rowwise() -= logSumExp(logProb);

	// difference between prediction and actual output
	MatrixXd diff = (logProb.exp().matrix() - output);

	// compute gradients
	double normConst = output.cols() * log(2.);

	if(g) {
		int offset = 0;

		if(params.trainWeights) {
			Map<Matrix<double, Dynamic, Dynamic, RowMajor> > weightsGrad(g, mDimOut - 1, mDimIn);
			weightsGrad = (diff * input.transpose() / normConst).bottomRows(mDimOut - 1);
			offset += weightsGrad.size();

			weightsGrad += params.regularizeWeights.gradient(
				weights.bottomRows(mDimOut - 1).transpose()).transpose();
		}

		if(params.trainBiases) {
			VectorLBFGS biasesGrad(g + offset, mDimOut - 1);
			biasesGrad = diff.rowwise().sum().bottomRows(mDimOut - 1) / normConst;
			biasesGrad += params.regularizeBiases.gradient(biases);
		}
	}

	// return negative average log-likelihood in bits
	double value = -(logProb * output.array()).sum() / normConst;

	if(params.trainWeights)
		value += params.regularizeWeights.evaluate(weights.bottomRows(mDimOut - 1).transpose());

	if(params.trainBiases)
		value += params.regularizeBiases.evaluate(biases);

	return value;
}
Beispiel #20
0
MatrixXd cube(MatrixXd xin){
	
	ArrayXXd x = xin.array();	//convert to Array
	x*=(x*x);
	return x.matrix();
}
Beispiel #21
0
Array<double, 1, Dynamic> CMT::logMeanExp(const ArrayXXd& array) {
	Array<double, 1, Dynamic> arrayMax = array.colwise().maxCoeff() - 1.;
	return arrayMax + (array.rowwise() - arrayMax).exp().colwise().mean().log();
}
Beispiel #22
0
bool CMT::Mixture::train(
	const MatrixXd& data,
	const MatrixXd& dataValid,
	const Parameters& parameters,
	const Component::Parameters& componentParameters)
{
	if(parameters.initialize && !initialized())
		initialize(data, parameters, componentParameters);

	ArrayXXd logJoint(numComponents(), data.cols());
	Array<double, Dynamic, 1> postSum;
	Array<double, 1, Dynamic> logLik;
	ArrayXXd post;
	ArrayXXd weights;

	// training and validation log-loss for checking convergence
	double avgLogLoss = numeric_limits<double>::infinity();
	double avgLogLossNew;
	double avgLogLossValid = evaluate(dataValid);
	double avgLogLossValidNew = avgLogLossValid;
	int counter = 0;

	// backup model parameters
	VectorXd priors = mPriors;
	vector<Component*> components;

	for(int k = 0; k < numComponents(); ++k)
		components.push_back(mComponents[k]->copy());

	for(int i = 0; i < parameters.maxIter; ++i) {
		// compute joint probability of data and assignments (E)
		#pragma omp parallel for
		for(int k = 0; k < numComponents(); ++k)
			logJoint.row(k) = mComponents[k]->logLikelihood(data) + log(mPriors[k]);

		// compute normalized posterior (E)
		logLik = logSumExp(logJoint);

		// average negative log-likelihood in bits per component
		avgLogLossNew = -logLik.mean() / log(2.) / dim();

		if(parameters.verbosity > 0) {
			if(i % parameters.valIter == 0) {
				// print training and validation error
				cout << setw(6) << i;
				cout << setw(14) << setprecision(7) << avgLogLossNew;
				cout << setw(14) << setprecision(7) << avgLogLossValidNew << endl;
			} else {
				// print training error
				cout << setw(6) << i << setw(14) << setprecision(7) << avgLogLossNew << endl;
			}
		}

		// test for convergence
		if(avgLogLoss - avgLogLossNew < parameters.threshold)
			return true;
		avgLogLoss = avgLogLossNew;

		// compute normalized posterior (E)
		post = (logJoint.rowwise() - logLik).exp();
		postSum = post.rowwise().sum();
		weights = post.colwise() / postSum;

		// optimize prior weights (M)
		if(parameters.trainPriors) {
			mPriors = postSum / data.cols() + parameters.regularizePriors;
			mPriors /= mPriors.sum();
		}

		// optimize components (M)
		if(parameters.trainComponents) {
			#pragma omp parallel for
			for(int k = 0; k < numComponents(); ++k)
				mComponents[k]->train(data, weights.row(k), componentParameters);
		} else {
			return true;
		}

		if((i + 1) % parameters.valIter == 0) {
			// check validation error
			avgLogLossValidNew = evaluate(dataValid);

			if(avgLogLossValidNew < avgLogLossValid) {
				// backup new found model parameters
				priors = mPriors;
				for(int k = 0; k < numComponents(); ++k)
					*components[k] = *mComponents[k];
				
				avgLogLossValid = avgLogLossValidNew;
			} else {
				counter++;

				if(parameters.valLookAhead > 0 && counter >= parameters.valLookAhead) {
					// set parameters to best parameters found during training
					mPriors = priors;

					for(int k = 0; k < numComponents(); ++k) {
						*mComponents[k] = *components[k];
						delete components[k];
					}

					return true;
				}
			}
		}
	}

	if(parameters.verbosity > 0)
		cout << setw(6) << parameters.maxIter << setw(11) << setprecision(5) << evaluate(data) << endl;

	return false;
}
bool KmeansClusterer::updateClusterCentersUntilConverged(RefArrayXXd sample, RefArrayXXd centers, 
                                                         RefArrayXd clusterSizes, vector<int> &clusterIndices,
                                                         double &sumOfDistancesToClosestCenter, double relTolerance)
{
    unsigned int Npoints = sample.cols();
    unsigned int Ndimensions = sample.rows();
    unsigned int Nclusters = centers.cols();
    ArrayXXd updatedCenters = ArrayXXd::Zero(Ndimensions, Nclusters);   // coordinates of each of the new cluster centers


    // Perform the k-means clustering iteration, each time improving the cluster centers,
    // and redetermining which points belongs to which cluster
    
    bool stopIterations = false;
    bool convergenceReached;
    unsigned int indexOfClosestCenter;
    double oldSumOfDistances = 0.0;
    double newSumOfDistances = 0.0;
    double distanceToClosestCenter;
    double distance; 

    while (!stopIterations)
    {
        // Find for each point the closest cluster center.
        // At the same time recompute/update the new cluster centers, which is simply
        // the barycenter of all points belonging to the cluster. 
    
        clusterSizes.setZero();
        updatedCenters.setZero();
    
        for (int n = 0; n < Npoints; ++n)
        {
            distanceToClosestCenter = numeric_limits<double>::max();
        
            for (int i = 0; i < Nclusters; ++i)
            {
                distance = metric.distance(sample.col(n), centers.col(i));
                
                if (distance < distanceToClosestCenter)
                {
                    indexOfClosestCenter = i;
                    distanceToClosestCenter = distance;
                }
            }
        
            newSumOfDistances += distanceToClosestCenter;
            updatedCenters.col(indexOfClosestCenter) += sample.col(n);
            clusterSizes(indexOfClosestCenter) += 1; 
            clusterIndices[n] = indexOfClosestCenter;        
        }
    

        // Assert that all clusters contain at least 2 points. If not we probably started
        // with an unfortunate set of initial cluster centers. Flag this by immediately 
        // returning false.
       
        if (!(clusterSizes > 1).all())
        {
            convergenceReached = false;
            return convergenceReached;
        }
       

        // Finish computing the new updated centers. Given the check above, we are sure
        // that none of the clusters is empty. 
        
        updatedCenters.rowwise() /= clusterSizes.transpose();
        centers = updatedCenters;
    

        // A new set of clusters has been determined.
        // Decide whether the algorithm has converged. Convergence occurs when
        // the sum of all distances of all points to their cluster center does
        // not change significantly anymore. 
        // Note: in order for this criterion to work properly, the coordinate
        //       space should be normalized, so that one particular coordinate
        //       cannot numerically dominate all other coordinates.
    
        if (oldSumOfDistances == 0.0)
        {
            // This is the first center-updating iteration, so there is nothing to compare yet.
            // Simply set the variables.
    
            oldSumOfDistances = newSumOfDistances;
            newSumOfDistances = 0.0;
        }
        else
        {
            // If the relative change in sumOfDistances between old and new was smaller than
            // the threshold set by the user, stop the iteration loop.
        
            if (fabs(newSumOfDistances - oldSumOfDistances) / oldSumOfDistances < relTolerance)
            {
                sumOfDistancesToClosestCenter = newSumOfDistances;      // will be returned to user
                stopIterations = true;
            }
            else
            {
                oldSumOfDistances = newSumOfDistances;
                newSumOfDistances = 0.0;
            }   
        }
    }  // end k-means center-updating loop 
    
    
    // Convergence was properly reached, so return
    
    convergenceReached = true;
    
    return convergenceReached;  
}
Beispiel #24
0
ArrayXXd CMT::ExponentialFunction::derivative(const ArrayXXd& data) const {
	return data.exp();
}
Beispiel #25
0
ArrayXXd CMT::ExponentialFunction::operator()(const ArrayXXd& data) const {
	return data.exp() + mEpsilon;
}
Beispiel #26
0
ArrayXXd CMT::BlobNonlinearity::gradient(const ArrayXXd& inputs) const {
	if(inputs.rows() != 1)
		throw Exception("Data has to be stored in one row.");
	
	ArrayXXd diff = ArrayXXd::Zero(mNumComponents, inputs.cols());
	diff.rowwise() += inputs.row(0);
	diff.colwise() -= mMeans;

	ArrayXXd diffSq = diff.square();
	ArrayXd precisions = mLogPrecisions.exp();
	ArrayXd weights = mLogWeights.exp();

	ArrayXXd negEnergy = diffSq.colwise() * (-precisions / 2.);
	ArrayXXd negEnergyExp = negEnergy.exp();

	ArrayXXd gradient(3 * mNumComponents, inputs.cols());

	// gradient of mean
	gradient.topRows(mNumComponents) = (diff * negEnergyExp).colwise() * (weights * precisions);

	// gradient of log-precisions
	gradient.middleRows(mNumComponents, mNumComponents) = (diffSq / 2. * negEnergyExp).colwise() * (-weights * precisions);

	// gradient of log-weights
	gradient.bottomRows(mNumComponents) = negEnergyExp.colwise() * weights;

	return gradient;
}
Beispiel #27
0
CMT::WhiteningTransform::WhiteningTransform(const ArrayXXd& input, const ArrayXXd& output) {
	initialize(input, output.rows());
}
Beispiel #28
0
double CMT::MCBM::parameterGradient(
	const MatrixXd& inputCompl,
	const MatrixXd& outputCompl,
	const lbfgsfloatval_t* x,
	lbfgsfloatval_t* g,
	const Trainable::Parameters& params_) const
{
	const Parameters& params = dynamic_cast<const Parameters&>(params_);

	// average log-likelihood
	double logLik = 0.;

	// interpret memory for parameters and gradients
	lbfgsfloatval_t* y = const_cast<lbfgsfloatval_t*>(x);

	int offset = 0;

	VectorLBFGS priors(params.trainPriors ? y : const_cast<double*>(mPriors.data()), mNumComponents);
	VectorLBFGS priorsGrad(g, mNumComponents);
	if(params.trainPriors)
		offset += priors.size();

	MatrixLBFGS weights(params.trainWeights ? y + offset : const_cast<double*>(mWeights.data()), mNumComponents, mNumFeatures);
	MatrixLBFGS weightsGrad(g + offset, mNumComponents, mNumFeatures);
	if(params.trainWeights)
		offset += weights.size();

	MatrixLBFGS features(params.trainFeatures ? y + offset : const_cast<double*>(mFeatures.data()), mDimIn, mNumFeatures);
	MatrixLBFGS featuresGrad(g + offset, mDimIn, mNumFeatures);
	if(params.trainFeatures)
		offset += features.size();

	MatrixLBFGS predictors(params.trainPredictors ? y + offset : const_cast<double*>(mPredictors.data()), mNumComponents, mDimIn);
	MatrixLBFGS predictorsGrad(g + offset, mNumComponents, mDimIn);
	if(params.trainPredictors)
		offset += predictors.size();

	MatrixLBFGS inputBias(params.trainInputBias ? y + offset : const_cast<double*>(mInputBias.data()), mDimIn, mNumComponents);
	MatrixLBFGS inputBiasGrad(g + offset, mDimIn, mNumComponents);
	if(params.trainInputBias)
		offset += inputBias.size();

	VectorLBFGS outputBias(params.trainOutputBias ? y + offset : const_cast<double*>(mOutputBias.data()), mNumComponents);
	VectorLBFGS outputBiasGrad(g + offset, mNumComponents);
	if(params.trainOutputBias)
		offset += outputBias.size();

	if(g) {
		// initialize gradients
		if(params.trainPriors)
			priorsGrad.setZero();
		if(params.trainWeights)
			weightsGrad.setZero();
		if(params.trainFeatures)
			featuresGrad.setZero();
		if(params.trainPredictors)
			predictorsGrad.setZero();
		if(params.trainInputBias)
			inputBiasGrad.setZero();
		if(params.trainOutputBias)
			outputBiasGrad.setZero();
	}

	// split data into batches for better performance
	int numData = static_cast<int>(inputCompl.cols());
	int batchSize = min(max(params.batchSize, 10), numData);

	#pragma omp parallel for
	for(int b = 0; b < inputCompl.cols(); b += batchSize) {
		const MatrixXd& input = inputCompl.middleCols(b, min(batchSize, numData - b));
		const MatrixXd& output = outputCompl.middleCols(b, min(batchSize, numData - b));

		ArrayXXd featureOutput = features.transpose() * input;
		MatrixXd featureOutputSq = featureOutput.square();
		MatrixXd weightsOutput = weights * featureOutputSq;
		ArrayXXd predictorOutput = predictors * input;

		// unnormalized posteriors over components for both possible outputs
		ArrayXXd logPost0 = (weightsOutput + inputBias.transpose() * input).colwise() + priors;
		ArrayXXd logPost1 = (logPost0 + predictorOutput).colwise() + outputBias.array();

		// sum over components to get unnormalized probabilities of outputs
		Array<double, 1, Dynamic> logProb0 = logSumExp(logPost0);
		Array<double, 1, Dynamic> logProb1 = logSumExp(logPost1);
	
		// normalize posteriors over components
		logPost0.rowwise() -= logProb0;
		logPost1.rowwise() -= logProb1;

		// stack row vectors
		ArrayXXd logProb01(2, input.cols());
		logProb01 << logProb0, logProb1; 

		// normalize log-probabilities
		Array<double, 1, Dynamic> logNorm = logSumExp(logProb01);
		logProb1 -= logNorm;
		logProb0 -= logNorm;

		double logLikBatch = (output.array() * logProb1 + (1. - output.array()) * logProb0).sum();

		#pragma omp critical
		logLik += logLikBatch;

		if(!g)
			// don't compute gradients
			continue;

		Array<double, 1, Dynamic> tmp = output.array() * logProb0.exp() - (1. - output.array()) * logProb1.exp();

		ArrayXXd post0Tmp = logPost0.exp().rowwise() * tmp;
		ArrayXXd post1Tmp = logPost1.exp().rowwise() * tmp;
		ArrayXXd postDiffTmp = post1Tmp - post0Tmp;

		// update gradients
		if(params.trainPriors)
			#pragma omp critical
			priorsGrad -= postDiffTmp.rowwise().sum().matrix();

		if(params.trainWeights)
			#pragma omp critical
			weightsGrad -= postDiffTmp.matrix() * featureOutputSq.transpose();

		if(params.trainFeatures) {
			ArrayXXd tmp2 = weights.transpose() * postDiffTmp.matrix() * 2.;
			MatrixXd tmp3 = featureOutput * tmp2;
			#pragma omp critical
			featuresGrad -= input * tmp3.transpose();
		}

		if(params.trainPredictors)
			#pragma omp critical
			predictorsGrad -= post1Tmp.matrix() * input.transpose();

		if(params.trainInputBias)
			#pragma omp critical
			inputBiasGrad -= input * postDiffTmp.matrix().transpose();

		if(params.trainOutputBias)
			#pragma omp critical
			outputBiasGrad -= post1Tmp.rowwise().sum().matrix();
	}

	double normConst = inputCompl.cols() * log(2.) * dimOut();

	if(g) {
		for(int i = 0; i < offset; ++i)
			g[i] /= normConst;

		if(params.trainFeatures)
			featuresGrad += params.regularizeFeatures.gradient(features);

		if(params.trainPredictors)
			predictorsGrad += params.regularizePredictors.gradient(predictors.transpose()).transpose();

		if(params.trainWeights)
			weightsGrad += params.regularizeWeights.gradient(weights);
	}

	double value = -logLik / normConst;

	if(params.trainFeatures)
		value += params.regularizeFeatures.evaluate(features);

	if(params.trainPredictors)
		value += params.regularizePredictors.evaluate(predictors.transpose());

	if(params.trainWeights)
		value += params.regularizeWeights.evaluate(weights);

	return value;
}
void NestedSampler::run(LivePointsReducer &livePointsReducer, const int NinitialIterationsWithoutClustering, 
                        const int NiterationsWithSameClustering, const int maxNdrawAttempts, 
                        const double maxRatioOfRemainderToCurrentEvidence, string pathPrefix)
{
    int startTime = time(0);
    double logMeanLiveEvidence;
    terminationFactor = maxRatioOfRemainderToCurrentEvidence;
    outputPathPrefix = pathPrefix;

    if (printOnTheScreen)
    {
        cerr << "------------------------------------------------" << endl;
        cerr << " Bayesian Inference problem has " << Ndimensions << " dimensions." << endl;
        cerr << "------------------------------------------------" << endl;
        cerr << endl;
    }


    // Save configuring parameters to an output ASCII file

    string fileName = "configuringParameters.txt";
    string fullPath = outputPathPrefix + fileName;
    File::openOutputFile(outputFile, fullPath);
   
   
    outputFile << "# List of configuring parameters used for the NSMC." << endl;
    outputFile << "# Row #1: Ndimensions" << endl;
    outputFile << "# Row #2: Initial(Maximum) NlivePoints" << endl;
    outputFile << "# Row #3: Minimum NlivePoints" << endl;
    outputFile << "# Row #4: NinitialIterationsWithoutClustering" << endl;
    outputFile << "# Row #5: NiterationsWithSameClustering" << endl;
    outputFile << "# Row #6: maxNdrawAttempts" << endl;
    outputFile << "# Row #7: terminationFactor" << endl;
    outputFile << "# Row #8: Niterations" << endl;
    outputFile << "# Row #9: Optimal Niterations" << endl;
    outputFile << "# Row #10: Final Nclusters" << endl;
    outputFile << "# Row #11: Final NlivePoints" << endl;
    outputFile << "# Row #12: Computational Time (seconds)" << endl;
    outputFile << Ndimensions << endl;
    outputFile << initialNlivePoints << endl;
    outputFile << minNlivePoints << endl;
    outputFile << NinitialIterationsWithoutClustering << endl;
    outputFile << NiterationsWithSameClustering << endl;
    outputFile << maxNdrawAttempts << endl;
    outputFile << terminationFactor << endl;


    // Set up the random number generator. It generates integer random numbers
    // between 0 and NlivePoints-1, inclusive.

    uniform_int_distribution<int> discreteUniform(0, NlivePoints-1);


    // Draw the initial sample from the prior PDF. Different coordinates of a point
    // can have different priors, so these have to be sampled individually.
    
    if (printOnTheScreen)
    {
        cerr << "------------------------------------------------" << endl;
        cerr << " Doing initial sampling of parameter space..." << endl;
        cerr << "------------------------------------------------" << endl;
        cerr << endl;
    }
        
    nestedSample.resize(Ndimensions, NlivePoints);
    int beginIndex = 0;
    int NdimensionsOfCurrentPrior;
    ArrayXXd priorSample;

    for (int i = 0; i < ptrPriors.size(); i++)
    {
        // Some priors cover one particalar coordinate, others may cover two or more coordinates
        // Find out how many dimensions the current prior covers.

        NdimensionsOfCurrentPrior = ptrPriors[i]->getNdimensions();
        

        // Draw the subset of coordinates randomly from the current prior
        
        priorSample.resize(NdimensionsOfCurrentPrior, NlivePoints);
        ptrPriors[i]->draw(priorSample);


        // Insert this random subset of coordinates into the total sample of coordinates of points

        nestedSample.block(beginIndex, 0, NdimensionsOfCurrentPrior, NlivePoints) = priorSample;      


        // Move index to the beginning of the coordinate set of the next prior

        beginIndex += NdimensionsOfCurrentPrior;
    }


    // Compute the log(Likelihood) for each of our points in the live sample

    logLikelihood.resize(NlivePoints);
    
    for (int i = 0; i < NlivePoints; ++i)
    {
        logLikelihood(i) = likelihood.logValue(nestedSample.col(i));
    }


    // Initialize the prior mass interval and cumulate it

    double logWidthInPriorMass = log(1.0 - exp(-1.0/NlivePoints));                                             // X_0 - X_1    First width in prior mass
    logCumulatedPriorMass = Functions::logExpSum(logCumulatedPriorMass, logWidthInPriorMass);               // 1 - X_1
    logRemainingPriorMass = Functions::logExpDifference(logRemainingPriorMass, logWidthInPriorMass);        // X_1


    // Initialize first part of width in prior mass for trapezoidal rule
    // X_0 = (2 - X_1), right-side boundary condition for trapezoidal rule

    double logRemainingPriorMassRightBound = Functions::logExpDifference(log(2), logRemainingPriorMass);    
    double logWidthInPriorMassRight = Functions::logExpDifference(logRemainingPriorMassRightBound,logRemainingPriorMass);


    // Find maximum log(Likelihood) value in the initial sample of live points. 
    // This information can be useful when reducing the number of live points adopted within the nesting process.

    logMaxLikelihoodOfLivePoints = logLikelihood.maxCoeff();


    // The nested sampling will involve finding clusters in the sample.
    // This will require the containers clusterIndices and clusterSizes.

    unsigned int Nclusters = 0;
    vector<int> clusterIndices(NlivePoints);           // clusterIndices must have the same number of elements as the number of live points
    vector<int> clusterSizes;                       // The number of live points counted in each cluster is updated everytime one live point
                                                    // is removed from the sample.


    // Start the nested sampling loop. Each iteration, we'll replace the point with the worst likelihood.
    // New points are drawn from the prior, but with the constraint that they should have a likelihood
    // that is better than the currently worst one.
    
    if (printOnTheScreen)
    {
        cerr << "-------------------------------" << endl;
        cerr << " Starting nested sampling...   " << endl;
        cerr << "-------------------------------" << endl;
        cerr << endl;
    }
        
    bool nestedSamplingShouldContinue = true;
    bool livePointsShouldBeReduced = (initialNlivePoints > minNlivePoints);       // Update live points only if required
    
    Niterations = 0;

    do 
    {
        // Resize the arrays to make room for an additional point.
        // Do so without destroying the original contents.

        posteriorSample.conservativeResize(Ndimensions, Niterations + 1);  
        logLikelihoodOfPosteriorSample.conservativeResize(Niterations + 1);
        logWeightOfPosteriorSample.conservativeResize(Niterations + 1);
        

        // Find the point with the worst likelihood. This likelihood value will set a constraint
        // when drawing new points later on.
        
        int indexOfLivePointWithWorstLikelihood;
        worstLiveLogLikelihood = logLikelihood.minCoeff(&indexOfLivePointWithWorstLikelihood);

        
        // Although we will replace the point with the worst likelihood in the live sample, we will save
        // it in our collection of posterior sample. Also save its likelihood value. The weight is 
        // computed and collected at the end of each iteration.

        posteriorSample.col(Niterations) = nestedSample.col(indexOfLivePointWithWorstLikelihood); 
        logLikelihoodOfPosteriorSample(Niterations) = worstLiveLogLikelihood; 


        // Compute the (logarithm of) the mean likelihood of the set of live points.
        // Note that we are not computing mean(log(likelihood)) but log(mean(likelhood)).
        // Since we are only storing the log(likelihood) values, this results in a peculiar
        // way of computing the mean. This will be used for computing the mean live evidence
        // at the end of the iteration.
        
        logMeanLikelihoodOfLivePoints = logLikelihood(0);

        for (int m = 1; m < NlivePoints; m++)
        {
            logMeanLikelihoodOfLivePoints = Functions::logExpSum(logMeanLikelihoodOfLivePoints, logLikelihood(m));
        }

        logMeanLikelihoodOfLivePoints -= log(NlivePoints);
                

        // Find clusters in our live sample of points. Don't do this every iteration but only
        // every x iterations, where x is given by 'NiterationsWithSameClustering'.
        
        if ((Niterations % NiterationsWithSameClustering) == 0)
        {            
            // Don't do clustering the first N iterations, where N is user-specified. That is, 
            // the first N iterations we assume that there is only 1 cluster containing all the points.
            // This is often useful because initially the points may be sampled from a uniform prior,
            // and we therefore don't expect any clustering _before_ the algorithm is able to tune in on 
            // the island(s) of high likelihood. Clusters found in the first N initial iterations are
            // therefore likely purely noise.
        
            if (Niterations < NinitialIterationsWithoutClustering)
            {
                // There is only 1 cluster, containing all objects. All points have the same cluster
                // index, namely 0.
                       
                Nclusters = 1;
                clusterSizes.resize(1);
                clusterSizes[0] = NlivePoints;
                fill(clusterIndices.begin(), clusterIndices.end(), 0);
            }
            else         
            {
                // After the first N initial iterations, we do a proper clustering.
                
                Nclusters = clusterer.cluster(nestedSample, clusterIndices, clusterSizes);
            }
        }


        // Draw a new point, which should replace the point with the worst likelihood.
        // This new point should be drawn from the prior, but with a likelihood greater 
        // than the current worst likelihood. The drawing algorithm may need a starting point,
        // for which we will take a randomly chosen point of the live sample (excluding the
        // worst point).

        int indexOfRandomlyChosenLivePoint = 0;
        
        if (NlivePoints > 1)
        {
            // Select randomly an index of a sample point, but not the one of the worst point

            do 
            {
                // 0 <= indexOfRandomlyChosenLivePoint < NlivePoints

                indexOfRandomlyChosenLivePoint = discreteUniform(engine);
            } 
            while (indexOfRandomlyChosenLivePoint == indexOfLivePointWithWorstLikelihood);
        }


        // drawnPoint will be a starting point as input, and will contain the newly drawn point as output

        ArrayXd drawnPoint = nestedSample.col(indexOfRandomlyChosenLivePoint);
        double logLikelihoodOfDrawnPoint = 0.0;
        bool newPointIsFound = drawWithConstraint(nestedSample, Nclusters, clusterIndices, clusterSizes, 
                                                  drawnPoint, logLikelihoodOfDrawnPoint, maxNdrawAttempts); 


        // If the adopted sampler produces an error (e.g. in the case of the ellipsoidal sampler a failure
        // in the ellipsoid matrix decomposition), then we can stop right here.
        
        nestedSamplingShouldContinue = verifySamplerStatus();
        if (!nestedSamplingShouldContinue) break;


        // If we didn't find a point with a better likelihood, then we can stop right here.
        
        if (!newPointIsFound)
        {
            nestedSamplingShouldContinue = false;
            cerr << "Can't find point with a better Likelihood." << endl; 
            cerr << "Stopping the nested sampling loop prematurely." << endl;
            break;
        }


        // Replace the point having the worst likelihood with our newly drawn one.

        nestedSample.col(indexOfLivePointWithWorstLikelihood) = drawnPoint;
        logLikelihood(indexOfLivePointWithWorstLikelihood) = logLikelihoodOfDrawnPoint;
       
        
        // If we got till here this is not the last iteration possible, hence 
        // update all the information for the next iteration. 
        // Check if the number of live points has not reached the minimum allowed,
        // and update it for the next iteration.

        if (livePointsShouldBeReduced)
        {
            // Update the number of live points for the current iteration based on the previous number.
            // If the number of live points reaches the minimum allowed 
            // then do not update the number anymore.

            updatedNlivePoints = livePointsReducer.updateNlivePoints();
            
            if (updatedNlivePoints > NlivePoints)
            {
                // Terminate program if new number of live points is greater than previous one
                    
                cerr << "Something went wrong in the reduction of the live points." << endl;
                cerr << "The new number of live points is greater than the previous one." << endl;
                cerr << "Quitting program. " << endl;
                break;
            }

                
            // If the lower bound for the number of live points has not been reached yet, 
            // the process should be repeated at the next iteration.
            // Otherwise the minimun number allowed is reached right now. In this case
            // stop the reduction process starting from the next iteration.
                
            livePointsShouldBeReduced = (updatedNlivePoints > minNlivePoints);

            if (updatedNlivePoints != NlivePoints)
            {
                // Resize all eigen arrays and vectors of dimensions NlivePoints according to 
                // new number of live points evaluated. In case previos and new number 
                // of live points coincide, no resizing is done.
                    
                vector<int> indicesOfLivePointsToRemove = livePointsReducer.findIndicesOfLivePointsToRemove(engine);

                    
                // At least one live point has to be removed, hence update the sample

                removeLivePointsFromSample(indicesOfLivePointsToRemove, clusterIndices, clusterSizes);
                        
                        
                // Since everything is fine update discreteUniform with the corresponding new upper bound

                uniform_int_distribution<int> discreteUniform2(0, updatedNlivePoints-1);
                discreteUniform = discreteUniform2;
            }
        }


        // Store the new number of live points in the vector containing this information.
        // This is done even if the new number is the same as the previous one.

        NlivePointsPerIteration.push_back(NlivePoints);

            
        // Compute the mean live evidence given the previous set of live points (see Keeton 2011, MNRAS) 

        logMeanLiveEvidence = logMeanLikelihoodOfLivePoints + Niterations * (log(NlivePoints) - log(NlivePoints + 1));


        // Compute the ratio of the evidence of the live sample to the current Skilling's evidence.
        // Only when we gathered enough evidence, this ratio will be sufficiently small so that we can stop the iterations.

        ratioOfRemainderToCurrentEvidence = exp(logMeanLiveEvidence - logEvidence);


        // Re-evaluate the stopping criterion, using the condition suggested by Keeton (2011)

        nestedSamplingShouldContinue = (ratioOfRemainderToCurrentEvidence > maxRatioOfRemainderToCurrentEvidence);


        // Shrink prior mass interval according to proper number of live points 
        // (see documentation by Enrico Corsaro October 2013). When reducing the number of live points 
        // the equation is a generalized version of that used by Skilling 2004. The equation
        // reduces to the standard case when the new number of live points is the same
        // as the previous one.

        // ---- Use the line below for simple rectangular rule ----
        // double logWeight = logWidthInPriorMass;
        // --------------------------------------------------------
        
        double logStretchingFactor = Niterations*((1.0/NlivePoints) - (1.0/updatedNlivePoints)); 
        logWidthInPriorMass = logRemainingPriorMass + Functions::logExpDifference(0.0, logStretchingFactor - 1.0/updatedNlivePoints);  // X_i - X_(i+1)

        
        // Compute the logWeight according to the trapezoidal rule 0.5*(X_(i-1) - X_(i+1)) 
        // and new contribution of evidence to be cumulated to the total evidence.
        // This is done in logarithmic scale by summing the right (X_(i-1) - X_i) and left part (X_i - X_(i+1)) 
        // of the total width in prior mass required for the trapezoidal rule. We do this computation at the end 
        // of the nested iteration because we need to know the new remaining prior mass of the next iteration.
            
        double logWidthInPriorMassLeft = logWidthInPriorMass; 

        
        // ---- Use the line below for trapezoidal rule ----

        double logWeight = log(0.5) + Functions::logExpSum(logWidthInPriorMassLeft, logWidthInPriorMassRight);
        double logEvidenceContributionNew = logWeight + worstLiveLogLikelihood;


        // Save log(Weight) of the current iteration

        logWeightOfPosteriorSample(Niterations) = logWeight;


        // Update the right part of the width in prior mass interval by replacing it with the left part

        logWidthInPriorMassRight = logWidthInPriorMass;


        // Update the evidence and the information Gain
        
        double logEvidenceNew = Functions::logExpSum(logEvidence, logEvidenceContributionNew);
        informationGain = exp(logEvidenceContributionNew - logEvidenceNew) * worstLiveLogLikelihood 
                        + exp(logEvidence - logEvidenceNew) * (informationGain + logEvidence) 
                        - logEvidenceNew;
        logEvidence = logEvidenceNew;


        // Print current information on the screen, if required

        if (printOnTheScreen)
        {
            if ((Niterations % 50) == 0)
            {
                cerr << "Nit: " << Niterations 
                     << "   Ncl: " << Nclusters 
                     << "   Nlive: " << NlivePoints
                     << "   CPM: " << exp(logCumulatedPriorMass)
                     << "   Ratio: " << ratioOfRemainderToCurrentEvidence
                     << "   log(E): " << logEvidence 
                     << "   IG: " << informationGain
                     << endl; 
            }
        }


        // Update total width in prior mass and remaining width in prior mass from beginning to current iteration
        // and use this information for the next iteration (if any)

        logCumulatedPriorMass = Functions::logExpSum(logCumulatedPriorMass, logWidthInPriorMass);
        logRemainingPriorMass = logStretchingFactor + logRemainingPriorMass - 1.0/updatedNlivePoints;


        // Update new number of live points in NestedSampler class 
            
        NlivePoints = updatedNlivePoints;


        // Increase nested loop counter
        
        Niterations++;
    }
    while (nestedSamplingShouldContinue);


    // Add the remaining live sample of points to our collection of posterior points 
    // (i.e parameter coordinates, likelihood values and weights)

    unsigned int oldNpointsInPosterior = posteriorSample.cols();

    posteriorSample.conservativeResize(Ndimensions, oldNpointsInPosterior + NlivePoints);          // First make enough room
    posteriorSample.block(0, oldNpointsInPosterior, Ndimensions, NlivePoints) = nestedSample;      // Then copy the live sample to the posterior array
    logWeightOfPosteriorSample.conservativeResize(oldNpointsInPosterior + NlivePoints);
    logWeightOfPosteriorSample.segment(oldNpointsInPosterior, NlivePoints).fill(logRemainingPriorMass - log(NlivePoints));  // Check if the best condition to impose 
    logLikelihoodOfPosteriorSample.conservativeResize(oldNpointsInPosterior + NlivePoints);
    logLikelihoodOfPosteriorSample.segment(oldNpointsInPosterior, NlivePoints) = logLikelihood; 


    // Compute Skilling's error on the log(Evidence)
    
    logEvidenceError = sqrt(fabs(informationGain)/NlivePoints);


    // Add Mean Live Evidence of the remaining live sample of points to the total log(Evidence) collected

    logEvidence = Functions::logExpSum(logMeanLiveEvidence, logEvidence);
    
    if (printOnTheScreen)
    {
        cerr << "------------------------------------------------" << endl;
        cerr << " Final log(E): " << logEvidence << " +/- " << logEvidenceError << endl;
        cerr << "------------------------------------------------" << endl;
    }

    // Print total computational time

    printComputationalTime(startTime);
    
    
    // Append information to existing output file and close stream afterwards
    
    outputFile << Niterations << endl;
    outputFile << static_cast<int>((NlivePoints*informationGain) + (NlivePoints*sqrt(Ndimensions*1.0))) << endl;
    outputFile << Nclusters << endl;
    outputFile << NlivePoints << endl;
    outputFile << computationalTime << endl;
}
Beispiel #30
0
bool CMT::Mixture::train(
	const MatrixXd& data,
	const Parameters& parameters,
	const Component::Parameters& componentParameters)
{
	if(data.rows() != dim())
		throw Exception("Data has wrong dimensionality.");

	if(parameters.initialize && !initialized())
		initialize(data, parameters, componentParameters);

	ArrayXXd logJoint(numComponents(), data.cols());
	Array<double, Dynamic, 1> postSum;
	Array<double, 1, Dynamic> logLik;
	ArrayXXd post;
	ArrayXXd weights;
	double avgLogLoss = numeric_limits<double>::infinity();
	double avgLogLossNew;

	for(int i = 0; i < parameters.maxIter; ++i) {
		// compute joint probability of data and assignments (E)
		#pragma omp parallel for
		for(int k = 0; k < numComponents(); ++k)
			logJoint.row(k) = mComponents[k]->logLikelihood(data) + log(mPriors[k]);

		// compute normalized posterior (E)
		logLik = logSumExp(logJoint);

		// average negative log-likelihood in bits per component
		avgLogLossNew = -logLik.mean() / log(2.) / dim();

		if(parameters.verbosity > 0)
			cout << setw(6) << i << setw(14) << setprecision(7) << avgLogLossNew << endl;

		// test for convergence
		if(avgLogLoss - avgLogLossNew < parameters.threshold)
			return true;
		avgLogLoss = avgLogLossNew;

		// compute normalized posterior (E)
		post = (logJoint.rowwise() - logLik).exp();
		postSum = post.rowwise().sum();
		weights = post.colwise() / postSum;

		// optimize prior weights (M)
		if(parameters.trainPriors) {
			mPriors = postSum / data.cols() + parameters.regularizePriors;
			mPriors /= mPriors.sum();
		}

		// optimize components (M)
		if(parameters.trainComponents) {
			#pragma omp parallel for
			for(int k = 0; k < numComponents(); ++k)
				mComponents[k]->train(data, weights.row(k), componentParameters);
		} else {
			return true;
		}
	}

	if(parameters.verbosity > 0)
		cout << setw(6) << parameters.maxIter << setw(14) << setprecision(7) << evaluate(data) << endl;

	return false;
}