Ejemplo n.º 1
0
ArrayXXd CMT::BlobNonlinearity::gradient(const ArrayXXd& inputs) const {
	// Parameter gradient of the blob nonlinearity, stacked row-wise as
	// [d/dMeans; d/dLogPrecisions; d/dLogWeights] (3 * mNumComponents rows).
	if(inputs.rows() != 1)
		throw Exception("Data has to be stored in one row.");

	// centered(k, i) = inputs(0, i) - mMeans[k]
	ArrayXXd centered = ArrayXXd::Zero(mNumComponents, inputs.cols());
	centered.rowwise() += inputs.row(0);
	centered.colwise() -= mMeans;

	ArrayXd precisions = mLogPrecisions.exp();
	ArrayXd weights = mLogWeights.exp();

	ArrayXXd centeredSq = centered.square();
	// blobValues(k, i) = exp(-p_k / 2 * (x_i - mu_k)^2)
	ArrayXXd blobValues = (centeredSq.colwise() * (-precisions / 2.)).exp();

	ArrayXXd result(3 * mNumComponents, inputs.cols());

	// gradient of means: w_k p_k (x - mu_k) exp(...)
	result.topRows(mNumComponents) =
		(centered * blobValues).colwise() * (weights * precisions);

	// gradient of log-precisions: -w_k p_k (x - mu_k)^2 / 2 exp(...)
	result.middleRows(mNumComponents, mNumComponents) =
		(centeredSq / 2. * blobValues).colwise() * (-weights * precisions);

	// gradient of log-weights: w_k exp(...)
	result.bottomRows(mNumComponents) = blobValues.colwise() * weights;

	return result;
}
Ejemplo n.º 2
0
ArrayXXd CMT::BlobNonlinearity::operator()(const ArrayXXd& inputs) const {
	// Evaluates the nonlinearity: a weighted sum of Gaussian blobs plus mEpsilon.
	if(inputs.rows() != 1)
		throw Exception("Data has to be stored in one row.");

	// centered(k, i) = inputs(0, i) - mMeans[k]
	ArrayXXd centered = ArrayXXd::Zero(mNumComponents, inputs.cols());
	centered.rowwise() += inputs.row(0);
	centered.colwise() -= mMeans;

	// negEnergy(k, i) = -p_k / 2 * (x_i - mu_k)^2
	ArrayXXd negEnergy = centered.square().colwise() * (-mLogPrecisions.exp() / 2.);
	ArrayXXd blobValues = negEnergy.exp();

	// weighted sum over components, offset by mEpsilon
	return (mLogWeights.exp().transpose().matrix() * blobValues.matrix()).array() + mEpsilon;
}
Ejemplo n.º 3
0
ArrayXXd CMT::BlobNonlinearity::derivative(const ArrayXXd& inputs) const {
	// Derivative of the nonlinearity with respect to its input:
	// f'(x) = sum_k w_k p_k (mu_k - x) exp(-p_k / 2 * (x - mu_k)^2)
	if(inputs.rows() != 1)
		throw Exception("Data has to be stored in one row.");

	// flipped(k, i) = mMeans[k] - inputs(0, i)
	ArrayXXd flipped = ArrayXXd::Zero(mNumComponents, inputs.cols());
	flipped.rowwise() -= inputs.row(0);
	flipped.colwise() += mMeans;

	ArrayXd precisions = mLogPrecisions.exp();
	ArrayXXd blobValues = (flipped.square().colwise() * (-precisions / 2.)).exp();

	return (mLogWeights.exp() * precisions).transpose().matrix()
		* (flipped * blobValues).matrix();
}
Ejemplo n.º 4
0
/**
 * Trains the mixture via expectation maximization with validation-based
 * early stopping.
 *
 * Every @c parameters.valIter iterations the model is evaluated on
 * @c dataValid; whenever the validation loss improves, the current
 * parameters are backed up. If it fails to improve for
 * @c parameters.valLookAhead consecutive checks, the best parameters seen
 * so far are restored and training stops.
 *
 * @param data training data points stored in columns
 * @param dataValid validation data points stored in columns
 * @param parameters hyperparameters controlling the optimization
 * @param componentParameters hyperparameters passed on to each component
 * @return true if training converged or stopped early, false if the
 *         maximum number of iterations was reached
 */
bool CMT::Mixture::train(
	const MatrixXd& data,
	const MatrixXd& dataValid,
	const Parameters& parameters,
	const Component::Parameters& componentParameters)
{
	// same sanity check as the overload without validation data
	if(data.rows() != dim())
		throw Exception("Data has wrong dimensionality.");

	if(parameters.initialize && !initialized())
		initialize(data, parameters, componentParameters);

	ArrayXXd logJoint(numComponents(), data.cols());
	Array<double, Dynamic, 1> postSum;
	Array<double, 1, Dynamic> logLik;
	ArrayXXd post;
	ArrayXXd weights;

	// training and validation log-loss for checking convergence
	double avgLogLoss = numeric_limits<double>::infinity();
	double avgLogLossNew;
	double avgLogLossValid = evaluate(dataValid);
	double avgLogLossValidNew = avgLogLossValid;
	int counter = 0;

	// backup of the best model parameters found so far
	VectorXd priors = mPriors;
	vector<Component*> components;

	for(int k = 0; k < numComponents(); ++k)
		components.push_back(mComponents[k]->copy());

	bool converged = false;

	for(int i = 0; i < parameters.maxIter; ++i) {
		// compute joint probability of data and assignments (E)
		#pragma omp parallel for
		for(int k = 0; k < numComponents(); ++k)
			logJoint.row(k) = mComponents[k]->logLikelihood(data) + log(mPriors[k]);

		// compute data log-likelihood (E)
		logLik = logSumExp(logJoint);

		// average negative log-likelihood in bits per component
		avgLogLossNew = -logLik.mean() / log(2.) / dim();

		if(parameters.verbosity > 0) {
			if(i % parameters.valIter == 0) {
				// print training and validation error
				cout << setw(6) << i;
				cout << setw(14) << setprecision(7) << avgLogLossNew;
				cout << setw(14) << setprecision(7) << avgLogLossValidNew << endl;
			} else {
				// print training error
				cout << setw(6) << i << setw(14) << setprecision(7) << avgLogLossNew << endl;
			}
		}

		// test for convergence
		if(avgLogLoss - avgLogLossNew < parameters.threshold) {
			converged = true;
			break;
		}
		avgLogLoss = avgLogLossNew;

		// compute normalized posterior (E)
		post = (logJoint.rowwise() - logLik).exp();
		postSum = post.rowwise().sum();
		weights = post.colwise() / postSum;

		// optimize prior weights (M)
		if(parameters.trainPriors) {
			mPriors = postSum / data.cols() + parameters.regularizePriors;
			mPriors /= mPriors.sum();
		}

		// optimize components (M)
		if(parameters.trainComponents) {
			#pragma omp parallel for
			for(int k = 0; k < numComponents(); ++k)
				mComponents[k]->train(data, weights.row(k), componentParameters);
		} else {
			converged = true;
			break;
		}

		if((i + 1) % parameters.valIter == 0) {
			// check validation error
			avgLogLossValidNew = evaluate(dataValid);

			if(avgLogLossValidNew < avgLogLossValid) {
				// backup new best model parameters
				priors = mPriors;
				for(int k = 0; k < numComponents(); ++k)
					*components[k] = *mComponents[k];

				avgLogLossValid = avgLogLossValidNew;
			} else {
				counter++;

				if(parameters.valLookAhead > 0 && counter >= parameters.valLookAhead) {
					// restore best parameters found during training
					mPriors = priors;

					for(int k = 0; k < numComponents(); ++k)
						*mComponents[k] = *components[k];

					converged = true;
					break;
				}
			}
		}
	}

	if(!converged && parameters.verbosity > 0)
		cout << setw(6) << parameters.maxIter << setw(14) << setprecision(7) << evaluate(data) << endl;

	// free backup copies on every exit path
	// (previously leaked on the convergence and max-iteration returns)
	for(int k = 0; k < numComponents(); ++k)
		delete components[k];

	return converged;
}
Ejemplo n.º 5
0
/**
 * Trains the mixture via expectation maximization on the training data only
 * (no validation set, no early stopping beyond the convergence threshold).
 *
 * @param data training data points stored in columns (dim() rows)
 * @param parameters hyperparameters controlling the optimization
 * @param componentParameters hyperparameters passed on to each component
 * @return true if the improvement in average log-loss fell below
 *         parameters.threshold (converged), false if maxIter was reached
 * @throws Exception if the data does not have dim() rows
 */
bool CMT::Mixture::train(
	const MatrixXd& data,
	const Parameters& parameters,
	const Component::Parameters& componentParameters)
{
	if(data.rows() != dim())
		throw Exception("Data has wrong dimensionality.");

	// optionally initialize model parameters from the data on first use
	if(parameters.initialize && !initialized())
		initialize(data, parameters, componentParameters);

	ArrayXXd logJoint(numComponents(), data.cols());
	Array<double, Dynamic, 1> postSum;
	Array<double, 1, Dynamic> logLik;
	ArrayXXd post;
	ArrayXXd weights;
	double avgLogLoss = numeric_limits<double>::infinity();
	double avgLogLossNew;

	for(int i = 0; i < parameters.maxIter; ++i) {
		// compute joint probability of data and assignments (E)
		#pragma omp parallel for
		for(int k = 0; k < numComponents(); ++k)
			logJoint.row(k) = mComponents[k]->logLikelihood(data) + log(mPriors[k]);

		// log-likelihood of each data point under the full mixture (E)
		logLik = logSumExp(logJoint);

		// average negative log-likelihood in bits per component
		avgLogLossNew = -logLik.mean() / log(2.) / dim();

		if(parameters.verbosity > 0)
			cout << setw(6) << i << setw(14) << setprecision(7) << avgLogLossNew << endl;

		// test for convergence: stop once the loss improves by less than threshold
		if(avgLogLoss - avgLogLossNew < parameters.threshold)
			return true;
		avgLogLoss = avgLogLossNew;

		// compute normalized posterior over components for each point (E)
		post = (logJoint.rowwise() - logLik).exp();
		postSum = post.rowwise().sum();
		weights = post.colwise() / postSum;

		// optimize prior weights (M); regularizer keeps priors away from zero
		if(parameters.trainPriors) {
			mPriors = postSum / data.cols() + parameters.regularizePriors;
			mPriors /= mPriors.sum();
		}

		// optimize components (M), each weighted by its posterior responsibilities
		if(parameters.trainComponents) {
			#pragma omp parallel for
			for(int k = 0; k < numComponents(); ++k)
				mComponents[k]->train(data, weights.row(k), componentParameters);
		} else {
			// nothing left to optimize after the prior update
			return true;
		}
	}

	// maximum number of iterations reached without convergence
	if(parameters.verbosity > 0)
		cout << setw(6) << parameters.maxIter << setw(14) << setprecision(7) << evaluate(data) << endl;

	return false;
}
Ejemplo n.º 6
0
Archivo: utils.cpp Proyecto: cajal/cmt
Array<double, 1, Dynamic> CMT::logMeanExp(const ArrayXXd& array) {
	// Numerically stable log of the column-wise mean of exp(array):
	// shift each column by (its max - 1) so the exponentials cannot overflow,
	// then add the shift back after taking the log.
	Array<double, 1, Dynamic> shift = array.colwise().maxCoeff() - 1.;
	Array<double, 1, Dynamic> meanExp = (array.rowwise() - shift).exp().colwise().mean();
	return shift + meanExp.log();
}
Ejemplo n.º 7
0
/**
 * Computes the gradient of the log-likelihood with respect to the inputs
 * (and, formally, the outputs, which is always zero here), together with
 * the log-likelihood of each input/output pair.
 *
 * Handles four cases: purely nonlinear inputs, mixed nonlinear/linear
 * inputs, purely linear inputs, and no inputs at all.
 *
 * @param input inputs stored in columns (dimIn() rows)
 * @param output outputs stored in one row, aligned with the inputs
 * @return pair of (input gradient, output gradient) and per-sample
 *         log-likelihoods
 * @throws Exception if the nonlinearity is not differentiable or the
 *         dimensions are inconsistent
 */
pair<pair<ArrayXXd, ArrayXXd>, Array<double, 1, Dynamic> > CMT::STM::computeDataGradient(
	const MatrixXd& input,
	const MatrixXd& output) const
{
	// make sure nonlinearity is differentiable
	DifferentiableNonlinearity* nonlinearity =
		dynamic_cast<DifferentiableNonlinearity*>(mNonlinearity);
	if(!nonlinearity)
		throw Exception("Nonlinearity has to be differentiable.");

	if(input.rows() != dimIn())
		throw Exception("Input has wrong dimensionality.");
	if(output.rows() != 1)
		throw Exception("Output has wrong dimensionality.");
	if(input.cols() != output.cols())
		throw Exception("Number of inputs and outputs should be the same.");

	if(dimInNonlinear() && !dimInLinear()) {
		// all inputs feed the nonlinear pathway
		Array<double, 1, Dynamic> responses;

		ArrayXXd jointEnergy;

		if(numFeatures() > 0)
			jointEnergy = mWeights * (mFeatures.transpose() * input).array().square().matrix()
				+ mPredictors * input;
		else
			jointEnergy = mPredictors * input;
		jointEnergy.colwise() += mBiases.array();
		jointEnergy *= mSharpness;

		// soft-max pooling over components
		responses = logSumExp(jointEnergy);

		// posterior over components for each input
		MatrixXd posterior = (jointEnergy.rowwise() - responses).array().exp();

		responses /= mSharpness;

		// tmp0: nonlinearity output; tmp1: negative distribution gradient;
		// tmp2: derivative of the nonlinearity at the responses
		Array<double, 1, Dynamic> tmp0 = (*mNonlinearity)(responses);
		Array<double, 1, Dynamic> tmp1 = -mDistribution->gradient(output, tmp0);
		Array<double, 1, Dynamic> tmp2 = nonlinearity->derivative(responses);

		// posterior-weighted average of the component predictors
		ArrayXXd avgPredictor = mPredictors.transpose() * posterior;

		// contribution of the quadratic feature term, if any
		ArrayXXd tmp3;
		if(numFeatures() > 0) {
			ArrayXXd avgWeights = (2. * mWeights).transpose() * posterior;
			tmp3 = mFeatures * (avgWeights * (mFeatures.transpose() * input).array()).matrix();
		} else {
			tmp3 = ArrayXXd::Zero(avgPredictor.rows(), avgPredictor.cols());
		}

		return make_pair(
			make_pair(
				(tmp3 + avgPredictor).rowwise() * (tmp1 * tmp2),
				ArrayXXd::Zero(output.rows(), output.cols())),
			mDistribution->logLikelihood(output, tmp0));

	} else if(dimInNonlinear() && dimInLinear()) {
		// split inputs into linear and nonlinear components
		MatrixXd inputNonlinear = input.topRows(dimInNonlinear());
		MatrixXd inputLinear = input.bottomRows(dimInLinear());

		Array<double, 1, Dynamic> responses;

		ArrayXXd jointEnergy;

		// BUG FIX: the predictors act on the nonlinear subspace only; the
		// original code multiplied mPredictors with the full input here,
		// which is inconsistent with the else-branch below (and with the
		// feature term), and mismatches the predictor dimensions.
		if(numFeatures() > 0)
			jointEnergy = mWeights * (mFeatures.transpose() * inputNonlinear).array().square().matrix()
				+ mPredictors * inputNonlinear;
		else
			jointEnergy = mPredictors * inputNonlinear;
		jointEnergy.colwise() += mBiases.array();
		jointEnergy *= mSharpness;

		// soft-max pooling over components
		responses = logSumExp(jointEnergy);

		// posterior over components for each input
		MatrixXd posterior = (jointEnergy.rowwise() - responses).array().exp();

		responses /= mSharpness;
		// add the linear pathway's contribution to the responses
		responses += (mLinearPredictor.transpose() * inputLinear).array();

		// tmp0: nonlinearity output; tmp1: negative distribution gradient;
		// tmp2: derivative of the nonlinearity at the responses
		Array<double, 1, Dynamic> tmp0 = (*mNonlinearity)(responses);
		Array<double, 1, Dynamic> tmp1 = -mDistribution->gradient(output, tmp0);
		Array<double, 1, Dynamic> tmp2 = nonlinearity->derivative(responses);

		// posterior-weighted average of the component predictors
		ArrayXXd avgPredictor = mPredictors.transpose() * posterior;

		// contribution of the quadratic feature term, if any
		ArrayXXd tmp3;
		if(numFeatures() > 0) {
			ArrayXXd avgWeights = (2. * mWeights).transpose() * posterior;
			tmp3 = mFeatures * (avgWeights * (mFeatures.transpose() * inputNonlinear).array()).matrix();
		} else {
			tmp3 = ArrayXXd::Zero(avgPredictor.rows(), avgPredictor.cols());
		}

		// concatenate gradients of nonlinear and linear component
		ArrayXXd inputGradient(dimIn(), input.cols());
		inputGradient << 
			(tmp3 + avgPredictor).rowwise() * (tmp1 * tmp2),
			mLinearPredictor * (tmp1 * tmp2).matrix();

		return make_pair(
			make_pair(
				inputGradient,
				ArrayXXd::Zero(output.rows(), output.cols())),
			mDistribution->logLikelihood(output, tmp0));

	} else if(dimInLinear()) {
		// purely linear model: the biases collapse into a single offset
		double avgBias = logSumExp(mSharpness * mBiases)(0, 0) / mSharpness;
		Array<double, 1, Dynamic> responses = (mLinearPredictor.transpose() * input).array() + avgBias;

		Array<double, 1, Dynamic> tmp0 = (*mNonlinearity)(responses);
		Array<double, 1, Dynamic> tmp1 = -mDistribution->gradient(output, tmp0);
		Array<double, 1, Dynamic> tmp2 = nonlinearity->derivative(responses);

		return make_pair(
			make_pair(
				mLinearPredictor * (tmp1 * tmp2).matrix(),
				ArrayXXd::Zero(output.rows(), output.cols())),
			mDistribution->logLikelihood(output, tmp0));
	}

	// no inputs: gradients are zero, only the log-likelihood is informative
	return make_pair(
		make_pair(
			ArrayXXd::Zero(input.rows(), input.cols()),
			ArrayXXd::Zero(output.rows(), output.cols())),
		logLikelihood(input, output));
}