Beispiel #1
0
void CMT::WhiteningTransform::initialize(const ArrayXXd& input, int dimOut) {
	if(input.cols() < input.rows())
		throw Exception("Too few inputs to compute whitening transform."); 

	mMeanIn = input.rowwise().mean();

	// compute covariances
	MatrixXd covXX = covariance(input);

	// input whitening
	SelfAdjointEigenSolver<MatrixXd> eigenSolver;

	eigenSolver.compute(covXX);

	Array<double, 1, Dynamic> eigenvalues = eigenSolver.eigenvalues();
	MatrixXd eigenvectors = eigenSolver.eigenvectors();

	// don't whiten directions with near-zero variance
	for(int i = 0; i < eigenvalues.size(); ++i)
		if(eigenvalues[i] < 1e-7)
			eigenvalues[i] = 1.;

	mPreIn = (eigenvectors.array().rowwise() * eigenvalues.sqrt().cwiseInverse()).matrix()
		* eigenvectors.transpose();
	mPreInInv = (eigenvectors.array().rowwise() * eigenvalues.sqrt()).matrix()
		* eigenvectors.transpose();

	mMeanOut = VectorXd::Zero(dimOut);
	mPreOut = MatrixXd::Identity(dimOut, dimOut);
	mPreOutInv = MatrixXd::Identity(dimOut, dimOut);
	mPredictor = MatrixXd::Zero(dimOut, input.rows());
	mGradTransform = MatrixXd::Zero(dimOut, input.rows());
	mLogJacobian = 1.;
}
Beispiel #2
0
ArrayXXd CMT::BlobNonlinearity::gradient(const ArrayXXd& inputs) const {
	if(inputs.rows() != 1)
		throw Exception("Data has to be stored in one row.");
	
	ArrayXXd diff = ArrayXXd::Zero(mNumComponents, inputs.cols());
	diff.rowwise() += inputs.row(0);
	diff.colwise() -= mMeans;

	ArrayXXd diffSq = diff.square();
	ArrayXd precisions = mLogPrecisions.exp();
	ArrayXd weights = mLogWeights.exp();

	ArrayXXd negEnergy = diffSq.colwise() * (-precisions / 2.);
	ArrayXXd negEnergyExp = negEnergy.exp();

	ArrayXXd gradient(3 * mNumComponents, inputs.cols());

	// gradient of mean
	gradient.topRows(mNumComponents) = (diff * negEnergyExp).colwise() * (weights * precisions);

	// gradient of log-precisions
	gradient.middleRows(mNumComponents, mNumComponents) = (diffSq / 2. * negEnergyExp).colwise() * (-weights * precisions);

	// gradient of log-weights
	gradient.bottomRows(mNumComponents) = negEnergyExp.colwise() * weights;

	return gradient;
}
Beispiel #3
0
Array<int, 1, Dynamic> CMT::MCBM::samplePrior(const MatrixXd& input) const {
	if(input.rows() != dimIn())
		throw Exception("Inputs have wrong dimensionality.");

	ArrayXXd featureEnergy = mWeights * (mFeatures.transpose() * input).array().square().matrix();
	ArrayXXd biasEnergy = mInputBias.transpose() * input;

	ArrayXXd predictorEnergy = mPredictors * input;

	ArrayXXd tmp0 = (featureEnergy + biasEnergy).colwise() + mPriors.array();
	ArrayXXd tmp1 = (tmp0 + predictorEnergy).colwise() + mOutputBias.array();

	ArrayXXd logPrior = tmp0 + tmp1;
	logPrior.rowwise() -= logSumExp(logPrior);

	ArrayXXd prior = logPrior.exp();

	Array<int, 1, Dynamic> labels(input.cols());

	#pragma omp parallel for
	for(int j = 0; j < input.cols(); ++j) {
		int i = 0;
		double urand = static_cast<double>(rand()) / (static_cast<long>(RAND_MAX) + 1l);
		double cdf;

		// compute index
		for(cdf = prior(0, j); cdf < urand; cdf += prior(i, j))
			++i;

		labels[j] = i;
	}

	return labels;
}
Beispiel #4
0
ArrayXXd CMT::BlobNonlinearity::operator()(const ArrayXXd& inputs) const {
	if(inputs.rows() != 1)
		throw Exception("Data has to be stored in one row.");

	ArrayXXd diff = ArrayXXd::Zero(mNumComponents, inputs.cols());
	diff.rowwise() += inputs.row(0);
	diff.colwise() -= mMeans;

	ArrayXXd negEnergy = diff.square().colwise() * (-mLogPrecisions.exp() / 2.);
	return (mLogWeights.exp().transpose().matrix() * negEnergy.exp().matrix()).array() + mEpsilon;
}
Beispiel #5
0
MatrixXd CMT::MLR::predict(const MatrixXd& input) const {
	if(input.rows() != mDimIn)
		throw Exception("Inputs have wrong dimensionality.");

	MatrixXd output = MatrixXd::Zero(mDimOut, input.cols());

	// distribution over outputs
	ArrayXXd prob = (mWeights * input).colwise() + mBiases;
	prob.rowwise() -= logSumExp(prob);
	prob = prob.exp();

	return prob;
}
Beispiel #6
0
ArrayXXd CMT::BlobNonlinearity::derivative(const ArrayXXd& inputs) const {
	if(inputs.rows() != 1)
		throw Exception("Data has to be stored in one row.");

	ArrayXXd diff = ArrayXXd::Zero(mNumComponents, inputs.cols());
	diff.rowwise() -= inputs.row(0);
	diff.colwise() += mMeans;

	ArrayXd precisions = mLogPrecisions.exp();

	ArrayXXd negEnergy = diff.square().colwise() * (-precisions / 2.);

	return (mLogWeights.exp() * precisions).transpose().matrix() * (diff * negEnergy.exp()).matrix();
}
Beispiel #7
0
Array<double, 1, Dynamic> CMT::MLR::logLikelihood(
	const MatrixXd& input,
	const MatrixXd& output) const
{
	if(input.cols() != output.cols())
		throw Exception("Number of inputs and outputs have to be the same.");
	if(input.rows() != mDimIn)
		throw Exception("Inputs have wrong dimensionality.");
	if(output.rows() != mDimOut)
		throw Exception("Output has wrong dimensionality.");

	// distribution over outputs
	ArrayXXd logProb = (mWeights * input).colwise() + mBiases;
	logProb.rowwise() -= logSumExp(logProb);

	return (logProb * output.array()).colwise().sum();
}
Beispiel #8
0
double CMT::MLR::parameterGradient(
	const MatrixXd& input,
	const MatrixXd& output,
	const lbfgsfloatval_t* x,
	lbfgsfloatval_t* g,
	const Trainable::Parameters& params_) const
{
	const Parameters& params = dynamic_cast<const Parameters&>(params_);

	MatrixXd weights = mWeights;
	VectorXd biases = mBiases;

	// copy parameters
	int k = 0;
	if(params.trainWeights)
		for(int i = 1; i < weights.rows(); ++i)
			for(int j = 0; j < weights.cols(); ++j, ++k)
				weights(i, j) = x[k];
	if(params.trainBiases)
		for(int i = 1; i < mBiases.rows(); ++i, ++k)
			biases[i] = x[k];

	// compute distribution over outputs
	ArrayXXd logProb = (weights * input).colwise() + biases;
	logProb.rowwise() -= logSumExp(logProb);

	// difference between prediction and actual output
	MatrixXd diff = (logProb.exp().matrix() - output);

	// compute gradients
	double normConst = output.cols() * log(2.);

	if(g) {
		int offset = 0;

		if(params.trainWeights) {
			Map<Matrix<double, Dynamic, Dynamic, RowMajor> > weightsGrad(g, mDimOut - 1, mDimIn);
			weightsGrad = (diff * input.transpose() / normConst).bottomRows(mDimOut - 1);
			offset += weightsGrad.size();

			weightsGrad += params.regularizeWeights.gradient(
				weights.bottomRows(mDimOut - 1).transpose()).transpose();
		}

		if(params.trainBiases) {
			VectorLBFGS biasesGrad(g + offset, mDimOut - 1);
			biasesGrad = diff.rowwise().sum().bottomRows(mDimOut - 1) / normConst;
			biasesGrad += params.regularizeBiases.gradient(biases);
		}
	}

	// return negative average log-likelihood in bits
	double value = -(logProb * output.array()).sum() / normConst;

	if(params.trainWeights)
		value += params.regularizeWeights.evaluate(weights.bottomRows(mDimOut - 1).transpose());

	if(params.trainBiases)
		value += params.regularizeBiases.evaluate(biases);

	return value;
}
Beispiel #9
0
bool CMT::Mixture::train(
	const MatrixXd& data,
	const MatrixXd& dataValid,
	const Parameters& parameters,
	const Component::Parameters& componentParameters)
{
	if(parameters.initialize && !initialized())
		initialize(data, parameters, componentParameters);

	ArrayXXd logJoint(numComponents(), data.cols());
	Array<double, Dynamic, 1> postSum;
	Array<double, 1, Dynamic> logLik;
	ArrayXXd post;
	ArrayXXd weights;

	// training and validation log-loss for checking convergence
	double avgLogLoss = numeric_limits<double>::infinity();
	double avgLogLossNew;
	double avgLogLossValid = evaluate(dataValid);
	double avgLogLossValidNew = avgLogLossValid;
	int counter = 0;

	// backup model parameters
	VectorXd priors = mPriors;
	vector<Component*> components;

	for(int k = 0; k < numComponents(); ++k)
		components.push_back(mComponents[k]->copy());

	for(int i = 0; i < parameters.maxIter; ++i) {
		// compute joint probability of data and assignments (E)
		#pragma omp parallel for
		for(int k = 0; k < numComponents(); ++k)
			logJoint.row(k) = mComponents[k]->logLikelihood(data) + log(mPriors[k]);

		// compute normalized posterior (E)
		logLik = logSumExp(logJoint);

		// average negative log-likelihood in bits per component
		avgLogLossNew = -logLik.mean() / log(2.) / dim();

		if(parameters.verbosity > 0) {
			if(i % parameters.valIter == 0) {
				// print training and validation error
				cout << setw(6) << i;
				cout << setw(14) << setprecision(7) << avgLogLossNew;
				cout << setw(14) << setprecision(7) << avgLogLossValidNew << endl;
			} else {
				// print training error
				cout << setw(6) << i << setw(14) << setprecision(7) << avgLogLossNew << endl;
			}
		}

		// test for convergence
		if(avgLogLoss - avgLogLossNew < parameters.threshold)
			return true;
		avgLogLoss = avgLogLossNew;

		// compute normalized posterior (E)
		post = (logJoint.rowwise() - logLik).exp();
		postSum = post.rowwise().sum();
		weights = post.colwise() / postSum;

		// optimize prior weights (M)
		if(parameters.trainPriors) {
			mPriors = postSum / data.cols() + parameters.regularizePriors;
			mPriors /= mPriors.sum();
		}

		// optimize components (M)
		if(parameters.trainComponents) {
			#pragma omp parallel for
			for(int k = 0; k < numComponents(); ++k)
				mComponents[k]->train(data, weights.row(k), componentParameters);
		} else {
			return true;
		}

		if((i + 1) % parameters.valIter == 0) {
			// check validation error
			avgLogLossValidNew = evaluate(dataValid);

			if(avgLogLossValidNew < avgLogLossValid) {
				// backup new found model parameters
				priors = mPriors;
				for(int k = 0; k < numComponents(); ++k)
					*components[k] = *mComponents[k];
				
				avgLogLossValid = avgLogLossValidNew;
			} else {
				counter++;

				if(parameters.valLookAhead > 0 && counter >= parameters.valLookAhead) {
					// set parameters to best parameters found during training
					mPriors = priors;

					for(int k = 0; k < numComponents(); ++k) {
						*mComponents[k] = *components[k];
						delete components[k];
					}

					return true;
				}
			}
		}
	}

	if(parameters.verbosity > 0)
		cout << setw(6) << parameters.maxIter << setw(11) << setprecision(5) << evaluate(data) << endl;

	return false;
}
Beispiel #10
0
bool CMT::Mixture::train(
	const MatrixXd& data,
	const Parameters& parameters,
	const Component::Parameters& componentParameters)
{
	if(data.rows() != dim())
		throw Exception("Data has wrong dimensionality.");

	if(parameters.initialize && !initialized())
		initialize(data, parameters, componentParameters);

	ArrayXXd logJoint(numComponents(), data.cols());
	Array<double, Dynamic, 1> postSum;
	Array<double, 1, Dynamic> logLik;
	ArrayXXd post;
	ArrayXXd weights;
	double avgLogLoss = numeric_limits<double>::infinity();
	double avgLogLossNew;

	for(int i = 0; i < parameters.maxIter; ++i) {
		// compute joint probability of data and assignments (E)
		#pragma omp parallel for
		for(int k = 0; k < numComponents(); ++k)
			logJoint.row(k) = mComponents[k]->logLikelihood(data) + log(mPriors[k]);

		// compute normalized posterior (E)
		logLik = logSumExp(logJoint);

		// average negative log-likelihood in bits per component
		avgLogLossNew = -logLik.mean() / log(2.) / dim();

		if(parameters.verbosity > 0)
			cout << setw(6) << i << setw(14) << setprecision(7) << avgLogLossNew << endl;

		// test for convergence
		if(avgLogLoss - avgLogLossNew < parameters.threshold)
			return true;
		avgLogLoss = avgLogLossNew;

		// compute normalized posterior (E)
		post = (logJoint.rowwise() - logLik).exp();
		postSum = post.rowwise().sum();
		weights = post.colwise() / postSum;

		// optimize prior weights (M)
		if(parameters.trainPriors) {
			mPriors = postSum / data.cols() + parameters.regularizePriors;
			mPriors /= mPriors.sum();
		}

		// optimize components (M)
		if(parameters.trainComponents) {
			#pragma omp parallel for
			for(int k = 0; k < numComponents(); ++k)
				mComponents[k]->train(data, weights.row(k), componentParameters);
		} else {
			return true;
		}
	}

	if(parameters.verbosity > 0)
		cout << setw(6) << parameters.maxIter << setw(14) << setprecision(7) << evaluate(data) << endl;

	return false;
}
bool KmeansClusterer::updateClusterCentersUntilConverged(RefArrayXXd sample, RefArrayXXd centers, 
                                                         RefArrayXd clusterSizes, vector<int> &clusterIndices,
                                                         double &sumOfDistancesToClosestCenter, double relTolerance)
{
    unsigned int Npoints = sample.cols();
    unsigned int Ndimensions = sample.rows();
    unsigned int Nclusters = centers.cols();
    ArrayXXd updatedCenters = ArrayXXd::Zero(Ndimensions, Nclusters);   // coordinates of each of the new cluster centers


    // Perform the k-means clustering iteration, each time improving the cluster centers,
    // and redetermining which points belongs to which cluster
    
    bool stopIterations = false;
    bool convergenceReached;
    unsigned int indexOfClosestCenter;
    double oldSumOfDistances = 0.0;
    double newSumOfDistances = 0.0;
    double distanceToClosestCenter;
    double distance; 

    while (!stopIterations)
    {
        // Find for each point the closest cluster center.
        // At the same time recompute/update the new cluster centers, which is simply
        // the barycenter of all points belonging to the cluster. 
    
        clusterSizes.setZero();
        updatedCenters.setZero();
    
        for (int n = 0; n < Npoints; ++n)
        {
            distanceToClosestCenter = numeric_limits<double>::max();
        
            for (int i = 0; i < Nclusters; ++i)
            {
                distance = metric.distance(sample.col(n), centers.col(i));
                
                if (distance < distanceToClosestCenter)
                {
                    indexOfClosestCenter = i;
                    distanceToClosestCenter = distance;
                }
            }
        
            newSumOfDistances += distanceToClosestCenter;
            updatedCenters.col(indexOfClosestCenter) += sample.col(n);
            clusterSizes(indexOfClosestCenter) += 1; 
            clusterIndices[n] = indexOfClosestCenter;        
        }
    

        // Assert that all clusters contain at least 2 points. If not we probably started
        // with an unfortunate set of initial cluster centers. Flag this by immediately 
        // returning false.
       
        if (!(clusterSizes > 1).all())
        {
            convergenceReached = false;
            return convergenceReached;
        }
       

        // Finish computing the new updated centers. Given the check above, we are sure
        // that none of the clusters is empty. 
        
        updatedCenters.rowwise() /= clusterSizes.transpose();
        centers = updatedCenters;
    

        // A new set of clusters has been determined.
        // Decide whether the algorithm has converged. Convergence occurs when
        // the sum of all distances of all points to their cluster center does
        // not change significantly anymore. 
        // Note: in order for this criterion to work properly, the coordinate
        //       space should be normalized, so that one particular coordinate
        //       cannot numerically dominate all other coordinates.
    
        if (oldSumOfDistances == 0.0)
        {
            // This is the first center-updating iteration, so there is nothing to compare yet.
            // Simply set the variables.
    
            oldSumOfDistances = newSumOfDistances;
            newSumOfDistances = 0.0;
        }
        else
        {
            // If the relative change in sumOfDistances between old and new was smaller than
            // the threshold set by the user, stop the iteration loop.
        
            if (fabs(newSumOfDistances - oldSumOfDistances) / oldSumOfDistances < relTolerance)
            {
                sumOfDistancesToClosestCenter = newSumOfDistances;      // will be returned to user
                stopIterations = true;
            }
            else
            {
                oldSumOfDistances = newSumOfDistances;
                newSumOfDistances = 0.0;
            }   
        }
    }  // end k-means center-updating loop 
    
    
    // Convergence was properly reached, so return
    
    convergenceReached = true;
    
    return convergenceReached;  
}
Beispiel #12
0
Array<double, 1, Dynamic> CMT::logMeanExp(const ArrayXXd& array) {
	Array<double, 1, Dynamic> arrayMax = array.colwise().maxCoeff() - 1.;
	return arrayMax + (array.rowwise() - arrayMax).exp().colwise().mean().log();
}
Beispiel #13
0
double CMT::MCBM::parameterGradient(
	const MatrixXd& inputCompl,
	const MatrixXd& outputCompl,
	const lbfgsfloatval_t* x,
	lbfgsfloatval_t* g,
	const Trainable::Parameters& params_) const
{
	const Parameters& params = dynamic_cast<const Parameters&>(params_);

	// average log-likelihood
	double logLik = 0.;

	// interpret memory for parameters and gradients
	lbfgsfloatval_t* y = const_cast<lbfgsfloatval_t*>(x);

	int offset = 0;

	VectorLBFGS priors(params.trainPriors ? y : const_cast<double*>(mPriors.data()), mNumComponents);
	VectorLBFGS priorsGrad(g, mNumComponents);
	if(params.trainPriors)
		offset += priors.size();

	MatrixLBFGS weights(params.trainWeights ? y + offset : const_cast<double*>(mWeights.data()), mNumComponents, mNumFeatures);
	MatrixLBFGS weightsGrad(g + offset, mNumComponents, mNumFeatures);
	if(params.trainWeights)
		offset += weights.size();

	MatrixLBFGS features(params.trainFeatures ? y + offset : const_cast<double*>(mFeatures.data()), mDimIn, mNumFeatures);
	MatrixLBFGS featuresGrad(g + offset, mDimIn, mNumFeatures);
	if(params.trainFeatures)
		offset += features.size();

	MatrixLBFGS predictors(params.trainPredictors ? y + offset : const_cast<double*>(mPredictors.data()), mNumComponents, mDimIn);
	MatrixLBFGS predictorsGrad(g + offset, mNumComponents, mDimIn);
	if(params.trainPredictors)
		offset += predictors.size();

	MatrixLBFGS inputBias(params.trainInputBias ? y + offset : const_cast<double*>(mInputBias.data()), mDimIn, mNumComponents);
	MatrixLBFGS inputBiasGrad(g + offset, mDimIn, mNumComponents);
	if(params.trainInputBias)
		offset += inputBias.size();

	VectorLBFGS outputBias(params.trainOutputBias ? y + offset : const_cast<double*>(mOutputBias.data()), mNumComponents);
	VectorLBFGS outputBiasGrad(g + offset, mNumComponents);
	if(params.trainOutputBias)
		offset += outputBias.size();

	if(g) {
		// initialize gradients
		if(params.trainPriors)
			priorsGrad.setZero();
		if(params.trainWeights)
			weightsGrad.setZero();
		if(params.trainFeatures)
			featuresGrad.setZero();
		if(params.trainPredictors)
			predictorsGrad.setZero();
		if(params.trainInputBias)
			inputBiasGrad.setZero();
		if(params.trainOutputBias)
			outputBiasGrad.setZero();
	}

	// split data into batches for better performance
	int numData = static_cast<int>(inputCompl.cols());
	int batchSize = min(max(params.batchSize, 10), numData);

	#pragma omp parallel for
	for(int b = 0; b < inputCompl.cols(); b += batchSize) {
		const MatrixXd& input = inputCompl.middleCols(b, min(batchSize, numData - b));
		const MatrixXd& output = outputCompl.middleCols(b, min(batchSize, numData - b));

		ArrayXXd featureOutput = features.transpose() * input;
		MatrixXd featureOutputSq = featureOutput.square();
		MatrixXd weightsOutput = weights * featureOutputSq;
		ArrayXXd predictorOutput = predictors * input;

		// unnormalized posteriors over components for both possible outputs
		ArrayXXd logPost0 = (weightsOutput + inputBias.transpose() * input).colwise() + priors;
		ArrayXXd logPost1 = (logPost0 + predictorOutput).colwise() + outputBias.array();

		// sum over components to get unnormalized probabilities of outputs
		Array<double, 1, Dynamic> logProb0 = logSumExp(logPost0);
		Array<double, 1, Dynamic> logProb1 = logSumExp(logPost1);
	
		// normalize posteriors over components
		logPost0.rowwise() -= logProb0;
		logPost1.rowwise() -= logProb1;

		// stack row vectors
		ArrayXXd logProb01(2, input.cols());
		logProb01 << logProb0, logProb1; 

		// normalize log-probabilities
		Array<double, 1, Dynamic> logNorm = logSumExp(logProb01);
		logProb1 -= logNorm;
		logProb0 -= logNorm;

		double logLikBatch = (output.array() * logProb1 + (1. - output.array()) * logProb0).sum();

		#pragma omp critical
		logLik += logLikBatch;

		if(!g)
			// don't compute gradients
			continue;

		Array<double, 1, Dynamic> tmp = output.array() * logProb0.exp() - (1. - output.array()) * logProb1.exp();

		ArrayXXd post0Tmp = logPost0.exp().rowwise() * tmp;
		ArrayXXd post1Tmp = logPost1.exp().rowwise() * tmp;
		ArrayXXd postDiffTmp = post1Tmp - post0Tmp;

		// update gradients
		if(params.trainPriors)
			#pragma omp critical
			priorsGrad -= postDiffTmp.rowwise().sum().matrix();

		if(params.trainWeights)
			#pragma omp critical
			weightsGrad -= postDiffTmp.matrix() * featureOutputSq.transpose();

		if(params.trainFeatures) {
			ArrayXXd tmp2 = weights.transpose() * postDiffTmp.matrix() * 2.;
			MatrixXd tmp3 = featureOutput * tmp2;
			#pragma omp critical
			featuresGrad -= input * tmp3.transpose();
		}

		if(params.trainPredictors)
			#pragma omp critical
			predictorsGrad -= post1Tmp.matrix() * input.transpose();

		if(params.trainInputBias)
			#pragma omp critical
			inputBiasGrad -= input * postDiffTmp.matrix().transpose();

		if(params.trainOutputBias)
			#pragma omp critical
			outputBiasGrad -= post1Tmp.rowwise().sum().matrix();
	}

	double normConst = inputCompl.cols() * log(2.) * dimOut();

	if(g) {
		for(int i = 0; i < offset; ++i)
			g[i] /= normConst;

		if(params.trainFeatures)
			featuresGrad += params.regularizeFeatures.gradient(features);

		if(params.trainPredictors)
			predictorsGrad += params.regularizePredictors.gradient(predictors.transpose()).transpose();

		if(params.trainWeights)
			weightsGrad += params.regularizeWeights.gradient(weights);
	}

	double value = -logLik / normConst;

	if(params.trainFeatures)
		value += params.regularizeFeatures.evaluate(features);

	if(params.trainPredictors)
		value += params.regularizePredictors.evaluate(predictors.transpose());

	if(params.trainWeights)
		value += params.regularizeWeights.evaluate(weights);

	return value;
}
Beispiel #14
0
pair<pair<ArrayXXd, ArrayXXd>, Array<double, 1, Dynamic> > CMT::STM::computeDataGradient(
	const MatrixXd& input,
	const MatrixXd& output) const
{
	// make sure nonlinearity is differentiable
	DifferentiableNonlinearity* nonlinearity =
		dynamic_cast<DifferentiableNonlinearity*>(mNonlinearity);
	if(!nonlinearity)
		throw Exception("Nonlinearity has to be differentiable.");

	if(input.rows() != dimIn())
		throw Exception("Input has wrong dimensionality.");
	if(output.rows() != 1)
		throw Exception("Output has wrong dimensionality.");
	if(input.cols() != output.cols())
		throw Exception("Number of inputs and outputs should be the same.");

	if(dimInNonlinear() && !dimInLinear()) {
		Array<double, 1, Dynamic> responses;

		ArrayXXd jointEnergy;

		if(numFeatures() > 0)
			jointEnergy = mWeights * (mFeatures.transpose() * input).array().square().matrix()
				+ mPredictors * input;
		else
			jointEnergy = mPredictors * input;
		jointEnergy.colwise() += mBiases.array();
		jointEnergy *= mSharpness;

		responses = logSumExp(jointEnergy);

		// posterior over components for each input
		MatrixXd posterior = (jointEnergy.rowwise() - responses).array().exp();

		responses /= mSharpness;

		Array<double, 1, Dynamic> tmp0 = (*mNonlinearity)(responses);
		Array<double, 1, Dynamic> tmp1 = -mDistribution->gradient(output, tmp0);
		Array<double, 1, Dynamic> tmp2 = nonlinearity->derivative(responses);

		ArrayXXd avgPredictor = mPredictors.transpose() * posterior;

		ArrayXXd tmp3;
		if(numFeatures() > 0) {
			ArrayXXd avgWeights = (2. * mWeights).transpose() * posterior;
			tmp3 = mFeatures * (avgWeights * (mFeatures.transpose() * input).array()).matrix();
		} else {
			tmp3 = ArrayXXd::Zero(avgPredictor.rows(), avgPredictor.cols());
		}

		return make_pair(
			make_pair(
				(tmp3 + avgPredictor).rowwise() * (tmp1 * tmp2),
				ArrayXXd::Zero(output.rows(), output.cols())),
			mDistribution->logLikelihood(output, tmp0));

	} else if(dimInNonlinear() && dimInLinear()) {
		// split inputs into linear and nonlinear components
		MatrixXd inputNonlinear = input.topRows(dimInNonlinear());
		MatrixXd inputLinear = input.bottomRows(dimInLinear());

		Array<double, 1, Dynamic> responses;

		ArrayXXd jointEnergy;

		if(numFeatures() > 0)
			jointEnergy = mWeights * (mFeatures.transpose() * inputNonlinear).array().square().matrix()
				+ mPredictors * input;
		else
			jointEnergy = mPredictors * inputNonlinear;
		jointEnergy.colwise() += mBiases.array();
		jointEnergy *= mSharpness;

		responses = logSumExp(jointEnergy);

		// posterior over components for each input
		MatrixXd posterior = (jointEnergy.rowwise() - responses).array().exp();

		responses /= mSharpness;
		responses += (mLinearPredictor.transpose() * inputLinear).array();

		Array<double, 1, Dynamic> tmp0 = (*mNonlinearity)(responses);
		Array<double, 1, Dynamic> tmp1 = -mDistribution->gradient(output, tmp0);
		Array<double, 1, Dynamic> tmp2 = nonlinearity->derivative(responses);

		ArrayXXd avgPredictor = mPredictors.transpose() * posterior;

		ArrayXXd tmp3;
		if(numFeatures() > 0) {
			ArrayXXd avgWeights = (2. * mWeights).transpose() * posterior;
			tmp3 = mFeatures * (avgWeights * (mFeatures.transpose() * inputNonlinear).array()).matrix();
		} else {
			tmp3 = ArrayXXd::Zero(avgPredictor.rows(), avgPredictor.cols());
		}

		// concatenate gradients of nonlinear and linear component
		ArrayXXd inputGradient(dimIn(), input.cols());
		inputGradient << 
			(tmp3 + avgPredictor).rowwise() * (tmp1 * tmp2),
			mLinearPredictor * (tmp1 * tmp2).matrix();

		return make_pair(
			make_pair(
				inputGradient,
				ArrayXXd::Zero(output.rows(), output.cols())),
			mDistribution->logLikelihood(output, tmp0));

	} else if(dimInLinear()) {
		double avgBias = logSumExp(mSharpness * mBiases)(0, 0) / mSharpness;
		Array<double, 1, Dynamic> responses = (mLinearPredictor.transpose() * input).array() + avgBias;

		Array<double, 1, Dynamic> tmp0 = (*mNonlinearity)(responses);
		Array<double, 1, Dynamic> tmp1 = -mDistribution->gradient(output, tmp0);
		Array<double, 1, Dynamic> tmp2 = nonlinearity->derivative(responses);

		return make_pair(
			make_pair(
				mLinearPredictor * (tmp1 * tmp2).matrix(),
				ArrayXXd::Zero(output.rows(), output.cols())),
			mDistribution->logLikelihood(output, tmp0));
	}

	return make_pair(
		make_pair(
			ArrayXXd::Zero(input.rows(), input.cols()),
			ArrayXXd::Zero(output.rows(), output.cols())),
		logLikelihood(input, output));
}