ArrayXXd CMT::BlobNonlinearity::gradient(const ArrayXXd& inputs) const {
	if(inputs.rows() != 1)
		throw Exception("Data has to be stored in one row.");

	ArrayXXd diff = ArrayXXd::Zero(mNumComponents, inputs.cols());
	diff.rowwise() += inputs.row(0);
	diff.colwise() -= mMeans;

	ArrayXXd diffSq = diff.square();

	ArrayXd precisions = mLogPrecisions.exp();
	ArrayXd weights = mLogWeights.exp();

	ArrayXXd negEnergy = diffSq.colwise() * (-precisions / 2.);
	ArrayXXd negEnergyExp = negEnergy.exp();

	ArrayXXd gradient(3 * mNumComponents, inputs.cols());

	// gradient of means
	gradient.topRows(mNumComponents) =
		(diff * negEnergyExp).colwise() * (weights * precisions);

	// gradient of log-precisions
	gradient.middleRows(mNumComponents, mNumComponents) =
		(diffSq / 2. * negEnergyExp).colwise() * (-weights * precisions);

	// gradient of log-weights
	gradient.bottomRows(mNumComponents) = negEnergyExp.colwise() * weights;

	return gradient;
}
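// For reference, with w_k = exp(mLogWeights[k]), lambda_k = exp(mLogPrecisions[k]),
// d_k = x - mMeans[k], and f the nonlinearity evaluated by operator() below, the
// three row blocks returned above are
//
//	df/dmMeans[k]         =  w_k * lambda_k * d_k * exp(-lambda_k * d_k^2 / 2)
//	df/dmLogPrecisions[k] = -w_k * lambda_k * d_k^2 / 2 * exp(-lambda_k * d_k^2 / 2)
//	df/dmLogWeights[k]    =  w_k * exp(-lambda_k * d_k^2 / 2)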
ArrayXXd CMT::BlobNonlinearity::operator()(const ArrayXXd& inputs) const {
	if(inputs.rows() != 1)
		throw Exception("Data has to be stored in one row.");

	ArrayXXd diff = ArrayXXd::Zero(mNumComponents, inputs.cols());
	diff.rowwise() += inputs.row(0);
	diff.colwise() -= mMeans;

	ArrayXXd negEnergy = diff.square().colwise() * (-mLogPrecisions.exp() / 2.);

	return (mLogWeights.exp().transpose().matrix() * negEnergy.exp().matrix()).array() + mEpsilon;
}
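// The nonlinearity evaluated above is a sum of Gaussian blobs on top of a small
// offset,
//
//	f(x) = mEpsilon + sum_k exp(mLogWeights[k]) * exp(-exp(mLogPrecisions[k]) / 2 * (x - mMeans[k])^2),
//
// which gradient() differentiates with respect to the parameters and
// derivative() with respect to the input.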
ArrayXXd CMT::BlobNonlinearity::derivative(const ArrayXXd& inputs) const {
	if(inputs.rows() != 1)
		throw Exception("Data has to be stored in one row.");

	ArrayXXd diff = ArrayXXd::Zero(mNumComponents, inputs.cols());
	diff.rowwise() -= inputs.row(0);
	diff.colwise() += mMeans;

	ArrayXd precisions = mLogPrecisions.exp();

	ArrayXXd negEnergy = diff.square().colwise() * (-precisions / 2.);

	return (mLogWeights.exp() * precisions).transpose().matrix() * (diff * negEnergy.exp()).matrix();
}
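// Note that diff above is mMeans[k] - x (the opposite sign of the other two
// methods), so the result is
//
//	f'(x) = sum_k w_k * lambda_k * (mMeans[k] - x) * exp(-lambda_k * (x - mMeans[k])^2 / 2)
//
// without an explicit sign flip.
//
// A minimal finite-difference check sketch of derivative() against operator()().
// The constructor call below is an assumption and not part of this file:
//
//	CMT::BlobNonlinearity blob(3, 1e-12);    // hypothetical: numComponents, epsilon
//	ArrayXXd x = ArrayXXd::Random(1, 100);   // inputs stored in one row
//	double h = 1e-5;
//	ArrayXXd fd = (blob(x + h) - blob(x - h)) / (2. * h);  // central differences
//	double maxErr = (fd - blob.derivative(x)).abs().maxCoeff();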
bool CMT::Mixture::train(
	const MatrixXd& data,
	const MatrixXd& dataValid,
	const Parameters& parameters,
	const Component::Parameters& componentParameters)
{
	if(parameters.initialize && !initialized())
		initialize(data, parameters, componentParameters);

	ArrayXXd logJoint(numComponents(), data.cols());
	Array<double, Dynamic, 1> postSum;
	Array<double, 1, Dynamic> logLik;
	ArrayXXd post;
	ArrayXXd weights;

	// training and validation log-loss for checking convergence
	double avgLogLoss = numeric_limits<double>::infinity();
	double avgLogLossNew;
	double avgLogLossValid = evaluate(dataValid);
	double avgLogLossValidNew = avgLogLossValid;
	int counter = 0;

	// back up model parameters
	VectorXd priors = mPriors;
	vector<Component*> components;
	for(int k = 0; k < numComponents(); ++k)
		components.push_back(mComponents[k]->copy());

	for(int i = 0; i < parameters.maxIter; ++i) {
		// compute joint probability of data and assignments (E)
		#pragma omp parallel for
		for(int k = 0; k < numComponents(); ++k)
			logJoint.row(k) = mComponents[k]->logLikelihood(data) + log(mPriors[k]);

		// compute marginal log-likelihood of data points (E)
		logLik = logSumExp(logJoint);

		// average negative log-likelihood in bits per data dimension
		avgLogLossNew = -logLik.mean() / log(2.) / dim();

		if(parameters.verbosity > 0) {
			if(i % parameters.valIter == 0) {
				// print training and validation error
				cout << setw(6) << i;
				cout << setw(14) << setprecision(7) << avgLogLossNew;
				cout << setw(14) << setprecision(7) << avgLogLossValidNew << endl;
			} else {
				// print training error
				cout << setw(6) << i << setw(14) << setprecision(7) << avgLogLossNew << endl;
			}
		}

		// test for convergence
		if(avgLogLoss - avgLogLossNew < parameters.threshold)
			return true;
		avgLogLoss = avgLogLossNew;

		// compute normalized posterior (E)
		post = (logJoint.rowwise() - logLik).exp();
		postSum = post.rowwise().sum();
		weights = post.colwise() / postSum;

		// optimize prior weights (M)
		if(parameters.trainPriors) {
			mPriors = postSum / data.cols() + parameters.regularizePriors;
			mPriors /= mPriors.sum();
		}

		// optimize components (M)
		if(parameters.trainComponents) {
			#pragma omp parallel for
			for(int k = 0; k < numComponents(); ++k)
				mComponents[k]->train(data, weights.row(k), componentParameters);
		} else {
			return true;
		}

		if((i + 1) % parameters.valIter == 0) {
			// check validation error
			avgLogLossValidNew = evaluate(dataValid);

			if(avgLogLossValidNew < avgLogLossValid) {
				// back up newly found best model parameters
				priors = mPriors;
				for(int k = 0; k < numComponents(); ++k)
					*components[k] = *mComponents[k];
				avgLogLossValid = avgLogLossValidNew;
			} else {
				counter++;

				if(parameters.valLookAhead > 0 && counter >= parameters.valLookAhead) {
					// set parameters to best parameters found during training
					mPriors = priors;

					for(int k = 0; k < numComponents(); ++k) {
						*mComponents[k] = *components[k];
						delete components[k];
					}

					return true;
				}
			}
		}
	}

	if(parameters.verbosity > 0)
		cout << setw(6) << parameters.maxIter << setw(11) << setprecision(5) << evaluate(data) << endl;

	return false;
}
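// The loop above is standard EM with early stopping on a validation set: the
// responsibilities gamma_{kn} = p(k | x_n) = pi_k p_k(x_n) / sum_j pi_j p_j(x_n)
// are the columns of post, the priors are re-estimated as
//
//	pi_k = (1/N * sum_n gamma_{kn} + regularizePriors) / (normalizing constant),
//
// and each component is refit on the data weighted by its responsibilities
// (weights.row(k), i.e. the responsibilities rescaled to sum to one over the
// data points). Every valIter iterations the validation loss is checked, and
// after valLookAhead checks without improvement the best parameters seen so
// far are restored.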
bool CMT::Mixture::train(
	const MatrixXd& data,
	const Parameters& parameters,
	const Component::Parameters& componentParameters)
{
	if(data.rows() != dim())
		throw Exception("Data has wrong dimensionality.");

	if(parameters.initialize && !initialized())
		initialize(data, parameters, componentParameters);

	ArrayXXd logJoint(numComponents(), data.cols());
	Array<double, Dynamic, 1> postSum;
	Array<double, 1, Dynamic> logLik;
	ArrayXXd post;
	ArrayXXd weights;

	double avgLogLoss = numeric_limits<double>::infinity();
	double avgLogLossNew;

	for(int i = 0; i < parameters.maxIter; ++i) {
		// compute joint probability of data and assignments (E)
		#pragma omp parallel for
		for(int k = 0; k < numComponents(); ++k)
			logJoint.row(k) = mComponents[k]->logLikelihood(data) + log(mPriors[k]);

		// compute marginal log-likelihood of data points (E)
		logLik = logSumExp(logJoint);

		// average negative log-likelihood in bits per data dimension
		avgLogLossNew = -logLik.mean() / log(2.) / dim();

		if(parameters.verbosity > 0)
			cout << setw(6) << i << setw(14) << setprecision(7) << avgLogLossNew << endl;

		// test for convergence
		if(avgLogLoss - avgLogLossNew < parameters.threshold)
			return true;
		avgLogLoss = avgLogLossNew;

		// compute normalized posterior (E)
		post = (logJoint.rowwise() - logLik).exp();
		postSum = post.rowwise().sum();
		weights = post.colwise() / postSum;

		// optimize prior weights (M)
		if(parameters.trainPriors) {
			mPriors = postSum / data.cols() + parameters.regularizePriors;
			mPriors /= mPriors.sum();
		}

		// optimize components (M)
		if(parameters.trainComponents) {
			#pragma omp parallel for
			for(int k = 0; k < numComponents(); ++k)
				mComponents[k]->train(data, weights.row(k), componentParameters);
		} else {
			return true;
		}
	}

	if(parameters.verbosity > 0)
		cout << setw(6) << parameters.maxIter << setw(14) << setprecision(7) << evaluate(data) << endl;

	return false;
}
Array<double, 1, Dynamic> CMT::logMeanExp(const ArrayXXd& array) {
	Array<double, 1, Dynamic> arrayMax = array.colwise().maxCoeff() - 1.;
	return arrayMax + (array.rowwise() - arrayMax).exp().colwise().mean().log();
}
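// logMeanExp computes log(mean(exp(x))) for each column in a numerically
// stable way: with m = max(x) - 1 per column,
//
//	log(mean(exp(x))) = m + log(mean(exp(x - m))),
//
// so exp() is only applied to arguments bounded above by 1 and cannot overflow.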
pair<pair<ArrayXXd, ArrayXXd>, Array<double, 1, Dynamic> > CMT::STM::computeDataGradient(
	const MatrixXd& input,
	const MatrixXd& output) const
{
	// make sure nonlinearity is differentiable
	DifferentiableNonlinearity* nonlinearity =
		dynamic_cast<DifferentiableNonlinearity*>(mNonlinearity);

	if(!nonlinearity)
		throw Exception("Nonlinearity has to be differentiable.");

	if(input.rows() != dimIn())
		throw Exception("Input has wrong dimensionality.");
	if(output.rows() != 1)
		throw Exception("Output has wrong dimensionality.");
	if(input.cols() != output.cols())
		throw Exception("Number of inputs and outputs should be the same.");

	if(dimInNonlinear() && !dimInLinear()) {
		Array<double, 1, Dynamic> responses;
		ArrayXXd jointEnergy;

		if(numFeatures() > 0)
			jointEnergy = mWeights * (mFeatures.transpose() * input).array().square().matrix()
				+ mPredictors * input;
		else
			jointEnergy = mPredictors * input;

		jointEnergy.colwise() += mBiases.array();
		jointEnergy *= mSharpness;

		responses = logSumExp(jointEnergy);

		// posterior over components for each input
		MatrixXd posterior = (jointEnergy.rowwise() - responses).array().exp();

		responses /= mSharpness;

		Array<double, 1, Dynamic> tmp0 = (*mNonlinearity)(responses);
		Array<double, 1, Dynamic> tmp1 = -mDistribution->gradient(output, tmp0);
		Array<double, 1, Dynamic> tmp2 = nonlinearity->derivative(responses);

		ArrayXXd avgPredictor = mPredictors.transpose() * posterior;

		ArrayXXd tmp3;
		if(numFeatures() > 0) {
			ArrayXXd avgWeights = (2. * mWeights).transpose() * posterior;
			tmp3 = mFeatures * (avgWeights * (mFeatures.transpose() * input).array()).matrix();
		} else {
			tmp3 = ArrayXXd::Zero(avgPredictor.rows(), avgPredictor.cols());
		}

		return make_pair(
			make_pair(
				(tmp3 + avgPredictor).rowwise() * (tmp1 * tmp2),
				ArrayXXd::Zero(output.rows(), output.cols())),
			mDistribution->logLikelihood(output, tmp0));

	} else if(dimInNonlinear() && dimInLinear()) {
		// split inputs into linear and nonlinear components
		MatrixXd inputNonlinear = input.topRows(dimInNonlinear());
		MatrixXd inputLinear = input.bottomRows(dimInLinear());

		Array<double, 1, Dynamic> responses;
		ArrayXXd jointEnergy;

		if(numFeatures() > 0)
			jointEnergy = mWeights * (mFeatures.transpose() * inputNonlinear).array().square().matrix()
				+ mPredictors * inputNonlinear;
		else
			jointEnergy = mPredictors * inputNonlinear;

		jointEnergy.colwise() += mBiases.array();
		jointEnergy *= mSharpness;

		responses = logSumExp(jointEnergy);

		// posterior over components for each input
		MatrixXd posterior = (jointEnergy.rowwise() - responses).array().exp();

		responses /= mSharpness;
		responses += (mLinearPredictor.transpose() * inputLinear).array();

		Array<double, 1, Dynamic> tmp0 = (*mNonlinearity)(responses);
		Array<double, 1, Dynamic> tmp1 = -mDistribution->gradient(output, tmp0);
		Array<double, 1, Dynamic> tmp2 = nonlinearity->derivative(responses);

		ArrayXXd avgPredictor = mPredictors.transpose() * posterior;

		ArrayXXd tmp3;
		if(numFeatures() > 0) {
			ArrayXXd avgWeights = (2. * mWeights).transpose() * posterior;
			tmp3 = mFeatures * (avgWeights * (mFeatures.transpose() * inputNonlinear).array()).matrix();
		} else {
			tmp3 = ArrayXXd::Zero(avgPredictor.rows(), avgPredictor.cols());
		}

		// concatenate gradients of nonlinear and linear components
		ArrayXXd inputGradient(dimIn(), input.cols());
		inputGradient <<
			(tmp3 + avgPredictor).rowwise() * (tmp1 * tmp2),
			mLinearPredictor * (tmp1 * tmp2).matrix();

		return make_pair(
			make_pair(
				inputGradient,
				ArrayXXd::Zero(output.rows(), output.cols())),
			mDistribution->logLikelihood(output, tmp0));

	} else if(dimInLinear()) {
		double avgBias = logSumExp(mSharpness * mBiases)(0, 0) / mSharpness;

		Array<double, 1, Dynamic> responses = (mLinearPredictor.transpose() * input).array() + avgBias;

		Array<double, 1, Dynamic> tmp0 = (*mNonlinearity)(responses);
		Array<double, 1, Dynamic> tmp1 = -mDistribution->gradient(output, tmp0);
		Array<double, 1, Dynamic> tmp2 = nonlinearity->derivative(responses);

		return make_pair(
			make_pair(
				mLinearPredictor * (tmp1 * tmp2).matrix(),
				ArrayXXd::Zero(output.rows(), output.cols())),
			mDistribution->logLikelihood(output, tmp0));
	}

	return make_pair(
		make_pair(
			ArrayXXd::Zero(input.rows(), input.cols()),
			ArrayXXd::Zero(output.rows(), output.cols())),
		logLikelihood(input, output));
}
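// All three branches above evaluate the same chain rule. With F = mFeatures,
// w_k and u_k the rows of mWeights and mPredictors, b_k the entries of mBiases,
// v = mLinearPredictor, and x_N, x_L the nonlinear and linear parts of the
// input, the pooled response is
//
//	r(x) = 1/mSharpness * log sum_k exp(mSharpness * (w_k' (F' x_N)^2 + u_k' x_N + b_k)) + v' x_L,
//
// where the square is taken elementwise. The returned input gradient is
// (dr/dx) * g'(r) * tmp1, with g the output nonlinearity (tmp2 = g'(r)) and
// tmp1 the negated value returned by mDistribution->gradient(output, g(r));
// avgPredictor and tmp3 together form dr/dx_N averaged under the component
// posterior, and mLinearPredictor is dr/dx_L. The gradient with respect to the
// output is not computed and is returned as zeros.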