void CMT::WhiteningTransform::initialize(const ArrayXXd& input, int dimOut) {
    if(input.cols() < input.rows())
        throw Exception("Too few inputs to compute whitening transform.");

    mMeanIn = input.rowwise().mean();

    // compute covariances
    MatrixXd covXX = covariance(input);

    // input whitening
    SelfAdjointEigenSolver<MatrixXd> eigenSolver;
    eigenSolver.compute(covXX);

    Array<double, 1, Dynamic> eigenvalues = eigenSolver.eigenvalues();
    MatrixXd eigenvectors = eigenSolver.eigenvectors();

    // don't whiten directions with near-zero variance
    for(int i = 0; i < eigenvalues.size(); ++i)
        if(eigenvalues[i] < 1e-7)
            eigenvalues[i] = 1.;

    mPreIn = (eigenvectors.array().rowwise() * eigenvalues.sqrt().cwiseInverse()).matrix()
        * eigenvectors.transpose();
    mPreInInv = (eigenvectors.array().rowwise() * eigenvalues.sqrt()).matrix()
        * eigenvectors.transpose();

    mMeanOut = VectorXd::Zero(dimOut);
    mPreOut = MatrixXd::Identity(dimOut, dimOut);
    mPreOutInv = MatrixXd::Identity(dimOut, dimOut);
    mPredictor = MatrixXd::Zero(dimOut, input.rows());
    mGradTransform = MatrixXd::Zero(dimOut, input.rows());
    mLogJacobian = 1.;
}
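In matrix form, the input transform computed above is symmetric (ZCA) whitening: with the eigendecomposition $\operatorname{Cov}(x) = E \Lambda E^\top$ of the input covariance, the code sets

$$W_{\mathrm{in}} = E\,\Lambda^{-1/2}E^\top, \qquad W_{\mathrm{in}}^{-1} = E\,\Lambda^{1/2}E^\top,$$

so that $W_{\mathrm{in}}(x - \bar{x})$ has approximately identity covariance. Eigenvalues below $10^{-7}$ are replaced by 1, leaving near-degenerate directions unwhitened, and the output side of the transform is left as the identity.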
ArrayXXd CMT::BlobNonlinearity::gradient(const ArrayXXd& inputs) const {
    if(inputs.rows() != 1)
        throw Exception("Data has to be stored in one row.");

    ArrayXXd diff = ArrayXXd::Zero(mNumComponents, inputs.cols());
    diff.rowwise() += inputs.row(0);
    diff.colwise() -= mMeans;

    ArrayXXd diffSq = diff.square();
    ArrayXd precisions = mLogPrecisions.exp();
    ArrayXd weights = mLogWeights.exp();

    ArrayXXd negEnergy = diffSq.colwise() * (-precisions / 2.);
    ArrayXXd negEnergyExp = negEnergy.exp();

    ArrayXXd gradient(3 * mNumComponents, inputs.cols());

    // gradient of mean
    gradient.topRows(mNumComponents) =
        (diff * negEnergyExp).colwise() * (weights * precisions);

    // gradient of log-precisions
    gradient.middleRows(mNumComponents, mNumComponents) =
        (diffSq / 2. * negEnergyExp).colwise() * (-weights * precisions);

    // gradient of log-weights
    gradient.bottomRows(mNumComponents) = negEnergyExp.colwise() * weights;

    return gradient;
}
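For reference, the function implemented by BlobNonlinearity (see operator() below) and the parameter gradients computed above are, writing $w_k = e^{\log w_k}$ and $p_k = e^{\log p_k}$,

$$f(x) = \epsilon + \sum_k w_k\, e^{-\frac{p_k}{2}(x - m_k)^2},$$

$$\frac{\partial f}{\partial m_k} = w_k p_k (x - m_k)\,e^{-\frac{p_k}{2}(x - m_k)^2}, \qquad
\frac{\partial f}{\partial \log p_k} = -\frac{w_k p_k}{2}(x - m_k)^2\,e^{-\frac{p_k}{2}(x - m_k)^2}, \qquad
\frac{\partial f}{\partial \log w_k} = w_k\,e^{-\frac{p_k}{2}(x - m_k)^2}.$$

The rows of the returned gradient stack these three blocks in the same order (means, log-precisions, log-weights).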
Array<int, 1, Dynamic> CMT::MCBM::samplePrior(const MatrixXd& input) const {
    if(input.rows() != dimIn())
        throw Exception("Inputs have wrong dimensionality.");

    ArrayXXd featureEnergy = mWeights * (mFeatures.transpose() * input).array().square().matrix();
    ArrayXXd biasEnergy = mInputBias.transpose() * input;
    ArrayXXd predictorEnergy = mPredictors * input;

    ArrayXXd tmp0 = (featureEnergy + biasEnergy).colwise() + mPriors.array();
    ArrayXXd tmp1 = (tmp0 + predictorEnergy).colwise() + mOutputBias.array();

    ArrayXXd logPrior = tmp0 + tmp1;
    logPrior.rowwise() -= logSumExp(logPrior);

    ArrayXXd prior = logPrior.exp();

    Array<int, 1, Dynamic> labels(input.cols());

    #pragma omp parallel for
    for(int j = 0; j < input.cols(); ++j) {
        int i = 0;
        double urand = static_cast<double>(rand()) / (static_cast<long>(RAND_MAX) + 1l);
        double cdf;

        // compute index
        for(cdf = prior(0, j); cdf < urand; cdf += prior(i, j))
            ++i;

        labels[j] = i;
    }

    return labels;
}
ArrayXXd CMT::BlobNonlinearity::operator()(const ArrayXXd& inputs) const {
    if(inputs.rows() != 1)
        throw Exception("Data has to be stored in one row.");

    ArrayXXd diff = ArrayXXd::Zero(mNumComponents, inputs.cols());
    diff.rowwise() += inputs.row(0);
    diff.colwise() -= mMeans;

    ArrayXXd negEnergy = diff.square().colwise() * (-mLogPrecisions.exp() / 2.);

    return (mLogWeights.exp().transpose().matrix() * negEnergy.exp().matrix()).array() + mEpsilon;
}
MatrixXd CMT::MLR::predict(const MatrixXd& input) const {
    if(input.rows() != mDimIn)
        throw Exception("Inputs have wrong dimensionality.");

    // distribution over outputs
    ArrayXXd prob = (mWeights * input).colwise() + mBiases;
    prob.rowwise() -= logSumExp(prob);
    prob = prob.exp();

    return prob;
}
ArrayXXd CMT::BlobNonlinearity::derivative(const ArrayXXd& inputs) const {
    if(inputs.rows() != 1)
        throw Exception("Data has to be stored in one row.");

    ArrayXXd diff = ArrayXXd::Zero(mNumComponents, inputs.cols());
    diff.rowwise() -= inputs.row(0);
    diff.colwise() += mMeans;

    ArrayXd precisions = mLogPrecisions.exp();

    ArrayXXd negEnergy = diff.square().colwise() * (-precisions / 2.);

    return (mLogWeights.exp() * precisions).transpose().matrix() * (diff * negEnergy.exp()).matrix();
}
Array<double, 1, Dynamic> CMT::MLR::logLikelihood(
    const MatrixXd& input,
    const MatrixXd& output) const
{
    if(input.cols() != output.cols())
        throw Exception("Number of inputs and outputs have to be the same.");
    if(input.rows() != mDimIn)
        throw Exception("Inputs have wrong dimensionality.");
    if(output.rows() != mDimOut)
        throw Exception("Output has wrong dimensionality.");

    // distribution over outputs
    ArrayXXd logProb = (mWeights * input).colwise() + mBiases;
    logProb.rowwise() -= logSumExp(logProb);

    return (logProb * output.array()).colwise().sum();
}
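For one-hot output columns $y$, the value returned per column is the multinomial logistic (softmax) log-probability

$$\log p(y \mid x) = \sum_i y_i \Big( w_i^\top x + b_i - \log \sum_j e^{\,w_j^\top x + b_j} \Big),$$

which is also the quantity whose negative average (in bits) is minimized by parameterGradient below.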
double CMT::MLR::parameterGradient(
    const MatrixXd& input,
    const MatrixXd& output,
    const lbfgsfloatval_t* x,
    lbfgsfloatval_t* g,
    const Trainable::Parameters& params_) const
{
    const Parameters& params = dynamic_cast<const Parameters&>(params_);

    MatrixXd weights = mWeights;
    VectorXd biases = mBiases;

    // copy parameters (the first row of weights and the first bias stay fixed)
    int k = 0;

    if(params.trainWeights)
        for(int i = 1; i < weights.rows(); ++i)
            for(int j = 0; j < weights.cols(); ++j, ++k)
                weights(i, j) = x[k];

    if(params.trainBiases)
        for(int i = 1; i < mBiases.rows(); ++i, ++k)
            biases[i] = x[k];

    // compute distribution over outputs
    ArrayXXd logProb = (weights * input).colwise() + biases;
    logProb.rowwise() -= logSumExp(logProb);

    // difference between prediction and actual output
    MatrixXd diff = (logProb.exp().matrix() - output);

    // compute gradients
    double normConst = output.cols() * log(2.);

    if(g) {
        int offset = 0;

        if(params.trainWeights) {
            Map<Matrix<double, Dynamic, Dynamic, RowMajor> > weightsGrad(g, mDimOut - 1, mDimIn);
            weightsGrad = (diff * input.transpose() / normConst).bottomRows(mDimOut - 1);
            offset += weightsGrad.size();

            weightsGrad += params.regularizeWeights.gradient(
                weights.bottomRows(mDimOut - 1).transpose()).transpose();
        }

        if(params.trainBiases) {
            VectorLBFGS biasesGrad(g + offset, mDimOut - 1);
            biasesGrad = diff.rowwise().sum().bottomRows(mDimOut - 1) / normConst;
            // only the trainable biases are regularized; using the full bias vector here
            // would produce a size mismatch with the gradient map
            biasesGrad += params.regularizeBiases.gradient(biases.bottomRows(mDimOut - 1));
        }
    }

    // return negative average log-likelihood in bits
    double value = -(logProb * output.array()).sum() / normConst;

    if(params.trainWeights)
        value += params.regularizeWeights.evaluate(weights.bottomRows(mDimOut - 1).transpose());
    if(params.trainBiases)
        value += params.regularizeBiases.evaluate(biases);

    return value;
}
bool CMT::Mixture::train(
    const MatrixXd& data,
    const MatrixXd& dataValid,
    const Parameters& parameters,
    const Component::Parameters& componentParameters)
{
    if(parameters.initialize && !initialized())
        initialize(data, parameters, componentParameters);

    ArrayXXd logJoint(numComponents(), data.cols());
    Array<double, Dynamic, 1> postSum;
    Array<double, 1, Dynamic> logLik;
    ArrayXXd post;
    ArrayXXd weights;

    // training and validation log-loss for checking convergence
    double avgLogLoss = numeric_limits<double>::infinity();
    double avgLogLossNew;
    double avgLogLossValid = evaluate(dataValid);
    double avgLogLossValidNew = avgLogLossValid;
    int counter = 0;

    // backup model parameters
    VectorXd priors = mPriors;
    vector<Component*> components;
    for(int k = 0; k < numComponents(); ++k)
        components.push_back(mComponents[k]->copy());

    for(int i = 0; i < parameters.maxIter; ++i) {
        // compute joint probability of data and assignments (E)
        #pragma omp parallel for
        for(int k = 0; k < numComponents(); ++k)
            logJoint.row(k) = mComponents[k]->logLikelihood(data) + log(mPriors[k]);

        // compute normalized posterior (E)
        logLik = logSumExp(logJoint);

        // average negative log-likelihood in bits per component
        avgLogLossNew = -logLik.mean() / log(2.) / dim();

        if(parameters.verbosity > 0) {
            if(i % parameters.valIter == 0) {
                // print training and validation error
                cout << setw(6) << i;
                cout << setw(14) << setprecision(7) << avgLogLossNew;
                cout << setw(14) << setprecision(7) << avgLogLossValidNew << endl;
            } else {
                // print training error
                cout << setw(6) << i << setw(14) << setprecision(7) << avgLogLossNew << endl;
            }
        }

        // test for convergence
        if(avgLogLoss - avgLogLossNew < parameters.threshold) {
            // free backup copies of the components
            for(int k = 0; k < numComponents(); ++k)
                delete components[k];
            return true;
        }
        avgLogLoss = avgLogLossNew;

        // compute normalized posterior (E)
        post = (logJoint.rowwise() - logLik).exp();
        postSum = post.rowwise().sum();
        weights = post.colwise() / postSum;

        // optimize prior weights (M)
        if(parameters.trainPriors) {
            mPriors = postSum / data.cols() + parameters.regularizePriors;
            mPriors /= mPriors.sum();
        }

        // optimize components (M)
        if(parameters.trainComponents) {
            #pragma omp parallel for
            for(int k = 0; k < numComponents(); ++k)
                mComponents[k]->train(data, weights.row(k), componentParameters);
        } else {
            // free backup copies of the components
            for(int k = 0; k < numComponents(); ++k)
                delete components[k];
            return true;
        }

        if((i + 1) % parameters.valIter == 0) {
            // check validation error
            avgLogLossValidNew = evaluate(dataValid);

            if(avgLogLossValidNew < avgLogLossValid) {
                // backup newly found model parameters
                priors = mPriors;
                for(int k = 0; k < numComponents(); ++k)
                    *components[k] = *mComponents[k];
                avgLogLossValid = avgLogLossValidNew;
            } else {
                counter++;

                if(parameters.valLookAhead > 0 && counter >= parameters.valLookAhead) {
                    // set parameters to best parameters found during training
                    mPriors = priors;

                    for(int k = 0; k < numComponents(); ++k) {
                        *mComponents[k] = *components[k];
                        delete components[k];
                    }

                    return true;
                }
            }
        }
    }

    if(parameters.verbosity > 0)
        cout << setw(6) << parameters.maxIter << setw(14) << setprecision(7) << evaluate(data) << endl;

    // free backup copies of the components
    for(int k = 0; k < numComponents(); ++k)
        delete components[k];

    return false;
}
bool CMT::Mixture::train(
    const MatrixXd& data,
    const Parameters& parameters,
    const Component::Parameters& componentParameters)
{
    if(data.rows() != dim())
        throw Exception("Data has wrong dimensionality.");

    if(parameters.initialize && !initialized())
        initialize(data, parameters, componentParameters);

    ArrayXXd logJoint(numComponents(), data.cols());
    Array<double, Dynamic, 1> postSum;
    Array<double, 1, Dynamic> logLik;
    ArrayXXd post;
    ArrayXXd weights;

    double avgLogLoss = numeric_limits<double>::infinity();
    double avgLogLossNew;

    for(int i = 0; i < parameters.maxIter; ++i) {
        // compute joint probability of data and assignments (E)
        #pragma omp parallel for
        for(int k = 0; k < numComponents(); ++k)
            logJoint.row(k) = mComponents[k]->logLikelihood(data) + log(mPriors[k]);

        // compute normalized posterior (E)
        logLik = logSumExp(logJoint);

        // average negative log-likelihood in bits per component
        avgLogLossNew = -logLik.mean() / log(2.) / dim();

        if(parameters.verbosity > 0)
            cout << setw(6) << i << setw(14) << setprecision(7) << avgLogLossNew << endl;

        // test for convergence
        if(avgLogLoss - avgLogLossNew < parameters.threshold)
            return true;
        avgLogLoss = avgLogLossNew;

        // compute normalized posterior (E)
        post = (logJoint.rowwise() - logLik).exp();
        postSum = post.rowwise().sum();
        weights = post.colwise() / postSum;

        // optimize prior weights (M)
        if(parameters.trainPriors) {
            mPriors = postSum / data.cols() + parameters.regularizePriors;
            mPriors /= mPriors.sum();
        }

        // optimize components (M)
        if(parameters.trainComponents) {
            #pragma omp parallel for
            for(int k = 0; k < numComponents(); ++k)
                mComponents[k]->train(data, weights.row(k), componentParameters);
        } else {
            return true;
        }
    }

    if(parameters.verbosity > 0)
        cout << setw(6) << parameters.maxIter << setw(14) << setprecision(7) << evaluate(data) << endl;

    return false;
}
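Both train() variants implement the usual EM updates. Writing $\gamma_{kn}$ for the posterior responsibility of component $k$ for data point $x_n$, the E-step and M-step above compute

$$\gamma_{kn} = \frac{\pi_k\,p_k(x_n)}{\sum_j \pi_j\,p_j(x_n)}, \qquad
\pi_k \propto \frac{1}{N}\sum_n \gamma_{kn} + \lambda, \qquad
w_{kn} = \frac{\gamma_{kn}}{\sum_m \gamma_{km}},$$

where $\lambda$ corresponds to parameters.regularizePriors and the normalized responsibilities $w_{kn}$ are passed as data weights to each component's train().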
bool KmeansClusterer::updateClusterCentersUntilConverged(RefArrayXXd sample, RefArrayXXd centers,
                                                         RefArrayXd clusterSizes, vector<int> &clusterIndices,
                                                         double &sumOfDistancesToClosestCenter, double relTolerance)
{
    unsigned int Npoints = sample.cols();
    unsigned int Ndimensions = sample.rows();
    unsigned int Nclusters = centers.cols();
    ArrayXXd updatedCenters = ArrayXXd::Zero(Ndimensions, Nclusters);    // coordinates of each of the new cluster centers

    // Perform the k-means clustering iteration, each time improving the cluster centers,
    // and redetermining which points belong to which cluster

    bool stopIterations = false;
    bool convergenceReached;

    unsigned int indexOfClosestCenter;
    double oldSumOfDistances = 0.0;
    double newSumOfDistances = 0.0;
    double distanceToClosestCenter;
    double distance;

    while (!stopIterations) {
        // Find for each point the closest cluster center.
        // At the same time recompute/update the new cluster centers, which is simply
        // the barycenter of all points belonging to the cluster.

        clusterSizes.setZero();
        updatedCenters.setZero();

        for (int n = 0; n < Npoints; ++n) {
            distanceToClosestCenter = numeric_limits<double>::max();

            for (int i = 0; i < Nclusters; ++i) {
                distance = metric.distance(sample.col(n), centers.col(i));

                if (distance < distanceToClosestCenter) {
                    indexOfClosestCenter = i;
                    distanceToClosestCenter = distance;
                }
            }

            newSumOfDistances += distanceToClosestCenter;
            updatedCenters.col(indexOfClosestCenter) += sample.col(n);
            clusterSizes(indexOfClosestCenter) += 1;
            clusterIndices[n] = indexOfClosestCenter;
        }

        // Assert that all clusters contain at least 2 points. If not, we probably started
        // with an unfortunate set of initial cluster centers. Flag this by immediately
        // returning false.

        if (!(clusterSizes > 1).all()) {
            convergenceReached = false;
            return convergenceReached;
        }

        // Finish computing the new updated centers. Given the check above, we are sure
        // that none of the clusters is empty.

        updatedCenters.rowwise() /= clusterSizes.transpose();
        centers = updatedCenters;

        // A new set of clusters has been determined.
        // Decide whether the algorithm has converged. Convergence occurs when
        // the sum of all distances of all points to their cluster center does
        // not change significantly anymore.
        // Note: in order for this criterion to work properly, the coordinate
        // space should be normalized, so that one particular coordinate
        // cannot numerically dominate all other coordinates.

        if (oldSumOfDistances == 0.0) {
            // This is the first center-updating iteration, so there is nothing to compare yet.
            // Simply set the variables.

            oldSumOfDistances = newSumOfDistances;
            newSumOfDistances = 0.0;
        } else {
            // If the relative change in sumOfDistances between old and new was smaller than
            // the threshold set by the user, stop the iteration loop.

            if (fabs(newSumOfDistances - oldSumOfDistances) / oldSumOfDistances < relTolerance) {
                sumOfDistancesToClosestCenter = newSumOfDistances;   // will be returned to the user
                stopIterations = true;
            } else {
                oldSumOfDistances = newSumOfDistances;
                newSumOfDistances = 0.0;
            }
        }
    } // end k-means center-updating loop

    // Convergence was properly reached, so return

    convergenceReached = true;

    return convergenceReached;
}
Array<double, 1, Dynamic> CMT::logMeanExp(const ArrayXXd& array) {
    Array<double, 1, Dynamic> arrayMax = array.colwise().maxCoeff() - 1.;
    return arrayMax + (array.rowwise() - arrayMax).exp().colwise().mean().log();
}
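logMeanExp, like most functions above, relies on a column-wise logSumExp helper that is not shown in this listing. A minimal sketch of a compatible implementation is given below; the name logSumExpSketch and the exact form are assumptions, and the library's actual helper may differ.

#include <Eigen/Core>

using Eigen::Array;
using Eigen::ArrayXXd;
using Eigen::Dynamic;

// Hypothetical stand-in for the column-wise logSumExp used above: for each
// column, computes log(sum(exp(column))) while avoiding overflow by
// subtracting the column-wise maximum before exponentiating. The constant
// offset of 1 mirrors logMeanExp above and cancels out exactly.
Array<double, 1, Dynamic> logSumExpSketch(const ArrayXXd& array) {
    Array<double, 1, Dynamic> arrayMax = array.colwise().maxCoeff() - 1.;
    return arrayMax + (array.rowwise() - arrayMax).exp().colwise().sum().log();
}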
double CMT::MCBM::parameterGradient(
    const MatrixXd& inputCompl,
    const MatrixXd& outputCompl,
    const lbfgsfloatval_t* x,
    lbfgsfloatval_t* g,
    const Trainable::Parameters& params_) const
{
    const Parameters& params = dynamic_cast<const Parameters&>(params_);

    // average log-likelihood
    double logLik = 0.;

    // interpret memory for parameters and gradients
    lbfgsfloatval_t* y = const_cast<lbfgsfloatval_t*>(x);

    int offset = 0;

    VectorLBFGS priors(params.trainPriors ? y : const_cast<double*>(mPriors.data()), mNumComponents);
    VectorLBFGS priorsGrad(g, mNumComponents);
    if(params.trainPriors)
        offset += priors.size();

    MatrixLBFGS weights(params.trainWeights ? y + offset : const_cast<double*>(mWeights.data()), mNumComponents, mNumFeatures);
    MatrixLBFGS weightsGrad(g + offset, mNumComponents, mNumFeatures);
    if(params.trainWeights)
        offset += weights.size();

    MatrixLBFGS features(params.trainFeatures ? y + offset : const_cast<double*>(mFeatures.data()), mDimIn, mNumFeatures);
    MatrixLBFGS featuresGrad(g + offset, mDimIn, mNumFeatures);
    if(params.trainFeatures)
        offset += features.size();

    MatrixLBFGS predictors(params.trainPredictors ? y + offset : const_cast<double*>(mPredictors.data()), mNumComponents, mDimIn);
    MatrixLBFGS predictorsGrad(g + offset, mNumComponents, mDimIn);
    if(params.trainPredictors)
        offset += predictors.size();

    MatrixLBFGS inputBias(params.trainInputBias ? y + offset : const_cast<double*>(mInputBias.data()), mDimIn, mNumComponents);
    MatrixLBFGS inputBiasGrad(g + offset, mDimIn, mNumComponents);
    if(params.trainInputBias)
        offset += inputBias.size();

    VectorLBFGS outputBias(params.trainOutputBias ? y + offset : const_cast<double*>(mOutputBias.data()), mNumComponents);
    VectorLBFGS outputBiasGrad(g + offset, mNumComponents);
    if(params.trainOutputBias)
        offset += outputBias.size();

    if(g) {
        // initialize gradients
        if(params.trainPriors)
            priorsGrad.setZero();
        if(params.trainWeights)
            weightsGrad.setZero();
        if(params.trainFeatures)
            featuresGrad.setZero();
        if(params.trainPredictors)
            predictorsGrad.setZero();
        if(params.trainInputBias)
            inputBiasGrad.setZero();
        if(params.trainOutputBias)
            outputBiasGrad.setZero();
    }

    // split data into batches for better performance
    int numData = static_cast<int>(inputCompl.cols());
    int batchSize = min(max(params.batchSize, 10), numData);

    #pragma omp parallel for
    for(int b = 0; b < inputCompl.cols(); b += batchSize) {
        const MatrixXd& input = inputCompl.middleCols(b, min(batchSize, numData - b));
        const MatrixXd& output = outputCompl.middleCols(b, min(batchSize, numData - b));

        ArrayXXd featureOutput = features.transpose() * input;
        MatrixXd featureOutputSq = featureOutput.square();
        MatrixXd weightsOutput = weights * featureOutputSq;
        ArrayXXd predictorOutput = predictors * input;

        // unnormalized posteriors over components for both possible outputs
        ArrayXXd logPost0 = (weightsOutput + inputBias.transpose() * input).colwise() + priors;
        ArrayXXd logPost1 = (logPost0 + predictorOutput).colwise() + outputBias.array();

        // sum over components to get unnormalized probabilities of outputs
        Array<double, 1, Dynamic> logProb0 = logSumExp(logPost0);
        Array<double, 1, Dynamic> logProb1 = logSumExp(logPost1);

        // normalize posteriors over components
        logPost0.rowwise() -= logProb0;
        logPost1.rowwise() -= logProb1;

        // stack row vectors
        ArrayXXd logProb01(2, input.cols());
        logProb01 << logProb0, logProb1;

        // normalize log-probabilities
        Array<double, 1, Dynamic> logNorm = logSumExp(logProb01);
        logProb1 -= logNorm;
        logProb0 -= logNorm;

        double logLikBatch = (output.array() * logProb1 + (1. - output.array()) * logProb0).sum();

        #pragma omp critical
        logLik += logLikBatch;

        if(!g)
            // don't compute gradients
            continue;

        Array<double, 1, Dynamic> tmp = output.array() * logProb0.exp() - (1. - output.array()) * logProb1.exp();

        ArrayXXd post0Tmp = logPost0.exp().rowwise() * tmp;
        ArrayXXd post1Tmp = logPost1.exp().rowwise() * tmp;
        ArrayXXd postDiffTmp = post1Tmp - post0Tmp;

        // update gradients
        if(params.trainPriors)
            #pragma omp critical
            priorsGrad -= postDiffTmp.rowwise().sum().matrix();

        if(params.trainWeights)
            #pragma omp critical
            weightsGrad -= postDiffTmp.matrix() * featureOutputSq.transpose();

        if(params.trainFeatures) {
            ArrayXXd tmp2 = weights.transpose() * postDiffTmp.matrix() * 2.;
            MatrixXd tmp3 = featureOutput * tmp2;
            #pragma omp critical
            featuresGrad -= input * tmp3.transpose();
        }

        if(params.trainPredictors)
            #pragma omp critical
            predictorsGrad -= post1Tmp.matrix() * input.transpose();

        if(params.trainInputBias)
            #pragma omp critical
            inputBiasGrad -= input * postDiffTmp.matrix().transpose();

        if(params.trainOutputBias)
            #pragma omp critical
            outputBiasGrad -= post1Tmp.rowwise().sum().matrix();
    }

    double normConst = inputCompl.cols() * log(2.) * dimOut();

    if(g) {
        for(int i = 0; i < offset; ++i)
            g[i] /= normConst;

        if(params.trainFeatures)
            featuresGrad += params.regularizeFeatures.gradient(features);
        if(params.trainPredictors)
            predictorsGrad += params.regularizePredictors.gradient(predictors.transpose()).transpose();
        if(params.trainWeights)
            weightsGrad += params.regularizeWeights.gradient(weights);
    }

    double value = -logLik / normConst;

    if(params.trainFeatures)
        value += params.regularizeFeatures.evaluate(features);
    if(params.trainPredictors)
        value += params.regularizePredictors.evaluate(predictors.transpose());
    if(params.trainWeights)
        value += params.regularizeWeights.evaluate(weights);

    return value;
}
pair<pair<ArrayXXd, ArrayXXd>, Array<double, 1, Dynamic> > CMT::STM::computeDataGradient(
    const MatrixXd& input,
    const MatrixXd& output) const
{
    // make sure nonlinearity is differentiable
    DifferentiableNonlinearity* nonlinearity = dynamic_cast<DifferentiableNonlinearity*>(mNonlinearity);
    if(!nonlinearity)
        throw Exception("Nonlinearity has to be differentiable.");

    if(input.rows() != dimIn())
        throw Exception("Input has wrong dimensionality.");
    if(output.rows() != 1)
        throw Exception("Output has wrong dimensionality.");
    if(input.cols() != output.cols())
        throw Exception("Number of inputs and outputs should be the same.");

    if(dimInNonlinear() && !dimInLinear()) {
        // model has only nonlinear inputs
        Array<double, 1, Dynamic> responses;
        ArrayXXd jointEnergy;

        if(numFeatures() > 0)
            jointEnergy = mWeights * (mFeatures.transpose() * input).array().square().matrix()
                + mPredictors * input;
        else
            jointEnergy = mPredictors * input;

        jointEnergy.colwise() += mBiases.array();
        jointEnergy *= mSharpness;

        responses = logSumExp(jointEnergy);

        // posterior over components for each input
        MatrixXd posterior = (jointEnergy.rowwise() - responses).array().exp();

        responses /= mSharpness;

        Array<double, 1, Dynamic> tmp0 = (*mNonlinearity)(responses);
        Array<double, 1, Dynamic> tmp1 = -mDistribution->gradient(output, tmp0);
        Array<double, 1, Dynamic> tmp2 = nonlinearity->derivative(responses);

        ArrayXXd avgPredictor = mPredictors.transpose() * posterior;

        ArrayXXd tmp3;
        if(numFeatures() > 0) {
            ArrayXXd avgWeights = (2. * mWeights).transpose() * posterior;
            tmp3 = mFeatures * (avgWeights * (mFeatures.transpose() * input).array()).matrix();
        } else {
            tmp3 = ArrayXXd::Zero(avgPredictor.rows(), avgPredictor.cols());
        }

        return make_pair(
            make_pair(
                (tmp3 + avgPredictor).rowwise() * (tmp1 * tmp2),
                ArrayXXd::Zero(output.rows(), output.cols())),
            mDistribution->logLikelihood(output, tmp0));

    } else if(dimInNonlinear() && dimInLinear()) {
        // split inputs into linear and nonlinear components
        MatrixXd inputNonlinear = input.topRows(dimInNonlinear());
        MatrixXd inputLinear = input.bottomRows(dimInLinear());

        Array<double, 1, Dynamic> responses;
        ArrayXXd jointEnergy;

        // note: the predictors act only on the nonlinear part of the input
        if(numFeatures() > 0)
            jointEnergy = mWeights * (mFeatures.transpose() * inputNonlinear).array().square().matrix()
                + mPredictors * inputNonlinear;
        else
            jointEnergy = mPredictors * inputNonlinear;

        jointEnergy.colwise() += mBiases.array();
        jointEnergy *= mSharpness;

        responses = logSumExp(jointEnergy);

        // posterior over components for each input
        MatrixXd posterior = (jointEnergy.rowwise() - responses).array().exp();

        responses /= mSharpness;
        responses += (mLinearPredictor.transpose() * inputLinear).array();

        Array<double, 1, Dynamic> tmp0 = (*mNonlinearity)(responses);
        Array<double, 1, Dynamic> tmp1 = -mDistribution->gradient(output, tmp0);
        Array<double, 1, Dynamic> tmp2 = nonlinearity->derivative(responses);

        ArrayXXd avgPredictor = mPredictors.transpose() * posterior;

        ArrayXXd tmp3;
        if(numFeatures() > 0) {
            ArrayXXd avgWeights = (2. * mWeights).transpose() * posterior;
            tmp3 = mFeatures * (avgWeights * (mFeatures.transpose() * inputNonlinear).array()).matrix();
        } else {
            tmp3 = ArrayXXd::Zero(avgPredictor.rows(), avgPredictor.cols());
        }

        // concatenate gradients of nonlinear and linear component
        ArrayXXd inputGradient(dimIn(), input.cols());
        inputGradient <<
            (tmp3 + avgPredictor).rowwise() * (tmp1 * tmp2),
            mLinearPredictor * (tmp1 * tmp2).matrix();

        return make_pair(
            make_pair(
                inputGradient,
                ArrayXXd::Zero(output.rows(), output.cols())),
            mDistribution->logLikelihood(output, tmp0));

    } else if(dimInLinear()) {
        // model has only linear inputs
        double avgBias = logSumExp(mSharpness * mBiases)(0, 0) / mSharpness;

        Array<double, 1, Dynamic> responses = (mLinearPredictor.transpose() * input).array() + avgBias;

        Array<double, 1, Dynamic> tmp0 = (*mNonlinearity)(responses);
        Array<double, 1, Dynamic> tmp1 = -mDistribution->gradient(output, tmp0);
        Array<double, 1, Dynamic> tmp2 = nonlinearity->derivative(responses);

        return make_pair(
            make_pair(
                mLinearPredictor * (tmp1 * tmp2).matrix(),
                ArrayXXd::Zero(output.rows(), output.cols())),
            mDistribution->logLikelihood(output, tmp0));
    }

    // model has no inputs
    return make_pair(
        make_pair(
            ArrayXXd::Zero(input.rows(), input.cols()),
            ArrayXXd::Zero(output.rows(), output.cols())),
        logLikelihood(input, output));
}