double SoftmaxErrorFunction<MetricType>::Evaluate(const arma::mat& coordinates)
{
  // Calculate the denominators and numerators, if necessary.
  Precalculate(coordinates);

  return -accu(p); // Sum of p_i for all i.  We negate because our solver
                   // minimizes, not maximizes.
};
size_t MaxVarianceNewCluster::EmptyCluster(const MatType& data,
                                           const size_t emptyCluster,
                                           const arma::mat& oldCentroids,
                                           arma::mat& newCentroids,
                                           arma::Col<size_t>& clusterCounts,
                                           MetricType& metric,
                                           const size_t iteration)
{
  // If necessary, calculate the variances and assignments.
  if (iteration != this->iteration || assignments.n_elem != data.n_cols)
    Precalculate(data, oldCentroids, clusterCounts, metric);
  this->iteration = iteration;

  // Now find the cluster with maximum variance.
  arma::uword maxVarCluster;
  variances.max(maxVarCluster);

  // Now, inside this cluster, find the point which is furthest away.
  size_t furthestPoint = data.n_cols;
  double maxDistance = -DBL_MAX;
  for (size_t i = 0; i < data.n_cols; ++i)
  {
    if (assignments[i] == maxVarCluster)
    {
      const double distance = std::pow(metric.Evaluate(data.col(i),
          newCentroids.col(maxVarCluster)), 2.0);

      if (distance > maxDistance)
      {
        maxDistance = distance;
        furthestPoint = i;
      }
    }
  }

  // Take that point and add it to the empty cluster.
  newCentroids.col(maxVarCluster) *= (double(clusterCounts[maxVarCluster]) /
      double(clusterCounts[maxVarCluster] - 1));
  newCentroids.col(maxVarCluster) -= (1.0 / (clusterCounts[maxVarCluster] - 1.0)) *
      arma::vec(data.col(furthestPoint));
  clusterCounts[maxVarCluster]--;
  clusterCounts[emptyCluster]++;
  newCentroids.col(emptyCluster) = arma::vec(data.col(furthestPoint));
  assignments[furthestPoint] = emptyCluster;

  // Modify the variances, as necessary.
  variances[emptyCluster] = 0;
  // One has already been subtracted from clusterCounts[maxVarCluster].
  variances[maxVarCluster] = (1.0 / (clusterCounts[maxVarCluster])) *
      ((clusterCounts[maxVarCluster] + 1) * variances[maxVarCluster] - maxDistance);

  // Output some debugging information.
  Log::Debug << "Point " << furthestPoint << " assigned to empty cluster " <<
      emptyCluster << ".\n";

  return 1; // We only changed one point.
}
void SoftmaxErrorFunction<MetricType>::Gradient(const arma::mat& coordinates,
                                                arma::mat& gradient)
{
  // Calculate the denominators and numerators, if necessary.
  Precalculate(coordinates);

  // Now, we handle the summation over i:
  //   sum_i (p_i sum_k (p_ik x_ik x_ik^T) -
  //       sum_{j in class of i} (p_ij x_ij x_ij^T)
  // We can algebraically manipulate the whole thing to produce a more
  // memory-friendly way to calculate this.  Looping over each i and k (again
  // O((n * (n + 1)) / 2) as with the last step, we can add the following to the
  // sum:
  //
  //   if class of i is the same as the class of k, add
  //     (((p_i - (1 / p_i)) p_ik) + ((p_k - (1 / p_k)) p_ki)) x_ik x_ik^T
  //   otherwise, add
  //     (p_i p_ik + p_k p_ki) x_ik x_ik^T
  arma::mat sum;
  sum.zeros(stretchedDataset.n_rows, stretchedDataset.n_rows);
  for (size_t i = 0; i < stretchedDataset.n_cols; i++)
  {
    for (size_t k = (i + 1); k < stretchedDataset.n_cols; k++)
    {
      // Calculate p_ik and p_ki first.
      double eval = exp(-metric.Evaluate(stretchedDataset.unsafe_col(i),
                                         stretchedDataset.unsafe_col(k)));
      double p_ik = 0, p_ki = 0;
      p_ik = eval / denominators(i);
      p_ki = eval / denominators(k);

      // Subtract x_i from x_k.  We are not using stretched points here.
      arma::vec x_ik = dataset.col(i) - dataset.col(k);
      arma::mat secondTerm = (x_ik * trans(x_ik));

      if (labels[i] == labels[k])
        sum += ((p[i] - 1) * p_ik + (p[k] - 1) * p_ki) * secondTerm;
      else
        sum += (p[i] * p_ik + p[k] * p_ki) * secondTerm;
    }
  }

  // Assemble the final gradient.
  gradient = -2 * coordinates * sum;
}