Пример #1
0
LSHSearch<SortPolicy>::
LSHSearch(const arma::mat& referenceSet,
          const size_t numProj,
          const size_t numTables,
          const double hashWidthIn,
          const size_t secondHashSize,
          const size_t bucketSize) :
  referenceSet(referenceSet),
  querySet(referenceSet),
  numProj(numProj),
  numTables(numTables),
  hashWidth(hashWidthIn),
  secondHashSize(secondHashSize),
  bucketSize(bucketSize),
  distanceEvaluations(0)
{
  if (hashWidth == 0.0) // The user has not provided any value.
  {
    // Compute a heuristic hash width from the data.
    for (size_t i = 0; i < 25; i++)
    {
      size_t p1 = (size_t) math::RandInt(referenceSet.n_cols);
      size_t p2 = (size_t) math::RandInt(referenceSet.n_cols);

      hashWidth += std::sqrt(metric::EuclideanDistance::Evaluate(
          referenceSet.unsafe_col(p1), referenceSet.unsafe_col(p2)));
    }

    hashWidth /= 25;
  }

  Log::Info << "Hash width chosen as: " << hashWidth << std::endl;

  BuildHash();
}
Пример #2
0
FastMKSRules<KernelType, TreeType>::FastMKSRules(const arma::mat& referenceSet,
        const arma::mat& querySet,
        arma::Mat<size_t>& indices,
        arma::mat& products,
        KernelType& kernel) :
    referenceSet(referenceSet),
    querySet(querySet),
    indices(indices),
    products(products),
    kernel(kernel),
    lastQueryIndex(-1),
    lastReferenceIndex(-1),
    lastKernel(0.0),
    baseCases(0),
    scores(0)
{
    // Precompute each self-kernel.
    queryKernels.set_size(querySet.n_cols);
    for (size_t i = 0; i < querySet.n_cols; ++i)
        queryKernels[i] = sqrt(kernel.Evaluate(querySet.unsafe_col(i),
                                               querySet.unsafe_col(i)));

    referenceKernels.set_size(referenceSet.n_cols);
    for (size_t i = 0; i < referenceSet.n_cols; ++i)
        referenceKernels[i] = sqrt(kernel.Evaluate(referenceSet.unsafe_col(i),
                                   referenceSet.unsafe_col(i)));

    // Set to invalid memory, so that the first node combination does not try to
    // dereference null pointers.
    traversalInfo.LastQueryNode() = (TreeType*) this;
    traversalInfo.LastReferenceNode() = (TreeType*) this;
}
Пример #3
0
double HMM<Distribution>::Predict(const arma::mat& dataSeq,
                                  arma::Col<size_t>& stateSeq) const
{
  // This is an implementation of the Viterbi algorithm for finding the most
  // probable sequence of states to produce the observed data sequence.  We
  // don't use log-likelihoods to save that little bit of time, but we'll
  // calculate the log-likelihood at the end of it all.
  stateSeq.set_size(dataSeq.n_cols);
  arma::mat logStateProb(transition.n_rows, dataSeq.n_cols);
  arma::mat stateSeqBack(transition.n_rows, dataSeq.n_cols);

  // Store the logs of the transposed transition matrix.  This is because we
  // will be using the rows of the transition matrix.
  arma::mat logTrans(log(trans(transition)));

  // The calculation of the first state is slightly different; the probability
  // of the first state being state j is the maximum probability that the state
  // came to be j from another state.
  logStateProb.col(0).zeros();
  for (size_t state = 0; state < transition.n_rows; state++)
  {
    logStateProb(state, 0) = log(initial[state] *
        emission[state].Probability(dataSeq.unsafe_col(0)));
    stateSeqBack(state, 0) = state;
  }

  // Store the best first state.
  arma::uword index;
  for (size_t t = 1; t < dataSeq.n_cols; t++)
  {
    // Assemble the state probability for this element.
    // Given that we are in state j, we use state with the highest probability
    // of being the previous state.
    for (size_t j = 0; j < transition.n_rows; j++)
    {
      arma::vec prob = logStateProb.col(t - 1) + logTrans.col(j);
      logStateProb(j, t) = prob.max(index) +
          log(emission[j].Probability(dataSeq.unsafe_col(t)));
        stateSeqBack(j, t) = index;
    }
  }

  // Backtrack to find the most probable state sequence.
  logStateProb.unsafe_col(dataSeq.n_cols - 1).max(index);
  stateSeq[dataSeq.n_cols - 1] = index;
  for (size_t t = 2; t <= dataSeq.n_cols; t++)
    stateSeq[dataSeq.n_cols - t] =
        stateSeqBack(stateSeq[dataSeq.n_cols - t + 1], dataSeq.n_cols - t + 1);

  return logStateProb(stateSeq(dataSeq.n_cols - 1), dataSeq.n_cols - 1);
}
Пример #4
0
  inline static void Initialize(const MatType& V,
                                const size_t r,
                                arma::mat& W,
                                arma::mat& H)
  {
    const size_t n = V.n_rows;
    const size_t m = V.n_cols;

    if (columnsToAverage > m)
    {
      Log::Warn << "Number of random columns (columnsToAverage) is more than "
          << "the number of columns available in the V matrix; weird results "
          << "may ensue!" << std::endl;
    }

    W.zeros(n, r);

    // Initialize W matrix with random columns.
    for (size_t col = 0; col < r; col++)
    {
      for (size_t randCol = 0; randCol < columnsToAverage; randCol++)
      {
        // .col() does not work in this case, as of Armadillo 3.920.
        W.unsafe_col(col) += V.col(math::RandInt(0, m));
      }
    }

    // Now divide by p.
    W /= columnsToAverage;

    // Initialize H to random values.
    H.randu(r, m);
  }
Пример #5
0
void HMM<Distribution>::Backward(const arma::mat& dataSeq,
                                 const arma::vec& scales,
                                 arma::mat& backwardProb) const
{
  // Our goal is to calculate the backward probabilities:
  //  P(X_k | o_{k + 1:T}) for all possible states X_k, for each time point k.
  backwardProb.zeros(transition.n_rows, dataSeq.n_cols);

  // The last element probability is 1.
  backwardProb.col(dataSeq.n_cols - 1).fill(1);

  // Now step backwards through all other observations.
  for (size_t t = dataSeq.n_cols - 2; t + 1 > 0; t--)
  {
    for (size_t j = 0; j < transition.n_rows; j++)
    {
      // The backward probability of state j at time t is the sum over all state
      // of the probability of the next state having been a transition from the
      // current state multiplied by the probability of each of those states
      // emitting the given observation.
      for (size_t state = 0; state < transition.n_rows; state++)
        backwardProb(j, t) += transition(state, j) * backwardProb(state, t + 1)
            * emission[state].Probability(dataSeq.unsafe_col(t + 1));

      // Normalize by the weights from the forward algorithm.
      backwardProb(j, t) /= scales[t + 1];
    }
  }
}
Пример #6
0
inline force_inline
double LSHSearch<SortPolicy>::BaseCase(arma::mat& distances,
                                       arma::Mat<size_t>& neighbors,
                                       const size_t queryIndex,
                                       const size_t referenceIndex)
{
  // If the datasets are the same, then this search is only using one dataset
  // and we should not return identical points.
  if ((&querySet == &referenceSet) && (queryIndex == referenceIndex))
    return 0.0;

  const double distance = metric::EuclideanDistance::Evaluate(
      querySet.unsafe_col(queryIndex), referenceSet.unsafe_col(referenceIndex));

  // If this distance is better than any of the current candidates, the
  // SortDistance() function will give us the position to insert it into.
  arma::vec queryDist = distances.unsafe_col(queryIndex);
  arma::Col<size_t> queryIndices = neighbors.unsafe_col(queryIndex);
  size_t insertPosition = SortPolicy::SortDistance(queryDist, queryIndices,
      distance);

  // SortDistance() returns (size_t() - 1) if we shouldn't add it.
  if (insertPosition != (size_t() - 1))
    InsertNeighbor(distances, neighbors, queryIndex, insertPosition,
        referenceIndex, distance);

  return distance;
}
Пример #7
0
    /**
     * Construct the exact kernel matrix.
     *
     * @param data Input data points.
     * @param transformedData Matrix to output results into.
     * @param eigval KPCA eigenvalues will be written to this vector.
     * @param eigvec KPCA eigenvectors will be written to this matrix.
     * @param rank Rank to be used for matrix approximation.
     * @param kernel Kernel to be used for computation.
     */
    static void ApplyKernelMatrix(const arma::mat& data,
                                  arma::mat& transformedData,
                                  arma::vec& eigval,
                                  arma::mat& eigvec,
                                  const size_t /* unused */,
                                  KernelType kernel = KernelType())
  {
    // Construct the kernel matrix.
    arma::mat kernelMatrix;
    // Resize the kernel matrix to the right size.
    kernelMatrix.set_size(data.n_cols, data.n_cols);

    // Note that we only need to calculate the upper triangular part of the 
    // kernel matrix, since it is symmetric. This helps minimize the number of
    // kernel evaluations.
    for (size_t i = 0; i < data.n_cols; ++i)
    {
      for (size_t j = i; j < data.n_cols; ++j)
      {
        // Evaluate the kernel on these two points.
        kernelMatrix(i, j) = kernel.Evaluate(data.unsafe_col(i),
                                             data.unsafe_col(j));
      }
    }

    // Copy to the lower triangular part of the matrix.
    for (size_t i = 1; i < data.n_cols; ++i)
      for (size_t j = 0; j < i; ++j)
        kernelMatrix(i, j) = kernelMatrix(j, i);

    // For PCA the data has to be centered, even if the data is centered. But it
    // is not guaranteed that the data, when mapped to the kernel space, is also
    // centered. Since we actually never work in the feature space we cannot
    // center the data. So, we perform a "psuedo-centering" using the kernel
    // matrix.
    arma::rowvec rowMean = arma::sum(kernelMatrix, 0) / kernelMatrix.n_cols;
    kernelMatrix.each_col() -= arma::sum(kernelMatrix, 1) / kernelMatrix.n_cols;
    kernelMatrix.each_row() -= rowMean;
    kernelMatrix += arma::sum(rowMean) / kernelMatrix.n_cols;

    // Eigendecompose the centered kernel matrix.
    arma::eig_sym(eigval, eigvec, kernelMatrix);

    // Swap the eigenvalues since they are ordered backwards (we need largest to
    // smallest).
    for (size_t i = 0; i < floor(eigval.n_elem / 2.0); ++i)
      eigval.swap_rows(i, (eigval.n_elem - 1) - i);

    // Flip the coefficients to produce the same effect.
    eigvec = arma::fliplr(eigvec);

    transformedData = eigvec.t() * kernelMatrix;
    transformedData.each_col() /= arma::sqrt(eigval);
  }
Пример #8
0
void HMM<Distribution>::Forward(const arma::mat& dataSeq,
                                arma::vec& scales,
                                arma::mat& forwardProb) const
{
  // Our goal is to calculate the forward probabilities:
  //  P(X_k | o_{1:k}) for all possible states X_k, for each time point k.
  forwardProb.zeros(transition.n_rows, dataSeq.n_cols);
  scales.zeros(dataSeq.n_cols);

  // The first entry in the forward algorithm uses the initial state
  // probabilities.  Note that MATLAB assumes that the starting state (at
  // t = -1) is state 0; this is not our assumption here.  To force that
  // behavior, you could append a single starting state to every single data
  // sequence and that should produce results in line with MATLAB.
  for (size_t state = 0; state < transition.n_rows; state++)
    forwardProb(state, 0) = initial(state) *
        emission[state].Probability(dataSeq.unsafe_col(0));

  // Then normalize the column.
  scales[0] = accu(forwardProb.col(0));
  forwardProb.col(0) /= scales[0];

  // Now compute the probabilities for each successive observation.
  for (size_t t = 1; t < dataSeq.n_cols; t++)
  {
    for (size_t j = 0; j < transition.n_rows; j++)
    {
      // The forward probability of state j at time t is the sum over all states
      // of the probability of the previous state transitioning to the current
      // state and emitting the given observation.
      forwardProb(j, t) = accu(forwardProb.col(t - 1) %
          trans(transition.row(j))) *
          emission[j].Probability(dataSeq.unsafe_col(t));
    }

    // Normalize probability.
    scales[t] = accu(forwardProb.col(t));
    forwardProb.col(t) /= scales[t];
  }
}
Пример #9
0
void SparseCoding::Encode(const arma::mat& data, arma::mat& codes)
{
    // When using the Cholesky version of LARS, this is correct even if
    // lambda2 > 0.
    arma::mat matGram = trans(dictionary) * dictionary;

    codes.set_size(atoms, data.n_cols);
    for (size_t i = 0; i < data.n_cols; ++i)
    {
        // Report progress.
        if ((i % 100) == 0)
            Log::Debug << "Optimization at point " << i << "." << std::endl;

        bool useCholesky = true;
        regression::LARS lars(useCholesky, matGram, lambda1, lambda2);

        // Create an alias of the code (using the same memory), and then LARS will
        // place the result directly into that; then we will not need to have an
        // extra copy.
        arma::vec code = codes.unsafe_col(i);
        lars.Train(dictionary, data.unsafe_col(i), code, false);
    }
}
Пример #10
0
void HMM<Distribution>::Forward(const arma::mat& dataSeq,
                                arma::vec& scales,
                                arma::mat& forwardProb) const
{
  // Our goal is to calculate the forward probabilities:
  //  P(X_k | o_{1:k}) for all possible states X_k, for each time point k.
  forwardProb.zeros(transition.n_rows, dataSeq.n_cols);
  scales.zeros(dataSeq.n_cols);

  // Starting state (at t = -1) is assumed to be state 0.  This is what MATLAB
  // does in their hmmdecode() function, so we will emulate that behavior.
  for (size_t state = 0; state < transition.n_rows; state++)
    forwardProb(state, 0) = transition(state, 0) *
        emission[state].Probability(dataSeq.unsafe_col(0));

  // Then normalize the column.
  scales[0] = accu(forwardProb.col(0));
  forwardProb.col(0) /= scales[0];

  // Now compute the probabilities for each successive observation.
  for (size_t t = 1; t < dataSeq.n_cols; t++)
  {
    for (size_t j = 0; j < transition.n_rows; j++)
    {
      // The forward probability of state j at time t is the sum over all states
      // of the probability of the previous state transitioning to the current
      // state and emitting the given observation.
      forwardProb(j, t) = accu(forwardProb.col(t - 1) %
          trans(transition.row(j))) *
          emission[j].Probability(dataSeq.unsafe_col(t));
    }

    // Normalize probability.
    scales[t] = accu(forwardProb.col(t));
    forwardProb.col(t) /= scales[t];
  }
}
Пример #11
0
void mlpack::det::PrintLeafMembership(DTree* dtree,
                                      const arma::mat& data,
                                      const arma::Mat<size_t>& labels,
                                      const size_t numClasses,
                                      const std::string leafClassMembershipFile)
{
  // Tag the leaves with numbers.
  int numLeaves = dtree->TagTree();

  arma::Mat<size_t> table(numLeaves, (numClasses + 1));
  table.zeros();

  for (size_t i = 0; i < data.n_cols; i++)
  {
    const arma::vec testPoint = data.unsafe_col(i);
    const int leafTag = dtree->FindBucket(testPoint);
    const size_t label = labels[i];
    table(leafTag, label) += 1;
  }

  if (leafClassMembershipFile == "")
  {
    Log::Info << "Leaf membership; row represents leaf id, column represents "
        << "class id; value represents number of points in leaf in class."
        << std::endl << table;
  }
  else
  {
    // Create a stream for the file.
    std::ofstream outfile(leafClassMembershipFile.c_str());
    if (outfile.good())
    {
      outfile << table;
      Log::Info << "Leaf membership printed to '" << leafClassMembershipFile
          << "'." << std::endl;
    }
    else
    {
      Log::Warn << "Can't open '" << leafClassMembershipFile << "' to write "
          << "leaf membership to." << std::endl;
    }
    outfile.close();
  }

  return;
}
Пример #12
0
double find_P(const arma::mat& X, const arma::mat& Y, double sigma2,
              float outliers, arma::vec& P1, arma::vec& Pt1, arma::mat& PX,
              bool use_fgt, const float epsilon) {
    P1.zeros();
    Pt1.zeros();
    PX.zeros();

    const arma::uword N = X.n_rows;
    const arma::uword M = Y.n_rows;
    const arma::uword D = Y.n_cols;

    const double h = std::sqrt(2 * sigma2);
    const double ndi = (outliers * M * std::pow(2 * M_PI * sigma2, 0.5 * D)) /
                       ((1 - outliers) * N);
    arma::vec q = arma::ones<arma::vec>(M);

    fgt::GaussTransformUnqPtr transformY;
    if (use_fgt) {
        transformY = fgt::choose_gauss_transform(Y, h, epsilon);
    } else {
        transformY.reset(new fgt::Direct(Y, h));
    }
    arma::vec denomP = transformY->compute(X, q);

    denomP = denomP + ndi;
    Pt1 = 1 - ndi / denomP;
    q = 1 / denomP;

    fgt::GaussTransformUnqPtr transformX;
    if (use_fgt) {
        transformX = fgt::choose_gauss_transform(X, h, epsilon);
    } else {
        transformX.reset(new fgt::Direct(X, h));
    }
    P1 = transformX->compute(Y, q);

    for (arma::uword i = 0; i < D; ++i) {
        q = X.col(i) / denomP;
        arma::vec c = PX.unsafe_col(i);
        PX.col(i) = transformX->compute(Y, q);
    }

    return -arma::sum(arma::log(denomP)) + D * N * std::log(sigma2) / 2;
}
Пример #13
0
/**
* Calculates the multivariate Gaussian log probability density function for each
* data point (column) in the given matrix
*
* @param x List of observations.
* @param probabilities Output log probabilities for each input observation.
*/
inline void GaussianDistribution::LogProbability(const arma::mat& x,
                                                 arma::vec& logProbabilities) const
{
  // Column i of 'diffs' is the difference between x.col(i) and the mean.
  arma::mat diffs = x - (mean * arma::ones<arma::rowvec>(x.n_cols));

  // Now, we only want to calculate the diagonal elements of (diffs' * cov^-1 *
  // diffs).  We just don't need any of the other elements.  We can calculate
  // the right hand part of the equation (instead of the left side) so that
  // later we are referencing columns, not rows -- that is faster.
  const arma::mat rhs = -0.5 * invCov * diffs;
  arma::vec logExponents(diffs.n_cols); // We will now fill this.
  for (size_t i = 0; i < diffs.n_cols; i++)
    logExponents(i) = accu(diffs.unsafe_col(i) % rhs.unsafe_col(i));

  const size_t k = x.n_rows;

  logProbabilities = -0.5 * k * log2pi - 0.5 * logDetCov + logExponents;
}
Пример #14
0
void GMM<FittingType>::Classify(const arma::mat& observations,
                                arma::Col<size_t>& labels) const
{
  // This is not the best way to do this!

  // We should not have to fill this with values, because each one should be
  // overwritten.
  labels.set_size(observations.n_cols);
  for (size_t i = 0; i < observations.n_cols; ++i)
  {
    // Find maximum probability component.
    double probability = 0;
    for (size_t j = 0; j < gaussians; ++j)
    {
      double newProb = Probability(observations.unsafe_col(i), j);
      if (newProb >= probability)
      {
        probability = newProb;
        labels[i] = j;
      }
    }
  }
}
Пример #15
0
inline void MeanShift<UseKernel, KernelType, MatType>::Cluster(
    const MatType& data,
    arma::Col<size_t>& assignments,
    arma::mat& centroids,
    bool useSeeds)
{
  if (radius <= 0)
  {
    // An invalid radius is given; an estimation is needed.
    Radius(EstimateRadius(data));
  }

  MatType seeds;
  const MatType* pSeeds = &data;
  if (useSeeds)
  {
    GenSeeds(data, radius, 1, seeds);
    pSeeds = &seeds;
  }

  // Holds all centroids before removing duplicate ones.
  arma::mat allCentroids(pSeeds->n_rows, pSeeds->n_cols);

  assignments.set_size(data.n_cols);

  range::RangeSearch<> rangeSearcher(data);
  math::Range validRadius(0, radius);
  std::vector<std::vector<size_t> > neighbors;
  std::vector<std::vector<double> > distances;

  // For each seed, perform mean shift algorithm.
  for (size_t i = 0; i < pSeeds->n_cols; ++i)
  {
    // Initial centroid is the seed itself.
    allCentroids.col(i) = pSeeds->unsafe_col(i);
    for (size_t completedIterations = 0; completedIterations < maxIterations;
         completedIterations++)
    {
      // Store new centroid in this.
      arma::colvec newCentroid = arma::zeros<arma::colvec>(pSeeds->n_rows);

      rangeSearcher.Search(allCentroids.unsafe_col(i), validRadius,
          neighbors, distances);
      if (neighbors[0].size() <= 1)
        break;

      // Calculate new centroid.
      if (!CalculateCentroid(data, neighbors[0], distances[0], newCentroid))
        newCentroid = allCentroids.unsafe_col(i);

      // If the mean shift vector is small enough, it has converged.
      if (metric::EuclideanDistance::Evaluate(newCentroid,
          allCentroids.unsafe_col(i)) < 1e-3 * radius)
      {
        // Determine if the new centroid is duplicate with old ones.
        bool isDuplicated = false;
        for (size_t k = 0; k < centroids.n_cols; ++k)
        {
          const double distance = metric::EuclideanDistance::Evaluate(
              allCentroids.unsafe_col(i), centroids.unsafe_col(k));
          if (distance < radius)
          {
            isDuplicated = true;
            break;
          }
        }

        if (!isDuplicated)
          centroids.insert_cols(centroids.n_cols, allCentroids.unsafe_col(i));

        // Get out of the loop.
        break;
      }

      // Update the centroid.
      allCentroids.col(i) = newCentroid;
    }
  }

  // Assign centroids to each point.
  neighbor::KNN neighborSearcher(centroids);
  arma::mat neighborDistances;
  arma::Mat<size_t> resultingNeighbors;
  neighborSearcher.Search(data, 1, resultingNeighbors, neighborDistances);
  assignments = resultingNeighbors.t();
}