예제 #1
0
  void AverageLinkage::operator()(DistanceMatrix<float> & original_distance, std::vector<BinaryTreeNode> & cluster_tree, const float threshold /*=1*/) const
  {
    // input MUST have >= 2 elements!
    if (original_distance.dimensionsize() < 2)
    {
      throw ClusterFunctor::InsufficientInput(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "Distance matrix to start from only contains one element");
    }

    std::vector<std::set<Size> > clusters(original_distance.dimensionsize());
    for (Size i = 0; i < original_distance.dimensionsize(); ++i)
    {
      clusters[i].insert(i);
    }

    cluster_tree.clear();
    cluster_tree.reserve(original_distance.dimensionsize() - 1);

    // Initial minimum-distance pair
    original_distance.updateMinElement();
    std::pair<Size, Size> min = original_distance.getMinElementCoordinates();

    Size overall_cluster_steps(original_distance.dimensionsize());
    startProgress(0, original_distance.dimensionsize(), "clustering data");

    while (original_distance(min.second, min.first) < threshold)
    {
      //grow the tree
      cluster_tree.push_back(BinaryTreeNode(*(clusters[min.second].begin()), *(clusters[min.first].begin()), original_distance(min.first, min.second)));
      if (cluster_tree.back().left_child > cluster_tree.back().right_child)
      {
        std::swap(cluster_tree.back().left_child, cluster_tree.back().right_child);
      }

      if (original_distance.dimensionsize() > 2)
      {
        //pick minimum-distance pair i,j and merge them

        //calculate parameter for lance-williams formula
        float alpha_i = (float)(clusters[min.first].size() / (float)(clusters[min.first].size() + clusters[min.second].size()));
        float alpha_j = (float)(clusters[min.second].size() / (float)(clusters[min.first].size() + clusters[min.second].size()));
        //~ std::cout << alpha_i << '\t' << alpha_j << std::endl;

        //pushback elements of second to first (and then erase second)
        clusters[min.second].insert(clusters[min.first].begin(), clusters[min.first].end());
        // erase first one
        clusters.erase(clusters.begin() + min.first);

        //update original_distance matrix
        //average linkage: new distance between clusters is the minimum distance between elements of each cluster
        //lance-williams update for d((i,j),k): (m_i/m_i+m_j)* d(i,k) + (m_j/m_i+m_j)* d(j,k) ; m_x is the number of elements in cluster x
        for (Size k = 0; k < min.second; ++k)
        {
          float dik = original_distance.getValue(min.first, k);
          float djk = original_distance.getValue(min.second, k);
          original_distance.setValueQuick(min.second, k, (alpha_i * dik + alpha_j * djk));
        }
        for (Size k = min.second + 1; k < original_distance.dimensionsize(); ++k)
        {
          float dik = original_distance.getValue(min.first, k);
          float djk = original_distance.getValue(min.second, k);
          original_distance.setValueQuick(k, min.second, (alpha_i * dik + alpha_j * djk));
        }

        //reduce
        original_distance.reduce(min.first);

        //update minimum-distance pair
        original_distance.updateMinElement();

        //get min-pair from triangular matrix
        min = original_distance.getMinElementCoordinates();
      }
      else
      {
        break;
      }
      setProgress(overall_cluster_steps - original_distance.dimensionsize());

      //repeat until only two cluster remains, last step skips matrix operations
    }
    //fill tree with dummy nodes
    Size sad(*clusters.front().begin());
    for (Size i = 1; (i < clusters.size()) && (cluster_tree.size() < cluster_tree.capacity()); ++i)
    {
      cluster_tree.push_back(BinaryTreeNode(sad, *clusters[i].begin(), -1.0));
    }

    endProgress();
  }
예제 #2
0
  void CompleteLinkage::operator()(DistanceMatrix<float> & original_distance, std::vector<BinaryTreeNode> & cluster_tree, const float threshold /*=1*/) const
  {
    // attention: clustering process is done by clustering the indices
    // pointing to elements in inputvector and distances in inputmatrix

    // input MUST have >= 2 elements!
    if (original_distance.dimensionsize() < 2)
    {
      throw ClusterFunctor::InsufficientInput(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "Distance matrix to start from only contains one element");
    }

    std::vector<std::set<Size> > clusters(original_distance.dimensionsize());
    for (Size i = 0; i < original_distance.dimensionsize(); ++i)
    {
      clusters[i].insert(i);
    }

    cluster_tree.clear();
    cluster_tree.reserve(original_distance.dimensionsize() - 1);

    // Initial minimum-distance pair
    original_distance.updateMinElement();
    std::pair<Size, Size> min = original_distance.getMinElementCoordinates();

    Size overall_cluster_steps(original_distance.dimensionsize());
    startProgress(0, original_distance.dimensionsize(), "clustering data");

    while (original_distance(min.first, min.second) < threshold)
    {
      //grow the tree
      cluster_tree.push_back(BinaryTreeNode(*(clusters[min.second].begin()), *(clusters[min.first].begin()), original_distance(min.first, min.second)));
      if (cluster_tree.back().left_child > cluster_tree.back().right_child)
      {
        std::swap(cluster_tree.back().left_child, cluster_tree.back().right_child);
      }

      if (original_distance.dimensionsize() > 2)
      {
        //pick minimum-distance pair i,j and merge them

        //pushback elements of second to first (and then erase second)
        clusters[min.second].insert(clusters[min.first].begin(), clusters[min.first].end());
        // erase first one
        clusters.erase(clusters.begin() + min.first);

        //update original_distance matrix
        //complete linkage: new distance between clusters is the minimum distance between elements of each cluster
        //lance-williams update for d((i,j),k): 0.5* d(i,k) + 0.5* d(j,k) + 0.5* |d(i,k)-d(j,k)|
        for (Size k = 0; k < min.second; ++k)
        {
          float dik = original_distance.getValue(min.first, k);
          float djk = original_distance.getValue(min.second, k);
          original_distance.setValueQuick(min.second, k, (0.5f * dik + 0.5f * djk + 0.5f * std::fabs(dik - djk)));
        }
        for (Size k = min.second + 1; k < original_distance.dimensionsize(); ++k)
        {
          float dik = original_distance.getValue(min.first, k);
          float djk = original_distance.getValue(min.second, k);
          original_distance.setValueQuick(k, min.second, (0.5f * dik + 0.5f * djk + 0.5f * std::fabs(dik - djk)));
        }

        //reduce
        original_distance.reduce(min.first);

        //update minimum-distance pair
        original_distance.updateMinElement();

        //get new min-pair
        min = original_distance.getMinElementCoordinates();
      }
      else
      {
        break;
      }
      setProgress(overall_cluster_steps - original_distance.dimensionsize());

      //repeat until only two cluster remains or threshold exceeded, last step skips matrix operations
    }
    //fill tree with dummy nodes
    Size sad(*clusters.front().begin());
    for (Size i = 1; i < clusters.size() && (cluster_tree.size() < cluster_tree.capacity()); ++i)
    {
      cluster_tree.push_back(BinaryTreeNode(sad, *clusters[i].begin(), -1.0));
    }
    //~ while(cluster_tree.size() < cluster_tree.capacity())
    //~ {
    //~ cluster_tree.push_back(BinaryTreeNode(0,1,-1.0));
    //~ }

    endProgress();
  }
예제 #3
0
  void SingleLinkage::operator()(DistanceMatrix<float> & original_distance, std::vector<BinaryTreeNode> & cluster_tree, const float threshold /*=1*/) const
  {
    // input MUST have >= 2 elements!
    if (original_distance.dimensionsize() < 2)
    {
      throw ClusterFunctor::InsufficientInput(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "Distance matrix to start from only contains one element");
    }

    cluster_tree.clear();
    if (threshold < 1)
    {
      LOG_ERROR << "You tried to use Single Linkage clustering with a threshold. This is currently not supported!" << std::endl;
      throw Exception::NotImplemented(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION);
    }

    //SLINK
    std::vector<Size> pi;
    pi.reserve(original_distance.dimensionsize());
    std::vector<float> lambda;
    lambda.reserve(original_distance.dimensionsize());

    startProgress(0, original_distance.dimensionsize(), "clustering data");

    //initialize first pointer values
    pi.push_back(0);
    lambda.push_back(std::numeric_limits<float>::max());

    for (Size k = 1; k < original_distance.dimensionsize(); ++k)
    {
      std::vector<float> row_k;
      row_k.reserve(k);

      //initialize pointer values for element to cluster
      pi.push_back(k);
      lambda.push_back(std::numeric_limits<float>::max());

      // get the right distances
      for (Size i = 0; i < k; ++i)
      {
        row_k.push_back(original_distance.getValue(i, k));
      }

      //calculate pointer values for element k
      for (Size i = 0; i < k; ++i)
      {
        if (lambda[i] >= row_k[i])
        {
          row_k[pi[i]] = std::min(row_k[pi[i]], lambda[i]);
          lambda[i] = row_k[i];
          pi[i] = k;
        }
        else
        {
          row_k[pi[i]] = std::min(row_k[pi[i]], row_k[i]);
        }
      }

      //update clustering if necessary
      for (Size i = 0; i < k; ++i)
      {
        if (lambda[i] >= lambda[pi[i]])
        {
          pi[i] = k;
        }
      }
      setProgress(k);
    }

    for (Size i = 0; i < pi.size() - 1; ++i)
    {
      //strict order is always kept in algorithm: i < pi[i]
      cluster_tree.push_back(BinaryTreeNode(i, pi[i], lambda[i]));
      //~ std::cout << i << '\n' << pi[i] << '\n' << lambda[i] << std::endl;
    }

    //sort pre-tree
    std::sort(cluster_tree.begin(), cluster_tree.end(), compareBinaryTreeNode);

    // convert -pre-tree to correct format
    for (Size i = 0; i < cluster_tree.size(); ++i)
    {
      if (cluster_tree[i].right_child < cluster_tree[i].left_child)
      {
        std::swap(cluster_tree[i].left_child, cluster_tree[i].right_child);
      }
      for (Size k = i + 1; k < cluster_tree.size(); ++k)
      {
        if (cluster_tree[k].left_child == cluster_tree[i].right_child)
        {
          cluster_tree[k].left_child = cluster_tree[i].left_child;
        }
        else if (cluster_tree[k].left_child > cluster_tree[i].right_child)
        {
          --cluster_tree[k].left_child;
        }
        if (cluster_tree[k].right_child == cluster_tree[i].right_child)
        {
          cluster_tree[k].right_child = cluster_tree[i].left_child;
        }
        else if (cluster_tree[k].right_child > cluster_tree[i].right_child)
        {
          --cluster_tree[k].right_child;
        }
      }

    }
    //~ prepare to redo clustering to get all indices for binarytree in min index element representation
    std::vector<std::set<Size> > clusters(original_distance.dimensionsize());
    for (Size i = 0; i < original_distance.dimensionsize(); ++i)
    {
      clusters[i].insert(i);
    }
    for (Size cluster_step = 0; cluster_step < cluster_tree.size(); ++cluster_step)
    {
      Size new_left_child = *(clusters[cluster_tree[cluster_step].left_child].begin());
      Size new_right_child = *(clusters[cluster_tree[cluster_step].right_child].begin());
      clusters[cluster_tree[cluster_step].left_child].insert(clusters[cluster_tree[cluster_step].right_child].begin(), clusters[cluster_tree[cluster_step].right_child].end());
      clusters.erase(clusters.begin() + cluster_tree[cluster_step].right_child);
      std::swap(cluster_tree[cluster_step].left_child, new_left_child);
      std::swap(cluster_tree[cluster_step].right_child, new_right_child);
      if (cluster_tree[cluster_step].left_child > cluster_tree[cluster_step].right_child)
      {
        std::swap(cluster_tree[cluster_step].left_child, cluster_tree[cluster_step].right_child);
      }
    }

    endProgress();
  }