void AverageLinkage::operator()(DistanceMatrix<float> & original_distance, std::vector<BinaryTreeNode> & cluster_tree, const float threshold /*=1*/) const { // input MUST have >= 2 elements! if (original_distance.dimensionsize() < 2) { throw ClusterFunctor::InsufficientInput(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "Distance matrix to start from only contains one element"); } std::vector<std::set<Size> > clusters(original_distance.dimensionsize()); for (Size i = 0; i < original_distance.dimensionsize(); ++i) { clusters[i].insert(i); } cluster_tree.clear(); cluster_tree.reserve(original_distance.dimensionsize() - 1); // Initial minimum-distance pair original_distance.updateMinElement(); std::pair<Size, Size> min = original_distance.getMinElementCoordinates(); Size overall_cluster_steps(original_distance.dimensionsize()); startProgress(0, original_distance.dimensionsize(), "clustering data"); while (original_distance(min.second, min.first) < threshold) { //grow the tree cluster_tree.push_back(BinaryTreeNode(*(clusters[min.second].begin()), *(clusters[min.first].begin()), original_distance(min.first, min.second))); if (cluster_tree.back().left_child > cluster_tree.back().right_child) { std::swap(cluster_tree.back().left_child, cluster_tree.back().right_child); } if (original_distance.dimensionsize() > 2) { //pick minimum-distance pair i,j and merge them //calculate parameter for lance-williams formula float alpha_i = (float)(clusters[min.first].size() / (float)(clusters[min.first].size() + clusters[min.second].size())); float alpha_j = (float)(clusters[min.second].size() / (float)(clusters[min.first].size() + clusters[min.second].size())); //~ std::cout << alpha_i << '\t' << alpha_j << std::endl; //pushback elements of second to first (and then erase second) clusters[min.second].insert(clusters[min.first].begin(), clusters[min.first].end()); // erase first one clusters.erase(clusters.begin() + min.first); //update original_distance matrix //average linkage: new distance between clusters is the minimum distance between elements of each cluster //lance-williams update for d((i,j),k): (m_i/m_i+m_j)* d(i,k) + (m_j/m_i+m_j)* d(j,k) ; m_x is the number of elements in cluster x for (Size k = 0; k < min.second; ++k) { float dik = original_distance.getValue(min.first, k); float djk = original_distance.getValue(min.second, k); original_distance.setValueQuick(min.second, k, (alpha_i * dik + alpha_j * djk)); } for (Size k = min.second + 1; k < original_distance.dimensionsize(); ++k) { float dik = original_distance.getValue(min.first, k); float djk = original_distance.getValue(min.second, k); original_distance.setValueQuick(k, min.second, (alpha_i * dik + alpha_j * djk)); } //reduce original_distance.reduce(min.first); //update minimum-distance pair original_distance.updateMinElement(); //get min-pair from triangular matrix min = original_distance.getMinElementCoordinates(); } else { break; } setProgress(overall_cluster_steps - original_distance.dimensionsize()); //repeat until only two cluster remains, last step skips matrix operations } //fill tree with dummy nodes Size sad(*clusters.front().begin()); for (Size i = 1; (i < clusters.size()) && (cluster_tree.size() < cluster_tree.capacity()); ++i) { cluster_tree.push_back(BinaryTreeNode(sad, *clusters[i].begin(), -1.0)); } endProgress(); }
void CompleteLinkage::operator()(DistanceMatrix<float> & original_distance, std::vector<BinaryTreeNode> & cluster_tree, const float threshold /*=1*/) const { // attention: clustering process is done by clustering the indices // pointing to elements in inputvector and distances in inputmatrix // input MUST have >= 2 elements! if (original_distance.dimensionsize() < 2) { throw ClusterFunctor::InsufficientInput(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "Distance matrix to start from only contains one element"); } std::vector<std::set<Size> > clusters(original_distance.dimensionsize()); for (Size i = 0; i < original_distance.dimensionsize(); ++i) { clusters[i].insert(i); } cluster_tree.clear(); cluster_tree.reserve(original_distance.dimensionsize() - 1); // Initial minimum-distance pair original_distance.updateMinElement(); std::pair<Size, Size> min = original_distance.getMinElementCoordinates(); Size overall_cluster_steps(original_distance.dimensionsize()); startProgress(0, original_distance.dimensionsize(), "clustering data"); while (original_distance(min.first, min.second) < threshold) { //grow the tree cluster_tree.push_back(BinaryTreeNode(*(clusters[min.second].begin()), *(clusters[min.first].begin()), original_distance(min.first, min.second))); if (cluster_tree.back().left_child > cluster_tree.back().right_child) { std::swap(cluster_tree.back().left_child, cluster_tree.back().right_child); } if (original_distance.dimensionsize() > 2) { //pick minimum-distance pair i,j and merge them //pushback elements of second to first (and then erase second) clusters[min.second].insert(clusters[min.first].begin(), clusters[min.first].end()); // erase first one clusters.erase(clusters.begin() + min.first); //update original_distance matrix //complete linkage: new distance between clusters is the minimum distance between elements of each cluster //lance-williams update for d((i,j),k): 0.5* d(i,k) + 0.5* d(j,k) + 0.5* |d(i,k)-d(j,k)| for (Size k = 0; k < min.second; ++k) { float dik = original_distance.getValue(min.first, k); float djk = original_distance.getValue(min.second, k); original_distance.setValueQuick(min.second, k, (0.5f * dik + 0.5f * djk + 0.5f * std::fabs(dik - djk))); } for (Size k = min.second + 1; k < original_distance.dimensionsize(); ++k) { float dik = original_distance.getValue(min.first, k); float djk = original_distance.getValue(min.second, k); original_distance.setValueQuick(k, min.second, (0.5f * dik + 0.5f * djk + 0.5f * std::fabs(dik - djk))); } //reduce original_distance.reduce(min.first); //update minimum-distance pair original_distance.updateMinElement(); //get new min-pair min = original_distance.getMinElementCoordinates(); } else { break; } setProgress(overall_cluster_steps - original_distance.dimensionsize()); //repeat until only two cluster remains or threshold exceeded, last step skips matrix operations } //fill tree with dummy nodes Size sad(*clusters.front().begin()); for (Size i = 1; i < clusters.size() && (cluster_tree.size() < cluster_tree.capacity()); ++i) { cluster_tree.push_back(BinaryTreeNode(sad, *clusters[i].begin(), -1.0)); } //~ while(cluster_tree.size() < cluster_tree.capacity()) //~ { //~ cluster_tree.push_back(BinaryTreeNode(0,1,-1.0)); //~ } endProgress(); }
void SingleLinkage::operator()(DistanceMatrix<float> & original_distance, std::vector<BinaryTreeNode> & cluster_tree, const float threshold /*=1*/) const { // input MUST have >= 2 elements! if (original_distance.dimensionsize() < 2) { throw ClusterFunctor::InsufficientInput(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "Distance matrix to start from only contains one element"); } cluster_tree.clear(); if (threshold < 1) { LOG_ERROR << "You tried to use Single Linkage clustering with a threshold. This is currently not supported!" << std::endl; throw Exception::NotImplemented(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION); } //SLINK std::vector<Size> pi; pi.reserve(original_distance.dimensionsize()); std::vector<float> lambda; lambda.reserve(original_distance.dimensionsize()); startProgress(0, original_distance.dimensionsize(), "clustering data"); //initialize first pointer values pi.push_back(0); lambda.push_back(std::numeric_limits<float>::max()); for (Size k = 1; k < original_distance.dimensionsize(); ++k) { std::vector<float> row_k; row_k.reserve(k); //initialize pointer values for element to cluster pi.push_back(k); lambda.push_back(std::numeric_limits<float>::max()); // get the right distances for (Size i = 0; i < k; ++i) { row_k.push_back(original_distance.getValue(i, k)); } //calculate pointer values for element k for (Size i = 0; i < k; ++i) { if (lambda[i] >= row_k[i]) { row_k[pi[i]] = std::min(row_k[pi[i]], lambda[i]); lambda[i] = row_k[i]; pi[i] = k; } else { row_k[pi[i]] = std::min(row_k[pi[i]], row_k[i]); } } //update clustering if necessary for (Size i = 0; i < k; ++i) { if (lambda[i] >= lambda[pi[i]]) { pi[i] = k; } } setProgress(k); } for (Size i = 0; i < pi.size() - 1; ++i) { //strict order is always kept in algorithm: i < pi[i] cluster_tree.push_back(BinaryTreeNode(i, pi[i], lambda[i])); //~ std::cout << i << '\n' << pi[i] << '\n' << lambda[i] << std::endl; } //sort pre-tree std::sort(cluster_tree.begin(), cluster_tree.end(), compareBinaryTreeNode); // convert -pre-tree to correct format for (Size i = 0; i < cluster_tree.size(); ++i) { if (cluster_tree[i].right_child < cluster_tree[i].left_child) { std::swap(cluster_tree[i].left_child, cluster_tree[i].right_child); } for (Size k = i + 1; k < cluster_tree.size(); ++k) { if (cluster_tree[k].left_child == cluster_tree[i].right_child) { cluster_tree[k].left_child = cluster_tree[i].left_child; } else if (cluster_tree[k].left_child > cluster_tree[i].right_child) { --cluster_tree[k].left_child; } if (cluster_tree[k].right_child == cluster_tree[i].right_child) { cluster_tree[k].right_child = cluster_tree[i].left_child; } else if (cluster_tree[k].right_child > cluster_tree[i].right_child) { --cluster_tree[k].right_child; } } } //~ prepare to redo clustering to get all indices for binarytree in min index element representation std::vector<std::set<Size> > clusters(original_distance.dimensionsize()); for (Size i = 0; i < original_distance.dimensionsize(); ++i) { clusters[i].insert(i); } for (Size cluster_step = 0; cluster_step < cluster_tree.size(); ++cluster_step) { Size new_left_child = *(clusters[cluster_tree[cluster_step].left_child].begin()); Size new_right_child = *(clusters[cluster_tree[cluster_step].right_child].begin()); clusters[cluster_tree[cluster_step].left_child].insert(clusters[cluster_tree[cluster_step].right_child].begin(), clusters[cluster_tree[cluster_step].right_child].end()); clusters.erase(clusters.begin() + cluster_tree[cluster_step].right_child); std::swap(cluster_tree[cluster_step].left_child, new_left_child); std::swap(cluster_tree[cluster_step].right_child, new_right_child); if (cluster_tree[cluster_step].left_child > cluster_tree[cluster_step].right_child) { std::swap(cluster_tree[cluster_step].left_child, cluster_tree[cluster_step].right_child); } } endProgress(); }