static bool canMerge(const ConcurrentDSU &uf, unsigned x, unsigned y) {
  size_t szx = uf.set_size(x), szy = uf.set_size(y);
  const size_t hardthr = 2500;

  // Global threshold - no cluster larger than hard threshold
  if (szx + szy > hardthr)
    return false;

  // If one of the clusters is moderately large, then attach "almost" singletons
  // only.
  if ((szx > hardthr * 3 / 4 && szy > 50) ||
      (szy > hardthr * 3 / 4 && szx > 50))
    return false;

  return true;
}
Clusterization CostructClusters() {
  InitializeReadSequenceMap();

  // Greedily merge clusters of reads whose representative sequences match.
  // Candidate pairs come from read_index_; merging goes through the
  // disjoint-set structure clusters_. The loop iterations run in parallel,
  // so clusters_ and the updates of read_seq_map_ are expected to be safe
  // under concurrent access.
#pragma omp parallel for
  for(size_t i = 0; i < reads_.size(); i++) {
    auto candidates = read_index_.GetCandidatesFor(i);
    for(auto it = candidates.begin(); it != candidates.end(); it++) {
      size_t cluster1 = clusters_.find_set(i);
      size_t cluster2 = clusters_.find_set(*it);
      if(cluster1 != cluster2) {
        string read_seq1 = read_seq_map_[cluster1];
        string read_seq2 = read_seq_map_[cluster2];
        auto comparison_result = seq_comparer_.SequencesMatch(read_seq1, read_seq2);
        if(comparison_result.match) {
          // Merge the two clusters and store the superstring of their
          // representative sequences as the new representative.
          clusters_.unite(i, *it);
          string superstring = GetSuperString(read_seq1, read_seq2, comparison_result);
          size_t new_cluster = clusters_.find_set(i);
          read_seq_map_[new_cluster] = superstring;
        }
      }
    }
  }
  cout << clusters_.num_sets() << " clusters were constructed" << endl;

  // Record, for every read, the id of the cluster it ended up in.
  Clusterization result(reads_);
  for(size_t i = 0; i < reads_.size(); i++)
    result.Add(i, clusters_.find_set(i));

  // Keep the representative sequences of cluster roots only.
  for(auto it = read_seq_map_.begin(); it != read_seq_map_.end(); it++) {
    size_t cluster_id = clusters_.find_set(it->first);
    if(it->first == cluster_id) {
      result.AddClusterSequence(cluster_id, it->second,
                                clusters_.set_size(cluster_id));
    }
  }
  assert(result.ClustersSize() == clusters_.num_sets());
  return result;
}
static bool canMerge(const ConcurrentDSU &uf, unsigned x, unsigned y) {
  return (uf.set_size(x) + uf.set_size(y)) < 10000;
}
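// A minimal sketch (not part of the original code) of how a size-based
// canMerge() policy like the ones above could gate unite() calls during
// clustering; the excerpt itself does not show where canMerge() is invoked,
// so this wiring is an assumption. ToyDSU is a hypothetical single-threaded
// stand-in for ConcurrentDSU exposing only the find_set/unite/set_size
// operations used above, and candidate_pairs is illustrative (in the real
// pipeline candidates come from something like read_index_.GetCandidatesFor(i)).
#include <cstddef>
#include <iostream>
#include <numeric>
#include <utility>
#include <vector>

struct ToyDSU {
  std::vector<size_t> parent, size;
  explicit ToyDSU(size_t n) : parent(n), size(n, 1) {
    std::iota(parent.begin(), parent.end(), size_t(0));
  }
  // Find the root of x with path halving.
  size_t find_set(size_t x) {
    while (parent[x] != x)
      x = parent[x] = parent[parent[x]];
    return x;
  }
  size_t set_size(size_t x) { return size[find_set(x)]; }
  // Union by size.
  void unite(size_t x, size_t y) {
    x = find_set(x); y = find_set(y);
    if (x == y) return;
    if (size[x] < size[y]) std::swap(x, y);
    parent[y] = x;
    size[x] += size[y];
  }
};

// Same shape as the simple size-cap policy above, restated over the toy DSU.
static bool canMergeToy(ToyDSU &uf, size_t x, size_t y) {
  return uf.set_size(x) + uf.set_size(y) < 10000;
}

int main() {
  ToyDSU uf(6);
  // Hypothetical candidate pairs of reads proposed for merging.
  std::vector<std::pair<size_t, size_t>> candidate_pairs = {
      {0, 1}, {1, 2}, {3, 4}, {4, 5}};
  for (auto &p : candidate_pairs) {
    size_t cx = uf.find_set(p.first), cy = uf.find_set(p.second);
    if (cx != cy && canMergeToy(uf, cx, cy))  // size check before uniting
      uf.unite(p.first, p.second);
  }
  std::cout << "cluster of read 0 has size " << uf.set_size(0) << std::endl;
  return 0;
}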