// Decides whether the sets currently containing x and y may be united.
//
// Two rules are applied to the set sizes reported by uf.set_size():
//   1. The combined size must not exceed the hard limit (2500).
//   2. If one set is larger than three quarters of the hard limit, the
//      other one must hold at most 50 elements, i.e. only "almost"
//      singletons may be attached to a moderately large set.
//
// Returns true when both rules allow the merge.
static bool canMerge(const ConcurrentDSU &uf, unsigned x, unsigned y) {
  const size_t hardLimit = 2500;
  const size_t largeLimit = hardLimit * 3 / 4;
  const size_t smallLimit = 50;

  const size_t sizeX = uf.set_size(x);
  const size_t sizeY = uf.set_size(y);

  // Rule 1: no set may grow beyond the hard limit.
  if (sizeX + sizeY > hardLimit)
    return false;

  // Rule 2: a moderately large set only accepts near-singleton partners.
  const bool xLargeYNotSmall = sizeX > largeLimit && sizeY > smallLimit;
  const bool yLargeXNotSmall = sizeY > largeLimit && sizeX > smallLimit;
  return !(xLargeYNotSmall || yLargeXNotSmall);
}
// Examines every unordered pair of entries in `block` and unites the two
// entries in `uf` when all of the following hold, checked in this order:
//   - they are not already in the same set,
//   - canMerge() permits joining their sets,
//   - hamdistKMer() of their k-mers (looked up in `data`) is at most `tau`.
//
// Block entries are narrowed from size_t to unsigned before use.
// NOTE(review): presumably hamdistKMer stops counting once the distance
// exceeds tau - confirm against its definition.
static void processBlockQuadratic(ConcurrentDSU &uf,
                                  const std::vector<size_t> &block,
                                  const KMerData &data, unsigned tau) {
  const size_t count = block.size();
  for (size_t first = 0; first < count; ++first) {
    const unsigned x = static_cast<unsigned>(block[first]);
    hammer::KMer kmerFirst = data.kmer(x);

    for (size_t second = first + 1; second < count; ++second) {
      const unsigned y = static_cast<unsigned>(block[second]);
      hammer::KMer kmerSecond = data.kmer(y);

      // Set membership is re-queried for every pair because earlier
      // unite() calls may have changed it.
      if (uf.find_set(x) == uf.find_set(y))
        continue;
      if (!canMerge(uf, x, y))
        continue;
      if (hamdistKMer(kmerFirst, kmerSecond, tau) > tau)
        continue;

      uf.unite(x, y);
    }
  }
}
// Builds the clustering of reads_ and returns it as a Clusterization.
//
// Phase 1 (OpenMP parallel loop): for each read, every candidate returned
// by read_index_ is considered. If the read and the candidate are in
// different clusters and the sequences stored for the two cluster
// representatives match according to seq_comparer_, the clusters are united
// and the superstring of both sequences is stored under the new
// representative.
//
// Phase 2 (sequential): every read is assigned to its final cluster, and for
// each entry of read_seq_map_ whose key is still a cluster representative
// the stored sequence and the cluster size are added to the result.
//
// NOTE(review): clusters_ and read_seq_map_ are read and written inside the
// parallel loop with no synchronization visible in this function - confirm
// that both are safe for concurrent access.
Clusterization CostructClusters() {
    InitializeReadSequenceMap();

#pragma omp parallel for
    for (size_t read_idx = 0; read_idx < reads_.size(); ++read_idx) {
        auto candidates = read_index_.GetCandidatesFor(read_idx);
        for (auto &&candidate : candidates) {
            size_t first_root = clusters_.find_set(read_idx);
            size_t second_root = clusters_.find_set(candidate);
            if (first_root == second_root)
                continue;

            // Work on copies: the map entry may be overwritten below.
            string first_seq = read_seq_map_[first_root];
            string second_seq = read_seq_map_[second_root];
            auto match_info = seq_comparer_.SequencesMatch(first_seq, second_seq);
            if (!match_info.match)
                continue;

            clusters_.unite(read_idx, candidate);
            string merged_seq = GetSuperString(first_seq, second_seq, match_info);
            size_t merged_root = clusters_.find_set(read_idx);
            read_seq_map_[merged_root] = merged_seq;
        }
    }

    cout << clusters_.num_sets() << " clusters were constructed" << endl;

    Clusterization result(reads_);
    for (size_t read_idx = 0; read_idx < reads_.size(); ++read_idx) {
        result.Add(read_idx, clusters_.find_set(read_idx));
    }

    // Only entries keyed by a current representative describe a cluster.
    for (auto &&entry : read_seq_map_) {
        size_t root = clusters_.find_set(entry.first);
        if (entry.first != root)
            continue;
        result.AddClusterSequence(root, entry.second, clusters_.set_size(root));
    }

    assert(result.ClustersSize() == clusters_.num_sets());
    return result;
}
// Returns true when the sets containing x and y may be united, i.e. when
// their combined size (as reported by uf.set_size()) stays strictly below
// 10000 elements.
static bool canMerge(const ConcurrentDSU &uf, unsigned x, unsigned y) {
  const int mergedSizeLimit = 10000;
  const auto mergedSize = uf.set_size(x) + uf.set_size(y);
  return mergedSize < mergedSizeLimit;
}