/**
 * Reads TestTrees/AS_CAIDA_2008.txt into a VectorGraph and checks the vertex
 * clustering coefficient of vertices 3, 174, 4 and 23148 against reference
 * values, using an absolute tolerance of 0.001.
 */
TEST_F(ClusteringCoefficientTest, RealWorldTest)
{
    // Automatic storage instead of a never-deleted `new VectorGraph`: the graph
    // is released on every exit path, including the early return performed by a
    // failing ASSERT_*.
    VectorGraph g(false, false);
    GraphReader<VectorGraph, Vertex> graphReader;
    graphReader.read(g, "TestTrees/AS_CAIDA_2008.txt");

    // NOTE(review): the coefficient type is instantiated for ListGraph while the
    // graph read above is a VectorGraph; only the per-vertex API is used here.
    typedef ClusteringCoefficient<ListGraph, Vertex> Clustering;
    typedef Clustering::Coefficient Coef;
    Clustering clustering;
    Coef epsilon = 0.001;

    Vertex* x = g.getVertexById(3);
    Coef c = clustering.vertexClusteringCoefficient(x);
    ASSERT_TRUE(fabs(c - 0.6666667) < epsilon);

    x = g.getVertexById(174);
    c = clustering.vertexClusteringCoefficient(x);
    ASSERT_TRUE(fabs(c - 0.01141431) < epsilon);

    x = g.getVertexById(4);
    c = clustering.vertexClusteringCoefficient(x);
    /* This is 1.0 in Network Workbench, but because of how ComplexNets works
     * it throws an error when supporting multiedges, so we either accept
     * this value, or we don't throw an exception when reading multiedges. */
    ASSERT_TRUE(fabs(c - 1.0) < epsilon);

    x = g.getVertexById(23148);
    c = clustering.vertexClusteringCoefficient(x);
    ASSERT_TRUE(fabs(c - 0.3583333) < epsilon);
}
virtual double compute( const Clustering& c1, const Clustering& c2 ) const { size_t c1_sum2 = 0; size_t n = 0; for (size_t i=0; i < c1.size(); i++) { c1_sum2 += c1[i].size() * c1[i].size(); n += c1[i].size(); } size_t c2_sum2 = 0; for (size_t i=0; i < c2.size(); i++) { c2_sum2 += c2[i].size() * c2[i].size(); } size_t c1c2_sum2 = 0; for (size_t i=0; i < c1.size(); i++) { for (size_t j=0; j < c2.size(); j++) { size_t size; std::set_intersection( c1[i].begin(), c1[i].end(), c2[j].begin(), c2[j].end(), counter(size) ); c1c2_sum2 += size * size; } } return ( c1_sum2 + c2_sum2 - (2 * c1c2_sum2) ) / (double)(n*n); }
/**
 * Builds a five-vertex graph in which vertex 1 is linked to vertices 2..5 and
 * no other edges exist, then checks that both the per-vertex clustering
 * coefficient of vertex 1 and the graph-level coefficient for degree 4 are
 * zero within a tolerance of 0.001.
 */
TEST_F(ClusteringCoefficientTest, AcyclicGraphTest)
{
    typedef ClusteringCoefficient<ListGraph, Vertex> Clustering;
    typedef Clustering::Coefficient Coef;

    ListGraph starGraph;

    // Centre vertex
    Vertex* hub = new Vertex(1);

    // Vertices attached to the centre
    Vertex* leafA = new Vertex(2);
    Vertex* leafB = new Vertex(3);
    Vertex* leafC = new Vertex(4);
    Vertex* leafD = new Vertex(5);

    starGraph.addVertex(hub);
    starGraph.addVertex(leafA);
    starGraph.addVertex(leafB);
    starGraph.addVertex(leafC);
    starGraph.addVertex(leafD);

    starGraph.addEdge(hub, leafA);
    starGraph.addEdge(hub, leafB);
    starGraph.addEdge(hub, leafC);
    // The last edge is deliberately given with the endpoints the other way round.
    starGraph.addEdge(leafD, hub);

    Clustering clustering;
    Coef tolerance = 0.001;

    Coef hubCoefficient = clustering.vertexClusteringCoefficient(hub);
    ASSERT_TRUE(fabs(hubCoefficient - 0.0) < tolerance);

    Coef degreeCoefficient = clustering.clusteringCoefficient(starGraph, Vertex::Degree(4));
    ASSERT_TRUE(fabs(degreeCoefficient - 0.0) < tolerance);
}
/**
 * Generates a synthetic data set.
 *
 * Creates nbrClusters * clustSize integer variables (named "0", "1", ...) with
 * values in [0, cardinality-1], groups consecutive variable indices into
 * nbrClusters clusters of clustSize items each, builds a joint distribution
 * via createClusteringJointDist, and draws nbrInds samples from it.
 *
 * @return the result of Transpose() applied to the matrix whose rows are the
 *         drawn samples (one column per variable before transposition).
 */
MatrixPtr GenerateClusteredData::operator()()
{
    // One variable per (cluster, item) pair, named after its running index.
    Variables variables;
    for (size_t idx = 0; idx < nbrClusters * clustSize; ++idx)
    {
        const std::string label = boost::lexical_cast<std::string>(idx);
        variables ^= Variable( label, plIntegerType(0, cardinality-1) );
    }

    // Cluster k owns the variable indices [k*clustSize, (k+1)*clustSize).
    Clustering clustering;
    clustering.reserve(nbrClusters);
    size_t nextIndex = 0;
    for (size_t k = 0; k < nbrClusters; ++k)
    {
        Cluster members;
        for (size_t slot = 0; slot < clustSize; ++slot, ++nextIndex)
        {
            members.push_back( nextIndex );
        }
        clustering.push_back( members );
    }

    plJointDistribution jointDist = createClusteringJointDist( variables, clustering );
    plValues sample( variables );

    // Each drawn sample becomes one row of the (pre-transposition) matrix.
    auto samples = std::make_shared<Matrix>();
    samples->reserve(nbrInds);
    const size_t nbrVars = variables.size();
    for (size_t drawn = 0; drawn < nbrInds; ++drawn)
    {
        jointDist.draw(sample);
        std::vector<int> row(nbrVars);
        for (size_t col = 0; col < nbrVars; ++col)
        {
            row[col] = sample[variables[col]];
        }
        samples->push_back(row);
    }

    return Transpose(*samples);
}
void Node::iteration(bool last) { double bestGain = 0.0; Clustering bestClustering; bestClustering.copy(&clustering); Clustering backupClustering; backupClustering.copy(&clustering); buildValuesOverClusters(); for(int i = 0; i < NUMBER_OF_RANDOM_RESTARTS; i++) { cout << "Random restart #" << i << endl; if(clusteringType() != FLAT) clustering.splitOrMerge(last); if(clustering.size() == 1) { cout << "Warning: there is only one cluster in " << name << endl; continue; } buildClustersOverClusters(); double gain = calculateInformationGain(); cout << "Initial value of objective function: " << gain << endl; for(int j = 0; j < NUMBER_OF_CONVERGENT_STEPS; j++) { cout << "Convergent step: " << j << endl; gain += clustering.correctionLoop(); } if(gain > bestGain) { bestGain = gain; cout << "Best gain: " << gain << endl; bestClustering.copy(&clustering); } clustering.copy(&backupClustering); } clustering.copy(&bestClustering); }
/**
 * Builds a joint distribution over `variables` from per-cluster pieces.
 *
 * Each cluster of `clustering` is passed to createClusterJointDist and the
 * results are accumulated, in cluster order, into one computable-object list
 * from which the joint distribution is constructed.
 *
 * @param variables  variables the joint distribution is defined over.
 * @param clustering clusters to create one component for each.
 * @return the joint distribution built from all per-cluster components.
 */
plJointDistribution createClusteringJointDist( const Variables& variables,
                                               const Clustering& clustering )
{
    plComputableObjectList components;
    for ( const auto& cluster : clustering )
    {
        components *= createClusterJointDist( variables, cluster );
    }

    plJointDistribution joint( variables, components );
    return joint;
}
/**
 * Visits every node once, starting a new cluster at each unvisited node for
 * which met->getNeighbors reports success, and growing it with expand().
 *
 * Progress is written to cout: the node count first, then a percentage line
 * each time the processed fraction reaches the next 5% mark.
 *
 * Cluster ids handed out by this function start at 1. After the scan, `c` is
 * resized to the final id counter and filled via met->expand(c, cluster).
 *
 * @param c receives the resulting clustering.
 * @return the id counter after the scan, i.e. one more than the number of
 *         clusters started here.
 */
unsigned int DBSCAN::run(Clustering &c)
{
    cout << "db node_count: " << node_count << endl;

    vector<unsigned int> neighbours;
    unsigned int clusterID = 1;
    double nextReportAt = 0.0;

    for (unsigned int nodeID = 0; nodeID < node_count; ++nodeID)
    {
        // Progress report.
        const double fractionDone = double(nodeID) / node_count;
        if (fractionDone >= nextReportAt)
        {
            cout << fractionDone * 100 << " %" << endl;
            nextReportAt += .05;
        }

        if (visited[nodeID])
        {
            continue;
        }
        visited[nodeID] = true;

        neighbours.clear();
        if (!met->getNeighbors(nodeID, neighbours))
        {
            continue;
        }

        // This node seeds a new cluster; expand() grows it from the neighbours.
        cluster[nodeID] = clusterID;
        expand(neighbours, clusterID);
        ++clusterID;
    }

    c.resize(clusterID);
    met->expand(c, cluster);
    return clusterID;
}
/**
 * Seeds the dynamic communities from the first step clustering.
 *
 * Clears m_dynamic, then creates one DynamicCluster for every step cluster
 * whose size is at least MIN_CLUSTER_SIZE. Smaller clusters are skipped but
 * still consume an index, so the index passed to update() is the cluster's
 * position in `step_clustering`.
 *
 * @param step_clustering clusters found at the current step.
 * @return always true.
 */
bool MatchingDynamicClusterer::bootstrap( Clustering &step_clustering )
{
	m_dynamic.clear();

	int position = 0;
	const Clustering::iterator stop = step_clustering.end();
	for( Clustering::iterator current = step_clustering.begin(); current != stop; ++current, ++position )
	{
		// Only clusters that reach the minimum size give birth to a community.
		if( current->size() >= MIN_CLUSTER_SIZE )
		{
			DynamicCluster born;
			born.update( m_step, position, *current );
			m_dynamic.push_back(born);
#ifdef DEBUG_MATCHING
			cout << "T" << m_step << ": Birth: Community M" << m_dynamic.size() << endl;
#endif
		}
	}
	return true;
}
/**
 * Matches the clusters of a new time step against the existing dynamic
 * communities and updates m_dynamic accordingly.
 *
 * The first call (m_step becomes 1) delegates to bootstrap(). Later calls:
 *  1. build a node -> {dynamic community index} map from the front cluster of
 *     every community that is not dead (dead ones get size 0 and are ignored);
 *  2. for each step cluster of at least MIN_CLUSTER_SIZE nodes, count its
 *     overlap with every community and keep those whose similarity exceeds
 *     m_threshold (overlap / min size when SIM_OVERLAP is defined, otherwise
 *     overlap / union size);
 *  3. a step cluster with no match starts a new community; matched pairs are
 *     applied afterwards: the first match for a community updates it, any
 *     further match for the same community creates a new one from it;
 *  4. all newly created communities are appended to m_dynamic.
 *
 * The scratch buffers are vectors rather than new[]/delete[] arrays so that
 * they are released even if one of the container operations below throws.
 *
 * @param step_clustering clusters found at the current step.
 * @return always true (or bootstrap()'s result on the first step).
 */
bool MapMatchingDynamicClusterer::add_clustering( Clustering &step_clustering )
{
	m_step += 1;
	/// First?
	if( m_step == 1 )
	{
		return bootstrap(step_clustering);
	}

	/// Build a map of Nodes -> Dynamic Communities containing those nodes
	map<NODE,set<int> > fastmap;
	const int dyn_count = (int)m_dynamic.size();
	// Value-initialised to 0; a 0 entry marks a dead community.
	vector<long> dyn_sizes( dyn_count + 1 );
	{
		int dyn_index = 0;
		DynamicClustering::iterator dend = m_dynamic.end();
		for( DynamicClustering::iterator dit = m_dynamic.begin(); dit != dend; ++dit, ++dyn_index )
		{
			// Dead?
			if( m_death_age > 0 && m_dynamic[dyn_index].is_dead( m_step, m_death_age ) )
			{
				dyn_sizes[dyn_index] = 0;
				continue;
			}
			Cluster& front = (*dit).front();
			dyn_sizes[dyn_index] = (long)front.size();
			Cluster::const_iterator fend = front.end();
			for( Cluster::const_iterator fit = front.begin(); fit != fend; ++fit )
			{
				NODE node_index = *fit;
				// operator[] creates the empty set on first sight of a node.
				fastmap[node_index].insert(dyn_index);
			}
		}
	}

	/// Now try to match all
	vector<int> all_intersection( dyn_count + 1 );
	vector<DynamicCluster> fresh;
	PairVector matched_pairs;
	const map<NODE,set<int> >::const_iterator mend = fastmap.end();
	int step_cluster_index = 0;
	Clustering::iterator cend = step_clustering.end();
	for( Clustering::iterator cit = step_clustering.begin(); cit != cend; ++cit, ++step_cluster_index )
	{
		long size_step = (long)(*cit).size();
		if( size_step < MIN_CLUSTER_SIZE )
		{
			continue;
		}

		// Compute all intersections
		for( int dyn_index = 0; dyn_index < dyn_count; dyn_index++ )
		{
			all_intersection[dyn_index] = 0;
		}
		Cluster::const_iterator xend = (*cit).end();
		for( Cluster::const_iterator xit = (*cit).begin(); xit != xend; ++xit )
		{
			NODE node_index = *xit;
			map<NODE,set<int> >::const_iterator mit = fastmap.find(node_index);
			if( mit == mend )
			{
				continue;
			}
			// Reuse the iterator from find() instead of looking the node up again.
			const set<int>& owners = mit->second;
			for( set<int>::const_iterator sit = owners.begin(); sit != owners.end(); ++sit )
			{
				all_intersection[(*sit)]++;
			}
		}

		// Find matches
		vector<int> matches;
		for( int dyn_index = 0; dyn_index < dyn_count; dyn_index++ )
		{
			if( dyn_sizes[dyn_index] == 0 || all_intersection[dyn_index] == 0 )
			{
				continue;
			}
#ifdef SIM_OVERLAP
			double sim = ((double)(all_intersection[dyn_index]))/min(size_step,dyn_sizes[dyn_index]);
#else
			double sim = ((double)(all_intersection[dyn_index]))/(size_step+dyn_sizes[dyn_index]-all_intersection[dyn_index]);
#endif
			if( sim > m_threshold )
			{
				matches.push_back( dyn_index );
			}
		}

		// new community?
		if( matches.empty() )
		{
			DynamicCluster dc;
			dc.update( m_step, step_cluster_index, *cit );
			fresh.push_back(dc);
#ifdef DEBUG_MATCHING
			cout << "T" << m_step << ": Birth: Community M" << (m_dynamic.size()+fresh.size()) << " from C" << step_cluster_index+1 << endl;
#endif
		}
		else
		{
			vector<int>::const_iterator iit;
			for( iit = matches.begin() ; iit != matches.end(); iit++ )
			{
				pair<int,int> p(step_cluster_index,(*iit));
				matched_pairs.push_back(p);
			}
		}
	}

	// Actually update existing dynamic communities now
	set<int> matched_dynamic;
	PairVector::const_iterator pit;
	for( pit = matched_pairs.begin(); pit != matched_pairs.end(); pit++ )
	{
		const int matched_step_index = (*pit).first;
		const int dyn_cluster_index = (*pit).second;
		// already processed this dynamic cluster?
		if( matched_dynamic.count( dyn_cluster_index ) )
		{
			DynamicCluster dc( m_dynamic[dyn_cluster_index], m_step, matched_step_index, step_clustering[matched_step_index] );
			fresh.push_back(dc);
#ifdef DEBUG_MATCHING
			cout << "T" << m_step << ": Split: Matched C" << (matched_step_index+1) << " to M" << (dyn_cluster_index+1) << ". Splitting to M" << (m_dynamic.size()+fresh.size()) << endl;
#endif
		}
		else
		{
#ifdef DEBUG_MATCHING
			cout << "T" << m_step << ": Continuation: Matched C" << (matched_step_index+1) << " to M" << (dyn_cluster_index+1) << endl;
#endif
			m_dynamic[dyn_cluster_index].update( m_step, matched_step_index, step_clustering[matched_step_index] );
			matched_dynamic.insert(dyn_cluster_index);
		}
	}

	// And finally add any new dynamic communities
	for( vector<DynamicCluster>::const_iterator nit = fresh.begin(); nit != fresh.end(); ++nit )
	{
		m_dynamic.push_back(*nit);
	}
	return true;
}
/**
 * Matches the clusters of a new time step against the existing dynamic
 * communities and updates m_dynamic accordingly.
 *
 * The first call (m_step becomes 1) delegates to bootstrap(). On later calls
 * every step cluster is passed to find_matches():
 *  - no match: a new community is created from the step cluster;
 *  - one or more matches: each (step cluster, community) pair is recorded and
 *    applied afterwards. The first pair seen for a community updates it in
 *    place; every later pair for the same community creates a new community
 *    derived from it.
 * New communities are appended to m_dynamic only after all pairs are applied.
 *
 * @param step_clustering clusters found at the current step.
 * @return always true (or bootstrap()'s result on the first step).
 */
bool MatchingDynamicClusterer::add_clustering( Clustering &step_clustering )
{
	m_step += 1;
	/// First?
	if( m_step == 1 )
	{
		return bootstrap(step_clustering);
	}

	/// Otherwise, try to match all
	vector<DynamicCluster> fresh;
	PairVector matched_pairs;
	int step_cluster_index = 0;
	const Clustering::iterator step_end = step_clustering.end();
	for( Clustering::iterator step_it = step_clustering.begin(); step_it != step_end; ++step_it, ++step_cluster_index )
	{
		vector<int> candidates;
		find_matches( *step_it, candidates );

		if( !candidates.empty() )
		{
			// Remember every (step cluster, community) pair for the second pass.
			for( vector<int>::const_iterator cand = candidates.begin(); cand != candidates.end(); ++cand )
			{
				matched_pairs.push_back( pair<int,int>( step_cluster_index, *cand ) );
			}
			continue;
		}

		// new community
		DynamicCluster born;
		born.update( m_step, step_cluster_index, *step_it );
		fresh.push_back(born);
#ifdef DEBUG_MATCHING
		cout << "T" << m_step << ": Birth: Community M" << (m_dynamic.size()+fresh.size()) << " from C" << step_cluster_index+1 << endl;
#endif
	}

	// Actually update existing dynamic communities now
	set<int> already_updated;
	for( PairVector::const_iterator pair_it = matched_pairs.begin(); pair_it != matched_pairs.end(); ++pair_it )
	{
		const int step_pos = pair_it->first;
		const int dyn_pos = pair_it->second;

		if( already_updated.find( dyn_pos ) == already_updated.end() )
		{
			// First match for this community: continue it in place.
#ifdef DEBUG_MATCHING
			cout << "T" << m_step << ": Continuation: Matched C" << (step_pos+1) << " to M" << (dyn_pos+1) << endl;
#endif
			m_dynamic[dyn_pos].update( m_step, step_pos, step_clustering[step_pos] );
			already_updated.insert(dyn_pos);
			continue;
		}

		// Community already continued this step: branch a new one off it.
		DynamicCluster branch( m_dynamic[dyn_pos], m_step, step_pos, step_clustering[step_pos] );
		fresh.push_back(branch);
#ifdef DEBUG_MATCHING
		cout << "T" << m_step << ": Split: Matched C" << (step_pos+1) << " to M" << (dyn_pos+1) << ". Splitting to M" << (m_dynamic.size()+fresh.size()) << endl;
#endif
	}

	// And finally add any new dynamic communities
	DynamicClustering::const_iterator added;
	for( added = fresh.begin(); added != fresh.end(); ++added )
	{
		m_dynamic.push_back(*added);
	}
	return true;
}
int main (int argc, const char * argv[]){ struct timeval start, end; gettimeofday(&start, NULL); // general parameters size_t maxSeqLen = 50000; int seqType = Sequence::AMINO_ACIDS; // parameter for the prefiltering int kmerSize = 6; int alphabetSize = 21; size_t maxResListLen = 100; int split = 0; int skip = 0; bool aaBiasCorrection = true; float zscoreThr = 50.0f; float sensitivity = 4.0; // parameters for the alignment double evalThr = 0.001; double covThr = 0.8; int maxAlnNum = 10; std::string lastSeqDB = ""; std::string currentSeqDB = ""; std::string cluDB = ""; std::string outDB = ""; std::string tmpDir = ""; // get the path of the scoring matrix char* mmdir = getenv ("MMDIR"); if (mmdir == 0){ std::cerr << "Please set the environment variable $MMDIR to your MMSEQS installation directory.\n"; exit(1); } std::string scoringMatrixFile(mmdir); scoringMatrixFile = scoringMatrixFile + "/data/blosum62.out"; parseArgs(argc, argv, &lastSeqDB, ¤tSeqDB, &cluDB, &outDB, &tmpDir, &scoringMatrixFile, &maxSeqLen); std::string lastSeqDBIndex = lastSeqDB + ".index"; std::string currentSeqDBIndex = currentSeqDB + ".index"; std::string cluDBIndex = cluDB + ".index"; std::string outDBIndex = outDB + ".index"; std::list<std::string>* tmpFiles = new std::list<std::string>(); std::string AIndex = tmpDir + "/A.index"; std::string BIndex = tmpDir + "/B.index"; tmpFiles->push_back(AIndex); tmpFiles->push_back(BIndex); std::string Brest_indexFile = tmpDir + "/Brest.index"; tmpFiles->push_back(Brest_indexFile); std::string BB_clu = tmpDir + "/BB_clu"; std::string BB_clu_index = BB_clu + ".index"; tmpFiles->push_back(BB_clu); tmpFiles->push_back(BB_clu_index); std::cout << "////////////////////////////////////////////////////////////////////////\n"; std::cout << "/////// Init /////////////\n"; std::cout << "////////////////////////////////////////////////////////////////////////\n"; // extract three indexes: // - A: last database version without deleted sequences // - B: sequences which 
are new in the database writeIndexes(AIndex, BIndex, lastSeqDBIndex, currentSeqDBIndex); std::cout << "////////////////////////////////////////////////////////////////////////\n"; std::cout << "/////// Calculating B->A scores /////////////\n"; std::cout << "////////////////////////////////////////////////////////////////////////\n"; // calculate score for the updating // B->A scores std::string BA_base = runScoresCalculation(currentSeqDB, BIndex, currentSeqDB, AIndex, tmpDir, scoringMatrixFile, maxSeqLen, seqType, kmerSize, alphabetSize, maxResListLen, split, skip, aaBiasCorrection, zscoreThr, sensitivity, evalThr, covThr, maxAlnNum, "BA", tmpFiles); std::cout << "////////////////////////////////////////////////////////////////////////\n"; std::cout << "/////// Adding sequences to existing clusters /////////////\n"; std::cout << "////////////////////////////////////////////////////////////////////////\n"; // update the clustering DBReader* currSeqDbr = new DBReader(currentSeqDB.c_str(), currentSeqDBIndex.c_str()); currSeqDbr->open(DBReader::NOSORT); // data structures for the clustering int seqDBSize = currSeqDbr->getSize(); unsigned int* id2rep = new unsigned int[seqDBSize]; char** rep2cluName = new char*[seqDBSize]; for (int i = 0; i < seqDBSize; i++) rep2cluName[i] = new char[FFINDEX_MAX_ENTRY_NAME_LENTH]; cluster_t* clusters = new cluster_t[seqDBSize]; for (int i = 0; i < seqDBSize; i++){ clusters[i].clu_size = 0; clusters[i].first = 0; clusters[i].last = 0; } std::cout << "Read the existing clustering...\n"; // Read the existing clustering readClustering(currSeqDbr, cluDB, id2rep, rep2cluName, clusters); std::cout << "Append new sequences to the existing clustering...\n"; // append sequences from the new database to the existing clustering based on the B->A alignment scores // write sequences without a match to a separate index (they will be clustered separately) appendToClustering(currSeqDbr, BIndex, BA_base, id2rep, clusters, Brest_indexFile); if 
(seqsWithoutMatches > 0){ std::cout << "////////////////////////////////////////////////////////////////////////\n"; std::cout << "/////// Calculating B->B scores /////////////\n"; std::cout << "////////////////////////////////////////////////////////////////////////\n"; // B->B scores std::string BB_base = runScoresCalculation(currentSeqDB, Brest_indexFile, currentSeqDB, Brest_indexFile, tmpDir, scoringMatrixFile, maxSeqLen, seqType, kmerSize, alphabetSize, maxResListLen, split, skip, aaBiasCorrection, zscoreThr, sensitivity, evalThr, covThr, maxAlnNum, "BB", tmpFiles); std::cout << "////////////////////////////////////////////////////////////////////////\n"; std::cout << "/////// Appending new clusters /////////////\n"; std::cout << "////////////////////////////////////////////////////////////////////////\n"; std::cout << "Cluster new sequences without a match to the existing clusters...\n"; // cluster sequences without a match to the existing clusters separately // use the index generated in the previous step Clustering* clu = new Clustering(currentSeqDB, currentSeqDBIndex, BB_base, BB_base + ".index", BB_clu, BB_clu_index, 0.0, 0, maxResListLen); clu->run(Clustering::SET_COVER); std::cout << "Append generated clusters to the complete clustering...\n"; // append B->B clusters to the clustering newClus = readClustering(currSeqDbr, BB_clu, id2rep, rep2cluName, clusters); } // write new clustering std::cout << "Write clustering results...\n"; writeResults(clusters, rep2cluName, currSeqDbr, seqDBSize, outDB); std::cout << "done.\n"; currSeqDbr->close(); std::cout << "////////////////////////////////////////////////////////////////////////\n"; std::cout << "/////// Statistics ////////\n"; std::cout << "////////////////////////////////////////////////////////////////////////\n"; std::cout << "\nPrevios database version: " << oldDBSize << " entries.\n"; std::cout << "New database vesion : " << newDBSize << " entries.\n"; std::cout << deletedSeqs << " entries were 
deleted,\n"; std::cout << newSeqs << " entries are new,\n"; std::cout << sharedSeqs << " entries are shared.\n\n"; std::cout << seqsWithMatches << " new sequences had matches to the previous database version.\n"; std::cout << "Remaining " << seqsWithoutMatches << " were grouped into " << newClus << " new clusters.\n"; gettimeofday(&end, NULL); int sec = end.tv_sec - start.tv_sec; std::cout << "\nTime for updating: " << (sec / 3600) << " h " << (sec % 3600 / 60) << " m " << (sec % 60) << "s\n\n"; deleteTmpFiles(tmpFiles); delete tmpFiles; }