/**
 * Reads the graph stored in TestTrees/AS_CAIDA_2008.txt and compares the
 * clustering coefficient of a few vertices against reference values, using
 * an absolute tolerance of 0.001.
 */
TEST_F(ClusteringCoefficientTest, RealWorldTest)
{
    // Automatic storage: the graph is released on every exit path, including
    // the early return performed by a failing ASSERT_*. The previous
    // heap-allocated graph was never deleted.
    VectorGraph g(false, false);
    GraphReader<VectorGraph, Vertex> graphReader;

    graphReader.read(g, "TestTrees/AS_CAIDA_2008.txt");

    // NOTE(review): the coefficient helper is instantiated for ListGraph while
    // the graph read above is a VectorGraph; only the per-vertex entry point is
    // used here -- confirm this mix is intended.
    typedef ClusteringCoefficient<ListGraph, Vertex> Clustering;
    typedef Clustering::Coefficient Coef;
    Clustering clustering;
    Coef epsilon = 0.001;

    Vertex * x = g.getVertexById(3);
    Coef c = clustering.vertexClusteringCoefficient(x);
    ASSERT_TRUE(fabs(c - 0.6666667) <  epsilon);

    x = g.getVertexById(174);
    c = clustering.vertexClusteringCoefficient(x);
    ASSERT_TRUE(fabs(c - 0.01141431) <  epsilon);

    x = g.getVertexById(4);
    c = clustering.vertexClusteringCoefficient(x);

    /* This is 1.0 in Network Workbench, but because of how ComplexNets works
     * it throws an error when supporting multiedges, so we either accept
     * this value, or we don't throw an exception when reading multiedges. */
    ASSERT_TRUE(fabs(c - 1.0) <  epsilon);

    x = g.getVertexById(23148);
    c = clustering.vertexClusteringCoefficient(x);

    ASSERT_TRUE(fabs(c - 0.3583333) <  epsilon);
}
Exemple #2
0
  /**
   * Dissimilarity between two clusterings, computed as
   *
   *   ( sum_i |c1[i]|^2 + sum_j |c2[j]|^2 - 2 * sum_{i,j} |c1[i] ∩ c2[j]|^2 ) / n^2
   *
   * where n is the total number of items held by the clusters of c1.
   *
   * The intersections are obtained with std::set_intersection, which requires
   * every cluster to be a sorted range.
   *
   * @param c1 first clustering; its item count is used as the normaliser n.
   * @param c2 second clustering.
   * @return the normalised value above. When c1 holds no items the division
   *         is by zero and the result is not finite.
   */
  virtual double compute( const Clustering& c1, const Clustering& c2 ) const {
    size_t c1_sum2 = 0;
    size_t n = 0;
    for (size_t i = 0; i < c1.size(); ++i) {
      const size_t clusterSize = c1[i].size();
      c1_sum2 += clusterSize * clusterSize;
      n += clusterSize;
    }

    size_t c2_sum2 = 0;
    for (size_t j = 0; j < c2.size(); ++j) {
      const size_t clusterSize = c2[j].size();
      c2_sum2 += clusterSize * clusterSize;
    }

    size_t c1c2_sum2 = 0;
    for (size_t i = 0; i < c1.size(); ++i) {
      for (size_t j = 0; j < c2.size(); ++j) {
        // Start from zero explicitly: `counter` is defined elsewhere and is
        // presumably an output iterator that counts into `size` (confirm);
        // the tally must not depend on whether it resets its target.
        size_t size = 0;
        std::set_intersection( c1[i].begin(), c1[i].end(),
                               c2[j].begin(), c2[j].end(),
                               counter(size) );
        c1c2_sum2 += size * size;
      }
    }

    // Subtract in floating point so that inputs for which the cross term
    // exceeds the two squared sums cannot wrap around in unsigned arithmetic.
    const double numerator = static_cast<double>(c1_sum2)
                           + static_cast<double>(c2_sum2)
                           - 2.0 * static_cast<double>(c1c2_sum2);
    return numerator / static_cast<double>(n * n);
  }
/**
 * Builds a graph in which one centre vertex is joined to four other vertices
 * and no other edges exist, then expects both the centre's clustering
 * coefficient and the value returned for Vertex::Degree(4) to be zero within
 * a tolerance of 0.001.
 */
TEST_F(ClusteringCoefficientTest, AcyclicGraphTest)
{
    ListGraph star;

    // Centre vertex (id 1) followed by its four neighbours (ids 2..5).
    Vertex* centre = new Vertex(1);
    Vertex* leafA = new Vertex(2);
    Vertex* leafB = new Vertex(3);
    Vertex* leafC = new Vertex(4);
    Vertex* leafD = new Vertex(5);

    star.addVertex(centre);
    star.addVertex(leafA);
    star.addVertex(leafB);
    star.addVertex(leafC);
    star.addVertex(leafD);

    // The last edge is deliberately given with the centre as second endpoint.
    star.addEdge(centre, leafA);
    star.addEdge(centre, leafB);
    star.addEdge(centre, leafC);
    star.addEdge(leafD, centre);

    typedef ClusteringCoefficient<ListGraph, Vertex> Clustering;
    typedef Clustering::Coefficient Coef;
    Clustering coefficientOf;

    Coef centreCoef = coefficientOf.vertexClusteringCoefficient(centre);

    Coef tolerance = 0.001;

    ASSERT_TRUE(fabs(centreCoef - 0.0) <  tolerance);

    Coef degreeFourCoef = coefficientOf.clusteringCoefficient(star, Vertex::Degree(4));
    ASSERT_TRUE(fabs(degreeFourCoef - 0.0) <  tolerance);
}
/**
 * Generates a synthetic data set drawn from a clustered joint distribution.
 *
 * nbrClusters * clustSize integer variables are declared, each named after
 * its index and ranging over 0 .. cardinality-1. Cluster k groups the
 * clustSize consecutive variable indices starting at k * clustSize. nbrInds
 * samples are drawn from the joint distribution built by
 * createClusteringJointDist, one row per sample.
 *
 * @return the result of Transpose applied to the sampled matrix.
 */
MatrixPtr GenerateClusteredData::operator()() {
  auto samples = std::make_shared<Matrix>();
  samples->reserve(nbrInds);

  // Declare every variable of every cluster.
  Variables variables;
  for (size_t idx = 0; idx < nbrClusters * clustSize; ++idx) {
    const std::string label = boost::lexical_cast<std::string>(idx);
    variables ^= Variable( label, plIntegerType(0, cardinality-1) );
  }

  // Partition the variable indices into consecutive, equally sized clusters.
  Clustering partition;
  partition.reserve(nbrClusters);
  for ( size_t c = 0; c < nbrClusters; ++c ) {
    Cluster members;
    const size_t offset = c*clustSize;
    for ( size_t k = 0; k < clustSize; ++k ) {
      members.push_back( offset + k );
    }
    partition.push_back( members );
  }

  plJointDistribution jointDist = createClusteringJointDist( variables, partition);
  plValues drawn( variables );

  // One draw per individual; each draw is copied out as a row of ints.
  for (size_t sample = 0; sample < nbrInds; ++sample) {
    jointDist.draw(drawn);
    std::vector<int> observation(variables.size());
    for (size_t col = 0; col < variables.size(); ++col) {
      observation[col] = drawn[variables[col]];
    }
    samples->push_back(observation);
  }

  return Transpose(*samples);
}
Exemple #5
0
void Node::iteration(bool last) {
  double bestGain = 0.0;
  Clustering bestClustering;
  bestClustering.copy(&clustering);
  Clustering backupClustering;
  backupClustering.copy(&clustering);
  buildValuesOverClusters();
  for(int i = 0; i < NUMBER_OF_RANDOM_RESTARTS; i++) {
    cout << "Random restart #" << i << endl;

    if(clusteringType() != FLAT)
      clustering.splitOrMerge(last);
		
    if(clustering.size() == 1) {
      cout << "Warning: there is only one cluster in " << name << endl;
      continue;
    }
    buildClustersOverClusters();

    double gain = calculateInformationGain();
    cout << "Initial value of objective function: " << gain << endl;
    for(int j = 0; j < NUMBER_OF_CONVERGENT_STEPS; j++) {
      cout << "Convergent step: " << j << endl;
      gain += clustering.correctionLoop();
    }
    if(gain > bestGain) {
      bestGain = gain;
      cout << "Best gain: " << gain << endl;
      bestClustering.copy(&clustering);
    }
    clustering.copy(&backupClustering);
  }
  clustering.copy(&bestClustering);
}
/**
 * Builds a joint distribution over `variables` from per-cluster components.
 *
 * For every cluster of `clustering`, the object returned by
 * createClusterJointDist is accumulated (via operator*=) into one computable
 * object list, from which the joint distribution is constructed.
 *
 * @param variables  variables the joint distribution is defined over.
 * @param clustering clusters to create one component for, in iteration order.
 * @return the joint distribution built from all accumulated components.
 */
plJointDistribution createClusteringJointDist( const Variables& variables, const Clustering& clustering) {
  plComputableObjectList components;
  for ( const auto& cluster : clustering ) {
    components *= createClusterJointDist( variables, cluster );
  }
  return plJointDistribution( variables, components );
}
Exemple #7
0
/**
 * Visits every node once and assigns cluster ids.
 *
 * A node that has not been visited yet is marked visited; if
 * met->getNeighbors reports success for it, the node receives the next
 * cluster id and expand() is called with its neighbour list. Ids start at 1;
 * this function never assigns id 0. Progress is printed each time the
 * processed fraction reaches the next 5 % threshold.
 *
 * @param c clustering to fill; resized to the returned value and then passed
 *          to met->expand together with the per-node cluster ids.
 * @return one more than the last cluster id assigned (1 if none was assigned).
 */
unsigned int DBSCAN::run(Clustering &c){
    unsigned int nextClusterID = 1;
    double nextReport = 0.0;
    vector<unsigned int> reachable;

    cout << "db node_count: " << node_count << endl;
    for(unsigned int id = 0; id < node_count; ++id){

        // Progress output, throttled to 5 % increments.
        const double fraction = double(id) / node_count;
        if (fraction >= nextReport){
            cout << fraction * 100 << " %" << endl;
            nextReport += .05;
        }

        if (visited[id]){
            continue;
        }
        visited[id] = true;

        reachable.clear();
        if (!met->getNeighbors(id, reachable)){
            continue;
        }

        cluster[id] = nextClusterID;
        expand(reachable, nextClusterID);
        ++nextClusterID;
    }

    c.resize(nextClusterID);

    met->expand(c, cluster);

    return nextClusterID;
}
/**
 * Seeds the set of dynamic communities from the first step clustering.
 *
 * Any existing dynamic communities are discarded. Every step cluster with at
 * least MIN_CLUSTER_SIZE members becomes a new dynamic community, recorded
 * with the current step and the cluster's position in `step_clustering`.
 *
 * @param step_clustering clusters found at the current step.
 * @return always true.
 */
bool MatchingDynamicClusterer::bootstrap( Clustering &step_clustering )
{
	m_dynamic.clear();

	int position = 0;
	const Clustering::iterator stop = step_clustering.end();
	for( Clustering::iterator cur = step_clustering.begin(); cur != stop; ++cur, ++position )
	{
		// Clusters below the minimum size never start a community.
		if( !( cur->size() < MIN_CLUSTER_SIZE ) )
		{
			DynamicCluster born;
			born.update( m_step, position, *cur );
			m_dynamic.push_back(born);
#ifdef DEBUG_MATCHING
			cout << "T" << m_step << ": Birth: Community M" << m_dynamic.size() << endl;
#endif
		}
	}
	return true;
}
/**
 * Matches the clusters of a new time step against the existing dynamic
 * communities and updates them.
 *
 * The first call delegates to bootstrap(). Later calls:
 *  1. index, for every node, the dynamic communities whose front cluster
 *     contains it (communities reported dead by is_dead are left out when
 *     m_death_age > 0);
 *  2. for each step cluster of at least MIN_CLUSTER_SIZE nodes, count its
 *     intersection with every indexed front cluster and keep the communities
 *     whose similarity exceeds m_threshold (overlap coefficient when
 *     SIM_OVERLAP is defined, Jaccard otherwise);
 *  3. a step cluster without any match starts a new community; the first
 *     step cluster matched to a community continues it, and each further one
 *     creates a split community copied from it;
 *  4. append all newly created communities to m_dynamic.
 *
 * The scratch buffers are vectors so they are released even if an allocation
 * or copy in between throws.
 *
 * @param step_clustering clusters found at the new time step.
 * @return always true (or bootstrap()'s result on the first step).
 */
bool MapMatchingDynamicClusterer::add_clustering( Clustering &step_clustering )
{
	m_step += 1;
	/// First?
	if( m_step == 1 )
	{
		return bootstrap(step_clustering);
	}

	const int dyn_count = (int)m_dynamic.size();

	/// Build a map of Nodes -> Dynamic Communities containing those nodes
	map<NODE,set<int> > fastmap;
	// Front-cluster size per dynamic community; 0 marks a dead community.
	vector<long> dyn_sizes( m_dynamic.size() );
	for( int dyn_index = 0; dyn_index < dyn_count; dyn_index++ )
	{
		// Dead?
		if( m_death_age > 0 && m_dynamic[dyn_index].is_dead( m_step, m_death_age ) )
		{
			dyn_sizes[dyn_index] = 0;
			continue;
		}
		Cluster& front = m_dynamic[dyn_index].front();
		dyn_sizes[dyn_index] = (long)front.size();
		Cluster::const_iterator fend = front.end();
		for( Cluster::const_iterator fit = front.begin() ; fit != fend; ++fit )
		{
			NODE node_index = *fit;
			// operator[] creates the empty set on first sight of a node.
			fastmap[node_index].insert(dyn_index);
		}
	}

	/// Now try to match all
	vector<int> all_intersection( m_dynamic.size() );
	vector<DynamicCluster> fresh;
	PairVector matched_pairs;
	const map<NODE,set<int> >::const_iterator mend = fastmap.end();
	int step_cluster_index = 0;
	Clustering::iterator cend = step_clustering.end();
	for( Clustering::iterator cit = step_clustering.begin() ; cit != cend; ++cit, step_cluster_index++ )
	{
		long size_step = (long)(*cit).size();
		if( size_step < MIN_CLUSTER_SIZE )
		{
			continue;
		}
		// Compute all intersections
		all_intersection.assign( all_intersection.size(), 0 );
		Cluster::const_iterator xend = (*cit).end();
		for( Cluster::const_iterator xit = (*cit).begin() ; xit != xend; ++xit )
		{
			NODE node_index = *xit;
			map<NODE,set<int> >::const_iterator mit = fastmap.find(node_index);
			if( mit == mend )
			{
				continue;
			}
			// Reuse the iterator from find() instead of looking the node up again.
			const set<int>& owners = mit->second;
			for( set<int>::const_iterator sit = owners.begin(); sit != owners.end(); ++sit )
			{
				all_intersection[(*sit)]++;
			}
		}
		// Find matches
		vector<int> matches;
		for( int dyn_index = 0; dyn_index < dyn_count; dyn_index++)
		{
			if( dyn_sizes[dyn_index] == 0 || all_intersection[dyn_index] == 0 )
			{
				continue;
			}
#ifdef SIM_OVERLAP
			double sim = ((double)(all_intersection[dyn_index]))/min(size_step,dyn_sizes[dyn_index]);
#else
			double sim = ((double)(all_intersection[dyn_index]))/(size_step+dyn_sizes[dyn_index]-all_intersection[dyn_index]);
#endif
			if( sim > m_threshold )
			{
				matches.push_back( dyn_index );
			}
		}

		// new community?
		if( matches.empty() )
		{
			DynamicCluster dc;
			dc.update( m_step, step_cluster_index, *cit );
			fresh.push_back(dc);
#ifdef DEBUG_MATCHING
			cout << "T" << m_step << ": Birth: Community M" << (m_dynamic.size()+fresh.size()) << " from C" << step_cluster_index+1 << endl;
#endif
		}
		else
		{
			for( vector<int>::const_iterator iit = matches.begin() ; iit != matches.end(); ++iit )
			{
				pair<int,int> p(step_cluster_index,(*iit));
				matched_pairs.push_back(p);
			}
		}
	}

	// Actually update existing dynamic communities now
	set<int> matched_dynamic;
	for( PairVector::const_iterator pit = matched_pairs.begin(); pit != matched_pairs.end(); ++pit )
	{
		const int matched_step_index = (*pit).first;
		const int dyn_cluster_index = (*pit).second;
		// already processed this dynamic cluster?
		if( matched_dynamic.count( dyn_cluster_index ) )
		{
			DynamicCluster dc( m_dynamic[dyn_cluster_index], m_step, matched_step_index, step_clustering[matched_step_index] );
			fresh.push_back(dc);
#ifdef DEBUG_MATCHING
			cout << "T" << m_step << ": Split: Matched C" << (matched_step_index+1) << " to M" << (dyn_cluster_index+1) << ". Splitting to M" << (m_dynamic.size()+fresh.size()) <<  endl;
#endif
		}
		else
		{
#ifdef DEBUG_MATCHING
			cout << "T" << m_step << ": Continuation: Matched C" << (matched_step_index+1) << " to M" << (dyn_cluster_index+1) << endl;
#endif
			m_dynamic[dyn_cluster_index].update( m_step, matched_step_index, step_clustering[matched_step_index] );
			matched_dynamic.insert(dyn_cluster_index);
		}
	}
	// And finally add any new dynamic communities
	for( vector<DynamicCluster>::const_iterator nit = fresh.begin() ; nit != fresh.end(); ++nit )
	{
		m_dynamic.push_back(*nit);
	}

	return true;
}
/**
 * Matches the clusters of a new time step against the existing dynamic
 * communities and updates them.
 *
 * The first call delegates to bootstrap(). On later calls every step cluster
 * is passed to find_matches(): a cluster without matches starts a new
 * community, otherwise one (step cluster, community) pair is recorded per
 * match. The pairs are then applied in order: the first step cluster paired
 * with a community continues it, and every further one creates a split
 * community copied from it. Newly created communities are appended to
 * m_dynamic at the end, so indices used while matching stay valid.
 *
 * @param step_clustering clusters found at the new time step.
 * @return always true (or bootstrap()'s result on the first step).
 */
bool MatchingDynamicClusterer::add_clustering( Clustering &step_clustering )
{
	m_step += 1;
	if( m_step == 1 )
	{
		// Nothing to match against yet.
		return bootstrap(step_clustering);
	}

	// Phase 1: collect births and (step cluster, community) matches.
	vector<DynamicCluster> births;
	PairVector links;
	int cluster_no = 0;
	const Clustering::iterator stop = step_clustering.end();
	for( Clustering::iterator cur = step_clustering.begin() ; cur != stop; ++cur, ++cluster_no )
	{
		vector<int> hits;
		find_matches( *cur, hits );
		if( !hits.empty() )
		{
			for( vector<int>::const_iterator hit = hits.begin() ; hit != hits.end(); ++hit )
			{
				links.push_back( pair<int,int>(cluster_no,(*hit)) );
			}
			continue;
		}
		// No match at all: this step cluster starts a community of its own.
		DynamicCluster born;
		born.update( m_step, cluster_no, *cur );
		births.push_back(born);
#ifdef DEBUG_MATCHING
		cout << "T" << m_step << ": Birth: Community M" << (m_dynamic.size()+births.size()) << " from C" << cluster_no+1 << endl;
#endif
	}

	// Phase 2: apply the matches to the existing communities.
	set<int> continued;
	for( PairVector::const_iterator link = links.begin(); link != links.end(); ++link )
	{
		const int from_cluster = link->first;
		const int to_community = link->second;
		if( continued.count( to_community ) == 0 )
		{
			// First match for this community: it simply continues.
#ifdef DEBUG_MATCHING
			cout << "T" << m_step << ": Continuation: Matched C" << (from_cluster+1) << " to M" << (to_community+1) << endl;
#endif
			m_dynamic[to_community].update( m_step, from_cluster, step_clustering[from_cluster] );
			continued.insert(to_community);
		}
		else
		{
			// Community already continued in this step: branch off a copy.
			DynamicCluster branch( m_dynamic[to_community], m_step, from_cluster, step_clustering[from_cluster] );
			births.push_back(branch);
#ifdef DEBUG_MATCHING
			cout << "T" << m_step << ": Split: Matched C" << (from_cluster+1) << " to M" << (to_community+1) << ". Splitting to M" << (m_dynamic.size()+births.size()) <<  endl;
#endif
		}
	}

	// Phase 3: register everything created during this step.
	for( vector<DynamicCluster>::const_iterator nb = births.begin() ; nb != births.end(); ++nb )
	{
		m_dynamic.push_back(*nb);
	}

	return true;
}
Exemple #11
0
int main (int argc, const char * argv[]){

    struct timeval start, end;
    gettimeofday(&start, NULL);

    // general parameters
    size_t maxSeqLen = 50000;
    int seqType = Sequence::AMINO_ACIDS;

    // parameter for the prefiltering
    int kmerSize = 6;
    int alphabetSize = 21;
    size_t maxResListLen = 100;
    int split = 0;
    int skip = 0;
    bool aaBiasCorrection = true;
    float zscoreThr = 50.0f;
    float sensitivity = 4.0;

    // parameters for the alignment
    double evalThr = 0.001;
    double covThr = 0.8;
    int maxAlnNum = 10;

    std::string lastSeqDB = "";
    std::string currentSeqDB = "";
    std::string cluDB = ""; 
    std::string outDB = "";
    std::string tmpDir = "";

    // get the path of the scoring matrix
    char* mmdir = getenv ("MMDIR");
    if (mmdir == 0){
        std::cerr << "Please set the environment variable $MMDIR to your MMSEQS installation directory.\n";
        exit(1);
    }
    std::string scoringMatrixFile(mmdir);
    scoringMatrixFile = scoringMatrixFile + "/data/blosum62.out";

    parseArgs(argc, argv, &lastSeqDB, &currentSeqDB, &cluDB, &outDB, &tmpDir, &scoringMatrixFile, &maxSeqLen);

    std::string lastSeqDBIndex = lastSeqDB + ".index";
    std::string currentSeqDBIndex = currentSeqDB + ".index";
    std::string cluDBIndex = cluDB + ".index";
    std::string outDBIndex = outDB + ".index";

    std::list<std::string>* tmpFiles = new std::list<std::string>();
    std::string AIndex = tmpDir + "/A.index";
    std::string BIndex = tmpDir + "/B.index";
    tmpFiles->push_back(AIndex);
    tmpFiles->push_back(BIndex);

    std::string Brest_indexFile = tmpDir + "/Brest.index";
    tmpFiles->push_back(Brest_indexFile);
    
    std::string BB_clu = tmpDir + "/BB_clu";
    std::string BB_clu_index = BB_clu + ".index";
    tmpFiles->push_back(BB_clu);
    tmpFiles->push_back(BB_clu_index);
    
    std::cout << "////////////////////////////////////////////////////////////////////////\n";
    std::cout << "///////                   Init                             /////////////\n";
    std::cout << "////////////////////////////////////////////////////////////////////////\n";
    // extract three indexes:
    // - A: last database version without deleted sequences
    // - B: sequences which are new in the database
    writeIndexes(AIndex, BIndex, lastSeqDBIndex, currentSeqDBIndex);


    std::cout << "////////////////////////////////////////////////////////////////////////\n";
    std::cout << "///////            Calculating B->A scores                 /////////////\n";
    std::cout << "////////////////////////////////////////////////////////////////////////\n";
    // calculate score for the updating
    // B->A scores
    std::string BA_base = runScoresCalculation(currentSeqDB, BIndex,
            currentSeqDB, AIndex,
            tmpDir,
            scoringMatrixFile, maxSeqLen, seqType,
            kmerSize, alphabetSize, maxResListLen, split, skip, aaBiasCorrection, zscoreThr, sensitivity,
            evalThr, covThr, maxAlnNum, "BA", tmpFiles);

    std::cout << "////////////////////////////////////////////////////////////////////////\n";
    std::cout << "///////      Adding sequences to existing clusters         /////////////\n";
    std::cout << "////////////////////////////////////////////////////////////////////////\n";
    // update the clustering
    DBReader* currSeqDbr = new DBReader(currentSeqDB.c_str(), currentSeqDBIndex.c_str());
    currSeqDbr->open(DBReader::NOSORT);

    // data structures for the clustering
    int seqDBSize = currSeqDbr->getSize();
    unsigned int* id2rep = new unsigned int[seqDBSize];
    char** rep2cluName = new char*[seqDBSize];
    for (int i = 0; i < seqDBSize; i++)
        rep2cluName[i] = new char[FFINDEX_MAX_ENTRY_NAME_LENTH];
    cluster_t* clusters = new cluster_t[seqDBSize];
    for (int i = 0; i < seqDBSize; i++){
        clusters[i].clu_size = 0;
        clusters[i].first = 0;
        clusters[i].last = 0;
    }

    std::cout << "Read the existing clustering...\n";
    // Read the existing clustering
    readClustering(currSeqDbr, cluDB, id2rep, rep2cluName, clusters);

    std::cout << "Append new sequences to the existing clustering...\n";
    // append sequences from the new database to the existing clustering based on the B->A alignment scores
    // write sequences without a match to a separate index (they will be clustered separately)
    appendToClustering(currSeqDbr, BIndex, BA_base, id2rep, clusters, Brest_indexFile);

    if (seqsWithoutMatches > 0){
        std::cout << "////////////////////////////////////////////////////////////////////////\n";
        std::cout << "///////            Calculating B->B scores                 /////////////\n";
        std::cout << "////////////////////////////////////////////////////////////////////////\n";
        // B->B scores
        std::string BB_base = runScoresCalculation(currentSeqDB, Brest_indexFile, 
                currentSeqDB, Brest_indexFile,
                tmpDir,
                scoringMatrixFile, maxSeqLen, seqType,
                kmerSize, alphabetSize, maxResListLen, split, skip, aaBiasCorrection, zscoreThr, sensitivity,
                evalThr, covThr, maxAlnNum, "BB", tmpFiles);

        std::cout << "////////////////////////////////////////////////////////////////////////\n";
        std::cout << "///////             Appending new clusters                 /////////////\n";
        std::cout << "////////////////////////////////////////////////////////////////////////\n";
        std::cout << "Cluster new sequences without a match to the existing clusters...\n";
        // cluster sequences without a match to the existing clusters separately
        // use the index generated in the previous step
        Clustering* clu = new Clustering(currentSeqDB, currentSeqDBIndex,
                BB_base, BB_base + ".index",
                BB_clu, BB_clu_index,
                0.0, 0, maxResListLen);
        clu->run(Clustering::SET_COVER); 

        std::cout << "Append generated clusters to the complete clustering...\n";
        // append B->B clusters to the clustering
        newClus = readClustering(currSeqDbr, BB_clu, id2rep, rep2cluName, clusters);
    }

    // write new clustering
    std::cout << "Write clustering results...\n";
    writeResults(clusters, rep2cluName, currSeqDbr, seqDBSize, outDB);
    std::cout << "done.\n";

    currSeqDbr->close();

    std::cout << "////////////////////////////////////////////////////////////////////////\n";
    std::cout << "///////                   Statistics                            ////////\n";
    std::cout << "////////////////////////////////////////////////////////////////////////\n";
    std::cout << "\nPrevios database version: " << oldDBSize << " entries.\n";
    std::cout << "New database vesion     : " << newDBSize << " entries.\n";
    std::cout << deletedSeqs << " entries were deleted,\n";
    std::cout << newSeqs << " entries are new,\n";
    std::cout << sharedSeqs << " entries are shared.\n\n";

    std::cout << seqsWithMatches << " new sequences had matches to the previous database version.\n";
    std::cout << "Remaining " << seqsWithoutMatches << " were grouped into " << newClus << " new clusters.\n";
 
    gettimeofday(&end, NULL);
    int sec = end.tv_sec - start.tv_sec;
    std::cout << "\nTime for updating: " << (sec / 3600) << " h " << (sec % 3600 / 60) << " m " << (sec % 60) << "s\n\n";

    deleteTmpFiles(tmpFiles);
    delete tmpFiles;

}