bool fit(DataVec& data, DataVec& centroids) {
  bool converged(true);
  // assign points to closest centroid
  for (DataVec::iterator it1 = data.begin(); it1 != data.end(); ++it1) {
    double min_dist = std::numeric_limits<double>::max();
    int min_clust = -1;
    for (DataVec::iterator it2 = centroids.begin(); it2 != centroids.end(); ++it2) {
      double d = dist(*it1, *it2);
      if (d < min_dist) {
        min_dist = d;
        min_clust = it2 - centroids.begin();
      }
    }
    //std::cout << "Point " << *it1 << "\n";
    //std::cout << "min_dist=" << min_dist << " min_clust=" << min_clust << "\n";
    it1->cluster_ = min_clust;
  }
  // re-estimate centroids
  for (size_t i = 0; i < K; ++i) {
    std::cout << "Centroid at " << i << " was " << centroids[i] << "\n";
    bool centroidUpdated = getCentroid(data, i, centroids[i]);
    if (centroidUpdated) converged = false;
    std::cout << "Centroid at " << i << " is now " << centroids[i] << "\n";
  }
  return converged;
}
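// The listings in this section use a Point record, a DataVec container, a
// global cluster count K, and a distance helper dist() that are not shown.
// Below is a minimal sketch of what they might look like: the member names
// x1_..x4_ and cluster_ are taken from their uses above; everything else
// (including K = 3 for the three iris species, and Euclidean distance) is
// an assumption.
#include <cmath>
#include <iostream>
#include <vector>

const size_t K = 3;  // assumed number of clusters

struct Point {
  double x1_, x2_, x3_, x4_;
  int cluster_;  // index of the centroid this point is currently assigned to
  Point(double x1, double x2, double x3, double x4)
      : x1_(x1), x2_(x2), x3_(x3), x4_(x4), cluster_(-1) {}
};

typedef std::vector<Point> DataVec;

// Euclidean distance between two 4-dimensional points.
double dist(const Point& a, const Point& b) {
  double d1 = a.x1_ - b.x1_, d2 = a.x2_ - b.x2_;
  double d3 = a.x3_ - b.x3_, d4 = a.x4_ - b.x4_;
  return std::sqrt(d1 * d1 + d2 * d2 + d3 * d3 + d4 * d4);
}

// Stream output used by the debug prints (also assumed).
std::ostream& operator<<(std::ostream& os, const Point& p) {
  return os << p.x1_ << " " << p.x2_ << " " << p.x3_ << " "
            << p.x4_ << " " << p.cluster_;
}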
int main(int argc, char** argv) {
  // seed random generator
  srand(time(NULL));
  ifstream infile("../iris.data");
  string line;
  DataVec data;
  double fac = 4.0;
  while (std::getline(infile, line)) {
    std::vector<std::string> fields;
    boost::split(fields, line, boost::is_any_of(","));
    assert(fields.size() == 5);
    double x1 = atof(fields[0].c_str()) * fac;
    double x2 = atof(fields[1].c_str()) * fac;
    double x3 = atof(fields[2].c_str()) * fac;
    double x4 = atof(fields[3].c_str()) * fac;
    Point p(x1, x2, x3, x4);
    data.push_back(p);
  }
  std::cout << "Collected " << data.size() << " points." << std::endl;
  assert(data.size() > K);

  // init centroids to K distinct random points
  DataVec centroids;
  centroids.reserve(K);
  random_unique(data.begin(), data.end(), K);  // moves K random points to the front
  std::cout << K << " random points" << std::endl;
  for (size_t i = 0; i < K; ++i) {
    std::cout << data[i] << "\n";
    centroids.push_back(data[i]);
  }
  std::cout << centroids.size() << " centroids" << std::endl;

  // Lloyd's algorithm: iterate until no centroid moves.
  bool done = fit(data, centroids);
  while (!done) {
    done = fit(data, centroids);
    std::cout << done << "\n";
  }

  double idx = dunnIndex(data, centroids);
  cout << "Dunn Index for this clustering " << idx << "\n";

  // write clustering to file
  ofstream of("clusters.dat");
  for (DataVec::iterator it = data.begin(); it != data.end(); ++it) {
    of << *it << std::endl;
  }
  of.close();
  return EXIT_SUCCESS;
}
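// random_unique() is called above but not defined in this listing. A common
// implementation is a partial Fisher-Yates shuffle that swaps num_random
// randomly chosen elements to the front of the range and returns an iterator
// one past the last selected element; the sketch below is an assumption in
// that spirit.
#include <algorithm>
#include <cstdlib>
#include <iterator>

template <class BidiIter>
BidiIter random_unique(BidiIter begin, BidiIter end, size_t num_random) {
  size_t left = std::distance(begin, end);
  while (num_random--) {
    BidiIter r = begin;
    std::advance(r, std::rand() % left);  // pick uniformly from the remainder
    std::swap(*begin, *r);                // move it to the front
    ++begin;
    --left;
  }
  return begin;
}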
bool getCentroid(const DataVec& data, const int cluster, Point& centroid) {
  size_t num(0);
  Point new_centroid(0.0, 0.0, 0.0, 0.0);
  for (DataVec::const_iterator ct = data.begin(); ct != data.end(); ++ct) {
    if (ct->cluster_ == cluster) {
      new_centroid.x1_ += ct->x1_;
      new_centroid.x2_ += ct->x2_;
      new_centroid.x3_ += ct->x3_;
      new_centroid.x4_ += ct->x4_;
      ++num;
    }
  }
  if (num == 0) {
    // empty cluster: leave the centroid where it is and report no change,
    // otherwise an empty cluster would keep convergence from ever being reached
    std::cout << "Cluster unchanged\n";
    return false;
  }
  new_centroid.x1_ /= num;
  new_centroid.x2_ /= num;
  new_centroid.x3_ /= num;
  new_centroid.x4_ /= num;
  new_centroid.cluster_ = cluster;
  double d = dist(centroid, new_centroid);
  std::cout << "getCentroid: d=" << d << "\n";
  std::cout << "num=" << num << "\n";
  // treat moves below the threshold as "unchanged" so the loop terminates
  bool changed = d > 0.05;
  centroid = new_centroid;
  return changed;
}
double dunnIndex(DataVec& data, DataVec& centroids) {
  double res = 0.0;
  // compute max cluster diameter (mean distance of members to their own centroid)
  double max_clust_diam = 0.0;
  for (size_t i = 0; i < K; ++i) {
    double clust_diam = 0.0;
    double n = 0.0;
    for (DataVec::const_iterator ct = data.begin(); ct != data.end(); ++ct) {
      if (ct->cluster_ == static_cast<int>(i)) {
        clust_diam += dist(*ct, centroids[i]);
        ++n;
      }
    }
    if (n > 0) clust_diam /= n;
    std::cout << "clust_diam = " << clust_diam << "\n";
    if (clust_diam > max_clust_diam) max_clust_diam = clust_diam;
  }
  // compute min intercluster distance over all centroid pairs
  double min_clust_dist = std::numeric_limits<double>::max();
  for (size_t i = 0; i < K; ++i) {
    for (size_t j = i + 1; j < K; ++j) {
      double d = dist(centroids[i], centroids[j]);
      //std::cout << "distance btw " << i << " " << j << " d= " << d << "\n";
      if (d > 0 && d < min_clust_dist) min_clust_dist = d;  // ignore coincident centroids
    }
  }
  std::cout << "min_clust_dist = " << min_clust_dist << "\n";
  std::cout << "max_clust_diam = " << max_clust_diam << "\n";
  if (max_clust_diam > 0) res = min_clust_dist / max_clust_diam;
  return res;
}
int main(int argc, const char * argv[]) {
  parseCommandLineArgs(argc, argv);

  vector<uint32_t> msgFreqVec;
  msgFreqVec.push_back(50);
  msgFreqVec.push_back(100);
  msgFreqVec.push_back(200);
  msgFreqVec.push_back(300);
  msgFreqVec.push_back(400);
  msgFreqVec.push_back(500);

  DataVec dataMap;

  for (uint32_t testNum = 1; testNum < g_uSimNum; testNum++) {
    uint jjj(1);
    if (testNum % 10 == 0) cout << "Test: " << testNum << endl;
    for (uint hiddenNode = 0; hiddenNode <= 1; hiddenNode++) {
      for (uint useVcs = 0; useVcs <= 1; useVcs++) {
        for (uint nodeAFreqScale = 1; nodeAFreqScale <= 2; nodeAFreqScale++) {
          for (vector<uint32_t>::iterator it = msgFreqVec.begin(); it != msgFreqVec.end(); ++it) {
            // init simulation (operator new throws on failure, so no null checks are needed)
            Simulation *sim = new Simulation(simDuration);
            Channel *channel = new Channel();

            // init nodes
            uint32_t nodeAFreq = (*it) * nodeAFreqScale;
            uint32_t nodeCFreq = (*it);
            RxNode *nodeB = new RxNode(2, sim, channel, ACK_RTS_CTS_SND_DUR, SLOT_DUR);
            RxNode *nodeD = new RxNode(4, sim, channel, ACK_RTS_CTS_SND_DUR, SLOT_DUR);
            TxNode *nodeA = new TxNode(0, sim, channel, nodeB, nodeAFreq, DIFS, SIFS,
                                       PACKET_SEND_DUR, ACK_RTS_CTS_SND_DUR, SLOT_DUR,
                                       useVcs, hiddenNode);
            TxNode *nodeC = new TxNode(1, sim, channel, nodeD, nodeCFreq, DIFS, SIFS,
                                       PACKET_SEND_DUR, ACK_RTS_CTS_SND_DUR, SLOT_DUR,
                                       useVcs, hiddenNode);

            // seed starting events
            nodeA->schedulePacketReady(random_distro::exponential(nodeAFreq, random_distro::TEN_USECS));
            nodeC->schedulePacketReady(random_distro::exponential(nodeCFreq, random_distro::TEN_USECS));

            // run simulation
            sim->Run();

            uint32_t aThruput(0), cThruput(0);
            float aUtil(0.0f), cUtil(0.0f);
            if (useVcs) {
              aThruput = nodeA->SuccessfulSends() * (PACKET_SIZE_BYTES + 3 * ACK_SIZE_BYTES);
              aUtil = (((float)nodeA->SuccessfulSends() * VCS_RTT) / (float)simDuration) * 100.0f;
              cThruput = nodeC->SuccessfulSends() * (PACKET_SIZE_BYTES + 3 * ACK_SIZE_BYTES);
              cUtil = (((float)nodeC->SuccessfulSends() * VCS_RTT) / (float)simDuration) * 100.0f;
            } else {
              aThruput = nodeA->SuccessfulSends() * (PACKET_SIZE_BYTES + ACK_SIZE_BYTES);
              aUtil = (((float)nodeA->SuccessfulSends() * RTT) / (float)simDuration) * 100.0f;
              cThruput = nodeC->SuccessfulSends() * (PACKET_SIZE_BYTES + ACK_SIZE_BYTES);
              cUtil = (((float)nodeC->SuccessfulSends() * RTT) / (float)simDuration) * 100.0f;
            }

            // guard against division by zero when node C never sends successfully
            float fairness = (cUtil > 0.0f) ? (aUtil / cUtil) : 0.0f;
            dataMap.insert(Data(jjj, nodeAFreq, nodeCFreq, aThruput, cThruput, aUtil, cUtil,
                                nodeC->TotalCollisions(), nodeC->IsHiddenNode(), nodeC->UsesVCS(),
                                aThruput + cThruput, aUtil + cUtil, fairness));
            //sim->PrintData();

            // destroy objects: delete first, then clear the pointers
            // (nulling before delete would leak every allocation)
            delete nodeA;
            delete nodeC;
            delete channel;
            delete sim;
            nodeA = NULL;
            nodeC = NULL;
            channel = NULL;
            sim = NULL;

            jjj++;
          } // for lambdas
        } // for nodeA freq scale
      } // for useVcs
    } // for hiddenNode
  } // for testNum

  dataMap.print();

  stringstream ss;
  ss << g_uSimNum;
  string strNumOfTests("");
  ss >> strNumOfTests;
  dataMap.OutputToFile("data_" + strNumOfTests + ".csv");

  if (g_bShowEnhancedStats) {
    cout << endl;
    cout << "Row 1: Parallel, No VCS" << endl;
    cout << "Row 2: Parallel, VCS" << endl;
    cout << "Row 3: Hidden Node, No VCS" << endl;
    cout << "Row 4: Hidden Node, VCS" << endl;

    cout << endl << "Throughput A:" << endl;
    for (map<uint32_t, Data*>::iterator it2 = dataMap.begin(); it2 != dataMap.end(); ++it2) {
      cout << left << setw(2) << it2->first << ":" << setw(10)
           << std::setprecision(3) << fixed << it2->second->thruput_a << "\t";
      if (it2->first && it2->first % 12 == 0) cout << endl;
    }

    cout << endl << "Throughput C:" << endl;
    for (map<uint32_t, Data*>::iterator it2 = dataMap.begin(); it2 != dataMap.end(); ++it2) {
      cout << left << setw(2) << it2->first << ":" << setw(10)
           << std::setprecision(3) << fixed << it2->second->thruput_c << "\t";
      if (it2->first && it2->first % 12 == 0) cout << endl;
    }

    cout << endl << "Throughput:" << endl;
    for (map<uint32_t, Data*>::iterator it2 = dataMap.begin(); it2 != dataMap.end(); ++it2) {
      cout << left << setw(2) << it2->first << ":" << setw(10)
           << std::setprecision(3) << fixed << it2->second->thruput_tot << "\t";
      if (it2->first && it2->first % 12 == 0) cout << endl;
    }

    cout << endl << "Utilization:" << endl;
    for (map<uint32_t, Data*>::iterator it2 = dataMap.begin(); it2 != dataMap.end(); ++it2) {
      cout << left << setw(2) << it2->first << ":" << setw(5)
           << std::setprecision(3) << fixed << it2->second->util_tot << "\t";
      if (it2->first && it2->first % 12 == 0) cout << endl;
    }

    cout << endl << "Fairness Indices:" << endl;
    for (map<uint32_t, Data*>::iterator it2 = dataMap.begin(); it2 != dataMap.end(); ++it2) {
      cout << left << setw(2) << it2->first << ":" << setw(5)
           << std::setprecision(3) << fixed << it2->second->fairness << "\t";
      if (it2->first && it2->first % 12 == 0) cout << endl;
    }

    cout << endl << "Collisions:" << endl;
    for (map<uint32_t, Data*>::iterator it2 = dataMap.begin(); it2 != dataMap.end(); ++it2) {
      cout << left << setw(2) << it2->first << ":" << setw(10)
           << std::setprecision(3) << fixed << it2->second->colls << "\t";
      if (it2->first && it2->first % 12 == 0) cout << endl;
    }
  }
  return 0;
}
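// random_distro::exponential (used above to seed the first packet-ready
// events) is not shown in this listing. Assuming it draws an exponentially
// distributed inter-arrival delay for a Poisson packet process at the given
// rate and converts it to simulation ticks, a minimal inverse-CDF sketch
// might look like this; the TEN_USECS constant (ticks per second when one
// tick is 10 us) and the rand()-based sampling are assumptions:
#include <cmath>
#include <cstdlib>

namespace random_distro {
  const double TEN_USECS = 1e5;  // assumed: 100000 ten-microsecond ticks per second

  // Sample an Exponential(rate) delay and convert it to simulation ticks.
  inline double exponential(double rate_per_sec, double ticks_per_sec) {
    // u in (0,1); offsetting avoids log(0)
    double u = (std::rand() + 1.0) / (RAND_MAX + 2.0);
    double delay_sec = -std::log(u) / rate_per_sec;  // inverse CDF of Exp(rate)
    return delay_sec * ticks_per_sec;
  }
}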
int main (int argc, char *argv[]) {
  // handle cmd args
  int batch_size, maxiter;
  std::string datadir;
  std::string output_file;
  if (argc > 5 || argc < 2) {
    printf("Usage: ./logistic_mpi <data_directory> <batch_size> "
           "<max_iterations> <model_output_file>\n");
    exit(0);  // MPI is not initialized yet, so there is nothing to finalize
  } else if (argc == 5) {
    datadir = argv[1];
    batch_size = atoi(argv[2]);                       // mini-batch processing
    if (batch_size == -1) { batch_size = INT_MIN; }   // -1 selects full-batch mode
    maxiter = atoi(argv[3]);
    output_file = argv[4];
  } else if (argc == 4) {
    datadir = argv[1];
    batch_size = atoi(argv[2]);
    if (batch_size == -1) { batch_size = INT_MIN; }
    maxiter = atoi(argv[3]);
    output_file = "logistic.model";
  } else if (argc == 3) {
    datadir = argv[1];
    batch_size = atoi(argv[2]);
    if (batch_size == -1) { batch_size = INT_MIN; }
    maxiter = 100;
    output_file = "logistic.model";
  } else {
    datadir = argv[1];
    batch_size = INT_MIN;  // batch processing
    maxiter = 100;
    output_file = "logistic.model";
  }

  // initialize/populate MPI-specific vars local to each node
  double t1, t2;  // elapsed time computation
  int numtasks, taskid, len;
  char hostname[MPI_MAX_PROCESSOR_NAME];
  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
  MPI_Comm_rank(MPI_COMM_WORLD, &taskid);
  MPI_Get_processor_name(hostname, &len);
  MPI_Op op;

  /* DATA PREPROCESSING */
  if (taskid == MASTER) {
    printf("\nLoading and Preprocessing Data\n");
  }
  t1 = MPI_Wtime();

  // determine number of instances
  DataVec datavec;
  mlu::count_instances(datadir, datavec, num_inst);

  // determine number of features
  mlu::count_features(datavec[0], n);

  /* DATA INITIALIZATION */
  // randomize instances
  std::random_shuffle(datavec.begin(), datavec.end());

  // partition data based on taskid
  size_t div = datavec.size() / numtasks;
  ProbSize limit = (taskid == numtasks - 1) ? num_inst : div * (taskid + 1);
  m = limit - div * taskid;

  // dynamically allocate data
  Mat X(m, n);
  Vec labels(m);

  // load data partition
  double feat_val, label;
  ProbSize i = 0;
  for (ProbSize idx = taskid * div; idx < limit; ++idx) {
    std::ifstream data(datavec[idx]);
    for (ProbSize j = 0; j < n; ++j) {
      data >> feat_val;
      X(i, j) = feat_val;
    }
    data >> label;
    labels[i] = label;
    i++;
  }

  // perform feature scaling (optional)
  if (scaling) {
    // Allreduce to find global min
    Vec X_min_tmp = X.colwise().minCoeff();
    Vec X_min = Vec(X_min_tmp.size());
    MPI_Allreduce(X_min_tmp.data(), X_min.data(), X_min_tmp.size(),
                  MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);

    // Allreduce to find global max
    Vec X_max_tmp = X.colwise().maxCoeff();
    Vec X_max = Vec(X_max_tmp.size());
    MPI_Allreduce(X_max_tmp.data(), X_max.data(), X_max_tmp.size(),
                  MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);

    // scale features using global min and max
    mlu::scale_features(X, X_min, X_max, 1, 0);
  }

  /* FORMAT LABELS */
  // get unique labels
  mlu::get_unique_labels(labels, classmap);

  // allreduce to obtain an upper bound on the global label set size
  // (the sum of the local label set sizes)
  int local_size = classmap.size();
  int max_size = 0;
  MPI_Allreduce(&local_size, &max_size, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

  // allreduce to obtain the global unique label set; -1 marks unused slots
  std::vector<int> unique_labels(max_size, -1);
  std::vector<int> global_unique_labels(max_size, -1);
  int idx = 0;
  for (auto& kv : classmap) {
    unique_labels[idx++] = kv.first;
  }
  MPI_Op_create((MPI_User_function *)reduce_unique_labels, 1, &op);
  MPI_Allreduce(unique_labels.data(), global_unique_labels.data(), max_size,
                MPI_INT, op, MPI_COMM_WORLD);
  MPI_Op_free(&op);

  // update local classmap
  std::sort(global_unique_labels.begin(), global_unique_labels.end());
  classmap.clear();
  int labeltmp;
  idx = 0;
  for (int i = 0; i < max_size; ++i) {
    labeltmp = global_unique_labels[i];
    if (labeltmp != -1) {
      classmap.emplace(labeltmp, idx++);
    }
  }

  // format the local label set into a matrix based on the global class map
  Mat y = mlu::format_labels(labels, classmap);
  numlabels = (LayerSize)classmap.size();

  // output total data loading time for each task
  MPI_Barrier(MPI_COMM_WORLD);
  t2 = MPI_Wtime();
  printf("--- task %d loading time %lf\n", taskid, t2 - t1);

  /* INIT LOCAL CLASSIFIER */
  LogisticRegression logistic_layer(n, numlabels, true);

  /* OPTIMIZATION */
  if (taskid == MASTER) {
    printf("\nPerforming Gradient Descent\n");
  }
  int update_size;   // number of instances read for each update
  double grad_mag;   // magnitude of the gradient for each update
  int delta_size = logistic_layer.get_theta_size();
  Vec delta_update = Vec::Zero(delta_size);
  int global_update_size;

  if (taskid == MASTER) {
    printf("iteration : elapsed time : magnitude\n");
  }
  for (int i = 0; i < maxiter; ++i) {
    // compute gradient update
    t1 = MPI_Wtime();
    logistic_layer.compute_gradient(X, y, batch_size, update_size);
    delta_data = logistic_layer.get_delta().data();

    // sum updates across all partitions
    MPI_Allreduce(delta_data, delta_update.data(), delta_size,
                  MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    logistic_layer.set_delta(delta_update);

    // sum the update sizes
    MPI_Allreduce(&update_size, &global_update_size, 1, MPI_INT,
                  MPI_SUM, MPI_COMM_WORLD);

    // normalize + regularize gradient update
    logistic_layer.normalize_gradient(global_update_size);
    logistic_layer.regularize_gradient(global_update_size);

    // update logistic_layer parameters
    t2 = MPI_Wtime();
    if (logistic_layer.converged(grad_mag)) { break; }
    if (taskid == MASTER) {
      printf("%d : %lf : %lf\n", i + 1, t2 - t1, grad_mag);
    }
    logistic_layer.update_theta();
  }

  /* MODEL STORAGE */
  if (taskid == MASTER) {
    FILE *output = fopen(output_file.c_str(), "w");
    int idx;
    Vec theta = logistic_layer.get_theta();
    printf("\nWriting Model to File: %s\n\n", output_file.c_str());
    fprintf(output, "%ld\n", (long)theta.size());
    for (idx = 0; idx < theta.size() - 1; ++idx) {
      fprintf(output, "%lf\t", theta[idx]);
    }
    fprintf(output, "%lf\n", theta[idx]);
    fclose(output);
  }

  MPI_Finalize();
  return 0;
}
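// reduce_unique_labels is registered via MPI_Op_create above, but its body
// is not shown. A sketch of what such a user-defined reduction might look
// like, assuming (as the surrounding code suggests) that both buffers are
// -1-padded arrays of labels and the result is their set union:
void reduce_unique_labels(void *in, void *inout, int *len, MPI_Datatype *dptr) {
  int *invec = (int *)in;
  int *inoutvec = (int *)inout;
  for (int i = 0; i < *len; ++i) {
    int label = invec[i];
    if (label == -1) continue;  // padding, nothing to merge
    // check whether the label is already in the accumulator,
    // remembering the first free (-1) slot along the way
    bool found = false;
    int free_slot = -1;
    for (int j = 0; j < *len; ++j) {
      if (inoutvec[j] == label) { found = true; break; }
      if (inoutvec[j] == -1 && free_slot == -1) free_slot = j;
    }
    // append new labels into the first free slot; capacity suffices because
    // max_size is the sum of all local label set sizes
    if (!found && free_slot != -1) inoutvec[free_slot] = label;
  }
}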