int main(int argc, char** argv) {
    // seed the random generator
    srand(time(NULL));

    // load the iris dataset, scaling each feature by a constant factor
    ifstream infile("../iris.data");
    string line;
    DataVec data;
    double fac = 4.0;
    while (std::getline(infile, line)) {
        std::vector<std::string> fields;
        boost::split(fields, line, boost::is_any_of(","));
        assert(fields.size() == 5);
        double x1 = atof(fields[0].c_str()) * fac;
        double x2 = atof(fields[1].c_str()) * fac;
        double x3 = atof(fields[2].c_str()) * fac;
        double x4 = atof(fields[3].c_str()) * fac;
        data.push_back(Point(x1, x2, x3, x4));
    }
    std::cout << "Collected " << data.size() << " points." << std::endl;
    assert(data.size() > K);

    // init centroids to K distinct random points
    DataVec centroids;
    random_unique(data.begin(), data.end(), K);
    std::cout << K << " random points" << std::endl;
    for (size_t i = 0; i < K; ++i) {
        std::cout << data[i] << "\n";
        centroids.push_back(data[i]);
    }
    std::cout << centroids.size() << " centroids" << std::endl;

    // Lloyd's algorithm: iterate until the cluster assignment is stable
    bool done = fit(data, centroids);
    while (!done) {
        done = fit(data, centroids);
        std::cout << done << "\n";
    }

    double idx = dunnIndex(data, centroids);
    cout << "Dunn index for this clustering: " << idx << "\n";

    // write the clustering to file
    ofstream of("clusters.dat");
    for (DataVec::iterator it = data.begin(); it != data.end(); ++it) {
        of << *it << std::endl;
    }
    of.close();
    return EXIT_SUCCESS;
}
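The k-means driver above calls a random_unique helper that is not part of this listing. A minimal sketch of the usual implementation, a partial Fisher-Yates shuffle that moves num_random distinct elements to the front of the range (an assumption about the helper, not the author's exact code):

#include <algorithm>
#include <cstdlib>
#include <iterator>

// Partial Fisher-Yates shuffle: swaps num_random randomly chosen,
// distinct elements to the front of [begin, end) and returns the
// iterator one past the selected prefix.
template <class BidiIter>
BidiIter random_unique(BidiIter begin, BidiIter end, size_t num_random) {
    size_t left = std::distance(begin, end);
    while (num_random--) {
        BidiIter r = begin;
        std::advance(r, rand() % left);
        std::swap(*begin, *r);
        ++begin;
        --left;
    }
    return begin;
}

Because the shuffle moves the selected points to the front, the driver can simply read data[0] through data[K-1] as its initial centroids, which is exactly what the loop above does.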
int main(int argc, char* argv[]) {
    // initialize MPI and populate per-task variables
    // (MPI_Init must come first so the usage-error path below
    // can legally call MPI_Finalize)
    double t1, t2;    // elapsed-time computation
    int numtasks, taskid, len;
    char hostname[MPI_MAX_PROCESSOR_NAME];
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
    MPI_Comm_rank(MPI_COMM_WORLD, &taskid);
    MPI_Get_processor_name(hostname, &len);
    MPI_Op op;

    // handle command-line arguments
    int batch_size, maxiter;
    std::string datadir;
    std::string output_file;
    if (argc > 5 || argc < 2) {
        printf("Usage: ./logistic_mpi <data_directory> <batch_size> "
               "<max_iterations> <model_output_file>\n");
        MPI_Finalize();
        exit(0);
    } else if (argc == 5) {
        datadir = argv[1];
        batch_size = atoi(argv[2]);    // mini-batch processing
        if (batch_size == -1) { batch_size = INT_MIN; }
        maxiter = atoi(argv[3]);
        output_file = argv[4];
    } else if (argc == 4) {
        datadir = argv[1];
        batch_size = atoi(argv[2]);    // mini-batch processing
        if (batch_size == -1) { batch_size = INT_MIN; }
        maxiter = atoi(argv[3]);
        output_file = "logistic.model";
    } else if (argc == 3) {
        datadir = argv[1];
        batch_size = atoi(argv[2]);    // mini-batch processing
        if (batch_size == -1) { batch_size = INT_MIN; }
        maxiter = 100;
        output_file = "logistic.model";
    } else {
        datadir = argv[1];
        batch_size = INT_MIN;          // full-batch processing
        maxiter = 100;
        output_file = "logistic.model";
    }

    /* DATA PREPROCESSING */
    if (taskid == MASTER) {
        printf("\nLoading and Preprocessing Data\n");
    }
    t1 = MPI_Wtime();

    // determine the number of instances
    DataVec datavec;
    mlu::count_instances(datadir, datavec, num_inst);

    // determine the number of features
    mlu::count_features(datavec[0], n);

    /* DATA INITIALIZATION */
    // randomize instances
    std::random_shuffle(datavec.begin(), datavec.end());
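The mlu helpers used above are not shown in this listing. From the loading loop that follows, each instance appears to live in its own file under datadir, holding n feature values followed by a label. A rough sketch of what the two helpers are assumed to do under that reading (the aliases and signatures are inferred, not the library's documented API):

#include <dirent.h>
#include <fstream>
#include <string>
#include <vector>

// Assumed aliases, matching how the listing uses these names:
typedef std::vector<std::string> DataVec;
typedef unsigned long ProbSize;

namespace mlu {
// Assumed behavior: collect one path per instance file in datadir
// and report the total instance count.
void count_instances(const std::string& datadir,
                     DataVec& datavec, ProbSize& num_inst) {
    DIR* dir = opendir(datadir.c_str());
    for (dirent* ent = readdir(dir); ent != NULL; ent = readdir(dir)) {
        std::string name(ent->d_name);
        if (name != "." && name != "..")
            datavec.push_back(datadir + "/" + name);
    }
    closedir(dir);
    num_inst = datavec.size();
}

// Assumed behavior: count the whitespace-separated values in one
// instance file; the last value is the label, the rest are features.
void count_features(const std::string& path, ProbSize& n) {
    std::ifstream in(path.c_str());
    double val;
    ProbSize count = 0;
    while (in >> val) ++count;
    n = count - 1;    // subtract the trailing label
}
}  // namespace mlu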
    // partition data based on task id; the last task absorbs the remainder
    size_t div = datavec.size() / numtasks;
    ProbSize limit = (taskid == numtasks - 1) ? num_inst : div * (taskid + 1);
    m = limit - div * taskid;

    // dynamically allocate the local data partition
    Mat X(m, n);
    Vec labels(m);

    // load the data partition
    double feat_val, label;
    ProbSize i = 0;
    for (ProbSize idx = taskid * div; idx < limit; ++idx) {
        std::ifstream data(datavec[idx]);
        for (ProbSize j = 0; j < n; ++j) {
            data >> feat_val;
            X(i, j) = feat_val;
        }
        data >> label;
        labels[i] = label;
        i++;
    }

    // perform feature scaling (optional)
    if (scaling) {
        // Allreduce to find the global minimum of each feature
        Vec X_min_tmp = X.colwise().minCoeff();
        Vec X_min(X_min_tmp.size());
        MPI_Allreduce(X_min_tmp.data(), X_min.data(), X_min_tmp.size(),
                      MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD);

        // Allreduce to find the global maximum of each feature
        Vec X_max_tmp = X.colwise().maxCoeff();
        Vec X_max(X_max_tmp.size());
        MPI_Allreduce(X_max_tmp.data(), X_max.data(), X_max_tmp.size(),
                      MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);

        // scale features using the global min and max
        mlu::scale_features(X, X_min, X_max, 1, 0);
    }

    /* FORMAT LABELS */
    // get the unique labels present in this partition
    mlu::get_unique_labels(labels, classmap);

    // Allreduce the local label-set sizes; summing them gives an upper
    // bound that is guaranteed to hold the union of all local label sets
    int local_size = classmap.size();
    int max_size = 0;
    MPI_Allreduce(&local_size, &max_size, 1, MPI_INT, MPI_SUM,
                  MPI_COMM_WORLD);

    // Allreduce with a user-defined op to obtain the global unique
    // label set (-1 marks unused slots)
    std::vector<int> unique_labels(max_size, -1);
    std::vector<int> global_unique_labels(max_size, -1);
    int idx = 0;
    for (auto& kv : classmap) {
        unique_labels[idx++] = kv.first;
    }
    MPI_Op_create((MPI_User_function*)reduce_unique_labels, 1, &op);
    MPI_Allreduce(unique_labels.data(), global_unique_labels.data(),
                  max_size, MPI_INT, op, MPI_COMM_WORLD);
    MPI_Op_free(&op);

    // rebuild the local class map from the global label set
    std::sort(global_unique_labels.begin(), global_unique_labels.end());
    classmap.clear();
    int labeltmp;
    idx = 0;
    for (int i = 0; i < max_size; ++i) {
        labeltmp = global_unique_labels[i];
        if (labeltmp != -1) {
            classmap.emplace(labeltmp, idx++);
        }
    }

    // format the local label set into a matrix based on the global class map
    Mat y = mlu::format_labels(labels, classmap);
    numlabels = (LayerSize)classmap.size();

    // output the total data-loading time for each task
    MPI_Barrier(MPI_COMM_WORLD);
    t2 = MPI_Wtime();
    printf("--- task %d loading time %lf\n", taskid, t2 - t1);
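The reduce_unique_labels function registered with MPI_Op_create above is another helper that is not shown. A plausible sketch under the conventions used here (assumed, not the original): it merges two -1-padded arrays element by element so that the output array ends up holding the union of the labels, which is a commutative operation, matching the commute=1 flag passed to MPI_Op_create.

#include <mpi.h>

// Assumed user-defined reduction: merge the labels in invec into
// inoutvec, appending each label not already present at the first
// free (-1) slot. Both arrays are padded with -1.
void reduce_unique_labels(int* invec, int* inoutvec, int* len,
                          MPI_Datatype* dtype) {
    for (int i = 0; i < *len && invec[i] != -1; ++i) {
        bool found = false;
        int j = 0;
        for (; j < *len && inoutvec[j] != -1; ++j) {
            if (inoutvec[j] == invec[i]) { found = true; break; }
        }
        if (!found && j < *len) {
            inoutvec[j] = invec[i];
        }
    }
}

Because max_size was computed as the sum of the local label-set sizes, the output array always has room for the union, so the append in the sketch cannot overflow.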
    /* INIT LOCAL CLASSIFIER */
    LogisticRegression logistic_layer(n, numlabels, true);

    /* OPTIMIZATION */
    if (taskid == MASTER) {
        printf("\nPerforming Gradient Descent\n");
    }
    int update_size;    // number of instances read for each update
    double grad_mag;    // magnitude of the gradient for each update
    int delta_size = logistic_layer.get_theta_size();
    Vec delta_update = Vec::Zero(delta_size);
    int global_update_size;

    if (taskid == MASTER) {
        printf("iteration : elapsed time : magnitude\n");
    }
    for (int i = 0; i < maxiter; ++i) {
        // compute the local gradient update
        t1 = MPI_Wtime();
        logistic_layer.compute_gradient(X, y, batch_size, update_size);
        double* delta_data = logistic_layer.get_delta().data();

        // sum the updates across all partitions
        MPI_Allreduce(delta_data, delta_update.data(), delta_size,
                      MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
        logistic_layer.set_delta(delta_update);

        // sum the update sizes
        MPI_Allreduce(&update_size, &global_update_size, 1, MPI_INT,
                      MPI_SUM, MPI_COMM_WORLD);

        // normalize and regularize the gradient update
        logistic_layer.normalize_gradient(global_update_size);
        logistic_layer.regularize_gradient(global_update_size);

        // update the logistic_layer parameters
        t2 = MPI_Wtime();
        if (logistic_layer.converged(grad_mag)) { break; }
        if (taskid == MASTER) {
            printf("%d : %lf : %lf\n", i + 1, t2 - t1, grad_mag);
        }
        logistic_layer.update_theta();
    }

    /* MODEL STORAGE */
    if (taskid == MASTER) {
        FILE* output = fopen(output_file.c_str(), "w");
        int idx;
        Vec theta = logistic_layer.get_theta();
        printf("\nWriting Model to File: %s\n\n", output_file.c_str());
        fprintf(output, "%lu\n", (unsigned long)theta.size());
        for (idx = 0; idx < theta.size() - 1; ++idx) {
            fprintf(output, "%lf\t", theta[idx]);
        }
        fprintf(output, "%lf\n", theta[idx]);
        fclose(output);
    }

    MPI_Finalize();
    return 0;
}
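For reference, a typical invocation matching the usage string printed by the program might look like the following; the path and values are illustrative only, and the batch size of -1 (or omitting it) selects full-batch processing:

mpirun -np 4 ./logistic_mpi data/train 100 500 logistic.model

Each of the four ranks then loads roughly a quarter of the instance files, and the Allreduce calls inside the loop keep the model parameters synchronized, so the trained theta written by the master reflects gradients from the entire dataset.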