void Permutohedral::compute ( MatrixXf & out, const MatrixXf & in, bool reverse ) const
{
    // Allocate the output to match the input's shape (values are overwritten below)
    if( out.cols() != in.cols() || out.rows() != in.rows() )
        out = 0*in;
    // Use the sequential path for low-dimensional values, the SSE path otherwise
    if( in.rows() <= 2 )
        seqCompute( out.data(), in.data(), in.rows(), reverse );
    else
        sseCompute( out.data(), in.data(), in.rows(), reverse );
}
void blas_gemm(const MatrixXf& a, const MatrixXf& b, MatrixXf& c)
{
    int M = c.rows(); int N = c.cols(); int K = a.cols();
    int lda = a.rows(); int ldb = b.rows(); int ldc = c.rows();

    sgemm_(&notrans,&notrans,&M,&N,&K,&fone,
           const_cast<float*>(a.data()),&lda,
           const_cast<float*>(b.data()),&ldb,&fone,
           c.data(),&ldc);
}
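// The wrapper above relies on a Fortran BLAS sgemm_ symbol plus notrans/fone
// constants being declared elsewhere in the file (similar wrappers appear in
// Eigen's own GEMM benchmarks). A minimal sketch of what those declarations
// could look like -- the names and storage here are assumptions, not taken
// from the snippet itself:
extern "C" {
    void sgemm_(const char *transa, const char *transb,
                const int *m, const int *n, const int *k,
                const float *alpha, const float *a, const int *lda,
                const float *b, const int *ldb,
                const float *beta, float *c, const int *ldc);
}
static char notrans = 'N';
static float fone = 1.0f;   // passed as both alpha and beta, so the call computes c += a*b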
void MapOptimizer::scale_stepsize (MatrixXf & dJ) const
{
  if (logging) LOG(" computing stepsize");

  const size_t XY = dom.size * cod.size;
  const float * restrict J_ = m_joint.data();
  const float * restrict dJ_ = dJ.data();

  // find the largest step t such that J + t * dJ stays nonnegative
  float time_to_boundary = INFINITY;
  for (size_t xy = 0; xy < XY; ++xy) {
    float dJ_xy = dJ_[xy];
    if (dJ_xy < 0) {
      imin(time_to_boundary, -J_[xy] / dJ_xy);
    }
  }

  // back off from the boundary by a factor of 1/e
  float keepaway = 1 / M_E;
  float scale = time_to_boundary * keepaway;
  if (logging) LOG(" scaling step size by " << scale);

  dJ *= scale;
}
double density (const MatrixXf & x)
{
  const size_t I = x.rows() * x.cols();

  double sum_x1 = 0;
  double sum_x2 = 0;
  const float * restrict x_ = x.data();
  for (size_t i = 0; i < I; ++i) {
    double xi = x_[i];
    sum_x1 += max(-xi,xi);   // accumulate the L1 norm
    sum_x2 += xi * xi;       // accumulate the squared L2 norm
  }

  // (|x|_1)^2 / (|x|_2^2 * I), which lies in [1/I, 1] by Cauchy-Schwarz
  return sqr(sum_x1) / sum_x2 / I;
}
// Compute d/df a^T*K*b
MatrixXf kernelGradient( const MatrixXf & a, const MatrixXf & b ) const
{
    MatrixXf g = 0*f_;
    lattice_.gradient( g.data(), a.data(), b.data(), a.rows() );
    return g;
}
void MapOptimizer::constrain_direction (MatrixXf & dJ, float tol) const
{
  // Enforce the simultaneous constraints
  //
  //   /\x. sum y. dJ(y,x) = 0
  //   /\y. sum x. dJ(y,x) = 0
  //
  // We combine the two constraints by iteratively weakly enforcing both:
  // Let Px,Py project to the feasible subspaces for constraints 1,2, resp.
  // Each projection has eigenvalues in {0,1}.
  // We approximate the desired projection Pxy as a linear combination of Px,Py
  //
  //   Pxy' = 1 - alpha ((1-Px) + (1-Py))
  //
  // which has eigenvalues in {1} u [1 - 2 alpha, 1 - alpha].
  // Hence Pxy = lim n->infty Pxy'^n, where convergence rate depends on alpha.
  // The optimal alpha is 2/3, yielding Pxy' eigenvalues in {1} u [-1/3,1/3],
  // and resulting in project_scale = -alpha below.

  if (logging) LOG(" constraining direction");

  const size_t X = dom.size;
  const size_t Y = cod.size;

  const MatrixXf & J = m_joint;
  const VectorXf & sum_y_J = m_dom_prior;
  const VectorXf & sum_x_J = m_cod_prior;
  const float sum_xy_J = m_cod_prior.sum();

  VectorXf sum_y_dJ(J.cols());
  VectorXf sum_x_dJ(J.rows());

  // this is iterative, so we hand-optimize by merging loops
  const float * restrict J_ = J.data();
  const float * restrict sum_y_J_ = sum_y_J.data();
  const float * restrict sum_x_J_ = sum_x_J.data();
  float * restrict dJ_ = dJ.data();
  float * restrict project_y_ = sum_y_dJ.data();
  float * restrict project_x_ = sum_x_dJ.data();

  const float project_scale = -2/3.0;

  Vector<float> accum_x_dJ(Y);
  float * restrict accum_x_dJ_ = accum_x_dJ;

  // accumulate first projection
  accum_x_dJ.zero();
  for (size_t x = 0; x < X; ++x) {
    const float * restrict dJ_x_ = dJ_ + Y * x;

    float accum_y_dJ = 0;
    for (size_t y = 0; y < Y; ++y) {
      float dJ_xy = dJ_x_[y];
      accum_y_dJ += dJ_xy;
      accum_x_dJ_[y] += dJ_xy;
    }
    project_y_[x] = project_scale * accum_y_dJ / sum_y_J_[x];
  }
  for (size_t y = 0; y < Y; ++y) {
    project_x_[y] = project_scale * accum_x_dJ_[y] / sum_x_J_[y];
    accum_x_dJ_[y] = 0;
  }

  // apply previous projection and accumulate next projection
  for (size_t iter = 0; iter < 100; ++iter) {
    float error = 0;

    for (size_t x = 0; x < X; ++x) {
      const float * restrict J_x_ = J_ + Y * x;
      float * restrict dJ_x_ = dJ_ + Y * x;

      float accum_y_dJ = 0;
      for (size_t y = 0; y < Y; ++y) {
        float dJ_xy = dJ_x_[y] += J_x_[y] * (project_x_[y] + project_y_[x]);
        accum_y_dJ += dJ_xy;
        accum_x_dJ_[y] += dJ_xy;
      }
      project_y_[x] = project_scale * accum_y_dJ / sum_y_J_[x];
      imax(error, max(-accum_y_dJ, accum_y_dJ));
    }
    for (size_t y = 0; y < Y; ++y) {
      float accum_x_dJ_y = accum_x_dJ_[y];
      accum_x_dJ_[y] = 0;
      project_x_[y] = project_scale * accum_x_dJ_y / sum_x_J_[y];
      imax(error, max(-accum_x_dJ_y, accum_x_dJ_y));
    }

    if (error < tol) {
      if (logging) {
        LOG(" after " << (1+iter) << " iterations, error < " << error);
      }
      break;
    }
  }

  // apply final projection
  for (size_t x = 0; x < X; ++x) {
    const float * restrict J_x_ = J_ + Y * x;
    float * restrict dJ_x_ = dJ_ + Y * x;

    for (size_t y = 0; y < Y; ++y) {
      dJ_x_[y] += J_x_[y] * (project_x_[y] + project_y_[x]);
    }
  }

  if (debug) {
    sum_y_dJ = dJ.colwise().sum();
    sum_x_dJ = dJ.rowwise().sum();
    float sum_xy_dJ = sum_x_dJ.sum();

    DEBUG("max constraint errors = "
        << sqrt(sum_x_dJ.array().square().maxCoeff()) << ", "
        << sqrt(sum_y_dJ.array().square().maxCoeff()) << ", "
        << sum_xy_dJ);

    sum_y_dJ.array() /= sum_y_J.array();
    sum_x_dJ.array() /= sum_x_J.array();
    sum_xy_dJ /= sum_xy_J;

    DEBUG("max relative constraint errors = "
        << sqrt(sum_x_dJ.array().square().maxCoeff()) << ", "
        << sqrt(sum_y_dJ.array().square().maxCoeff()) << ", "
        << sum_xy_dJ);

    DEBUG("max(|dJ|) = " << dJ.array().abs().maxCoeff()
        << ", rms(dJ) = " << sqrt(dJ.array().square().mean()));
    DEBUG("max(J) / min(J) = " << (J.maxCoeff() / J.minCoeff()));
    DEBUG("max(sum x. J) / min(sum x. J) = "
        << (sum_x_J.maxCoeff() / sum_x_J.minCoeff()));
    DEBUG("max(sum y. J) / min(sum y. J) = "
        << (sum_y_J.maxCoeff() / sum_y_J.minCoeff()));
  }
}
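// A minimal standalone sketch (not part of MapOptimizer) of the
// alternating-projection idea documented in constrain_direction() above.
// It recomputes the row/column sums each pass instead of merging loops the
// way the hand-optimized version does, and the sizes are placeholders.
// Per the comment's reasoning, with alpha = 2/3 the combined operator has
// eigenvalues in {1} u [-1/3, 1/3], so the constraint violations contract
// toward zero and the loop converges quickly.
#include <Eigen/Dense>
#include <algorithm>
#include <iostream>

int main ()
{
    using Eigen::MatrixXf;
    using Eigen::VectorXf;

    const float alpha = 2.0f / 3.0f;
    MatrixXf J  = (MatrixXf::Random(4, 5).array().abs() + 0.1f).matrix();  // positive joint
    MatrixXf dJ = MatrixXf::Random(4, 5);                                  // unconstrained direction
    VectorXf row_J = J.rowwise().sum();
    VectorXf col_J = J.colwise().sum().transpose();

    for (int iter = 0; iter < 100; ++iter) {
        VectorXf row_err = dJ.rowwise().sum();
        VectorXf col_err = dJ.colwise().sum().transpose();
        float error = std::max(row_err.cwiseAbs().maxCoeff(), col_err.cwiseAbs().maxCoeff());
        if (error < 1e-6f) break;
        // subtract a J-weighted share of each row/column excess
        for (int r = 0; r < J.rows(); ++r)
            for (int c = 0; c < J.cols(); ++c)
                dJ(r, c) -= alpha * J(r, c) * (row_err(r) / row_J(r) + col_err(c) / col_J(c));
    }

    // both sets of sums should now be close to zero
    std::cout << "row sums:\n" << dJ.rowwise().sum()
              << "\ncol sums:\n" << dJ.colwise().sum() << std::endl;
    return 0;
}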
MatrixXf M1(2,6);    // Column-major storage
M1 << 1, 2, 3,  4,  5,  6,
      7, 8, 9, 10, 11, 12;

Map<MatrixXf> M2(M1.data(), 6,2);
cout << "M2:" << endl << M2 << endl;
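// For reference, M1's column-major storage order is 1,7,2,8,3,9,4,10,5,11,6,12,
// so mapping the same buffer as 6x2 should print:
// M2:
//  1  4
//  7 10
//  2  5
//  8 11
//  3  6
//  9 12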
int main (int argc, char *argv[])
{
    // handle cmd args
    // TODO: add <hidden_layer_sizes> (i.e., "100-100-50") argument processing
    int batch_size;
    if ( argc > 2 ) {
        printf( " Usage: ./neuralnet_mpi <batch_size>\n" );
        exit( 0 );
    }
    else if ( argc == 2 ) {
        batch_size = atoi( argv[1] );   // mini-batch processing
    }
    else {
        batch_size = INT_MIN;           // batch processing
    }

    // initialize/populate mpi specific vars local to each node
    int numtasks, taskid, len, dest, source;
    char hostname[MPI_MAX_PROCESSOR_NAME];
    MPI_Status status;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
    MPI_Comm_rank(MPI_COMM_WORLD, &taskid);
    MPI_Get_processor_name(hostname, &len);

    /***** MASTER TASK ONLY ******/
    // perform data preprocessing based on number of workers and batch_size
    if (taskid == MASTER) {
        printf( "MASTER: Number of MPI tasks is: %d\n", numtasks );

        /* DATA PREPROCESSING */
        // Load dataset
        // TEMP: populate fictitious dataset
        // NOTE: record max and min for each column in the max/min arrays
        long datasize = 16;
        long numfeats = 4;      // to be populated while loading dataset
        long numlabels = 1;     // for testing binary classification only
        //float min[numfeats];  // stores the overall min for each column
        //float max[numfeats];  // stores the overall max for each column

        // NOTE: the chunked memcpy/MPI_Send below assumes row-major storage,
        // but Eigen's MatrixXf is column-major by default; a
        // Matrix<float, Dynamic, Dynamic, RowMajor> would be needed for each
        // contiguous chunk to correspond to whole instances.
        MatrixXf data = MatrixXf( datasize, numfeats );
        VectorXf labels_vec = VectorXf( datasize );
        for ( long i=0; i<datasize; ++i ) {
            for ( long j=0; j<numfeats; ++j ) {
                data(i, j) = i + j;
            }
            labels_vec[i] = 1.0;    // populate a vector with labels for each instance
        }
        data(0,0) = 100.1;
        data(1,0) = 200.1;
        data(0,1) = 300.1;

        // Shuffle data/labels to randomize instances

        // Reformat labels for multi-class classification
        MatrixXf labels = MatrixXf::Zero( datasize, numlabels );
        // TODO: create class_map from unique classes with k:v = <true_label>:<column_index>
        for ( long i=0; i<datasize; ++i ) {
            // labels( i, class_map[y_vec[i]] ) = 1.0;
        }

        // Scale features
        // TODO: use max/min arrays to scale data to between -1 and 1
        // # scales all features in dataset X to values between new_min and new_max
        // X_min, X_max = X.min(0), X.max(0)
        // return (((X - X_min) / (X_max - X_min + 0.000001)) * (new_max - new_min)) + new_min
        // (see the scale_features sketch after this function)

        /* DATA MARSHALLING */
        long chunksize = datasize / numtasks;

        // load MASTER data
        MatrixXf X = MatrixXf( chunksize, numfeats );
        MatrixXf y = MatrixXf( chunksize, numlabels );
        memcpy( X.data(), data.data(), chunksize * numfeats * sizeof(float) );
        memcpy( y.data(), labels.data(), chunksize * numlabels * sizeof(float) );
        std::cout << "MASTER X:\n" << X << std::endl;
        std::cout << "MASTER y:\n" << y << std::endl;

        // send data to workers
        long offset = chunksize;
        for (dest=1; dest<numtasks; dest++) {
            MPI_Send( &chunksize, 1, MPI_LONG, dest, TAG_0, MPI_COMM_WORLD );
            MPI_Send( &numfeats, 1, MPI_LONG, dest, TAG_0, MPI_COMM_WORLD );
            MPI_Send( &numlabels, 1, MPI_LONG, dest, TAG_0, MPI_COMM_WORLD );
            MPI_Send( data.data() + offset * numfeats, chunksize * numfeats, MPI_FLOAT, dest, TAG_0, MPI_COMM_WORLD );
            MPI_Send( labels.data() + offset * numlabels, chunksize * numlabels, MPI_FLOAT, dest, TAG_0, MPI_COMM_WORLD );
            printf( "Sent %ld instances to task %d offset= %ld\n", chunksize, dest, offset );
            offset += chunksize;
        }

        /* CLASSIFICATION MODEL INITIALIZATION */
        // pass network structure and processing parameters message
        // initialize MASTER NN
        // initialize network parameters
        // TODO: create randomized set of parameters stored in contiguous memory
        // to be packed and unpacked as needed. The important part of doing this on
        // the MASTER first is that all networks will start with the same parameter set.
        // set MASTER NN parameters

        /* OPTIMIZATION */
        // optimize
        offset = 0;

        /* PREDICTION */
        // predict on validation set
        // output prediction results

        /* MODEL STORAGE */
        // store parameters
    }

    /***** NON-MASTER TASKS ONLY *****/
    if (taskid > MASTER) {
        printf ("Hello from task %d on %s!\n", taskid, hostname);

        /* DATA INITIALIZATION */
        long chunksize, numfeats, numlabels;
        source = MASTER;

        // receive data partition
        MPI_Recv( &chunksize, 1, MPI_LONG, source, MPI_ANY_TAG, MPI_COMM_WORLD, &status );
        MPI_Recv( &numfeats, 1, MPI_LONG, source, MPI_ANY_TAG, MPI_COMM_WORLD, &status );
        MPI_Recv( &numlabels, 1, MPI_LONG, source, MPI_ANY_TAG, MPI_COMM_WORLD, &status );
        printf( "Task %d chunksize = %ld\n", taskid, chunksize );
        printf( "Task %d numfeats = %ld\n", taskid, numfeats );
        printf( "Task %d numlabels = %ld\n", taskid, numlabels );

        // initialize local data storage
        MatrixXf X = MatrixXf( chunksize, numfeats );
        MatrixXf y = MatrixXf( chunksize, numlabels );

        // receive data and labels
        MPI_Recv( X.data(), chunksize * numfeats, MPI_FLOAT, source, MPI_ANY_TAG, MPI_COMM_WORLD, &status );
        MPI_Recv( y.data(), chunksize * numlabels, MPI_FLOAT, source, MPI_ANY_TAG, MPI_COMM_WORLD, &status );
        std::cout << "task " << taskid << " X:\n" << X << std::endl;
        std::cout << "task " << taskid << " y:\n" << y << std::endl;

        /* CLASSIFICATION MODEL INITIALIZATION */
        // receive network structure and processing parameters info
        // initialize local neuralnet_openmp instance
        // TODO: each NN instance is set with identical structure and processing parameters
        // receive network parameters and pass to local instance
        // TODO: each NN instance gets the same set of randomized parameters

        /* OPTIMIZATION */
        // optimize
    }

    MPI_Finalize();
}
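// A hypothetical helper sketching the min/max feature scaling described by the
// Python-style pseudocode in the TODO above; it is not part of the original
// program, just one way the step could look with Eigen:
#include <Eigen/Dense>

Eigen::MatrixXf scale_features (const Eigen::MatrixXf & X, float new_min, float new_max)
{
    Eigen::MatrixXf scaled( X.rows(), X.cols() );
    for ( Eigen::Index j = 0; j < X.cols(); ++j ) {
        float x_min = X.col(j).minCoeff();
        float x_max = X.col(j).maxCoeff();
        // the small epsilon guards against constant columns, as in the pseudocode
        scaled.col(j) = ( ((X.col(j).array() - x_min) / (x_max - x_min + 0.000001f))
                          * (new_max - new_min) + new_min ).matrix();
    }
    return scaled;
}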
int main(void)
{
    cout << "Eigen v" << EIGEN_WORLD_VERSION << "." << EIGEN_MAJOR_VERSION << "." << EIGEN_MINOR_VERSION << endl;

    static const int R = 288;
    static const int N = R*(R+1)/2;
    static const int M = 63;
    static const int HALF_M = M/2;
    static const float nsigma = 2.5f;

    MatrixXf data = MatrixXf::Random(M, N);
    MatrixXf mask = MatrixXf::Zero(M, N);
    MatrixXf result = MatrixXf::Zero(1, N);
    VectorXf std = VectorXf::Zero(N);
    VectorXf centroid = VectorXf::Zero(N);
    VectorXf mean = VectorXf::Zero(N);
    VectorXf minval = VectorXf::Zero(N);
    VectorXf maxval = VectorXf::Zero(N);

    cout << "computing..." << flush;
    double t = GetRealTime();

    // computes the exact median
    if (M&1)
    {
#pragma omp parallel for
        for (int i = 0; i < N; i++)
        {
            vector<float> row(data.data()+i*M, data.data()+(i+1)*M);
            nth_element(row.begin(), row.begin()+HALF_M, row.end());
            centroid(i) = row[HALF_M];
        }
    }
    // nth_element guarantees x_0,...,x_{n-1} < x_n, so for even M the lower
    // middle value is the largest element of the first half
    else
    {
#pragma omp parallel for
        for (int i = 0; i < N; i++)
        {
            vector<float> row(data.data()+i*M, data.data()+(i+1)*M);
            nth_element(row.begin(), row.begin()+HALF_M, row.end());
            centroid(i) = row[HALF_M];
            centroid(i) += *max_element(row.begin(), row.begin()+HALF_M);
            centroid(i) *= 0.5f;
        }
    }

    // compute the mean
    mean = data.colwise().mean();

    // compute std (x) = sqrt ( 1/N SUM_i (x(i) - mean(x))^2 )
    std = (((data.rowwise() - mean.transpose()).array().square()).colwise().sum() * (1.0f / M))
              .array()
              .sqrt();

    // compute n sigmas from centroid
    minval = centroid - std * nsigma;
    maxval = centroid + std * nsigma;

    // compute clip mask: 1 where the sample lies inside (minval, maxval), 0 otherwise
    for (int i = 0; i < N; i++)
    {
        mask.col(i) = ((data.col(i).array() > minval(i)) &&
                       (data.col(i).array() < maxval(i)))
                          .select(VectorXf::Ones(M), 0.0f);
    }

    // apply clip mask to data
    data.array() *= mask.array();

    // compute mean such that we ignore clipped data, this is our final result
    result = data.colwise().sum().array() / mask.colwise().sum().array();

    t = GetRealTime() - t;
    cout << "[done]" << endl << endl;

    size_t bytes = data.size()*sizeof(float);
    cout << "data: " << M << "x" << N << endl;
    cout << "size: " << bytes*1e-6f << " MB" << endl;
    cout << "rate: " << bytes/(1e6f*t) << " MB/s" << endl;
    cout << "time: " << t << " s" << endl;

    return 0;
}
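// Build note (an assumption about the surrounding project, not stated above):
// the omp pragmas only take effect when OpenMP is enabled at compile time,
// e.g. with g++ something like: g++ -O2 -fopenmp -I/path/to/eigen sigma_clip.cpp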
Vector<float> as_vector (const MatrixXf & x)
{
  return Vector<float>(x.size(), const_cast<float *>(x.data()));
}