static void chol_driver(blas_idx_t n_global)
{
    auto grid = std::make_shared<blacs_grid_t>();
    auto a    = make_tridiagonal(grid, n_global);

    // Compute Cholesky factorization of A in-place
    char uplo = 'U';
    blas_idx_t ia = 1, ja = 1, info;

    MPI_Barrier(MPI_COMM_WORLD);
    double t0 = MPI_Wtime();
    pdpotrf_(uplo, n_global, a->local_data(), ia, ja, a->descriptor(), info);
    assert(info == 0);
    double t1 = MPI_Wtime() - t0;

    double t_glob;
    MPI_Reduce(&t1, &t_glob, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

    if (grid->iam() == 0)
    {
        double gflops = potrf_flops(n_global) / t_glob / grid->nprocs();
        printf("\n"
               "MATRIX CHOLESKY FACTORIZATION BENCHMARK SUMMARY\n"
               "===============================================\n"
               "N = %d\tNP = %d\tNP_ROW = %d\tNP_COL = %d\n"
               "Time for PxPOTRF = %10.7f seconds\tGflops/Proc = %10.7f\n",
               n_global, grid->nprocs(), grid->nprows(), grid->npcols(),
               t_glob, gflops);
        fflush(stdout);
    }
}
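// potrf_flops is not defined in this file. A minimal sketch under the
// assumption that it returns the operation count in Gflop: the standard
// flop count for the Cholesky factorization of an n x n matrix is roughly
// n^3/3 (plus lower-order terms).
static double potrf_flops(blas_idx_t n)
{
    double nd = static_cast<double>(n);
    return (nd * nd * nd / 3.0) / 1.0e9;
}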
void Buddha::worker(uint64_t from, uint64_t to)
{
    uint64_t filled = 0;
    std::vector<uint64_t> local_data(thread_vector_size_);
    std::size_t progress_local = 0;

    floating_type radius_sqr      = radius_ * radius_;
    floating_type subpixel_width  = 2 * radius_ / x_size_;
    floating_type subpixel_height = 2 * radius_ / y_size_;

    for (uint64_t sub_x = 0; sub_x < subpixel_resolution_; ++sub_x) {
        for (uint64_t sub_y = 0; sub_y < subpixel_resolution_; ++sub_y) {
            for (uint64_t i = from; i < to; ++i) {
                ++progress_local;

                // Make sure the longest possible orbit still fits in the buffer.
                if (filled + max_iterations_ >= thread_vector_size_)
                    flush_data(local_data, filled);

                complex_type c = lin2complex(i);
                c.real(c.real() + sub_x * subpixel_width);
                c.imag(c.imag() + sub_y * subpixel_height);

                // Skip points that are provably inside the Mandelbrot set.
                if (mandelbrot_hint(c))
                    continue;

                complex_type z = c;
                uint64_t pos = 0;
                while (z.real() * z.real() + z.imag() * z.imag() < radius_sqr
                       && pos < max_iterations_) {
                    // TODO: Possible optimization when computing abs(z)^2.
                    uint64_t zpos = complex2lin(z);
                    if (zpos < data_.size())
                        local_data[filled + pos] = zpos;
                    z *= z;
                    z += c;
                    ++pos;
                }

                // Only commit orbits that escaped within the iteration window;
                // otherwise the next orbit simply overwrites the buffered entries.
                if (pos >= min_iterations_ && pos < max_iterations_)
                    filled += pos;
            }

            if (progress_local > 10000) {
                progress_ += progress_local;
                progress_local = 0;
            }
        }
    }
    flush_data(local_data, filled);
}
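// flush_data is not shown here. A minimal sketch of its likely role,
// assuming data_ is the shared hit-count histogram indexed by pixel and
// data_mutex_ is a (hypothetical) mutex member guarding it: merge the
// thread-local orbit buffer into the histogram, then reset the fill count
// so the caller can reuse the buffer.
void Buddha::flush_data(std::vector<uint64_t>& local_data, uint64_t& filled)
{
    std::lock_guard<std::mutex> lock(data_mutex_); // hypothetical member
    for (uint64_t i = 0; i < filled; ++i)
        ++data_[local_data[i]];  // one hit per recorded orbit position
    filled = 0;
}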
void umat(TUmatData &umat_data)
{
    typedef LocalData<TUmatData> TLocalData;

    /// localdata
    TLocalData local_data(umat_data);
    // PRINT(local_data.epsilon);

    /// stiffness
    local_data.H0(0, 0) = local_data.E0;
    // PRINT(local_data.H0);

    /// stress
    local_data.sigma = local_data.H0 * local_data.epsilon;
    // PRINT(local_data.sigma);

    /// ener
    local_data.ener = dot_vec_col(local_data.sigma, local_data.epsilon) / 2;
    // PRINT(local_data.ener);
} // void umat
void umat(TUmatData &umat_data)
{
    typedef LocalData<TUmatData> TLocalData;

    /// localdata
    TLocalData local_data(umat_data);
    // PRINT(local_data.epsilon);

    /// stiffness
    local_data.H0 = elasticity_isotrope_H<ndim>(local_data.E0, local_data.N0);
    // PRINT(local_data.H0);

    /// stress
    local_data.sigma = local_data.H0 * local_data.epsilon;
    // PRINT(local_data.sigma);

    /// ener
    local_data.ener = dot_vec_col(local_data.sigma, local_data.epsilon) / 2;
    // PRINT(local_data.ener);
} // void umat
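// elasticity_isotrope_H is not defined in this file. A minimal sketch (not
// the library's actual implementation) of the isotropic Hooke matrix in
// Voigt notation, built from Young's modulus E and Poisson's ratio nu via
// the Lame constants
//   lambda = E*nu / ((1 + nu)*(1 - 2*nu)),   mu = E / (2*(1 + nu)).
// For ndim == 2 this is the plane-strain variant; a plain 2D vector stands
// in for the real code's matrix type.
#include <vector>

template <int ndim>
std::vector<std::vector<double>> elasticity_isotrope_H_sketch(double E, double nu)
{
    const int voigt = ndim * (ndim + 1) / 2;   // 1 -> 1, 2 -> 3, 3 -> 6
    const double lambda = E * nu / ((1 + nu) * (1 - 2 * nu));
    const double mu     = E / (2 * (1 + nu));
    std::vector<std::vector<double>> H(voigt, std::vector<double>(voigt, 0.0));
    for (int i = 0; i < ndim; ++i)
        for (int j = 0; j < ndim; ++j)
            H[i][j] = (i == j) ? lambda + 2 * mu : lambda;   // normal terms
    for (int i = ndim; i < voigt; ++i)
        H[i][i] = mu;                                        // shear terms
    return H;
}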
/// The computed inverse is stored in the matrix ai
/// n_global: the order of the matrix
static void inv_driver(blas_idx_t n_global)
{
    auto grid = std::make_shared<blacs_grid_t>();

    //// test code (disabled)
    //n_global = 3;
    //double *aaa = new double[n_global*n_global];
    //for (int i = 0; i < 9; i++)
    //{
    //    aaa[i] = i + 1;
    //}
    //aaa[8] = 10;
    //auto a = block_cyclic_mat_t::createWithArray(grid, n_global, n_global, aaa);

    // Create an NxN random matrix A
    auto a = block_cyclic_mat_t::random(grid, n_global, n_global);

    // Create an NxN matrix to hold A^{-1}
    auto ai = block_cyclic_mat_t::constant(grid, n_global, n_global);

    // Copy A to A^{-1} since it will be overwritten during factorization
    std::copy_n(a->local_data(), a->local_size(), ai->local_data());

    MPI_Barrier(MPI_COMM_WORLD);
    double t0 = MPI_Wtime();

    // Factorize A
    blas_idx_t ia = 1, ja = 1;
    std::vector<blas_idx_t> ipiv(a->local_rows() + a->row_block_size() + 100);
    blas_idx_t info;

    // The name decodes as D-GE-TRF: D for double precision, GE for a
    // general matrix, and TRF for triangular factorization, i.e. the
    // usual LU decomposition.
    pdgetrf_(n_global, n_global, ai->local_data(), ia, ja, ai->descriptor(),
             ipiv.data(), info);
    assert(info == 0);
    double t_factor = MPI_Wtime() - t0;

    // Compute A^{-1} from the LU factorization.
    // First query the required sizes of the double and integer work arrays
    // on each process.
    blas_idx_t lwork  = 10;
    blas_idx_t liwork = 10;
    std::vector<double>     work (lwork);
    std::vector<blas_idx_t> iwork(liwork);

    lwork = liwork = -1;    // workspace query: compute lwork and liwork
    pdgetri_(n_global, ai->local_data(), ia, ja, ai->descriptor(),
             ipiv.data(), work.data(), lwork, iwork.data(), liwork, info);
    assert(info == 0);

    lwork  = static_cast<blas_idx_t>(work[0]);
    liwork = static_cast<blas_idx_t>(iwork[0]);
    work.resize(lwork);
    iwork.resize(liwork);

    // Now compute the inverse
    t0 = MPI_Wtime();
    pdgetri_(n_global, ai->local_data(), ia, ja, ai->descriptor(),
             ipiv.data(), work.data(), lwork, iwork.data(), liwork, info);
    assert(info == 0);
    double t_solve = MPI_Wtime() - t0;

    // Verify that the inverse is correct using A*A^{-1} = I
    auto identity = block_cyclic_mat_t::diagonal(grid, n_global, n_global);

    // Compute E = A * A^{-1} - I and verify that ||E|| is small
    char nein = 'N';
    double alpha = 1.0, beta = -1.0;
    pdgemm_(nein, nein, n_global, n_global, n_global, alpha,
            a->local_data(),  ia, ja, a->descriptor(),
            ai->local_data(), ia, ja, ai->descriptor(), beta,
            identity->local_data(), ia, ja, identity->descriptor());

    // Compute the 1-norm of the result
    char norm = '1';
    work.resize(identity->local_cols());
    double err = pdlange_(norm, n_global, n_global, identity->local_data(),
                          ia, ja, identity->descriptor(), work.data());

    double t_total = t_factor + t_solve;
    double t_glob;
    MPI_Reduce(&t_total, &t_glob, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

    if (grid->iam() == 0)
    {
        double gflops = getri_flops(n_global) / t_glob / grid->nprocs();
        printf("\n"
               "MATRIX INVERSE BENCHMARK SUMMARY\n"
               "================================\n"
               "N = %d\tNP = %d\tNP_ROW = %d\tNP_COL = %d\n"
               "Time for PxGETRF + PxGETRI = %10.7f seconds\tGflops/Proc = %10.7f, Error = %f\n",
               n_global, grid->nprocs(), grid->nprows(), grid->npcols(),
               t_glob, gflops, err);
        fflush(stdout);
    }
}
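// getri_flops is likewise not defined here. A minimal sketch under the same
// assumed Gflop convention as potrf_flops: PxGETRF costs about (2/3)n^3
// flops and PxGETRI about (4/3)n^3, so the combined inversion costs
// roughly 2n^3.
static double getri_flops(blas_idx_t n)
{
    double nd = static_cast<double>(n);
    return (2.0 * nd * nd * nd) / 1.0e9;
}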
static void dgemm_driver(blas_idx_t m_global, blas_idx_t n_global, blas_idx_t k_global)
{
    auto grid = std::make_shared<blacs_grid_t>();

    auto a = block_cyclic_mat_t::random(grid, m_global, k_global);
    // auto b = block_cyclic_mat_t::random(grid, k_global, n_global);
    auto c = block_cyclic_mat_t::random(grid, m_global, n_global);

    // For testing, fill B with known values. TODO: remove.
    // Note that B must be k_global x n_global to be conformable with A and C.
    double* dd = new double[k_global * n_global];
    for (blas_idx_t i = 0; i < k_global * n_global; i++)
        dd[i] = i;
    auto b = block_cyclic_mat_t::createWithArray(grid, k_global, n_global, dd);

    // The matrix computation starts here
    MPI_Barrier(MPI_COMM_WORLD);

    double alpha = 1.0, beta = 0.0;
    double t0 = MPI_Wtime();
    char NEIN = 'N';    // 'N': do not transpose
    blas_idx_t ia = 1, ja = 1, ib = 1, jb = 1, ic = 1, jc = 1;

    // sub(C) = alpha*op(sub(A))*op(sub(B)) + beta*sub(C)
    pdgemm_(NEIN, NEIN, m_global, n_global, k_global, alpha,
            a->local_data(), ia, ja, a->descriptor(),
            b->local_data(), ib, jb, b->descriptor(), beta,
            c->local_data(), ic, jc, c->descriptor());

    double t1 = MPI_Wtime() - t0;

    // Take the maximum time over all processes
    double t_glob;
    MPI_Reduce(&t1, &t_glob, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

    if (grid->iam() == 0)   // rank 0 only
    {
        double gflops = gemm_flops(m_global, n_global, k_global) / t_glob / grid->nprocs();
        printf("\n"
               "MATRIX MULTIPLY BENCHMARK SUMMARY\n"
               "=================================\n"
               "M = %d\tN = %d\tK = %d\tNP = %d\tNP_ROW = %d\tNP_COL = %d\n"
               "Time for PxGEMM = %10.7f seconds\tGFlops/Proc = %10.7f\n",
               m_global, n_global, k_global, grid->nprocs(), grid->nprows(),
               grid->npcols(), t_glob, gflops);
        fflush(stdout);

        // Print the top-left 10x10 corner of C's local block. The local
        // storage is column-major with leading dimension local_rows(),
        // so element (i,j) sits at [i + j * local_rows()].
        for (int i = 0; i < 10; i++)
        {
            for (int j = 0; j < 10; j++)
                printf("%f ", c->local_data()[i + j * c->local_rows()]);
            printf("\n");
        }
        fflush(stdout);
    }
}
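// gemm_flops is not defined here either. A minimal sketch under the same
// assumed Gflop convention: each of the m*n output elements costs k
// multiplies and k adds, i.e. 2*m*n*k flops in total.
static double gemm_flops(blas_idx_t m, blas_idx_t n, blas_idx_t k)
{
    return 2.0 * static_cast<double>(m) * static_cast<double>(n)
               * static_cast<double>(k) / 1.0e9;
}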
double HDP_MEDOIDS(vector<Instance*>& data, vector< vector<double> >& means,
                   Lookups* tables, vector<double> lambdas, dist_func df, int FIX_DIM)
{
    // STEP ZERO: validate input and initialization
    int N = tables->nWords;
    int D = tables->nDocs;
    vector< pair<int, int> > doc_lookup = *(tables->doc_lookup);
    double lambda_global = lambdas[0];
    double lambda_local  = lambdas[1];

    vector< vector<double> > global_means(1, vector<double>(FIX_DIM, 0.0));
    vector< vector<int> > k(D, vector<int>(1, 0)); // global association
    vector<int> z(N, 0);                           // local assignment
    vector<int> global_asgn(N, 0);                 // global assignment

    // STEP ONE: a. set initial *global* medoid as global mean
    compute_assignment(global_asgn, k, z, tables);
    compute_means(data, global_asgn, FIX_DIM, global_means);

    double last_cost = compute_cost(data, global_means, k, z, lambdas, tables, df, FIX_DIM);
    double new_cost = last_cost;
    while (true) {
        // 4. for each point x_ij,
        for (int j = 0; j < D; j++) {
            for (int i = doc_lookup[j].first; i < doc_lookup[j].second; i++) {
                int num_global_means = global_means.size();
                vector<double> d_ij(num_global_means, 0.0);
                for (int p = 0; p < num_global_means; p++) {
                    Instance* temp_ins = vec2ins(global_means[p]);
                    double euc_dist = df(data[i], temp_ins, FIX_DIM);
                    d_ij[p] = euc_dist * euc_dist;
                    delete temp_ins;
                }
                // add the local penalty to every global cluster that
                // document j is not yet associated with
                set<int> temp;
                for (int p = 0; p < num_global_means; p++) temp.insert(p);
                int num_local_means = k[j].size();
                for (int q = 0; q < num_local_means; q++) temp.erase(k[j][q]);
                set<int>::iterator it;
                for (it = temp.begin(); it != temp.end(); ++it)
                    d_ij[*it] += lambda_local;

                int min_p = -1;
                double min_dij = INF;
                for (int p = 0; p < num_global_means; p++)
                    if (d_ij[p] < min_dij) {
                        min_p = p;
                        min_dij = d_ij[p];
                    }
                if (min_dij > lambda_global + lambda_local) {
                    // open a new global and a new local cluster at x_ij
                    z[i] = num_local_means;
                    k[j].push_back(num_global_means);
                    vector<double> new_g(FIX_DIM, 0.0);
                    for (int f = 0; f < (int) data[i]->fea.size(); f++)
                        new_g[data[i]->fea[f].first - 1] = data[i]->fea[f].second;
                    global_means.push_back(new_g);
                    // cout << "global and local increment" << endl;
                } else {
                    bool c_exist = false;
                    for (int c = 0; c < num_local_means; c++)
                        if (k[j][c] == min_p) {
                            z[i] = c;
                            c_exist = true;
                            break;
                        }
                    if (!c_exist) {
                        z[i] = num_local_means;
                        k[j].push_back(min_p);
                        // cout << "local increment" << endl;
                    }
                }
            }
        }
        /*
        cout << "half..........." << endl;
        cout << "#global created: " << global_means.size()
             << ", #global used: " << get_num_global_means(k);
        */
        new_cost = compute_cost(data, global_means, k, z, lambdas, tables, df, FIX_DIM);

        // 5. for all local clusters,
        for (int j = 0; j < D; j++) {
            int begin_i = doc_lookup[j].first;
            int end_i   = doc_lookup[j].second;
            int num_local_means = k[j].size();

            // all local clusters are distinct from each other
            /*
            set<int> temp;
            for (int y = 0; y < num_local_means; y++) temp.insert(k[j][y]);
            cout << temp.size() << " ==? " << num_local_means << endl;
            assert(temp.size() == num_local_means);
            */

            // compute means of local clusters
            vector< vector<double> > local_means(num_local_means, vector<double>(FIX_DIM, 0.0));
            vector<int> local_asgn(z.begin() + begin_i, z.begin() + end_i);
            vector<Instance*> local_data(data.begin() + begin_i, data.begin() + end_i);
            compute_means(local_data, local_asgn, FIX_DIM, local_means);
            assert(num_local_means == (int) local_means.size());

            // pre-compute instances for global means
            int num_global_means = global_means.size();
            vector<Instance*> temp_global_means(num_global_means, NULL);
            for (int p = 0; p < num_global_means; p++)
                temp_global_means[p] = vec2ins(global_means[p]);
            // pre-compute instances for local means
            vector<Instance*> temp_local_means(num_local_means, NULL);
            for (int c = 0; c < num_local_means; c++)
                temp_local_means[c] = vec2ins(local_means[c]);

            for (int c = 0; c < num_local_means; c++) {
                // compute distance of local cluster c to each global cluster
                num_global_means = global_means.size();
                vector<double> d_jcp(num_global_means, 0.0);
                double sum_d_ijc = 0.0;
                for (int i = doc_lookup[j].first; i < doc_lookup[j].second; i++) {
                    if (z[i] != c) continue;
                    double local_dist = df(data[i], temp_local_means[c], FIX_DIM);
                    sum_d_ijc += local_dist * local_dist;
                    for (int p = 0; p < num_global_means; p++) {
                        double dist = df(data[i], temp_global_means[p], FIX_DIM);
                        d_jcp[p] += dist * dist;
                    }
                }
                int min_p = -1;
                double min_d_jcp = INF;
                for (int p = 0; p < num_global_means; p++)
                    if (d_jcp[p] < min_d_jcp) {
                        min_p = p;
                        min_d_jcp = d_jcp[p];
                    }
                assert(min_p >= 0);
                // cout << min_d_jcp << " " << lambda_global << " " << sum_d_ijc << endl;
                if (min_d_jcp > lambda_global + sum_d_ijc) {
                    global_means.push_back(local_means[c]); // push mu_jc
                    temp_global_means.push_back(vec2ins(local_means[c]));
                    k[j][c] = num_global_means;
                    // cout << "global increment" << endl;
                } else {
                    k[j][c] = min_p;
                }
            }
            for (int c = 0; c < num_local_means; c++) delete temp_local_means[c];
            num_global_means = global_means.size();
            for (int p = 0; p < num_global_means; p++) delete temp_global_means[p];
        }

        // 6. for each global cluster,
        compute_assignment(global_asgn, k, z, tables);
        /*
        cout << "compute global means.." << endl;
        cout << "#global created: " << global_means.size()
             << ", #global used: " << get_num_global_means(k);
        */
        compute_means(data, global_asgn, FIX_DIM, global_means);

        // 7. convergence?
        new_cost = compute_cost(data, global_means, k, z, lambdas, tables, df, FIX_DIM);
        if (new_cost < objmin) objmin = new_cost;
        objmin_trace << omp_get_wtime() - start_time << " " << objmin << endl;
        if (new_cost == last_cost) break;
        if (new_cost < last_cost) {
            last_cost = new_cost;
        } else {
            cerr << "failure" << endl;
            return INF;
        }
    }
    means = global_means;
    return last_cost;
} // entry main function