bool mat_sqrt(const complex_vec_t& matrix, complex_vec_t& result) { result.clear(); for(complex_vec_t::const_iterator i = matrix.begin(); i != matrix.end(); ++ i) { result.push_back(sqrt(*i)); } // for return true; } // mat_sqrt()
bool mat_mul(complex_t scalar, const complex_vec_t& matrix, complex_vec_t& result) { result.clear(); for(complex_vec_t::const_iterator i = matrix.begin(); i != matrix.end(); ++ i) { result.push_back((*i) * scalar); } // for return true; } // mat_mul()
bool AnalyticFormFactor::mat_sinc(unsigned int x_size, unsigned int y_size, unsigned int z_size, const complex_vec_t& matrix, complex_vec_t& result) { result.clear(); for(std::vector<complex_t>::const_iterator i = matrix.begin(); i != matrix.end(); ++ i) { result.push_back(sinc(*i)); } // for return true; } // AnalyticFormFactor::mat_sinc()
bool AnalyticFormFactor::mat_fq_inv(unsigned int x_size, unsigned int y_size, unsigned int z_size, const complex_vec_t& matrix, real_t y, complex_vec_t& result) { result.clear(); for(complex_vec_t::const_iterator i = matrix.begin(); i != matrix.end(); ++ i) { result.push_back(fq_inv(*i, y)); } // for return true; } // AnalyticFormFactor::mat_fq_inv()
/** * computes element-by-element division of two matrices (matrix1 / matrix2) into result */ bool mat_dot_div(unsigned int nx1, unsigned int ny1, unsigned int nz1, const complex_vec_t& matrix1, unsigned int nx2, unsigned int ny2, unsigned int nz2, const complex_vec_t& matrix2, complex_vec_t& result) { if(nx1 != nx2 || ny1 != ny2 || nz1 != nz2 || matrix1.size() != matrix2.size()) { std::cerr << "error: matrix sizes are not the same for dot division operation" << std::endl; return false; } // if result.clear(); complex_vec_t::const_iterator i1 = matrix1.begin(); complex_vec_t::const_iterator i2 = matrix2.begin(); for(; i1 != matrix1.end(); ++ i1, ++ i2) { result.push_back((*i1) / (*i2)); } // for return true; } // mat_dot_div()
/** * computes element-by-element product of two matrices into result */ bool mat_dot_prod(unsigned int x1_size, unsigned int y1_size, unsigned int z1_size, const complex_vec_t& matrix1, unsigned int x2_size, unsigned int y2_size, unsigned int z2_size, const complex_vec_t& matrix2, complex_vec_t& result) { if(x1_size != x2_size || y1_size != y2_size || z1_size != z2_size || matrix1.size() != matrix2.size()) { std::cerr << "error: matrix sizes are not the same for dot product operation" << std::endl; return false; } // if result.clear(); complex_vec_t::const_iterator i1 = matrix1.begin(); complex_vec_t::const_iterator i2 = matrix2.begin(); for(; i1 != matrix1.end(); ++ i1, ++ i2) { result.push_back((*i1) * (*i2)); } // for return true; } // mat_dot_prod()
/** * Function to gather partial FF arrays from all processes to construct the final FF. * This is a bottleneck for large num procs ... */ bool NumericFormFactor::construct_ff(int p_nqx, int p_nqy, int p_nqz, int nqx, int nqy, int nqz, int p_y, int p_z, #ifdef FF_NUM_GPU cucomplex_t* p_ff, #else complex_t* p_ff, #endif complex_vec_t& ff, #ifdef USE_MPI woo::MultiNode& world_comm, std::string comm_key, #endif real_t& mem_time, real_t& comm_time) { real_t mem_start = 0, mem_end = 0, comm_start = 0, comm_end = 0; woo::BoostChronoTimer memtimer, commtimer; mem_time = 0; comm_time = 0; #ifdef USE_MPI bool master = world_comm.is_master(comm_key); int size = world_comm.size(comm_key); int rank = world_comm.rank(comm_key); #else bool master = true; int size = 1; int rank = 0; #endif memtimer.start(); int local_qpoints = p_nqx * p_nqy * p_nqz; unsigned long int total_qpoints = nqx * nqy * nqz; // process 0 creates the main ff, and collects computed p_ff from all others (just use gather) ff.clear(); #ifdef FF_NUM_GPU cucomplex_t* all_ff = NULL; // TODO: improve this ... #else complex_t* all_ff = NULL; #endif if(master) { ff.reserve(total_qpoints); ff.assign(total_qpoints, complex_t(0.0, 0.0)); #ifdef FF_NUM_GPU all_ff = new (std::nothrow) cucomplex_t[total_qpoints]; #else all_ff = new (std::nothrow) complex_t[total_qpoints]; #endif } // if mem_time += memtimer.elapsed_msec(); int *recv_p_nqy = new (std::nothrow) int[p_y](); recv_p_nqy[0] = p_nqy; int *off_p_nqy = new (std::nothrow) int[p_y](); off_p_nqy[0] = 0; #ifdef FF_NUM_GPU cucomplex_t *ff_buffer = new (std::nothrow) cucomplex_t[total_qpoints]; #else complex_t *ff_buffer = new (std::nothrow) complex_t[total_qpoints]; #endif if(ff_buffer == NULL) { std::cerr << "error: failed to allocate memory for ff buffer" << std::endl; return false; } // if #ifdef USE_MPI // construct stuff for gatherv int *recv_counts = new (std::nothrow) int[size](); int *displs = new (std::nothrow) int[size](); commtimer.start(); //comm.Allgather(&local_qpoints, 1, MPI::INT, recv_counts, 1, MPI::INT); world_comm.allgather(comm_key, &local_qpoints, 1, recv_counts, 1); commtimer.stop(); comm_time += commtimer.elapsed_msec(); memtimer.start(); displs[0] = 0; for(int i = 1; i < size; ++ i) { displs[i] = displs[i - 1] + recv_counts[i - 1]; } // for complex_t *cast_p_ff, *cast_ff; #ifdef FF_NUM_GPU cast_p_ff = reinterpret_cast<complex_t*>(p_ff); cast_ff = reinterpret_cast<complex_t*>(ff_buffer); #else cast_p_ff = p_ff; cast_ff = ff_buffer; #endif memtimer.stop(); mem_time += memtimer.elapsed_msec(); commtimer.start(); world_comm.gatherv(comm_key, cast_p_ff, local_qpoints, cast_ff, recv_counts, displs); world_comm.gather("ff_num_col_comm", &p_nqy, 1, recv_p_nqy, 1); commtimer.stop(); comm_time += commtimer.elapsed_msec(); for(int i = 1; i < p_y; ++ i) off_p_nqy[i] = off_p_nqy[i - 1] + recv_p_nqy[i - 1]; #else #ifdef FF_NUM_GPU memcpy(ff_buffer, p_ff, total_qpoints * sizeof(cucomplex_t)); #else memcpy(ff_buffer, p_ff, total_qpoints * sizeof(complex_t)); #endif #endif // USE_MPI memtimer.start(); // move all the data to correct places if(rank == 0) { unsigned long int ff_index = 0; for(int i_nqz = 0; i_nqz < nqz; ++ i_nqz) { for(int i_py = 0; i_py < p_y; ++ i_py) { unsigned long int ffb_index = nqx * (i_nqz * recv_p_nqy[i_py] + nqz * off_p_nqy[i_py]); #ifdef FF_NUM_GPU memcpy(&all_ff[ff_index], &ff_buffer[ffb_index], nqx * recv_p_nqy[i_py] * sizeof(cucomplex_t)); #else memcpy(&all_ff[ff_index], &ff_buffer[ffb_index], nqx * recv_p_nqy[i_py] * sizeof(complex_t)); #endif ff_index += nqx * recv_p_nqy[i_py]; } // for i_py } // for i_nqz // put into the final ff buffer #ifdef FF_NUM_GPU ff.assign(reinterpret_cast<complex_t*>(all_ff), reinterpret_cast<complex_t*>(all_ff + total_qpoints)); #else ff.assign(all_ff, all_ff + total_qpoints); #endif } // if delete[] ff_buffer; #ifdef USE_MPI delete[] displs; delete[] recv_counts; #endif delete[] off_p_nqy; delete[] recv_p_nqy; delete[] all_ff; memtimer.stop(); mem_time += memtimer.elapsed_msec(); return true; } // NumericFormFactor::construct_ff()
/** * main host function */ bool NumericFormFactor::compute(const char* filename, complex_vec_t& ff, vector3_t& rot1, vector3_t& rot2, vector3_t& rot3 #ifdef USE_MPI , woo::MultiNode& world_comm, std::string comm_key #endif ) { real_t comp_start = 0.0, comp_end = 0.0, comm_start = 0.0, comm_end = 0.0; real_t mem_start = 0.0, mem_end = 0.0; real_t comp_time = 0.0, comm_time = 0.0, mem_time = 0.0, kernel_time = 0.0, red_time = 0.0; real_t total_start = 0.0, total_end = 0.0, total_time = 0.0; woo::BoostChronoTimer maintimer, computetimer; woo::BoostChronoTimer commtimer, memtimer; unsigned int nqx = QGrid::instance().nqx(); unsigned int nqy = QGrid::instance().nqy(); unsigned int nqz = QGrid::instance().nqz_extended(); #ifdef USE_MPI bool master = world_comm.is_master(comm_key); commtimer.start(); world_comm.barrier(comm_key); commtimer.stop(); comm_time += commtimer.elapsed_msec(); #else bool master = true; #endif // warning: all procs read the shape file!!!! // TODO: improve to parallel IO, or one proc reading and sending to all ... // #ifndef __SSE3__ real_vec_t shape_def; // #else // #ifdef USE_GPU // real_vec_t shape_def; // #else // real_t* shape_def = NULL; // #endif // #endif // use the new file reader instead ... unsigned int num_triangles = read_shapes_file(filename, shape_def); // TODO ... <--- sadly all procs read this! IMPROVE!!! // TODO: temporary ... remove ... std::vector<short int> axes(4); // axes[i] = j // i: x=0 y=1 z=2 // j: 0=a 1=b 2=c #ifndef AXIS_ROT axes[0] = 0; axes[1] = 1; axes[2] = 2; // default values #else find_axes_orientation(shape_def, axes); #endif #ifdef USE_MPI int num_procs = world_comm.size(comm_key); int rank = world_comm.rank(comm_key); #endif if(master) { std::cout << "-- Numerical form factor computation ..." << std::endl << "** Using input shape file: " << filename << std::endl << "** Number of input triangles: " << num_triangles << std::endl << "** Q-grid resolution (q-points): " << nqx * nqy * nqz << std::endl << "** NQX x NQY x NQZ: " << nqx << " x " << nqy << " x " << nqz << std::endl #ifdef USE_MPI << "** Number of processes requested: " << num_procs << std::endl #endif << std::flush; } // if if(num_triangles < 1) { std::cerr << "error: no triangles found in specified definition file" << std::endl; return false; } // if #ifdef USE_MPI // decompose along y and z directions into blocks int p_y = std::floor(sqrt((real_t) num_procs)); // some procs may be idle ... int p_z = num_procs / p_y; int p_nqx = nqx; int p_nqy = nqy / p_y + (((rank / p_z) < (int)nqy % p_y) ? 1 : 0); int p_nqz = nqz / p_z + (((rank % p_z) < (int)nqz % p_z) ? 1 : 0); commtimer.start(); int idle = 0; if(world_comm.rank(comm_key) >= p_y * p_z) idle = 1; std::string real_world("ff_num_real_world"); world_comm.split(real_world, comm_key, idle); commtimer.stop(); comm_time += commtimer.elapsed_msec(); #else int p_y = 1, p_z = 1; int p_nqx = nqx; int p_nqy = nqy; int p_nqz = nqz; #endif // USE_MPI #ifdef FINDBLOCK int block_x = 0, block_y = 0, block_z = 0, block_t = 0; int block_x_max = 0, block_y_max = 0, block_z_max = 0, block_t_max = 0; block_x_max = (nqx < 400) ? nqx : 400; block_y_max = (nqy < 400) ? nqy : 400; block_z_max = (nqz < 400) ? nqz : 400; block_t_max = (num_triangles < 2500) ? num_triangles : 2500; block_t = block_t_max; for(block_t = block_t_max; block_t > std::min(99, block_t_max - 1); block_t -= 100) { for(block_x = block_x_max; block_x > std::min(3, block_x_max - 1); block_x -= 2) { for(block_y = block_y_max; block_y > std::min(3, block_y_max - 1); block_y -= 2) { for(block_z = block_z_max; block_z > std::min(3, block_z_max - 1); block_z -= 2) { #endif maintimer.start(); #ifdef USE_MPI if(world_comm.rank(comm_key) < p_y * p_z) { // only the non-idle processors bool master = world_comm.is_master(real_world); if(master) { std::cout << "++ Number of MPI processes used: " << world_comm.size(real_world) << std::endl << "++ MPI grid size: 1 x " << p_y << " x " << p_z << std::endl << std::flush; } // if commtimer.start(); int rank = world_comm.rank(real_world); int size = world_comm.size(real_world); // create row-wise and column-wise communicators int row = rank / p_z, col = rank % p_z; world_comm.split("ff_num_row_comm", real_world, row); world_comm.split("ff_num_col_comm", real_world, col); // perform MPI scan operation to compute y_offset and z_offset unsigned int y_offset = 0, z_offset = 0; world_comm.scan_sum("ff_num_col_comm", p_nqy, y_offset); world_comm.scan_sum("ff_num_row_comm", p_nqz, z_offset); commtimer.stop(); comm_time += commtimer.elapsed_msec(); y_offset -= p_nqy; z_offset -= p_nqz; #else master = true; unsigned int y_offset = 0, z_offset = 0; int rank = 0; int size = 1; #endif // USE_MPI memtimer.start(); // FIXME: this is a yucky temporary fix ... fix properly ... real_t* qx = new (std::nothrow) real_t[nqx](); real_t* qy = new (std::nothrow) real_t[nqy](); #ifdef FF_NUM_GPU cucomplex_t* qz = new (std::nothrow) cucomplex_t[nqz](); #else complex_t* qz = new (std::nothrow) complex_t[nqz](); #endif // create qy_and qz using qgrid instance for(unsigned int i = 0; i < nqx; ++ i) { qx[i] = QGrid::instance().qx(i); } // for for(unsigned int i = 0; i < nqy; ++ i) { qy[i] = QGrid::instance().qy(i); } // for for(unsigned int i = 0; i < nqz; ++ i) { #ifdef FF_NUM_GPU qz[i].x = QGrid::instance().qz_extended(i).real(); qz[i].y = QGrid::instance().qz_extended(i).imag(); #else qz[i] = QGrid::instance().qz_extended(i); #endif } // for #ifdef USE_MPI // create p_ff buffers <----- TODO: IMPROVE for all procs!!! real_t *p_qy = NULL; p_qy = new (std::nothrow) real_t[p_nqy](); if(p_qy == NULL) { return 0; } memcpy(p_qy, (void*) (qy + y_offset), p_nqy * sizeof(real_t)); #ifdef FF_NUM_GPU cucomplex_t *p_qz = NULL; p_qz = new (std::nothrow) cucomplex_t[p_nqz](); if(p_qz == NULL) { delete[] p_qy; return 0; } memcpy(p_qz, (void*) (qz + z_offset), p_nqz * sizeof(cucomplex_t)); #else // TODO: avoid the following ... complex_t *p_qz = NULL; p_qz = new (std::nothrow) complex_t[p_nqz](); if(p_qz == NULL) { delete[] p_qy; return 0; } memcpy(p_qz, (void*) (qz + z_offset), p_nqz * sizeof(complex_t)); #endif // FF_NUM_GPU #else // no MPI real_t *p_qy = qy; #ifdef FF_NUM_GPU cucomplex_t *p_qz = qz; #else complex_t *p_qz = qz; #endif // FF_NUM_GPU #endif // USE_MPI memtimer.stop(); mem_time += memtimer.elapsed_msec(); // compute local #ifdef FF_NUM_GPU cucomplex_t *p_ff = NULL; #else complex_t *p_ff = NULL; #endif computetimer.reset(); computetimer.start(); unsigned int ret_numtriangles = 0; real_t temp_mem_time = 0.0; #ifdef FF_NUM_GPU // use GPU #ifdef FF_NUM_GPU_FUSED ret_numtriangles = gff_.compute_form_factor_kb_fused(rank, shape_def, axes, p_ff, qx, p_nqx, p_qy, p_nqy, p_qz, p_nqz, 3, rot_, kernel_time, red_time, temp_mem_time #ifdef FINDBLOCK , block_x, block_y, block_z, block_t #endif ); #else ret_numtriangles = gff_.compute_form_factor_db(rank, shape_def, axes, p_ff, qx, p_nqx, p_qy, p_nqy, p_qz, p_nqz, rot_, kernel_time, red_time, temp_mem_time #ifdef FINDBLOCK , block_x, block_y, block_z, block_t #endif ); #endif /* #elif defined USE_MIC // use MIC #ifndef FF_NUM_MIC_KB ret_numtriangles = mff_.compute_form_factor_db(rank, shape_def, p_ff, qx, p_nqx, p_qy, p_nqy, p_qz, p_nqz, rot_, kernel_time, red_time, temp_mem_time #ifdef FINDBLOCK , block_x, block_y, block_z, block_t #endif ); #else ret_numtriangles = mff_.compute_form_factor_kb(rank, shape_def, num_triangles, p_ff, qx, p_nqx, p_qy, p_nqy, p_qz, p_nqz, 3, rot_, kernel_time, red_time, temp_mem_time #ifdef FINDBLOCK , block_x, block_y, block_z, block_t #endif ); #endif */ #else // use only CPU ret_numtriangles = cff_.compute_form_factor(rank, shape_def, // #ifdef __SSE3__ // num_triangles, // #endif p_ff, qx, p_nqx, p_qy, p_nqy, p_qz, p_nqz, rot_, kernel_time, red_time, temp_mem_time #ifdef FINDBLOCK , block_x, block_y, block_z, block_t #endif ); #endif computetimer.stop(); comp_time += computetimer.elapsed_msec(); mem_time += (temp_mem_time / 1000); // gather everything on proc 0 if(ret_numtriangles > 0) { real_t temp_mem_time = 0.0, temp_comm_time = 0.0; construct_ff(p_nqx, p_nqy, p_nqz, nqx, nqy, nqz, p_y, p_z, p_ff, ff, #ifdef USE_MPI world_comm, real_world, #endif temp_mem_time, temp_comm_time); mem_time += temp_mem_time; comm_time += temp_comm_time; } // if /*if(rank == 0) { write_slice_to_file(ff, nqx, nqy, nqz, filename, 0, 0); // x = 0, y = 1, z = 2 // only slice along x implemented for now } // if*/ #ifdef USE_MPI world_comm.barrier(real_world); #endif memtimer.start(); #ifdef FINDBLOCK ff.clear(); #endif if(p_ff != NULL) delete[] p_ff; #ifdef USE_MPI delete[] p_qz; delete[] p_qy; #endif delete[] qz; delete[] qy; delete[] qx; memtimer.stop(); maintimer.stop(); total_time = maintimer.elapsed_msec(); mem_time += memtimer.elapsed_msec(); if(master) { #ifdef TIME_DETAIL_1 std::cout << "** FF kernel time: " << kernel_time << " ms." << std::endl << "** FF compute time: " << computetimer.elapsed_msec() << " ms." << std::endl << "** FF memory and IO time: " << mem_time * 1000 << " ms." << std::endl << "** Communication time: " << comm_time * 1000 << " ms." << std::endl << "** Total FF time: " << maintimer.elapsed_msec() << " ms." << std::endl << std::flush; #endif // TIME_DETAIL_1 double mflop = 0.0; real_t gflops = 0.0; #ifdef USE_GPU // flop count for GPU //mflop = (double) nqx * nqy * nqz * (42 * num_triangles + 2) / 1000000; mflop = (double) nqx * nqy * nqz * (69 * num_triangles + 52) / 1000000; #elif defined USE_MIC // flop count for MIC //mflop = (double) nqx * nqy * nqz * (78 * num_triangles + 18) / 1000000; mflop = (double) nqx * nqy * nqz * (111 * num_triangles + 50) / 1000000; #elif defined INTEL_SB_AVX // flop count for Sandy Bridge with AVX // TODO: recount flops ... mflop = (double) nqx * nqy * nqz * (85 * num_triangles + 16) / 1000000; #else // flop count for SSE3 CPU (hopper) // TODO: recount flops ... mflop = (double) nqx * nqy * nqz * (68 * num_triangles + 20) / 1000000; #endif //gflops = nidle_num_procs * mflop / kernel_time; gflops = mflop / kernel_time; std::cout << "** Kernel performance: " << gflops << " GFLOPS/s" << std::endl; } // if #ifdef USE_MPI world_comm.free("ff_num_row_comm"); world_comm.free("ff_num_col_comm"); } // if #endif #ifdef USE_MPI world_comm.barrier(comm_key); world_comm.free(real_world); #endif #ifdef FINDBLOCK } // block_t } // block_z } // block_y } // block_x #endif return true; } // NumericFormFactor::compute()
bool mat_exp(complex_vec_t& matrix, complex_vec_t& result) { result.clear(); for(complex_vec_t::iterator i = matrix.begin(); i != matrix.end(); ++ i) result.push_back(exp(*i)); } // mat_exp()