/** * Function to read the input shape file. */ unsigned int NumericFormFactor::read_shapes_file_dat(const char* filename, real_vec_t &shape_def) { std::ifstream f(filename); if(!f.is_open()) { std::cout << "Cannot open file " << filename << std::endl; return 1; } // if real_t s = 0.0, cx = 0.0, cy = 0.0, cz = 0.0, nx = 0.0, ny = 0.0, nz = 0.0; while(true) { f >> s; if(f.eof() || !f.good()) break; f >> nx; f >> ny; f >> nz; f >> cx; f >> cy; f >> cz; shape_def.push_back(s); shape_def.push_back(nx); shape_def.push_back(ny); shape_def.push_back(nz); shape_def.push_back(cx); shape_def.push_back(cy); shape_def.push_back(cz); } // while f.close(); return shape_def.size() / 7; } // NumericFormFactor::read_shapes_file_dat()
/** * Approximated integration */ unsigned int NumericFormFactorC::compute_approx_triangle( real_vec_t &shape_def, complex_t *& ff, int nqy, real_t * qx, real_t * qy, int nqz, complex_t * qz, RotMatrix_t & rot, real_t &comp_time){ int num_triangles = shape_def.size() / CPU_T_PROP_SIZE_; if (num_triangles < 1) return 0; ff = new (std::nothrow) complex_t[nqz]; if (ff == NULL){ std::cerr << "Memory allocation failed for ff. Requested size: " << nqz << std::endl; return 0; } memset(ff, 0, nqz * sizeof(complex_t)); woo::BoostChronoTimer timer; timer.start(); #pragma omp parallel for for (int i_z = 0; i_z < nqz; i_z++){ int i_y = i_z % nqy; for (int i_t = 0; i_t < num_triangles; i_t++){ int offset = i_t * CPU_T_PROP_SIZE_; real_t s = shape_def[offset]; real_t nx = shape_def[offset + 1]; real_t ny = shape_def[offset + 2]; real_t nz = shape_def[offset + 3]; real_t x = shape_def[offset + 4]; real_t y = shape_def[offset + 5]; real_t z = shape_def[offset + 6]; // rotate q-vector std::vector<complex_t> mq = rot.rotate(qx[i_y], qy[i_y], qz[i_z]); real_t q2 = std::norm(mq[0]) + std::norm(mq[1]) + std::norm(mq[2]); complex_t qn = mq[0] * nx + mq[1] * ny + mq[2] * nz; complex_t qt = mq[0] * x + mq[1] * y + mq[2] * z; complex_t nj = CMPLX_MINUS_ONE_; complex_t np = CMPLX_ONE_; ff[i_z] += (nj * qn * s * std::exp(np * qt) / q2); } } timer.stop(); comp_time = timer.elapsed_msec(); return num_triangles; }
/** * The main host function called from outside, as part of the API for a single node. */ unsigned int NumericFormFactorC::compute_form_factor(int rank, // #ifndef __SSE3__ real_vec_t &shape_def, // #else // real_t* shape_def, unsigned int num_triangles, // #endif complex_t* &ff, real_t* &qx, int nqx, real_t* &qy, int nqy, complex_t* &qz, int nqz, real_t* &rot, real_t& kernel_time, real_t& red_time, real_t& mem_time #ifdef FINDBLOCK , const int block_x, const int block_y, const int block_z, const int block_t #endif ) { double temp_mem_time = 0.0, total_mem_time = 0.0; #ifdef _OPENMP if(rank == 0) std::cout << "++ Number of OpenMP threads: " << omp_get_max_threads() << std::endl; #endif // #ifndef __SSE3__ unsigned int num_triangles = shape_def.size() / CPU_T_PROP_SIZE_; // #endif if(num_triangles < 1) return 0; // #ifdef INTEL_SB_AVX // unsigned int shape_padding = (32 - (num_triangles & 31)) & 31; // #elif defined __SSE3__ // unsigned int shape_padding = (16 - (num_triangles & 15)) & 15; // #endif //#ifndef FF_NUM_CPU_PADDING unsigned long int total_qpoints = nqx * nqy * nqz; unsigned long int host_mem_usage = ((unsigned long int) nqx + nqy) * sizeof(real_t) + nqz * sizeof(complex_t); //#else // padding to 16 bytes //const unsigned int PAD_LINE_ = 16; //unsigned int pad_x = 0; //if(nqx != 1) pad_x = (PAD_LINE_ - (nqx % PAD_LINE_)) % PAD_LINE_; //unsigned int pad_y = (PAD_LINE_ - (nqy % PAD_LINE_)) % PAD_LINE_; //unsigned int pad_z = (PAD_LINE_ - (nqz % PAD_LINE_)) % PAD_LINE_; //unsigned int pnqx = nqx + pad_x, pnqy = nqy + pad_y, pnqz = nqz + pad_z; //unsigned long int total_qpoints = pnqx * pnqy * pnqz; //unsigned long int host_mem_usage = ((unsigned long int) pnqx + pnqy) * sizeof(real_t) + // pnqz * sizeof(complex_t); //#endif // allocate memory for the final FF 3D matrix ff = new (std::nothrow) complex_t[total_qpoints]; // allocate and initialize to 0 memset(ff, 0, total_qpoints * sizeof(complex_t)); if(ff == NULL) { std::cerr << "Memory allocation failed for ff. Size = " << total_qpoints * sizeof(complex_t) << " b" << std::endl; return 0; } // if host_mem_usage += total_qpoints * sizeof(complex_t); //unsigned long int matrix_size = (unsigned long int) nqx * nqy * nqz * num_triangles; // do hyperblocking to use less memory unsigned int b_nqx = 0, b_nqy = 0, b_nqz = 0, b_num_triangles = 0; #ifndef FF_NUM_CPU_AUTOTUNE_HB compute_block_size(nqx, nqy, nqz, num_triangles, b_nqx, b_nqy, b_nqz, b_num_triangles #ifdef FINDBLOCK , block_x, block_y, block_z, block_t #endif ); #else std::cout << "-- Autotuning hyperblock size ... " << std::endl; double min_time_hb = 1000000.0; unsigned int min_b_nqx = 1, min_b_nqy = 1, min_b_nqz = 1, min_b_num_triangles = 1; woo::BoostChronoTimer at_kernel_timer, at_overhead_timer; at_overhead_timer.start(); complex_t* ff_temp; ff_temp = new (std::nothrow) complex_t[nqx * nqy * nqz]; for(int b_nqx_i = 1; b_nqx_i <= nqx; ++ b_nqx_i) { for(int b_nqy_i = 10; b_nqy_i <= nqy; b_nqy_i += 10) { for(int b_nqz_i = 10; b_nqz_i <= nqz; b_nqz_i += 10) { for(int b_nt_i = 10; b_nt_i <= num_triangles; b_nt_i += 10) { at_kernel_timer.start(); // compute the number of sub-blocks, along each of the 4 dimensions unsigned int nb_x = (unsigned int) ceil((float) nqx / b_nqx_i); unsigned int nb_y = (unsigned int) ceil((float) nqy / b_nqy_i); unsigned int nb_z = (unsigned int) ceil((float) nqz / b_nqz_i); unsigned int nb_t = (unsigned int) ceil((float) num_triangles / b_nt_i); unsigned int num_blocks = nb_x * nb_y * nb_z * nb_t; form_factor_kernel_fused_nqx1(qx, qy, qz, shape_def, b_nqx_i, b_nqy_i, b_nqz_i, b_nt_i, b_nqx_i, b_nqy_i, b_nqz_i, b_nt_i, nqx, nqy, nqz, num_triangles, 0, 0, 0, 0, rot, ff); at_kernel_timer.stop(); double curr_time = at_kernel_timer.elapsed_msec(); double tot_time = curr_time * num_blocks; std::cout << "## " << b_nqx_i << " x " << b_nqy_i << " x " << b_nqz_i << " x " << b_nt_i << "\t" << num_blocks << "\t:\t" << curr_time << "\t" << tot_time << std::endl; if(tot_time < min_time_hb) { min_time_hb = tot_time; min_b_nqx = b_nqx_i; min_b_nqy = b_nqy_i; min_b_nqz = b_nqz_i; min_b_num_triangles = b_nt_i; } // if } // for } // for } // for } // for delete[] ff_temp; at_overhead_timer.stop(); b_nqx = min_b_nqx; b_nqy = min_b_nqy; b_nqz = min_b_nqz; b_num_triangles = min_b_num_triangles; if(rank == 0) { std::cout << "## HBlock Autotuner overhead: " << at_overhead_timer.elapsed_msec() << " ms." << std::endl; } // if #endif unsigned long int blocked_3d_matrix_size = (unsigned long int) b_nqx * b_nqy * b_nqz; //size_t estimated_host_mem_need = host_mem_usage + blocked_matrix_size * sizeof(complex_t); //if(rank == 0) { // std::cout << "++ Estimated host memory need: " << (float) estimated_host_mem_need / 1024 / 1024 // << " MB" << std::endl; //} // if #ifndef FF_NUM_CPU_FUSED unsigned long int blocked_matrix_size = (unsigned long int) blocked_3d_matrix_size * b_num_triangles; host_mem_usage += blocked_matrix_size * sizeof(complex_t); complex_t *fq_buffer = new (std::nothrow) complex_t[blocked_matrix_size](); if(fq_buffer == NULL) { std::cerr << "Memory allocation failed for fq_buffer. blocked_matrix_size = " << blocked_matrix_size << std::endl << "Host memory usage = " << (float) host_mem_usage / 1024 / 1024 << " MB" << std::endl; delete[] ff; return 0; } // if #endif if(rank == 0) { std::cout << "++ Host memory usage: " << (float) host_mem_usage / 1024 / 1024 << " MB" << std::endl << std::flush; } // if // compute the number of sub-blocks, along each of the 4 dimensions // formulate loops over each dimension, to go over each sub block unsigned int nb_x = (unsigned int) ceil((float) nqx / b_nqx); unsigned int nb_y = (unsigned int) ceil((float) nqy / b_nqy); unsigned int nb_z = (unsigned int) ceil((float) nqz / b_nqz); unsigned int nb_t = (unsigned int) ceil((float) num_triangles / b_num_triangles); unsigned int curr_b_nqx = b_nqx, curr_b_nqy = b_nqy, curr_b_nqz = b_nqz; unsigned int curr_b_num_triangles = b_num_triangles; unsigned int num_blocks = nb_x * nb_y * nb_z * nb_t; #ifdef TIME_DETAIL_2 if(rank == 0) { std::cout << "++ Hyperblock size: " << b_nqx << " x " << b_nqy << " x " << b_nqz << " x " << b_num_triangles << std::endl; std::cout << "++ Number of decomposed Hblocks: " << num_blocks << " [" << nb_x << " x " << nb_y << " x " << nb_z << " x " << nb_t << "]" << std::endl; } // if #endif // TIME_DETAIL_2 unsigned int block_num = 0; #ifdef PROFILE_PAPI long long int papi_total_cycles = 0, papi_total_inst = 0, papi_total_flop = 0; double overall_ipc = 0.0; #endif if(rank == 0) std::cout << "-- Computing form factor on CPU ... " << std::flush; woo::BoostChronoTimer kernel_timer; kernel_timer.start(); // compute for each hyperblock curr_b_nqx = b_nqx; for(unsigned int ib_x = 0; ib_x < nb_x; ++ ib_x) { if(ib_x == nb_x - 1) curr_b_nqx = nqx - b_nqx * ib_x; curr_b_nqy = b_nqy; for(unsigned int ib_y = 0; ib_y < nb_y; ++ ib_y) { if(ib_y == nb_y - 1) curr_b_nqy = nqy - b_nqy * ib_y; curr_b_nqz = b_nqz; for(unsigned int ib_z = 0; ib_z < nb_z; ++ ib_z) { if(ib_z == nb_z - 1) curr_b_nqz = nqz - b_nqz * ib_z; curr_b_num_triangles = b_num_triangles; for(unsigned int ib_t = 0; ib_t < nb_t; ++ ib_t) { if(ib_t == nb_t - 1) curr_b_num_triangles = num_triangles - b_num_triangles * ib_t; #ifdef PROFILE_PAPI // PAPI_L1_DCM 0x80000000 No Level 1 data cache misses // PAPI_L1_ICM 0x80000001 No Level 1 instruction cache misses // PAPI_L2_DCM 0x80000002 No Level 2 data cache misses // PAPI_L2_ICM 0x80000003 No Level 2 instruction cache misses // PAPI_L1_TCM 0x80000006 Yes Level 1 cache misses // PAPI_L2_TCM 0x80000007 No Level 2 cache misses // PAPI_FPU_IDL 0x80000012 No Cycles floating point units are idle // PAPI_TLB_DM 0x80000014 No Data translation lookaside buffer misses // PAPI_TLB_IM 0x80000015 No Instruction translation lookaside buffer misses // PAPI_TLB_TL 0x80000016 Yes Total translation lookaside buffer misses // PAPI_STL_ICY 0x80000025 No Cycles with no instruction issue // PAPI_HW_INT 0x80000029 No Hardware interrupts // PAPI_BR_TKN 0x8000002c No Conditional branch instructions taken // PAPI_BR_MSP 0x8000002e No Conditional branch instructions mispredicted // PAPI_TOT_INS 0x80000032 No Instructions completed // PAPI_FP_INS 0x80000034 No Floating point instructions // PAPI_BR_INS 0x80000037 No Branch instructions // PAPI_VEC_INS 0x80000038 No Vector/SIMD instructions (could include integer) // PAPI_RES_STL 0x80000039 No Cycles stalled on any resource // PAPI_TOT_CYC 0x8000003b No Total cycles // PAPI_L1_DCH 0x8000003e Yes Level 1 data cache hits // PAPI_L2_DCH 0x8000003f Yes Level 2 data cache hits // PAPI_L1_DCA 0x80000040 No Level 1 data cache accesses // PAPI_L2_DCA 0x80000041 No Level 2 data cache accesses // PAPI_L1_ICH 0x80000049 Yes Level 1 instruction cache hits // PAPI_L2_ICH 0x8000004a No Level 2 instruction cache hits // PAPI_L1_ICA 0x8000004c No Level 1 instruction cache accesses // PAPI_L2_ICA 0x8000004d No Level 2 instruction cache accesses // PAPI_L1_ICR 0x8000004f No Level 1 instruction cache reads // PAPI_L1_TCH 0x80000055 Yes Level 1 total cache hits // PAPI_L2_TCH 0x80000056 Yes Level 2 total cache hits // PAPI_L1_TCA 0x80000058 Yes Level 1 total cache accesses // PAPI_L2_TCA 0x80000059 No Level 2 total cache accesses // PAPI_FML_INS 0x80000061 No Floating point multiply instructions // PAPI_FAD_INS 0x80000062 No Floating point add instructions // (Also includes subtract instructions) // PAPI_FDV_INS 0x80000063 No Floating point divide instructions // (Counts both divide and square root instructions) // PAPI_FSQ_INS 0x80000064 No Floating point square root instructions // (Counts both divide and square root instructions) // PAPI_FP_OPS 0x80000066 No Floating point operations // PAPI_SP_OPS 0x80000067 No Floating point operations; optimized to count // scaled single precision vector operations // PAPI_DP_OPS 0x80000068 No Floating point operations; optimized to count // scaled double precision vector operations int papi_events[3] = { PAPI_TOT_CYC, PAPI_TOT_INS, PAPI_FP_OPS }; //int papi_events[3] = { PAPI_FML_INS, PAPI_FAD_INS, PAPI_FDV_INS }; //int papi_events[3] = { PAPI_FP_OPS, PAPI_SP_OPS, PAPI_DP_OPS }; long long papi_counter_values[3]; PAPI_start_counters(papi_events, 3); #endif // call the main kernel #ifndef FF_NUM_CPU_FUSED // DO NOT USE THIS form_factor_kernel(qx, qy, qz, shape_def, curr_b_nqx, curr_b_nqy, curr_b_nqz, curr_b_num_triangles, b_nqx, b_nqy, b_nqz, b_num_triangles, ib_x, ib_y, ib_z, ib_t, fq_buffer); #else if(nqx == 1) { form_factor_kernel_fused_nqx1(qx, qy, qz, shape_def, //form_factor_kernel_fused_nqx1_unroll4(qx, qy, qz, shape_def, curr_b_nqx, curr_b_nqy, curr_b_nqz, curr_b_num_triangles, b_nqx, b_nqy, b_nqz, b_num_triangles, nqx, nqy, nqz, num_triangles, ib_x, ib_y, ib_z, ib_t, rot, ff); } else { // #ifdef __SSE3__ // if(rank == 0) // std::cout << "uh-oh: no SSE3 version!" << std::endl; // #else form_factor_kernel_fused_unroll4(qx, qy, qz, shape_def, curr_b_nqx, curr_b_nqy, curr_b_nqz, curr_b_num_triangles, b_nqx, b_nqy, b_nqz, b_num_triangles, nqx, nqy, nqz, num_triangles, ib_x, ib_y, ib_z, ib_t, rot, ff); // #endif // __SSE3__ } // if-else #endif #ifndef FF_NUM_CPU_FUSED // DO NOT USE THIS // call the reduction kernel reduction_kernel(curr_b_nqx, curr_b_nqy, curr_b_nqz, curr_b_num_triangles, blocked_matrix_size, b_nqx, b_nqy, b_nqz, num_triangles, nqx, nqy, nqz, ib_x, ib_y, ib_z, ib_t, fq_buffer, ff); #endif #ifdef PROFILE_PAPI PAPI_stop_counters(papi_counter_values, 3); papi_total_cycles += papi_counter_values[0]; papi_total_inst += papi_counter_values[1]; papi_total_flop += papi_counter_values[2]; #endif } // for ib_t } // for ib_z } // for ib_y } // for ib_x kernel_timer.stop(); kernel_time = kernel_timer.elapsed_msec(); #ifndef FF_NUM_CPU_FUSED delete[] fq_buffer; #endif if(rank == 0) std::cout << "done." << std::endl; #ifdef PROFILE_PAPI if(rank == 0) { std::cout << "++ PAPI_TOT_CYC: " << papi_total_cycles << std::endl; std::cout << "++ PAPI_TOT_INS: " << papi_total_inst << std::endl; std::cout << "++ PAPI_FP_OPS: " << papi_total_flop << std::endl; std::cout << "++ IPC: " << (double) papi_total_inst / papi_total_cycles << std::endl; } // if #endif return num_triangles; } // NumericFormFactorC::compute_form_factor()
void NumericFormFactor::find_axes_orientation(real_vec_t &shape_def, std::vector<short int> &axes) { real_t min_a = shape_def[4], max_a = shape_def[4]; real_t min_b = shape_def[5], max_b = shape_def[5]; real_t min_c = shape_def[6], max_c = shape_def[6]; for(unsigned int i = 0; i + 6 < shape_def.size(); i += 7) { min_a = (min_a > shape_def[i + 4]) ? shape_def[i + 4] : min_a ; max_a = (max_a < shape_def[i + 4]) ? shape_def[i + 4] : max_a ; min_b = (min_b > shape_def[i + 5]) ? shape_def[i + 5] : min_b ; max_b = (max_b < shape_def[i + 5]) ? shape_def[i + 5] : max_b ; min_c = (min_c > shape_def[i + 6]) ? shape_def[i + 6] : min_c ; max_c = (max_c < shape_def[i + 6]) ? shape_def[i + 6] : max_c ; } // for real_t diff_a = max_a - min_a; real_t diff_b = max_b - min_b; real_t diff_c = max_c - min_c; // axes[i] = j // i: x=0 y=1 z=2 // j: 0=a 1=b 2=c //std::cout << "++ diff_a = " << diff_a << ", diff_b = " << diff_b // << ", diff_c = " << diff_c << std::endl; real_vec_t min_point, max_point; // the smallest one is x, other two are y and z if(diff_a < diff_b) { if(diff_a < diff_c) { // x is a axes[0] = 0; axes[1] = 1; axes[2] = 2; min_point.push_back(min_a); min_point.push_back(min_b); min_point.push_back(min_c); max_point.push_back(max_a); max_point.push_back(max_b); max_point.push_back(max_c); } else { // x is c axes[0] = 2; axes[1] = 0; axes[2] = 1; min_point.push_back(min_c); min_point.push_back(min_a); min_point.push_back(min_b); max_point.push_back(max_c); max_point.push_back(max_a); max_point.push_back(max_b); } // if-else } else { if(diff_b < diff_c) { // x is b axes[0] = 1; axes[1] = 0; axes[2] = 2; min_point.push_back(min_b); min_point.push_back(min_a); min_point.push_back(min_c); max_point.push_back(max_b); max_point.push_back(max_a); max_point.push_back(max_c); } else { // x is c axes[0] = 2; axes[1] = 0; axes[2] = 1; min_point.push_back(min_c); min_point.push_back(min_a); min_point.push_back(min_b); max_point.push_back(max_c); max_point.push_back(max_a); max_point.push_back(max_b); } // if-else } // if-else std::cout << "++ Shape min point: " << min_point[0] << ", " << min_point[1] << ", " << min_point[2] << std::endl; std::cout << "++ Shape max point: " << max_point[0] << ", " << max_point[1] << ", " << max_point[2] << std::endl; std::cout << "++ Shape dimensions: " << fabs(max_point[0] - min_point[0]) << " x " << fabs(max_point[1] - min_point[1]) << " x " << fabs(max_point[2] - min_point[2]) << std::endl; } // NumericFormFactor::find_axes_orientation()