int main( int argc, char* argv[] )
{
  int iterations = 1;
  if ( argc > 1 )
    iterations = atoi( argv[1] );

  int size = 10;
  uint dest[size*size];
  int i;
  for ( i = 0; i < size*size; i++ )
    dest[i] = 0;

  int temp = 0;
  for ( i = 0; i < iterations; i++ ) {
    test_stats_on( temp );
    masked_filter_scalar( dest, mask, src, size, size, g_coeff );
    test_stats_off( temp );
  }

  verify_results( dest, ref, size );
  return 0;
}
microseconds MatrixMul::blitz(const Args & args, std::mt19937 & gen) { std::cout << "Test: blitz++ "; const MatrixMulArgs & cur_args = dynamic_cast<const MatrixMulArgs&>(args); uint32_t m, k, l; get_matrix_sizes(cur_args, m, k, l); blitz::Array<double, 2> C(m, l), B(k, l), A(m, k); initialize_matrices(A.begin(), A.end(), B.begin(), B.end(), gen, cur_args); auto start = std::chrono::high_resolution_clock::now(); blitz::firstIndex i; blitz::secondIndex j; blitz::thirdIndex n; C = blitz::sum(A(i,n) * B(n,j), n); auto end = std::chrono::high_resolution_clock::now(); if( args.test ) { verify_results(C.begin(), C.end()); } auto time = std::chrono::duration_cast<std::chrono::microseconds>( end - start); std::cout << time.count() << std::endl; return time; }
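Here firstIndex, secondIndex, and thirdIndex are Blitz++ index placeholders: blitz::sum(A(i,n) * B(n,j), n) reduces the expression over the placeholder n, so the assignment evaluates C(i,j) = sum over n of A(i,n) * B(n,j), i.e. an ordinary matrix product written in tensor-index notation.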
microseconds MatrixMul::mult_blas(const Args & args, std::mt19937 & gen) { std::cout << "Test: BLAS "; const MatrixMulArgs & cur_args = dynamic_cast<const MatrixMulArgs&>(args); uint32_t m, k, l; get_matrix_sizes(cur_args, m, k, l); double * A = new double[m * k]; double * B = new double[k * l]; double * C = new double[m * l]; initialize_matrices(A, A + m*k, B, B + k*l, gen, cur_args); auto start = std::chrono::high_resolution_clock::now(); cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, l, k, 1.0, A, k, B, l, 0.0, C, l); auto end = std::chrono::high_resolution_clock::now(); if( args.test ) { verify_results(C, C + m*l); } auto time = std::chrono::duration_cast<std::chrono::microseconds>( end - start); std::cout << time.count() << std::endl; delete[] A; delete[] B; delete[] C; return time; }
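For reference, the cblas_dgemm arguments above map onto C = alpha*A*B + beta*C as follows (row-major storage, so the leading dimensions are the row strides):

// cblas_dgemm(CblasRowMajor,
//             CblasNoTrans, CblasNoTrans,   // neither A nor B is transposed
//             m, l, k,                      // C is m x l; k is the shared inner dimension
//             1.0, A, k,                    // alpha = 1.0, lda = k
//             B, l,                         // ldb = l
//             0.0, C, l);                   // beta = 0.0 (C is overwritten), ldc = l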
microseconds MatrixMul::plain_call(const Args & args, std::mt19937 & gen) { std::cout << "Test: plain call "; const MatrixMulArgs & cur_args = dynamic_cast<const MatrixMulArgs&>(args); uint32_t m, k, l; get_matrix_sizes(cur_args, m, k, l); double * A = new double[m*k]; double * B = new double[k*l]; double * C = new double[m*l]; /** Initialize **/ initialize_matrices(A, A + m*k, B, B + k*l, gen, cur_args); /** Compute **/ auto start = std::chrono::high_resolution_clock::now(); for(uint32_t i = 0; i < m;++i) { for(uint32_t n = 0; n < l; ++n) { C[i*l + n] = A[i*k] * B[n]; } for(uint32_t j = 1; j < k; ++j) { for(uint32_t n = 0; n < l; ++n) { C[i*l + n] += A[i*k + j] * B[j*l + n]; } } } auto end = std::chrono::high_resolution_clock::now(); if( args.test ) { verify_results(C, C + m*l); } delete[] A; delete[] B; delete[] C; auto time = std::chrono::duration_cast<std::chrono::microseconds>( end - start); std::cout << time.count() << std::endl; return time; }
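The hand-written kernel above deliberately uses an i-k-j accumulation order: the innermost loop walks contiguous elements of B and C, which is cache-friendly for row-major data. For comparison, below is a minimal sketch of the textbook i-j-k ordering (reference only, not part of the benchmark); its inner loop strides through B a whole row apart on every step.

#include <cstdint>

// Reference-only sketch: textbook i-j-k matrix multiply.
// The inner loop reads B[p*l + j] with stride l, so it touches a new cache
// line of B on almost every iteration, unlike the i-k-j version above.
static void matmul_ijk(const double *A, const double *B, double *C,
                       uint32_t m, uint32_t k, uint32_t l)
{
    for (uint32_t i = 0; i < m; ++i) {
        for (uint32_t j = 0; j < l; ++j) {
            double acc = 0.0;
            for (uint32_t p = 0; p < k; ++p) {
                acc += A[i*k + p] * B[p*l + j];
            }
            C[i*l + j] = acc;
        }
    }
}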
int main( int argc, char* argv[] )
{
  int size = 100;
  int dest[size];
  int i;
  for ( i = 0; i < size; i++ )
    dest[i] = 0;

  test_stats_on();
  vvadd_scalar( dest, src0, src1, size );
  test_stats_off();

  verify_results( dest, ref, size );
  return 0;
}
int main(int argc, char *argv[])
{
    Elem *test_memory = test_memory_data;
    Elem *expected_memory = expected_memory_data;

    if (shuffle_memory(test_memory) == 0) {
        printf("ERROR: shuffle_memory failed. not verifying results \n");
        return 1;
    }

    if (verify_results(test_memory, expected_memory) == 0) {
        printf("ERROR: verify_results failed. \n");
        return 1;
    }

    return 0;
}
int main( int argc, char* argv[] )
{
  int size = 10;
  uint dest[size*size];
  int i;
  for ( i = 0; i < size*size; i++ )
    dest[i] = 0;

  int temp = 0;
  test_stats_on( temp );
  masked_filter_scalar( dest, mask, src, size, size, g_coeff );
  test_stats_off( temp );

  verify_results( dest, ref, size );
  return 0;
}
microseconds MatrixMul::boost_ublas(const Args & args, std::mt19937 & gen) { std::cout << "Test: boost uBLAS "; const MatrixMulArgs & cur_args = dynamic_cast<const MatrixMulArgs&>(args); uint32_t m, k, l; get_matrix_sizes(cur_args, m, k, l); boost::numeric::ublas::matrix<double> A(m, k), B(k, l), C(m, l); initialize_matrices(A.data().begin(), A.data().end(), B.data().begin(), B.data().end(), gen, cur_args); auto start = std::chrono::high_resolution_clock::now(); noalias(C) = prod( A, B ); auto end = std::chrono::high_resolution_clock::now(); if( args.test ) { verify_results(C.data().begin(), C.data().end()); } auto time = std::chrono::duration_cast<std::chrono::microseconds>( end - start); std::cout << time.count() << std::endl; return time; }
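The noalias(C) assignment tells uBLAS that C does not alias A or B, so prod(A, B) can be evaluated directly into C instead of going through the temporary matrix that the default, aliasing-safe assignment would allocate.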
microseconds MatrixMul::mult_blaze(const Args & args, std::mt19937 & gen) { std::cout << "Test: Blaze "; const MatrixMulArgs & cur_args = dynamic_cast<const MatrixMulArgs&>(args); uint32_t m, k, l; get_matrix_sizes(cur_args, m, k, l); blaze::DynamicMatrix<double, blaze::rowMajor> A(m, k), B(k, l), C(m, l); initialize_matrices(A.data(), A.data() + m*k, B.data(), B.data() + k*l, gen, cur_args); auto start = std::chrono::high_resolution_clock::now(); C = A * B; auto end = std::chrono::high_resolution_clock::now(); if( args.test ) { verify_results(C.data(), C.data() + m*l); } auto time = std::chrono::duration_cast<std::chrono::microseconds>( end - start); std::cout << time.count() << std::endl; return time; }
int main( int argc, char* argv[] )
{
  int size = C;
  int dest[size];
  int i;
  for ( i = 0; i < size; i++ )
    dest[i] = 0;

  int temp = 0;

  // warmup
  mvmult_scalar( dest, (int*) matrix, vector, R, C );

  test_stats_on( temp );
  for ( i = 0; i < 1; i++ )
    mvmult_scalar( dest, (int*) matrix, vector, R, C );
  test_stats_off( temp );

  verify_results( dest, ref, size );
  return 0;
}
/* for calling from Fortran */ void verify_results_(vector_t x[], vector_t v[]) { verify_results(x, v, "f90"); }
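Most Unix Fortran compilers (gfortran, for example) append a trailing underscore to external procedure names, so a Fortran "call verify_results(x, v)" resolves to the C symbol verify_results_ defined above; the extra "f90" argument is presumably a label that identifies the Fortran driver in the verification output.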
int cholesky_tiled(double *mat, int tile_size, int num_tiles, int mat_size,
                   int niter, int max_log_str, bool layRow, int verify,
                   int num_doms, int use_host, int num_mics, int host_ht_offset)
{
    //verification result (initialized to true so the final return value is
    //well-defined even when verification is skipped)
    bool result = true;

    //total number of tiles
    int tot_tiles = num_tiles * num_tiles;

    //memory allocation for matrix for tiled-Cholesky
    double *A_my = (double *)malloc(mat_size * mat_size * sizeof(double));

    //memory allocation for matrix for MKL cholesky (for comparison)
    double *A_MKL = (double *)malloc(mat_size * mat_size * sizeof(double));

    //memory allocation for tiled matrix
    double **Asplit = new double* [tot_tiles];

    int mem_size_tile = tile_size * tile_size * sizeof(double);

#define HSTR_BUFFER_PROPS_VALUES { \
        HSTR_MEM_TYPE_NORMAL,      \
        HSTR_MEM_ALLOC_PREFERRED,  \
        HSTR_BUF_PROP_ALIASED}

    HSTR_BUFFER_PROPS buffer_props = HSTR_BUFFER_PROPS_VALUES;

    for (int i = 0; i < tot_tiles; ++i) {
        //Buffer per tile, host allocation
        Asplit[i] = (double *)_mm_malloc(mem_size_tile, 64);

        //Buffer creation and allocation on the card
        //hStreams_app_create_buf((void *)Asplit[i], mem_size_tile);
        CHECK_HSTR_RESULT(hStreams_Alloc1DEx(
                              (void *)Asplit[i], mem_size_tile, &buffer_props, -1, NULL));
    }

    double tbegin, tend;
    int iter;
    int info;

    //Events are needed for various synchronizations to enforce
    //data dependence between and among data-transfers/computes
    HSTR_EVENT *eventcpyto = new HSTR_EVENT[tot_tiles];
    HSTR_EVENT *eventcpyto_trsm = new HSTR_EVENT[tot_tiles * num_doms];
    HSTR_EVENT *eventcpyfr = new HSTR_EVENT[tot_tiles];
    HSTR_EVENT *eventpotrf = new HSTR_EVENT[tot_tiles];
    HSTR_EVENT *eventtrsm = new HSTR_EVENT[tot_tiles];
    HSTR_EVENT *eventsyrk = new HSTR_EVENT[tot_tiles];
    HSTR_EVENT *eventgemm = new HSTR_EVENT[tot_tiles];

    //for timing tiled cholesky
    double *totTimeMsec = new double [niter];

    //for timing MKL cholesky
    double *totTimeMsecMKL = new double [niter];

    mkl_mic_disable();

    //these queues are used for queueing up compute on the card and
    //data transfers to/from the card.
    //q_trsm for dtrsm, q_potrf for dpotrf, q_syrk_gemm for both dsyrk and dgemm.
    //The queues are incremented by one for every compute queued and wrap
    //around the max_log_str available. This ensures good load-balancing.
int q_trsm, q_potrf; int q_syrk_gemm[10]; CBLAS_ORDER blasLay; int lapackLay; if (layRow) { blasLay = CblasRowMajor; lapackLay = LAPACK_ROW_MAJOR; } else { blasLay = CblasColMajor; lapackLay = LAPACK_COL_MAJOR; } for (iter = 0; iter < niter; ++iter) { //copying matrices into separate variables for tiled cholesky (A_my) //and MKL cholesky (A_MKL) //The output overwrites the matrices and hence the need to copy //for each iteration copy_mat(mat, A_my, mat_size); copy_mat(mat, A_MKL, mat_size); unsigned int m, n, k; printf("\nIteration = %d\n", iter); //splitting time included in the timing //This splits the input matrix into tiles (or blocks) split_into_blocks(A_my, Asplit, num_tiles, tile_size, mat_size, layRow); //beginning of timing tbegin = dtimeGet(); int ic; int is_mic; for (ic = 0; ic < num_doms; ++ic) { q_syrk_gemm[ic] = 0; } q_potrf = 0; q_trsm = 0; for (k = 0; k < num_tiles; ++k) { //POTRF //dpotrf is executed on the host on the diagonal tile if (mach_wide_league) { q_potrf = 0; } else { q_potrf = q_syrk_gemm[0]; } int qindex = (int)q_potrf % max_log_str; if (use_host) { if (k == 0) { if (loc_verbose > 0) printf("Sending tile[%d][%d] to host in queue %d, triggering event eventcpyto[%d][%d]\n", k, k, (int)(qindex), k, k); hStreams_app_xfer_memory((int)(qindex), Asplit[k * num_tiles + k], Asplit[k * num_tiles + k], mem_size_tile, HSTR_SRC_TO_SINK, &eventcpyto[k * num_tiles + k]); } } if (k > 0) { if (use_host) { hStreams_app_event_wait_in_stream(qindex, 1, &eventcpyfr[k * num_tiles + k], 0, NULL, NULL); } else { hStreams_app_event_wait(1, &eventcpyfr[k * num_tiles + k]); } if (loc_verbose > 0) { printf("Waiting on eventcpyfr[%d]\n", k * num_tiles + k); } } if (loc_verbose > 0) printf("Executing potrf on host for tile[%d][%d], in queue (if use_host) %d, triggerring eventpotrf[%d][%d]\n", k, k, qindex, k, k); if (use_host) { CHECK_HSTR_RESULT(hStreams_custom_dpotrf(lapackLay, 'L', tile_size, Asplit[k * num_tiles + k], tile_size, qindex, &eventpotrf[k * num_tiles + k])); } else { info = LAPACKE_dpotrf(lapackLay, 'L', tile_size, Asplit[k * num_tiles + k], tile_size); } if (mach_wide_league) { q_trsm = q_syrk_gemm[0]; } else { q_potrf++; q_trsm = q_potrf; } for (m = k + 1; m < num_tiles; ++m) { if (mach_wide_league) { qindex = (int)(q_trsm % max_log_str + 1); } else { qindex = (int)(q_trsm % max_log_str); } if (use_host) { if (k == 0) { if (loc_verbose > 0) printf("Sending tile[%d][%d] to host in queue %d, triggering event eventcpyto[%d][%d]\n", m, k, (int)(qindex), m, k); hStreams_app_xfer_memory((int)(qindex), Asplit[m * num_tiles + k], Asplit[m * num_tiles + k], mem_size_tile, HSTR_SRC_TO_SINK, &eventcpyto[m * num_tiles + k]); } } if (k > 0) { if (use_host) { hStreams_app_event_wait_in_stream(qindex, 1, &eventcpyfr[m * num_tiles + k], 0, NULL, NULL); } else { hStreams_app_event_wait(1, &eventcpyfr[m * num_tiles + k]); } if (loc_verbose > 0) { printf("Waiting on eventcpyfr[%d]\n", m * num_tiles + k); } } if (use_host) //hStreams_app_event_wait(1, &eventpotrf[k*num_tiles + k]); { hStreams_app_event_wait_in_stream(qindex, 1, &eventpotrf[k * num_tiles + k], 0, NULL, NULL); } //dtrsm is executed on the host if (loc_verbose > 0) printf("Executing trsm for tile[%d][%d] on host, in queue (if use_host) %d, triggering eventtrsm[%d][%d]\n", m, k, qindex, m, k); if (use_host) { CHECK_HSTR_RESULT(hStreams_custom_dtrsm(blasLay, CblasRight, CblasLower, CblasTrans, CblasNonUnit, tile_size, tile_size, 1.0, Asplit[k * num_tiles + k], tile_size, Asplit[m * num_tiles + k], tile_size, qindex, &eventtrsm[m * 
num_tiles + k])); } else { cblas_dtrsm(blasLay, CblasRight, CblasLower, CblasTrans, CblasNonUnit, tile_size, tile_size, 1.0, Asplit[k * num_tiles + k], tile_size, Asplit[m * num_tiles + k], tile_size); } //transfer to all cards for (ic = 0; ic < num_doms; ++ic) { if ((use_host == 1) && (num_mics >= 1)) { if (ic == 0) { is_mic = 0; //this is host } else { is_mic = 1; } } else { is_mic = 0; } if (mach_wide_league) { qindex = (int)q_trsm % max_log_str + ic * max_log_str + 1 + is_mic * host_ht_offset; } else { qindex = (int)q_trsm % max_log_str + ic * max_log_str + is_mic * host_ht_offset; } if (use_host) //hStreams_app_event_wait(1, &eventtrsm[m*num_tiles + k]); { hStreams_app_event_wait_in_stream(qindex, 1, &eventtrsm[m * num_tiles + k], 0, NULL, NULL); } if (loc_verbose > 0) printf("Sending tile[%d][%d] to card %d in queue %d, triggering event eventcpyto_trsm[%d]\n", m, k, ic, (int)(qindex), m * num_tiles + k + ic * tot_tiles); hStreams_app_xfer_memory((int)(qindex), Asplit[m * num_tiles + k], Asplit[m * num_tiles + k], mem_size_tile, HSTR_SRC_TO_SINK, &eventcpyto_trsm[m * num_tiles + k + ic * tot_tiles]); } q_trsm++; } if (use_host) { q_syrk_gemm[0] = q_trsm; for (ic = 1; ic < num_doms; ++ic) { q_syrk_gemm[ic] = 0; } } else { for (ic = 0; ic < num_doms; ++ic) { q_syrk_gemm[ic] = 0; } } for (n = k + 1; n < num_tiles; ++n) { ic = n % num_doms; //round-robin rows across num_doms if ((use_host == 1) && (num_mics >= 1)) { if (ic == 0) { is_mic = 0; //this is host } else { is_mic = 1; } } else { is_mic = 0; } if (mach_wide_league) { qindex = q_syrk_gemm[ic] % max_log_str + ic * max_log_str + 1 + is_mic * host_ht_offset; } else { qindex = q_syrk_gemm[ic] % max_log_str + ic * max_log_str + is_mic * host_ht_offset; } if (k == 0) { if (loc_verbose > 0) printf("Sending tile[%d][%d] to card in queue %d\n", n, n, (int)(qindex)); hStreams_app_xfer_memory((int)(qindex), Asplit[n * num_tiles + n], Asplit[n * num_tiles + n], mem_size_tile, HSTR_SRC_TO_SINK, &eventcpyto[n * num_tiles + n]); } //DSYRK //hStreams_app_event_wait(1, &eventcpyto_trsm[n*num_tiles + k + ic*tot_tiles]); hStreams_app_event_wait_in_stream(qindex, 1, &eventcpyto_trsm[n * num_tiles + k + ic * tot_tiles], 0, NULL, NULL); if (loc_verbose > 0) { printf("Waiting on eventcpyto_trsm[%d]\n", n * num_tiles + k + ic * tot_tiles); } if (k > 0) { //hStreams_app_event_wait(1, &eventsyrk[n*num_tiles + n]); hStreams_app_event_wait_in_stream(qindex, 1, &eventsyrk[n * num_tiles + n], 0, NULL, NULL); if (loc_verbose > 0) { printf("Waiting on eventsyrk[%d]\n", n * num_tiles + n); } } //dsyrk is executed on the card if (loc_verbose > 0) printf("Executing syrk for tile[%d][%d] on card in queue %d, triggering event eventsyrk[%d]\n", n, n, (int)(qindex), n * num_tiles + n); CHECK_HSTR_RESULT(hStreams_custom_dsyrk(blasLay, CblasLower, CblasNoTrans, tile_size, tile_size, -1.0, Asplit[n * num_tiles + k], tile_size, 1.0, Asplit[n * num_tiles + n], tile_size, (int)(qindex), &eventsyrk[n * num_tiles + n])); //send tile to host (only if n = k+1) if (n == k + 1) { if (loc_verbose > 0) printf("Sending tile[%d][%d] from card to host in queue %d, triggering event eventcpyfr[%d]\n", n, n, (int)(qindex), n * num_tiles + n); hStreams_app_xfer_memory((int)(qindex), Asplit[n * num_tiles + n], Asplit[n * num_tiles + n], mem_size_tile, HSTR_SINK_TO_SRC, &eventcpyfr[n * num_tiles + n]); } q_syrk_gemm[ic]++; for (m = n + 1; m < num_tiles; ++m) { ic = m % num_doms; //round-robin rows across num_doms if ((use_host == 1) && (num_mics >= 1)) { if (ic == 0) { is_mic = 0; //this is 
host } else { is_mic = 1; } } else { is_mic = 0; } if (mach_wide_league) { qindex = q_syrk_gemm[ic] % max_log_str + ic * max_log_str + 1 + is_mic * host_ht_offset; } else { qindex = q_syrk_gemm[ic] % max_log_str + ic * max_log_str + is_mic * host_ht_offset; } if (k == 0) { if (loc_verbose > 0) printf("Sending tile[%d][%d] to card in queue %d\n", m, n, (int)(qindex)); hStreams_app_xfer_memory((int)(qindex), Asplit[m * num_tiles + n], Asplit[m * num_tiles + n], mem_size_tile, HSTR_SRC_TO_SINK, &eventcpyto[m * num_tiles + n]); } //DGEMM if (loc_verbose > 0) { printf("Waiting on eventcpyto_trsm[%d]\n", m * num_tiles + k + ic * tot_tiles); } //hStreams_app_event_wait(1, &eventcpyto_trsm[m*num_tiles + k + ic*tot_tiles]); hStreams_app_event_wait_in_stream(qindex, 1, &eventcpyto_trsm[m * num_tiles + k + ic * tot_tiles], 0, NULL, NULL); if (loc_verbose > 0) { printf("Waiting on eventcpyto_trsm[%d]\n", n * num_tiles + k + ic * tot_tiles); } //hStreams_app_event_wait(1, &eventcpyto_trsm[n*num_tiles + k + ic*tot_tiles]); hStreams_app_event_wait_in_stream(qindex, 1, &eventcpyto_trsm[n * num_tiles + k + ic * tot_tiles], 0, NULL, NULL); if (k > 0) { //hStreams_app_event_wait(1, &eventgemm[m*num_tiles + n]); hStreams_app_event_wait_in_stream(qindex, 1, &eventgemm[m * num_tiles + n], 0, NULL, NULL); if (loc_verbose > 0) { printf("Waiting on eventgemm[%d]\n", m * num_tiles + n); } } //dgemm is executed on the card if (loc_verbose > 0) printf("Executing gemm for tile[%d][%d] on card in queue %d, triggering event eventgemm[%d]\n", m, n, (int)(qindex), m * num_tiles + n); CHECK_HSTR_RESULT(hStreams_app_dgemm((int)(qindex), blasLay, CblasNoTrans, CblasTrans, tile_size, tile_size, tile_size, -1.0, Asplit[m * num_tiles + k], tile_size, Asplit[n * num_tiles + k], tile_size, 1.0, Asplit[m * num_tiles + n], tile_size, &eventgemm[m * num_tiles + n])); //send tile to host (only if n = k+1) if (n == k + 1) { if (loc_verbose > 0) printf("Sending tile[%d][%d] from card to host in queue %d, triggering event eventcpyfr[%d]\n", m, n, (int)(qindex), m * num_tiles + n); hStreams_app_xfer_memory( (int)(qindex), Asplit[m * num_tiles + n], Asplit[m * num_tiles + n], mem_size_tile, HSTR_SINK_TO_SRC, &eventcpyfr[m * num_tiles + n]); } q_syrk_gemm[ic]++; } } } //syncrhonizing all the streams hStreams_app_thread_sync(); //end of timing tend = dtimeGet(); totTimeMsec[iter] = 1e3 * (tend - tbegin); printf("time for Tiled hstreams Cholesky for iteration %d = %.2f msec\n", iter, totTimeMsec[iter]); //assembling of tiles back into full matrix assemble(Asplit, A_my, num_tiles, tile_size, mat_size, layRow); //calling mkl cholesky for verification and timing comparison. 
        //Using auto-offload feature of MKL
        tbegin = dtimeGet();

        //calling MKL dpotrf on the full matrix
        info = LAPACKE_dpotrf(lapackLay, 'L', mat_size, A_MKL, mat_size);

        tend = dtimeGet();
        totTimeMsecMKL[iter] = 1e3 * (tend - tbegin);
        printf("time for MKL Cholesky (AO) for iteration %d = %.2f msec\n",
               iter, totTimeMsecMKL[iter]);
        if (info != 0) {
            printf("error with dpotrf\n");
        }
        mkl_mic_disable();

        if (verify == 1) {
            result = verify_results(A_my, A_MKL, mat_size * mat_size);
            if (result == true) {
                printf("Tiled Cholesky successful\n");
            } else {
                printf("Tiled Cholesky failed\n");
            }
        }
    } //end of loop over iterations

    double meanTimeMsec, stdDevMsec;
    double meanTimeMsecMKL, stdDevMsecMKL;

    mean_and_stdev(totTimeMsec, meanTimeMsec, stdDevMsec, niter);
    mean_and_stdev(totTimeMsecMKL, meanTimeMsecMKL, stdDevMsecMKL, niter);

    double gflops = pow(mat_size, 3.0) / 3.0 * 1e-9;

    printf("\nMatrix size = %d\n", mat_size);
    printf("Tiled hStreams Cholesky: for %d iterations (ignoring first),\n"
           "mean Time = %.2f msec, stdDev Time = %.2f msec,\n"
           "Mean Gflops (using mean Time) = %.2f\n",
           niter - 1, meanTimeMsec, stdDevMsec, gflops / (meanTimeMsec * 1e-3));
    printf("\nMKL AO Cholesky: for %d iterations (ignoring first),\n"
           "mean Time = %.2f msec, stdDev Time = %.2f msec,\n"
           "Mean Gflops (using meanTime) = %.2f\n\n",
           niter - 1, meanTimeMsecMKL, stdDevMsecMKL, gflops / (meanTimeMsecMKL * 1e-3));

    //Free
    free(A_my);
    free(A_MKL);
    for (int i = 0; i < tot_tiles; ++i) {
        _mm_free(Asplit[i]);
    }
    delete [] Asplit;
    delete [] eventcpyto;
    delete [] eventcpyto_trsm;
    delete [] eventcpyfr;
    delete [] eventpotrf;
    delete [] eventtrsm;
    delete [] eventsyrk;
    delete [] eventgemm;
    delete [] totTimeMsec;
    delete [] totTimeMsecMKL;

    // true result indicates all OK
    if (result) {
        return 0;
    }
    return 1;
}
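The mean_and_stdev helper is not shown in this listing. Below is a minimal sketch of one plausible implementation; it assumes the helper drops the first sample as a warm-up, consistent with the "ignoring first" wording in the summary printout, and the actual helper may differ.

#include <cmath>

// Hypothetical sketch of a mean_and_stdev-style helper (assumes sample 0 is a
// warm-up iteration and is excluded, matching the "ignoring first" printouts).
static void mean_and_stdev_sketch(const double *t, double &mean, double &stdev, int n)
{
    if (n < 2) {
        mean = stdev = 0.0;
        return;
    }

    mean = 0.0;
    for (int i = 1; i < n; ++i) {
        mean += t[i];
    }
    mean /= (n - 1);

    double var = 0.0;
    for (int i = 1; i < n; ++i) {
        var += (t[i] - mean) * (t[i] - mean);
    }
    stdev = std::sqrt(var / (n - 1));
}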
/* * The primary compute function for the bucket sort * Executes the sum of NUM_ITERATIONS + BURN_IN iterations, as defined in params.h * Only iterations after the BURN_IN iterations are timed * Only the final iteration calls the verification function */ static int bucket_sort(void) { int err = 0; init_timers(NUM_ITERATIONS); #ifdef PERMUTE create_permutation_array(); #endif for(uint64_t i = 0; i < (NUM_ITERATIONS + BURN_IN); ++i) { // Reset timers after burn in if(i == BURN_IN){ init_timers(NUM_ITERATIONS); } SHMEM_BARRIER_AT_START; timer_start(&timers[TIMER_TOTAL]); KEY_TYPE * my_keys = make_input(); int * local_bucket_sizes = count_local_bucket_sizes(my_keys); int * send_offsets; int * local_bucket_offsets = compute_local_bucket_offsets(local_bucket_sizes, &send_offsets); KEY_TYPE * my_local_bucketed_keys = bucketize_local_keys(my_keys, local_bucket_offsets); KEY_TYPE * my_bucket_keys = exchange_keys(send_offsets, local_bucket_sizes, my_local_bucketed_keys); my_bucket_size = receive_offset; int * my_local_key_counts = count_local_keys(my_bucket_keys); SHMEM_BARRIER_AT_END; timer_stop(&timers[TIMER_TOTAL]); // Only the last iteration is verified if(i == NUM_ITERATIONS) { err = verify_results(my_local_key_counts, my_bucket_keys); } // Reset receive_offset used in exchange_keys receive_offset = 0; free(my_local_bucketed_keys); free(my_keys); free(local_bucket_sizes); free(local_bucket_offsets); free(send_offsets); free(my_local_key_counts); shmem_barrier_all(); } return err; }
void test_perf_nb(int dry_run) { int i, j, loop, rc, bytes, elems[2] = {MAXPROC, MAXELEMS}; int stride, k=0, ntimes; double stime, t1, t2, t3, t4, t5, t6, t7, t8, t9; double *dsrc[MAXPROC], scale=1.0; armci_hdl_t hdl_get, hdl_put, hdl_acc; create_array((void**)ddst, sizeof(double),2, elems); create_array((void**)dsrc, sizeof(double),1, &elems[1]); if(!dry_run)if(me == 0) { printf("\n\t\t\tRemote 1-D Array Section\n"); printf("section get nbget wait put nbput "); printf(" wait acc nbacc wait\n"); printf("------- -------- -------- -------- -------- --------"); printf(" -------- -------- -------- --------\n"); fflush(stdout); } for(loop=1; loop<=MAXELEMS; loop*=2, k++) { elems[1] = loop; ntimes = (int)sqrt((double)(MAXELEMS/elems[1])); if(ntimes <1) ntimes=1; /* -------------------------- SETUP --------------------------- */ /*initializing non-blocking handles,time,src & dst buffers*/ ARMCI_INIT_HANDLE(&hdl_put); ARMCI_INIT_HANDLE(&hdl_get); ARMCI_INIT_HANDLE(&hdl_acc); t1 = t2 = t3 = t4 = t5 = t6 = t7 = t8 = t9 = 0.0; for(i=0; i<elems[1]; i++) dsrc[me][i]=i*1.001*(me+1); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); /* bytes transfered */ bytes = sizeof(double)*elems[1]; MP_BARRIER(); /* -------------------------- PUT/GET -------------------------- */ if(me == 0) { for(i=1; i<nproc; i++) { stime=MP_TIMER(); for(j=0; j<ntimes; j++) if((rc=ARMCI_Put(&dsrc[me][0], &ddst[i][me*elems[1]], bytes,i))) ARMCI_Error("armci_nbput failed\n",rc); t1 += MP_TIMER()-stime; } } MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(PUT, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); if(me == 0) { for(i=1; i<nproc; i++) { stime=MP_TIMER(); for(j=0; j<ntimes; j++) if((rc=ARMCI_Get(&dsrc[i][0], &ddst[me][i*elems[1]], bytes,i))) ARMCI_Error("armci_nbget failed\n",rc); t4 += MP_TIMER()-stime; } } MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(GET, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); /* ------------------------ nb PUT/GET ------------------------- */ if(me == 0) { for(i=1; i<nproc; i++) { for(j=0; j<ntimes; j++) { stime=MP_TIMER(); if((rc=ARMCI_NbPut(&dsrc[me][0], &ddst[i][me*elems[1]], bytes, i, &hdl_put))) ARMCI_Error("armci_nbput failed\n",rc); t2 += MP_TIMER()-stime; stime=MP_TIMER(); ARMCI_Wait(&hdl_put); t3 += MP_TIMER()-stime; } } } MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(PUT, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); if(me == 0) { for(i=1; i<nproc; i++) { for(j=0; j<ntimes; j++) { stime=MP_TIMER(); if((rc=ARMCI_NbGet(&dsrc[i][0], &ddst[me][i*elems[1]], bytes, i, &hdl_get))) ARMCI_Error("armci_nbget failed\n",rc); t5 += MP_TIMER()-stime; stime=MP_TIMER(); ARMCI_Wait(&hdl_get); t6 += MP_TIMER()-stime; } } } MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(GET, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); /* ------------------------ Accumulate ------------------------- */ for(i=0; i<elems[1]; i++) dsrc[me][i]=1.0; MP_BARRIER(); stride = elems[1]*sizeof(double); scale = 1.0; for(j=0; j<ntimes; j++) { stime=MP_TIMER(); if((rc=ARMCI_AccS(ARMCI_ACC_DBL, &scale, &dsrc[me][0], &stride, &ddst[0][0], &stride, &bytes, 0, 0))) ARMCI_Error("armci_acc failed\n",rc); t7 += MP_TIMER()-stime; MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(ACC, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); } #if 1 /* See the note below why this part 
is disabled */ /* ---------------------- nb-Accumulate ------------------------ */ for(i=0; i<elems[1]; i++) dsrc[me][i]=1.0; MP_BARRIER(); stride = elems[1]*sizeof(double); scale = 1.0; for(j=0; j<ntimes; j++) { stime=MP_TIMER(); if((rc=ARMCI_NbAccS(ARMCI_ACC_DBL, &scale, &dsrc[me][0], &stride, &ddst[0][0], &stride, &bytes, 0, 0, &hdl_acc))) ARMCI_Error("armci_nbacc failed\n",rc); t8 += MP_TIMER()-stime; stime=MP_TIMER(); ARMCI_Wait(&hdl_acc); t9 += MP_TIMER()-stime; MP_BARRIER(); ARMCI_AllFence(); MP_BARRIER(); if(VERIFY) verify_results(ACC, elems); for(i=0; i<elems[0]*elems[1]; i++) ddst[me][i]=0.0; MP_BARRIER(); } #endif /* print timings */ if(!dry_run) if(me==0) printf("%d\t %.2e %.2e %.2e %.2e %.2e %.2e %.2e %.2e %.2e\n", bytes, t4/ntimes, t5/ntimes, t6/ntimes, t1/ntimes, t2/ntimes, t3/ntimes, t7/ntimes, t8/ntimes, t9/ntimes); } ARMCI_AllFence(); MP_BARRIER(); if(!dry_run)if(me==0){printf("O.K.\n"); fflush(stdout);} destroy_array((void **)ddst); destroy_array((void **)dsrc); }
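Note the column order of the per-size result line printed above: after the byte count, the values are t4/t5/t6 (blocking get, non-blocking get issue, wait), t1/t2/t3 (blocking put, non-blocking put issue, wait), and t7/t8/t9 (blocking accumulate, non-blocking accumulate issue, wait), each averaged over ntimes transfers, matching the "get nbget wait put nbput wait acc nbacc wait" header printed at the start of test_perf_nb.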
void cholesky_tiled(double *mat, int tile_size, int num_tiles, int mat_size, int niter, int max_log_str, bool layRow, int verify) { //total number of tiles int tot_tiles = num_tiles * num_tiles; //memory allocation for matrix for tiled-Cholesky double *A_my = (double *)malloc(mat_size * mat_size * sizeof(double)); //memory allocation for matrix for MKL cholesky (for comparison) double *A_MKL = (double *)malloc(mat_size * mat_size * sizeof(double)); //memory allocation for tiled matrix double **Asplit = new double* [tot_tiles]; int mem_size_tile = tile_size * tile_size * sizeof(double); for (int i = 0; i < tot_tiles; ++i) { //Buffer per tile, host allocation Asplit[i] = (double *)_mm_malloc(mem_size_tile, 64); //Buffer creation and allocation on the card hStreams_app_create_buf((void *)Asplit[i], mem_size_tile); } double tbegin, tend; int iter; int info; //Events are needed for various synchronizations to enforce //data dependence between and among data-transfers/computes HSTR_EVENT *eventcpyto = new HSTR_EVENT[tot_tiles]; HSTR_EVENT *eventcpyfr = new HSTR_EVENT[tot_tiles]; HSTR_EVENT *eventpotrf = new HSTR_EVENT[tot_tiles]; HSTR_EVENT *eventtrsm = new HSTR_EVENT[tot_tiles]; HSTR_EVENT *eventsyrk = new HSTR_EVENT[tot_tiles]; HSTR_EVENT *eventgemm = new HSTR_EVENT[tot_tiles]; //for timing tiled cholesky double *totTimeMsec = new double [niter]; //for timing MKL cholesky double *totTimeMsecMKL = new double [niter]; HSTR_RESULT res; //these queues are used for queining up compute on the card and //data transfers to/from the card. //q_trsm for dtrsm, q_potrf for dportf, q_syrk_gemm for both dsyrk and dgemm. //The queues are incremented by one for every compute queued and wrap //around the max_log_str available. This ensures good load-balancing. int q_trsm, q_potrf, q_syrk_gemm; CBLAS_ORDER blasLay; int lapackLay; if (layRow) { blasLay = CblasRowMajor; lapackLay = LAPACK_ROW_MAJOR; } else { blasLay = CblasColMajor; lapackLay = LAPACK_COL_MAJOR; } for (iter = 0; iter < niter; ++iter) { //copying matrices into separate variables for tiled cholesky (A_my) //and MKL cholesky (A_MKL) //The output overwrites the matrices and hence the need to copy //for each iteration copy_mat(mat, A_my, mat_size); copy_mat(mat, A_MKL, mat_size); unsigned int m, n, k; printf("\nIteration = %d\n", iter); split_into_blocks(A_my, Asplit, num_tiles, tile_size, mat_size, layRow); //beginning of timing tbegin = dtimeGet(); //splitting time included in the timing //This splits the input matrix into tiles (or blocks) //split_into_blocks(A_my, Asplit, num_tiles, tile_size, mat_size, layRow); q_potrf = 0; for (k = 0; k < num_tiles; ++k) { //POTRF //dpotrf is executed on the host on the diagonal tile //the results are then sent to the card if (k > 0) { hStreams_app_event_wait(1, &eventsyrk[k * num_tiles + k]); if (loc_verbose > 0) printf("Sending tile[%d][%d] to host in queue %d\n", k, k, (int)(q_potrf % max_log_str)) ; hStreams_app_xfer_memory(Asplit[k * num_tiles + k], Asplit[k * num_tiles + k], mem_size_tile, (int)(q_potrf % max_log_str), HSTR_SINK_TO_SRC, &eventcpyfr[k * num_tiles + k]); hStreams_app_event_wait(1, &eventcpyfr[k * num_tiles + k]); } if (loc_verbose > 0) { printf("Executing potrf on host for tile[%d][%d]\n", k, k); } info = LAPACKE_dpotrf(lapackLay, 'L', tile_size, Asplit[k * num_tiles + k], tile_size); if (k < num_tiles - 1) { if (loc_verbose > 0) printf("Sending tile[%d][%d] to card in queue %d\n", k, k, (int)(q_potrf % max_log_str)); hStreams_app_xfer_memory(Asplit[k * num_tiles + k], Asplit[k * num_tiles + 
k], mem_size_tile, (int)(q_potrf % max_log_str), HSTR_SRC_TO_SINK, &eventcpyto[k * num_tiles + k]); } q_potrf++; q_trsm = 0; for (m = k + 1; m < num_tiles; ++m) { if (k == 0) { if (loc_verbose > 0) printf("Sending tile[%d][%d] to card in queue %d\n", m, k, (int)(q_trsm % max_log_str)); hStreams_app_xfer_memory(Asplit[m * num_tiles + k], Asplit[m * num_tiles + k], mem_size_tile, (int)(q_trsm % max_log_str), HSTR_SRC_TO_SINK, &eventcpyto[m * num_tiles + k]); } //DTRSM hStreams_app_event_wait(1, &eventcpyto[k * num_tiles + k]); if (k > 0) { hStreams_app_event_wait(1, &eventgemm[m * num_tiles + k]); } //dtrsm is executed on the card if (loc_verbose > 0) printf("Executing trsm for tile[%d][%d] on card in queue %d\n", m, k, (int)(q_trsm % max_log_str)); res = hStreams_custom_dtrsm(blasLay, CblasRight, CblasLower, CblasTrans, CblasNonUnit, tile_size, tile_size, 1.0, Asplit[k * num_tiles + k], tile_size, Asplit[m * num_tiles + k], tile_size, (int)(q_trsm % max_log_str), &eventtrsm[m * num_tiles + k]); if (loc_verbose > 0) printf("Sending tile[%d][%d] back to host in queue %d\n", m, k, (int)(q_trsm % max_log_str)); hStreams_app_xfer_memory(Asplit[m * num_tiles + k], Asplit[m * num_tiles + k], mem_size_tile, (int)(q_trsm % max_log_str), HSTR_SINK_TO_SRC, &eventcpyfr[m * num_tiles + k]); q_trsm++; } q_syrk_gemm = 0; for (n = k + 1; n < num_tiles; ++n) { if (k == 0) { if (loc_verbose > 0) printf("Sending tile[%d][%d] to card in queue %d\n", n, n, (int)(q_syrk_gemm % max_log_str)); hStreams_app_xfer_memory(Asplit[n * num_tiles + n], Asplit[n * num_tiles + n], mem_size_tile, (int)(q_syrk_gemm % max_log_str), HSTR_SRC_TO_SINK, &eventcpyto[n * num_tiles + n]); } //DSYRK hStreams_app_event_wait(1, &eventtrsm[n * num_tiles + k]); if (k > 0) { hStreams_app_event_wait(1, &eventsyrk[n * num_tiles + n]); } //dsyrk is executed on the card if (loc_verbose > 0) printf("Executing syrk for tile[%d][%d] on card in queue %d\n", n, n, (int)(q_syrk_gemm % max_log_str)); res = hStreams_custom_dsyrk(blasLay, CblasLower, CblasNoTrans, tile_size, tile_size, -1.0, Asplit[n * num_tiles + k], tile_size, 1.0, Asplit[n * num_tiles + n], tile_size, (int)(q_syrk_gemm % max_log_str), &eventsyrk[n * num_tiles + n]); q_syrk_gemm++; for (m = n + 1; m < num_tiles; ++m) { if (k == 0) { if (loc_verbose > 0) printf("Sending tile[%d][%d] to card in queue %d\n", m, n, (int)(q_syrk_gemm % max_log_str)); hStreams_app_xfer_memory(Asplit[m * num_tiles + n], Asplit[m * num_tiles + n], mem_size_tile, (int)(q_syrk_gemm % max_log_str), HSTR_SRC_TO_SINK, &eventcpyto[m * num_tiles + n]); } //DGEMM hStreams_app_event_wait(1, &eventtrsm[m * num_tiles + k]); hStreams_app_event_wait(1, &eventtrsm[n * num_tiles + k]); if (k > 0) { hStreams_app_event_wait(1, &eventgemm[m * num_tiles + n]); } //dgemm is executed on the card if (loc_verbose > 0) printf("Executing gemm for tile[%d][%d] on card in queue %d\n", m, n, (int)(q_syrk_gemm % max_log_str)); res = hStreams_app_dgemm(blasLay, CblasNoTrans, CblasTrans, tile_size, tile_size, tile_size, -1.0, Asplit[m * num_tiles + k], tile_size, Asplit[n * num_tiles + k], tile_size, 1.0, Asplit[m * num_tiles + n], tile_size, (int)(q_syrk_gemm % max_log_str), &eventgemm[m * num_tiles + n]); q_syrk_gemm++; } } } //syncrhonizing all the streams hStreams_app_thread_sync(); //end of timing tend = dtimeGet(); totTimeMsec[iter] = 1e3 * (tend - tbegin); printf("time for Tiled hstreams Cholesky for iteration %d = %.2f msec\n", iter, totTimeMsec[iter]); //assembling of tiles back into full matrix assemble(Asplit, A_my, num_tiles, 
                 tile_size, mat_size, layRow);

        //calling mkl cholesky for verification and timing comparison.
        //Using auto-offload feature of MKL
#ifndef _WIN32
        //FIXME: calling this function causes a crash on Windows
        mkl_mic_enable();
#endif
        tbegin = dtimeGet();

        //calling MKL dpotrf on the full matrix
        info = LAPACKE_dpotrf(lapackLay, 'L', mat_size, A_MKL, mat_size);

        tend = dtimeGet();
        totTimeMsecMKL[iter] = 1e3 * (tend - tbegin);
        printf("time for MKL Cholesky (AO) for iteration %d = %.2f msec\n",
               iter, totTimeMsecMKL[iter]);
        if (info != 0) {
            printf("error with dpotrf\n");
        }
        mkl_mic_disable();

        if (verify == 1) {
            bool result = verify_results(A_my, A_MKL, mat_size * mat_size);
            if (result == true) {
                printf("Tiled Cholesky successful\n");
            } else {
                printf("Tiled Cholesky failed\n");
            }
        }
    } //end of loop over iterations

    double meanTimeMsec, stdDevMsec;
    double meanTimeMsecMKL, stdDevMsecMKL;

    mean_and_stdev(totTimeMsec, meanTimeMsec, stdDevMsec, niter);
    mean_and_stdev(totTimeMsecMKL, meanTimeMsecMKL, stdDevMsecMKL, niter);

    double gflops = pow(mat_size, 3.0) / 3.0 * 1e-9;

    printf("\nMatrix size = %d\n", mat_size);
    printf("Tiled hStreams Cholesky: for %d iterations (ignoring first),\n"
           "mean Time = %.2f msec, stdDev Time = %.2f msec,\n"
           "Mean Gflops (using mean Time) = %.2f\n",
           niter - 1, meanTimeMsec, stdDevMsec, gflops / (meanTimeMsec * 1e-3));
    printf("\nMKL AO Cholesky: for %d iterations (ignoring first),\n"
           "mean Time = %.2f msec, stdDev Time = %.2f msec,\n"
           "Mean Gflops (using meanTime) = %.2f\n\n",
           niter - 1, meanTimeMsecMKL, stdDevMsecMKL, gflops / (meanTimeMsecMKL * 1e-3));

    //Free
    free(A_my);
    free(A_MKL);
    for (int i = 0; i < tot_tiles; ++i) {
        _mm_free(Asplit[i]);
    }
    delete [] Asplit;
    delete [] eventcpyto;
    delete [] eventcpyfr;
    delete [] eventpotrf;
    delete [] eventtrsm;
    delete [] eventsyrk;
    delete [] eventgemm;
    delete [] totTimeMsec;
    delete [] totTimeMsecMKL;
}
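As a sanity check on the performance figures printed above: the Cholesky factorization of an n-by-n matrix costs roughly n^3/3 floating-point operations, which is exactly what the gflops variable encodes.

// gflops           = mat_size^3 / 3 * 1e-9       -> total Gflop for one factorization
// reported Gflop/s = gflops / (meanTimeMsec * 1e-3)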