int main(int argc, char **argv) { int dimX = 4; int dimY = 64; int dimZ = 64; int numIters = 64; // Process args from command line*/ int argi = 1; int errFlag = 0; while (argi < argc) { char *one = argv[argi]; if (!strcmp(one, "-d") && argc > argi + 3) { dimX = atoi(argv[argi + 1]); dimY = atoi(argv[argi + 2]); dimZ = atoi(argv[argi + 3]); argi += 4; } else if (!strcmp(one, "-n") && argc > argi + 1) { numIters = atoi(argv[argi + 1]); /* Make numIters a multiple of 16 */ numIters = numIters & (~0xf); argi += 2; } // Also take tile_size from command line*/ else if (!strcmp(one, "-t") && argc > argi + 1) { tile_size = atoi(argv[argi + 1]); if (dimY % tile_size != 0) { printf("The given tile_size does not divide dimY evenly!\n"); return -1; } argi += 2; } else { errFlag = 1; break; } } if (errFlag) { printf("Usage: %s [-d [the size of each dimension]] " "[-n [the number of iterations in the kernel]] " "[-t tile_size]\n", argv[0]); return -1; } if (dimY != dimZ) { printf("DimY is not the same as DimZ, " "changing dimZ to make them alike!\n"); dimZ = dimY; } if (dimX * dimY * dimZ * numIters % 64 != 0) { printf("The production of DimX, DimY, DimZ and " "#iters must be multiples of 64!\n"); return -1; } printf("DimX=%d, DimY=%d, DimZ=%d, #Loop_Iterations=%d, tile_size=%d\n", dimX, dimY, dimZ, numIters, tile_size); REAL *out = NULL; // 64 bytes aligned allocation. posix_memalign((void **) &out, 64, dimX * dimY * dimZ * numIters * sizeof(REAL) * 2); if (out == NULL) { printf("Memory allocation failed!\n"); return -1; } double iterTimes[NUM_TESTS_ITERS]; double mintime = 1e6; // Notice how the out1 and out2 pointers are calculated. REAL *out1 = out; REAL *out2 = out + dimX * dimY * dimZ * numIters; //-------------------------------------------------------------- // Initialize hStreams with the given StreamsPerDomain and // over-subscription level. CHECK_HSTR_RESULT(hStreams_app_init(streams_per_domain, oversubscription)); //-------------------------------------------------------------- //-------------------------------------------------------------- // Saving buffer length in a temporary variable to save computation. long long buffer_len = dimX * dimY * dimZ * numIters * sizeof(REAL) * 2; // Set up buffers. // Wrap out with a buffer. hStreams_app_create_buf((void *) out, buffer_len); register long int buff_length = dimY * dimZ * numIters * sizeof(REAL); //!!a Creating two dimX buffer addresses pointing inside out buffer. // out1_addr[i] points to the position of out1 buffer where // iteration i of the outer loop of the compute kernel writes. // out2_addr[i] points to the position of out2 buffer where // iteration i of the outer loop of the compute kernel writes. REAL *out1_addr[dimX]; REAL *out2_addr[dimX]; // Separation between out1 and out2. long int out_length = dimX * dimY * dimZ * numIters; for (int i = 0; i < dimX; i++) { //%% EXERCISE: go back to compute kernel and find out what is the value for // out1_addr[i] and out2_addr[i]. out1_addr[i] = out + i * dimY * dimZ * numIters; out2_addr[i] = out1_addr[i] + out_length; // You can also create buffer in this way instead of creating // out buffer as a whole // hStreams_app_create_buf((void *)out1_addr[i], buff_length); // hStreams_app_create_buf((void *)out2_addr[i], buff_length); } // Event pointers to support asynchronous function calls. HSTR_EVENT eout1[numIters][dimX], eout2[numIters][dimX], eout3[numIters][dimX], eout4[numIters][dimX], eout5[numIters][dimX]; //-------------------------------------------------------------- for (int i = 0; i < NUM_TESTS_ITERS; i++) { iterTimes[i] = GetTime(); //-------------------------------------------------------------- // Call device side API. // Prepare to perform computation on the sink-side. // Outer loop. // The body of the original i loop is enqueued on the streams in a round // robin fashion. for (int ii = 0; ii < dimX; ii++) { int stream = ii % streams_per_domain; //-------------------------------------------------------------- // Initialize data at the sink-side //!!b Initializes intermediate data at sink, only // the portion being worked on hStreams_app_memset(stream, out1_addr[ii], // source proxy address to write 0.5, buff_length, // number of bytes to send &eout4[i][ii]); // completion event //%% EXERCISE: Initialize out2_addr[ii] buffer similarly on the sink-side. hStreams_app_memset(stream, out2_addr[ii], // source proxy address to write 0.5, buff_length, // number of bytes to send &eout5[i][ii]); // completion event uint64_t args[8]; // Pack scalar arguments first, then heap args. //%% EXERCISE: Setup the heap args properly. args[0] = (uint64_t)(ii); args[1] = (uint64_t)(dimX); args[2] = (uint64_t)(dimY); args[3] = (uint64_t)(dimZ); args[4] = (uint64_t)(numIters); args[5] = (uint64_t)(tile_size); args[6] = (uint64_t)(out1_addr[ii]); args[7] = (uint64_t)(out2_addr[ii]); //-------------------------------------------------------------- hStreams_app_invoke(stream, // same idea "compute", // remote function name 6, // scalar arg 2, // heap args args, // array of args &eout1[i][ii], NULL, // return variable 0); //----------------------------------------------------------------- // Collect result. hStreams_app_xfer_memory(stream, out1_addr[ii], // source proxy address to write out1_addr[ii], // source proxy address to read buff_length, // number of bytes to send HSTR_SINK_TO_SRC, // transfer direction &eout2[i][ii]); // completion event //-------------------------------------------------------------- //!!c Transfer output data from sink to source //%% EXERCISE: Transfer data back for out2_addr[ii]. hStreams_app_xfer_memory(stream, out2_addr[ii], // source proxy address to write out2_addr[ii], // source proxy address to read buff_length, // number of bytes to send HSTR_SINK_TO_SRC, // transfer direction &eout3[i][ii]); // completion event } //-------------------------------------------------------------- // Synchronize. CHECK_HSTR_RESULT(hStreams_app_thread_sync()); //-------------------------------------------------------------- iterTimes[i] = GetTime() - iterTimes[i]; double result = out1[numIters / 2] + out2[numIters / 2]; printf("Test %d takes %.3lf ms with result %.3lf\n", i, iterTimes[i], result); if (iterTimes[i] < mintime) { mintime = iterTimes[i]; } } printf("Test's min time is %.3lf ms\n", mintime); //-------------------------------------------------------------- // Cleanup before exiting. CHECK_HSTR_RESULT(hStreams_app_fini()); //-------------------------------------------------------------- free(out); return 0; }
void cholesky_tiled(double *mat, int tile_size, int num_tiles, int mat_size, int niter, int max_log_str, bool layRow, int verify) { //total number of tiles int tot_tiles = num_tiles * num_tiles; //memory allocation for matrix for tiled-Cholesky double *A_my = (double *)malloc(mat_size * mat_size * sizeof(double)); //memory allocation for matrix for MKL cholesky (for comparison) double *A_MKL = (double *)malloc(mat_size * mat_size * sizeof(double)); //memory allocation for tiled matrix double **Asplit = new double* [tot_tiles]; int mem_size_tile = tile_size * tile_size * sizeof(double); for (int i = 0; i < tot_tiles; ++i) { //Buffer per tile, host allocation Asplit[i] = (double *)_mm_malloc(mem_size_tile, 64); //Buffer creation and allocation on the card hStreams_app_create_buf((void *)Asplit[i], mem_size_tile); } double tbegin, tend; int iter; int info; //Events are needed for various synchronizations to enforce //data dependence between and among data-transfers/computes HSTR_EVENT *eventcpyto = new HSTR_EVENT[tot_tiles]; HSTR_EVENT *eventcpyfr = new HSTR_EVENT[tot_tiles]; HSTR_EVENT *eventpotrf = new HSTR_EVENT[tot_tiles]; HSTR_EVENT *eventtrsm = new HSTR_EVENT[tot_tiles]; HSTR_EVENT *eventsyrk = new HSTR_EVENT[tot_tiles]; HSTR_EVENT *eventgemm = new HSTR_EVENT[tot_tiles]; //for timing tiled cholesky double *totTimeMsec = new double [niter]; //for timing MKL cholesky double *totTimeMsecMKL = new double [niter]; HSTR_RESULT res; //these queues are used for queining up compute on the card and //data transfers to/from the card. //q_trsm for dtrsm, q_potrf for dportf, q_syrk_gemm for both dsyrk and dgemm. //The queues are incremented by one for every compute queued and wrap //around the max_log_str available. This ensures good load-balancing. int q_trsm, q_potrf, q_syrk_gemm; CBLAS_ORDER blasLay; int lapackLay; if (layRow) { blasLay = CblasRowMajor; lapackLay = LAPACK_ROW_MAJOR; } else { blasLay = CblasColMajor; lapackLay = LAPACK_COL_MAJOR; } for (iter = 0; iter < niter; ++iter) { //copying matrices into separate variables for tiled cholesky (A_my) //and MKL cholesky (A_MKL) //The output overwrites the matrices and hence the need to copy //for each iteration copy_mat(mat, A_my, mat_size); copy_mat(mat, A_MKL, mat_size); unsigned int m, n, k; printf("\nIteration = %d\n", iter); split_into_blocks(A_my, Asplit, num_tiles, tile_size, mat_size, layRow); //beginning of timing tbegin = dtimeGet(); //splitting time included in the timing //This splits the input matrix into tiles (or blocks) //split_into_blocks(A_my, Asplit, num_tiles, tile_size, mat_size, layRow); q_potrf = 0; for (k = 0; k < num_tiles; ++k) { //POTRF //dpotrf is executed on the host on the diagonal tile //the results are then sent to the card if (k > 0) { hStreams_app_event_wait(1, &eventsyrk[k * num_tiles + k]); if (loc_verbose > 0) printf("Sending tile[%d][%d] to host in queue %d\n", k, k, (int)(q_potrf % max_log_str)) ; hStreams_app_xfer_memory(Asplit[k * num_tiles + k], Asplit[k * num_tiles + k], mem_size_tile, (int)(q_potrf % max_log_str), HSTR_SINK_TO_SRC, &eventcpyfr[k * num_tiles + k]); hStreams_app_event_wait(1, &eventcpyfr[k * num_tiles + k]); } if (loc_verbose > 0) { printf("Executing potrf on host for tile[%d][%d]\n", k, k); } info = LAPACKE_dpotrf(lapackLay, 'L', tile_size, Asplit[k * num_tiles + k], tile_size); if (k < num_tiles - 1) { if (loc_verbose > 0) printf("Sending tile[%d][%d] to card in queue %d\n", k, k, (int)(q_potrf % max_log_str)); hStreams_app_xfer_memory(Asplit[k * num_tiles + k], Asplit[k * num_tiles + k], mem_size_tile, (int)(q_potrf % max_log_str), HSTR_SRC_TO_SINK, &eventcpyto[k * num_tiles + k]); } q_potrf++; q_trsm = 0; for (m = k + 1; m < num_tiles; ++m) { if (k == 0) { if (loc_verbose > 0) printf("Sending tile[%d][%d] to card in queue %d\n", m, k, (int)(q_trsm % max_log_str)); hStreams_app_xfer_memory(Asplit[m * num_tiles + k], Asplit[m * num_tiles + k], mem_size_tile, (int)(q_trsm % max_log_str), HSTR_SRC_TO_SINK, &eventcpyto[m * num_tiles + k]); } //DTRSM hStreams_app_event_wait(1, &eventcpyto[k * num_tiles + k]); if (k > 0) { hStreams_app_event_wait(1, &eventgemm[m * num_tiles + k]); } //dtrsm is executed on the card if (loc_verbose > 0) printf("Executing trsm for tile[%d][%d] on card in queue %d\n", m, k, (int)(q_trsm % max_log_str)); res = hStreams_custom_dtrsm(blasLay, CblasRight, CblasLower, CblasTrans, CblasNonUnit, tile_size, tile_size, 1.0, Asplit[k * num_tiles + k], tile_size, Asplit[m * num_tiles + k], tile_size, (int)(q_trsm % max_log_str), &eventtrsm[m * num_tiles + k]); if (loc_verbose > 0) printf("Sending tile[%d][%d] back to host in queue %d\n", m, k, (int)(q_trsm % max_log_str)); hStreams_app_xfer_memory(Asplit[m * num_tiles + k], Asplit[m * num_tiles + k], mem_size_tile, (int)(q_trsm % max_log_str), HSTR_SINK_TO_SRC, &eventcpyfr[m * num_tiles + k]); q_trsm++; } q_syrk_gemm = 0; for (n = k + 1; n < num_tiles; ++n) { if (k == 0) { if (loc_verbose > 0) printf("Sending tile[%d][%d] to card in queue %d\n", n, n, (int)(q_syrk_gemm % max_log_str)); hStreams_app_xfer_memory(Asplit[n * num_tiles + n], Asplit[n * num_tiles + n], mem_size_tile, (int)(q_syrk_gemm % max_log_str), HSTR_SRC_TO_SINK, &eventcpyto[n * num_tiles + n]); } //DSYRK hStreams_app_event_wait(1, &eventtrsm[n * num_tiles + k]); if (k > 0) { hStreams_app_event_wait(1, &eventsyrk[n * num_tiles + n]); } //dsyrk is executed on the card if (loc_verbose > 0) printf("Executing syrk for tile[%d][%d] on card in queue %d\n", n, n, (int)(q_syrk_gemm % max_log_str)); res = hStreams_custom_dsyrk(blasLay, CblasLower, CblasNoTrans, tile_size, tile_size, -1.0, Asplit[n * num_tiles + k], tile_size, 1.0, Asplit[n * num_tiles + n], tile_size, (int)(q_syrk_gemm % max_log_str), &eventsyrk[n * num_tiles + n]); q_syrk_gemm++; for (m = n + 1; m < num_tiles; ++m) { if (k == 0) { if (loc_verbose > 0) printf("Sending tile[%d][%d] to card in queue %d\n", m, n, (int)(q_syrk_gemm % max_log_str)); hStreams_app_xfer_memory(Asplit[m * num_tiles + n], Asplit[m * num_tiles + n], mem_size_tile, (int)(q_syrk_gemm % max_log_str), HSTR_SRC_TO_SINK, &eventcpyto[m * num_tiles + n]); } //DGEMM hStreams_app_event_wait(1, &eventtrsm[m * num_tiles + k]); hStreams_app_event_wait(1, &eventtrsm[n * num_tiles + k]); if (k > 0) { hStreams_app_event_wait(1, &eventgemm[m * num_tiles + n]); } //dgemm is executed on the card if (loc_verbose > 0) printf("Executing gemm for tile[%d][%d] on card in queue %d\n", m, n, (int)(q_syrk_gemm % max_log_str)); res = hStreams_app_dgemm(blasLay, CblasNoTrans, CblasTrans, tile_size, tile_size, tile_size, -1.0, Asplit[m * num_tiles + k], tile_size, Asplit[n * num_tiles + k], tile_size, 1.0, Asplit[m * num_tiles + n], tile_size, (int)(q_syrk_gemm % max_log_str), &eventgemm[m * num_tiles + n]); q_syrk_gemm++; } } } //syncrhonizing all the streams hStreams_app_thread_sync(); //end of timing tend = dtimeGet(); totTimeMsec[iter] = 1e3 * (tend - tbegin); printf("time for Tiled hstreams Cholesky for iteration %d = %.2f msec\n", iter, totTimeMsec[iter]); //assembling of tiles back into full matrix assemble(Asplit, A_my, num_tiles, tile_size, mat_size, layRow); //calling mkl cholesky for verification and timing comparison. //Using auto-offload feature of MKL #ifndef _WIN32 //FIXME: calling this function causes a crash on Windows mkl_mic_enable(); #endif tbegin = dtimeGet(); //calling MKL dpotrf on the full matrix info = LAPACKE_dpotrf(lapackLay, 'L', mat_size, A_MKL, mat_size); tend = dtimeGet(); totTimeMsecMKL[iter] = 1e3 * (tend - tbegin); printf("time for MKL Cholesky (AO) for iteration %d = %.2f msec\n", iter, totTimeMsecMKL[iter]); if (info != 0) { printf("error with dpotrf\n"); } mkl_mic_disable(); if (verify == 1) { bool result = verify_results(A_my, A_MKL, mat_size * mat_size); if (result == true) { printf("Tiled Cholesky successful\n"); } else { printf("Tiled Chloesky failed\n"); } } } double meanTimeMsec, stdDevMsec; double meanTimeMsecMKL, stdDevMsecMKL; mean_and_stdev(totTimeMsec, meanTimeMsec, stdDevMsec, niter); mean_and_stdev(totTimeMsecMKL, meanTimeMsecMKL, stdDevMsecMKL, niter); double gflops = pow(mat_size, 3.0) / 3.0 * 1e-9; printf("\nMatrix size = %d\n", mat_size); printf("Tiled hStreams Cholesky: for %d iterations (ignoring first),\n" "mean Time = %.2f msec, stdDev Time = %.2f msec,\n" "Mean Gflops (using mean Time) = %.2f\n", niter - 1, meanTimeMsec, stdDevMsec, gflops / (meanTimeMsec * 1e-3)); printf("\nMKL AO Cholesky: for %d iterations (ignoring first),\n" "mean Time = %.2f msec, stdDev Time = %.2f msec,\n" "Mean Gflops (using meanTime) = %.2f\n\n", niter - 1, meanTimeMsecMKL, stdDevMsecMKL, gflops / (meanTimeMsecMKL * 1e-3)); //Free free(A_my); free(A_MKL); for (int i = 0; i < tot_tiles; ++i) { _mm_free(Asplit[i]); } delete [] Asplit; delete [] eventcpyto; delete [] eventcpyfr; delete [] eventpotrf; delete [] eventtrsm; delete [] eventsyrk; delete [] eventgemm; delete [] totTimeMsec; delete [] totTimeMsecMKL; }