void run(size_t size, hipStream_t stream1, hipStream_t stream2){ float *Ah, *Bh, *Cd, *Dd, *Eh; float *Ahh, *Bhh, *Cdd, *Ddd, *Ehh; HIPCHECK(hipHostMalloc((void**)&Ah, size, hipHostMallocDefault)); HIPCHECK(hipHostMalloc((void**)&Bh, size, hipHostMallocDefault)); HIPCHECK(hipMalloc(&Cd, size)); HIPCHECK(hipMalloc(&Dd, size)); HIPCHECK(hipHostMalloc((void**)&Eh, size, hipHostMallocDefault)); HIPCHECK(hipHostMalloc((void**)&Ahh, size, hipHostMallocDefault)); HIPCHECK(hipHostMalloc((void**)&Bhh, size, hipHostMallocDefault)); HIPCHECK(hipMalloc(&Cdd, size)); HIPCHECK(hipMalloc(&Ddd, size)); HIPCHECK(hipHostMalloc((void**)&Ehh, size, hipHostMallocDefault)); HIPCHECK(hipMemcpyAsync(Bh, Ah, size, hipMemcpyHostToHost, stream1)); HIPCHECK(hipMemcpyAsync(Bhh, Ahh, size, hipMemcpyHostToHost, stream2)); HIPCHECK(hipMemcpyAsync(Cd, Bh, size, hipMemcpyHostToDevice, stream1)); HIPCHECK(hipMemcpyAsync(Cdd, Bhh, size, hipMemcpyHostToDevice, stream2)); hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream1, Cd); hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream2, Cdd); HIPCHECK(hipMemcpyAsync(Dd, Cd, size, hipMemcpyDeviceToDevice, stream1)); HIPCHECK(hipMemcpyAsync(Ddd, Cdd, size, hipMemcpyDeviceToDevice, stream2)); HIPCHECK(hipMemcpyAsync(Eh, Dd, size, hipMemcpyDeviceToHost, stream1)); HIPCHECK(hipMemcpyAsync(Ehh, Ddd, size, hipMemcpyDeviceToHost, stream2)); HIPCHECK(hipDeviceSynchronize()); HIPASSERT(Eh[10] = Ah[10] + 1.0f); HIPASSERT(Ehh[10] = Ahh[10] + 1.0f); }
int main() { float *Ad; hipMalloc((void**)&Ad, 1024); // Test the different hipLaunchParm options: hipLaunchKernel(vAdd, size_t(1024), 1, 0, 0, Ad); hipLaunchKernel(vAdd, 1024, dim3(1), 0, 0, Ad); hipLaunchKernel(vAdd, dim3(1024), 1, 0, 0, Ad); hipLaunchKernel(vAdd, dim3(1024), dim3(1), 0, 0, Ad); // Test case with hipLaunchKernel inside another macro: float e0; GPU_PRINT_TIME (hipLaunchKernel(vAdd, dim3(1024), dim3(1), 0, 0, Ad), e0, j); GPU_PRINT_TIME (WRAP(hipLaunchKernel(vAdd, dim3(1024), dim3(1), 0, 0, Ad)), e0, j); #ifdef EXTRA_PARENS_1 // Don't wrap hipLaunchKernel in extra set of parens: GPU_PRINT_TIME ((hipLaunchKernel(vAdd, dim3(1024), dim3(1), 0, 0, Ad)), e0, j); #endif MY_LAUNCH (hipLaunchKernel(vAdd, dim3(1024), dim3(1), 0, 0, Ad), true, "firstCall"); float *A; float e1; MY_LAUNCH_WITH_PAREN (hipMalloc(&A, 100), true, "launch2"); #ifdef EXTRA_PARENS_2 //MY_LAUNCH_WITH_PAREN wraps cmd in () which can cause issues. MY_LAUNCH_WITH_PAREN (hipLaunchKernel(vAdd, dim3(1024), dim3(1), 0, 0, Ad), true, "firstCall"); #endif passed(); }
int test_gl2(size_t N) { size_t Nbytes = N*sizeof(int); int *A_d, *B_d, *C_d; int *A_h, *B_h, *C_h; HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); // Full vadd in one large chunk, to get things started: HIPCHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); HIPCHECK ( hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); hipLaunchKernel(vectorADD2, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N); HIPCHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); HIPCHECK (hipDeviceSynchronize()); HipTest::checkVectorADD(A_h, B_h, C_h, N); return 0; }
bool run_erfinv(){ double *A, *Ad, *B, *Bd; A = new double[N]; B = new double[N]; for(int i=0;i<N;i++){ A[i] = -0.6; B[i] = 0.0; } hipMalloc((void**)&Ad, SIZE); hipMalloc((void**)&Bd, SIZE); hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice); hipLaunchKernel(test_erfinv, dim3(1), dim3(N), 0, 0, Ad, Bd); hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost); int passed = 0; for(int i=0;i<512;i++){ if(B[i] - A[i] < 0.000001){ passed = 1; } } free(A); if(passed == 1){ return true; } assert(passed == 1); return false; }
int main(int argc, char *argv[]) { int warpSize, pshift; hipDeviceProp_t devProp; hipDeviceGetProperties(&devProp, 0); if(strncmp(devProp.name,"Fiji",1)==0) {warpSize =64; pshift =6;} else {warpSize =32; pshift =5;} unsigned int Num_Threads_per_Block = 512; unsigned int Num_Blocks_per_Grid = 1; unsigned int Num_Warps_per_Block = Num_Threads_per_Block/warpSize; unsigned int Num_Warps_per_Grid = (Num_Threads_per_Block*Num_Blocks_per_Grid)/warpSize; unsigned int* host_ballot = (unsigned int*)malloc(Num_Warps_per_Grid*sizeof(unsigned int)); unsigned int* device_ballot; HIP_ASSERT(hipMalloc((void**)&device_ballot, Num_Warps_per_Grid*sizeof(unsigned int))); int divergent_count =0; for (int i=0; i<Num_Warps_per_Grid; i++) host_ballot[i] = 0; HIP_ASSERT(hipMemcpy(device_ballot, host_ballot, Num_Warps_per_Grid*sizeof(unsigned int), hipMemcpyHostToDevice)); hipLaunchKernel(gpu_ballot, dim3(Num_Blocks_per_Grid),dim3(Num_Threads_per_Block),0,0, device_ballot,Num_Warps_per_Block,pshift); HIP_ASSERT(hipMemcpy(host_ballot, device_ballot, Num_Warps_per_Grid*sizeof(unsigned int), hipMemcpyDeviceToHost)); for (int i=0; i<Num_Warps_per_Grid; i++) { if ((host_ballot[i] == 0)||(host_ballot[i]/warpSize == warpSize)) std::cout << "Warp " << i << " IS convergent- Predicate true for " << host_ballot[i]/warpSize << " threads\n"; else {std::cout << "Warp " << i << " IS divergent - Predicate true for " << host_ballot[i]/warpSize<< " threads\n"; divergent_count++;} } if (divergent_count==1) printf("PASSED\n"); else printf("FAILED\n"); return EXIT_SUCCESS; }
bool run_sincos(){ double *A, *Ad, *B, *C, *Bd, *Cd; A = new double[N]; B = new double[N]; C = new double[N]; for(int i=0;i<N;i++){ A[i] = 1.0; } hipMalloc((void**)&Ad, SIZE); hipMalloc((void**)&Bd, SIZE); hipMalloc((void**)&Cd, SIZE); hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice); hipLaunchKernel(test_sincos, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd); hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost); hipMemcpy(C, Cd, SIZE, hipMemcpyDeviceToHost); int passed = 0; for(int i=0;i<512;i++){ if(B[i] == sin(1.0)){ passed = 1; } } passed = 0; for(int i=0;i<512;i++){ if(C[i] == cos(1.0)){ passed = 1; } } free(A); if(passed == 1){ return true; } assert(passed == 1); return false; }
bool run_rnorm3d(){ double *A, *Ad, *B, *Bd, *C, *Cd, *D, *Dd; A = new double[N]; B = new double[N]; C = new double[N]; D = new double[N]; double val = 0.0; for(int i=0;i<N;i++){ A[i] = 1.0; B[i] = 2.0; C[i] = 3.0; } val = 1/sqrt(1.0 + 4.0 + 9.0); hipMalloc((void**)&Ad, SIZE); hipMalloc((void**)&Bd, SIZE); hipMalloc((void**)&Cd, SIZE); hipMalloc((void**)&Dd, SIZE); hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice); hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice); hipMemcpy(Cd, C, SIZE, hipMemcpyHostToDevice); hipLaunchKernel(test_rnorm3d, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd, Dd); hipMemcpy(D, Dd, SIZE, hipMemcpyDeviceToHost); int passed = 0; for(int i=0;i<512;i++){ if(D[i] - val < 0.000001){ passed = 1; } } free(A); if(passed == 1){ return true; } assert(passed == 1); return false; }
bool run_lround(){ double *A, *Ad; long int *B, *Bd; A = new double[N]; B = new long int[N]; for(int i=0;i<N;i++){ A[i] = 1.345; } hipMalloc((void**)&Ad, SIZE); hipMalloc((void**)&Bd, N*sizeof(long int)); hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice); hipLaunchKernel(test_lround, dim3(1), dim3(N), 0, 0, Ad, Bd); hipMemcpy(B, Bd, N*sizeof(long int), hipMemcpyDeviceToHost); int passed = 0; for(int i=0;i<512;i++){ long int x = round(A[i]); if(B[i] == x){ passed = 1; } } free(A); if(passed == 1){ return true; } assert(passed == 1); return false; }
bool run_rnorm(){ double *A, *Ad, *B, *Bd; A = new double[N]; B = new double[N]; double val = 0.0; for(int i=0;i<N;i++){ A[i] = 1.0; B[i] = 0.0; val += 1.0; } val = 1/sqrt(val); hipMalloc((void**)&Ad, SIZE); hipMalloc((void**)&Bd, SIZE); hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice); hipLaunchKernel(test_rnorm, dim3(1), dim3(N), 0, 0, Ad, Bd); hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost); int passed = 0; for(int i=0;i<512;i++){ if(B[0] - val < 0.000001){ passed = 1; } } free(A); if(passed == 1){ return true; } assert(passed == 1); return false; }
int main(){ int A=0, *Ad; hipMalloc((void**)&Ad, SIZE); hipMemcpy(Ad, &A, SIZE, hipMemcpyHostToDevice); hipLaunchKernel(HIP_KERNEL_NAME(Iter), dim3(1), dim3(1), 0, 0, Ad); hipMemcpy(&A, Ad, SIZE, hipMemcpyDeviceToHost); }
void runbench(double *cd, long size){ if( memory_ratio>UNROLL_ITERATIONS ){ fprintf(stderr, "ERROR: memory_ratio exceeds UNROLL_ITERATIONS\n"); exit(1); } const long compute_grid_size = size/(UNROLLED_MEMORY_ACCESSES)/2; const int BLOCK_SIZE = 256; const int TOTAL_BLOCKS = compute_grid_size/BLOCK_SIZE; const long long computations = 2*(long long)(COMP_ITERATIONS)*REGBLOCK_SIZE*compute_grid_size; const long long memoryoperations = (long long)(COMP_ITERATIONS)*compute_grid_size; dim3 dimBlock(BLOCK_SIZE, 1, 1); dim3 dimGrid(TOTAL_BLOCKS, 1, 1); hipEvent_t start, stop; initializeEvents(&start, &stop); hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func< float, BLOCK_SIZE, memory_ratio >), dim3(dimGrid), dim3(dimBlock ), 0, 0, 1.0f, (float*)cd); float kernel_time_mad_sp = finalizeEvents(start, stop); initializeEvents(&start, &stop); hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func< double, BLOCK_SIZE, memory_ratio >), dim3(dimGrid), dim3(dimBlock ), 0, 0, 1.0, cd); float kernel_time_mad_dp = finalizeEvents(start, stop); initializeEvents(&start, &stop); hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func< int, BLOCK_SIZE, memory_ratio >), dim3(dimGrid), dim3(dimBlock ), 0, 0, 1, (int*)cd); float kernel_time_mad_int = finalizeEvents(start, stop); const double memaccesses_ratio = (double)(memory_ratio)/UNROLL_ITERATIONS; const double computations_ratio = 1.0-memaccesses_ratio; printf(" %4d, %8.3f,%8.2f,%8.2f,%7.2f, %8.3f,%8.2f,%8.2f,%7.2f, %8.3f,%8.2f,%8.2f,%7.2f\n", UNROLL_ITERATIONS-memory_ratio, (computations_ratio*(double)computations)/(memaccesses_ratio*(double)memoryoperations*sizeof(float)), kernel_time_mad_sp, (computations_ratio*(double)computations)/kernel_time_mad_sp*1000./(double)(1000*1000*1000), (memaccesses_ratio*(double)memoryoperations*sizeof(float))/kernel_time_mad_sp*1000./(1000.*1000.*1000.), (computations_ratio*(double)computations)/(memaccesses_ratio*(double)memoryoperations*sizeof(double)), kernel_time_mad_dp, (computations_ratio*(double)computations)/kernel_time_mad_dp*1000./(double)(1000*1000*1000), (memaccesses_ratio*(double)memoryoperations*sizeof(double))/kernel_time_mad_dp*1000./(1000.*1000.*1000.), (computations_ratio*(double)computations)/(memaccesses_ratio*(double)memoryoperations*sizeof(int)), kernel_time_mad_int, (computations_ratio*(double)computations)/kernel_time_mad_int*1000./(double)(1000*1000*1000), (memaccesses_ratio*(double)memoryoperations*sizeof(int))/kernel_time_mad_int*1000./(1000.*1000.*1000.) ); }
int main(int argc, char *argv[]) { int warpSize, pshift; hipDeviceProp_t devProp; hipGetDeviceProperties(&devProp, 0); if(strncmp(devProp.name,"Fiji",1)==0) { warpSize =64; pshift =6; } else {warpSize =32; pshift=5;} int anycount =0; int allcount =0; int Num_Threads_per_Block = 1024; int Num_Blocks_per_Grid = 1; int Num_Warps_per_Block = Num_Threads_per_Block/warpSize; int Num_Warps_per_Grid = (Num_Threads_per_Block*Num_Blocks_per_Grid)/warpSize; int * host_any = ( int*)malloc(Num_Warps_per_Grid*sizeof(int)); int * host_all = ( int*)malloc(Num_Warps_per_Grid*sizeof(int)); int *device_any; int *device_all; HIP_ASSERT(hipMalloc((void**)&device_any,Num_Warps_per_Grid*sizeof( int))); HIP_ASSERT(hipMalloc((void**)&device_all,Num_Warps_per_Grid*sizeof(int))); for (int i=0; i<Num_Warps_per_Grid; i++) { host_any[i] = 0; host_all[i] = 0; } HIP_ASSERT(hipMemcpy(device_any, host_any,sizeof(int), hipMemcpyHostToDevice)); HIP_ASSERT(hipMemcpy(device_all, host_all,sizeof(int), hipMemcpyHostToDevice)); hipLaunchKernel(warpvote, dim3(Num_Blocks_per_Grid),dim3(Num_Threads_per_Block),0,0, device_any, device_all ,Num_Warps_per_Block,pshift); HIP_ASSERT(hipMemcpy(host_any, device_any, Num_Warps_per_Grid*sizeof(int), hipMemcpyDeviceToHost)); HIP_ASSERT(hipMemcpy(host_all, device_all, Num_Warps_per_Grid*sizeof(int), hipMemcpyDeviceToHost)); for (int i=0; i<Num_Warps_per_Grid; i++) { printf("warp no. %d __any = %d \n",i,host_any[i]); printf("warp no. %d __all = %d \n",i,host_all[i]); if (host_all[i]!=1) ++allcount; #if defined (__HIP_PLATFORM_HCC__) && !defined ( NVCC_COMPAT ) if (host_any[i]!=64) ++anycount; #else if (host_any[i]!=1) ++anycount; #endif } #if defined (__HIP_PLATFORM_HCC__) && !defined ( NVCC_COMPAT ) if (anycount == 1 && allcount ==1) printf("PASSED\n"); else printf("FAILED\n"); #else if (anycount == 0 && allcount ==1) printf("PASSED\n"); else printf("FAILED\n"); #endif return EXIT_SUCCESS; }
int main(){ int A=0, *Ad; hipMalloc((void**)&Ad, SIZE); hipMemcpy(Ad, &A, SIZE, hipMemcpyHostToDevice); dim3 dimGrid, dimBlock; dimGrid.x = 1, dimGrid.y =1, dimGrid.z = 1; dimBlock.x = 1, dimBlock.y = 1, dimGrid.z = 1; hipLaunchKernel(HIP_KERNEL_NAME(Iter), dimGrid, dimBlock, 0, 0, Ad); hipMemcpy(&A, Ad, SIZE, hipMemcpyDeviceToHost); }
void runbench_warmup(double *cd, long size){ const long reduced_grid_size = size/(UNROLLED_MEMORY_ACCESSES)/32; const int BLOCK_SIZE = 256; const int TOTAL_REDUCED_BLOCKS = reduced_grid_size/BLOCK_SIZE; dim3 dimBlock(BLOCK_SIZE, 1, 1); dim3 dimReducedGrid(TOTAL_REDUCED_BLOCKS, 1, 1); hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func< short, BLOCK_SIZE, 0 >), dim3(dimReducedGrid), dim3(dimBlock ), 0, 0, (short)1, (short*)cd); CUDA_SAFE_CALL( hipGetLastError() ); CUDA_SAFE_CALL( hipDeviceSynchronize() ); }
void runbench(double *cd, long size){ const long compute_grid_size = size/ELEMENTS_PER_THREAD; const int BLOCK_SIZE = 256; const int TOTAL_BLOCKS = compute_grid_size/BLOCK_SIZE; const long long computations = ELEMENTS_PER_THREAD*(long long)compute_grid_size+(2*ELEMENTS_PER_THREAD*compute_iterations)*(long long)compute_grid_size; const long long memoryoperations = size; dim3 dimBlock(BLOCK_SIZE, 1, 1); dim3 dimGrid(TOTAL_BLOCKS, 1, 1); hipEvent_t start, stop; initializeEvents(&start, &stop); hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func< float, BLOCK_SIZE, ELEMENTS_PER_THREAD, compute_iterations >), dim3(dimGrid), dim3(dimBlock ), 0, 0, 1.0f, (float*)cd); float kernel_time_mad_sp = finalizeEvents(start, stop); initializeEvents(&start, &stop); hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func< double, BLOCK_SIZE, ELEMENTS_PER_THREAD, compute_iterations >), dim3(dimGrid), dim3(dimBlock ), 0, 0, 1.0, cd); float kernel_time_mad_dp = finalizeEvents(start, stop); initializeEvents(&start, &stop); hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func< int, BLOCK_SIZE, ELEMENTS_PER_THREAD, compute_iterations >), dim3(dimGrid), dim3(dimBlock ), 0, 0, 1, (int*)cd); float kernel_time_mad_int = finalizeEvents(start, stop); printf(" %4d, %8.3f,%8.2f,%8.2f,%7.2f, %8.3f,%8.2f,%8.2f,%7.2f, %8.3f,%8.2f,%8.2f,%7.2f\n", compute_iterations, ((double)computations)/((double)memoryoperations*sizeof(float)), kernel_time_mad_sp, ((double)computations)/kernel_time_mad_sp*1000./(double)(1000*1000*1000), ((double)memoryoperations*sizeof(float))/kernel_time_mad_sp*1000./(1000.*1000.*1000.), ((double)computations)/((double)memoryoperations*sizeof(double)), kernel_time_mad_dp, ((double)computations)/kernel_time_mad_dp*1000./(double)(1000*1000*1000), ((double)memoryoperations*sizeof(double))/kernel_time_mad_dp*1000./(1000.*1000.*1000.), ((double)computations)/((double)memoryoperations*sizeof(int)), kernel_time_mad_int, ((double)computations)/kernel_time_mad_int*1000./(double)(1000*1000*1000), ((double)memoryoperations*sizeof(int))/kernel_time_mad_int*1000./(1000.*1000.*1000.) ); }
void runTest(int argc, char **argv) { hipDeviceProp_t deviceProp; deviceProp.major = 0; deviceProp.minor = 0; int dev = 0; hipDeviceGetProperties(&deviceProp, dev); // Statistics about the GPU device printf("> GPU device has %d Multi-Processors, " "SM %d.%d compute capabilities\n\n", deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor); int version = (deviceProp.major * 0x10 + deviceProp.minor); unsigned int numThreads = 256; unsigned int numBlocks = 64; unsigned int numData = 11; unsigned int memSize = sizeof(int) * numData; //allocate mem for the result on host side int *hOData = (int *) malloc(memSize); //initialize the memory for (unsigned int i = 0; i < numData; i++) hOData[i] = 0; //To make the AND and XOR tests generate something other than 0... hOData[8] = hOData[10] = 0xff; // allocate device memory for result int *dOData; hipMalloc((void **) &dOData, memSize); // copy host memory to device to initialize to zero hipMemcpy(dOData, hOData, memSize,hipMemcpyHostToDevice); // execute the kernel hipLaunchKernel(testKernel, dim3(numBlocks), dim3(numThreads), 0, 0, dOData); //Copy result from device to host hipMemcpy(hOData,dOData, memSize,hipMemcpyDeviceToHost); // Compute reference solution testResult = computeGold(hOData, numThreads * numBlocks); // Cleanup memory free(hOData); hipFree(dOData); }
int main() { float *A, *Ad; for(int i=0;i<len;i++) { A[i] = 1.0f; } Ad = (float*)mallocHip(size); memcpyHipH2D(Ad, A, size); hipLaunchKernel(HIP_KERNEL_NAME(Kern), dim3(len/1024), dim3(1024), 0, 0, A); memcpyHipD2H(A, Ad, size); for(int i=0;i<len;i++) { assert(A[i] == 2.0f); } }
int main(int argc, char *argv[]) { float *A_d, *C_d; float *A_h, *C_h; size_t N = 1000000; size_t Nbytes = N * sizeof(float); hipDeviceProp_t props; CHECK(hipDeviceGetProperties(&props, 0/*deviceID*/)); printf ("info: running on device %s\n", props.name); printf ("info: allocate host mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0); A_h = (float*)malloc(Nbytes); CHECK(A_h == 0 ? hipErrorMemoryAllocation : hipSuccess ); C_h = (float*)malloc(Nbytes); CHECK(C_h == 0 ? hipErrorMemoryAllocation : hipSuccess ); // Fill with Phi + i for (size_t i=0; i<N; i++) { A_h[i] = 1.618f + i; } printf ("info: allocate device mem (%6.2f MB)\n", 2*Nbytes/1024.0/1024.0); CHECK(hipMalloc(&A_d, Nbytes)); CHECK(hipMalloc(&C_d, Nbytes)); printf ("info: copy Host2Device\n"); CHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); const unsigned blocks = 512; const unsigned threadsPerBlock = 256; printf ("info: launch 'vector_square' kernel\n"); hipLaunchKernel(HIP_KERNEL_NAME(vector_square), dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N); printf ("info: copy Device2Host\n"); CHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); printf ("info: check result\n"); for (size_t i=0; i<N; i++) { if (C_h[i] != A_h[i] * A_h[i]) { CHECK(hipErrorUnknown); } } printf ("PASSED!\n"); }
int main(){ setup(); int *A, *Ad; for(int i=0;i<NUM_SIZE;i++){ A = (int*)malloc(size[i]); valSet(A, 1, size[i]); hipMalloc(&Ad, size[i]); std::cout<<"Malloc success at size: "<<size[i]<<std::endl; for(int j=0;j<NUM_ITER;j++){ std::cout<<"Iter: "<<j<<std::endl; hipMemcpy(Ad, A, size[i], hipMemcpyHostToDevice); hipLaunchKernel(Add, dim3(1), dim3(size[i]/sizeof(int)), 0, 0, Ad); hipMemcpy(A, Ad, size[i], hipMemcpyDeviceToHost); } hipDeviceSynchronize(); } }
int main(){ float *A, *Ad, *B, *Bd, *C, *Cd; A = new float[LEN]; B = new float[LEN]; C = new float[LEN]; for(uint32_t i=0;i<LEN;i++){ A[i] = i*1.0f; B[i] = i*1.0f; C[i] = i*1.0f; } hipMalloc((void**)&Ad, SIZE); hipMalloc((void**)&Bd, SIZE); hipMalloc((void**)&Cd, SIZE); hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice); hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice); hipLaunchKernel(getSqAbs, dim3(1), dim3(LEN), 0, 0, Ad, Bd, Cd); hipMemcpy(C, Cd, SIZE, hipMemcpyDeviceToHost); std::cout<<A[11]<<" "<<B[11]<<" "<<C[11]<<std::endl; }
void run1(size_t size, hipStream_t stream){ float *Ah, *Bh, *Cd, *Dd, *Eh; HIPCHECK(hipHostMalloc((void**)&Ah, size, hipHostMallocDefault)); HIPCHECK(hipHostMalloc((void**)&Bh, size, hipHostMallocDefault)); HIPCHECK(hipMalloc(&Cd, size)); HIPCHECK(hipMalloc(&Dd, size)); HIPCHECK(hipHostMalloc((void**)&Eh, size, hipHostMallocDefault)); for(int i=0;i<N;i++){ Ah[i] = 1.0f; } HIPCHECK(hipMemcpyAsync(Bh, Ah, size, hipMemcpyHostToHost, stream)); HIPCHECK(hipMemcpyAsync(Cd, Bh, size, hipMemcpyHostToDevice, stream)); hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Cd); HIPCHECK(hipMemcpyAsync(Dd, Cd, size, hipMemcpyDeviceToDevice, stream)); HIPCHECK(hipMemcpyAsync(Eh, Dd, size, hipMemcpyDeviceToHost, stream)); HIPCHECK(hipDeviceSynchronize()); HIPASSERT(Eh[10] == Ah[10] + 1.0f); }
int main(int argc, char *argv[]) { int Num_Threads_per_Block = 1024; int Num_Blocks_per_Grid = 1; int Num_Warps_per_Block = Num_Threads_per_Block/64; int Num_Warps_per_Grid = (Num_Threads_per_Block*Num_Blocks_per_Grid)/64; int * host_any = ( int*)malloc(Num_Warps_per_Grid*sizeof(int)); int * host_all = ( int*)malloc(Num_Warps_per_Grid*sizeof(int)); int *device_any; int *device_all; HIP_ASSERT(hipMalloc((void**)&device_any,Num_Warps_per_Grid*sizeof( int))); HIP_ASSERT(hipMalloc((void**)&device_all,Num_Warps_per_Grid*sizeof(int))); for (int i=0; i<Num_Warps_per_Grid; i++) { host_any[i] = 0; host_all[i] = 0; } HIP_ASSERT(hipMemcpy(device_any, host_any,sizeof(int), hipMemcpyHostToDevice)); HIP_ASSERT(hipMemcpy(device_all, host_all,sizeof(int), hipMemcpyHostToDevice)); hipLaunchKernel(warpvote, dim3(Num_Blocks_per_Grid),dim3(Num_Threads_per_Block),0,0, device_any, device_all ,Num_Warps_per_Block); HIP_ASSERT(hipMemcpy(host_any, device_any, Num_Warps_per_Grid*sizeof(int), hipMemcpyDeviceToHost)); HIP_ASSERT(hipMemcpy(host_all, device_all, Num_Warps_per_Grid*sizeof(int), hipMemcpyDeviceToHost)); for (int i=0; i<Num_Warps_per_Grid; i++) { printf("warp no. %d __any = %d \n",i,host_any[i]); printf("warp no. %d __all = %d \n",i,host_all[i]); } return EXIT_SUCCESS; }
void memcpy2Dtest(size_t numW, size_t numH, bool usePinnedHost) { size_t width = numW * sizeof(T); size_t sizeElements = width * numH; printf("memcpy2Dtest: %s<%s> size=%lu (%6.2fMB) W: %d, H:%d, usePinnedHost: %d\n", __func__, TYPENAME(T), sizeElements, sizeElements/1024.0/1024.0, (int)numW, (int)numH, usePinnedHost); T *A_d, *B_d, *C_d; T *A_h, *B_h, *C_h; size_t pitch_A, pitch_B, pitch_C; hipChannelFormatDesc desc = hipCreateChannelDesc<T>(); HipTest::initArrays2DPitch(&A_d, &B_d, &C_d, &pitch_A, &pitch_B, &pitch_C, numW, numH); HipTest::initArraysForHost(&A_h, &B_h, &C_h, numW*numH, usePinnedHost); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numW*numH); HIPCHECK (hipMemcpy2D (A_d, pitch_A, A_h, width, width, numH, hipMemcpyHostToDevice) ); HIPCHECK (hipMemcpy2D (B_d, pitch_B, B_h, width, width, numH, hipMemcpyHostToDevice) ); hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, (pitch_C/sizeof(T))*numH); HIPCHECK (hipMemcpy2D (C_h, width, C_d, pitch_C, width, numH, hipMemcpyDeviceToHost) ); HIPCHECK ( hipDeviceSynchronize() ); HipTest::checkVectorADD(A_h, B_h, C_h, numW*numH); HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, usePinnedHost); printf (" %s success\n", __func__); }
int main(){ hipError_t err; float *A, *Ad; A = new float[LEN]; for(int i=0;i<LEN;i++){ A[i] = 1.0f; } hipStream_t stream; err = hipStreamCreate(&stream); check("Creating stream",err); err = hipMalloc(&Ad, SIZE); check("Allocating Ad memory on device", err); err = hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice); check("Doing memory copy from A to Ad", err); float mS = 0; hipEvent_t start, stop; hipEventCreate(&start); hipEventCreate(&stop); ResultDatabase resultDB[8]; hipEventRecord(start); hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, 0, Ad); hipEventRecord(stop); hipEventElapsedTime(&mS, start, stop); resultDB[0].AddResult(std::string("First Kernel Launch"), "", "uS", mS*1000); // std::cout<<"First Kernel Launch: \t\t"<<mS*1000<<" uS"<<std::endl; resultDB[0].DumpSummary(std::cout); hipEventRecord(start); hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, 0, Ad); hipEventRecord(stop); hipEventElapsedTime(&mS, start, stop); resultDB[1].AddResult(std::string("Second Kernel Launch"), "", "uS", mS*1000); // std::cout<<"Second Kernel Launch: \t\t"<<mS*1000<<" uS"<<std::endl; resultDB[1].DumpSummary(std::cout); hipEventRecord(start); for(int i=0;i<ITER;i++){ hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, 0, Ad); } hipDeviceSynchronize(); hipEventRecord(stop); hipEventElapsedTime(&mS, start, stop); resultDB[2].AddResult(std::string("NULL Stream Sync dispatch wait"), "", "uS", mS*1000/ITER); resultDB[2].DumpSummary(std::cout); // std::cout<<"NULL Stream Sync dispatch wait: \t"<<mS*1000/ITER<<" uS"<<std::endl; hipDeviceSynchronize(); hipEventRecord(start); for(int i=0;i<ITER;i++){ hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, 0, Ad); } hipEventRecord(stop); hipDeviceSynchronize(); hipEventElapsedTime(&mS, start, stop); resultDB[3].AddResult(std::string("NULL Stream Async dispatch wait"), "", "uS", mS*1000/ITER); resultDB[3].DumpSummary(std::cout); // std::cout<<"NULL Stream Async dispatch wait: \t"<<mS*1000/ITER<<" uS"<<std::endl; hipDeviceSynchronize(); hipEventRecord(start); for(int i=0;i<ITER;i++){ hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, stream, Ad); hipDeviceSynchronize(); } hipEventRecord(stop); hipEventElapsedTime(&mS, start, stop); resultDB[4].AddResult(std::string("Stream Sync dispatch wait"), "", "uS", mS*1000/ITER); resultDB[4].DumpSummary(std::cout); // std::cout<<"Stream Sync dispatch wait: \t\t"<<mS*1000/ITER<<" uS"<<std::endl; hipDeviceSynchronize(); hipEventRecord(start); for(int i=0;i<ITER;i++){ hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, stream, Ad); } hipDeviceSynchronize(); hipEventRecord(stop); hipEventElapsedTime(&mS, start, stop); resultDB[5].AddResult(std::string("Stream Async dispatch wait"), "", "uS", mS*1000/ITER); // std::cout<<"Stream Async dispatch wait: \t\t"<<mS*1000/ITER<<" uS"<<std::endl; resultDB[5].DumpSummary(std::cout); hipDeviceSynchronize(); hipEventRecord(start); for(int i=0;i<ITER;i++){ hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, 0, Ad); } hipEventRecord(stop); hipEventElapsedTime(&mS, start, stop); resultDB[6].AddResult(std::string("NULL Stream No Wait"), "", "uS", mS*1000/ITER); resultDB[6].DumpSummary(std::cout); // std::cout<<"NULL Stream Dispatch No Wait: \t\t"<<mS*1000/ITER<<" uS"<<std::endl; hipDeviceSynchronize(); hipEventRecord(start); for(int i=0;i<ITER;i++){ hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, stream, Ad); } hipEventRecord(stop); hipEventElapsedTime(&mS, start, stop); resultDB[7].AddResult(std::string("Stream Dispatch No Wait"), "", "uS", mS*1000/ITER); resultDB[7].DumpSummary(std::cout); // std::cout<<"Stream Dispatch No Wait: \t\t"<<mS*1000/ITER<<" uS"<<std::endl; hipDeviceSynchronize(); }
int main() { size_t Nbytes = N*sizeof(int); int numDevices = 0; int *A_d, *B_d, *C_d, *X_d, *Y_d, *Z_d; int *A_h, *B_h, *C_h ; hipStream_t s; HIPCHECK(hipGetDeviceCount(&numDevices)); if(numDevices > 1) { HIPCHECK(hipSetDevice(0)); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); HIPCHECK(hipSetDevice(1)); HIPCHECK(hipMalloc(&X_d,Nbytes)); HIPCHECK(hipMalloc(&Y_d,Nbytes)); HIPCHECK(hipMalloc(&Z_d,Nbytes)); HIPCHECK(hipSetDevice(0)); HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); hipLaunchKernel( HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, static_cast<const int*>(A_d), static_cast<const int*>(B_d), C_d, N); HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); HIPCHECK(hipDeviceSynchronize()); HipTest::checkVectorADD(A_h, B_h, C_h, N); HIPCHECK(hipStreamCreate(&s)); HIPCHECK(hipSetDevice(1)); HIPCHECK(hipMemcpyDtoDAsync((hipDeviceptr_t)X_d, (hipDeviceptr_t)A_d, Nbytes, s)); HIPCHECK(hipMemcpyDtoDAsync((hipDeviceptr_t)Y_d, (hipDeviceptr_t)B_d, Nbytes, s)); hipLaunchKernel( HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, static_cast<const int*>(X_d), static_cast<const int*>(Y_d), Z_d, N); HIPCHECK(hipMemcpyDtoHAsync(C_h, (hipDeviceptr_t)Z_d, Nbytes, s)); HIPCHECK(hipStreamSynchronize(s)); HIPCHECK(hipDeviceSynchronize()); HipTest::checkVectorADD(A_h, B_h, C_h, N); HIPCHECK(hipStreamDestroy(s)); HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); HIPCHECK(hipFree(X_d)); HIPCHECK(hipFree(Y_d)); HIPCHECK(hipFree(Z_d)); } passed(); }
int main() { int *A, *Am, *B, *Ad, *C, *Cm; A = new int[NUM]; B = new int[NUM]; C = new int[NUM]; for(int i=0;i<NUM;i++) { A[i] = -1*i; B[i] = 0; C[i] = 0; } hipMalloc((void**)&Ad, SIZE); hipHostMalloc((void**)&Am, SIZE); hipHostMalloc((void**)&Cm, SIZE); for(int i=0;i<NUM;i++) { Am[i] = -1*i; Cm[i] = 0; } hipStream_t stream; hipStreamCreate(&stream); hipMemcpyToSymbolAsync(HIP_SYMBOL(globalIn), Am, SIZE, 0, hipMemcpyHostToDevice, stream); hipStreamSynchronize(stream); hipLaunchKernel(Assign, dim3(1,1,1), dim3(NUM,1,1), 0, 0, Ad); hipMemcpy(B, Ad, SIZE, hipMemcpyDeviceToHost); hipMemcpyFromSymbolAsync(Cm, HIP_SYMBOL(globalOut), SIZE, 0, hipMemcpyDeviceToHost, stream); hipStreamSynchronize(stream); for(int i=0;i<NUM;i++) { assert(Am[i] == B[i]); assert(Am[i] == Cm[i]); } for(int i=0;i<NUM;i++) { A[i] = -2*i; B[i] = 0; } hipMemcpyToSymbol(HIP_SYMBOL(globalIn), A, SIZE, 0, hipMemcpyHostToDevice); hipLaunchKernel(Assign, dim3(1,1,1), dim3(NUM,1,1), 0, 0, Ad); hipMemcpy(B, Ad, SIZE, hipMemcpyDeviceToHost); hipMemcpyFromSymbol(C, HIP_SYMBOL(globalOut), SIZE, 0, hipMemcpyDeviceToHost); for(int i=0;i<NUM;i++) { assert(A[i] == B[i]); assert(A[i] == C[i]); } for(int i=0;i<NUM;i++) { A[i] = -3*i; B[i] = 0; } hipMemcpyToSymbolAsync(HIP_SYMBOL(globalIn), A, SIZE, 0, hipMemcpyHostToDevice, stream); hipStreamSynchronize(stream); hipLaunchKernel(Assign, dim3(1,1,1), dim3(NUM,1,1), 0, 0, Ad); hipMemcpy(B, Ad, SIZE, hipMemcpyDeviceToHost); hipMemcpyFromSymbolAsync(C, HIP_SYMBOL(globalOut), SIZE, 0, hipMemcpyDeviceToHost, stream); hipStreamSynchronize(stream); for(int i=0;i<NUM;i++) { assert(A[i] == B[i]); assert(A[i] == C[i]); } hipHostFree(Am); hipHostFree(Cm); hipFree(Ad); delete[] A; delete[] B; delete[] C; passed(); }
int test_amdgcn_wave_lshift_1 (const int n, const int blockSize, const int launch_iter=1, const int shfl_iter=1, const bool verify=true) { const int WIDTH = 64; const int DELTA = 1; std::vector<int> input(n); std::future<void> inputFuture = std::async([&]() { std::default_random_engine generator; std::uniform_int_distribution<int> input_dist(0, WIDTH-1); std::generate(std::begin(input), std::end(input),[&]() { return input_dist(generator); }); }); inputFuture.wait(); int* gpuInput; hipMalloc(&gpuInput, n * sizeof(int)); hipMemcpy(gpuInput, input.data(), n * sizeof(int), hipMemcpyHostToDevice); int* gpuOutput; hipMalloc(&gpuOutput, n * sizeof(int)); // warm up { hipEvent_t start, stop; initializeEvents(&start, &stop); hipLaunchKernel(HIP_KERNEL_NAME(run_amdgcn_wave_lshift_1) , dim3(n/blockSize), dim3(blockSize), 0, 0 , gpuInput, gpuOutput, shfl_iter); float time_ms = finalizeEvents(start, stop); } // measure the performance hipEvent_t start, stop; initializeEvents(&start, &stop); for (int i = 0; i < launch_iter; i++) { hipLaunchKernel(HIP_KERNEL_NAME(run_amdgcn_wave_lshift_1) , dim3(n/blockSize), dim3(blockSize), 0, 0 , gpuInput, gpuOutput, shfl_iter); } float time_ms = finalizeEvents(start, stop); std::vector<int> output(n); hipMemcpy(output.data(), gpuOutput, n * sizeof(int), hipMemcpyDeviceToHost); // verification int errors = 0; if (verify) { for (int i = 0; i < n; i+=WIDTH) { int local_output[WIDTH]; for (int j = 0; j < shfl_iter; j++) { for (int k = 0; k < WIDTH; k++) { unsigned int lane = ((k+(int)DELTA)<WIDTH)?(k+DELTA):k; local_output[k] = input[i+lane]; } for (int k = 0; k < WIDTH; k++) { input[i+k] = local_output[k]; } } for (int k = 0; k < WIDTH; k++) { if (input[i+k] != output[i+k]) { errors++; } } } } std::cout << __FUNCTION__ << "<" << DELTA << "," << WIDTH << "> total(" << launch_iter << " launches, " << shfl_iter << " wavefront_shift_left/lane/kernel): " << time_ms << "ms, " << time_ms/(double)launch_iter << " ms/kernel, " << errors << " errors" << std::endl; hipFree(gpuInput); hipFree(gpuOutput); return errors; }
// IN: nStreams : number of streams to use for the test // IN :useNullStream - use NULL stream. Synchronizes everything. // IN: useSyncMemcpyH2D - use sync memcpy (no overlap) for H2D // IN: useSyncMemcpyD2H - use sync memcpy (no overlap) for D2H void test_chunkedAsyncExample(int nStreams, bool useNullStream, bool useSyncMemcpyH2D, bool useSyncMemcpyD2H) { size_t Nbytes = N * sizeof(int); printf("testing: %s(useNullStream=%d, useSyncMemcpyH2D=%d, useSyncMemcpyD2H=%d) ", __func__, useNullStream, useSyncMemcpyH2D, useSyncMemcpyD2H); printf("Nbytes=%zu (%6.1f MB)\n", Nbytes, (double)(Nbytes) / 1024.0 / 1024.0); int *A_d, *B_d, *C_d; int *A_h, *B_h, *C_h; HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, true); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); hipStream_t* stream = (hipStream_t*)malloc(sizeof(hipStream_t) * nStreams); if (useNullStream) { nStreams = 1; stream[0] = NULL; } else { for (int i = 0; i < nStreams; ++i) { HIPCHECK(hipStreamCreate(&stream[i])); } } size_t workLeft = N; size_t workPerStream = N / nStreams; for (int i = 0; i < nStreams; ++i) { size_t work = (workLeft < workPerStream) ? workLeft : workPerStream; size_t workBytes = work * sizeof(int); size_t offset = i * workPerStream; HIPASSERT(A_d + offset < A_d + Nbytes); HIPASSERT(B_d + offset < B_d + Nbytes); HIPASSERT(C_d + offset < C_d + Nbytes); if (useSyncMemcpyH2D) { HIPCHECK(hipMemcpy(&A_d[offset], &A_h[offset], workBytes, hipMemcpyHostToDevice)); HIPCHECK(hipMemcpy(&B_d[offset], &B_h[offset], workBytes, hipMemcpyHostToDevice)); } else { HIPCHECK(hipMemcpyAsync(&A_d[offset], &A_h[offset], workBytes, hipMemcpyHostToDevice, stream[i])); HIPCHECK(hipMemcpyAsync(&B_d[offset], &B_h[offset], workBytes, hipMemcpyHostToDevice, stream[i])); }; hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, stream[i], &A_d[offset], &B_d[offset], &C_d[offset], work); if (useSyncMemcpyD2H) { HIPCHECK(hipMemcpy(&C_h[offset], &C_d[offset], workBytes, hipMemcpyDeviceToHost)); } else { HIPCHECK(hipMemcpyAsync(&C_h[offset], &C_d[offset], workBytes, hipMemcpyDeviceToHost, stream[i])); } } HIPCHECK(hipDeviceSynchronize()); HipTest::checkVectorADD(A_h, B_h, C_h, N); HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, true); free(stream); };
void test_pingpong(hipStream_t stream, size_t numElements, int numInflight, int numPongs, bool doHostSide) { HIPASSERT(numElements % numInflight == 0); // Must be evenly divisible. size_t Nbytes = numElements * sizeof(T); size_t eachCopyElements = numElements / numInflight; size_t eachCopyBytes = eachCopyElements * sizeof(T); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); printf( "------------------------------------------------------------------------------------------" "-----\n"); printf( "testing: %s<%s> Nbytes=%zu (%6.1f MB) numPongs=%d numInflight=%d eachCopyElements=%zu " "eachCopyBytes=%zu\n", __func__, HostTraits<AllocType>::Name(), Nbytes, (double)(Nbytes) / 1024.0 / 1024.0, numPongs, numInflight, eachCopyElements, eachCopyBytes); T* A_h = NULL; T* A_d = NULL; A_h = (T*)(HostTraits<AllocType>::Alloc(Nbytes)); HIPCHECK(hipMalloc(&A_d, Nbytes)); // Initialize the host array: const T initValue = 13; const T deviceConst = 2; const T hostConst = 10000; for (size_t i = 0; i < numElements; i++) { A_h[i] = initValue + i; } for (int k = 0; k < numPongs; k++) { for (int i = 0; i < numInflight; i++) { HIPASSERT(A_d + i * eachCopyElements < A_d + Nbytes); HIPCHECK(hipMemcpyAsync(&A_d[i * eachCopyElements], &A_h[i * eachCopyElements], eachCopyBytes, hipMemcpyHostToDevice, stream)); } hipLaunchKernel(addK<T>, dim3(blocks), dim3(threadsPerBlock), 0, stream, A_d, 2, numElements); for (int i = 0; i < numInflight; i++) { HIPASSERT(A_d + i * eachCopyElements < A_d + Nbytes); HIPCHECK(hipMemcpyAsync(&A_h[i * eachCopyElements], &A_d[i * eachCopyElements], eachCopyBytes, hipMemcpyDeviceToHost, stream)); } if (doHostSide) { assert(0); #if 0 hipEvent_t e; HIPCHECK(hipEventCreate(&e)); #endif HIPCHECK(hipDeviceSynchronize()); for (size_t i = 0; i < numElements; i++) { A_h[i] += hostConst; } } }; HIPCHECK(hipDeviceSynchronize()); // Verify we copied back all the data correctly: for (size_t i = 0; i < numElements; i++) { T gold = initValue + i; // Perform calcs in same order as test above to replicate FP order-of-operations: for (int k = 0; k < numPongs; k++) { gold += deviceConst; if (doHostSide) { gold += hostConst; } } if (gold != A_h[i]) { std::cout << i << ": gold=" << gold << " out=" << A_h[i] << std::endl; HIPASSERT(gold == A_h[i]); } } HIPCHECK(hipHostFree(A_h)); HIPCHECK(hipFree(A_d)); }
int main(){ hipLaunchKernel(HIP_KERNEL_NAME(Empty), dim3(1), dim3(1), 0, 0, 0); hipDeviceSynchronize(); passed(); }