int main(int argc, char* argv[]) { HipTest::parseStandardArguments(argc, argv, false); parseMyArguments(argc, argv); printf("info: set device to %d tests=%x\n", p_gpuDevice, p_tests); HIPCHECK(hipSetDevice(p_gpuDevice)); if (p_tests & 0x01) { simpleNegTest(); } if (p_tests & 0x02) { hipStream_t stream; HIPCHECK(hipStreamCreate(&stream)); test_manyInflightCopies<float>(stream, 1024, 16, true); test_manyInflightCopies<float>( stream, 1024, 4, true); // verify we re-use the same entries instead of growing pool. test_manyInflightCopies<float>(stream, 1024 * 8, 64, false); HIPCHECK(hipStreamDestroy(stream)); } if (p_tests & 0x04) { test_chunkedAsyncExample(p_streams, true, true, true); // Easy sync version test_chunkedAsyncExample(p_streams, false, true, true); // Easy sync version test_chunkedAsyncExample(p_streams, false, false, true); // Some async test_chunkedAsyncExample(p_streams, false, false, false); // All async } if (p_tests & 0x08) { hipStream_t stream; HIPCHECK(hipStreamCreate(&stream)); // test_pingpong<int, Pinned>(stream, 1024*1024*32, 1, 1, false); // test_pingpong<int, Pinned>(stream, 1024*1024*32, 1, 10, false); HIPCHECK(hipStreamDestroy(stream)); } passed(); }
int main() { float *A, *Ad; HIPCHECK(hipHostMalloc((void**)&A, SIZE, hipHostMallocDefault)); HIPCHECK(hipMalloc((void**)&Ad, SIZE)); hipStream_t stream; HIPCHECK(hipStreamCreate(&stream)); for (int i = 0; i < SIZE; i++) { HIPCHECK(hipMemcpyAsync(Ad, A, SIZE, hipMemcpyHostToDevice, stream)); HIPCHECK(hipDeviceSynchronize()); } }
int main() { size_t Nbytes = N * sizeof(int); int numDevices = 0; int *A_d, *B_d, *C_d, *X_d, *Y_d, *Z_d; int *A_h, *B_h, *C_h; hipStream_t s; HIPCHECK(hipGetDeviceCount(&numDevices)); if (numDevices > 1) { HIPCHECK(hipSetDevice(0)); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); HIPCHECK(hipSetDevice(1)); HIPCHECK(hipMalloc(&X_d, Nbytes)); HIPCHECK(hipMalloc(&Y_d, Nbytes)); HIPCHECK(hipMalloc(&Z_d, Nbytes)); HIPCHECK(hipSetDevice(0)); HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, static_cast<const int*>(A_d), static_cast<const int*>(B_d), C_d, N); HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); HIPCHECK(hipDeviceSynchronize()); HipTest::checkVectorADD(A_h, B_h, C_h, N); HIPCHECK(hipSetDevice(1)); HIPCHECK(hipStreamCreate(&s)); HIPCHECK(hipMemcpyDtoDAsync((hipDeviceptr_t)X_d, (hipDeviceptr_t)A_d, Nbytes, s)); HIPCHECK(hipMemcpyDtoDAsync((hipDeviceptr_t)Y_d, (hipDeviceptr_t)B_d, Nbytes, s)); hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, static_cast<const int*>(X_d), static_cast<const int*>(Y_d), Z_d, N); HIPCHECK(hipMemcpyDtoHAsync(C_h, (hipDeviceptr_t)Z_d, Nbytes, s)); HIPCHECK(hipStreamSynchronize(s)); HIPCHECK(hipDeviceSynchronize()); HipTest::checkVectorADD(A_h, B_h, C_h, N); HIPCHECK(hipStreamDestroy(s)); HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); HIPCHECK(hipFree(X_d)); HIPCHECK(hipFree(Y_d)); HIPCHECK(hipFree(Z_d)); } passed(); }
int main(int argc, char **argv){ HipTest::parseStandardArguments(argc, argv, true); hipStream_t stream[3]; for(int i=0;i<3;i++){ HIPCHECK(hipStreamCreate(&stream[i])); } const size_t size = N * sizeof(float); std::thread t1(run1, size, stream[0]); std::thread t2(run1, size, stream[0]); std::thread t3(run, size, stream[1], stream[2]); t1.join(); // std::cout<<"T1"<<std::endl; t2.join(); // std::cout<<"T2"<<std::endl; t3.join(); passed(); }
// Create a lot of streams and then destroy 'em. void createThenDestroyStreams(int iterations, int burstSize) { hipStream_t *streams = new hipStream_t[burstSize]; for (int i=0; i<iterations; i++) { if (p_verbose & 0x1) { printf ("%s iter=%d, create %d then destroy %d\n", __func__, i, burstSize, burstSize); } for (int j=0; j<burstSize; j++) { if (p_verbose & 0x2) { printf (" %d.%d streamCreate\n", i, j); } HIPCHECK( hipStreamCreate(&streams[j])); } for (int j=0; j<burstSize; j++) { if (p_verbose & 0x2) { printf (" %d.%d streamDestroy\n", i, j); } HIPCHECK( hipStreamDestroy(streams[j])); } } delete streams; }
int main(){ hipError_t err; float *A, *Ad; A = new float[LEN]; for(int i=0;i<LEN;i++){ A[i] = 1.0f; } hipStream_t stream; err = hipStreamCreate(&stream); check("Creating stream",err); err = hipMalloc(&Ad, SIZE); check("Allocating Ad memory on device", err); err = hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice); check("Doing memory copy from A to Ad", err); float mS = 0; hipEvent_t start, stop; hipEventCreate(&start); hipEventCreate(&stop); ResultDatabase resultDB[8]; hipEventRecord(start); hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, 0, Ad); hipEventRecord(stop); hipEventElapsedTime(&mS, start, stop); resultDB[0].AddResult(std::string("First Kernel Launch"), "", "uS", mS*1000); // std::cout<<"First Kernel Launch: \t\t"<<mS*1000<<" uS"<<std::endl; resultDB[0].DumpSummary(std::cout); hipEventRecord(start); hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, 0, Ad); hipEventRecord(stop); hipEventElapsedTime(&mS, start, stop); resultDB[1].AddResult(std::string("Second Kernel Launch"), "", "uS", mS*1000); // std::cout<<"Second Kernel Launch: \t\t"<<mS*1000<<" uS"<<std::endl; resultDB[1].DumpSummary(std::cout); hipEventRecord(start); for(int i=0;i<ITER;i++){ hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, 0, Ad); } hipDeviceSynchronize(); hipEventRecord(stop); hipEventElapsedTime(&mS, start, stop); resultDB[2].AddResult(std::string("NULL Stream Sync dispatch wait"), "", "uS", mS*1000/ITER); resultDB[2].DumpSummary(std::cout); // std::cout<<"NULL Stream Sync dispatch wait: \t"<<mS*1000/ITER<<" uS"<<std::endl; hipDeviceSynchronize(); hipEventRecord(start); for(int i=0;i<ITER;i++){ hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, 0, Ad); } hipEventRecord(stop); hipDeviceSynchronize(); hipEventElapsedTime(&mS, start, stop); resultDB[3].AddResult(std::string("NULL Stream Async dispatch wait"), "", "uS", mS*1000/ITER); resultDB[3].DumpSummary(std::cout); // std::cout<<"NULL Stream Async dispatch wait: \t"<<mS*1000/ITER<<" uS"<<std::endl; hipDeviceSynchronize(); hipEventRecord(start); for(int i=0;i<ITER;i++){ hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, stream, Ad); hipDeviceSynchronize(); } hipEventRecord(stop); hipEventElapsedTime(&mS, start, stop); resultDB[4].AddResult(std::string("Stream Sync dispatch wait"), "", "uS", mS*1000/ITER); resultDB[4].DumpSummary(std::cout); // std::cout<<"Stream Sync dispatch wait: \t\t"<<mS*1000/ITER<<" uS"<<std::endl; hipDeviceSynchronize(); hipEventRecord(start); for(int i=0;i<ITER;i++){ hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, stream, Ad); } hipDeviceSynchronize(); hipEventRecord(stop); hipEventElapsedTime(&mS, start, stop); resultDB[5].AddResult(std::string("Stream Async dispatch wait"), "", "uS", mS*1000/ITER); // std::cout<<"Stream Async dispatch wait: \t\t"<<mS*1000/ITER<<" uS"<<std::endl; resultDB[5].DumpSummary(std::cout); hipDeviceSynchronize(); hipEventRecord(start); for(int i=0;i<ITER;i++){ hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, 0, Ad); } hipEventRecord(stop); hipEventElapsedTime(&mS, start, stop); resultDB[6].AddResult(std::string("NULL Stream No Wait"), "", "uS", mS*1000/ITER); resultDB[6].DumpSummary(std::cout); // std::cout<<"NULL Stream Dispatch No Wait: \t\t"<<mS*1000/ITER<<" uS"<<std::endl; hipDeviceSynchronize(); hipEventRecord(start); for(int i=0;i<ITER;i++){ hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, stream, Ad); } hipEventRecord(stop); hipEventElapsedTime(&mS, start, stop); resultDB[7].AddResult(std::string("Stream Dispatch No Wait"), "", "uS", mS*1000/ITER); resultDB[7].DumpSummary(std::cout); // std::cout<<"Stream Dispatch No Wait: \t\t"<<mS*1000/ITER<<" uS"<<std::endl; hipDeviceSynchronize(); }
// IN: nStreams : number of streams to use for the test // IN :useNullStream - use NULL stream. Synchronizes everything. // IN: useSyncMemcpyH2D - use sync memcpy (no overlap) for H2D // IN: useSyncMemcpyD2H - use sync memcpy (no overlap) for D2H void test_chunkedAsyncExample(int nStreams, bool useNullStream, bool useSyncMemcpyH2D, bool useSyncMemcpyD2H) { size_t Nbytes = N * sizeof(int); printf("testing: %s(useNullStream=%d, useSyncMemcpyH2D=%d, useSyncMemcpyD2H=%d) ", __func__, useNullStream, useSyncMemcpyH2D, useSyncMemcpyD2H); printf("Nbytes=%zu (%6.1f MB)\n", Nbytes, (double)(Nbytes) / 1024.0 / 1024.0); int *A_d, *B_d, *C_d; int *A_h, *B_h, *C_h; HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, true); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); hipStream_t* stream = (hipStream_t*)malloc(sizeof(hipStream_t) * nStreams); if (useNullStream) { nStreams = 1; stream[0] = NULL; } else { for (int i = 0; i < nStreams; ++i) { HIPCHECK(hipStreamCreate(&stream[i])); } } size_t workLeft = N; size_t workPerStream = N / nStreams; for (int i = 0; i < nStreams; ++i) { size_t work = (workLeft < workPerStream) ? workLeft : workPerStream; size_t workBytes = work * sizeof(int); size_t offset = i * workPerStream; HIPASSERT(A_d + offset < A_d + Nbytes); HIPASSERT(B_d + offset < B_d + Nbytes); HIPASSERT(C_d + offset < C_d + Nbytes); if (useSyncMemcpyH2D) { HIPCHECK(hipMemcpy(&A_d[offset], &A_h[offset], workBytes, hipMemcpyHostToDevice)); HIPCHECK(hipMemcpy(&B_d[offset], &B_h[offset], workBytes, hipMemcpyHostToDevice)); } else { HIPCHECK(hipMemcpyAsync(&A_d[offset], &A_h[offset], workBytes, hipMemcpyHostToDevice, stream[i])); HIPCHECK(hipMemcpyAsync(&B_d[offset], &B_h[offset], workBytes, hipMemcpyHostToDevice, stream[i])); }; hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, stream[i], &A_d[offset], &B_d[offset], &C_d[offset], work); if (useSyncMemcpyD2H) { HIPCHECK(hipMemcpy(&C_h[offset], &C_d[offset], workBytes, hipMemcpyDeviceToHost)); } else { HIPCHECK(hipMemcpyAsync(&C_h[offset], &C_d[offset], workBytes, hipMemcpyDeviceToHost, stream[i])); } } HIPCHECK(hipDeviceSynchronize()); HipTest::checkVectorADD(A_h, B_h, C_h, N); HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, true); free(stream); };
int main() { int *A, *Am, *B, *Ad, *C, *Cm; A = new int[NUM]; B = new int[NUM]; C = new int[NUM]; for(int i=0;i<NUM;i++) { A[i] = -1*i; B[i] = 0; C[i] = 0; } hipMalloc((void**)&Ad, SIZE); hipHostMalloc((void**)&Am, SIZE); hipHostMalloc((void**)&Cm, SIZE); for(int i=0;i<NUM;i++) { Am[i] = -1*i; Cm[i] = 0; } hipStream_t stream; hipStreamCreate(&stream); hipMemcpyToSymbolAsync(HIP_SYMBOL(globalIn), Am, SIZE, 0, hipMemcpyHostToDevice, stream); hipStreamSynchronize(stream); hipLaunchKernel(Assign, dim3(1,1,1), dim3(NUM,1,1), 0, 0, Ad); hipMemcpy(B, Ad, SIZE, hipMemcpyDeviceToHost); hipMemcpyFromSymbolAsync(Cm, HIP_SYMBOL(globalOut), SIZE, 0, hipMemcpyDeviceToHost, stream); hipStreamSynchronize(stream); for(int i=0;i<NUM;i++) { assert(Am[i] == B[i]); assert(Am[i] == Cm[i]); } for(int i=0;i<NUM;i++) { A[i] = -2*i; B[i] = 0; } hipMemcpyToSymbol(HIP_SYMBOL(globalIn), A, SIZE, 0, hipMemcpyHostToDevice); hipLaunchKernel(Assign, dim3(1,1,1), dim3(NUM,1,1), 0, 0, Ad); hipMemcpy(B, Ad, SIZE, hipMemcpyDeviceToHost); hipMemcpyFromSymbol(C, HIP_SYMBOL(globalOut), SIZE, 0, hipMemcpyDeviceToHost); for(int i=0;i<NUM;i++) { assert(A[i] == B[i]); assert(A[i] == C[i]); } for(int i=0;i<NUM;i++) { A[i] = -3*i; B[i] = 0; } hipMemcpyToSymbolAsync(HIP_SYMBOL(globalIn), A, SIZE, 0, hipMemcpyHostToDevice, stream); hipStreamSynchronize(stream); hipLaunchKernel(Assign, dim3(1,1,1), dim3(NUM,1,1), 0, 0, Ad); hipMemcpy(B, Ad, SIZE, hipMemcpyDeviceToHost); hipMemcpyFromSymbolAsync(C, HIP_SYMBOL(globalOut), SIZE, 0, hipMemcpyDeviceToHost, stream); hipStreamSynchronize(stream); for(int i=0;i<NUM;i++) { assert(A[i] == B[i]); assert(A[i] == C[i]); } hipHostFree(Am); hipHostFree(Cm); hipFree(Ad); delete[] A; delete[] B; delete[] C; passed(); }
int main() { try { bool done = false; boost::fibers::fiber f1([&done]{ std::cout << "f1: entered" << std::endl; try { hipStream_t stream; hipStreamCreate( & stream); int size = 1024 * 1024; int full_size = 20 * size; int * host_a, * host_b, * host_c; hipHostMalloc( & host_a, full_size * sizeof( int), hipHostMallocDefault); hipHostMalloc( & host_b, full_size * sizeof( int), hipHostMallocDefault); hipHostMalloc( & host_c, full_size * sizeof( int), hipHostMallocDefault); int * dev_a, * dev_b, * dev_c; hipMalloc( & dev_a, size * sizeof( int) ); hipMalloc( & dev_b, size * sizeof( int) ); hipMalloc( & dev_c, size * sizeof( int) ); std::minstd_rand generator; std::uniform_int_distribution<> distribution(1, 6); for ( int i = 0; i < full_size; ++i) { host_a[i] = distribution( generator); host_b[i] = distribution( generator); } for ( int i = 0; i < full_size; i += size) { hipMemcpyAsync( dev_a, host_a + i, size * sizeof( int), hipMemcpyHostToDevice, stream); hipMemcpyAsync( dev_b, host_b + i, size * sizeof( int), hipMemcpyHostToDevice, stream); hipLaunchKernel( vector_add, dim3(size / 256), dim3(256), 0, stream, dev_a, dev_b, dev_c, size); hipMemcpyAsync( host_c + i, dev_c, size * sizeof( int), hipMemcpyDeviceToHost, stream); } auto result = boost::fibers::hip::waitfor_all( stream); BOOST_ASSERT( stream == std::get< 0 >( result) ); BOOST_ASSERT( hipSuccess == std::get< 1 >( result) ); std::cout << "f1: GPU computation finished" << std::endl; hipHostFree( host_a); hipHostFree( host_b); hipHostFree( host_c); hipFree( dev_a); hipFree( dev_b); hipFree( dev_c); hipStreamDestroy( stream); done = true; } catch ( std::exception const& ex) { std::cerr << "exception: " << ex.what() << std::endl; } std::cout << "f1: leaving" << std::endl; }); boost::fibers::fiber f2([&done]{ std::cout << "f2: entered" << std::endl; while ( ! done) { std::cout << "f2: sleeping" << std::endl; boost::this_fiber::sleep_for( std::chrono::milliseconds( 1 ) ); } std::cout << "f2: leaving" << std::endl; }); f1.join(); f2.join(); std::cout << "done." << std::endl; return EXIT_SUCCESS; } catch ( std::exception const& e) { std::cerr << "exception: " << e.what() << std::endl; } catch (...) { std::cerr << "unhandled exception" << std::endl; } return EXIT_FAILURE; }