int main(){ hipError_t err; float *A, *Ad; A = new float[LEN]; for(int i=0;i<LEN;i++){ A[i] = 1.0f; } hipStream_t stream; err = hipStreamCreate(&stream); check("Creating stream",err); err = hipMalloc(&Ad, SIZE); check("Allocating Ad memory on device", err); err = hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice); check("Doing memory copy from A to Ad", err); float mS = 0; hipEvent_t start, stop; hipEventCreate(&start); hipEventCreate(&stop); ResultDatabase resultDB[8]; hipEventRecord(start); hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, 0, Ad); hipEventRecord(stop); hipEventElapsedTime(&mS, start, stop); resultDB[0].AddResult(std::string("First Kernel Launch"), "", "uS", mS*1000); // std::cout<<"First Kernel Launch: \t\t"<<mS*1000<<" uS"<<std::endl; resultDB[0].DumpSummary(std::cout); hipEventRecord(start); hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, 0, Ad); hipEventRecord(stop); hipEventElapsedTime(&mS, start, stop); resultDB[1].AddResult(std::string("Second Kernel Launch"), "", "uS", mS*1000); // std::cout<<"Second Kernel Launch: \t\t"<<mS*1000<<" uS"<<std::endl; resultDB[1].DumpSummary(std::cout); hipEventRecord(start); for(int i=0;i<ITER;i++){ hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, 0, Ad); } hipDeviceSynchronize(); hipEventRecord(stop); hipEventElapsedTime(&mS, start, stop); resultDB[2].AddResult(std::string("NULL Stream Sync dispatch wait"), "", "uS", mS*1000/ITER); resultDB[2].DumpSummary(std::cout); // std::cout<<"NULL Stream Sync dispatch wait: \t"<<mS*1000/ITER<<" uS"<<std::endl; hipDeviceSynchronize(); hipEventRecord(start); for(int i=0;i<ITER;i++){ hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, 0, Ad); } hipEventRecord(stop); hipDeviceSynchronize(); hipEventElapsedTime(&mS, start, stop); resultDB[3].AddResult(std::string("NULL Stream Async dispatch wait"), "", "uS", mS*1000/ITER); resultDB[3].DumpSummary(std::cout); // std::cout<<"NULL Stream Async dispatch wait: \t"<<mS*1000/ITER<<" uS"<<std::endl; hipDeviceSynchronize(); hipEventRecord(start); for(int i=0;i<ITER;i++){ hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, stream, Ad); hipDeviceSynchronize(); } hipEventRecord(stop); hipEventElapsedTime(&mS, start, stop); resultDB[4].AddResult(std::string("Stream Sync dispatch wait"), "", "uS", mS*1000/ITER); resultDB[4].DumpSummary(std::cout); // std::cout<<"Stream Sync dispatch wait: \t\t"<<mS*1000/ITER<<" uS"<<std::endl; hipDeviceSynchronize(); hipEventRecord(start); for(int i=0;i<ITER;i++){ hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, stream, Ad); } hipDeviceSynchronize(); hipEventRecord(stop); hipEventElapsedTime(&mS, start, stop); resultDB[5].AddResult(std::string("Stream Async dispatch wait"), "", "uS", mS*1000/ITER); // std::cout<<"Stream Async dispatch wait: \t\t"<<mS*1000/ITER<<" uS"<<std::endl; resultDB[5].DumpSummary(std::cout); hipDeviceSynchronize(); hipEventRecord(start); for(int i=0;i<ITER;i++){ hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, 0, Ad); } hipEventRecord(stop); hipEventElapsedTime(&mS, start, stop); resultDB[6].AddResult(std::string("NULL Stream No Wait"), "", "uS", mS*1000/ITER); resultDB[6].DumpSummary(std::cout); // std::cout<<"NULL Stream Dispatch No Wait: \t\t"<<mS*1000/ITER<<" uS"<<std::endl; hipDeviceSynchronize(); hipEventRecord(start); for(int i=0;i<ITER;i++){ hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, stream, Ad); } hipEventRecord(stop); hipEventElapsedTime(&mS, start, stop); resultDB[7].AddResult(std::string("Stream Dispatch No Wait"), "", "uS", mS*1000/ITER); resultDB[7].DumpSummary(std::cout); // std::cout<<"Stream Dispatch No Wait: \t\t"<<mS*1000/ITER<<" uS"<<std::endl; hipDeviceSynchronize(); }
void initializeEvents(hipEvent_t *start, hipEvent_t *stop){ CUDA_SAFE_CALL( hipEventCreate(start) ); CUDA_SAFE_CALL( hipEventCreate(stop) ); CUDA_SAFE_CALL( hipEventRecord(*start, 0) ); }
void test_pingpong(hipStream_t stream, size_t numElements, int numInflight, int numPongs, bool doHostSide) { HIPASSERT(numElements % numInflight == 0); // Must be evenly divisible. size_t Nbytes = numElements * sizeof(T); size_t eachCopyElements = numElements / numInflight; size_t eachCopyBytes = eachCopyElements * sizeof(T); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); printf( "------------------------------------------------------------------------------------------" "-----\n"); printf( "testing: %s<%s> Nbytes=%zu (%6.1f MB) numPongs=%d numInflight=%d eachCopyElements=%zu " "eachCopyBytes=%zu\n", __func__, HostTraits<AllocType>::Name(), Nbytes, (double)(Nbytes) / 1024.0 / 1024.0, numPongs, numInflight, eachCopyElements, eachCopyBytes); T* A_h = NULL; T* A_d = NULL; A_h = (T*)(HostTraits<AllocType>::Alloc(Nbytes)); HIPCHECK(hipMalloc(&A_d, Nbytes)); // Initialize the host array: const T initValue = 13; const T deviceConst = 2; const T hostConst = 10000; for (size_t i = 0; i < numElements; i++) { A_h[i] = initValue + i; } for (int k = 0; k < numPongs; k++) { for (int i = 0; i < numInflight; i++) { HIPASSERT(A_d + i * eachCopyElements < A_d + Nbytes); HIPCHECK(hipMemcpyAsync(&A_d[i * eachCopyElements], &A_h[i * eachCopyElements], eachCopyBytes, hipMemcpyHostToDevice, stream)); } hipLaunchKernel(addK<T>, dim3(blocks), dim3(threadsPerBlock), 0, stream, A_d, 2, numElements); for (int i = 0; i < numInflight; i++) { HIPASSERT(A_d + i * eachCopyElements < A_d + Nbytes); HIPCHECK(hipMemcpyAsync(&A_h[i * eachCopyElements], &A_d[i * eachCopyElements], eachCopyBytes, hipMemcpyDeviceToHost, stream)); } if (doHostSide) { assert(0); #if 0 hipEvent_t e; HIPCHECK(hipEventCreate(&e)); #endif HIPCHECK(hipDeviceSynchronize()); for (size_t i = 0; i < numElements; i++) { A_h[i] += hostConst; } } }; HIPCHECK(hipDeviceSynchronize()); // Verify we copied back all the data correctly: for (size_t i = 0; i < numElements; i++) { T gold = initValue + i; // Perform calcs in same order as test above to replicate FP order-of-operations: for (int k = 0; k < numPongs; k++) { gold += deviceConst; if (doHostSide) { gold += hostConst; } } if (gold != A_h[i]) { std::cout << i << ": gold=" << gold << " out=" << A_h[i] << std::endl; HIPASSERT(gold == A_h[i]); } } HIPCHECK(hipHostFree(A_h)); HIPCHECK(hipFree(A_d)); }
void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, SyncMode syncMode, bool expectMismatch) { // This test sends a long-running kernel to the null stream, then tests to see if the // specified synchronization technique is effective. // // Some syncMode are not expected to correctly sync (for example "syncNone"). in these // cases the test sets expectMismatch and the check logic below will attempt to ensure that // the undesired synchronization did not occur - ie ensure the kernel is still running and did // not yet update the stop event. This can be tricky since if the kernel runs fast enough it // may complete before the check. To prevent this, the addCountReverse has a count parameter // which causes it to loop repeatedly, and the results are checked in reverse order. // // Tests with expectMismatch=true should ensure the kernel finishes correctly. This results // are checked and we test to make sure stop event has completed. if (!(testMask & p_tests)) { return; } printf ("\ntest 0x%02x: syncMode=%s expectMismatch=%d\n", testMask, syncModeString(syncMode), expectMismatch); size_t sizeBytes = numElements * sizeof(int); int count =100; int init0 = 0; HIPCHECK(hipMemset(C_d, init0, sizeBytes)); for (int i=0; i<numElements; i++) { C_h[i] = -1; // initialize } hipStream_t otherStream = 0; unsigned flags = (syncMode == syncMarkerThenOtherNonBlockingStream) ? hipStreamNonBlocking : hipStreamDefault; HIPCHECK(hipStreamCreateWithFlags(&otherStream, flags)); hipEvent_t stop, otherStreamEvent; HIPCHECK(hipEventCreate(&stop)); HIPCHECK(hipEventCreate(&otherStreamEvent)); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); // Launch kernel into null stream, should result in C_h == count. hipLaunchKernelGGL( HipTest::addCountReverse, dim3(blocks), dim3(threadsPerBlock), 0, 0 /*stream*/, static_cast<const int*>(C_d), C_h, numElements, count); HIPCHECK(hipEventRecord(stop, 0/*default*/)); switch (syncMode) { case syncNone: break; case syncNullStream: HIPCHECK(hipStreamSynchronize(0)); // wait on host for null stream: break; case syncOtherStream: // Does this synchronize with the null stream? HIPCHECK(hipStreamSynchronize(otherStream)); break; case syncMarkerThenOtherStream: case syncMarkerThenOtherNonBlockingStream: // this may wait for NULL stream depending hipStreamNonBlocking flag above HIPCHECK(hipEventRecord(otherStreamEvent, otherStream)); HIPCHECK(hipStreamSynchronize(otherStream)); break; case syncDevice: HIPCHECK(hipDeviceSynchronize()); break; default: assert(0); }; hipError_t done = hipEventQuery(stop); if (expectMismatch) { assert (done == hipErrorNotReady); } else { assert (done == hipSuccess); } int mismatches = 0; int expected = init0 + count; for (int i=0; i<numElements; i++) { bool compareEqual = (C_h[i] == expected); if (!compareEqual) { mismatches ++; if (!expectMismatch) { printf ("C_h[%d] (%d) != %d\n", i, C_h[i], expected); assert(C_h[i] == expected); } } } if (expectMismatch) { assert (mismatches > 0); } HIPCHECK(hipStreamDestroy(otherStream)); HIPCHECK(hipEventDestroy(stop)); HIPCHECK(hipEventDestroy(otherStreamEvent)); HIPCHECK(hipDeviceSynchronize()); printf ("test: OK - %d mismatches (%6.2f%%)\n", mismatches, ((double)(mismatches)*100.0)/numElements); }