void runTests(int64_t numElements) { size_t sizeBytes = numElements * sizeof(int); printf ("\n\ntest: starting sequence with sizeBytes=%zu bytes, %6.2f MB\n", sizeBytes, sizeBytes/1024.0/1024.0); int *C_h, *C_d; HIPCHECK(hipMalloc(&C_d, sizeBytes)); HIPCHECK(hipHostMalloc(&C_h, sizeBytes)); { test (0x01, C_d, C_h, numElements, syncNone, true /*expectMismatch*/); test (0x02, C_d, C_h, numElements, syncNullStream, false /*expectMismatch*/); test (0x04, C_d, C_h, numElements, syncOtherStream, true /*expectMismatch*/); test (0x08, C_d, C_h, numElements, syncDevice, false /*expectMismatch*/); // Sending a marker to to null stream may synchronize the otherStream // - other created with hipStreamNonBlocking=0 : synchronization, should match // - other created with hipStreamNonBlocking=1 : no synchronization, may mismatch test (0x10, C_d, C_h, numElements, syncMarkerThenOtherStream, false /*expectMismatch*/); // TODO - review why this test seems flaky //test (0x20, C_d, C_h, numElements, syncMarkerThenOtherNonBlockingStream, true /*expectMismatch*/); } HIPCHECK(hipFree(C_d)); HIPCHECK(hipHostFree(C_h)); }
int main() { unsigned flag = 0; HIPCHECK(hipDeviceReset()); int deviceCount = 0; HIPCHECK(hipGetDeviceCount(&deviceCount)); for(int j=0;j<deviceCount;j++){ HIPCHECK(hipSetDevice(j)); for(int i=0;i<4;i++){ flag = 1 << i; printf ("Flag=%x\n", flag); HIPCHECK(hipSetDeviceFlags(flag)); //HIPCHECK_API(hipSetDeviceFlags(flag), hipErrorInvalidValue); } flag = 0; } passed(); }
void memcpytest2_sizes(size_t maxElem=0, size_t offset=0) { printSep(); printf ("test: %s<%s>\n", __func__, TYPENAME(T)); int deviceId; HIPCHECK(hipGetDevice(&deviceId)); size_t free, total; HIPCHECK(hipMemGetInfo(&free, &total)); if (maxElem == 0) { maxElem = free/sizeof(T)/5; } printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB offset=%lu\n", deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0, offset); for (size_t elem=64; elem+offset<=maxElem; elem*=2) { HIPCHECK ( hipDeviceReset() ); memcpytest2<T>(elem+offset, 0, 1, 1, 0); // unpinned host HIPCHECK ( hipDeviceReset() ); memcpytest2<T>(elem+offset, 1, 1, 1, 0); // pinned host } }
int test_gl2(size_t N) { size_t Nbytes = N*sizeof(int); int *A_d, *B_d, *C_d; int *A_h, *B_h, *C_h; HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); // Full vadd in one large chunk, to get things started: HIPCHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); HIPCHECK ( hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); hipLaunchKernel(vectorADD2, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N); HIPCHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); HIPCHECK (hipDeviceSynchronize()); HipTest::checkVectorADD(A_h, B_h, C_h, N); return 0; }
void test_manyInflightCopies(hipStream_t stream, int numElements, int numCopies, bool syncBetweenCopies) { size_t Nbytes = numElements * sizeof(T); size_t eachCopyElements = numElements / numCopies; size_t eachCopyBytes = eachCopyElements * sizeof(T); printf( "------------------------------------------------------------------------------------------" "-----\n"); printf( "testing: %s Nbytes=%zu (%6.1f MB) numCopies=%d eachCopyElements=%zu eachCopyBytes=%zu\n", __func__, Nbytes, (double)(Nbytes) / 1024.0 / 1024.0, numCopies, eachCopyElements, eachCopyBytes); T* A_d; T *A_h1, *A_h2; HIPCHECK(hipHostMalloc((void**)&A_h1, Nbytes, hipHostMallocDefault)); HIPCHECK(hipHostMalloc((void**)&A_h2, Nbytes, hipHostMallocDefault)); HIPCHECK(hipMalloc(&A_d, Nbytes)); for (int i = 0; i < numElements; i++) { A_h1[i] = 3.14f + static_cast<T>(i); } // stream=0; // fixme TODO for (int i = 0; i < numCopies; i++) { HIPASSERT(A_d + i * eachCopyElements < A_d + Nbytes); HIPCHECK(hipMemcpyAsync(&A_d[i * eachCopyElements], &A_h1[i * eachCopyElements], eachCopyBytes, hipMemcpyHostToDevice, stream)); } if (syncBetweenCopies) { HIPCHECK(hipDeviceSynchronize()); } for (int i = 0; i < numCopies; i++) { HIPASSERT(A_d + i * eachCopyElements < A_d + Nbytes); HIPCHECK(hipMemcpyAsync(&A_h2[i * eachCopyElements], &A_d[i * eachCopyElements], eachCopyBytes, hipMemcpyDeviceToHost, stream)); } HIPCHECK(hipDeviceSynchronize()); // Verify we copied back all the data correctly: for (int i = 0; i < numElements; i++) { HIPASSERT(A_h1[i] == A_h2[i]); } HIPCHECK(hipHostFree(A_h1)); HIPCHECK(hipHostFree(A_h2)); HIPCHECK(hipFree(A_d)); }
int main() { float *A, *Ad; HIPCHECK(hipHostMalloc((void**)&A, SIZE, hipHostMallocDefault)); HIPCHECK(hipMalloc((void**)&Ad, SIZE)); hipStream_t stream; HIPCHECK(hipStreamCreate(&stream)); for (int i = 0; i < SIZE; i++) { HIPCHECK(hipMemcpyAsync(Ad, A, SIZE, hipMemcpyHostToDevice, stream)); HIPCHECK(hipDeviceSynchronize()); } }
int main(int argc, char *argv[]) { HipTest::parseStandardArguments(argc, argv, true); printf ("info: set device to %d\n", p_gpuDevice); HIPCHECK(hipSetDevice(p_gpuDevice)); if (p_tests & 0x1) { printf ("\n\n=== tests&1 (types)\n"); printSep(); HIPCHECK ( hipDeviceReset() ); size_t width = N/6; size_t height = N/6; memcpy2Dtest<float>(321, 211, 0); memcpy2Dtest<double>(322, 211, 0); memcpy2Dtest<char>(320, 211, 0); memcpy2Dtest<int>(323, 211, 0); printf ("===\n\n\n"); printf ("\n\n=== tests&1 (types)\n"); printSep(); // 2D memcpyArraytest<float>(320, 211, 0, 0); memcpyArraytest<unsigned int>(322, 211, 0, 0); memcpyArraytest<int>(320, 211, 0, 0); memcpyArraytest<float>(320, 211, 0, 1); memcpyArraytest<float>(322, 211, 0, 1); memcpyArraytest<int>(320, 211, 0, 1); printSep(); // 1D memcpyArraytest<float>(320, 1, 0); memcpyArraytest<unsigned int>(322, 1, 0); memcpyArraytest<int>(320, 1, 0); printf ("===\n\n\n"); } if (p_tests & 0x4) { printf ("\n\n=== tests&4 (test sizes and offsets)\n"); printSep(); HIPCHECK ( hipDeviceReset() ); printSep(); memcpyArraytest_size<float>(0,0); printSep(); memcpyArraytest_size<float>(0,64); printSep(); memcpyArraytest_size<float>(1024*1024,13); printSep(); memcpyArraytest_size<float>(1024*1024,50); } passed(); }
int main() { int numDevices = 0; int device; HIPCHECK(hipGetDeviceCount(&numDevices)); for(int i=0;i<numDevices;i++){ HIPCHECK(hipSetDevice(i)); HIPCHECK(hipGetDevice(&device)); HIPASSERT(device == i); } passed(); }
int main() { int numDevices = 0; char name[len]; hipDevice_t device; HIPCHECK(hipGetDeviceCount(&numDevices)); for (int i = 0; i < numDevices; i++) { HIPCHECK(hipDeviceGet(&device, i)); HIPCHECK(hipDeviceGetName(name, len, device)); HIPASSERT(name != ""); } passed(); }
int main() { int numDevices = 0; size_t totMem; hipDevice_t device; HIPCHECK(hipGetDeviceCount(&numDevices)); for (int i = 0; i < numDevices; i++) { HIPCHECK(hipDeviceGet(&device, i)); HIPCHECK(hipDeviceTotalMem(&totMem, device)); HIPASSERT(totMem != 0); } passed(); }
unsigned setNumBlocks(unsigned blocksPerCU, unsigned threadsPerBlock, size_t N) { int device; HIPCHECK(hipGetDevice(&device)); hipDeviceProp_t props; HIPCHECK(hipGetDeviceProperties(&props, device)); unsigned blocks = props.multiProcessorCount * blocksPerCU; if (blocks * threadsPerBlock > N) { blocks = (N + threadsPerBlock - 1) / threadsPerBlock; } return blocks; }
int main() { int numDevices = 0; int major,minor; hipDevice_t device; HIPCHECK(hipGetDeviceCount(&numDevices)); for(int i=0;i<numDevices;i++){ HIPCHECK(hipDeviceGet(&device,i)); HIPCHECK(hipDeviceComputeCapability(&major, &minor, device)); HIPASSERT(major >= 0); HIPASSERT(minor >= 0); } passed(); }
bool testhipMemset3D(int memsetval,int p_gpuDevice) { size_t numH = 256; size_t numW = 256; size_t depth = 10; size_t width = numW * sizeof(char); size_t sizeElements = width * numH * depth; size_t elements = numW* numH* depth; printf ("testhipMemset3D memsetval=%2x device=%d\n", memsetval, p_gpuDevice); char *A_h; bool testResult = true; hipExtent extent = make_hipExtent(width, numH, depth); hipPitchedPtr devPitchedPtr; HIPCHECK(hipMalloc3D(&devPitchedPtr, extent)); A_h = (char*)malloc(sizeElements); HIPASSERT(A_h != NULL); for (size_t i=0; i<elements; i++) { A_h[i] = 1; } HIPCHECK ( hipMemset3D( devPitchedPtr, memsetval, extent) ); hipMemcpy3DParms myparms = {0}; myparms.srcPos = make_hipPos(0,0,0); myparms.dstPos = make_hipPos(0,0,0); myparms.dstPtr = make_hipPitchedPtr(A_h, width , numW, numH); myparms.srcPtr = devPitchedPtr; myparms.extent = extent; #ifdef __HIP_PLATFORM_NVCC__ myparms.kind = hipMemcpyKindToCudaMemcpyKind(hipMemcpyDeviceToHost); #else myparms.kind = hipMemcpyDeviceToHost; #endif HIPCHECK(hipMemcpy3D(&myparms)); for (int i=0; i<elements; i++) { if (A_h[i] != memsetval) { testResult = false; printf("mismatch at index:%d computed:%02x, memsetval:%02x\n", i, (int)A_h[i], (int)memsetval); break; } } HIPCHECK(hipFree(devPitchedPtr.ptr)); free(A_h); return testResult; }
void waitStreams(int iterations) { // Repeatedly sync and wait for all streams to complete. // TO make this interesting, the test has other threads repeatedly adding and removing streams to the device. for (int i=0; i<iterations; i++) { HIPCHECK(hipDeviceSynchronize()); } }
int main(int argc, char* argv[]) { HipTest::parseStandardArguments(argc, argv, false); parseMyArguments(argc, argv); printf("info: set device to %d tests=%x\n", p_gpuDevice, p_tests); HIPCHECK(hipSetDevice(p_gpuDevice)); if (p_tests & 0x01) { simpleNegTest(); } if (p_tests & 0x02) { hipStream_t stream; HIPCHECK(hipStreamCreate(&stream)); test_manyInflightCopies<float>(stream, 1024, 16, true); test_manyInflightCopies<float>( stream, 1024, 4, true); // verify we re-use the same entries instead of growing pool. test_manyInflightCopies<float>(stream, 1024 * 8, 64, false); HIPCHECK(hipStreamDestroy(stream)); } if (p_tests & 0x04) { test_chunkedAsyncExample(p_streams, true, true, true); // Easy sync version test_chunkedAsyncExample(p_streams, false, true, true); // Easy sync version test_chunkedAsyncExample(p_streams, false, false, true); // Some async test_chunkedAsyncExample(p_streams, false, false, false); // All async } if (p_tests & 0x08) { hipStream_t stream; HIPCHECK(hipStreamCreate(&stream)); // test_pingpong<int, Pinned>(stream, 1024*1024*32, 1, 1, false); // test_pingpong<int, Pinned>(stream, 1024*1024*32, 1, 10, false); HIPCHECK(hipStreamDestroy(stream)); } passed(); }
int main(int argc, char *argv[]) { HipTest::parseStandardArguments(argc, argv, true); bool testResult = false; HIPCHECK(hipSetDevice(p_gpuDevice)); testResult = testhipMemset3D(memsetval, p_gpuDevice); if (testResult) { passed(); } else { exit(EXIT_FAILURE); } }
void run1(size_t size, hipStream_t stream){ float *Ah, *Bh, *Cd, *Dd, *Eh; HIPCHECK(hipHostMalloc((void**)&Ah, size, hipHostMallocDefault)); HIPCHECK(hipHostMalloc((void**)&Bh, size, hipHostMallocDefault)); HIPCHECK(hipMalloc(&Cd, size)); HIPCHECK(hipMalloc(&Dd, size)); HIPCHECK(hipHostMalloc((void**)&Eh, size, hipHostMallocDefault)); for(int i=0;i<N;i++){ Ah[i] = 1.0f; } HIPCHECK(hipMemcpyAsync(Bh, Ah, size, hipMemcpyHostToHost, stream)); HIPCHECK(hipMemcpyAsync(Cd, Bh, size, hipMemcpyHostToDevice, stream)); hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Cd); HIPCHECK(hipMemcpyAsync(Dd, Cd, size, hipMemcpyDeviceToDevice, stream)); HIPCHECK(hipMemcpyAsync(Eh, Dd, size, hipMemcpyDeviceToHost, stream)); HIPCHECK(hipDeviceSynchronize()); HIPASSERT(Eh[10] == Ah[10] + 1.0f); }
void simpleNegTest() { printf("testing: %s\n", __func__); hipError_t e; float *A_malloc, *A_pinned, *A_d; size_t Nbytes = N * sizeof(float); A_malloc = (float*)malloc(Nbytes); HIPCHECK(hipHostMalloc((void**)&A_pinned, Nbytes, hipHostMallocDefault)); A_d = NULL; HIPCHECK(hipMalloc(&A_d, Nbytes)); HIPASSERT(A_d != NULL); // Can't use default with async copy e = hipMemcpyAsync(A_pinned, A_d, Nbytes, hipMemcpyDefault, NULL); // HIPASSERT (e == hipSuccess); // Not sure what happens here, the memory must be pinned. e = hipMemcpyAsync(A_malloc, A_d, Nbytes, hipMemcpyDeviceToHost, NULL); printf(" async memcpy of A_malloc to A_d. Result=%d\n", e); // HIPASSERT (e==hipErrorInvalidValue); }
void memcpy2Dtest(size_t numW, size_t numH, bool usePinnedHost) { size_t width = numW * sizeof(T); size_t sizeElements = width * numH; printf("memcpy2Dtest: %s<%s> size=%lu (%6.2fMB) W: %d, H:%d, usePinnedHost: %d\n", __func__, TYPENAME(T), sizeElements, sizeElements/1024.0/1024.0, (int)numW, (int)numH, usePinnedHost); T *A_d, *B_d, *C_d; T *A_h, *B_h, *C_h; size_t pitch_A, pitch_B, pitch_C; hipChannelFormatDesc desc = hipCreateChannelDesc<T>(); HipTest::initArrays2DPitch(&A_d, &B_d, &C_d, &pitch_A, &pitch_B, &pitch_C, numW, numH); HipTest::initArraysForHost(&A_h, &B_h, &C_h, numW*numH, usePinnedHost); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numW*numH); HIPCHECK (hipMemcpy2D (A_d, pitch_A, A_h, width, width, numH, hipMemcpyHostToDevice) ); HIPCHECK (hipMemcpy2D (B_d, pitch_B, B_h, width, width, numH, hipMemcpyHostToDevice) ); hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, (pitch_C/sizeof(T))*numH); HIPCHECK (hipMemcpy2D (C_h, width, C_d, pitch_C, width, numH, hipMemcpyDeviceToHost) ); HIPCHECK ( hipDeviceSynchronize() ); HipTest::checkVectorADD(A_h, B_h, C_h, numW*numH); HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, usePinnedHost); printf (" %s success\n", __func__); }
// Create a lot of streams and then destroy 'em. void createThenDestroyStreams(int iterations, int burstSize) { hipStream_t *streams = new hipStream_t[burstSize]; for (int i=0; i<iterations; i++) { if (p_verbose & 0x1) { printf ("%s iter=%d, create %d then destroy %d\n", __func__, i, burstSize, burstSize); } for (int j=0; j<burstSize; j++) { if (p_verbose & 0x2) { printf (" %d.%d streamCreate\n", i, j); } HIPCHECK( hipStreamCreate(&streams[j])); } for (int j=0; j<burstSize; j++) { if (p_verbose & 0x2) { printf (" %d.%d streamDestroy\n", i, j); } HIPCHECK( hipStreamDestroy(streams[j])); } } delete streams; }
int main(int argc, char **argv){ HipTest::parseStandardArguments(argc, argv, true); hipStream_t stream[3]; for(int i=0;i<3;i++){ HIPCHECK(hipStreamCreate(&stream[i])); } const size_t size = N * sizeof(float); std::thread t1(run1, size, stream[0]); std::thread t2(run1, size, stream[0]); std::thread t3(run, size, stream[1], stream[2]); t1.join(); // std::cout<<"T1"<<std::endl; t2.join(); // std::cout<<"T2"<<std::endl; t3.join(); passed(); }
int main(int argc, char *argv[]) { hipStream_t stream; unsigned int flags; HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamDefault)); HIPCHECK(hipStreamGetFlags(stream, &flags)); HIPASSERT(flags == 0); HIPCHECK(hipStreamDestroy(stream)); HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); HIPCHECK(hipStreamGetFlags(stream, &flags)); HIPASSERT(flags == 1); HIPCHECK(hipStreamDestroy(stream)); passed(); }
int main() { size_t Nbytes = N*sizeof(int); int numDevices = 0; int *A_d, *B_d, *C_d, *X_d, *Y_d, *Z_d; int *A_h, *B_h, *C_h ; hipStream_t s; HIPCHECK(hipGetDeviceCount(&numDevices)); if(numDevices > 1) { HIPCHECK(hipSetDevice(0)); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); HIPCHECK(hipSetDevice(1)); HIPCHECK(hipMalloc(&X_d,Nbytes)); HIPCHECK(hipMalloc(&Y_d,Nbytes)); HIPCHECK(hipMalloc(&Z_d,Nbytes)); HIPCHECK(hipSetDevice(0)); HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); hipLaunchKernel( HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, static_cast<const int*>(A_d), static_cast<const int*>(B_d), C_d, N); HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); HIPCHECK(hipDeviceSynchronize()); HipTest::checkVectorADD(A_h, B_h, C_h, N); HIPCHECK(hipStreamCreate(&s)); HIPCHECK(hipSetDevice(1)); HIPCHECK(hipMemcpyDtoDAsync((hipDeviceptr_t)X_d, (hipDeviceptr_t)A_d, Nbytes, s)); HIPCHECK(hipMemcpyDtoDAsync((hipDeviceptr_t)Y_d, (hipDeviceptr_t)B_d, Nbytes, s)); hipLaunchKernel( HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, static_cast<const int*>(X_d), static_cast<const int*>(Y_d), Z_d, N); HIPCHECK(hipMemcpyDtoHAsync(C_h, (hipDeviceptr_t)Z_d, Nbytes, s)); HIPCHECK(hipStreamSynchronize(s)); HIPCHECK(hipDeviceSynchronize()); HipTest::checkVectorADD(A_h, B_h, C_h, N); HIPCHECK(hipStreamDestroy(s)); HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); HIPCHECK(hipFree(X_d)); HIPCHECK(hipFree(Y_d)); HIPCHECK(hipFree(Z_d)); } passed(); }
int main(int argc, char *argv[]) { HipTest::parseStandardArguments(argc, argv, true); printf ("info: set device to %d\n", p_gpuDevice); HIPCHECK(hipSetDevice(p_gpuDevice)); if (p_tests & 0x1) { printf ("\n\n=== tests&1 (types and different memcpy kinds (H2D, D2H, H2H, D2D)\n"); HIPCHECK ( hipDeviceReset() ); memcpytest2_for_type<float>(N); memcpytest2_for_type<double>(N); memcpytest2_for_type<char>(N); memcpytest2_for_type<int>(N); printf ("===\n\n\n"); } if (p_tests & 0x2) { // Some tests around the 64MB boundary which have historically shown issues: printf ("\n\n=== tests&0x2 (64MB boundary)\n"); #if 0 // These all pass: memcpytest2<float>(15*1024*1024, 1, 0, 0, 0); memcpytest2<float>(16*1024*1024, 1, 0, 0, 0); memcpytest2<float>(16*1024*1024+16*1024, 1, 0, 0, 0); #endif // Just over 64MB: memcpytest2<float>(16*1024*1024+512*1024, 1, 0, 0, 0); memcpytest2<float>(17*1024*1024+1024, 1, 0, 0, 0); memcpytest2<float>(32*1024*1024, 1, 0, 0, 0); memcpytest2<float>(32*1024*1024, 0, 0, 0, 0); memcpytest2<float>(32*1024*1024, 1, 1, 1, 0); memcpytest2<float>(32*1024*1024, 1, 1, 1, 0); } if (p_tests & 0x4) { printf ("\n\n=== tests&4 (test sizes and offsets)\n"); HIPCHECK ( hipDeviceReset() ); printSep(); memcpytest2_sizes<float>(0,0); printSep(); memcpytest2_sizes<float>(0,64); printSep(); memcpytest2_sizes<float>(1024*1024, 13); printSep(); memcpytest2_sizes<float>(1024*1024, 50); } if (p_tests & 0x8) { printf ("\n\n=== tests&8\n"); HIPCHECK ( hipDeviceReset() ); printSep(); // Simplest cases: serialize the threads, and also used pinned memory: // This verifies that the sub-calls to memcpytest2 are correct. multiThread_1<float>(true, true); // Serialize, but use unpinned memory to stress the unpinned memory xfer path. multiThread_1<float>(true, false); // Remove serialization, so two threads are performing memory copies in parallel. multiThread_1<float>(false, true); // Remove serialization, and use unpinned. multiThread_1<float>(false, false); // TODO printf ("===\n\n\n"); } passed(); }
void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, SyncMode syncMode, bool expectMismatch) { // This test sends a long-running kernel to the null stream, then tests to see if the // specified synchronization technique is effective. // // Some syncMode are not expected to correctly sync (for example "syncNone"). in these // cases the test sets expectMismatch and the check logic below will attempt to ensure that // the undesired synchronization did not occur - ie ensure the kernel is still running and did // not yet update the stop event. This can be tricky since if the kernel runs fast enough it // may complete before the check. To prevent this, the addCountReverse has a count parameter // which causes it to loop repeatedly, and the results are checked in reverse order. // // Tests with expectMismatch=true should ensure the kernel finishes correctly. This results // are checked and we test to make sure stop event has completed. if (!(testMask & p_tests)) { return; } printf ("\ntest 0x%02x: syncMode=%s expectMismatch=%d\n", testMask, syncModeString(syncMode), expectMismatch); size_t sizeBytes = numElements * sizeof(int); int count =100; int init0 = 0; HIPCHECK(hipMemset(C_d, init0, sizeBytes)); for (int i=0; i<numElements; i++) { C_h[i] = -1; // initialize } hipStream_t otherStream = 0; unsigned flags = (syncMode == syncMarkerThenOtherNonBlockingStream) ? hipStreamNonBlocking : hipStreamDefault; HIPCHECK(hipStreamCreateWithFlags(&otherStream, flags)); hipEvent_t stop, otherStreamEvent; HIPCHECK(hipEventCreate(&stop)); HIPCHECK(hipEventCreate(&otherStreamEvent)); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); // Launch kernel into null stream, should result in C_h == count. hipLaunchKernelGGL( HipTest::addCountReverse, dim3(blocks), dim3(threadsPerBlock), 0, 0 /*stream*/, static_cast<const int*>(C_d), C_h, numElements, count); HIPCHECK(hipEventRecord(stop, 0/*default*/)); switch (syncMode) { case syncNone: break; case syncNullStream: HIPCHECK(hipStreamSynchronize(0)); // wait on host for null stream: break; case syncOtherStream: // Does this synchronize with the null stream? HIPCHECK(hipStreamSynchronize(otherStream)); break; case syncMarkerThenOtherStream: case syncMarkerThenOtherNonBlockingStream: // this may wait for NULL stream depending hipStreamNonBlocking flag above HIPCHECK(hipEventRecord(otherStreamEvent, otherStream)); HIPCHECK(hipStreamSynchronize(otherStream)); break; case syncDevice: HIPCHECK(hipDeviceSynchronize()); break; default: assert(0); }; hipError_t done = hipEventQuery(stop); if (expectMismatch) { assert (done == hipErrorNotReady); } else { assert (done == hipSuccess); } int mismatches = 0; int expected = init0 + count; for (int i=0; i<numElements; i++) { bool compareEqual = (C_h[i] == expected); if (!compareEqual) { mismatches ++; if (!expectMismatch) { printf ("C_h[%d] (%d) != %d\n", i, C_h[i], expected); assert(C_h[i] == expected); } } } if (expectMismatch) { assert (mismatches > 0); } HIPCHECK(hipStreamDestroy(otherStream)); HIPCHECK(hipEventDestroy(stop)); HIPCHECK(hipEventDestroy(otherStreamEvent)); HIPCHECK(hipDeviceSynchronize()); printf ("test: OK - %d mismatches (%6.2f%%)\n", mismatches, ((double)(mismatches)*100.0)/numElements); }
void test_pingpong(hipStream_t stream, size_t numElements, int numInflight, int numPongs, bool doHostSide) { HIPASSERT(numElements % numInflight == 0); // Must be evenly divisible. size_t Nbytes = numElements * sizeof(T); size_t eachCopyElements = numElements / numInflight; size_t eachCopyBytes = eachCopyElements * sizeof(T); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); printf( "------------------------------------------------------------------------------------------" "-----\n"); printf( "testing: %s<%s> Nbytes=%zu (%6.1f MB) numPongs=%d numInflight=%d eachCopyElements=%zu " "eachCopyBytes=%zu\n", __func__, HostTraits<AllocType>::Name(), Nbytes, (double)(Nbytes) / 1024.0 / 1024.0, numPongs, numInflight, eachCopyElements, eachCopyBytes); T* A_h = NULL; T* A_d = NULL; A_h = (T*)(HostTraits<AllocType>::Alloc(Nbytes)); HIPCHECK(hipMalloc(&A_d, Nbytes)); // Initialize the host array: const T initValue = 13; const T deviceConst = 2; const T hostConst = 10000; for (size_t i = 0; i < numElements; i++) { A_h[i] = initValue + i; } for (int k = 0; k < numPongs; k++) { for (int i = 0; i < numInflight; i++) { HIPASSERT(A_d + i * eachCopyElements < A_d + Nbytes); HIPCHECK(hipMemcpyAsync(&A_d[i * eachCopyElements], &A_h[i * eachCopyElements], eachCopyBytes, hipMemcpyHostToDevice, stream)); } hipLaunchKernel(addK<T>, dim3(blocks), dim3(threadsPerBlock), 0, stream, A_d, 2, numElements); for (int i = 0; i < numInflight; i++) { HIPASSERT(A_d + i * eachCopyElements < A_d + Nbytes); HIPCHECK(hipMemcpyAsync(&A_h[i * eachCopyElements], &A_d[i * eachCopyElements], eachCopyBytes, hipMemcpyDeviceToHost, stream)); } if (doHostSide) { assert(0); #if 0 hipEvent_t e; HIPCHECK(hipEventCreate(&e)); #endif HIPCHECK(hipDeviceSynchronize()); for (size_t i = 0; i < numElements; i++) { A_h[i] += hostConst; } } }; HIPCHECK(hipDeviceSynchronize()); // Verify we copied back all the data correctly: for (size_t i = 0; i < numElements; i++) { T gold = initValue + i; // Perform calcs in same order as test above to replicate FP order-of-operations: for (int k = 0; k < numPongs; k++) { gold += deviceConst; if (doHostSide) { gold += hostConst; } } if (gold != A_h[i]) { std::cout << i << ": gold=" << gold << " out=" << A_h[i] << std::endl; HIPASSERT(gold == A_h[i]); } } HIPCHECK(hipHostFree(A_h)); HIPCHECK(hipFree(A_d)); }
static void* Alloc(size_t sizeBytes) { void* p; HIPCHECK(hipHostMalloc((void**)&p, sizeBytes, hipHostMallocDefault)); return p; };
void run(size_t size, hipStream_t stream1, hipStream_t stream2){ float *Ah, *Bh, *Cd, *Dd, *Eh; float *Ahh, *Bhh, *Cdd, *Ddd, *Ehh; HIPCHECK(hipHostMalloc((void**)&Ah, size, hipHostMallocDefault)); HIPCHECK(hipHostMalloc((void**)&Bh, size, hipHostMallocDefault)); HIPCHECK(hipMalloc(&Cd, size)); HIPCHECK(hipMalloc(&Dd, size)); HIPCHECK(hipHostMalloc((void**)&Eh, size, hipHostMallocDefault)); HIPCHECK(hipHostMalloc((void**)&Ahh, size, hipHostMallocDefault)); HIPCHECK(hipHostMalloc((void**)&Bhh, size, hipHostMallocDefault)); HIPCHECK(hipMalloc(&Cdd, size)); HIPCHECK(hipMalloc(&Ddd, size)); HIPCHECK(hipHostMalloc((void**)&Ehh, size, hipHostMallocDefault)); HIPCHECK(hipMemcpyAsync(Bh, Ah, size, hipMemcpyHostToHost, stream1)); HIPCHECK(hipMemcpyAsync(Bhh, Ahh, size, hipMemcpyHostToHost, stream2)); HIPCHECK(hipMemcpyAsync(Cd, Bh, size, hipMemcpyHostToDevice, stream1)); HIPCHECK(hipMemcpyAsync(Cdd, Bhh, size, hipMemcpyHostToDevice, stream2)); hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream1, Cd); hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream2, Cdd); HIPCHECK(hipMemcpyAsync(Dd, Cd, size, hipMemcpyDeviceToDevice, stream1)); HIPCHECK(hipMemcpyAsync(Ddd, Cdd, size, hipMemcpyDeviceToDevice, stream2)); HIPCHECK(hipMemcpyAsync(Eh, Dd, size, hipMemcpyDeviceToHost, stream1)); HIPCHECK(hipMemcpyAsync(Ehh, Ddd, size, hipMemcpyDeviceToHost, stream2)); HIPCHECK(hipDeviceSynchronize()); HIPASSERT(Eh[10] = Ah[10] + 1.0f); HIPASSERT(Ehh[10] = Ahh[10] + 1.0f); }
void memcpytest2(size_t numElements, bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault) { size_t sizeElements = numElements * sizeof(T); printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d\n", __func__, TYPENAME(T), sizeElements, sizeElements/1024.0/1024.0, usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); T *A_d, *B_d, *C_d; T *A_h, *B_h, *C_h; HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, numElements, usePinnedHost); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); T *A_hh = NULL; T *B_hh = NULL; T *C_dd = NULL; if (useHostToHost) { if (usePinnedHost) { HIPCHECK ( hipHostMalloc((void**)&A_hh, sizeElements, hipHostMallocDefault) ); HIPCHECK ( hipHostMalloc((void**)&B_hh, sizeElements, hipHostMallocDefault) ); } else { A_hh = (T*)malloc(sizeElements); B_hh = (T*)malloc(sizeElements); } // Do some extra host-to-host copies here to mix things up: HIPCHECK ( hipMemcpy(A_hh, A_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); HIPCHECK ( hipMemcpy(B_hh, B_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); HIPCHECK ( hipMemcpy(A_d, A_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); HIPCHECK ( hipMemcpy(B_d, B_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); } else { HIPCHECK ( hipMemcpy(A_d, A_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); HIPCHECK ( hipMemcpy(B_d, B_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); } hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements); if (useDeviceToDevice) { HIPCHECK ( hipMalloc(&C_dd, sizeElements) ); // Do an extra device-to-device copies here to mix things up: HIPCHECK ( hipMemcpy(C_dd, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyDeviceToDevice)); //Destroy the original C_d: HIPCHECK ( hipMemset(C_d, 0x5A, sizeElements)); HIPCHECK ( hipMemcpy(C_h, C_dd, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); } else { HIPCHECK ( hipMemcpy(C_h, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); } HIPCHECK ( hipDeviceSynchronize() ); HipTest::checkVectorADD(A_h, B_h, C_h, numElements); HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, usePinnedHost); printf (" %s success\n", __func__); }
// IN: nStreams : number of streams to use for the test // IN :useNullStream - use NULL stream. Synchronizes everything. // IN: useSyncMemcpyH2D - use sync memcpy (no overlap) for H2D // IN: useSyncMemcpyD2H - use sync memcpy (no overlap) for D2H void test_chunkedAsyncExample(int nStreams, bool useNullStream, bool useSyncMemcpyH2D, bool useSyncMemcpyD2H) { size_t Nbytes = N * sizeof(int); printf("testing: %s(useNullStream=%d, useSyncMemcpyH2D=%d, useSyncMemcpyD2H=%d) ", __func__, useNullStream, useSyncMemcpyH2D, useSyncMemcpyD2H); printf("Nbytes=%zu (%6.1f MB)\n", Nbytes, (double)(Nbytes) / 1024.0 / 1024.0); int *A_d, *B_d, *C_d; int *A_h, *B_h, *C_h; HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, true); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); hipStream_t* stream = (hipStream_t*)malloc(sizeof(hipStream_t) * nStreams); if (useNullStream) { nStreams = 1; stream[0] = NULL; } else { for (int i = 0; i < nStreams; ++i) { HIPCHECK(hipStreamCreate(&stream[i])); } } size_t workLeft = N; size_t workPerStream = N / nStreams; for (int i = 0; i < nStreams; ++i) { size_t work = (workLeft < workPerStream) ? workLeft : workPerStream; size_t workBytes = work * sizeof(int); size_t offset = i * workPerStream; HIPASSERT(A_d + offset < A_d + Nbytes); HIPASSERT(B_d + offset < B_d + Nbytes); HIPASSERT(C_d + offset < C_d + Nbytes); if (useSyncMemcpyH2D) { HIPCHECK(hipMemcpy(&A_d[offset], &A_h[offset], workBytes, hipMemcpyHostToDevice)); HIPCHECK(hipMemcpy(&B_d[offset], &B_h[offset], workBytes, hipMemcpyHostToDevice)); } else { HIPCHECK(hipMemcpyAsync(&A_d[offset], &A_h[offset], workBytes, hipMemcpyHostToDevice, stream[i])); HIPCHECK(hipMemcpyAsync(&B_d[offset], &B_h[offset], workBytes, hipMemcpyHostToDevice, stream[i])); }; hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, stream[i], &A_d[offset], &B_d[offset], &C_d[offset], work); if (useSyncMemcpyD2H) { HIPCHECK(hipMemcpy(&C_h[offset], &C_d[offset], workBytes, hipMemcpyDeviceToHost)); } else { HIPCHECK(hipMemcpyAsync(&C_h[offset], &C_d[offset], workBytes, hipMemcpyDeviceToHost, stream[i])); } } HIPCHECK(hipDeviceSynchronize()); HipTest::checkVectorADD(A_h, B_h, C_h, N); HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, true); free(stream); };