void test_manyInflightCopies(hipStream_t stream, int numElements, int numCopies, bool syncBetweenCopies) { size_t Nbytes = numElements * sizeof(T); size_t eachCopyElements = numElements / numCopies; size_t eachCopyBytes = eachCopyElements * sizeof(T); printf( "------------------------------------------------------------------------------------------" "-----\n"); printf( "testing: %s Nbytes=%zu (%6.1f MB) numCopies=%d eachCopyElements=%zu eachCopyBytes=%zu\n", __func__, Nbytes, (double)(Nbytes) / 1024.0 / 1024.0, numCopies, eachCopyElements, eachCopyBytes); T* A_d; T *A_h1, *A_h2; HIPCHECK(hipHostMalloc((void**)&A_h1, Nbytes, hipHostMallocDefault)); HIPCHECK(hipHostMalloc((void**)&A_h2, Nbytes, hipHostMallocDefault)); HIPCHECK(hipMalloc(&A_d, Nbytes)); for (int i = 0; i < numElements; i++) { A_h1[i] = 3.14f + static_cast<T>(i); } // stream=0; // fixme TODO for (int i = 0; i < numCopies; i++) { HIPASSERT(A_d + i * eachCopyElements < A_d + Nbytes); HIPCHECK(hipMemcpyAsync(&A_d[i * eachCopyElements], &A_h1[i * eachCopyElements], eachCopyBytes, hipMemcpyHostToDevice, stream)); } if (syncBetweenCopies) { HIPCHECK(hipDeviceSynchronize()); } for (int i = 0; i < numCopies; i++) { HIPASSERT(A_d + i * eachCopyElements < A_d + Nbytes); HIPCHECK(hipMemcpyAsync(&A_h2[i * eachCopyElements], &A_d[i * eachCopyElements], eachCopyBytes, hipMemcpyDeviceToHost, stream)); } HIPCHECK(hipDeviceSynchronize()); // Verify we copied back all the data correctly: for (int i = 0; i < numElements; i++) { HIPASSERT(A_h1[i] == A_h2[i]); } HIPCHECK(hipHostFree(A_h1)); HIPCHECK(hipHostFree(A_h2)); HIPCHECK(hipFree(A_d)); }
void runTests(int64_t numElements) { size_t sizeBytes = numElements * sizeof(int); printf ("\n\ntest: starting sequence with sizeBytes=%zu bytes, %6.2f MB\n", sizeBytes, sizeBytes/1024.0/1024.0); int *C_h, *C_d; HIPCHECK(hipMalloc(&C_d, sizeBytes)); HIPCHECK(hipHostMalloc(&C_h, sizeBytes)); { test (0x01, C_d, C_h, numElements, syncNone, true /*expectMismatch*/); test (0x02, C_d, C_h, numElements, syncNullStream, false /*expectMismatch*/); test (0x04, C_d, C_h, numElements, syncOtherStream, true /*expectMismatch*/); test (0x08, C_d, C_h, numElements, syncDevice, false /*expectMismatch*/); // Sending a marker to to null stream may synchronize the otherStream // - other created with hipStreamNonBlocking=0 : synchronization, should match // - other created with hipStreamNonBlocking=1 : no synchronization, may mismatch test (0x10, C_d, C_h, numElements, syncMarkerThenOtherStream, false /*expectMismatch*/); // TODO - review why this test seems flaky //test (0x20, C_d, C_h, numElements, syncMarkerThenOtherNonBlockingStream, true /*expectMismatch*/); } HIPCHECK(hipFree(C_d)); HIPCHECK(hipHostFree(C_h)); }
int main(){ float *Ad, *A; hipHostMalloc((void**)&A, size); hipMalloc((void**)&Ad, size); assert(hipSuccess == hipMemcpy(Ad, A, size, hipMemcpyHostToDevice)); assert(hipSuccess == hipMemcpy(A, Ad, size, hipMemcpyDeviceToHost)); passed(); }
int main() { float *A, *Ad; HIPCHECK(hipHostMalloc((void**)&A, SIZE, hipHostMallocDefault)); HIPCHECK(hipMalloc((void**)&Ad, SIZE)); hipStream_t stream; HIPCHECK(hipStreamCreate(&stream)); for (int i = 0; i < SIZE; i++) { HIPCHECK(hipMemcpyAsync(Ad, A, SIZE, hipMemcpyHostToDevice, stream)); HIPCHECK(hipDeviceSynchronize()); } }
void run1(size_t size, hipStream_t stream){ float *Ah, *Bh, *Cd, *Dd, *Eh; HIPCHECK(hipHostMalloc((void**)&Ah, size, hipHostMallocDefault)); HIPCHECK(hipHostMalloc((void**)&Bh, size, hipHostMallocDefault)); HIPCHECK(hipMalloc(&Cd, size)); HIPCHECK(hipMalloc(&Dd, size)); HIPCHECK(hipHostMalloc((void**)&Eh, size, hipHostMallocDefault)); for(int i=0;i<N;i++){ Ah[i] = 1.0f; } HIPCHECK(hipMemcpyAsync(Bh, Ah, size, hipMemcpyHostToHost, stream)); HIPCHECK(hipMemcpyAsync(Cd, Bh, size, hipMemcpyHostToDevice, stream)); hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Cd); HIPCHECK(hipMemcpyAsync(Dd, Cd, size, hipMemcpyDeviceToDevice, stream)); HIPCHECK(hipMemcpyAsync(Eh, Dd, size, hipMemcpyDeviceToHost, stream)); HIPCHECK(hipDeviceSynchronize()); HIPASSERT(Eh[10] == Ah[10] + 1.0f); }
void run(size_t size, hipStream_t stream1, hipStream_t stream2){ float *Ah, *Bh, *Cd, *Dd, *Eh; float *Ahh, *Bhh, *Cdd, *Ddd, *Ehh; HIPCHECK(hipHostMalloc((void**)&Ah, size, hipHostMallocDefault)); HIPCHECK(hipHostMalloc((void**)&Bh, size, hipHostMallocDefault)); HIPCHECK(hipMalloc(&Cd, size)); HIPCHECK(hipMalloc(&Dd, size)); HIPCHECK(hipHostMalloc((void**)&Eh, size, hipHostMallocDefault)); HIPCHECK(hipHostMalloc((void**)&Ahh, size, hipHostMallocDefault)); HIPCHECK(hipHostMalloc((void**)&Bhh, size, hipHostMallocDefault)); HIPCHECK(hipMalloc(&Cdd, size)); HIPCHECK(hipMalloc(&Ddd, size)); HIPCHECK(hipHostMalloc((void**)&Ehh, size, hipHostMallocDefault)); HIPCHECK(hipMemcpyAsync(Bh, Ah, size, hipMemcpyHostToHost, stream1)); HIPCHECK(hipMemcpyAsync(Bhh, Ahh, size, hipMemcpyHostToHost, stream2)); HIPCHECK(hipMemcpyAsync(Cd, Bh, size, hipMemcpyHostToDevice, stream1)); HIPCHECK(hipMemcpyAsync(Cdd, Bhh, size, hipMemcpyHostToDevice, stream2)); hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream1, Cd); hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream2, Cdd); HIPCHECK(hipMemcpyAsync(Dd, Cd, size, hipMemcpyDeviceToDevice, stream1)); HIPCHECK(hipMemcpyAsync(Ddd, Cdd, size, hipMemcpyDeviceToDevice, stream2)); HIPCHECK(hipMemcpyAsync(Eh, Dd, size, hipMemcpyDeviceToHost, stream1)); HIPCHECK(hipMemcpyAsync(Ehh, Ddd, size, hipMemcpyDeviceToHost, stream2)); HIPCHECK(hipDeviceSynchronize()); HIPASSERT(Eh[10] = Ah[10] + 1.0f); HIPASSERT(Ehh[10] = Ahh[10] + 1.0f); }
void simpleNegTest() { printf("testing: %s\n", __func__); hipError_t e; float *A_malloc, *A_pinned, *A_d; size_t Nbytes = N * sizeof(float); A_malloc = (float*)malloc(Nbytes); HIPCHECK(hipHostMalloc((void**)&A_pinned, Nbytes, hipHostMallocDefault)); A_d = NULL; HIPCHECK(hipMalloc(&A_d, Nbytes)); HIPASSERT(A_d != NULL); // Can't use default with async copy e = hipMemcpyAsync(A_pinned, A_d, Nbytes, hipMemcpyDefault, NULL); // HIPASSERT (e == hipSuccess); // Not sure what happens here, the memory must be pinned. e = hipMemcpyAsync(A_malloc, A_d, Nbytes, hipMemcpyDeviceToHost, NULL); printf(" async memcpy of A_malloc to A_d. Result=%d\n", e); // HIPASSERT (e==hipErrorInvalidValue); }
//--- // TODO - remove me, this is deprecated. hipError_t hipMallocHost(void** ptr, size_t sizeBytes) { return hipHostMalloc(ptr, sizeBytes, 0); }
//--- // TODO - remove me, this is deprecated. hipError_t hipHostAlloc(void** ptr, size_t sizeBytes, unsigned int flags) { return hipHostMalloc(ptr, sizeBytes, flags); };
static void* Alloc(size_t sizeBytes) { void* p; HIPCHECK(hipHostMalloc((void**)&p, sizeBytes, hipHostMallocDefault)); return p; };
int main() { int *A, *Am, *B, *Ad, *C, *Cm; A = new int[NUM]; B = new int[NUM]; C = new int[NUM]; for(int i=0;i<NUM;i++) { A[i] = -1*i; B[i] = 0; C[i] = 0; } hipMalloc((void**)&Ad, SIZE); hipHostMalloc((void**)&Am, SIZE); hipHostMalloc((void**)&Cm, SIZE); for(int i=0;i<NUM;i++) { Am[i] = -1*i; Cm[i] = 0; } hipStream_t stream; hipStreamCreate(&stream); hipMemcpyToSymbolAsync(HIP_SYMBOL(globalIn), Am, SIZE, 0, hipMemcpyHostToDevice, stream); hipStreamSynchronize(stream); hipLaunchKernel(Assign, dim3(1,1,1), dim3(NUM,1,1), 0, 0, Ad); hipMemcpy(B, Ad, SIZE, hipMemcpyDeviceToHost); hipMemcpyFromSymbolAsync(Cm, HIP_SYMBOL(globalOut), SIZE, 0, hipMemcpyDeviceToHost, stream); hipStreamSynchronize(stream); for(int i=0;i<NUM;i++) { assert(Am[i] == B[i]); assert(Am[i] == Cm[i]); } for(int i=0;i<NUM;i++) { A[i] = -2*i; B[i] = 0; } hipMemcpyToSymbol(HIP_SYMBOL(globalIn), A, SIZE, 0, hipMemcpyHostToDevice); hipLaunchKernel(Assign, dim3(1,1,1), dim3(NUM,1,1), 0, 0, Ad); hipMemcpy(B, Ad, SIZE, hipMemcpyDeviceToHost); hipMemcpyFromSymbol(C, HIP_SYMBOL(globalOut), SIZE, 0, hipMemcpyDeviceToHost); for(int i=0;i<NUM;i++) { assert(A[i] == B[i]); assert(A[i] == C[i]); } for(int i=0;i<NUM;i++) { A[i] = -3*i; B[i] = 0; } hipMemcpyToSymbolAsync(HIP_SYMBOL(globalIn), A, SIZE, 0, hipMemcpyHostToDevice, stream); hipStreamSynchronize(stream); hipLaunchKernel(Assign, dim3(1,1,1), dim3(NUM,1,1), 0, 0, Ad); hipMemcpy(B, Ad, SIZE, hipMemcpyDeviceToHost); hipMemcpyFromSymbolAsync(C, HIP_SYMBOL(globalOut), SIZE, 0, hipMemcpyDeviceToHost, stream); hipStreamSynchronize(stream); for(int i=0;i<NUM;i++) { assert(A[i] == B[i]); assert(A[i] == C[i]); } hipHostFree(Am); hipHostFree(Cm); hipFree(Ad); delete[] A; delete[] B; delete[] C; passed(); }
void memcpytest2(size_t numElements, bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault) { size_t sizeElements = numElements * sizeof(T); printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d\n", __func__, TYPENAME(T), sizeElements, sizeElements/1024.0/1024.0, usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); T *A_d, *B_d, *C_d; T *A_h, *B_h, *C_h; HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, numElements, usePinnedHost); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); T *A_hh = NULL; T *B_hh = NULL; T *C_dd = NULL; if (useHostToHost) { if (usePinnedHost) { HIPCHECK ( hipHostMalloc((void**)&A_hh, sizeElements, hipHostMallocDefault) ); HIPCHECK ( hipHostMalloc((void**)&B_hh, sizeElements, hipHostMallocDefault) ); } else { A_hh = (T*)malloc(sizeElements); B_hh = (T*)malloc(sizeElements); } // Do some extra host-to-host copies here to mix things up: HIPCHECK ( hipMemcpy(A_hh, A_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); HIPCHECK ( hipMemcpy(B_hh, B_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); HIPCHECK ( hipMemcpy(A_d, A_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); HIPCHECK ( hipMemcpy(B_d, B_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); } else { HIPCHECK ( hipMemcpy(A_d, A_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); HIPCHECK ( hipMemcpy(B_d, B_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); } hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements); if (useDeviceToDevice) { HIPCHECK ( hipMalloc(&C_dd, sizeElements) ); // Do an extra device-to-device copies here to mix things up: HIPCHECK ( hipMemcpy(C_dd, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyDeviceToDevice)); //Destroy the original C_d: HIPCHECK ( hipMemset(C_d, 0x5A, sizeElements)); HIPCHECK ( hipMemcpy(C_h, C_dd, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); } else { HIPCHECK ( hipMemcpy(C_h, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); } HIPCHECK ( hipDeviceSynchronize() ); HipTest::checkVectorADD(A_h, B_h, C_h, numElements); HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, usePinnedHost); printf (" %s success\n", __func__); }
int main() { try { bool done = false; boost::fibers::fiber f1([&done]{ std::cout << "f1: entered" << std::endl; try { hipStream_t stream; hipStreamCreate( & stream); int size = 1024 * 1024; int full_size = 20 * size; int * host_a, * host_b, * host_c; hipHostMalloc( & host_a, full_size * sizeof( int), hipHostMallocDefault); hipHostMalloc( & host_b, full_size * sizeof( int), hipHostMallocDefault); hipHostMalloc( & host_c, full_size * sizeof( int), hipHostMallocDefault); int * dev_a, * dev_b, * dev_c; hipMalloc( & dev_a, size * sizeof( int) ); hipMalloc( & dev_b, size * sizeof( int) ); hipMalloc( & dev_c, size * sizeof( int) ); std::minstd_rand generator; std::uniform_int_distribution<> distribution(1, 6); for ( int i = 0; i < full_size; ++i) { host_a[i] = distribution( generator); host_b[i] = distribution( generator); } for ( int i = 0; i < full_size; i += size) { hipMemcpyAsync( dev_a, host_a + i, size * sizeof( int), hipMemcpyHostToDevice, stream); hipMemcpyAsync( dev_b, host_b + i, size * sizeof( int), hipMemcpyHostToDevice, stream); hipLaunchKernel( vector_add, dim3(size / 256), dim3(256), 0, stream, dev_a, dev_b, dev_c, size); hipMemcpyAsync( host_c + i, dev_c, size * sizeof( int), hipMemcpyDeviceToHost, stream); } auto result = boost::fibers::hip::waitfor_all( stream); BOOST_ASSERT( stream == std::get< 0 >( result) ); BOOST_ASSERT( hipSuccess == std::get< 1 >( result) ); std::cout << "f1: GPU computation finished" << std::endl; hipHostFree( host_a); hipHostFree( host_b); hipHostFree( host_c); hipFree( dev_a); hipFree( dev_b); hipFree( dev_c); hipStreamDestroy( stream); done = true; } catch ( std::exception const& ex) { std::cerr << "exception: " << ex.what() << std::endl; } std::cout << "f1: leaving" << std::endl; }); boost::fibers::fiber f2([&done]{ std::cout << "f2: entered" << std::endl; while ( ! done) { std::cout << "f2: sleeping" << std::endl; boost::this_fiber::sleep_for( std::chrono::milliseconds( 1 ) ); } std::cout << "f2: leaving" << std::endl; }); f1.join(); f2.join(); std::cout << "done." << std::endl; return EXIT_SUCCESS; } catch ( std::exception const& e) { std::cerr << "exception: " << e.what() << std::endl; } catch (...) { std::cerr << "unhandled exception" << std::endl; } return EXIT_FAILURE; }