void run(size_t size, hipStream_t stream1, hipStream_t stream2){ float *Ah, *Bh, *Cd, *Dd, *Eh; float *Ahh, *Bhh, *Cdd, *Ddd, *Ehh; HIPCHECK(hipHostMalloc((void**)&Ah, size, hipHostMallocDefault)); HIPCHECK(hipHostMalloc((void**)&Bh, size, hipHostMallocDefault)); HIPCHECK(hipMalloc(&Cd, size)); HIPCHECK(hipMalloc(&Dd, size)); HIPCHECK(hipHostMalloc((void**)&Eh, size, hipHostMallocDefault)); HIPCHECK(hipHostMalloc((void**)&Ahh, size, hipHostMallocDefault)); HIPCHECK(hipHostMalloc((void**)&Bhh, size, hipHostMallocDefault)); HIPCHECK(hipMalloc(&Cdd, size)); HIPCHECK(hipMalloc(&Ddd, size)); HIPCHECK(hipHostMalloc((void**)&Ehh, size, hipHostMallocDefault)); HIPCHECK(hipMemcpyAsync(Bh, Ah, size, hipMemcpyHostToHost, stream1)); HIPCHECK(hipMemcpyAsync(Bhh, Ahh, size, hipMemcpyHostToHost, stream2)); HIPCHECK(hipMemcpyAsync(Cd, Bh, size, hipMemcpyHostToDevice, stream1)); HIPCHECK(hipMemcpyAsync(Cdd, Bhh, size, hipMemcpyHostToDevice, stream2)); hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream1, Cd); hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream2, Cdd); HIPCHECK(hipMemcpyAsync(Dd, Cd, size, hipMemcpyDeviceToDevice, stream1)); HIPCHECK(hipMemcpyAsync(Ddd, Cdd, size, hipMemcpyDeviceToDevice, stream2)); HIPCHECK(hipMemcpyAsync(Eh, Dd, size, hipMemcpyDeviceToHost, stream1)); HIPCHECK(hipMemcpyAsync(Ehh, Ddd, size, hipMemcpyDeviceToHost, stream2)); HIPCHECK(hipDeviceSynchronize()); HIPASSERT(Eh[10] = Ah[10] + 1.0f); HIPASSERT(Ehh[10] = Ahh[10] + 1.0f); }
void test_manyInflightCopies(hipStream_t stream, int numElements, int numCopies, bool syncBetweenCopies) { size_t Nbytes = numElements * sizeof(T); size_t eachCopyElements = numElements / numCopies; size_t eachCopyBytes = eachCopyElements * sizeof(T); printf( "------------------------------------------------------------------------------------------" "-----\n"); printf( "testing: %s Nbytes=%zu (%6.1f MB) numCopies=%d eachCopyElements=%zu eachCopyBytes=%zu\n", __func__, Nbytes, (double)(Nbytes) / 1024.0 / 1024.0, numCopies, eachCopyElements, eachCopyBytes); T* A_d; T *A_h1, *A_h2; HIPCHECK(hipHostMalloc((void**)&A_h1, Nbytes, hipHostMallocDefault)); HIPCHECK(hipHostMalloc((void**)&A_h2, Nbytes, hipHostMallocDefault)); HIPCHECK(hipMalloc(&A_d, Nbytes)); for (int i = 0; i < numElements; i++) { A_h1[i] = 3.14f + static_cast<T>(i); } // stream=0; // fixme TODO for (int i = 0; i < numCopies; i++) { HIPASSERT(A_d + i * eachCopyElements < A_d + Nbytes); HIPCHECK(hipMemcpyAsync(&A_d[i * eachCopyElements], &A_h1[i * eachCopyElements], eachCopyBytes, hipMemcpyHostToDevice, stream)); } if (syncBetweenCopies) { HIPCHECK(hipDeviceSynchronize()); } for (int i = 0; i < numCopies; i++) { HIPASSERT(A_d + i * eachCopyElements < A_d + Nbytes); HIPCHECK(hipMemcpyAsync(&A_h2[i * eachCopyElements], &A_d[i * eachCopyElements], eachCopyBytes, hipMemcpyDeviceToHost, stream)); } HIPCHECK(hipDeviceSynchronize()); // Verify we copied back all the data correctly: for (int i = 0; i < numElements; i++) { HIPASSERT(A_h1[i] == A_h2[i]); } HIPCHECK(hipHostFree(A_h1)); HIPCHECK(hipHostFree(A_h2)); HIPCHECK(hipFree(A_d)); }
int main() { int numDevices = 0; int major,minor; hipDevice_t device; HIPCHECK(hipGetDeviceCount(&numDevices)); for(int i=0;i<numDevices;i++){ HIPCHECK(hipDeviceGet(&device,i)); HIPCHECK(hipDeviceComputeCapability(&major, &minor, device)); HIPASSERT(major >= 0); HIPASSERT(minor >= 0); } passed(); }
int main(int argc, char *argv[]) { hipStream_t stream; unsigned int flags; HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamDefault)); HIPCHECK(hipStreamGetFlags(stream, &flags)); HIPASSERT(flags == 0); HIPCHECK(hipStreamDestroy(stream)); HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); HIPCHECK(hipStreamGetFlags(stream, &flags)); HIPASSERT(flags == 1); HIPCHECK(hipStreamDestroy(stream)); passed(); }
int main() { int numDevices = 0; int device; HIPCHECK(hipGetDeviceCount(&numDevices)); for(int i=0;i<numDevices;i++){ HIPCHECK(hipSetDevice(i)); HIPCHECK(hipGetDevice(&device)); HIPASSERT(device == i); } passed(); }
int main() { int numDevices = 0; char name[len]; hipDevice_t device; HIPCHECK(hipGetDeviceCount(&numDevices)); for (int i = 0; i < numDevices; i++) { HIPCHECK(hipDeviceGet(&device, i)); HIPCHECK(hipDeviceGetName(name, len, device)); HIPASSERT(name != ""); } passed(); }
int main() { int numDevices = 0; size_t totMem; hipDevice_t device; HIPCHECK(hipGetDeviceCount(&numDevices)); for (int i = 0; i < numDevices; i++) { HIPCHECK(hipDeviceGet(&device, i)); HIPCHECK(hipDeviceTotalMem(&totMem, device)); HIPASSERT(totMem != 0); } passed(); }
bool testhipMemset3D(int memsetval,int p_gpuDevice) { size_t numH = 256; size_t numW = 256; size_t depth = 10; size_t width = numW * sizeof(char); size_t sizeElements = width * numH * depth; size_t elements = numW* numH* depth; printf ("testhipMemset3D memsetval=%2x device=%d\n", memsetval, p_gpuDevice); char *A_h; bool testResult = true; hipExtent extent = make_hipExtent(width, numH, depth); hipPitchedPtr devPitchedPtr; HIPCHECK(hipMalloc3D(&devPitchedPtr, extent)); A_h = (char*)malloc(sizeElements); HIPASSERT(A_h != NULL); for (size_t i=0; i<elements; i++) { A_h[i] = 1; } HIPCHECK ( hipMemset3D( devPitchedPtr, memsetval, extent) ); hipMemcpy3DParms myparms = {0}; myparms.srcPos = make_hipPos(0,0,0); myparms.dstPos = make_hipPos(0,0,0); myparms.dstPtr = make_hipPitchedPtr(A_h, width , numW, numH); myparms.srcPtr = devPitchedPtr; myparms.extent = extent; #ifdef __HIP_PLATFORM_NVCC__ myparms.kind = hipMemcpyKindToCudaMemcpyKind(hipMemcpyDeviceToHost); #else myparms.kind = hipMemcpyDeviceToHost; #endif HIPCHECK(hipMemcpy3D(&myparms)); for (int i=0; i<elements; i++) { if (A_h[i] != memsetval) { testResult = false; printf("mismatch at index:%d computed:%02x, memsetval:%02x\n", i, (int)A_h[i], (int)memsetval); break; } } HIPCHECK(hipFree(devPitchedPtr.ptr)); free(A_h); return testResult; }
void run1(size_t size, hipStream_t stream){ float *Ah, *Bh, *Cd, *Dd, *Eh; HIPCHECK(hipHostMalloc((void**)&Ah, size, hipHostMallocDefault)); HIPCHECK(hipHostMalloc((void**)&Bh, size, hipHostMallocDefault)); HIPCHECK(hipMalloc(&Cd, size)); HIPCHECK(hipMalloc(&Dd, size)); HIPCHECK(hipHostMalloc((void**)&Eh, size, hipHostMallocDefault)); for(int i=0;i<N;i++){ Ah[i] = 1.0f; } HIPCHECK(hipMemcpyAsync(Bh, Ah, size, hipMemcpyHostToHost, stream)); HIPCHECK(hipMemcpyAsync(Cd, Bh, size, hipMemcpyHostToDevice, stream)); hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Cd); HIPCHECK(hipMemcpyAsync(Dd, Cd, size, hipMemcpyDeviceToDevice, stream)); HIPCHECK(hipMemcpyAsync(Eh, Dd, size, hipMemcpyDeviceToHost, stream)); HIPCHECK(hipDeviceSynchronize()); HIPASSERT(Eh[10] == Ah[10] + 1.0f); }
void simpleNegTest() { printf("testing: %s\n", __func__); hipError_t e; float *A_malloc, *A_pinned, *A_d; size_t Nbytes = N * sizeof(float); A_malloc = (float*)malloc(Nbytes); HIPCHECK(hipHostMalloc((void**)&A_pinned, Nbytes, hipHostMallocDefault)); A_d = NULL; HIPCHECK(hipMalloc(&A_d, Nbytes)); HIPASSERT(A_d != NULL); // Can't use default with async copy e = hipMemcpyAsync(A_pinned, A_d, Nbytes, hipMemcpyDefault, NULL); // HIPASSERT (e == hipSuccess); // Not sure what happens here, the memory must be pinned. e = hipMemcpyAsync(A_malloc, A_d, Nbytes, hipMemcpyDeviceToHost, NULL); printf(" async memcpy of A_malloc to A_d. Result=%d\n", e); // HIPASSERT (e==hipErrorInvalidValue); }
void test_pingpong(hipStream_t stream, size_t numElements, int numInflight, int numPongs, bool doHostSide) { HIPASSERT(numElements % numInflight == 0); // Must be evenly divisible. size_t Nbytes = numElements * sizeof(T); size_t eachCopyElements = numElements / numInflight; size_t eachCopyBytes = eachCopyElements * sizeof(T); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); printf( "------------------------------------------------------------------------------------------" "-----\n"); printf( "testing: %s<%s> Nbytes=%zu (%6.1f MB) numPongs=%d numInflight=%d eachCopyElements=%zu " "eachCopyBytes=%zu\n", __func__, HostTraits<AllocType>::Name(), Nbytes, (double)(Nbytes) / 1024.0 / 1024.0, numPongs, numInflight, eachCopyElements, eachCopyBytes); T* A_h = NULL; T* A_d = NULL; A_h = (T*)(HostTraits<AllocType>::Alloc(Nbytes)); HIPCHECK(hipMalloc(&A_d, Nbytes)); // Initialize the host array: const T initValue = 13; const T deviceConst = 2; const T hostConst = 10000; for (size_t i = 0; i < numElements; i++) { A_h[i] = initValue + i; } for (int k = 0; k < numPongs; k++) { for (int i = 0; i < numInflight; i++) { HIPASSERT(A_d + i * eachCopyElements < A_d + Nbytes); HIPCHECK(hipMemcpyAsync(&A_d[i * eachCopyElements], &A_h[i * eachCopyElements], eachCopyBytes, hipMemcpyHostToDevice, stream)); } hipLaunchKernel(addK<T>, dim3(blocks), dim3(threadsPerBlock), 0, stream, A_d, 2, numElements); for (int i = 0; i < numInflight; i++) { HIPASSERT(A_d + i * eachCopyElements < A_d + Nbytes); HIPCHECK(hipMemcpyAsync(&A_h[i * eachCopyElements], &A_d[i * eachCopyElements], eachCopyBytes, hipMemcpyDeviceToHost, stream)); } if (doHostSide) { assert(0); #if 0 hipEvent_t e; HIPCHECK(hipEventCreate(&e)); #endif HIPCHECK(hipDeviceSynchronize()); for (size_t i = 0; i < numElements; i++) { A_h[i] += hostConst; } } }; HIPCHECK(hipDeviceSynchronize()); // Verify we copied back all the data correctly: for (size_t i = 0; i < numElements; i++) { T gold = initValue + i; // Perform calcs in same order as test above to replicate FP order-of-operations: for (int k = 0; k < numPongs; k++) { gold += deviceConst; if (doHostSide) { gold += hostConst; } } if (gold != A_h[i]) { std::cout << i << ": gold=" << gold << " out=" << A_h[i] << std::endl; HIPASSERT(gold == A_h[i]); } } HIPCHECK(hipHostFree(A_h)); HIPCHECK(hipFree(A_d)); }
// IN: nStreams : number of streams to use for the test // IN :useNullStream - use NULL stream. Synchronizes everything. // IN: useSyncMemcpyH2D - use sync memcpy (no overlap) for H2D // IN: useSyncMemcpyD2H - use sync memcpy (no overlap) for D2H void test_chunkedAsyncExample(int nStreams, bool useNullStream, bool useSyncMemcpyH2D, bool useSyncMemcpyD2H) { size_t Nbytes = N * sizeof(int); printf("testing: %s(useNullStream=%d, useSyncMemcpyH2D=%d, useSyncMemcpyD2H=%d) ", __func__, useNullStream, useSyncMemcpyH2D, useSyncMemcpyD2H); printf("Nbytes=%zu (%6.1f MB)\n", Nbytes, (double)(Nbytes) / 1024.0 / 1024.0); int *A_d, *B_d, *C_d; int *A_h, *B_h, *C_h; HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, true); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); hipStream_t* stream = (hipStream_t*)malloc(sizeof(hipStream_t) * nStreams); if (useNullStream) { nStreams = 1; stream[0] = NULL; } else { for (int i = 0; i < nStreams; ++i) { HIPCHECK(hipStreamCreate(&stream[i])); } } size_t workLeft = N; size_t workPerStream = N / nStreams; for (int i = 0; i < nStreams; ++i) { size_t work = (workLeft < workPerStream) ? workLeft : workPerStream; size_t workBytes = work * sizeof(int); size_t offset = i * workPerStream; HIPASSERT(A_d + offset < A_d + Nbytes); HIPASSERT(B_d + offset < B_d + Nbytes); HIPASSERT(C_d + offset < C_d + Nbytes); if (useSyncMemcpyH2D) { HIPCHECK(hipMemcpy(&A_d[offset], &A_h[offset], workBytes, hipMemcpyHostToDevice)); HIPCHECK(hipMemcpy(&B_d[offset], &B_h[offset], workBytes, hipMemcpyHostToDevice)); } else { HIPCHECK(hipMemcpyAsync(&A_d[offset], &A_h[offset], workBytes, hipMemcpyHostToDevice, stream[i])); HIPCHECK(hipMemcpyAsync(&B_d[offset], &B_h[offset], workBytes, hipMemcpyHostToDevice, stream[i])); }; hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, stream[i], &A_d[offset], &B_d[offset], &C_d[offset], work); if (useSyncMemcpyD2H) { HIPCHECK(hipMemcpy(&C_h[offset], &C_d[offset], workBytes, hipMemcpyDeviceToHost)); } else { HIPCHECK(hipMemcpyAsync(&C_h[offset], &C_d[offset], workBytes, hipMemcpyDeviceToHost, stream[i])); } } HIPCHECK(hipDeviceSynchronize()); HipTest::checkVectorADD(A_h, B_h, C_h, N); HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, true); free(stream); };
void memcpyArraytest(size_t numW, size_t numH, bool usePinnedHost, bool usePitch=false) { size_t width = numW * sizeof(T); size_t sizeElements = width * numH; printf("memcpyArraytest: %s<%s> size=%lu (%6.2fMB) W: %d, H: %d, usePinnedHost: %d, usePitch: %d\n", __func__, TYPENAME(T), sizeElements, sizeElements/1024.0/1024.0, (int)numW, (int)numH, usePinnedHost, usePitch); hipArray *A_d, *B_d, *C_d; T *A_h, *B_h, *C_h; // 1D if ((numW >= 1) && (numH == 1)) { hipChannelFormatDesc desc = hipCreateChannelDesc<T>(); HipTest::initHIPArrays(&A_d, &B_d, &C_d, &desc, numW, 1, 0); HipTest::initArraysForHost(&A_h, &B_h, &C_h, numW*numH, usePinnedHost); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numW*numH); HIPCHECK (hipMemcpyToArray (A_d, 0, 0, (void *)A_h, width, hipMemcpyHostToDevice) ); HIPCHECK (hipMemcpyToArray (B_d, 0, 0, (void *)B_h, width, hipMemcpyHostToDevice) ); hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, (T*)A_d->data, (T*)B_d->data, (T*)C_d->data, numW); HIPCHECK (hipMemcpy (C_h, C_d->data, width, hipMemcpyDeviceToHost) ); HIPCHECK ( hipDeviceSynchronize() ); HipTest::checkVectorADD(A_h, B_h, C_h, numW); } // 2D else if ((numW >= 1) && (numH >= 1)) { hipChannelFormatDesc desc = hipCreateChannelDesc<T>(); HipTest::initHIPArrays(&A_d, &B_d, &C_d, &desc, numW, numH, 0); HipTest::initArraysForHost(&A_h, &B_h, &C_h, numW*numH, usePinnedHost); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numW*numH); if (usePitch) { T *A_p, *B_p, *C_p; size_t pitch_A, pitch_B, pitch_C; HipTest::initArrays2DPitch(&A_p, &B_p, &C_p, &pitch_A, &pitch_B, &pitch_C, numW, numH); HIPCHECK (hipMemcpy2D (A_p, pitch_A, A_h, width, width, numH, hipMemcpyHostToDevice) ); HIPCHECK (hipMemcpy2D (B_p, pitch_B, B_h, width, width, numH, hipMemcpyHostToDevice) ); HIPCHECK (hipMemcpy2DToArray (A_d, 0, 0, (void *)A_p, pitch_A, width, numH, hipMemcpyDeviceToDevice) ); HIPCHECK (hipMemcpy2DToArray (B_d, 0, 0, (void *)B_p, pitch_B, width, numH, hipMemcpyDeviceToDevice) ); hipFree(A_p); hipFree(B_p); hipFree(C_p); } else { HIPCHECK (hipMemcpy2DToArray (A_d, 0, 0, (void *)A_h, width, width, numH, hipMemcpyHostToDevice) ); HIPCHECK (hipMemcpy2DToArray (B_d, 0, 0, (void *)B_h, width, width, numH, hipMemcpyHostToDevice) ); } hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, (T*)A_d->data, (T*)B_d->data, (T*)C_d->data, numW*numH); HIPCHECK (hipMemcpy2D ((void*)C_h, width, (void*)C_d->data, width, width, numH, hipMemcpyDeviceToHost) ); HIPCHECK ( hipDeviceSynchronize() ); HipTest::checkVectorADD(A_h, B_h, C_h, numW*numH); } // Unknown else { HIPASSERT("Incompatible dimensions" && 0); } hipFreeArray(A_d); hipFreeArray(B_d); hipFreeArray(C_d); HipTest::freeArraysForHost(A_h, B_h, C_h, usePinnedHost); printf (" %s success\n", __func__); }