// Splits an H2D then D2H round trip of numElements T's into numCopies async
// chunks on 'stream', optionally synchronizing the device between the two
// phases, then verifies the data survived the round trip.
//
//   stream            - stream used for all async copies (may be the null stream)
//   numElements       - total element count (numCopies must divide it sensibly)
//   numCopies         - number of chunks each direction is split into
//   syncBetweenCopies - if true, hipDeviceSynchronize() between H2D and D2H
void test_manyInflightCopies(hipStream_t stream, int numElements, int numCopies, bool syncBetweenCopies) {
    size_t Nbytes = numElements * sizeof(T);
    size_t eachCopyElements = numElements / numCopies;
    size_t eachCopyBytes = eachCopyElements * sizeof(T);

    printf(
        "------------------------------------------------------------------------------------------"
        "-----\n");
    printf(
        "testing: %s Nbytes=%zu (%6.1f MB) numCopies=%d eachCopyElements=%zu eachCopyBytes=%zu\n",
        __func__, Nbytes, (double)(Nbytes) / 1024.0 / 1024.0, numCopies, eachCopyElements,
        eachCopyBytes);

    T* A_d;
    T *A_h1, *A_h2;  // pinned host buffers so the async copies can truly overlap
    HIPCHECK(hipHostMalloc((void**)&A_h1, Nbytes, hipHostMallocDefault));
    HIPCHECK(hipHostMalloc((void**)&A_h2, Nbytes, hipHostMallocDefault));
    HIPCHECK(hipMalloc(&A_d, Nbytes));

    for (int i = 0; i < numElements; i++) {
        A_h1[i] = 3.14f + static_cast<T>(i);
    }

    // stream=0; // fixme TODO
    for (int i = 0; i < numCopies; i++) {
        // Bug fix: the old check compared pointers against 'A_d + Nbytes', which
        // advances a T* by Nbytes *elements*, not bytes, so the assert was
        // meaningless. Bound the element index directly instead.
        HIPASSERT(i * eachCopyElements < (size_t)numElements);
        HIPCHECK(hipMemcpyAsync(&A_d[i * eachCopyElements], &A_h1[i * eachCopyElements],
                                eachCopyBytes, hipMemcpyHostToDevice, stream));
    }

    if (syncBetweenCopies) {
        HIPCHECK(hipDeviceSynchronize());
    }

    for (int i = 0; i < numCopies; i++) {
        HIPASSERT(i * eachCopyElements < (size_t)numElements);  // see bound note above
        HIPCHECK(hipMemcpyAsync(&A_h2[i * eachCopyElements], &A_d[i * eachCopyElements],
                                eachCopyBytes, hipMemcpyDeviceToHost, stream));
    }

    HIPCHECK(hipDeviceSynchronize());

    // Verify we copied back all the data correctly:
    for (int i = 0; i < numElements; i++) {
        HIPASSERT(A_h1[i] == A_h2[i]);
    }

    HIPCHECK(hipHostFree(A_h1));
    HIPCHECK(hipHostFree(A_h2));
    HIPCHECK(hipFree(A_d));
}
// Multi-device test: runs vectorADD on device 0 as a baseline, then feeds
// device 1 via async device-to-device copies of device 0's inputs and repeats
// the computation there. Silently passes when fewer than two devices exist.
int main() {
    size_t Nbytes = N * sizeof(int);
    int numDevices = 0;
    int *A_d, *B_d, *C_d, *X_d, *Y_d, *Z_d;  // device buffers: dev0 (A,B,C), dev1 (X,Y,Z)
    int *A_h, *B_h, *C_h;                    // host-side buffers
    hipStream_t s;
    HIPCHECK(hipGetDeviceCount(&numDevices));
    if (numDevices > 1) {  // the D2D leg needs at least two devices
        HIPCHECK(hipSetDevice(0));
        unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
        HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
        // Allocate device 1's buffers up front.
        HIPCHECK(hipSetDevice(1));
        HIPCHECK(hipMalloc(&X_d, Nbytes));
        HIPCHECK(hipMalloc(&Y_d, Nbytes));
        HIPCHECK(hipMalloc(&Z_d, Nbytes));
        // Baseline vector add on device 0.
        HIPCHECK(hipSetDevice(0));
        HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
        HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                           static_cast<const int*>(A_d), static_cast<const int*>(B_d), C_d, N);
        HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
        HIPCHECK(hipDeviceSynchronize());
        HipTest::checkVectorADD(A_h, B_h, C_h, N);
        // Repeat on device 1, sourcing inputs through async D2D copies on stream s.
        HIPCHECK(hipSetDevice(1));
        HIPCHECK(hipStreamCreate(&s));
        HIPCHECK(hipMemcpyDtoDAsync((hipDeviceptr_t)X_d, (hipDeviceptr_t)A_d, Nbytes, s));
        HIPCHECK(hipMemcpyDtoDAsync((hipDeviceptr_t)Y_d, (hipDeviceptr_t)B_d, Nbytes, s));
        // NOTE(review): this launch goes to the NULL stream while the D2D copies
        // above use 's' — appears to rely on implicit null-stream serialization;
        // confirm that is intended.
        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                           static_cast<const int*>(X_d), static_cast<const int*>(Y_d), Z_d, N);
        HIPCHECK(hipMemcpyDtoHAsync(C_h, (hipDeviceptr_t)Z_d, Nbytes, s));
        HIPCHECK(hipStreamSynchronize(s));
        HIPCHECK(hipDeviceSynchronize());
        HipTest::checkVectorADD(A_h, B_h, C_h, N);
        // Cleanup.
        HIPCHECK(hipStreamDestroy(s));
        HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
        HIPCHECK(hipFree(X_d));
        HIPCHECK(hipFree(Y_d));
        HIPCHECK(hipFree(Z_d));
    }
    passed();
}
// Single-chunk sanity test: copies both inputs up, runs vectorADD2 over all N
// elements in one launch, copies the result back, and checks it. Returns 0.
int test_gl2(size_t N) {
    size_t Nbytes = N*sizeof(int);

    int *A_d, *B_d, *C_d;
    int *A_h, *B_h, *C_h;
    HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N);
    unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);

    // Full vadd in one large chunk, to get things started:
    HIPCHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
    HIPCHECK ( hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));

    hipLaunchKernel(vectorADD2, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N);

    HIPCHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
    HIPCHECK (hipDeviceSynchronize());
    HipTest::checkVectorADD(A_h, B_h, C_h, N);
    // NOTE(review): arrays are not freed here — presumably released by the
    // caller or process teardown; confirm.
    return 0;
}
void run(size_t size, hipStream_t stream1, hipStream_t stream2){ float *Ah, *Bh, *Cd, *Dd, *Eh; float *Ahh, *Bhh, *Cdd, *Ddd, *Ehh; HIPCHECK(hipHostMalloc((void**)&Ah, size, hipHostMallocDefault)); HIPCHECK(hipHostMalloc((void**)&Bh, size, hipHostMallocDefault)); HIPCHECK(hipMalloc(&Cd, size)); HIPCHECK(hipMalloc(&Dd, size)); HIPCHECK(hipHostMalloc((void**)&Eh, size, hipHostMallocDefault)); HIPCHECK(hipHostMalloc((void**)&Ahh, size, hipHostMallocDefault)); HIPCHECK(hipHostMalloc((void**)&Bhh, size, hipHostMallocDefault)); HIPCHECK(hipMalloc(&Cdd, size)); HIPCHECK(hipMalloc(&Ddd, size)); HIPCHECK(hipHostMalloc((void**)&Ehh, size, hipHostMallocDefault)); HIPCHECK(hipMemcpyAsync(Bh, Ah, size, hipMemcpyHostToHost, stream1)); HIPCHECK(hipMemcpyAsync(Bhh, Ahh, size, hipMemcpyHostToHost, stream2)); HIPCHECK(hipMemcpyAsync(Cd, Bh, size, hipMemcpyHostToDevice, stream1)); HIPCHECK(hipMemcpyAsync(Cdd, Bhh, size, hipMemcpyHostToDevice, stream2)); hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream1, Cd); hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream2, Cdd); HIPCHECK(hipMemcpyAsync(Dd, Cd, size, hipMemcpyDeviceToDevice, stream1)); HIPCHECK(hipMemcpyAsync(Ddd, Cdd, size, hipMemcpyDeviceToDevice, stream2)); HIPCHECK(hipMemcpyAsync(Eh, Dd, size, hipMemcpyDeviceToHost, stream1)); HIPCHECK(hipMemcpyAsync(Ehh, Ddd, size, hipMemcpyDeviceToHost, stream2)); HIPCHECK(hipDeviceSynchronize()); HIPASSERT(Eh[10] = Ah[10] + 1.0f); HIPASSERT(Ehh[10] = Ahh[10] + 1.0f); }
void waitStreams(int iterations) { // Repeatedly sync and wait for all streams to complete. // TO make this interesting, the test has other threads repeatedly adding and removing streams to the device. for (int i=0; i<iterations; i++) { HIPCHECK(hipDeviceSynchronize()); } }
int main() { float *A, *Ad; HIPCHECK(hipHostMalloc((void**)&A, SIZE, hipHostMallocDefault)); HIPCHECK(hipMalloc((void**)&Ad, SIZE)); hipStream_t stream; HIPCHECK(hipStreamCreate(&stream)); for (int i = 0; i < SIZE; i++) { HIPCHECK(hipMemcpyAsync(Ad, A, SIZE, hipMemcpyHostToDevice, stream)); HIPCHECK(hipDeviceSynchronize()); } }
// Issues one small launch of benchmark_func<short, ...> on a reduced grid so
// the later timed benchmark runs do not pay first-launch overheads.
//   cd   - device scratch buffer (reinterpreted as short*)
//   size - total element count used to derive the reduced grid size
void runbench_warmup(double *cd, long size){
    // Shrink the grid: each thread covers UNROLLED_MEMORY_ACCESSES elements, and
    // the warm-up only needs 1/32 of the full working set.
    const long reduced_grid_size = size/(UNROLLED_MEMORY_ACCESSES)/32;
    const int BLOCK_SIZE = 256;
    const int TOTAL_REDUCED_BLOCKS = reduced_grid_size/BLOCK_SIZE;

    dim3 dimBlock(BLOCK_SIZE, 1, 1);
    dim3 dimReducedGrid(TOTAL_REDUCED_BLOCKS, 1, 1);

    hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func< short, BLOCK_SIZE, 0 >), dim3(dimReducedGrid), dim3(dimBlock ), 0, 0, (short)1, (short*)cd);
    CUDA_SAFE_CALL( hipGetLastError() );
    CUDA_SAFE_CALL( hipDeviceSynchronize() );
}
int main() { setup(); int *A, *Ad; for(int i=0; i<NUM_SIZE; i++) { A = (int*)malloc(size[i]); valSet(A, 1, size[i]); hipMalloc(&Ad, size[i]); std::cout<<"Malloc success at size: "<<size[i]<<std::endl; for(int j=0; j<NUM_ITER; j++) { std::cout<<"\r"<<"Iter: "<<j; hipMemcpy(Ad, A, size[i], hipMemcpyHostToDevice); } std::cout<<std::endl; hipDeviceSynchronize(); } }
int main(){ setup(); int *A, *Ad; for(int i=0;i<NUM_SIZE;i++){ A = (int*)malloc(size[i]); valSet(A, 1, size[i]); hipMalloc(&Ad, size[i]); std::cout<<"Malloc success at size: "<<size[i]<<std::endl; for(int j=0;j<NUM_ITER;j++){ std::cout<<"Iter: "<<j<<std::endl; hipMemcpy(Ad, A, size[i], hipMemcpyHostToDevice); hipLaunchKernel(Add, dim3(1), dim3(size[i]/sizeof(int)), 0, 0, Ad); hipMemcpy(A, Ad, size[i], hipMemcpyDeviceToHost); } hipDeviceSynchronize(); } }
int main() { setup(); int *A, *Ad; for (int i = 0; i < NUM_SIZE; i++) { std::cout << size[i] << std::endl; A = (int*)malloc(size[i]); valSet(A, 1, size[i]); hipMalloc(&Ad, size[i]); std::cout << "Malloc success at size: " << size[i] << std::endl; clock_t start, end; start = clock(); for (int j = 0; j < NUM_ITER; j++) { // std::cout<<"At iter: "<<j<<std::endl; hipMemcpy(Ad, A, size[i], hipMemcpyHostToDevice); } hipDeviceSynchronize(); end = clock(); double uS = (double)(end - start) * 1000 / (NUM_ITER * CLOCKS_PER_SEC); std::cout << uS << std::endl; } }
// Single-stream pipeline test: H2H copy -> H2D copy -> Inc kernel -> D2D copy
// -> D2H copy, all on 'stream', then verify one element saw the increment.
// 'size' is the buffer size in bytes; the data is seeded with 1.0f.
void run1(size_t size, hipStream_t stream){
    float *Ah, *Bh, *Cd, *Dd, *Eh;  // one buffer per pipeline stage
    HIPCHECK(hipHostMalloc((void**)&Ah, size, hipHostMallocDefault));
    HIPCHECK(hipHostMalloc((void**)&Bh, size, hipHostMallocDefault));
    HIPCHECK(hipMalloc(&Cd, size));
    HIPCHECK(hipMalloc(&Dd, size));
    HIPCHECK(hipHostMalloc((void**)&Eh, size, hipHostMallocDefault));
    for(int i=0;i<N;i++){
        Ah[i] = 1.0f;
    }
    HIPCHECK(hipMemcpyAsync(Bh, Ah, size, hipMemcpyHostToHost, stream));
    HIPCHECK(hipMemcpyAsync(Cd, Bh, size, hipMemcpyHostToDevice, stream));
    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Cd);
    HIPCHECK(hipMemcpyAsync(Dd, Cd, size, hipMemcpyDeviceToDevice, stream));
    HIPCHECK(hipMemcpyAsync(Eh, Dd, size, hipMemcpyDeviceToHost, stream));
    HIPCHECK(hipDeviceSynchronize());
    // Element 10 must have been incremented exactly once by the kernel.
    HIPASSERT(Eh[10] == Ah[10] + 1.0f);
    // NOTE(review): the five buffers are not freed here — confirm the caller or
    // teardown releases them.
}
// Tests hipMemcpy2D with pitched device buffers: copies numW x numH elements of
// T up, runs vectorADD over the full pitched extent, copies back, and checks.
//   numW/numH     - logical width (elements) and height of the 2D region
//   usePinnedHost - allocate host arrays pinned vs pageable
void memcpy2Dtest(size_t numW, size_t numH, bool usePinnedHost) {
    size_t width = numW * sizeof(T);       // row width in bytes
    size_t sizeElements = width * numH;    // total bytes (name is historical)
    printf("memcpy2Dtest: %s<%s> size=%lu (%6.2fMB) W: %d, H:%d, usePinnedHost: %d\n", __func__, TYPENAME(T), sizeElements, sizeElements/1024.0/1024.0, (int)numW, (int)numH, usePinnedHost);

    T *A_d, *B_d, *C_d;
    T *A_h, *B_h, *C_h;
    size_t pitch_A, pitch_B, pitch_C;  // device row pitches chosen by the runtime

    hipChannelFormatDesc desc = hipCreateChannelDesc<T>();  // NOTE(review): unused here
    HipTest::initArrays2DPitch(&A_d, &B_d, &C_d, &pitch_A, &pitch_B, &pitch_C, numW, numH);
    HipTest::initArraysForHost(&A_h, &B_h, &C_h, numW*numH, usePinnedHost);
    unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numW*numH);

    HIPCHECK (hipMemcpy2D (A_d, pitch_A, A_h, width, width, numH, hipMemcpyHostToDevice) );
    HIPCHECK (hipMemcpy2D (B_d, pitch_B, B_h, width, width, numH, hipMemcpyHostToDevice) );

    // The kernel runs over the full pitched extent, padding included.
    hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, (pitch_C/sizeof(T))*numH);

    HIPCHECK (hipMemcpy2D (C_h, width, C_d, pitch_C, width, numH, hipMemcpyDeviceToHost) );

    HIPCHECK ( hipDeviceSynchronize() );
    HipTest::checkVectorADD(A_h, B_h, C_h, numW*numH);

    HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, usePinnedHost);
    printf ("  %s success\n", __func__);
}
// Dispatch-latency micro-benchmark: times kernel launches under eight
// combinations of {null stream / created stream} x {sync per-iter / sync after
// loop / no wait}, recording each figure in its own ResultDatabase slot.
int main(){
    hipError_t err;
    float *A, *Ad;
    A = new float[LEN];
    for(int i=0;i<LEN;i++){
        A[i] = 1.0f;
    }
    hipStream_t stream;
    err = hipStreamCreate(&stream);
    check("Creating stream",err);

    err = hipMalloc(&Ad, SIZE);
    check("Allocating Ad memory on device", err);
    err = hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
    check("Doing memory copy from A to Ad", err);

    float mS = 0;
    hipEvent_t start, stop;
    hipEventCreate(&start);
    hipEventCreate(&stop);
    ResultDatabase resultDB[8];  // one slot per measurement below

    // (1) First kernel launch on the null stream (includes one-time init costs).
    // NOTE(review): no hipEventSynchronize(stop) before hipEventElapsedTime here
    // (and in several measurements below) — the stop event may not have
    // completed; confirm whether the returned times are valid.
    hipEventRecord(start);
    hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, 0, Ad);
    hipEventRecord(stop);
    hipEventElapsedTime(&mS, start, stop);
    resultDB[0].AddResult(std::string("First Kernel Launch"), "", "uS", mS*1000);
    // std::cout<<"First Kernel Launch: \t\t"<<mS*1000<<" uS"<<std::endl;
    resultDB[0].DumpSummary(std::cout);

    // (2) Second launch — steady-state single-dispatch cost.
    hipEventRecord(start);
    hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, 0, Ad);
    hipEventRecord(stop);
    hipEventElapsedTime(&mS, start, stop);
    resultDB[1].AddResult(std::string("Second Kernel Launch"), "", "uS", mS*1000);
    // std::cout<<"Second Kernel Launch: \t\t"<<mS*1000<<" uS"<<std::endl;
    resultDB[1].DumpSummary(std::cout);

    // (3) Null stream, one device sync after the whole dispatch loop.
    hipEventRecord(start);
    for(int i=0;i<ITER;i++){
        hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, 0, Ad);
    }
    hipDeviceSynchronize();
    hipEventRecord(stop);
    hipEventElapsedTime(&mS, start, stop);
    resultDB[2].AddResult(std::string("NULL Stream Sync dispatch wait"), "", "uS", mS*1000/ITER);
    resultDB[2].DumpSummary(std::cout);
    // std::cout<<"NULL Stream Sync dispatch wait: \t"<<mS*1000/ITER<<" uS"<<std::endl;

    // (4) Null stream, stop recorded before the sync (async dispatch measure).
    hipDeviceSynchronize();
    hipEventRecord(start);
    for(int i=0;i<ITER;i++){
        hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, 0, Ad);
    }
    hipEventRecord(stop);
    hipDeviceSynchronize();
    hipEventElapsedTime(&mS, start, stop);
    resultDB[3].AddResult(std::string("NULL Stream Async dispatch wait"), "", "uS", mS*1000/ITER);
    resultDB[3].DumpSummary(std::cout);
    // std::cout<<"NULL Stream Async dispatch wait: \t"<<mS*1000/ITER<<" uS"<<std::endl;

    // (5) Created stream, device sync after every single launch.
    hipDeviceSynchronize();
    hipEventRecord(start);
    for(int i=0;i<ITER;i++){
        hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, stream, Ad);
        hipDeviceSynchronize();
    }
    hipEventRecord(stop);
    hipEventElapsedTime(&mS, start, stop);
    resultDB[4].AddResult(std::string("Stream Sync dispatch wait"), "", "uS", mS*1000/ITER);
    resultDB[4].DumpSummary(std::cout);
    // std::cout<<"Stream Sync dispatch wait: \t\t"<<mS*1000/ITER<<" uS"<<std::endl;

    // (6) Created stream, one device sync after the whole dispatch loop.
    hipDeviceSynchronize();
    hipEventRecord(start);
    for(int i=0;i<ITER;i++){
        hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, stream, Ad);
    }
    hipDeviceSynchronize();
    hipEventRecord(stop);
    hipEventElapsedTime(&mS, start, stop);
    resultDB[5].AddResult(std::string("Stream Async dispatch wait"), "", "uS", mS*1000/ITER);
    // std::cout<<"Stream Async dispatch wait: \t\t"<<mS*1000/ITER<<" uS"<<std::endl;
    resultDB[5].DumpSummary(std::cout);

    // (7) Null stream, no wait at all — pure dispatch cost.
    hipDeviceSynchronize();
    hipEventRecord(start);
    for(int i=0;i<ITER;i++){
        hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, 0, Ad);
    }
    hipEventRecord(stop);
    hipEventElapsedTime(&mS, start, stop);
    resultDB[6].AddResult(std::string("NULL Stream No Wait"), "", "uS", mS*1000/ITER);
    resultDB[6].DumpSummary(std::cout);
    // std::cout<<"NULL Stream Dispatch No Wait: \t\t"<<mS*1000/ITER<<" uS"<<std::endl;

    // (8) Created stream, no wait — pure dispatch cost.
    hipDeviceSynchronize();
    hipEventRecord(start);
    for(int i=0;i<ITER;i++){
        hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, stream, Ad);
    }
    hipEventRecord(stop);
    hipEventElapsedTime(&mS, start, stop);
    resultDB[7].AddResult(std::string("Stream Dispatch No Wait"), "", "uS", mS*1000/ITER);
    resultDB[7].DumpSummary(std::cout);
    // std::cout<<"Stream Dispatch No Wait: \t\t"<<mS*1000/ITER<<" uS"<<std::endl;

    // NOTE(review): A (new[]), Ad, the stream, and both events are never
    // released — presumably acceptable for a benchmark's process exit; confirm.
    hipDeviceSynchronize();
}
int main(){ hipLaunchKernel(HIP_KERNEL_NAME(Empty), dim3(1), dim3(1), 0, 0, 0); hipDeviceSynchronize(); passed(); }
// Ping-pong test: each "pong" streams the buffer H2D in numInflight chunks,
// adds a constant on the device, and streams it back; optionally the host adds
// its own constant between pongs. Verification replays the same arithmetic.
//   stream      - stream for all async copies and the kernel
//   numElements - total elements; must be divisible by numInflight
//   numInflight - number of concurrent chunk copies per direction
//   numPongs    - number of device(+host) round trips
//   doHostSide  - also mutate on the host between pongs (currently disabled)
void test_pingpong(hipStream_t stream, size_t numElements, int numInflight, int numPongs, bool doHostSide) {
    HIPASSERT(numElements % numInflight == 0);  // Must be evenly divisible.
    size_t Nbytes = numElements * sizeof(T);
    size_t eachCopyElements = numElements / numInflight;
    size_t eachCopyBytes = eachCopyElements * sizeof(T);

    unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);

    printf(
        "------------------------------------------------------------------------------------------"
        "-----\n");
    printf(
        "testing: %s<%s> Nbytes=%zu (%6.1f MB) numPongs=%d numInflight=%d eachCopyElements=%zu "
        "eachCopyBytes=%zu\n",
        __func__, HostTraits<AllocType>::Name(), Nbytes, (double)(Nbytes) / 1024.0 / 1024.0,
        numPongs, numInflight, eachCopyElements, eachCopyBytes);

    T* A_h = NULL;
    T* A_d = NULL;

    // Host allocation strategy is selected by the AllocType template parameter.
    A_h = (T*)(HostTraits<AllocType>::Alloc(Nbytes));
    HIPCHECK(hipMalloc(&A_d, Nbytes));

    // Initialize the host array:
    const T initValue = 13;
    const T deviceConst = 2;      // must match the literal passed to addK below
    const T hostConst = 10000;
    for (size_t i = 0; i < numElements; i++) {
        A_h[i] = initValue + i;
    }

    for (int k = 0; k < numPongs; k++) {
        for (int i = 0; i < numInflight; i++) {
            // NOTE(review): 'A_d + Nbytes' advances the T* by Nbytes elements,
            // not bytes, so this bound is looser than intended — confirm.
            HIPASSERT(A_d + i * eachCopyElements < A_d + Nbytes);
            HIPCHECK(hipMemcpyAsync(&A_d[i * eachCopyElements], &A_h[i * eachCopyElements],
                                    eachCopyBytes, hipMemcpyHostToDevice, stream));
        }

        // Device-side add of deviceConst (passed as the literal 2).
        hipLaunchKernel(addK<T>, dim3(blocks), dim3(threadsPerBlock), 0, stream, A_d, 2,
                        numElements);

        for (int i = 0; i < numInflight; i++) {
            HIPASSERT(A_d + i * eachCopyElements < A_d + Nbytes);  // see note above
            HIPCHECK(hipMemcpyAsync(&A_h[i * eachCopyElements], &A_d[i * eachCopyElements],
                                    eachCopyBytes, hipMemcpyDeviceToHost, stream));
        }

        if (doHostSide) {
            // Deliberate trap: the host-side path is not yet supported.
            assert(0);
#if 0
            hipEvent_t e;
            HIPCHECK(hipEventCreate(&e));
#endif
            HIPCHECK(hipDeviceSynchronize());
            for (size_t i = 0; i < numElements; i++) {
                A_h[i] += hostConst;
            }
        }
    };

    HIPCHECK(hipDeviceSynchronize());

    // Verify we copied back all the data correctly:
    for (size_t i = 0; i < numElements; i++) {
        T gold = initValue + i;
        // Perform calcs in same order as test above to replicate FP order-of-operations:
        for (int k = 0; k < numPongs; k++) {
            gold += deviceConst;
            if (doHostSide) {
                gold += hostConst;
            }
        }

        if (gold != A_h[i]) {
            std::cout << i << ": gold=" << gold << " out=" << A_h[i] << std::endl;
            HIPASSERT(gold == A_h[i]);
        }
    }

    // NOTE(review): A_h came from HostTraits<AllocType>::Alloc but is released
    // with hipHostFree — confirm that is correct for every AllocType.
    HIPCHECK(hipHostFree(A_h));
    HIPCHECK(hipFree(A_d));
}
// IN: nStreams : number of streams to use for the test // IN :useNullStream - use NULL stream. Synchronizes everything. // IN: useSyncMemcpyH2D - use sync memcpy (no overlap) for H2D // IN: useSyncMemcpyD2H - use sync memcpy (no overlap) for D2H void test_chunkedAsyncExample(int nStreams, bool useNullStream, bool useSyncMemcpyH2D, bool useSyncMemcpyD2H) { size_t Nbytes = N * sizeof(int); printf("testing: %s(useNullStream=%d, useSyncMemcpyH2D=%d, useSyncMemcpyD2H=%d) ", __func__, useNullStream, useSyncMemcpyH2D, useSyncMemcpyD2H); printf("Nbytes=%zu (%6.1f MB)\n", Nbytes, (double)(Nbytes) / 1024.0 / 1024.0); int *A_d, *B_d, *C_d; int *A_h, *B_h, *C_h; HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, true); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); hipStream_t* stream = (hipStream_t*)malloc(sizeof(hipStream_t) * nStreams); if (useNullStream) { nStreams = 1; stream[0] = NULL; } else { for (int i = 0; i < nStreams; ++i) { HIPCHECK(hipStreamCreate(&stream[i])); } } size_t workLeft = N; size_t workPerStream = N / nStreams; for (int i = 0; i < nStreams; ++i) { size_t work = (workLeft < workPerStream) ? 
workLeft : workPerStream; size_t workBytes = work * sizeof(int); size_t offset = i * workPerStream; HIPASSERT(A_d + offset < A_d + Nbytes); HIPASSERT(B_d + offset < B_d + Nbytes); HIPASSERT(C_d + offset < C_d + Nbytes); if (useSyncMemcpyH2D) { HIPCHECK(hipMemcpy(&A_d[offset], &A_h[offset], workBytes, hipMemcpyHostToDevice)); HIPCHECK(hipMemcpy(&B_d[offset], &B_h[offset], workBytes, hipMemcpyHostToDevice)); } else { HIPCHECK(hipMemcpyAsync(&A_d[offset], &A_h[offset], workBytes, hipMemcpyHostToDevice, stream[i])); HIPCHECK(hipMemcpyAsync(&B_d[offset], &B_h[offset], workBytes, hipMemcpyHostToDevice, stream[i])); }; hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, stream[i], &A_d[offset], &B_d[offset], &C_d[offset], work); if (useSyncMemcpyD2H) { HIPCHECK(hipMemcpy(&C_h[offset], &C_d[offset], workBytes, hipMemcpyDeviceToHost)); } else { HIPCHECK(hipMemcpyAsync(&C_h[offset], &C_d[offset], workBytes, hipMemcpyDeviceToHost, stream[i])); } } HIPCHECK(hipDeviceSynchronize()); HipTest::checkVectorADD(A_h, B_h, C_h, N); HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, true); free(stream); };
// Exercises hipArray copies: for a 1D region uses hipMemcpyToArray; for 2D uses
// hipMemcpy2DToArray, optionally staging through pitched device buffers first.
// Runs vectorADD on the array contents and verifies the result either way.
//   numW/numH     - logical width (elements) and height
//   usePinnedHost - pinned vs pageable host arrays
//   usePitch      - (2D only) stage through pitched device memory with D2D copies
void memcpyArraytest(size_t numW, size_t numH, bool usePinnedHost, bool usePitch=false) {
    size_t width = numW * sizeof(T);     // row width in bytes
    size_t sizeElements = width * numH;  // total bytes (name is historical)
    printf("memcpyArraytest: %s<%s> size=%lu (%6.2fMB) W: %d, H: %d, usePinnedHost: %d, usePitch: %d\n", __func__, TYPENAME(T), sizeElements, sizeElements/1024.0/1024.0, (int)numW, (int)numH, usePinnedHost, usePitch);

    hipArray *A_d, *B_d, *C_d;
    T *A_h, *B_h, *C_h;

    // 1D
    if ((numW >= 1) && (numH == 1)) {
        hipChannelFormatDesc desc = hipCreateChannelDesc<T>();
        HipTest::initHIPArrays(&A_d, &B_d, &C_d, &desc, numW, 1, 0);
        HipTest::initArraysForHost(&A_h, &B_h, &C_h, numW*numH, usePinnedHost);

        unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numW*numH);

        HIPCHECK (hipMemcpyToArray (A_d, 0, 0, (void *)A_h, width, hipMemcpyHostToDevice) );
        HIPCHECK (hipMemcpyToArray (B_d, 0, 0, (void *)B_h, width, hipMemcpyHostToDevice) );

        // The kernel reads the arrays' backing storage directly via ->data.
        hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, (T*)A_d->data, (T*)B_d->data, (T*)C_d->data, numW);

        HIPCHECK (hipMemcpy (C_h, C_d->data, width, hipMemcpyDeviceToHost) );

        HIPCHECK ( hipDeviceSynchronize() );
        HipTest::checkVectorADD(A_h, B_h, C_h, numW);
    }
    // 2D
    else if ((numW >= 1) && (numH >= 1)) {
        hipChannelFormatDesc desc = hipCreateChannelDesc<T>();
        HipTest::initHIPArrays(&A_d, &B_d, &C_d, &desc, numW, numH, 0);
        HipTest::initArraysForHost(&A_h, &B_h, &C_h, numW*numH, usePinnedHost);

        unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numW*numH);

        if (usePitch) {
            // Stage host data into pitched device buffers, then D2D into the arrays.
            T *A_p, *B_p, *C_p;
            size_t pitch_A, pitch_B, pitch_C;

            HipTest::initArrays2DPitch(&A_p, &B_p, &C_p, &pitch_A, &pitch_B, &pitch_C, numW, numH);
            HIPCHECK (hipMemcpy2D (A_p, pitch_A, A_h, width, width, numH, hipMemcpyHostToDevice) );
            HIPCHECK (hipMemcpy2D (B_p, pitch_B, B_h, width, width, numH, hipMemcpyHostToDevice) );

            HIPCHECK (hipMemcpy2DToArray (A_d, 0, 0, (void *)A_p, pitch_A, width, numH, hipMemcpyDeviceToDevice) );
            HIPCHECK (hipMemcpy2DToArray (B_d, 0, 0, (void *)B_p, pitch_B, width, numH, hipMemcpyDeviceToDevice) );

            hipFree(A_p);
            hipFree(B_p);
            hipFree(C_p);
        } else {
            HIPCHECK (hipMemcpy2DToArray (A_d, 0, 0, (void *)A_h, width, width, numH, hipMemcpyHostToDevice) );
            HIPCHECK (hipMemcpy2DToArray (B_d, 0, 0, (void *)B_h, width, width, numH, hipMemcpyHostToDevice) );
        }

        hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, (T*)A_d->data, (T*)B_d->data, (T*)C_d->data, numW*numH);

        HIPCHECK (hipMemcpy2D ((void*)C_h, width, (void*)C_d->data, width, width, numH, hipMemcpyDeviceToHost) );

        HIPCHECK ( hipDeviceSynchronize() );
        HipTest::checkVectorADD(A_h, B_h, C_h, numW*numH);
    }
    // Unknown
    else {
        HIPASSERT("Incompatible dimensions" && 0);
    }

    hipFreeArray(A_d);
    hipFreeArray(B_d);
    hipFreeArray(C_d);
    HipTest::freeArraysForHost(A_h, B_h, C_h, usePinnedHost);

    printf ("  %s success\n", __func__);
}
// Homework driver: parses the input/output/reference filenames (and optional
// per-pixel / global error tolerances), runs the student's RGBA-to-greyscale
// kernel, times it, writes both the computed and reference images, and
// compares them.
int main(int argc, char **argv) {
    uchar4        *h_rgbaImage, *d_rgbaImage;
    unsigned char *h_greyImage, *d_greyImage;

    std::string input_file;
    std::string output_file;
    std::string reference_file;
    double perPixelError = 0.0;
    double globalError   = 0.0;
    bool useEpsCheck = false;
    // Argument forms: <in> | <in> <out> | <in> <out> <ref> | <in> <out> <ref> <perPixErr> <globErr>
    switch (argc)
    {
        case 2:
            input_file = std::string(argv[1]);
            output_file = "HW1_output.png";
            reference_file = "HW1_reference.png";
            break;
        case 3:
            input_file  = std::string(argv[1]);
            output_file = std::string(argv[2]);
            reference_file = "HW1_reference.png";
            break;
        case 4:
            input_file  = std::string(argv[1]);
            output_file = std::string(argv[2]);
            reference_file = std::string(argv[3]);
            break;
        case 6:
            useEpsCheck=true;
            input_file  = std::string(argv[1]);
            output_file = std::string(argv[2]);
            reference_file = std::string(argv[3]);
            perPixelError = atof(argv[4]);
            globalError   = atof(argv[5]);
            break;
        default:
            std::cerr << "Usage: ./HW1 input_file [output_filename] [reference_filename] [perPixelError] [globalError]" << std::endl;
            exit(1);
    }
    //load the image and give us our input and output pointers
    preProcess(&h_rgbaImage, &h_greyImage, &d_rgbaImage, &d_greyImage, input_file);

    GpuTimer timer;
    timer.Start();
    //call the students' code
    your_rgba_to_greyscale(h_rgbaImage, d_rgbaImage, d_greyImage, numRows(), numCols());
    timer.Stop();
    hipDeviceSynchronize();
    checkCudaErrors(hipGetLastError());

    int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());

    if (err < 0) {
        //Couldn't print! Probably the student closed stdout - bad news
        std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
        exit(1);
    }

    size_t numPixels = numRows()*numCols();
    checkCudaErrors(hipMemcpy(h_greyImage, d_greyImage, sizeof(unsigned char) * numPixels, hipMemcpyDeviceToHost));

    //check results and output the grey image
    postProcess(output_file, h_greyImage);

    // The reference run reuses h_greyImage, overwriting the student's result
    // after it has already been written to output_file.
    referenceCalculation(h_rgbaImage, h_greyImage, numRows(), numCols());

    postProcess(reference_file, h_greyImage);

    //generateReferenceImage(input_file, reference_file);
    compareImages(reference_file, output_file, useEpsCheck, perPixelError,
                  globalError);

    cleanup();

    return 0;
}
void memcpytest2(size_t numElements, bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault) { size_t sizeElements = numElements * sizeof(T); printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d\n", __func__, TYPENAME(T), sizeElements, sizeElements/1024.0/1024.0, usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); T *A_d, *B_d, *C_d; T *A_h, *B_h, *C_h; HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, numElements, usePinnedHost); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); T *A_hh = NULL; T *B_hh = NULL; T *C_dd = NULL; if (useHostToHost) { if (usePinnedHost) { HIPCHECK ( hipHostMalloc((void**)&A_hh, sizeElements, hipHostMallocDefault) ); HIPCHECK ( hipHostMalloc((void**)&B_hh, sizeElements, hipHostMallocDefault) ); } else { A_hh = (T*)malloc(sizeElements); B_hh = (T*)malloc(sizeElements); } // Do some extra host-to-host copies here to mix things up: HIPCHECK ( hipMemcpy(A_hh, A_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); HIPCHECK ( hipMemcpy(B_hh, B_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); HIPCHECK ( hipMemcpy(A_d, A_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); HIPCHECK ( hipMemcpy(B_d, B_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); } else { HIPCHECK ( hipMemcpy(A_d, A_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); HIPCHECK ( hipMemcpy(B_d, B_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); } hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements); if (useDeviceToDevice) { HIPCHECK ( hipMalloc(&C_dd, sizeElements) ); // Do an extra device-to-device copies here to mix things up: HIPCHECK ( hipMemcpy(C_dd, C_d, sizeElements, useMemkindDefault? 
hipMemcpyDefault : hipMemcpyDeviceToDevice)); //Destroy the original C_d: HIPCHECK ( hipMemset(C_d, 0x5A, sizeElements)); HIPCHECK ( hipMemcpy(C_h, C_dd, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); } else { HIPCHECK ( hipMemcpy(C_h, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); } HIPCHECK ( hipDeviceSynchronize() ); HipTest::checkVectorADD(A_h, B_h, C_h, numElements); HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, usePinnedHost); printf (" %s success\n", __func__); }
void no_cache(float *A_h, float *A_d, float *X_h, float *X_d, float *Y_h, float *Y_d, size_t NUM_ROW, int p=0) { if(p) printf ("info: allocate host mem (%6.2f KB)\n", NUM_COLUMN*NUM_ROW*sizeof(float)/1024.0); A_h = (float*)malloc(NUM_ROW * NUM_COLUMN * sizeof(float)); CHECK(A_h == 0 ? hipErrorMemoryAllocation : hipSuccess ); X_h = (float*)malloc(NUM_COLUMN * sizeof(float)); CHECK(X_h == 0 ? hipErrorMemoryAllocation : hipSuccess ); Y_h = (float*)malloc(NUM_ROW * sizeof(float)); CHECK(Y_h == 0 ? hipErrorMemoryAllocation : hipSuccess ); // Fill with Phi + i for (size_t i=0; i<NUM_ROW * NUM_COLUMN; i++) { A_h[i] = 1.618f + (i % NB_X); } for (size_t i=0; i< NUM_COLUMN; i++) { X_h[i] = 1.618f + i; } if(p) printf ("info: allocate device mem (%6.2f KB)\n", NUM_ROW * NUM_COLUMN * sizeof(float)/1024.0); CHECK(hipMalloc(&A_d, NUM_ROW * NUM_COLUMN * sizeof(float))); CHECK(hipMalloc(&X_d, NUM_COLUMN * sizeof(float))); CHECK(hipMalloc(&Y_d, NUM_ROW * sizeof(float))); if(p) printf ("info: copy Host2Device\n"); CHECK ( hipMemcpy(A_d, A_h, NUM_ROW * NUM_COLUMN * sizeof(float), hipMemcpyHostToDevice)); CHECK ( hipMemcpy(X_d, X_h, NUM_COLUMN * sizeof(float), hipMemcpyHostToDevice)); const unsigned blocks = (NUM_ROW -1)/NB_X + 1; const unsigned threadsPerBlock = NB_X; if(p) printf ("info: launch 'gemv_kernel' kernel\n"); for(int i=1 ; i < 1e3; i*=2) { size_t num_row = NB_X * i; clock_t t; t = clock(); double time; time = rocblas_wtime(); hipLaunchKernelGGL(HIP_KERNEL_NAME(gemv_kernel), dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, X_d, Y_d, num_row); //time = rocblas_wtime() - time; hipDeviceSynchronize(); t = clock() - t; time = ((double)t)/CLOCKS_PER_SEC*1000 ; if(p) printf ("It took me %d clicks (%f miliseconds).\n",t, time); printf ("Row = %d, It took me (%f milliseconds), Gflops=%f\n",time, num_row, 2*num_row*NUM_COLUMN/(time)/10e6); } /* if(p) printf ("info: copy Device2Host\n"); CHECK ( hipMemcpy(Y_h, Y_d, NUM_ROW * sizeof(float), hipMemcpyDeviceToHost)); if(p) printf ("info: 
check result\n"); for (size_t i=0; i<NUM_ROW; i++) { float res = 0; for(int j=0; j<NUM_COLUMN; j++){ res += A_h[i + j * NUM_ROW] * X_h[j]; } if (Y_h[i] != res) { printf("i=%d, CPU result=%f, GPU result=%f\n", i, res, Y_h[i]); //CHECK(hipErrorUnknown); } } */ if(p) printf ("PASSED!\n"); hipFree(A_d); hipFree(Y_d); hipFree(X_d); free(A_h); free(Y_h); free(X_h); }
// CPU Timer(in millisecond): double rocblas_wtime( void ){ hipDeviceSynchronize(); struct timeval tv; gettimeofday(&tv, NULL); return (tv.tv_sec * 1000) + tv.tv_usec /1000; };
extern "C" void mixbenchGPU(double *c, long size){ const char *benchtype = "compute with global memory (block strided)"; printf("Trade-off type: %s\n", benchtype); double *cd; CUDA_SAFE_CALL( hipMalloc((void**)&cd, size*sizeof(double)) ); // Copy data to device memory CUDA_SAFE_CALL( hipMemset(cd, 0, size*sizeof(double)) ); // initialize to zeros // Synchronize in order to wait for memory operations to finish CUDA_SAFE_CALL( hipDeviceSynchronize() ); printf("---------------------------------------------------------- CSV data ----------------------------------------------------------\n"); printf("Experiment ID, Single Precision ops,,,, Double precision ops,,,, Integer operations,,, \n"); printf("Compute iters, Flops/byte, ex.time, GFLOPS, GB/sec, Flops/byte, ex.time, GFLOPS, GB/sec, Iops/byte, ex.time, GIOPS, GB/sec\n"); runbench_warmup(cd, size); runbench<32>(cd, size); runbench<31>(cd, size); runbench<30>(cd, size); runbench<29>(cd, size); runbench<28>(cd, size); runbench<27>(cd, size); runbench<26>(cd, size); runbench<25>(cd, size); runbench<24>(cd, size); runbench<23>(cd, size); runbench<22>(cd, size); runbench<21>(cd, size); runbench<20>(cd, size); runbench<19>(cd, size); runbench<18>(cd, size); runbench<17>(cd, size); runbench<16>(cd, size); runbench<15>(cd, size); runbench<14>(cd, size); runbench<13>(cd, size); runbench<12>(cd, size); runbench<11>(cd, size); runbench<10>(cd, size); runbench<9>(cd, size); runbench<8>(cd, size); runbench<7>(cd, size); runbench<6>(cd, size); runbench<5>(cd, size); runbench<4>(cd, size); runbench<3>(cd, size); runbench<2>(cd, size); runbench<1>(cd, size); runbench<0>(cd, size); printf("---------------------------------------------------------- CSV data ----------------------------------------------------------\n"); // Copy results back to host memory CUDA_SAFE_CALL( hipMemcpy(c, cd, size*sizeof(double), hipMemcpyDeviceToHost) ); CUDA_SAFE_CALL( hipFree(cd) ); CUDA_SAFE_CALL( hipDeviceReset() ); }
// Verifies which synchronization techniques actually wait on null-stream work.
//   testMask       - bit tested against the global p_tests filter; skips if unset
//   C_d/C_h        - device/host buffers the kernel fills
//   numElements    - element count of both buffers
//   syncMode       - synchronization technique under test
//   expectMismatch - true when syncMode should NOT synchronize with the null stream
void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, SyncMode syncMode, bool expectMismatch) {
    // This test sends a long-running kernel to the null stream, then tests to see if the
    // specified synchronization technique is effective.
    //
    // Some syncMode are not expected to correctly sync (for example "syncNone").  in these
    // cases the test sets expectMismatch and the check logic below will attempt to ensure that
    // the undesired synchronization did not occur - ie ensure the kernel is still running and did
    // not yet update the stop event.  This can be tricky since if the kernel runs fast enough it
    // may complete before the check.  To prevent this, the addCountReverse has a count parameter
    // which causes it to loop repeatedly, and the results are checked in reverse order.
    //
    // Tests with expectMismatch=true should ensure the kernel finishes correctly. This results
    // are checked and we test to make sure stop event has completed.

    if (!(testMask & p_tests)) {
        return;
    }
    printf ("\ntest 0x%02x: syncMode=%s expectMismatch=%d\n", testMask, syncModeString(syncMode), expectMismatch);

    size_t sizeBytes = numElements * sizeof(int);
    int count =100;  // kernel loop count; keeps the kernel busy long enough to observe
    int init0 = 0;
    HIPCHECK(hipMemset(C_d, init0, sizeBytes));
    for (int i=0; i<numElements; i++) {
        C_h[i] = -1; // initialize
    }

    hipStream_t otherStream = 0;
    // Non-blocking streams opt out of implicit null-stream synchronization.
    unsigned flags = (syncMode == syncMarkerThenOtherNonBlockingStream) ?  hipStreamNonBlocking : hipStreamDefault;
    HIPCHECK(hipStreamCreateWithFlags(&otherStream, flags));
    hipEvent_t stop, otherStreamEvent;
    HIPCHECK(hipEventCreate(&stop));
    HIPCHECK(hipEventCreate(&otherStreamEvent));

    unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);
    // Launch kernel into null stream, should result in C_h == count.
    hipLaunchKernelGGL( HipTest::addCountReverse, dim3(blocks), dim3(threadsPerBlock), 0, 0 /*stream*/, static_cast<const int*>(C_d), C_h, numElements, count);
    HIPCHECK(hipEventRecord(stop, 0/*default*/));

    // Apply the synchronization technique under test.
    switch (syncMode) {
        case syncNone:
            break;
        case syncNullStream:
            HIPCHECK(hipStreamSynchronize(0));  // wait on host for null stream:
            break;
        case syncOtherStream:
            // Does this synchronize with the null stream?
            HIPCHECK(hipStreamSynchronize(otherStream));
            break;
        case syncMarkerThenOtherStream:
        case syncMarkerThenOtherNonBlockingStream:
            // this may wait for NULL stream depending hipStreamNonBlocking flag above
            HIPCHECK(hipEventRecord(otherStreamEvent, otherStream));
            HIPCHECK(hipStreamSynchronize(otherStream));
            break;
        case syncDevice:
            HIPCHECK(hipDeviceSynchronize());
            break;
        default:
            assert(0);
    };

    // The stop event's state tells us whether the chosen sync actually waited.
    hipError_t done = hipEventQuery(stop);
    if (expectMismatch) {
        assert (done == hipErrorNotReady);
    } else {
        assert (done == hipSuccess);
    }

    int mismatches = 0;
    int expected = init0 + count;
    for (int i=0; i<numElements; i++) {
        bool compareEqual = (C_h[i] == expected);
        if (!compareEqual) {
            mismatches ++;
            if (!expectMismatch) {
                printf ("C_h[%d] (%d) != %d\n", i, C_h[i], expected);
                assert(C_h[i] == expected);
            }
        }
    }
    if (expectMismatch) {
        assert (mismatches > 0);
    }

    HIPCHECK(hipStreamDestroy(otherStream));
    HIPCHECK(hipEventDestroy(stop));
    HIPCHECK(hipEventDestroy(otherStreamEvent));

    // Drain everything before returning so the next test starts clean.
    HIPCHECK(hipDeviceSynchronize());

    printf ("test:   OK - %d mismatches (%6.2f%%)\n", mismatches, ((double)(mismatches)*100.0)/numElements);
}