int main() { size_t Nbytes = N * sizeof(int); int numDevices = 0; int *A_d, *B_d, *C_d, *X_d, *Y_d, *Z_d; int *A_h, *B_h, *C_h; hipStream_t s; HIPCHECK(hipGetDeviceCount(&numDevices)); if (numDevices > 1) { HIPCHECK(hipSetDevice(0)); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); HIPCHECK(hipSetDevice(1)); HIPCHECK(hipMalloc(&X_d, Nbytes)); HIPCHECK(hipMalloc(&Y_d, Nbytes)); HIPCHECK(hipMalloc(&Z_d, Nbytes)); HIPCHECK(hipSetDevice(0)); HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, static_cast<const int*>(A_d), static_cast<const int*>(B_d), C_d, N); HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); HIPCHECK(hipDeviceSynchronize()); HipTest::checkVectorADD(A_h, B_h, C_h, N); HIPCHECK(hipSetDevice(1)); HIPCHECK(hipStreamCreate(&s)); HIPCHECK(hipMemcpyDtoDAsync((hipDeviceptr_t)X_d, (hipDeviceptr_t)A_d, Nbytes, s)); HIPCHECK(hipMemcpyDtoDAsync((hipDeviceptr_t)Y_d, (hipDeviceptr_t)B_d, Nbytes, s)); hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, static_cast<const int*>(X_d), static_cast<const int*>(Y_d), Z_d, N); HIPCHECK(hipMemcpyDtoHAsync(C_h, (hipDeviceptr_t)Z_d, Nbytes, s)); HIPCHECK(hipStreamSynchronize(s)); HIPCHECK(hipDeviceSynchronize()); HipTest::checkVectorADD(A_h, B_h, C_h, N); HIPCHECK(hipStreamDestroy(s)); HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); HIPCHECK(hipFree(X_d)); HIPCHECK(hipFree(Y_d)); HIPCHECK(hipFree(Z_d)); } passed(); }
bool run_erfinv() { double *A, *Ad, *B, *Bd; A = new double[N]; B = new double[N]; for (int i = 0; i < N; i++) { A[i] = -0.6; B[i] = 0.0; } hipMalloc((void**)&Ad, SIZE); hipMalloc((void**)&Bd, SIZE); hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice); hipLaunchKernelGGL(test_erfinv, dim3(1), dim3(N), 0, 0, Ad, Bd); hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost); int passed = 0; for (int i = 0; i < 512; i++) { if (B[i] - A[i] < 0.000001) { passed = 1; } } delete[] A; delete[] B; hipFree(Ad); hipFree(Bd); if (passed == 1) { return true; } assert(passed == 1); return false; }
bool run_rnorm() { double *A, *Ad, *B, *Bd; A = new double[N]; B = new double[N]; double val = 0.0; for (int i = 0; i < N; i++) { A[i] = 1.0; B[i] = 0.0; val += 1.0; } val = 1 / sqrt(val); hipMalloc((void**)&Ad, SIZE); hipMalloc((void**)&Bd, SIZE); hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice); hipLaunchKernelGGL(test_rnorm, dim3(1), dim3(N), 0, 0, Ad, Bd); hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost); int passed = 0; for (int i = 0; i < 512; i++) { if (B[0] - val < 0.000001) { passed = 1; } } delete[] A; delete[] B; hipFree(Ad); hipFree(Bd); if (passed == 1) { return true; } assert(passed == 1); return false; }
bool run_lround() { double *A, *Ad; long int *B, *Bd; A = new double[N]; B = new long int[N]; for (int i = 0; i < N; i++) { A[i] = 1.345; } hipMalloc((void**)&Ad, SIZE); hipMalloc((void**)&Bd, N * sizeof(long int)); hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice); hipLaunchKernelGGL(test_lround, dim3(1), dim3(N), 0, 0, Ad, Bd); hipMemcpy(B, Bd, N * sizeof(long int), hipMemcpyDeviceToHost); int passed = 0; for (int i = 0; i < 512; i++) { long int x = round(A[i]); if (B[i] == x) { passed = 1; } } delete[] A; delete[] B; hipFree(Ad); hipFree(Bd); if (passed == 1) { return true; } assert(passed == 1); return false; }
bool run_add() { constexpr size_t N = 64; std::vector<T> host_input(N); std::vector<T> host_expected(N); for (int i = 0; i < N; ++i) { host_input[i] = (T)i; host_expected[i] = host_input[i] + host_input[i]; } T* input1; hipMalloc(&input1, N * sizeof(T)); hipMemcpy(input1, host_input.data(), host_input.size()*sizeof(T), hipMemcpyHostToDevice); T* input2; hipMalloc(&input2, N * sizeof(T)); hipMemcpy(input2, host_input.data(), host_input.size()*sizeof(T), hipMemcpyHostToDevice); constexpr unsigned int blocks = 1; constexpr unsigned int threads_per_block = 1; hipLaunchKernelGGL(add<T>, dim3(blocks), dim3(threads_per_block), 0, 0, input1, input2, N); hipMemcpy(host_input.data(), input1, host_input.size()*sizeof(T), hipMemcpyDeviceToHost); bool equal = true; for (int i = 0; i < N; i++) { equal &= (host_input[i] == host_expected[i]); } return equal; }
int main() { hipLaunchKernelGGL( compileDoublePrecisionMathOnDevice, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0, 1); passed(); }
bool run_rnorm4d() { double *A, *Ad, *B, *Bd, *C, *Cd, *D, *Dd, *E, *Ed; A = new double[N]; B = new double[N]; C = new double[N]; D = new double[N]; E = new double[N]; double val = 0.0; for (int i = 0; i < N; i++) { A[i] = 1.0; B[i] = 2.0; C[i] = 3.0; D[i] = 4.0; } val = 1 / sqrt(1.0 + 4.0 + 9.0 + 16.0); hipMalloc((void**)&Ad, SIZE); hipMalloc((void**)&Bd, SIZE); hipMalloc((void**)&Cd, SIZE); hipMalloc((void**)&Dd, SIZE); hipMalloc((void**)&Ed, SIZE); hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice); hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice); hipMemcpy(Cd, C, SIZE, hipMemcpyHostToDevice); hipMemcpy(Dd, D, SIZE, hipMemcpyHostToDevice); hipLaunchKernelGGL(test_rnorm4d, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd, Dd, Ed); hipMemcpy(E, Ed, SIZE, hipMemcpyDeviceToHost); int passed = 0; for (int i = 0; i < 512; i++) { if (E[i] - val < 0.000001) { passed = 1; } } delete[] A; delete[] B; delete[] C; delete[] D; delete[] E; hipFree(Ad); hipFree(Bd); hipFree(Cd); hipFree(Dd); hipFree(Ed); if (passed == 1) { return true; } assert(passed == 1); return false; }
bool run_sincospi() { double *A, *Ad, *B, *C, *Bd, *Cd; A = new double[N]; B = new double[N]; C = new double[N]; for (int i = 0; i < N; i++) { A[i] = 1.0; } hipMalloc((void**)&Ad, SIZE); hipMalloc((void**)&Bd, SIZE); hipMalloc((void**)&Cd, SIZE); hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice); hipLaunchKernelGGL(test_sincospi, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd); hipMemcpy(B, Bd, SIZE, hipMemcpyDeviceToHost); hipMemcpy(C, Cd, SIZE, hipMemcpyDeviceToHost); int passed = 0; for (int i = 0; i < 512; i++) { if (B[i] - sin(3.14 * 1.0) < 0.1) { passed = 1; } } passed = 0; for (int i = 0; i < 512; i++) { if (C[i] - cos(3.14 * 1.0) < 0.1) { passed = 1; } } delete[] A; delete[] B; delete[] C; hipFree(Ad); hipFree(Bd); hipFree(Cd); if (passed == 1) { return true; } assert(passed == 1); return false; }
bool run_rhypot() { double *A, *Ad, *B, *Bd, *C, *Cd; A = new double[N]; B = new double[N]; C = new double[N]; double val = 0.0; for (int i = 0; i < N; i++) { A[i] = 1.0; B[i] = 2.0; } val = 1 / sqrt(1.0 + 4.0); hipMalloc((void**)&Ad, SIZE); hipMalloc((void**)&Bd, SIZE); hipMalloc((void**)&Cd, SIZE); hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice); hipMemcpy(Bd, B, SIZE, hipMemcpyHostToDevice); hipLaunchKernelGGL(test_rhypot, dim3(1), dim3(N), 0, 0, Ad, Bd, Cd); hipMemcpy(C, Cd, SIZE, hipMemcpyDeviceToHost); int passed = 0; for (int i = 0; i < 512; i++) { if (C[i] - val < 0.000001) { passed = 1; } } delete[] A; delete[] B; delete[] C; hipFree(Ad); hipFree(Bd); hipFree(Cd); if (passed == 1) { return true; } assert(passed == 1); return false; }
void no_cache(float *A_h, float *A_d, float *X_h, float *X_d, float *Y_h, float *Y_d, size_t NUM_ROW, int p=0) { if(p) printf ("info: allocate host mem (%6.2f KB)\n", NUM_COLUMN*NUM_ROW*sizeof(float)/1024.0); A_h = (float*)malloc(NUM_ROW * NUM_COLUMN * sizeof(float)); CHECK(A_h == 0 ? hipErrorMemoryAllocation : hipSuccess ); X_h = (float*)malloc(NUM_COLUMN * sizeof(float)); CHECK(X_h == 0 ? hipErrorMemoryAllocation : hipSuccess ); Y_h = (float*)malloc(NUM_ROW * sizeof(float)); CHECK(Y_h == 0 ? hipErrorMemoryAllocation : hipSuccess ); // Fill with Phi + i for (size_t i=0; i<NUM_ROW * NUM_COLUMN; i++) { A_h[i] = 1.618f + (i % NB_X); } for (size_t i=0; i< NUM_COLUMN; i++) { X_h[i] = 1.618f + i; } if(p) printf ("info: allocate device mem (%6.2f KB)\n", NUM_ROW * NUM_COLUMN * sizeof(float)/1024.0); CHECK(hipMalloc(&A_d, NUM_ROW * NUM_COLUMN * sizeof(float))); CHECK(hipMalloc(&X_d, NUM_COLUMN * sizeof(float))); CHECK(hipMalloc(&Y_d, NUM_ROW * sizeof(float))); if(p) printf ("info: copy Host2Device\n"); CHECK ( hipMemcpy(A_d, A_h, NUM_ROW * NUM_COLUMN * sizeof(float), hipMemcpyHostToDevice)); CHECK ( hipMemcpy(X_d, X_h, NUM_COLUMN * sizeof(float), hipMemcpyHostToDevice)); const unsigned blocks = (NUM_ROW -1)/NB_X + 1; const unsigned threadsPerBlock = NB_X; if(p) printf ("info: launch 'gemv_kernel' kernel\n"); for(int i=1 ; i < 1e3; i*=2) { size_t num_row = NB_X * i; clock_t t; t = clock(); double time; time = rocblas_wtime(); hipLaunchKernelGGL(HIP_KERNEL_NAME(gemv_kernel), dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, X_d, Y_d, num_row); //time = rocblas_wtime() - time; hipDeviceSynchronize(); t = clock() - t; time = ((double)t)/CLOCKS_PER_SEC*1000 ; if(p) printf ("It took me %d clicks (%f miliseconds).\n",t, time); printf ("Row = %d, It took me (%f milliseconds), Gflops=%f\n",time, num_row, 2*num_row*NUM_COLUMN/(time)/10e6); } /* if(p) printf ("info: copy Device2Host\n"); CHECK ( hipMemcpy(Y_h, Y_d, NUM_ROW * sizeof(float), hipMemcpyDeviceToHost)); if(p) printf ("info: check result\n"); for (size_t i=0; i<NUM_ROW; i++) { float res = 0; for(int j=0; j<NUM_COLUMN; j++){ res += A_h[i + j * NUM_ROW] * X_h[j]; } if (Y_h[i] != res) { printf("i=%d, CPU result=%f, GPU result=%f\n", i, res, Y_h[i]); //CHECK(hipErrorUnknown); } } */ if(p) printf ("PASSED!\n"); hipFree(A_d); hipFree(Y_d); hipFree(X_d); free(A_h); free(Y_h); free(X_h); }
void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, SyncMode syncMode, bool expectMismatch) { // This test sends a long-running kernel to the null stream, then tests to see if the // specified synchronization technique is effective. // // Some syncMode are not expected to correctly sync (for example "syncNone"). in these // cases the test sets expectMismatch and the check logic below will attempt to ensure that // the undesired synchronization did not occur - ie ensure the kernel is still running and did // not yet update the stop event. This can be tricky since if the kernel runs fast enough it // may complete before the check. To prevent this, the addCountReverse has a count parameter // which causes it to loop repeatedly, and the results are checked in reverse order. // // Tests with expectMismatch=true should ensure the kernel finishes correctly. This results // are checked and we test to make sure stop event has completed. if (!(testMask & p_tests)) { return; } printf ("\ntest 0x%02x: syncMode=%s expectMismatch=%d\n", testMask, syncModeString(syncMode), expectMismatch); size_t sizeBytes = numElements * sizeof(int); int count =100; int init0 = 0; HIPCHECK(hipMemset(C_d, init0, sizeBytes)); for (int i=0; i<numElements; i++) { C_h[i] = -1; // initialize } hipStream_t otherStream = 0; unsigned flags = (syncMode == syncMarkerThenOtherNonBlockingStream) ? hipStreamNonBlocking : hipStreamDefault; HIPCHECK(hipStreamCreateWithFlags(&otherStream, flags)); hipEvent_t stop, otherStreamEvent; HIPCHECK(hipEventCreate(&stop)); HIPCHECK(hipEventCreate(&otherStreamEvent)); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); // Launch kernel into null stream, should result in C_h == count. hipLaunchKernelGGL( HipTest::addCountReverse, dim3(blocks), dim3(threadsPerBlock), 0, 0 /*stream*/, static_cast<const int*>(C_d), C_h, numElements, count); HIPCHECK(hipEventRecord(stop, 0/*default*/)); switch (syncMode) { case syncNone: break; case syncNullStream: HIPCHECK(hipStreamSynchronize(0)); // wait on host for null stream: break; case syncOtherStream: // Does this synchronize with the null stream? HIPCHECK(hipStreamSynchronize(otherStream)); break; case syncMarkerThenOtherStream: case syncMarkerThenOtherNonBlockingStream: // this may wait for NULL stream depending hipStreamNonBlocking flag above HIPCHECK(hipEventRecord(otherStreamEvent, otherStream)); HIPCHECK(hipStreamSynchronize(otherStream)); break; case syncDevice: HIPCHECK(hipDeviceSynchronize()); break; default: assert(0); }; hipError_t done = hipEventQuery(stop); if (expectMismatch) { assert (done == hipErrorNotReady); } else { assert (done == hipSuccess); } int mismatches = 0; int expected = init0 + count; for (int i=0; i<numElements; i++) { bool compareEqual = (C_h[i] == expected); if (!compareEqual) { mismatches ++; if (!expectMismatch) { printf ("C_h[%d] (%d) != %d\n", i, C_h[i], expected); assert(C_h[i] == expected); } } } if (expectMismatch) { assert (mismatches > 0); } HIPCHECK(hipStreamDestroy(otherStream)); HIPCHECK(hipEventDestroy(stop)); HIPCHECK(hipEventDestroy(otherStreamEvent)); HIPCHECK(hipDeviceSynchronize()); printf ("test: OK - %d mismatches (%6.2f%%)\n", mismatches, ((double)(mismatches)*100.0)/numElements); }
int main() { hipLaunchKernelGGL(FloatMathPrecise, dim3(1, 1, 1), dim3(1, 1, 1), 0, 0); passed(); }