// Splits an H2D then D2H round trip of numElements T's into numCopies async
// chunks on 'stream', optionally synchronizing the device between the two
// phases, then verifies the data survived the round trip.
//
//   stream            - stream used for all async copies (may be the null stream)
//   numElements       - total element count (numCopies must divide it sensibly)
//   numCopies         - number of chunks each direction is split into
//   syncBetweenCopies - if true, hipDeviceSynchronize() between H2D and D2H
void test_manyInflightCopies(hipStream_t stream, int numElements, int numCopies, bool syncBetweenCopies) {
    size_t Nbytes = numElements * sizeof(T);
    size_t eachCopyElements = numElements / numCopies;
    size_t eachCopyBytes = eachCopyElements * sizeof(T);

    printf(
        "------------------------------------------------------------------------------------------"
        "-----\n");
    printf(
        "testing: %s Nbytes=%zu (%6.1f MB) numCopies=%d eachCopyElements=%zu eachCopyBytes=%zu\n",
        __func__, Nbytes, (double)(Nbytes) / 1024.0 / 1024.0, numCopies, eachCopyElements,
        eachCopyBytes);

    T* A_d;
    T *A_h1, *A_h2;  // pinned host buffers so the async copies can truly overlap
    HIPCHECK(hipHostMalloc((void**)&A_h1, Nbytes, hipHostMallocDefault));
    HIPCHECK(hipHostMalloc((void**)&A_h2, Nbytes, hipHostMallocDefault));
    HIPCHECK(hipMalloc(&A_d, Nbytes));

    for (int i = 0; i < numElements; i++) {
        A_h1[i] = 3.14f + static_cast<T>(i);
    }

    // stream=0; // fixme TODO
    for (int i = 0; i < numCopies; i++) {
        // Bug fix: the old check compared pointers against 'A_d + Nbytes', which
        // advances a T* by Nbytes *elements*, not bytes, so the assert was
        // meaningless. Bound the element index directly instead.
        HIPASSERT(i * eachCopyElements < (size_t)numElements);
        HIPCHECK(hipMemcpyAsync(&A_d[i * eachCopyElements], &A_h1[i * eachCopyElements],
                                eachCopyBytes, hipMemcpyHostToDevice, stream));
    }

    if (syncBetweenCopies) {
        HIPCHECK(hipDeviceSynchronize());
    }

    for (int i = 0; i < numCopies; i++) {
        HIPASSERT(i * eachCopyElements < (size_t)numElements);  // see bound note above
        HIPCHECK(hipMemcpyAsync(&A_h2[i * eachCopyElements], &A_d[i * eachCopyElements],
                                eachCopyBytes, hipMemcpyDeviceToHost, stream));
    }

    HIPCHECK(hipDeviceSynchronize());

    // Verify we copied back all the data correctly:
    for (int i = 0; i < numElements; i++) {
        HIPASSERT(A_h1[i] == A_h2[i]);
    }

    HIPCHECK(hipHostFree(A_h1));
    HIPCHECK(hipHostFree(A_h2));
    HIPCHECK(hipFree(A_d));
}
// Multi-device test: runs vectorADD on device 0 as a baseline, then feeds
// device 1 via async device-to-device copies of device 0's inputs and repeats
// the computation there. Silently passes when fewer than two devices exist.
int main() {
    size_t Nbytes = N * sizeof(int);
    int numDevices = 0;
    int *A_d, *B_d, *C_d, *X_d, *Y_d, *Z_d;  // device buffers: dev0 (A,B,C), dev1 (X,Y,Z)
    int *A_h, *B_h, *C_h;                    // host-side buffers
    hipStream_t s;
    HIPCHECK(hipGetDeviceCount(&numDevices));
    if (numDevices > 1) {  // the D2D leg needs at least two devices
        HIPCHECK(hipSetDevice(0));
        unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
        HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
        // Allocate device 1's buffers up front.
        HIPCHECK(hipSetDevice(1));
        HIPCHECK(hipMalloc(&X_d, Nbytes));
        HIPCHECK(hipMalloc(&Y_d, Nbytes));
        HIPCHECK(hipMalloc(&Z_d, Nbytes));
        // Baseline vector add on device 0.
        HIPCHECK(hipSetDevice(0));
        HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
        HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                           static_cast<const int*>(A_d), static_cast<const int*>(B_d), C_d, N);
        HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
        HIPCHECK(hipDeviceSynchronize());
        HipTest::checkVectorADD(A_h, B_h, C_h, N);
        // Repeat on device 1, sourcing inputs through async D2D copies on stream s.
        HIPCHECK(hipSetDevice(1));
        HIPCHECK(hipStreamCreate(&s));
        HIPCHECK(hipMemcpyDtoDAsync((hipDeviceptr_t)X_d, (hipDeviceptr_t)A_d, Nbytes, s));
        HIPCHECK(hipMemcpyDtoDAsync((hipDeviceptr_t)Y_d, (hipDeviceptr_t)B_d, Nbytes, s));
        // NOTE(review): this launch goes to the NULL stream while the D2D copies
        // above use 's' — appears to rely on implicit null-stream serialization;
        // confirm that is intended.
        hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
                           static_cast<const int*>(X_d), static_cast<const int*>(Y_d), Z_d, N);
        HIPCHECK(hipMemcpyDtoHAsync(C_h, (hipDeviceptr_t)Z_d, Nbytes, s));
        HIPCHECK(hipStreamSynchronize(s));
        HIPCHECK(hipDeviceSynchronize());
        HipTest::checkVectorADD(A_h, B_h, C_h, N);
        // Cleanup.
        HIPCHECK(hipStreamDestroy(s));
        HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
        HIPCHECK(hipFree(X_d));
        HIPCHECK(hipFree(Y_d));
        HIPCHECK(hipFree(Z_d));
    }
    passed();
}
// Single-chunk sanity test: copies both inputs up, runs vectorADD2 over all N
// elements in one launch, copies the result back, and checks it. Returns 0.
int test_gl2(size_t N) {
    size_t Nbytes = N*sizeof(int);

    int *A_d, *B_d, *C_d;
    int *A_h, *B_h, *C_h;
    HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N);
    unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);

    // Full vadd in one large chunk, to get things started:
    HIPCHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
    HIPCHECK ( hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));

    hipLaunchKernel(vectorADD2, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N);

    HIPCHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
    HIPCHECK (hipDeviceSynchronize());
    HipTest::checkVectorADD(A_h, B_h, C_h, N);
    // NOTE(review): arrays are not freed here — presumably released by the
    // caller or process teardown; confirm.
    return 0;
}
void run(size_t size, hipStream_t stream1, hipStream_t stream2){ float *Ah, *Bh, *Cd, *Dd, *Eh; float *Ahh, *Bhh, *Cdd, *Ddd, *Ehh; HIPCHECK(hipHostMalloc((void**)&Ah, size, hipHostMallocDefault)); HIPCHECK(hipHostMalloc((void**)&Bh, size, hipHostMallocDefault)); HIPCHECK(hipMalloc(&Cd, size)); HIPCHECK(hipMalloc(&Dd, size)); HIPCHECK(hipHostMalloc((void**)&Eh, size, hipHostMallocDefault)); HIPCHECK(hipHostMalloc((void**)&Ahh, size, hipHostMallocDefault)); HIPCHECK(hipHostMalloc((void**)&Bhh, size, hipHostMallocDefault)); HIPCHECK(hipMalloc(&Cdd, size)); HIPCHECK(hipMalloc(&Ddd, size)); HIPCHECK(hipHostMalloc((void**)&Ehh, size, hipHostMallocDefault)); HIPCHECK(hipMemcpyAsync(Bh, Ah, size, hipMemcpyHostToHost, stream1)); HIPCHECK(hipMemcpyAsync(Bhh, Ahh, size, hipMemcpyHostToHost, stream2)); HIPCHECK(hipMemcpyAsync(Cd, Bh, size, hipMemcpyHostToDevice, stream1)); HIPCHECK(hipMemcpyAsync(Cdd, Bhh, size, hipMemcpyHostToDevice, stream2)); hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream1, Cd); hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream2, Cdd); HIPCHECK(hipMemcpyAsync(Dd, Cd, size, hipMemcpyDeviceToDevice, stream1)); HIPCHECK(hipMemcpyAsync(Ddd, Cdd, size, hipMemcpyDeviceToDevice, stream2)); HIPCHECK(hipMemcpyAsync(Eh, Dd, size, hipMemcpyDeviceToHost, stream1)); HIPCHECK(hipMemcpyAsync(Ehh, Ddd, size, hipMemcpyDeviceToHost, stream2)); HIPCHECK(hipDeviceSynchronize()); HIPASSERT(Eh[10] = Ah[10] + 1.0f); HIPASSERT(Ehh[10] = Ahh[10] + 1.0f); }
void waitStreams(int iterations) { // Repeatedly sync and wait for all streams to complete. // TO make this interesting, the test has other threads repeatedly adding and removing streams to the device. for (int i=0; i<iterations; i++) { HIPCHECK(hipDeviceSynchronize()); } }
int main() { float *A, *Ad; HIPCHECK(hipHostMalloc((void**)&A, SIZE, hipHostMallocDefault)); HIPCHECK(hipMalloc((void**)&Ad, SIZE)); hipStream_t stream; HIPCHECK(hipStreamCreate(&stream)); for (int i = 0; i < SIZE; i++) { HIPCHECK(hipMemcpyAsync(Ad, A, SIZE, hipMemcpyHostToDevice, stream)); HIPCHECK(hipDeviceSynchronize()); } }
// Issues one small launch of benchmark_func<short, ...> on a reduced grid so
// the later timed benchmark runs do not pay first-launch overheads.
//   cd   - device scratch buffer (reinterpreted as short*)
//   size - total element count used to derive the reduced grid size
void runbench_warmup(double *cd, long size){
    // Shrink the grid: each thread covers UNROLLED_MEMORY_ACCESSES elements, and
    // the warm-up only needs 1/32 of the full working set.
    const long reduced_grid_size = size/(UNROLLED_MEMORY_ACCESSES)/32;
    const int BLOCK_SIZE = 256;
    const int TOTAL_REDUCED_BLOCKS = reduced_grid_size/BLOCK_SIZE;

    dim3 dimBlock(BLOCK_SIZE, 1, 1);
    dim3 dimReducedGrid(TOTAL_REDUCED_BLOCKS, 1, 1);

    hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func< short, BLOCK_SIZE, 0 >), dim3(dimReducedGrid), dim3(dimBlock ), 0, 0, (short)1, (short*)cd);
    CUDA_SAFE_CALL( hipGetLastError() );
    CUDA_SAFE_CALL( hipDeviceSynchronize() );
}
int main() { setup(); int *A, *Ad; for(int i=0; i<NUM_SIZE; i++) { A = (int*)malloc(size[i]); valSet(A, 1, size[i]); hipMalloc(&Ad, size[i]); std::cout<<"Malloc success at size: "<<size[i]<<std::endl; for(int j=0; j<NUM_ITER; j++) { std::cout<<"\r"<<"Iter: "<<j; hipMemcpy(Ad, A, size[i], hipMemcpyHostToDevice); } std::cout<<std::endl; hipDeviceSynchronize(); } }
int main(){ setup(); int *A, *Ad; for(int i=0;i<NUM_SIZE;i++){ A = (int*)malloc(size[i]); valSet(A, 1, size[i]); hipMalloc(&Ad, size[i]); std::cout<<"Malloc success at size: "<<size[i]<<std::endl; for(int j=0;j<NUM_ITER;j++){ std::cout<<"Iter: "<<j<<std::endl; hipMemcpy(Ad, A, size[i], hipMemcpyHostToDevice); hipLaunchKernel(Add, dim3(1), dim3(size[i]/sizeof(int)), 0, 0, Ad); hipMemcpy(A, Ad, size[i], hipMemcpyDeviceToHost); } hipDeviceSynchronize(); } }
int main() { setup(); int *A, *Ad; for (int i = 0; i < NUM_SIZE; i++) { std::cout << size[i] << std::endl; A = (int*)malloc(size[i]); valSet(A, 1, size[i]); hipMalloc(&Ad, size[i]); std::cout << "Malloc success at size: " << size[i] << std::endl; clock_t start, end; start = clock(); for (int j = 0; j < NUM_ITER; j++) { // std::cout<<"At iter: "<<j<<std::endl; hipMemcpy(Ad, A, size[i], hipMemcpyHostToDevice); } hipDeviceSynchronize(); end = clock(); double uS = (double)(end - start) * 1000 / (NUM_ITER * CLOCKS_PER_SEC); std::cout << uS << std::endl; } }
// Single-stream pipeline test: H2H copy -> H2D copy -> Inc kernel -> D2D copy
// -> D2H copy, all on 'stream', then verify one element saw the increment.
// 'size' is the buffer size in bytes; the data is seeded with 1.0f.
void run1(size_t size, hipStream_t stream){
    float *Ah, *Bh, *Cd, *Dd, *Eh;  // one buffer per pipeline stage
    HIPCHECK(hipHostMalloc((void**)&Ah, size, hipHostMallocDefault));
    HIPCHECK(hipHostMalloc((void**)&Bh, size, hipHostMallocDefault));
    HIPCHECK(hipMalloc(&Cd, size));
    HIPCHECK(hipMalloc(&Dd, size));
    HIPCHECK(hipHostMalloc((void**)&Eh, size, hipHostMallocDefault));
    for(int i=0;i<N;i++){
        Ah[i] = 1.0f;
    }
    HIPCHECK(hipMemcpyAsync(Bh, Ah, size, hipMemcpyHostToHost, stream));
    HIPCHECK(hipMemcpyAsync(Cd, Bh, size, hipMemcpyHostToDevice, stream));
    hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Cd);
    HIPCHECK(hipMemcpyAsync(Dd, Cd, size, hipMemcpyDeviceToDevice, stream));
    HIPCHECK(hipMemcpyAsync(Eh, Dd, size, hipMemcpyDeviceToHost, stream));
    HIPCHECK(hipDeviceSynchronize());
    // Element 10 must have been incremented exactly once by the kernel.
    HIPASSERT(Eh[10] == Ah[10] + 1.0f);
    // NOTE(review): the five buffers are not freed here — confirm the caller or
    // teardown releases them.
}
// Tests hipMemcpy2D with pitched device buffers: copies numW x numH elements of
// T up, runs vectorADD over the full pitched extent, copies back, and checks.
//   numW/numH     - logical width (elements) and height of the 2D region
//   usePinnedHost - allocate host arrays pinned vs pageable
void memcpy2Dtest(size_t numW, size_t numH, bool usePinnedHost) {
    size_t width = numW * sizeof(T);       // row width in bytes
    size_t sizeElements = width * numH;    // total bytes (name is historical)
    printf("memcpy2Dtest: %s<%s> size=%lu (%6.2fMB) W: %d, H:%d, usePinnedHost: %d\n", __func__, TYPENAME(T), sizeElements, sizeElements/1024.0/1024.0, (int)numW, (int)numH, usePinnedHost);

    T *A_d, *B_d, *C_d;
    T *A_h, *B_h, *C_h;
    size_t pitch_A, pitch_B, pitch_C;  // device row pitches chosen by the runtime

    hipChannelFormatDesc desc = hipCreateChannelDesc<T>();  // NOTE(review): unused here
    HipTest::initArrays2DPitch(&A_d, &B_d, &C_d, &pitch_A, &pitch_B, &pitch_C, numW, numH);
    HipTest::initArraysForHost(&A_h, &B_h, &C_h, numW*numH, usePinnedHost);
    unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numW*numH);

    HIPCHECK (hipMemcpy2D (A_d, pitch_A, A_h, width, width, numH, hipMemcpyHostToDevice) );
    HIPCHECK (hipMemcpy2D (B_d, pitch_B, B_h, width, width, numH, hipMemcpyHostToDevice) );

    // The kernel runs over the full pitched extent, padding included.
    hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, (pitch_C/sizeof(T))*numH);

    HIPCHECK (hipMemcpy2D (C_h, width, C_d, pitch_C, width, numH, hipMemcpyDeviceToHost) );

    HIPCHECK ( hipDeviceSynchronize() );
    HipTest::checkVectorADD(A_h, B_h, C_h, numW*numH);

    HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, usePinnedHost);
    printf ("  %s success\n", __func__);
}
// Dispatch-latency micro-benchmark: times kernel launches under eight
// combinations of {null stream / created stream} x {sync per-iter / sync after
// loop / no wait}, recording each figure in its own ResultDatabase slot.
int main(){
    hipError_t err;
    float *A, *Ad;
    A = new float[LEN];
    for(int i=0;i<LEN;i++){
        A[i] = 1.0f;
    }
    hipStream_t stream;
    err = hipStreamCreate(&stream);
    check("Creating stream",err);

    err = hipMalloc(&Ad, SIZE);
    check("Allocating Ad memory on device", err);
    err = hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice);
    check("Doing memory copy from A to Ad", err);

    float mS = 0;
    hipEvent_t start, stop;
    hipEventCreate(&start);
    hipEventCreate(&stop);
    ResultDatabase resultDB[8];  // one slot per measurement below

    // (1) First kernel launch on the null stream (includes one-time init costs).
    // NOTE(review): no hipEventSynchronize(stop) before hipEventElapsedTime here
    // (and in several measurements below) — the stop event may not have
    // completed; confirm whether the returned times are valid.
    hipEventRecord(start);
    hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, 0, Ad);
    hipEventRecord(stop);
    hipEventElapsedTime(&mS, start, stop);
    resultDB[0].AddResult(std::string("First Kernel Launch"), "", "uS", mS*1000);
    // std::cout<<"First Kernel Launch: \t\t"<<mS*1000<<" uS"<<std::endl;
    resultDB[0].DumpSummary(std::cout);

    // (2) Second launch — steady-state single-dispatch cost.
    hipEventRecord(start);
    hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, 0, Ad);
    hipEventRecord(stop);
    hipEventElapsedTime(&mS, start, stop);
    resultDB[1].AddResult(std::string("Second Kernel Launch"), "", "uS", mS*1000);
    // std::cout<<"Second Kernel Launch: \t\t"<<mS*1000<<" uS"<<std::endl;
    resultDB[1].DumpSummary(std::cout);

    // (3) Null stream, one device sync after the whole dispatch loop.
    hipEventRecord(start);
    for(int i=0;i<ITER;i++){
        hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, 0, Ad);
    }
    hipDeviceSynchronize();
    hipEventRecord(stop);
    hipEventElapsedTime(&mS, start, stop);
    resultDB[2].AddResult(std::string("NULL Stream Sync dispatch wait"), "", "uS", mS*1000/ITER);
    resultDB[2].DumpSummary(std::cout);
    // std::cout<<"NULL Stream Sync dispatch wait: \t"<<mS*1000/ITER<<" uS"<<std::endl;

    // (4) Null stream, stop recorded before the sync (async dispatch measure).
    hipDeviceSynchronize();
    hipEventRecord(start);
    for(int i=0;i<ITER;i++){
        hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, 0, Ad);
    }
    hipEventRecord(stop);
    hipDeviceSynchronize();
    hipEventElapsedTime(&mS, start, stop);
    resultDB[3].AddResult(std::string("NULL Stream Async dispatch wait"), "", "uS", mS*1000/ITER);
    resultDB[3].DumpSummary(std::cout);
    // std::cout<<"NULL Stream Async dispatch wait: \t"<<mS*1000/ITER<<" uS"<<std::endl;

    // (5) Created stream, device sync after every single launch.
    hipDeviceSynchronize();
    hipEventRecord(start);
    for(int i=0;i<ITER;i++){
        hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, stream, Ad);
        hipDeviceSynchronize();
    }
    hipEventRecord(stop);
    hipEventElapsedTime(&mS, start, stop);
    resultDB[4].AddResult(std::string("Stream Sync dispatch wait"), "", "uS", mS*1000/ITER);
    resultDB[4].DumpSummary(std::cout);
    // std::cout<<"Stream Sync dispatch wait: \t\t"<<mS*1000/ITER<<" uS"<<std::endl;

    // (6) Created stream, one device sync after the whole dispatch loop.
    hipDeviceSynchronize();
    hipEventRecord(start);
    for(int i=0;i<ITER;i++){
        hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, stream, Ad);
    }
    hipDeviceSynchronize();
    hipEventRecord(stop);
    hipEventElapsedTime(&mS, start, stop);
    resultDB[5].AddResult(std::string("Stream Async dispatch wait"), "", "uS", mS*1000/ITER);
    // std::cout<<"Stream Async dispatch wait: \t\t"<<mS*1000/ITER<<" uS"<<std::endl;
    resultDB[5].DumpSummary(std::cout);

    // (7) Null stream, no wait at all — pure dispatch cost.
    hipDeviceSynchronize();
    hipEventRecord(start);
    for(int i=0;i<ITER;i++){
        hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, 0, Ad);
    }
    hipEventRecord(stop);
    hipEventElapsedTime(&mS, start, stop);
    resultDB[6].AddResult(std::string("NULL Stream No Wait"), "", "uS", mS*1000/ITER);
    resultDB[6].DumpSummary(std::cout);
    // std::cout<<"NULL Stream Dispatch No Wait: \t\t"<<mS*1000/ITER<<" uS"<<std::endl;

    // (8) Created stream, no wait — pure dispatch cost.
    hipDeviceSynchronize();
    hipEventRecord(start);
    for(int i=0;i<ITER;i++){
        hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, stream, Ad);
    }
    hipEventRecord(stop);
    hipEventElapsedTime(&mS, start, stop);
    resultDB[7].AddResult(std::string("Stream Dispatch No Wait"), "", "uS", mS*1000/ITER);
    resultDB[7].DumpSummary(std::cout);
    // std::cout<<"Stream Dispatch No Wait: \t\t"<<mS*1000/ITER<<" uS"<<std::endl;

    // NOTE(review): A (new[]), Ad, the stream, and both events are never
    // released — presumably acceptable for a benchmark's process exit; confirm.
    hipDeviceSynchronize();
}
int main(){ hipLaunchKernel(HIP_KERNEL_NAME(Empty), dim3(1), dim3(1), 0, 0, 0); hipDeviceSynchronize(); passed(); }
// Ping-pong test: each "pong" streams the buffer H2D in numInflight chunks,
// adds a constant on the device, and streams it back; optionally the host adds
// its own constant between pongs. Verification replays the same arithmetic.
//   stream      - stream for all async copies and the kernel
//   numElements - total elements; must be divisible by numInflight
//   numInflight - number of concurrent chunk copies per direction
//   numPongs    - number of device(+host) round trips
//   doHostSide  - also mutate on the host between pongs (currently disabled)
void test_pingpong(hipStream_t stream, size_t numElements, int numInflight, int numPongs, bool doHostSide) {
    HIPASSERT(numElements % numInflight == 0);  // Must be evenly divisible.
    size_t Nbytes = numElements * sizeof(T);
    size_t eachCopyElements = numElements / numInflight;
    size_t eachCopyBytes = eachCopyElements * sizeof(T);

    unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);

    printf(
        "------------------------------------------------------------------------------------------"
        "-----\n");
    printf(
        "testing: %s<%s> Nbytes=%zu (%6.1f MB) numPongs=%d numInflight=%d eachCopyElements=%zu "
        "eachCopyBytes=%zu\n",
        __func__, HostTraits<AllocType>::Name(), Nbytes, (double)(Nbytes) / 1024.0 / 1024.0,
        numPongs, numInflight, eachCopyElements, eachCopyBytes);

    T* A_h = NULL;
    T* A_d = NULL;

    // Host allocation strategy is selected by the AllocType template parameter.
    A_h = (T*)(HostTraits<AllocType>::Alloc(Nbytes));
    HIPCHECK(hipMalloc(&A_d, Nbytes));

    // Initialize the host array:
    const T initValue = 13;
    const T deviceConst = 2;      // must match the literal passed to addK below
    const T hostConst = 10000;
    for (size_t i = 0; i < numElements; i++) {
        A_h[i] = initValue + i;
    }

    for (int k = 0; k < numPongs; k++) {
        for (int i = 0; i < numInflight; i++) {
            // NOTE(review): 'A_d + Nbytes' advances the T* by Nbytes elements,
            // not bytes, so this bound is looser than intended — confirm.
            HIPASSERT(A_d + i * eachCopyElements < A_d + Nbytes);
            HIPCHECK(hipMemcpyAsync(&A_d[i * eachCopyElements], &A_h[i * eachCopyElements],
                                    eachCopyBytes, hipMemcpyHostToDevice, stream));
        }

        // Device-side add of deviceConst (passed as the literal 2).
        hipLaunchKernel(addK<T>, dim3(blocks), dim3(threadsPerBlock), 0, stream, A_d, 2,
                        numElements);

        for (int i = 0; i < numInflight; i++) {
            HIPASSERT(A_d + i * eachCopyElements < A_d + Nbytes);  // see note above
            HIPCHECK(hipMemcpyAsync(&A_h[i * eachCopyElements], &A_d[i * eachCopyElements],
                                    eachCopyBytes, hipMemcpyDeviceToHost, stream));
        }

        if (doHostSide) {
            // Deliberate trap: the host-side path is not yet supported.
            assert(0);
#if 0
            hipEvent_t e;
            HIPCHECK(hipEventCreate(&e));
#endif
            HIPCHECK(hipDeviceSynchronize());
            for (size_t i = 0; i < numElements; i++) {
                A_h[i] += hostConst;
            }
        }
    };

    HIPCHECK(hipDeviceSynchronize());

    // Verify we copied back all the data correctly:
    for (size_t i = 0; i < numElements; i++) {
        T gold = initValue + i;
        // Perform calcs in same order as test above to replicate FP order-of-operations:
        for (int k = 0; k < numPongs; k++) {
            gold += deviceConst;
            if (doHostSide) {
                gold += hostConst;
            }
        }

        if (gold != A_h[i]) {
            std::cout << i << ": gold=" << gold << " out=" << A_h[i] << std::endl;
            HIPASSERT(gold == A_h[i]);
        }
    }

    // NOTE(review): A_h came from HostTraits<AllocType>::Alloc but is released
    // with hipHostFree — confirm that is correct for every AllocType.
    HIPCHECK(hipHostFree(A_h));
    HIPCHECK(hipFree(A_d));
}
// IN: nStreams : number of streams to use for the test // IN :useNullStream - use NULL stream. Synchronizes everything. // IN: useSyncMemcpyH2D - use sync memcpy (no overlap) for H2D // IN: useSyncMemcpyD2H - use sync memcpy (no overlap) for D2H void test_chunkedAsyncExample(int nStreams, bool useNullStream, bool useSyncMemcpyH2D, bool useSyncMemcpyD2H) { size_t Nbytes = N * sizeof(int); printf("testing: %s(useNullStream=%d, useSyncMemcpyH2D=%d, useSyncMemcpyD2H=%d) ", __func__, useNullStream, useSyncMemcpyH2D, useSyncMemcpyD2H); printf("Nbytes=%zu (%6.1f MB)\n", Nbytes, (double)(Nbytes) / 1024.0 / 1024.0); int *A_d, *B_d, *C_d; int *A_h, *B_h, *C_h; HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, true); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); hipStream_t* stream = (hipStream_t*)malloc(sizeof(hipStream_t) * nStreams); if (useNullStream) { nStreams = 1; stream[0] = NULL; } else { for (int i = 0; i < nStreams; ++i) { HIPCHECK(hipStreamCreate(&stream[i])); } } size_t workLeft = N; size_t workPerStream = N / nStreams; for (int i = 0; i < nStreams; ++i) { size_t work = (workLeft < workPerStream) ? 
workLeft : workPerStream; size_t workBytes = work * sizeof(int); size_t offset = i * workPerStream; HIPASSERT(A_d + offset < A_d + Nbytes); HIPASSERT(B_d + offset < B_d + Nbytes); HIPASSERT(C_d + offset < C_d + Nbytes); if (useSyncMemcpyH2D) { HIPCHECK(hipMemcpy(&A_d[offset], &A_h[offset], workBytes, hipMemcpyHostToDevice)); HIPCHECK(hipMemcpy(&B_d[offset], &B_h[offset], workBytes, hipMemcpyHostToDevice)); } else { HIPCHECK(hipMemcpyAsync(&A_d[offset], &A_h[offset], workBytes, hipMemcpyHostToDevice, stream[i])); HIPCHECK(hipMemcpyAsync(&B_d[offset], &B_h[offset], workBytes, hipMemcpyHostToDevice, stream[i])); }; hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, stream[i], &A_d[offset], &B_d[offset], &C_d[offset], work); if (useSyncMemcpyD2H) { HIPCHECK(hipMemcpy(&C_h[offset], &C_d[offset], workBytes, hipMemcpyDeviceToHost)); } else { HIPCHECK(hipMemcpyAsync(&C_h[offset], &C_d[offset], workBytes, hipMemcpyDeviceToHost, stream[i])); } } HIPCHECK(hipDeviceSynchronize()); HipTest::checkVectorADD(A_h, B_h, C_h, N); HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, true); free(stream); };
// Exercises hipArray copies: for a 1D region uses hipMemcpyToArray; for 2D uses
// hipMemcpy2DToArray, optionally staging through pitched device buffers first.
// Runs vectorADD on the array contents and verifies the result either way.
//   numW/numH     - logical width (elements) and height
//   usePinnedHost - pinned vs pageable host arrays
//   usePitch      - (2D only) stage through pitched device memory with D2D copies
void memcpyArraytest(size_t numW, size_t numH, bool usePinnedHost, bool usePitch=false) {
    size_t width = numW * sizeof(T);     // row width in bytes
    size_t sizeElements = width * numH;  // total bytes (name is historical)
    printf("memcpyArraytest: %s<%s> size=%lu (%6.2fMB) W: %d, H: %d, usePinnedHost: %d, usePitch: %d\n", __func__, TYPENAME(T), sizeElements, sizeElements/1024.0/1024.0, (int)numW, (int)numH, usePinnedHost, usePitch);

    hipArray *A_d, *B_d, *C_d;
    T *A_h, *B_h, *C_h;

    // 1D
    if ((numW >= 1) && (numH == 1)) {
        hipChannelFormatDesc desc = hipCreateChannelDesc<T>();
        HipTest::initHIPArrays(&A_d, &B_d, &C_d, &desc, numW, 1, 0);
        HipTest::initArraysForHost(&A_h, &B_h, &C_h, numW*numH, usePinnedHost);

        unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numW*numH);

        HIPCHECK (hipMemcpyToArray (A_d, 0, 0, (void *)A_h, width, hipMemcpyHostToDevice) );
        HIPCHECK (hipMemcpyToArray (B_d, 0, 0, (void *)B_h, width, hipMemcpyHostToDevice) );

        // The kernel reads the arrays' backing storage directly via ->data.
        hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, (T*)A_d->data, (T*)B_d->data, (T*)C_d->data, numW);

        HIPCHECK (hipMemcpy (C_h, C_d->data, width, hipMemcpyDeviceToHost) );

        HIPCHECK ( hipDeviceSynchronize() );
        HipTest::checkVectorADD(A_h, B_h, C_h, numW);
    }
    // 2D
    else if ((numW >= 1) && (numH >= 1)) {
        hipChannelFormatDesc desc = hipCreateChannelDesc<T>();
        HipTest::initHIPArrays(&A_d, &B_d, &C_d, &desc, numW, numH, 0);
        HipTest::initArraysForHost(&A_h, &B_h, &C_h, numW*numH, usePinnedHost);

        unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numW*numH);

        if (usePitch) {
            // Stage host data into pitched device buffers, then D2D into the arrays.
            T *A_p, *B_p, *C_p;
            size_t pitch_A, pitch_B, pitch_C;

            HipTest::initArrays2DPitch(&A_p, &B_p, &C_p, &pitch_A, &pitch_B, &pitch_C, numW, numH);
            HIPCHECK (hipMemcpy2D (A_p, pitch_A, A_h, width, width, numH, hipMemcpyHostToDevice) );
            HIPCHECK (hipMemcpy2D (B_p, pitch_B, B_h, width, width, numH, hipMemcpyHostToDevice) );

            HIPCHECK (hipMemcpy2DToArray (A_d, 0, 0, (void *)A_p, pitch_A, width, numH, hipMemcpyDeviceToDevice) );
            HIPCHECK (hipMemcpy2DToArray (B_d, 0, 0, (void *)B_p, pitch_B, width, numH, hipMemcpyDeviceToDevice) );

            hipFree(A_p);
            hipFree(B_p);
            hipFree(C_p);
        } else {
            HIPCHECK (hipMemcpy2DToArray (A_d, 0, 0, (void *)A_h, width, width, numH, hipMemcpyHostToDevice) );
            HIPCHECK (hipMemcpy2DToArray (B_d, 0, 0, (void *)B_h, width, width, numH, hipMemcpyHostToDevice) );
        }

        hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, (T*)A_d->data, (T*)B_d->data, (T*)C_d->data, numW*numH);

        HIPCHECK (hipMemcpy2D ((void*)C_h, width, (void*)C_d->data, width, width, numH, hipMemcpyDeviceToHost) );

        HIPCHECK ( hipDeviceSynchronize() );
        HipTest::checkVectorADD(A_h, B_h, C_h, numW*numH);
    }
    // Unknown
    else {
        HIPASSERT("Incompatible dimensions" && 0);
    }

    hipFreeArray(A_d);
    hipFreeArray(B_d);
    hipFreeArray(C_d);
    HipTest::freeArraysForHost(A_h, B_h, C_h, usePinnedHost);

    printf ("  %s success\n", __func__);
}
// Homework driver: parses the input/output/reference filenames (and optional
// per-pixel / global error tolerances), runs the student's RGBA-to-greyscale
// kernel, times it, writes both the computed and reference images, and
// compares them.
int main(int argc, char **argv) {
    uchar4        *h_rgbaImage, *d_rgbaImage;
    unsigned char *h_greyImage, *d_greyImage;

    std::string input_file;
    std::string output_file;
    std::string reference_file;
    double perPixelError = 0.0;
    double globalError   = 0.0;
    bool useEpsCheck = false;
    // Argument forms: <in> | <in> <out> | <in> <out> <ref> | <in> <out> <ref> <perPixErr> <globErr>
    switch (argc)
    {
        case 2:
            input_file = std::string(argv[1]);
            output_file = "HW1_output.png";
            reference_file = "HW1_reference.png";
            break;
        case 3:
            input_file  = std::string(argv[1]);
            output_file = std::string(argv[2]);
            reference_file = "HW1_reference.png";
            break;
        case 4:
            input_file  = std::string(argv[1]);
            output_file = std::string(argv[2]);
            reference_file = std::string(argv[3]);
            break;
        case 6:
            useEpsCheck=true;
            input_file  = std::string(argv[1]);
            output_file = std::string(argv[2]);
            reference_file = std::string(argv[3]);
            perPixelError = atof(argv[4]);
            globalError   = atof(argv[5]);
            break;
        default:
            std::cerr << "Usage: ./HW1 input_file [output_filename] [reference_filename] [perPixelError] [globalError]" << std::endl;
            exit(1);
    }
    //load the image and give us our input and output pointers
    preProcess(&h_rgbaImage, &h_greyImage, &d_rgbaImage, &d_greyImage, input_file);

    GpuTimer timer;
    timer.Start();
    //call the students' code
    your_rgba_to_greyscale(h_rgbaImage, d_rgbaImage, d_greyImage, numRows(), numCols());
    timer.Stop();
    hipDeviceSynchronize();
    checkCudaErrors(hipGetLastError());

    int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());

    if (err < 0) {
        //Couldn't print! Probably the student closed stdout - bad news
        std::cerr << "Couldn't print timing information! STDOUT Closed!" << std::endl;
        exit(1);
    }

    size_t numPixels = numRows()*numCols();
    checkCudaErrors(hipMemcpy(h_greyImage, d_greyImage, sizeof(unsigned char) * numPixels, hipMemcpyDeviceToHost));

    //check results and output the grey image
    postProcess(output_file, h_greyImage);

    // The reference run reuses h_greyImage, overwriting the student's result
    // after it has already been written to output_file.
    referenceCalculation(h_rgbaImage, h_greyImage, numRows(), numCols());

    postProcess(reference_file, h_greyImage);

    //generateReferenceImage(input_file, reference_file);
    compareImages(reference_file, output_file, useEpsCheck, perPixelError,
                  globalError);

    cleanup();

    return 0;
}
void memcpytest2(size_t numElements, bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault) { size_t sizeElements = numElements * sizeof(T); printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d\n", __func__, TYPENAME(T), sizeElements, sizeElements/1024.0/1024.0, usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); T *A_d, *B_d, *C_d; T *A_h, *B_h, *C_h; HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, numElements, usePinnedHost); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); T *A_hh = NULL; T *B_hh = NULL; T *C_dd = NULL; if (useHostToHost) { if (usePinnedHost) { HIPCHECK ( hipHostMalloc((void**)&A_hh, sizeElements, hipHostMallocDefault) ); HIPCHECK ( hipHostMalloc((void**)&B_hh, sizeElements, hipHostMallocDefault) ); } else { A_hh = (T*)malloc(sizeElements); B_hh = (T*)malloc(sizeElements); } // Do some extra host-to-host copies here to mix things up: HIPCHECK ( hipMemcpy(A_hh, A_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); HIPCHECK ( hipMemcpy(B_hh, B_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); HIPCHECK ( hipMemcpy(A_d, A_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); HIPCHECK ( hipMemcpy(B_d, B_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); } else { HIPCHECK ( hipMemcpy(A_d, A_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); HIPCHECK ( hipMemcpy(B_d, B_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); } hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements); if (useDeviceToDevice) { HIPCHECK ( hipMalloc(&C_dd, sizeElements) ); // Do an extra device-to-device copies here to mix things up: HIPCHECK ( hipMemcpy(C_dd, C_d, sizeElements, useMemkindDefault? 
hipMemcpyDefault : hipMemcpyDeviceToDevice)); //Destroy the original C_d: HIPCHECK ( hipMemset(C_d, 0x5A, sizeElements)); HIPCHECK ( hipMemcpy(C_h, C_dd, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); } else { HIPCHECK ( hipMemcpy(C_h, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); } HIPCHECK ( hipDeviceSynchronize() ); HipTest::checkVectorADD(A_h, B_h, C_h, numElements); HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, usePinnedHost); printf (" %s success\n", __func__); }
void no_cache(float *A_h, float *A_d, float *X_h, float *X_d, float *Y_h, float *Y_d, size_t NUM_ROW, int p=0) { if(p) printf ("info: allocate host mem (%6.2f KB)\n", NUM_COLUMN*NUM_ROW*sizeof(float)/1024.0); A_h = (float*)malloc(NUM_ROW * NUM_COLUMN * sizeof(float)); CHECK(A_h == 0 ? hipErrorMemoryAllocation : hipSuccess ); X_h = (float*)malloc(NUM_COLUMN * sizeof(float)); CHECK(X_h == 0 ? hipErrorMemoryAllocation : hipSuccess ); Y_h = (float*)malloc(NUM_ROW * sizeof(float)); CHECK(Y_h == 0 ? hipErrorMemoryAllocation : hipSuccess ); // Fill with Phi + i for (size_t i=0; i<NUM_ROW * NUM_COLUMN; i++) { A_h[i] = 1.618f + (i % NB_X); } for (size_t i=0; i< NUM_COLUMN; i++) { X_h[i] = 1.618f + i; } if(p) printf ("info: allocate device mem (%6.2f KB)\n", NUM_ROW * NUM_COLUMN * sizeof(float)/1024.0); CHECK(hipMalloc(&A_d, NUM_ROW * NUM_COLUMN * sizeof(float))); CHECK(hipMalloc(&X_d, NUM_COLUMN * sizeof(float))); CHECK(hipMalloc(&Y_d, NUM_ROW * sizeof(float))); if(p) printf ("info: copy Host2Device\n"); CHECK ( hipMemcpy(A_d, A_h, NUM_ROW * NUM_COLUMN * sizeof(float), hipMemcpyHostToDevice)); CHECK ( hipMemcpy(X_d, X_h, NUM_COLUMN * sizeof(float), hipMemcpyHostToDevice)); const unsigned blocks = (NUM_ROW -1)/NB_X + 1; const unsigned threadsPerBlock = NB_X; if(p) printf ("info: launch 'gemv_kernel' kernel\n"); for(int i=1 ; i < 1e3; i*=2) { size_t num_row = NB_X * i; clock_t t; t = clock(); double time; time = rocblas_wtime(); hipLaunchKernelGGL(HIP_KERNEL_NAME(gemv_kernel), dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, X_d, Y_d, num_row); //time = rocblas_wtime() - time; hipDeviceSynchronize(); t = clock() - t; time = ((double)t)/CLOCKS_PER_SEC*1000 ; if(p) printf ("It took me %d clicks (%f miliseconds).\n",t, time); printf ("Row = %d, It took me (%f milliseconds), Gflops=%f\n",time, num_row, 2*num_row*NUM_COLUMN/(time)/10e6); } /* if(p) printf ("info: copy Device2Host\n"); CHECK ( hipMemcpy(Y_h, Y_d, NUM_ROW * sizeof(float), hipMemcpyDeviceToHost)); if(p) printf ("info: 
check result\n"); for (size_t i=0; i<NUM_ROW; i++) { float res = 0; for(int j=0; j<NUM_COLUMN; j++){ res += A_h[i + j * NUM_ROW] * X_h[j]; } if (Y_h[i] != res) { printf("i=%d, CPU result=%f, GPU result=%f\n", i, res, Y_h[i]); //CHECK(hipErrorUnknown); } } */ if(p) printf ("PASSED!\n"); hipFree(A_d); hipFree(Y_d); hipFree(X_d); free(A_h); free(Y_h); free(X_h); }
// CPU Timer(in millisecond): double rocblas_wtime( void ){ hipDeviceSynchronize(); struct timeval tv; gettimeofday(&tv, NULL); return (tv.tv_sec * 1000) + tv.tv_usec /1000; };
extern "C" void mixbenchGPU(double *c, long size){ const char *benchtype = "compute with global memory (block strided)"; printf("Trade-off type: %s\n", benchtype); double *cd; CUDA_SAFE_CALL( hipMalloc((void**)&cd, size*sizeof(double)) ); // Copy data to device memory CUDA_SAFE_CALL( hipMemset(cd, 0, size*sizeof(double)) ); // initialize to zeros // Synchronize in order to wait for memory operations to finish CUDA_SAFE_CALL( hipDeviceSynchronize() ); printf("---------------------------------------------------------- CSV data ----------------------------------------------------------\n"); printf("Experiment ID, Single Precision ops,,,, Double precision ops,,,, Integer operations,,, \n"); printf("Compute iters, Flops/byte, ex.time, GFLOPS, GB/sec, Flops/byte, ex.time, GFLOPS, GB/sec, Iops/byte, ex.time, GIOPS, GB/sec\n"); runbench_warmup(cd, size); runbench<32>(cd, size); runbench<31>(cd, size); runbench<30>(cd, size); runbench<29>(cd, size); runbench<28>(cd, size); runbench<27>(cd, size); runbench<26>(cd, size); runbench<25>(cd, size); runbench<24>(cd, size); runbench<23>(cd, size); runbench<22>(cd, size); runbench<21>(cd, size); runbench<20>(cd, size); runbench<19>(cd, size); runbench<18>(cd, size); runbench<17>(cd, size); runbench<16>(cd, size); runbench<15>(cd, size); runbench<14>(cd, size); runbench<13>(cd, size); runbench<12>(cd, size); runbench<11>(cd, size); runbench<10>(cd, size); runbench<9>(cd, size); runbench<8>(cd, size); runbench<7>(cd, size); runbench<6>(cd, size); runbench<5>(cd, size); runbench<4>(cd, size); runbench<3>(cd, size); runbench<2>(cd, size); runbench<1>(cd, size); runbench<0>(cd, size); printf("---------------------------------------------------------- CSV data ----------------------------------------------------------\n"); // Copy results back to host memory CUDA_SAFE_CALL( hipMemcpy(c, cd, size*sizeof(double), hipMemcpyDeviceToHost) ); CUDA_SAFE_CALL( hipFree(cd) ); CUDA_SAFE_CALL( hipDeviceReset() ); }
// Verifies which synchronization techniques actually wait on null-stream work.
//   testMask       - bit tested against the global p_tests filter; skips if unset
//   C_d/C_h        - device/host buffers the kernel fills
//   numElements    - element count of both buffers
//   syncMode       - synchronization technique under test
//   expectMismatch - true when syncMode should NOT synchronize with the null stream
void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, SyncMode syncMode, bool expectMismatch) {
    // This test sends a long-running kernel to the null stream, then tests to see if the
    // specified synchronization technique is effective.
    //
    // Some syncMode are not expected to correctly sync (for example "syncNone").  in these
    // cases the test sets expectMismatch and the check logic below will attempt to ensure that
    // the undesired synchronization did not occur - ie ensure the kernel is still running and did
    // not yet update the stop event.  This can be tricky since if the kernel runs fast enough it
    // may complete before the check.  To prevent this, the addCountReverse has a count parameter
    // which causes it to loop repeatedly, and the results are checked in reverse order.
    //
    // Tests with expectMismatch=true should ensure the kernel finishes correctly. This results
    // are checked and we test to make sure stop event has completed.

    if (!(testMask & p_tests)) {
        return;
    }
    printf ("\ntest 0x%02x: syncMode=%s expectMismatch=%d\n", testMask, syncModeString(syncMode), expectMismatch);

    size_t sizeBytes = numElements * sizeof(int);
    int count =100;  // kernel loop count; keeps the kernel busy long enough to observe
    int init0 = 0;
    HIPCHECK(hipMemset(C_d, init0, sizeBytes));
    for (int i=0; i<numElements; i++) {
        C_h[i] = -1; // initialize
    }

    hipStream_t otherStream = 0;
    // Non-blocking streams opt out of implicit null-stream synchronization.
    unsigned flags = (syncMode == syncMarkerThenOtherNonBlockingStream) ?  hipStreamNonBlocking : hipStreamDefault;
    HIPCHECK(hipStreamCreateWithFlags(&otherStream, flags));
    hipEvent_t stop, otherStreamEvent;
    HIPCHECK(hipEventCreate(&stop));
    HIPCHECK(hipEventCreate(&otherStreamEvent));

    unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);
    // Launch kernel into null stream, should result in C_h == count.
    hipLaunchKernelGGL( HipTest::addCountReverse, dim3(blocks), dim3(threadsPerBlock), 0, 0 /*stream*/, static_cast<const int*>(C_d), C_h, numElements, count);
    HIPCHECK(hipEventRecord(stop, 0/*default*/));

    // Apply the synchronization technique under test.
    switch (syncMode) {
        case syncNone:
            break;
        case syncNullStream:
            HIPCHECK(hipStreamSynchronize(0));  // wait on host for null stream:
            break;
        case syncOtherStream:
            // Does this synchronize with the null stream?
            HIPCHECK(hipStreamSynchronize(otherStream));
            break;
        case syncMarkerThenOtherStream:
        case syncMarkerThenOtherNonBlockingStream:
            // this may wait for NULL stream depending hipStreamNonBlocking flag above
            HIPCHECK(hipEventRecord(otherStreamEvent, otherStream));
            HIPCHECK(hipStreamSynchronize(otherStream));
            break;
        case syncDevice:
            HIPCHECK(hipDeviceSynchronize());
            break;
        default:
            assert(0);
    };

    // The stop event's state tells us whether the chosen sync actually waited.
    hipError_t done = hipEventQuery(stop);
    if (expectMismatch) {
        assert (done == hipErrorNotReady);
    } else {
        assert (done == hipSuccess);
    }

    int mismatches = 0;
    int expected = init0 + count;
    for (int i=0; i<numElements; i++) {
        bool compareEqual = (C_h[i] == expected);
        if (!compareEqual) {
            mismatches ++;
            if (!expectMismatch) {
                printf ("C_h[%d] (%d) != %d\n", i, C_h[i], expected);
                assert(C_h[i] == expected);
            }
        }
    }
    if (expectMismatch) {
        assert (mismatches > 0);
    }

    HIPCHECK(hipStreamDestroy(otherStream));
    HIPCHECK(hipEventDestroy(stop));
    HIPCHECK(hipEventDestroy(otherStreamEvent));

    // Drain everything before returning so the next test starts clean.
    HIPCHECK(hipDeviceSynchronize());

    printf ("test:   OK - %d mismatches (%6.2f%%)\n", mismatches, ((double)(mismatches)*100.0)/numElements);
}