예제 #1
0
void runTests(int64_t numElements)
{
    size_t sizeBytes = numElements * sizeof(int);

    printf ("\n\ntest: starting sequence with sizeBytes=%zu bytes, %6.2f MB\n", sizeBytes, sizeBytes/1024.0/1024.0);


    int *C_h, *C_d;
    HIPCHECK(hipMalloc(&C_d, sizeBytes));
    HIPCHECK(hipHostMalloc(&C_h, sizeBytes));


    {
        test (0x01, C_d, C_h, numElements,  syncNone,                             true /*expectMismatch*/);
        test (0x02, C_d, C_h, numElements,  syncNullStream,                       false /*expectMismatch*/);
        test (0x04, C_d, C_h, numElements,  syncOtherStream,                      true /*expectMismatch*/);
        test (0x08, C_d, C_h, numElements,  syncDevice,                           false /*expectMismatch*/);

        // Sending a marker to to null stream may synchronize the otherStream
        //  - other created with hipStreamNonBlocking=0 : synchronization, should match
        //  - other created with hipStreamNonBlocking=1 : no synchronization, may mismatch
        test (0x10, C_d, C_h, numElements,  syncMarkerThenOtherStream,            false /*expectMismatch*/);

        // TODO - review why this test seems flaky
        //test (0x20, C_d, C_h, numElements,  syncMarkerThenOtherNonBlockingStream, true /*expectMismatch*/);
    }


    HIPCHECK(hipFree(C_d));
    HIPCHECK(hipHostFree(C_h));
}
예제 #2
0
int main()
{
    unsigned flag = 0;
    HIPCHECK(hipDeviceReset());

    int deviceCount = 0;
    HIPCHECK(hipGetDeviceCount(&deviceCount));

    for(int j=0;j<deviceCount;j++){

        HIPCHECK(hipSetDevice(j));

        for(int i=0;i<4;i++){
            flag = 1 << i;
            printf ("Flag=%x\n", flag);
            HIPCHECK(hipSetDeviceFlags(flag));
            //HIPCHECK_API(hipSetDeviceFlags(flag), hipErrorInvalidValue);
        }

        flag = 0;

    }

    passed();
}
void memcpytest2_sizes(size_t maxElem=0, size_t offset=0)
{
    printSep();
    printf ("test: %s<%s>\n", __func__,  TYPENAME(T));

    int deviceId;
    HIPCHECK(hipGetDevice(&deviceId));

    size_t free, total;
    HIPCHECK(hipMemGetInfo(&free, &total));

    if (maxElem == 0) {
        maxElem = free/sizeof(T)/5;
    }

    printf ("  device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB)    maxSize=%6.1fMB offset=%lu\n", 
            deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0, offset);

    for (size_t elem=64; elem+offset<=maxElem; elem*=2) {
        HIPCHECK ( hipDeviceReset() );
        memcpytest2<T>(elem+offset, 0, 1, 1, 0);  // unpinned host
        HIPCHECK ( hipDeviceReset() );
        memcpytest2<T>(elem+offset, 1, 1, 1, 0);  // pinned host
    }
}
int test_gl2(size_t N) {

    size_t Nbytes = N*sizeof(int);

    int *A_d, *B_d, *C_d;
    int *A_h, *B_h, *C_h;

    HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N);



    unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);


    // Full vadd in one large chunk, to get things started:
    HIPCHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
    HIPCHECK ( hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));

    hipLaunchKernel(vectorADD2, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, N);

    HIPCHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));

    HIPCHECK (hipDeviceSynchronize());

    HipTest::checkVectorADD(A_h, B_h, C_h, N);

    return 0;

}
예제 #5
0
void test_manyInflightCopies(hipStream_t stream, int numElements, int numCopies,
                             bool syncBetweenCopies) {
    size_t Nbytes = numElements * sizeof(T);
    size_t eachCopyElements = numElements / numCopies;
    size_t eachCopyBytes = eachCopyElements * sizeof(T);

    printf(
        "------------------------------------------------------------------------------------------"
        "-----\n");
    printf(
        "testing: %s  Nbytes=%zu (%6.1f MB) numCopies=%d eachCopyElements=%zu eachCopyBytes=%zu\n",
        __func__, Nbytes, (double)(Nbytes) / 1024.0 / 1024.0, numCopies, eachCopyElements,
        eachCopyBytes);

    T* A_d;
    T *A_h1, *A_h2;

    HIPCHECK(hipHostMalloc((void**)&A_h1, Nbytes, hipHostMallocDefault));
    HIPCHECK(hipHostMalloc((void**)&A_h2, Nbytes, hipHostMallocDefault));
    HIPCHECK(hipMalloc(&A_d, Nbytes));

    for (int i = 0; i < numElements; i++) {
        A_h1[i] = 3.14f + static_cast<T>(i);
    }


    // stream=0; // fixme TODO


    for (int i = 0; i < numCopies; i++) {
        HIPASSERT(A_d + i * eachCopyElements < A_d + Nbytes);
        HIPCHECK(hipMemcpyAsync(&A_d[i * eachCopyElements], &A_h1[i * eachCopyElements],
                                eachCopyBytes, hipMemcpyHostToDevice, stream));
    }

    if (syncBetweenCopies) {
        HIPCHECK(hipDeviceSynchronize());
    }

    for (int i = 0; i < numCopies; i++) {
        HIPASSERT(A_d + i * eachCopyElements < A_d + Nbytes);
        HIPCHECK(hipMemcpyAsync(&A_h2[i * eachCopyElements], &A_d[i * eachCopyElements],
                                eachCopyBytes, hipMemcpyDeviceToHost, stream));
    }

    HIPCHECK(hipDeviceSynchronize());


    // Verify we copied back all the data correctly:
    for (int i = 0; i < numElements; i++) {
        HIPASSERT(A_h1[i] == A_h2[i]);
    }


    HIPCHECK(hipHostFree(A_h1));
    HIPCHECK(hipHostFree(A_h2));
    HIPCHECK(hipFree(A_d));
}
예제 #6
0
int main() {
    float *A, *Ad;
    HIPCHECK(hipHostMalloc((void**)&A, SIZE, hipHostMallocDefault));
    HIPCHECK(hipMalloc((void**)&Ad, SIZE));
    hipStream_t stream;
    HIPCHECK(hipStreamCreate(&stream));
    for (int i = 0; i < SIZE; i++) {
        HIPCHECK(hipMemcpyAsync(Ad, A, SIZE, hipMemcpyHostToDevice, stream));
        HIPCHECK(hipDeviceSynchronize());
    }
}
예제 #7
0
파일: hipArray.cpp 프로젝트: kknox/HIP
int main(int argc, char *argv[])
{
    HipTest::parseStandardArguments(argc, argv, true);

    printf ("info: set device to %d\n", p_gpuDevice);
    HIPCHECK(hipSetDevice(p_gpuDevice));

    if (p_tests & 0x1) {
        printf ("\n\n=== tests&1 (types)\n");
        printSep();
        HIPCHECK ( hipDeviceReset() );
        size_t width = N/6;
        size_t height = N/6;
        memcpy2Dtest<float>(321, 211, 0);
        memcpy2Dtest<double>(322, 211, 0);
        memcpy2Dtest<char>(320, 211, 0);
        memcpy2Dtest<int>(323, 211, 0);
        printf ("===\n\n\n");

        printf ("\n\n=== tests&1 (types)\n");
        printSep();
        // 2D
        memcpyArraytest<float>(320, 211, 0, 0);
        memcpyArraytest<unsigned int>(322, 211, 0, 0);
        memcpyArraytest<int>(320, 211, 0, 0);
        memcpyArraytest<float>(320, 211, 0, 1);
        memcpyArraytest<float>(322, 211, 0, 1);
        memcpyArraytest<int>(320, 211, 0, 1);
        printSep();
        // 1D
        memcpyArraytest<float>(320, 1, 0);
        memcpyArraytest<unsigned int>(322, 1, 0);
        memcpyArraytest<int>(320, 1, 0);
        printf ("===\n\n\n");
    }

    if (p_tests & 0x4) {
        printf ("\n\n=== tests&4 (test sizes and offsets)\n");
        printSep();
        HIPCHECK ( hipDeviceReset() );
        printSep();
        memcpyArraytest_size<float>(0,0);
        printSep();
        memcpyArraytest_size<float>(0,64);
        printSep();
        memcpyArraytest_size<float>(1024*1024,13);
        printSep();
        memcpyArraytest_size<float>(1024*1024,50);
    }

    passed();

}
int main()
{
    int numDevices = 0;
    int device;
    HIPCHECK(hipGetDeviceCount(&numDevices));
    for(int i=0;i<numDevices;i++){
        HIPCHECK(hipSetDevice(i));
        HIPCHECK(hipGetDevice(&device));
        HIPASSERT(device == i);
    }
    passed();
}
예제 #9
0
int main() {
    int numDevices = 0;
    char name[len];
    hipDevice_t device;
    HIPCHECK(hipGetDeviceCount(&numDevices));
    for (int i = 0; i < numDevices; i++) {
        HIPCHECK(hipDeviceGet(&device, i));
        HIPCHECK(hipDeviceGetName(name, len, device));
        HIPASSERT(name != "");
    }
    passed();
}
예제 #10
0
int main() {
    int numDevices = 0;
    size_t totMem;
    hipDevice_t device;
    HIPCHECK(hipGetDeviceCount(&numDevices));
    for (int i = 0; i < numDevices; i++) {
        HIPCHECK(hipDeviceGet(&device, i));
        HIPCHECK(hipDeviceTotalMem(&totMem, device));
        HIPASSERT(totMem != 0);
    }
    passed();
}
예제 #11
0
unsigned setNumBlocks(unsigned blocksPerCU, unsigned threadsPerBlock, size_t N) {
    int device;
    HIPCHECK(hipGetDevice(&device));
    hipDeviceProp_t props;
    HIPCHECK(hipGetDeviceProperties(&props, device));

    unsigned blocks = props.multiProcessorCount * blocksPerCU;
    if (blocks * threadsPerBlock > N) {
        blocks = (N + threadsPerBlock - 1) / threadsPerBlock;
    }

    return blocks;
}
예제 #12
0
int main()
{
    int numDevices = 0;
    int major,minor;
    hipDevice_t device;
    HIPCHECK(hipGetDeviceCount(&numDevices));
    for(int i=0;i<numDevices;i++){
        HIPCHECK(hipDeviceGet(&device,i));
        HIPCHECK(hipDeviceComputeCapability(&major, &minor, device));
        HIPASSERT(major >= 0);
        HIPASSERT(minor >= 0);
    }
    passed();
}
예제 #13
0
bool testhipMemset3D(int memsetval,int p_gpuDevice)
{
    size_t numH = 256;
    size_t numW = 256;
    size_t depth = 10;
    size_t width = numW * sizeof(char);
    size_t sizeElements = width * numH * depth;
    size_t elements = numW* numH* depth;


	printf ("testhipMemset3D memsetval=%2x device=%d\n", memsetval, p_gpuDevice);
    char *A_h;
    bool testResult = true;
    hipExtent extent = make_hipExtent(width, numH, depth);
    hipPitchedPtr devPitchedPtr;

    HIPCHECK(hipMalloc3D(&devPitchedPtr, extent));
	A_h = (char*)malloc(sizeElements);
	HIPASSERT(A_h != NULL);
	for (size_t i=0; i<elements; i++) {
        A_h[i] = 1;
    }
	HIPCHECK ( hipMemset3D( devPitchedPtr, memsetval, extent) );
	hipMemcpy3DParms myparms = {0};
	myparms.srcPos = make_hipPos(0,0,0);
	myparms.dstPos = make_hipPos(0,0,0);
	myparms.dstPtr = make_hipPitchedPtr(A_h, width , numW, numH);
	myparms.srcPtr = devPitchedPtr;
	myparms.extent = extent;
#ifdef __HIP_PLATFORM_NVCC__
	myparms.kind = hipMemcpyKindToCudaMemcpyKind(hipMemcpyDeviceToHost);
#else
    myparms.kind = hipMemcpyDeviceToHost;
#endif
    HIPCHECK(hipMemcpy3D(&myparms));

    for (int i=0; i<elements; i++) {
        if (A_h[i] != memsetval) {
			testResult = false;
            printf("mismatch at index:%d computed:%02x, memsetval:%02x\n", i, (int)A_h[i], (int)memsetval);
            break;
        }
    }
    HIPCHECK(hipFree(devPitchedPtr.ptr));
    free(A_h);
    return testResult;
}
void waitStreams(int iterations)
{
    // Repeatedly sync and wait for all streams to complete.
    // TO make this interesting, the test has other threads repeatedly adding and removing streams to the device.
    for (int i=0; i<iterations; i++) {
        HIPCHECK(hipDeviceSynchronize());
    }
}
예제 #15
0
int main(int argc, char* argv[]) {
    HipTest::parseStandardArguments(argc, argv, false);
    parseMyArguments(argc, argv);


    printf("info: set device to %d  tests=%x\n", p_gpuDevice, p_tests);
    HIPCHECK(hipSetDevice(p_gpuDevice));

    if (p_tests & 0x01) {
        simpleNegTest();
    }

    if (p_tests & 0x02) {
        hipStream_t stream;
        HIPCHECK(hipStreamCreate(&stream));

        test_manyInflightCopies<float>(stream, 1024, 16, true);
        test_manyInflightCopies<float>(
            stream, 1024, 4, true);  // verify we re-use the same entries instead of growing pool.
        test_manyInflightCopies<float>(stream, 1024 * 8, 64, false);

        HIPCHECK(hipStreamDestroy(stream));
    }


    if (p_tests & 0x04) {
        test_chunkedAsyncExample(p_streams, true, true, true);     // Easy sync version
        test_chunkedAsyncExample(p_streams, false, true, true);    // Easy sync version
        test_chunkedAsyncExample(p_streams, false, false, true);   // Some async
        test_chunkedAsyncExample(p_streams, false, false, false);  // All async
    }

    if (p_tests & 0x08) {
        hipStream_t stream;
        HIPCHECK(hipStreamCreate(&stream));

        //        test_pingpong<int, Pinned>(stream, 1024*1024*32, 1, 1, false);
        //        test_pingpong<int, Pinned>(stream, 1024*1024*32, 1, 10, false);

        HIPCHECK(hipStreamDestroy(stream));
    }


    passed();
}
예제 #16
0
int main(int argc, char *argv[])
{
    HipTest::parseStandardArguments(argc, argv, true);
    bool testResult = false;
    HIPCHECK(hipSetDevice(p_gpuDevice));
    testResult = testhipMemset3D(memsetval, p_gpuDevice);
    if (testResult) {
        passed();
    } else {
        exit(EXIT_FAILURE);
    }
}
void run1(size_t size, hipStream_t stream){
	float *Ah, *Bh, *Cd, *Dd, *Eh;

	HIPCHECK(hipHostMalloc((void**)&Ah, size, hipHostMallocDefault));
	HIPCHECK(hipHostMalloc((void**)&Bh, size, hipHostMallocDefault));
	HIPCHECK(hipMalloc(&Cd, size));
	HIPCHECK(hipMalloc(&Dd, size));
	HIPCHECK(hipHostMalloc((void**)&Eh, size, hipHostMallocDefault));

	for(int i=0;i<N;i++){
		Ah[i] = 1.0f;
	}

	HIPCHECK(hipMemcpyAsync(Bh, Ah, size, hipMemcpyHostToHost, stream));
	HIPCHECK(hipMemcpyAsync(Cd, Bh, size, hipMemcpyHostToDevice, stream));
	hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream, Cd);
	HIPCHECK(hipMemcpyAsync(Dd, Cd, size, hipMemcpyDeviceToDevice, stream));
	HIPCHECK(hipMemcpyAsync(Eh, Dd, size, hipMemcpyDeviceToHost, stream));
	HIPCHECK(hipDeviceSynchronize());
	HIPASSERT(Eh[10] == Ah[10] + 1.0f);
}
예제 #18
0
void simpleNegTest() {
    printf("testing: %s\n", __func__);
    hipError_t e;
    float *A_malloc, *A_pinned, *A_d;

    size_t Nbytes = N * sizeof(float);
    A_malloc = (float*)malloc(Nbytes);
    HIPCHECK(hipHostMalloc((void**)&A_pinned, Nbytes, hipHostMallocDefault));
    A_d = NULL;
    HIPCHECK(hipMalloc(&A_d, Nbytes));
    HIPASSERT(A_d != NULL);
    // Can't use default with async copy
    e = hipMemcpyAsync(A_pinned, A_d, Nbytes, hipMemcpyDefault, NULL);
    //    HIPASSERT (e == hipSuccess);


    // Not sure what happens here, the memory must be pinned.
    e = hipMemcpyAsync(A_malloc, A_d, Nbytes, hipMemcpyDeviceToHost, NULL);

    printf("  async memcpy of A_malloc to A_d. Result=%d\n", e);
    // HIPASSERT (e==hipErrorInvalidValue);
}
예제 #19
0
파일: hipArray.cpp 프로젝트: kknox/HIP
void memcpy2Dtest(size_t numW, size_t numH, bool usePinnedHost)
{

  size_t width = numW * sizeof(T);
  size_t sizeElements = width * numH;

  printf("memcpy2Dtest: %s<%s> size=%lu (%6.2fMB) W: %d, H:%d, usePinnedHost: %d\n",
         __func__,
         TYPENAME(T),
         sizeElements, sizeElements/1024.0/1024.0,
         (int)numW, (int)numH, usePinnedHost);

  T *A_d, *B_d, *C_d;
  T *A_h, *B_h, *C_h;

  size_t pitch_A, pitch_B, pitch_C;

  hipChannelFormatDesc desc = hipCreateChannelDesc<T>();
  HipTest::initArrays2DPitch(&A_d, &B_d, &C_d, &pitch_A, &pitch_B, &pitch_C, numW, numH);
  HipTest::initArraysForHost(&A_h, &B_h, &C_h, numW*numH, usePinnedHost);
  unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numW*numH);

  HIPCHECK (hipMemcpy2D (A_d, pitch_A, A_h, width, width, numH, hipMemcpyHostToDevice) );
  HIPCHECK (hipMemcpy2D (B_d, pitch_B, B_h, width, width, numH, hipMemcpyHostToDevice) );

  hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, (pitch_C/sizeof(T))*numH);

  HIPCHECK (hipMemcpy2D (C_h, width, C_d, pitch_C, width, numH, hipMemcpyDeviceToHost) );

  HIPCHECK ( hipDeviceSynchronize() );
  HipTest::checkVectorADD(A_h, B_h, C_h, numW*numH);

  HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, usePinnedHost);

  printf ("  %s success\n", __func__);
}
// Create a lot of streams and then destroy 'em.
void createThenDestroyStreams(int iterations, int burstSize)
{
    hipStream_t *streams = new hipStream_t[burstSize];

    for (int i=0; i<iterations; i++) {
        if (p_verbose & 0x1) {
          printf ("%s iter=%d, create %d then destroy %d\n", __func__, i, burstSize, burstSize);
        }
        for (int j=0; j<burstSize; j++) {
            if (p_verbose & 0x2) {
                printf ("  %d.%d streamCreate\n", i, j);
            }
            HIPCHECK( hipStreamCreate(&streams[j]));
        }
        for (int j=0; j<burstSize; j++) {
            if (p_verbose & 0x2) {
                printf ("  %d.%d streamDestroy\n", i, j);
            }
            HIPCHECK( hipStreamDestroy(streams[j]));
        }
    }

    delete streams;
}
int main(int argc, char **argv){
	HipTest::parseStandardArguments(argc, argv, true);


	hipStream_t stream[3];
	for(int i=0;i<3;i++){
		HIPCHECK(hipStreamCreate(&stream[i]));
	}

	const size_t size = N * sizeof(float);

	std::thread t1(run1, size, stream[0]);
	std::thread t2(run1, size, stream[0]);
	std::thread t3(run, size, stream[1], stream[2]);
	t1.join();
//	std::cout<<"T1"<<std::endl;
	t2.join();
//	std::cout<<"T2"<<std::endl;
	t3.join(); 
	passed();
}
예제 #22
0
int main(int argc, char *argv[])
{
    hipStream_t stream;
    unsigned int flags;
    HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamDefault));
    HIPCHECK(hipStreamGetFlags(stream, &flags));
    HIPASSERT(flags == 0); 
    HIPCHECK(hipStreamDestroy(stream));

    HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
    HIPCHECK(hipStreamGetFlags(stream, &flags));
    HIPASSERT(flags == 1); 
    HIPCHECK(hipStreamDestroy(stream));

    passed();
}
예제 #23
0
int main()
{
    size_t Nbytes = N*sizeof(int);
    int numDevices = 0;
    int *A_d, *B_d, *C_d, *X_d, *Y_d, *Z_d;
    int *A_h, *B_h, *C_h ;
    hipStream_t s;

    HIPCHECK(hipGetDeviceCount(&numDevices));
     if(numDevices > 1)
     {
        HIPCHECK(hipSetDevice(0));
        unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
        HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
        HIPCHECK(hipSetDevice(1));
        HIPCHECK(hipMalloc(&X_d,Nbytes));
        HIPCHECK(hipMalloc(&Y_d,Nbytes));
        HIPCHECK(hipMalloc(&Z_d,Nbytes));


        HIPCHECK(hipSetDevice(0));
        HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
        HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
        hipLaunchKernel(
            HipTest::vectorADD,
            dim3(blocks),
            dim3(threadsPerBlock),
            0,
            0,
            static_cast<const int*>(A_d),
            static_cast<const int*>(B_d),
            C_d,
            N);
        HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
        HIPCHECK(hipDeviceSynchronize());
        HipTest::checkVectorADD(A_h, B_h, C_h, N);

        HIPCHECK(hipStreamCreate(&s));
        HIPCHECK(hipSetDevice(1));
        HIPCHECK(hipMemcpyDtoDAsync((hipDeviceptr_t)X_d, (hipDeviceptr_t)A_d, Nbytes, s));
        HIPCHECK(hipMemcpyDtoDAsync((hipDeviceptr_t)Y_d, (hipDeviceptr_t)B_d, Nbytes, s));

        hipLaunchKernel(
            HipTest::vectorADD,
            dim3(blocks),
            dim3(threadsPerBlock),
            0,
            0,
            static_cast<const int*>(X_d),
            static_cast<const int*>(Y_d),
            Z_d,
            N);
        HIPCHECK(hipMemcpyDtoHAsync(C_h, (hipDeviceptr_t)Z_d, Nbytes, s));
        HIPCHECK(hipStreamSynchronize(s));
        HIPCHECK(hipDeviceSynchronize());

        HipTest::checkVectorADD(A_h, B_h, C_h, N);
        HIPCHECK(hipStreamDestroy(s));
        HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
        HIPCHECK(hipFree(X_d));
        HIPCHECK(hipFree(Y_d));
        HIPCHECK(hipFree(Z_d));
     }

        passed();


}
int main(int argc, char *argv[])
{
    HipTest::parseStandardArguments(argc, argv, true);

    printf ("info: set device to %d\n", p_gpuDevice);
    HIPCHECK(hipSetDevice(p_gpuDevice));


    if (p_tests & 0x1) {
        printf ("\n\n=== tests&1 (types and different memcpy kinds (H2D, D2H, H2H, D2D)\n");
        HIPCHECK ( hipDeviceReset() );
        memcpytest2_for_type<float>(N);
        memcpytest2_for_type<double>(N);
        memcpytest2_for_type<char>(N);
        memcpytest2_for_type<int>(N);
        printf ("===\n\n\n");
    }


    if (p_tests & 0x2) {
        // Some tests around the 64MB boundary which have historically shown issues:
        printf ("\n\n=== tests&0x2 (64MB boundary)\n");
#if 0
        // These all pass:
        memcpytest2<float>(15*1024*1024, 1, 0, 0, 0);  
        memcpytest2<float>(16*1024*1024, 1, 0, 0, 0);  
        memcpytest2<float>(16*1024*1024+16*1024,  1, 0, 0, 0);  
#endif
        // Just over 64MB:
        memcpytest2<float>(16*1024*1024+512*1024,  1, 0, 0, 0);  
        memcpytest2<float>(17*1024*1024+1024,  1, 0, 0, 0);  
        memcpytest2<float>(32*1024*1024, 1, 0, 0, 0);  
        memcpytest2<float>(32*1024*1024, 0, 0, 0, 0);  
        memcpytest2<float>(32*1024*1024, 1, 1, 1, 0);  
        memcpytest2<float>(32*1024*1024, 1, 1, 1, 0);  
    }


    if (p_tests & 0x4) {
        printf ("\n\n=== tests&4 (test sizes and offsets)\n");
        HIPCHECK ( hipDeviceReset() );
        printSep();
        memcpytest2_sizes<float>(0,0);
        printSep();
        memcpytest2_sizes<float>(0,64);
        printSep();
        memcpytest2_sizes<float>(1024*1024, 13);
        printSep();
        memcpytest2_sizes<float>(1024*1024, 50);
    }

    if (p_tests & 0x8) {
        printf ("\n\n=== tests&8\n");
        HIPCHECK ( hipDeviceReset() );
        printSep();

        // Simplest cases: serialize the threads, and also used pinned memory:
        // This verifies that the sub-calls to memcpytest2 are correct.
        multiThread_1<float>(true, true); 

        // Serialize, but use unpinned memory to stress the unpinned memory xfer path.
        multiThread_1<float>(true, false);

        // Remove serialization, so two threads are performing memory copies in parallel.
        multiThread_1<float>(false, true);

        // Remove serialization, and use unpinned.
        multiThread_1<float>(false, false); // TODO
        printf ("===\n\n\n");
    }


    passed();

}
예제 #25
0
void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, SyncMode syncMode, bool expectMismatch)
{

    // This test sends a long-running kernel to the null stream, then tests to see if the
    // specified synchronization technique is effective.
    //
    // Some syncMode are not expected to correctly sync (for example "syncNone").  in these
    // cases the test sets expectMismatch and the check logic below will attempt to ensure that
    // the undesired synchronization did not occur - ie ensure the kernel is still running and did
    // not yet update the stop event.  This can be tricky since if the kernel runs fast enough it
    // may complete before the check.  To prevent this, the addCountReverse has a count parameter
    // which causes it to loop repeatedly, and the results are checked in reverse order.
    //
    // Tests with expectMismatch=true should ensure the kernel finishes correctly. This results
    // are checked and we test to make sure stop event has completed.

    if (!(testMask & p_tests)) {
        return;
    }
    printf ("\ntest 0x%02x: syncMode=%s expectMismatch=%d\n",
            testMask, syncModeString(syncMode), expectMismatch);

    size_t sizeBytes = numElements * sizeof(int);

    int count =100;
    int init0 = 0;
    HIPCHECK(hipMemset(C_d, init0, sizeBytes));
    for (int i=0; i<numElements; i++) {
        C_h[i] = -1; // initialize
    }

    hipStream_t otherStream = 0;
    unsigned flags = (syncMode == syncMarkerThenOtherNonBlockingStream) ?  hipStreamNonBlocking : hipStreamDefault;
    HIPCHECK(hipStreamCreateWithFlags(&otherStream, flags));
    hipEvent_t stop, otherStreamEvent;
    HIPCHECK(hipEventCreate(&stop));
    HIPCHECK(hipEventCreate(&otherStreamEvent));


    unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);
    // Launch kernel into null stream, should result in C_h == count.
    hipLaunchKernelGGL(
        HipTest::addCountReverse,
        dim3(blocks),
        dim3(threadsPerBlock),
        0,
        0 /*stream*/,
        static_cast<const int*>(C_d),
        C_h,
        numElements,
        count);
    HIPCHECK(hipEventRecord(stop, 0/*default*/));

    switch (syncMode) {
        case syncNone:
            break;
        case syncNullStream:
            HIPCHECK(hipStreamSynchronize(0));  // wait on host for null stream:
            break;
        case syncOtherStream:
            // Does this synchronize with the null stream?
            HIPCHECK(hipStreamSynchronize(otherStream));
            break;
        case syncMarkerThenOtherStream:
        case syncMarkerThenOtherNonBlockingStream:

            // this may wait for NULL stream depending hipStreamNonBlocking flag above
            HIPCHECK(hipEventRecord(otherStreamEvent, otherStream));

            HIPCHECK(hipStreamSynchronize(otherStream));
            break;
        case syncDevice:
            HIPCHECK(hipDeviceSynchronize());
            break;
        default:
            assert(0);
    };

    hipError_t done = hipEventQuery(stop);

    if (expectMismatch) {
        assert (done == hipErrorNotReady);
    } else {
        assert (done == hipSuccess);
    }

    int mismatches = 0;
    int expected = init0 + count;
    for (int i=0; i<numElements; i++) {
        bool compareEqual = (C_h[i] == expected);
        if (!compareEqual) {
            mismatches ++;
            if  (!expectMismatch) {
                printf ("C_h[%d] (%d) != %d\n", i, C_h[i], expected);
                assert(C_h[i] == expected);
            }
        }
    }

    if (expectMismatch) {
        assert (mismatches > 0);
    }


    HIPCHECK(hipStreamDestroy(otherStream));
    HIPCHECK(hipEventDestroy(stop));
    HIPCHECK(hipEventDestroy(otherStreamEvent));

    HIPCHECK(hipDeviceSynchronize());

    printf ("test:   OK - %d mismatches (%6.2f%%)\n",  mismatches, ((double)(mismatches)*100.0)/numElements);
}
예제 #26
0
void test_pingpong(hipStream_t stream, size_t numElements, int numInflight, int numPongs,
                   bool doHostSide) {
    HIPASSERT(numElements % numInflight == 0);  // Must be evenly divisible.
    size_t Nbytes = numElements * sizeof(T);
    size_t eachCopyElements = numElements / numInflight;
    size_t eachCopyBytes = eachCopyElements * sizeof(T);

    unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);

    printf(
        "------------------------------------------------------------------------------------------"
        "-----\n");
    printf(
        "testing: %s<%s>  Nbytes=%zu (%6.1f MB) numPongs=%d numInflight=%d eachCopyElements=%zu "
        "eachCopyBytes=%zu\n",
        __func__, HostTraits<AllocType>::Name(), Nbytes, (double)(Nbytes) / 1024.0 / 1024.0,
        numPongs, numInflight, eachCopyElements, eachCopyBytes);

    T* A_h = NULL;
    T* A_d = NULL;

    A_h = (T*)(HostTraits<AllocType>::Alloc(Nbytes));
    HIPCHECK(hipMalloc(&A_d, Nbytes));

    // Initialize the host array:
    const T initValue = 13;
    const T deviceConst = 2;
    const T hostConst = 10000;
    for (size_t i = 0; i < numElements; i++) {
        A_h[i] = initValue + i;
    }


    for (int k = 0; k < numPongs; k++) {
        for (int i = 0; i < numInflight; i++) {
            HIPASSERT(A_d + i * eachCopyElements < A_d + Nbytes);
            HIPCHECK(hipMemcpyAsync(&A_d[i * eachCopyElements], &A_h[i * eachCopyElements],
                                    eachCopyBytes, hipMemcpyHostToDevice, stream));
        }

        hipLaunchKernel(addK<T>, dim3(blocks), dim3(threadsPerBlock), 0, stream, A_d, 2,
                        numElements);

        for (int i = 0; i < numInflight; i++) {
            HIPASSERT(A_d + i * eachCopyElements < A_d + Nbytes);
            HIPCHECK(hipMemcpyAsync(&A_h[i * eachCopyElements], &A_d[i * eachCopyElements],
                                    eachCopyBytes, hipMemcpyDeviceToHost, stream));
        }

        if (doHostSide) {
            assert(0);
#if 0
            hipEvent_t e;
            HIPCHECK(hipEventCreate(&e));
#endif
            HIPCHECK(hipDeviceSynchronize());
            for (size_t i = 0; i < numElements; i++) {
                A_h[i] += hostConst;
            }
        }
    };

    HIPCHECK(hipDeviceSynchronize());


    // Verify we copied back all the data correctly:
    for (size_t i = 0; i < numElements; i++) {
        T gold = initValue + i;
        // Perform calcs in same order as test above to replicate FP order-of-operations:
        for (int k = 0; k < numPongs; k++) {
            gold += deviceConst;
            if (doHostSide) {
                gold += hostConst;
            }
        }

        if (gold != A_h[i]) {
            std::cout << i << ": gold=" << gold << " out=" << A_h[i] << std::endl;
            HIPASSERT(gold == A_h[i]);
        }
    }


    HIPCHECK(hipHostFree(A_h));
    HIPCHECK(hipFree(A_d));
}
예제 #27
0
 static void* Alloc(size_t sizeBytes) {
     void* p;
     HIPCHECK(hipHostMalloc((void**)&p, sizeBytes, hipHostMallocDefault));
     return p;
 };
void run(size_t size, hipStream_t stream1, hipStream_t stream2){
	float *Ah, *Bh, *Cd, *Dd, *Eh;
	float *Ahh, *Bhh, *Cdd, *Ddd, *Ehh;

	HIPCHECK(hipHostMalloc((void**)&Ah, size, hipHostMallocDefault));
	HIPCHECK(hipHostMalloc((void**)&Bh, size, hipHostMallocDefault));
	HIPCHECK(hipMalloc(&Cd, size));
	HIPCHECK(hipMalloc(&Dd, size));
	HIPCHECK(hipHostMalloc((void**)&Eh, size, hipHostMallocDefault));
	HIPCHECK(hipHostMalloc((void**)&Ahh, size, hipHostMallocDefault));
	HIPCHECK(hipHostMalloc((void**)&Bhh, size, hipHostMallocDefault));
	HIPCHECK(hipMalloc(&Cdd, size));
	HIPCHECK(hipMalloc(&Ddd, size));
	HIPCHECK(hipHostMalloc((void**)&Ehh, size, hipHostMallocDefault));

	HIPCHECK(hipMemcpyAsync(Bh, Ah, size, hipMemcpyHostToHost, stream1));
	HIPCHECK(hipMemcpyAsync(Bhh, Ahh, size, hipMemcpyHostToHost, stream2));
	HIPCHECK(hipMemcpyAsync(Cd, Bh, size, hipMemcpyHostToDevice, stream1));
	HIPCHECK(hipMemcpyAsync(Cdd, Bhh, size, hipMemcpyHostToDevice, stream2));
	hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream1, Cd);
	hipLaunchKernel(HIP_KERNEL_NAME(Inc), dim3(N/500), dim3(500), 0, stream2, Cdd);
	HIPCHECK(hipMemcpyAsync(Dd, Cd, size, hipMemcpyDeviceToDevice, stream1));
	HIPCHECK(hipMemcpyAsync(Ddd, Cdd, size, hipMemcpyDeviceToDevice, stream2));
	HIPCHECK(hipMemcpyAsync(Eh, Dd, size, hipMemcpyDeviceToHost, stream1));
	HIPCHECK(hipMemcpyAsync(Ehh, Ddd, size, hipMemcpyDeviceToHost, stream2));
	HIPCHECK(hipDeviceSynchronize());
	HIPASSERT(Eh[10] = Ah[10] + 1.0f);
	HIPASSERT(Ehh[10] = Ahh[10] + 1.0f);
}
void memcpytest2(size_t numElements, bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault)
{
    size_t sizeElements = numElements * sizeof(T);
    printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d\n", 
            __func__, 
            TYPENAME(T),
            sizeElements, sizeElements/1024.0/1024.0,
            usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault);


    T *A_d, *B_d, *C_d;
    T *A_h, *B_h, *C_h;


    HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, numElements, usePinnedHost);
    unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);

    T *A_hh = NULL;
    T *B_hh = NULL;
    T *C_dd = NULL;



    if (useHostToHost) {
        if (usePinnedHost) {
            HIPCHECK ( hipHostMalloc((void**)&A_hh, sizeElements, hipHostMallocDefault) );
            HIPCHECK ( hipHostMalloc((void**)&B_hh, sizeElements, hipHostMallocDefault) );
        } else {
            A_hh = (T*)malloc(sizeElements);
            B_hh = (T*)malloc(sizeElements);
        }


        // Do some extra host-to-host copies here to mix things up:
        HIPCHECK ( hipMemcpy(A_hh, A_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost));
        HIPCHECK ( hipMemcpy(B_hh, B_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost));


        HIPCHECK ( hipMemcpy(A_d, A_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
        HIPCHECK ( hipMemcpy(B_d, B_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
    } else {
        HIPCHECK ( hipMemcpy(A_d, A_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
        HIPCHECK ( hipMemcpy(B_d, B_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
    }

    hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements);

    if (useDeviceToDevice) {
        HIPCHECK ( hipMalloc(&C_dd, sizeElements) );

        // Do an extra device-to-device copies here to mix things up:
        HIPCHECK ( hipMemcpy(C_dd, C_d,  sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyDeviceToDevice));

        //Destroy the original C_d:
        HIPCHECK ( hipMemset(C_d, 0x5A, sizeElements));

        HIPCHECK ( hipMemcpy(C_h, C_dd, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost));
    } else {
        HIPCHECK ( hipMemcpy(C_h, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost));
    }

    HIPCHECK ( hipDeviceSynchronize() );
    HipTest::checkVectorADD(A_h, B_h, C_h, numElements);

    HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, usePinnedHost);

    printf ("  %s success\n", __func__);
}
예제 #30
0
// IN: nStreams : number of streams to use for the test
// IN :useNullStream - use NULL stream.  Synchronizes everything.
// IN: useSyncMemcpyH2D - use sync memcpy (no overlap) for H2D
// IN: useSyncMemcpyD2H - use sync memcpy (no overlap) for D2H
void test_chunkedAsyncExample(int nStreams, bool useNullStream, bool useSyncMemcpyH2D,
                              bool useSyncMemcpyD2H) {
    size_t Nbytes = N * sizeof(int);
    printf("testing: %s(useNullStream=%d, useSyncMemcpyH2D=%d, useSyncMemcpyD2H=%d)  ", __func__,
           useNullStream, useSyncMemcpyH2D, useSyncMemcpyD2H);
    printf("Nbytes=%zu (%6.1f MB)\n", Nbytes, (double)(Nbytes) / 1024.0 / 1024.0);

    int *A_d, *B_d, *C_d;
    int *A_h, *B_h, *C_h;

    HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, true);


    unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);


    hipStream_t* stream = (hipStream_t*)malloc(sizeof(hipStream_t) * nStreams);
    if (useNullStream) {
        nStreams = 1;
        stream[0] = NULL;
    } else {
        for (int i = 0; i < nStreams; ++i) {
            HIPCHECK(hipStreamCreate(&stream[i]));
        }
    }


    size_t workLeft = N;
    size_t workPerStream = N / nStreams;
    for (int i = 0; i < nStreams; ++i) {
        size_t work = (workLeft < workPerStream) ? workLeft : workPerStream;
        size_t workBytes = work * sizeof(int);

        size_t offset = i * workPerStream;
        HIPASSERT(A_d + offset < A_d + Nbytes);
        HIPASSERT(B_d + offset < B_d + Nbytes);
        HIPASSERT(C_d + offset < C_d + Nbytes);
        if (useSyncMemcpyH2D) {
            HIPCHECK(hipMemcpy(&A_d[offset], &A_h[offset], workBytes, hipMemcpyHostToDevice));
            HIPCHECK(hipMemcpy(&B_d[offset], &B_h[offset], workBytes, hipMemcpyHostToDevice));
        } else {
            HIPCHECK(hipMemcpyAsync(&A_d[offset], &A_h[offset], workBytes, hipMemcpyHostToDevice,
                                    stream[i]));
            HIPCHECK(hipMemcpyAsync(&B_d[offset], &B_h[offset], workBytes, hipMemcpyHostToDevice,
                                    stream[i]));
        };

        hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, stream[i],
                        &A_d[offset], &B_d[offset], &C_d[offset], work);

        if (useSyncMemcpyD2H) {
            HIPCHECK(hipMemcpy(&C_h[offset], &C_d[offset], workBytes, hipMemcpyDeviceToHost));
        } else {
            HIPCHECK(hipMemcpyAsync(&C_h[offset], &C_d[offset], workBytes, hipMemcpyDeviceToHost,
                                    stream[i]));
        }
    }


    HIPCHECK(hipDeviceSynchronize());


    HipTest::checkVectorADD(A_h, B_h, C_h, N);

    HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, true);

    free(stream);
};