Example #1
void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, SyncMode syncMode, bool expectMismatch)
{

    // This test sends a long-running kernel to the null stream, then checks whether the
    // specified synchronization technique is effective.
    //
    // Some syncModes are not expected to synchronize correctly (for example "syncNone").  In these
    // cases the test sets expectMismatch and the check logic below will attempt to ensure that
    // the undesired synchronization did not occur - i.e., ensure the kernel is still running and has
    // not yet updated the stop event.  This can be tricky since if the kernel runs fast enough it
    // may complete before the check.  To prevent this, the addCountReverse kernel has a count parameter
    // which causes it to loop repeatedly, and the results are checked in reverse order.
    //
    // Tests with expectMismatch=false should ensure the kernel finishes correctly: the results
    // are checked and we verify that the stop event has completed.
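    //
    // For reference, a minimal sketch of what the addCountReverse kernel is assumed to look
    // like (the real kernel lives in the HipTest helpers and may differ): each thread walks
    // its elements in reverse order on every one of the `count` passes, so a kernel that is
    // still running is detectable from partially updated results.
    //
    //   __global__ void addCountReverse(const int *A_d, int *C_d, int64_t NELEM, int count) {
    //       int64_t offset = blockIdx.x * blockDim.x + threadIdx.x;
    //       int64_t stride = blockDim.x * gridDim.x;
    //       for (int iter = 0; iter < count; iter++) {
    //           for (int64_t i = NELEM - 1 - offset; i >= 0; i -= stride) {
    //               C_d[i] = A_d[i] + iter + 1;   // the final pass leaves A_d[i] + count
    //           }
    //       }
    //   }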

    if (!(testMask & p_tests)) {
        return;
    }
    printf ("\ntest 0x%02x: syncMode=%s expectMismatch=%d\n",
            testMask, syncModeString(syncMode), expectMismatch);

    size_t sizeBytes = numElements * sizeof(int);

    int count = 100;
    int init0 = 0;
    HIPCHECK(hipMemset(C_d, init0, sizeBytes));
    for (int i=0; i<numElements; i++) {
        C_h[i] = -1; // initialize
    }

    hipStream_t otherStream = 0;
    unsigned flags = (syncMode == syncMarkerThenOtherNonBlockingStream) ?  hipStreamNonBlocking : hipStreamDefault;
    HIPCHECK(hipStreamCreateWithFlags(&otherStream, flags));
    hipEvent_t stop, otherStreamEvent;
    HIPCHECK(hipEventCreate(&stop));
    HIPCHECK(hipEventCreate(&otherStreamEvent));


    unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);
    // Launch the kernel into the null stream; it should leave C_h[i] == init0 + count.
    hipLaunchKernelGGL(
        HipTest::addCountReverse,
        dim3(blocks),
        dim3(threadsPerBlock),
        0,
        0 /*stream*/,
        static_cast<const int*>(C_d),
        C_h,
        numElements,
        count);
    HIPCHECK(hipEventRecord(stop, 0/*default*/));

    switch (syncMode) {
        case syncNone:
            break;
        case syncNullStream:
            HIPCHECK(hipStreamSynchronize(0));  // wait on the host for the null stream
            break;
        case syncOtherStream:
            // Does this synchronize with the null stream?
            HIPCHECK(hipStreamSynchronize(otherStream));
            break;
        case syncMarkerThenOtherStream:
        case syncMarkerThenOtherNonBlockingStream:

            // This may wait for the NULL stream, depending on the hipStreamNonBlocking flag used above.
            HIPCHECK(hipEventRecord(otherStreamEvent, otherStream));

            HIPCHECK(hipStreamSynchronize(otherStream));
            break;
        case syncDevice:
            HIPCHECK(hipDeviceSynchronize());
            break;
        default:
            assert(0);
    }

    hipError_t done = hipEventQuery(stop);

    if (expectMismatch) {
        assert (done == hipErrorNotReady);
    } else {
        assert (done == hipSuccess);
    }

    int mismatches = 0;
    int expected = init0 + count;
    for (int i=0; i<numElements; i++) {
        bool compareEqual = (C_h[i] == expected);
        if (!compareEqual) {
            mismatches++;
            if (!expectMismatch) {
                printf ("C_h[%d] (%d) != %d\n", i, C_h[i], expected);
                assert(C_h[i] == expected);
            }
        }
    }

    if (expectMismatch) {
        assert (mismatches > 0);
    }


    HIPCHECK(hipStreamDestroy(otherStream));
    HIPCHECK(hipEventDestroy(stop));
    HIPCHECK(hipEventDestroy(otherStreamEvent));

    HIPCHECK(hipDeviceSynchronize());

    printf ("test:   OK - %d mismatches (%6.2f%%)\n",  mismatches, ((double)(mismatches)*100.0)/numElements);
}
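
A hedged driver sketch follows, showing how the sync modes above might be exercised. The masks and expectMismatch values are illustrative assumptions (under legacy null-stream semantics, only syncNone, syncOtherStream, and the non-blocking marker case are expected to leave the null-stream kernel unsynchronized); the real test presumably selects cases through the p_tests mask checked at the top of test().

// Illustrative driver only: each case gets its own bit so it can be selected via p_tests.
void runAllSyncModes(int *C_d, int *C_h, int64_t numElements)
{
    test(0x01, C_d, C_h, numElements, syncNone,                             true /*expectMismatch*/);
    test(0x02, C_d, C_h, numElements, syncNullStream,                       false);
    test(0x04, C_d, C_h, numElements, syncOtherStream,                      true);
    test(0x08, C_d, C_h, numElements, syncMarkerThenOtherStream,            false);
    test(0x10, C_d, C_h, numElements, syncMarkerThenOtherNonBlockingStream, true);
    test(0x20, C_d, C_h, numElements, syncDevice,                           false);
}
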
Example #2
File: HW2.cpp  Project: briansp2020/cs344
//return types are void since any internal error will be handled by quitting
//no point in returning error codes...
//returns pointers to an RGBA version of the input image and to the RGBA output image,
//on both the host and device, plus device buffers for the three blurred color channels
void preProcess(uchar4 **h_inputImageRGBA, uchar4 **h_outputImageRGBA,
                uchar4 **d_inputImageRGBA, uchar4 **d_outputImageRGBA,
                unsigned char **d_redBlurred,
                unsigned char **d_greenBlurred,
                unsigned char **d_blueBlurred,
                float **h_filter, int *filterWidth,
                const std::string &filename) {

  //make sure the context initializes ok
  checkCudaErrors(hipFree(0));

  cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR);
  if (image.empty()) {
    std::cerr << "Couldn't open file: " << filename << std::endl;
    exit(1);
  }

  cv::cvtColor(image, imageInputRGBA, CV_BGR2RGBA);

  //allocate memory for the output
  imageOutputRGBA.create(image.rows, image.cols, CV_8UC4);

  //This shouldn't ever happen given the way the images are created
  //at least based upon my limited understanding of OpenCV, but better to check
  if (!imageInputRGBA.isContinuous() || !imageOutputRGBA.isContinuous()) {
    std::cerr << "Images aren't continuous!! Exiting." << std::endl;
    exit(1);
  }

  *h_inputImageRGBA  = (uchar4 *)imageInputRGBA.ptr<unsigned char>(0);
  *h_outputImageRGBA = (uchar4 *)imageOutputRGBA.ptr<unsigned char>(0);

  const size_t numPixels = numRows() * numCols();
  //allocate memory on the device for both input and output
  checkCudaErrors(hipMalloc(d_inputImageRGBA, sizeof(uchar4) * numPixels));
  checkCudaErrors(hipMalloc(d_outputImageRGBA, sizeof(uchar4) * numPixels));
  checkCudaErrors(hipMemset(*d_outputImageRGBA, 0, numPixels * sizeof(uchar4))); //make sure no memory is left lying around

  //copy input array to the GPU
  checkCudaErrors(hipMemcpy(*d_inputImageRGBA, *h_inputImageRGBA, sizeof(uchar4) * numPixels, hipMemcpyHostToDevice));

  d_inputImageRGBA__  = *d_inputImageRGBA;
  d_outputImageRGBA__ = *d_outputImageRGBA;

  //now create the filter that they will use
  const int blurKernelWidth = 9;
  const float blurKernelSigma = 2.;

  *filterWidth = blurKernelWidth;

  //create and fill the filter we will convolve with
  *h_filter = new float[blurKernelWidth * blurKernelWidth];
  h_filter__ = *h_filter;

  float filterSum = 0.f; //for normalization

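  //The loop below fills an unnormalized 2D Gaussian, exp(-(r^2 + c^2) / (2*sigma^2)),
  //centered in the blurKernelWidth x blurKernelWidth window; the second loop then
  //rescales it so the weights sum to 1.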
  for (int r = -blurKernelWidth/2; r <= blurKernelWidth/2; ++r) {
    for (int c = -blurKernelWidth/2; c <= blurKernelWidth/2; ++c) {
      float filterValue = expf( -(float)(c * c + r * r) / (2.f * blurKernelSigma * blurKernelSigma));
      (*h_filter)[(r + blurKernelWidth/2) * blurKernelWidth + c + blurKernelWidth/2] = filterValue;
      filterSum += filterValue;
    }
  }

  float normalizationFactor = 1.f / filterSum;

  for (int r = -blurKernelWidth/2; r <= blurKernelWidth/2; ++r) {
    for (int c = -blurKernelWidth/2; c <= blurKernelWidth/2; ++c) {
      (*h_filter)[(r + blurKernelWidth/2) * blurKernelWidth + c + blurKernelWidth/2] *= normalizationFactor;
    }
  }

  //blurred
  checkCudaErrors(hipMalloc(d_redBlurred,    sizeof(unsigned char) * numPixels));
  checkCudaErrors(hipMalloc(d_greenBlurred,  sizeof(unsigned char) * numPixels));
  checkCudaErrors(hipMalloc(d_blueBlurred,   sizeof(unsigned char) * numPixels));
  checkCudaErrors(hipMemset(*d_redBlurred,   0, sizeof(unsigned char) * numPixels));
  checkCudaErrors(hipMemset(*d_greenBlurred, 0, sizeof(unsigned char) * numPixels));
  checkCudaErrors(hipMemset(*d_blueBlurred,  0, sizeof(unsigned char) * numPixels));
}
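
preProcess allocates with hipMalloc and new[], so a matching teardown is needed once the blur has run. A minimal cleanup sketch is shown below; the name cleanupBlurBuffers is illustrative (the course code does this in its own cleanup routine), but the free calls mirror the allocations above.

// Hypothetical cleanup mirroring the allocations made in preProcess.
void cleanupBlurBuffers(uchar4 *d_inputImageRGBA, uchar4 *d_outputImageRGBA,
                        unsigned char *d_redBlurred, unsigned char *d_greenBlurred,
                        unsigned char *d_blueBlurred, float *h_filter) {
  checkCudaErrors(hipFree(d_inputImageRGBA));
  checkCudaErrors(hipFree(d_outputImageRGBA));
  checkCudaErrors(hipFree(d_redBlurred));
  checkCudaErrors(hipFree(d_greenBlurred));
  checkCudaErrors(hipFree(d_blueBlurred));
  delete[] h_filter;   // allocated with new float[] in preProcess
}
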
Example #3
template <typename T>
void memcpytest2(size_t numElements, bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault)
{
    size_t sizeElements = numElements * sizeof(T);
    printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d\n", 
            __func__, 
            TYPENAME(T),
            sizeElements, sizeElements/1024.0/1024.0,
            usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault);


    T *A_d, *B_d, *C_d;
    T *A_h, *B_h, *C_h;


    HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, numElements, usePinnedHost);
    unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);

    T *A_hh = NULL;
    T *B_hh = NULL;
    T *C_dd = NULL;



    if (useHostToHost) {
        if (usePinnedHost) {
            HIPCHECK ( hipHostMalloc((void**)&A_hh, sizeElements, hipHostMallocDefault) );
            HIPCHECK ( hipHostMalloc((void**)&B_hh, sizeElements, hipHostMallocDefault) );
        } else {
            A_hh = (T*)malloc(sizeElements);
            B_hh = (T*)malloc(sizeElements);
        }


        // Do some extra host-to-host copies here to mix things up:
        HIPCHECK ( hipMemcpy(A_hh, A_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost));
        HIPCHECK ( hipMemcpy(B_hh, B_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost));


        HIPCHECK ( hipMemcpy(A_d, A_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
        HIPCHECK ( hipMemcpy(B_d, B_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
    } else {
        HIPCHECK ( hipMemcpy(A_d, A_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
        HIPCHECK ( hipMemcpy(B_d, B_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice));
    }

    hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements);

    if (useDeviceToDevice) {
        HIPCHECK ( hipMalloc(&C_dd, sizeElements) );

        // Do an extra device-to-device copy here to mix things up:
        HIPCHECK ( hipMemcpy(C_dd, C_d,  sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyDeviceToDevice));

        //Destroy the original C_d:
        HIPCHECK ( hipMemset(C_d, 0x5A, sizeElements));

        HIPCHECK ( hipMemcpy(C_h, C_dd, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost));
    } else {
        HIPCHECK ( hipMemcpy(C_h, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost));
    }

    HIPCHECK ( hipDeviceSynchronize() );
    HipTest::checkVectorADD(A_h, B_h, C_h, numElements);

    HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, usePinnedHost);

    printf ("  %s success\n", __func__);
}
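
A hedged usage sketch follows; the element count and flag combinations are illustrative (the real driver derives them from its own options), but they show the explicit template argument the templated function expects.

int main() {
    size_t N = 4 * 1024 * 1024;
    // Plain pageable host memory, straight H2D/D2H copies:
    memcpytest2<float>(N, false /*usePinnedHost*/, false /*useHostToHost*/, false /*useDeviceToDevice*/, false /*useMemkindDefault*/);
    // Pinned host memory with extra host-to-host and device-to-device hops:
    memcpytest2<double>(N, true, true, true, false);
    // Let the runtime infer the copy direction via hipMemcpyDefault:
    memcpytest2<int>(N, true, false, false, true);
    return 0;
}
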
Example #4
extern "C" void mixbenchGPU(double *c, long size){
	const char *benchtype = "compute with global memory (block strided)";
	printf("Trade-off type:       %s\n", benchtype);
	double *cd;

	CUDA_SAFE_CALL( hipMalloc((void**)&cd, size*sizeof(double)) );

	// Copy data to device memory
	CUDA_SAFE_CALL( hipMemset(cd, 0, size*sizeof(double)) );  // initialize to zeros

	// Synchronize in order to wait for memory operations to finish
	CUDA_SAFE_CALL( hipDeviceSynchronize() );

	printf("---------------------------------------------------------- CSV data ----------------------------------------------------------\n");
	printf("Experiment ID, Single Precision ops,,,,              Double precision ops,,,,              Integer operations,,, \n");
	printf("Compute iters, Flops/byte, ex.time,  GFLOPS, GB/sec, Flops/byte, ex.time,  GFLOPS, GB/sec, Iops/byte, ex.time,   GIOPS, GB/sec\n");

	runbench_warmup(cd, size);

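	// Sweep the compute-iterations template argument from 32 down to 0: each
	// instantiation changes the kernel's ratio of arithmetic to memory traffic
	// (the flops-per-byte column reported in the CSV output above).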
	runbench<32>(cd, size);
	runbench<31>(cd, size);
	runbench<30>(cd, size);
	runbench<29>(cd, size);
	runbench<28>(cd, size);
	runbench<27>(cd, size);
	runbench<26>(cd, size);
	runbench<25>(cd, size);
	runbench<24>(cd, size);
	runbench<23>(cd, size);
	runbench<22>(cd, size);
	runbench<21>(cd, size);
	runbench<20>(cd, size);
	runbench<19>(cd, size);
	runbench<18>(cd, size);
	runbench<17>(cd, size);
	runbench<16>(cd, size);
	runbench<15>(cd, size);
	runbench<14>(cd, size);
	runbench<13>(cd, size);
	runbench<12>(cd, size);
	runbench<11>(cd, size);
	runbench<10>(cd, size);
	runbench<9>(cd, size);
	runbench<8>(cd, size);
	runbench<7>(cd, size);
	runbench<6>(cd, size);
	runbench<5>(cd, size);
	runbench<4>(cd, size);
	runbench<3>(cd, size);
	runbench<2>(cd, size);
	runbench<1>(cd, size);
	runbench<0>(cd, size);

	printf("---------------------------------------------------------- CSV data ----------------------------------------------------------\n");

	// Copy results back to host memory
	CUDA_SAFE_CALL( hipMemcpy(c, cd, size*sizeof(double), hipMemcpyDeviceToHost) );

	CUDA_SAFE_CALL( hipFree(cd) );

	CUDA_SAFE_CALL( hipDeviceReset() );
}
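
A hedged call-site sketch: mixbenchGPU only needs a host buffer to receive the results, so a minimal main might look like the following (the element count is illustrative; the real benchmark sizes the buffer from its own configuration).

#include <vector>

extern "C" void mixbenchGPU(double *c, long size);

int main() {
    const long VECTOR_SIZE = 8 * 1024 * 1024;   // assumed element count
    std::vector<double> c(VECTOR_SIZE);         // host buffer for the results copied back
    mixbenchGPU(c.data(), VECTOR_SIZE);
    return 0;
}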