void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, SyncMode syncMode, bool expectMismatch) { // This test sends a long-running kernel to the null stream, then tests to see if the // specified synchronization technique is effective. // // Some syncMode are not expected to correctly sync (for example "syncNone"). in these // cases the test sets expectMismatch and the check logic below will attempt to ensure that // the undesired synchronization did not occur - ie ensure the kernel is still running and did // not yet update the stop event. This can be tricky since if the kernel runs fast enough it // may complete before the check. To prevent this, the addCountReverse has a count parameter // which causes it to loop repeatedly, and the results are checked in reverse order. // // Tests with expectMismatch=true should ensure the kernel finishes correctly. This results // are checked and we test to make sure stop event has completed. if (!(testMask & p_tests)) { return; } printf ("\ntest 0x%02x: syncMode=%s expectMismatch=%d\n", testMask, syncModeString(syncMode), expectMismatch); size_t sizeBytes = numElements * sizeof(int); int count =100; int init0 = 0; HIPCHECK(hipMemset(C_d, init0, sizeBytes)); for (int i=0; i<numElements; i++) { C_h[i] = -1; // initialize } hipStream_t otherStream = 0; unsigned flags = (syncMode == syncMarkerThenOtherNonBlockingStream) ? hipStreamNonBlocking : hipStreamDefault; HIPCHECK(hipStreamCreateWithFlags(&otherStream, flags)); hipEvent_t stop, otherStreamEvent; HIPCHECK(hipEventCreate(&stop)); HIPCHECK(hipEventCreate(&otherStreamEvent)); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); // Launch kernel into null stream, should result in C_h == count. hipLaunchKernelGGL( HipTest::addCountReverse, dim3(blocks), dim3(threadsPerBlock), 0, 0 /*stream*/, static_cast<const int*>(C_d), C_h, numElements, count); HIPCHECK(hipEventRecord(stop, 0/*default*/)); switch (syncMode) { case syncNone: break; case syncNullStream: HIPCHECK(hipStreamSynchronize(0)); // wait on host for null stream: break; case syncOtherStream: // Does this synchronize with the null stream? HIPCHECK(hipStreamSynchronize(otherStream)); break; case syncMarkerThenOtherStream: case syncMarkerThenOtherNonBlockingStream: // this may wait for NULL stream depending hipStreamNonBlocking flag above HIPCHECK(hipEventRecord(otherStreamEvent, otherStream)); HIPCHECK(hipStreamSynchronize(otherStream)); break; case syncDevice: HIPCHECK(hipDeviceSynchronize()); break; default: assert(0); }; hipError_t done = hipEventQuery(stop); if (expectMismatch) { assert (done == hipErrorNotReady); } else { assert (done == hipSuccess); } int mismatches = 0; int expected = init0 + count; for (int i=0; i<numElements; i++) { bool compareEqual = (C_h[i] == expected); if (!compareEqual) { mismatches ++; if (!expectMismatch) { printf ("C_h[%d] (%d) != %d\n", i, C_h[i], expected); assert(C_h[i] == expected); } } } if (expectMismatch) { assert (mismatches > 0); } HIPCHECK(hipStreamDestroy(otherStream)); HIPCHECK(hipEventDestroy(stop)); HIPCHECK(hipEventDestroy(otherStreamEvent)); HIPCHECK(hipDeviceSynchronize()); printf ("test: OK - %d mismatches (%6.2f%%)\n", mismatches, ((double)(mismatches)*100.0)/numElements); }
//return types are void since any internal error will be handled by quitting //no point in returning error codes... //returns a pointer to an RGBA version of the input image //and a pointer to the single channel grey-scale output //on both the host and device void preProcess(uchar4 **h_inputImageRGBA, uchar4 **h_outputImageRGBA, uchar4 **d_inputImageRGBA, uchar4 **d_outputImageRGBA, unsigned char **d_redBlurred, unsigned char **d_greenBlurred, unsigned char **d_blueBlurred, float **h_filter, int *filterWidth, const std::string &filename) { //make sure the context initializes ok checkCudaErrors(hipFree(0)); cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR); if (image.empty()) { std::cerr << "Couldn't open file: " << filename << std::endl; exit(1); } cv::cvtColor(image, imageInputRGBA, CV_BGR2RGBA); //allocate memory for the output imageOutputRGBA.create(image.rows, image.cols, CV_8UC4); //This shouldn't ever happen given the way the images are created //at least based upon my limited understanding of OpenCV, but better to check if (!imageInputRGBA.isContinuous() || !imageOutputRGBA.isContinuous()) { std::cerr << "Images aren't continuous!! Exiting." << std::endl; exit(1); } *h_inputImageRGBA = (uchar4 *)imageInputRGBA.ptr<unsigned char>(0); *h_outputImageRGBA = (uchar4 *)imageOutputRGBA.ptr<unsigned char>(0); const size_t numPixels = numRows() * numCols(); //allocate memory on the device for both input and output checkCudaErrors(hipMalloc(d_inputImageRGBA, sizeof(uchar4) * numPixels)); checkCudaErrors(hipMalloc(d_outputImageRGBA, sizeof(uchar4) * numPixels)); checkCudaErrors(hipMemset(*d_outputImageRGBA, 0, numPixels * sizeof(uchar4))); //make sure no memory is left laying around //copy input array to the GPU checkCudaErrors(hipMemcpy(*d_inputImageRGBA, *h_inputImageRGBA, sizeof(uchar4) * numPixels, hipMemcpyHostToDevice)); d_inputImageRGBA__ = *d_inputImageRGBA; d_outputImageRGBA__ = *d_outputImageRGBA; //now create the filter that they will use const int blurKernelWidth = 9; const float blurKernelSigma = 2.; *filterWidth = blurKernelWidth; //create and fill the filter we will convolve with *h_filter = new float[blurKernelWidth * blurKernelWidth]; h_filter__ = *h_filter; float filterSum = 0.f; //for normalization for (int r = -blurKernelWidth/2; r <= blurKernelWidth/2; ++r) { for (int c = -blurKernelWidth/2; c <= blurKernelWidth/2; ++c) { float filterValue = expf( -(float)(c * c + r * r) / (2.f * blurKernelSigma * blurKernelSigma)); (*h_filter)[(r + blurKernelWidth/2) * blurKernelWidth + c + blurKernelWidth/2] = filterValue; filterSum += filterValue; } } float normalizationFactor = 1.f / filterSum; for (int r = -blurKernelWidth/2; r <= blurKernelWidth/2; ++r) { for (int c = -blurKernelWidth/2; c <= blurKernelWidth/2; ++c) { (*h_filter)[(r + blurKernelWidth/2) * blurKernelWidth + c + blurKernelWidth/2] *= normalizationFactor; } } //blurred checkCudaErrors(hipMalloc(d_redBlurred, sizeof(unsigned char) * numPixels)); checkCudaErrors(hipMalloc(d_greenBlurred, sizeof(unsigned char) * numPixels)); checkCudaErrors(hipMalloc(d_blueBlurred, sizeof(unsigned char) * numPixels)); checkCudaErrors(hipMemset(*d_redBlurred, 0, sizeof(unsigned char) * numPixels)); checkCudaErrors(hipMemset(*d_greenBlurred, 0, sizeof(unsigned char) * numPixels)); checkCudaErrors(hipMemset(*d_blueBlurred, 0, sizeof(unsigned char) * numPixels)); }
void memcpytest2(size_t numElements, bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault) { size_t sizeElements = numElements * sizeof(T); printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d\n", __func__, TYPENAME(T), sizeElements, sizeElements/1024.0/1024.0, usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); T *A_d, *B_d, *C_d; T *A_h, *B_h, *C_h; HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, numElements, usePinnedHost); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); T *A_hh = NULL; T *B_hh = NULL; T *C_dd = NULL; if (useHostToHost) { if (usePinnedHost) { HIPCHECK ( hipHostMalloc((void**)&A_hh, sizeElements, hipHostMallocDefault) ); HIPCHECK ( hipHostMalloc((void**)&B_hh, sizeElements, hipHostMallocDefault) ); } else { A_hh = (T*)malloc(sizeElements); B_hh = (T*)malloc(sizeElements); } // Do some extra host-to-host copies here to mix things up: HIPCHECK ( hipMemcpy(A_hh, A_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); HIPCHECK ( hipMemcpy(B_hh, B_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); HIPCHECK ( hipMemcpy(A_d, A_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); HIPCHECK ( hipMemcpy(B_d, B_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); } else { HIPCHECK ( hipMemcpy(A_d, A_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); HIPCHECK ( hipMemcpy(B_d, B_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); } hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements); if (useDeviceToDevice) { HIPCHECK ( hipMalloc(&C_dd, sizeElements) ); // Do an extra device-to-device copies here to mix things up: HIPCHECK ( hipMemcpy(C_dd, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyDeviceToDevice)); //Destroy the original C_d: HIPCHECK ( hipMemset(C_d, 0x5A, sizeElements)); HIPCHECK ( hipMemcpy(C_h, C_dd, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); } else { HIPCHECK ( hipMemcpy(C_h, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); } HIPCHECK ( hipDeviceSynchronize() ); HipTest::checkVectorADD(A_h, B_h, C_h, numElements); HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, usePinnedHost); printf (" %s success\n", __func__); }
extern "C" void mixbenchGPU(double *c, long size){ const char *benchtype = "compute with global memory (block strided)"; printf("Trade-off type: %s\n", benchtype); double *cd; CUDA_SAFE_CALL( hipMalloc((void**)&cd, size*sizeof(double)) ); // Copy data to device memory CUDA_SAFE_CALL( hipMemset(cd, 0, size*sizeof(double)) ); // initialize to zeros // Synchronize in order to wait for memory operations to finish CUDA_SAFE_CALL( hipDeviceSynchronize() ); printf("---------------------------------------------------------- CSV data ----------------------------------------------------------\n"); printf("Experiment ID, Single Precision ops,,,, Double precision ops,,,, Integer operations,,, \n"); printf("Compute iters, Flops/byte, ex.time, GFLOPS, GB/sec, Flops/byte, ex.time, GFLOPS, GB/sec, Iops/byte, ex.time, GIOPS, GB/sec\n"); runbench_warmup(cd, size); runbench<32>(cd, size); runbench<31>(cd, size); runbench<30>(cd, size); runbench<29>(cd, size); runbench<28>(cd, size); runbench<27>(cd, size); runbench<26>(cd, size); runbench<25>(cd, size); runbench<24>(cd, size); runbench<23>(cd, size); runbench<22>(cd, size); runbench<21>(cd, size); runbench<20>(cd, size); runbench<19>(cd, size); runbench<18>(cd, size); runbench<17>(cd, size); runbench<16>(cd, size); runbench<15>(cd, size); runbench<14>(cd, size); runbench<13>(cd, size); runbench<12>(cd, size); runbench<11>(cd, size); runbench<10>(cd, size); runbench<9>(cd, size); runbench<8>(cd, size); runbench<7>(cd, size); runbench<6>(cd, size); runbench<5>(cd, size); runbench<4>(cd, size); runbench<3>(cd, size); runbench<2>(cd, size); runbench<1>(cd, size); runbench<0>(cd, size); printf("---------------------------------------------------------- CSV data ----------------------------------------------------------\n"); // Copy results back to host memory CUDA_SAFE_CALL( hipMemcpy(c, cd, size*sizeof(double), hipMemcpyDeviceToHost) ); CUDA_SAFE_CALL( hipFree(cd) ); CUDA_SAFE_CALL( hipDeviceReset() ); }