CUDARunner::~CUDARunner()
{
	DeallocateResources();

	cuModuleUnload(m_module);
	cuCtxDestroy(m_context);
}
Beispiel #2
0
WEAK void halide_release() {
    // CUcontext ignore;
    // TODO: this is for timing; bad for release-mode performance
    CHECK_CALL( cuCtxSynchronize(), "cuCtxSynchronize on exit" );

    // Only destroy the context if we own it
    if (weak_cuda_ctx) {
        CHECK_CALL( cuCtxDestroy(weak_cuda_ctx), "cuCtxDestroy on exit" );    
        weak_cuda_ctx = 0;
    }

    // Destroy the events
    if (__start) {
        cuEventDestroy(__start);
        cuEventDestroy(__end);
        __start = __end = 0;
    }

    // Unload the module
    if (__mod) {
        CHECK_CALL( cuModuleUnload(__mod), "cuModuleUnload" );
        __mod = 0;
    }

    //CHECK_CALL( cuCtxPopCurrent(&ignore), "cuCtxPopCurrent" );
}
Beispiel #3
0
bool
GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
{
  struct ptx_image_data *image, **prev_p;
  struct ptx_device *dev = ptx_devices[ord];

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    {
      GOMP_PLUGIN_error ("Offload data incompatible with PTX plugin"
			 " (expected %u, received %u)",
			 GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
      return false;
    }

  bool ret = true;
  pthread_mutex_lock (&dev->image_lock);
  for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
    if (image->target_data == target_data)
      {
	*prev_p = image->next;
	if (cuModuleUnload (image->module) != CUDA_SUCCESS)
	  ret = false;
	free (image->fns);
	free (image);
	break;
      }
  pthread_mutex_unlock (&dev->image_lock);
  return ret;
}
Beispiel #4
0
void clean_cuda(void)
{
    CUresult res;

    for (int i = 0; i < device_num; i++)
    {
        res = cuModuleUnload(module[i]);
        CUDA_CHECK(res, "cuModuleUnload()");
    }

    for (int i = 0; i < device_num; i++)
    {
        res = cuCtxDestroy(ctx[i]);
        CUDA_CHECK(res, "cuCtxDestroy()");
    }

    free(NR_MAXTHREADS_X);
    free(NR_MAXTHREADS_Y);
    free(ConvolutionKernel_func);
    free(DistanceTransformTwoDimensionalProblem_func);
    free(BilinearKernelTex32F_func);
    free(calculateHistogram_func);
    free(getFeatureMaps_func);
    free(calculateNorm_func);
    free(normalizeAndTruncate_func);
    free(PCAFeatureMapsAddNullableBorder_func);
    free(module);
    free(dev);
    free(ctx);
}
Beispiel #5
0
int main(){
	init_test();
	const std::string source = 
	".version 4.2\n"
	".target sm_20\n"
	".address_size 64\n"
	".visible .entry kernel(.param .u64 kernel_param_0) {\n"
	".reg .s32 	%r<2>;\n"
	".reg .s64 	%rd<3>;\n"
	"bra 	BB1_2;\n"
	"ld.param.u64 	%rd1, [kernel_param_0];\n"
	"cvta.to.global.u64 	%rd2, %rd1;\n"
	"mov.u32 	%r1, 5;\n"
	"st.global.u32 	[%rd2], %r1;\n"
	"BB1_2: ret;\n"
	"}\n";
	CUmodule modId = 0;
	CUfunction funcHandle = 0;
	cu_assert(cuModuleLoadData(&modId, source.c_str()));
	cu_assert(cuModuleGetFunction(&funcHandle, modId, "kernel"));
	CUdeviceptr devValue;
	int hostValue = 10;
	cu_assert(cuMemAlloc(&devValue, sizeof(int)));
	cu_assert(cuMemcpyHtoD(devValue, &hostValue, sizeof(hostValue)));
	void * params[] = {&devValue};
	cu_assert(cuLaunchKernel(funcHandle, 1,1,1, 1,1,1, 0,0, params, nullptr));
	cu_assert(cuMemcpyDtoH(&hostValue, devValue, sizeof(hostValue)));
	assert(hostValue == 10);
	std::cout << hostValue << "\n";
	cu_assert(cuMemFree(devValue));
	cu_assert(cuModuleUnload(modId));
	return 0;
}
Beispiel #6
0
WEAK void halide_release() {
    // It's possible that this is being called from the destructor of
    // a static variable, in which case the driver may already be
    // shutting down. For this reason we allow the deinitialized
    // error.
    CHECK_CALL_DEINIT_OK( cuCtxSynchronize(), "cuCtxSynchronize on exit" );

    // Only destroy the context if we own it
    if (weak_cuda_ctx) {
        CHECK_CALL_DEINIT_OK( cuCtxDestroy(weak_cuda_ctx), "cuCtxDestroy on exit" );
        weak_cuda_ctx = 0;
    }

    // Destroy the events
    if (__start) {
        cuEventDestroy(__start);
        cuEventDestroy(__end);
        __start = __end = 0;
    }

    // Unload the module
    if (__mod) {
        CHECK_CALL_DEINIT_OK( cuModuleUnload(__mod), "cuModuleUnload" );
        __mod = 0;
    }

    //CHECK_CALL( cuCtxPopCurrent(&ignore), "cuCtxPopCurrent" );
}
Beispiel #7
0
void KernelRT::Release() {
	Reset();
	if (hModule) {
		cuModuleUnload(hModule);
		hModule = NULL;
	}
}
Beispiel #8
0
WEAK void halide_release(void *user_context) {
    // Do not do any of this if there is not context set. E.g.
    // if halide_release is called and no CUDA calls have been made.
    if (cuda_ctx_ptr != NULL) {
        // It's possible that this is being called from the destructor of
        // a static variable, in which case the driver may already be
        // shutting down. For this reason we allow the deinitialized
        // error.
        CHECK_CALL_DEINIT_OK( cuCtxSynchronize(), "cuCtxSynchronize on exit" );

        // Destroy the events
        if (__start) {
            cuEventDestroy(__start);
            cuEventDestroy(__end);
            __start = __end = 0;
        }

        // Unload the module
        if (__mod) {
            CHECK_CALL_DEINIT_OK( cuModuleUnload(__mod), "cuModuleUnload" );
            __mod = 0;
        }

        // Only destroy the context if we own it
        if (weak_cuda_ctx) {
            CHECK_CALL_DEINIT_OK( cuCtxDestroy(weak_cuda_ctx), "cuCtxDestroy on exit" );
            weak_cuda_ctx = 0;
        }

        cuda_ctx_ptr = NULL;
    }

    //CHECK_CALL( cuCtxPopCurrent(&ignore), "cuCtxPopCurrent" );
}
/**
 * This measures the overhead in launching a kernel function on each GPU in the
 * system.
 *
 * It does this by executing a small kernel (copying 1 value in global memory) a
 * very large number of times and taking the average execution time.  This
 * program uses the CUDA driver API.
 */
int main() {
  CU_ERROR_CHECK(cuInit(0));

  int count;
  CU_ERROR_CHECK(cuDeviceGetCount(&count));

  float x = 5.0f;
  for (int d = 0; d < count; d++) {
    CUdevice device;
    CU_ERROR_CHECK(cuDeviceGet(&device, d));

    CUcontext context;
    CU_ERROR_CHECK(cuCtxCreate(&context, 0, device));

    CUdeviceptr in, out;
    CU_ERROR_CHECK(cuMemAlloc(&in, sizeof(float)));
    CU_ERROR_CHECK(cuMemAlloc(&out, sizeof(float)));
    CU_ERROR_CHECK(cuMemcpyHtoD(in, &x, sizeof(float)));

    CUmodule module;
    CU_ERROR_CHECK(cuModuleLoadData(&module, imageBytes));

    CUfunction function;
    CU_ERROR_CHECK(cuModuleGetFunction(&function, module, "kernel"));

    void * params[] = { &in, &out };

    CUevent start, stop;
    CU_ERROR_CHECK(cuEventCreate(&start, 0));
    CU_ERROR_CHECK(cuEventCreate(&stop, 0));

    CU_ERROR_CHECK(cuEventRecord(start, 0));
    for (int i = 0; i < ITERATIONS; i++)
      CU_ERROR_CHECK(cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL));

    CU_ERROR_CHECK(cuEventRecord(stop, 0));
    CU_ERROR_CHECK(cuEventSynchronize(stop));

    float time;
    CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop));

    CU_ERROR_CHECK(cuEventDestroy(start));
    CU_ERROR_CHECK(cuEventDestroy(stop));

    CU_ERROR_CHECK(cuMemFree(in));
    CU_ERROR_CHECK(cuMemFree(out));

    fprintf(stdout, "Device %d: %fms\n", d, (time / (double)ITERATIONS));

    CU_ERROR_CHECK(cuModuleUnload(module));

    CU_ERROR_CHECK(cuCtxDestroy(context));
  }

  return 0;
}
Beispiel #10
0
GpuCompilationContext::~GpuCompilationContext() {
#ifdef HAVE_CUDA
  static_cast<const CudaMgr_Namespace::CudaMgr*>(cuda_mgr_)->setContext(device_id_);
  auto status = cuModuleUnload(module_);
  // TODO(alex): handle this race better
  if (status == CUDA_ERROR_DEINITIALIZED) {
    return;
  }
  checkCudaErrors(status);
#endif
}
Beispiel #11
0
SEXP R_auto_cuModuleUnload(SEXP r_hmod)
{
    SEXP r_ans = R_NilValue;
    CUmodule hmod = (CUmodule) getRReference(r_hmod);
    
    CUresult ans;
    ans = cuModuleUnload(hmod);
    
    r_ans = Renum_convert_CUresult(ans) ;
    
    return(r_ans);
}
Beispiel #12
0
//----------------------------------------------------------------------------//
bool CUDAImpl::_UnloadModule(std::string * err)
{
    CUresult c_err;
    if (_cudaBuild) {
        _cudaKernels.clear();
        _cudaBuild = false;
        c_err = cuModuleUnload(_cudaModule);
        if (_cudaErrorLoadModule(c_err, err)) {
            return false;
        }
    }
    return true;
}
Beispiel #13
0
int
main()
{
  CUresult result;
  result = cuInit(0);
  CUdevice device;
  result = cuDeviceGet(&device, 0);
  CUcontext ctx;
  result = cuCtxCreate(&ctx, 0, device);
  CUmodule module;
  result = cuModuleLoad(&module, "cuda-shift-throughput.cubin");
  CUfunction kernel;
  result = cuModuleGetFunction(&kernel, module, "kernel");
  int block;
  result = cuFuncGetAttribute(&block,
                              CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                              kernel);
  int grid = 1024 * 1024;
  CUevent event[2];
  for (ptrdiff_t i = 0; i < 2; ++i) {
    result = cuEventCreate(&event[i], 0);
  }
  result = cuEventRecord(event[0], 0);
  result = cuLaunchKernel(kernel, grid, 1, 1, block, 1, 1, 0, 0, 0, 0);
  result = cuEventRecord(event[1], 0);
  result = cuEventSynchronize(event[1]);
  float time;
  result = cuEventElapsedTime(&time, event[0], event[1]);
  int gpuclock;
  result =
    cuDeviceGetAttribute(&gpuclock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device);
  int gpump;
  result =
    cuDeviceGetAttribute(&gpump, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
                         device);
  std::printf("Clock: %d KHz, # of MPs: %d\n", gpuclock, gpump);
  std::printf("Elapsed Time: %f milliseconds\n", time);
  std::printf("# of Threads: %d, # of SHLs : %lld\n", block,
              1024ll * block * grid);
  std::printf("Throughput: %f\n",
              1024.0 * block * grid / ((double) gpump * gpuclock * time));
  for (ptrdiff_t i = 0; i < 2; ++i) {
    result = cuEventDestroy(event[i]);
  }
  result = cuModuleUnload(module);
  result = cuCtxDestroy(ctx);
  return 0;
}
Beispiel #14
0
/*
 * module unload and destroy context
 */
void
clean_cuda(void){

    res = cuModuleUnload(module);
    if(res != CUDA_SUCCESS){
      printf("cuModuleUnload failed: res = %s\n", conv(res));
      exit(1);
    }
    
    res = cuCtxDestroy(ctx);
    if(res != CUDA_SUCCESS){
      printf("cuCtxDestroy failed: res = %s\n", conv(res));
      exit(1);
    }

}
int gib_destroy ( gib_context c ) {
  /* TODO:  Make sure everything created in gib_init is destroyed here. */
  ERROR_CHECK_FAIL(cuCtxPushCurrent(((gpu_context)(c->acc_context))->pCtx));
  int rc_i = gib_cpu_destroy(c);
  if (rc_i != GIB_SUC) {
    printf("gib_cpu_destroy returned %i\n", rc_i);
    exit(EXIT_FAILURE);
  }
  gpu_context gpu_c = (gpu_context) c->acc_context;
#if !GIB_USE_MMAP
  ERROR_CHECK_FAIL(cuMemFree(gpu_c->buffers));
#endif
  ERROR_CHECK_FAIL(cuModuleUnload(gpu_c->module));
  ERROR_CHECK_FAIL(cuCtxDestroy(gpu_c->pCtx));
  return GIB_SUC;
}
Beispiel #16
0
static void _cuda_freekernel(gpukernel *k) {
  k->refcnt--;
  if (k->refcnt == 0) {
    if (k->ctx != NULL) {
      cuda_enter(k->ctx);
      cuModuleUnload(k->m);
      cuda_exit(k->ctx);
      cuda_free_ctx(k->ctx);
    }
    CLEAR(k);
    free(k->args);
    free(k->bin);
    free(k->types);
    free(k);
  }
}
Beispiel #17
0
int main(){
	init_test();
	const std::string test_source =
	".version 4.2\n"
	".target sm_20\n"
	".address_size 64\n"
	".visible .entry _Z6kernelPfi(\n"
	".param .u64 _Z6kernelPfi_param_0,\n"
	".param .u32 _Z6kernelPfi_param_1){\n"
	".reg .pred 	%p<2>;\n"
	".reg .f32 	%f<3>;\n"
	".reg .s32 	%r<3>;\n"
	".reg .s64 	%rd<5>;\n"
	"ld.param.u64 	%rd1, [_Z6kernelPfi_param_0];\n"
	"ld.param.u32 	%r2, [_Z6kernelPfi_param_1];\n"
	"mov.u32 	%r1, %tid.x;\n"
	"setp.ge.u32	%p1, %r1, %r2;\n"
	"@%p1 bra 	BB0_2;\n"
	"cvta.to.global.u64 	%rd2, %rd1;\n"
	"cvt.rn.f32.u32	%f1, %r1;\n"
	"mul.f32 	%f2, %f1, 0f3FC00000;\n"
	"mul.wide.u32 	%rd3, %r1, 4;\n"
	"add.s64 	%rd4, %rd2, %rd3;\n"
	"st.global.f32 	[%rd4], %f2;\n"
	"BB0_2:\n"
	"ret;\n"
	"}";
	CUmodule modId = 0;
	CUfunction funcHandle = 0;
	cu_assert(cuModuleLoadData(&modId, test_source.c_str()));
	cu_assert(cuModuleGetFunction(&funcHandle, modId, "_Z6kernelPfi"));
	CUdeviceptr devArray;
	int size = 10;
	float hostArray[size];
	memset(hostArray, 0, size * sizeof(hostArray[0]));
	cu_assert(cuMemAlloc(&devArray, sizeof(float) * size));
	void * params[] = {&devArray, &size};
	auto result = cuLaunchKernel(funcHandle, 1,1,1, size*2,1,1, 0,0, params, nullptr);
	cu_assert(result);
	cu_assert(cuMemcpyDtoH(&hostArray, devArray, sizeof(hostArray[0])*size));
	cu_assert(cuMemFree(devArray));
	cu_assert(cuModuleUnload(modId));
	for (int i=0 ; i<size ; ++i)
		std::cout << hostArray[i] << '\n';
	return 0;
}
Beispiel #18
0
void setupModuleResource(const char* kernelFileName)
{
    CUmodule newModule = createModuleFromFile(kernelFileName);
    if (newModule != NULL)
    {
        if (module != NULL) cuModuleUnload(module);
        module = newModule;
    }
    checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "mainImage"));

    // TODO: take care of bytes
    size_t bytes;
    checkCudaErrors(cuModuleGetGlobal(&d_iResolution, &bytes, module, "iResolution"));
    checkCudaErrors(cuModuleGetGlobal(&d_iGlobalTime, &bytes, module, "iGlobalTime"));
    checkCudaErrors(cuModuleGetGlobal(&d_iMouse, &bytes, module, "iMouse"));
    checkCudaErrors(cuModuleGetGlobal(&d_fragColor, &d_fragColor_bytes, module, "fragColor"));
}
Beispiel #19
0
CUresult cuda_driver_api_exit(CUcontext ctx, CUmodule mod)
{
	CUresult res;

	res = cuModuleUnload(mod);
	if (res != CUDA_SUCCESS) {
		printf("cuModuleUnload failed: res = %lu\n", (unsigned long)res);
		return res;
	}

	res = cuCtxDestroy(ctx);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxDestroy failed: res = %lu\n", (unsigned long)res);
		return res;
	}

	return CUDA_SUCCESS;
}
Beispiel #20
0
int mmult_gpu_close(struct device_info *device_info)
{
	CUresult res;

        res = cuModuleUnload(device_info->module);
        if (res != CUDA_SUCCESS) {
                printf("cuModuleUnload failed: res = %lu\n", (unsigned long)res);
                return -1;
        }

        res = cuCtxDestroy(device_info->context);
        if (res != CUDA_SUCCESS) {
                printf("cuCtxDestroy failed: res = %lu\n", (unsigned long)res);
                return -1;
        }

	return 0;
}
Beispiel #21
0
WEAK void halide_release(void *user_context) {
    DEBUG_PRINTF( user_context, "CUDA: halide_release (user_context: %p)\n", user_context );

    int err;
    CUcontext ctx;
    err = halide_acquire_cuda_context(user_context, &ctx);
    if (err != CUDA_SUCCESS || !ctx) {
        return;
    }

    // It's possible that this is being called from the destructor of
    // a static variable, in which case the driver may already be
    // shutting down.
    err = cuCtxSynchronize();
    halide_assert(user_context, err == CUDA_SUCCESS || err == CUDA_ERROR_DEINITIALIZED);

    // Unload the modules attached to this context. Note that the list
    // nodes themselves are not freed, only the module objects are
    // released. Subsequent calls to halide_init_kernels might re-create
    // the program object using the same list node to store the module
    // object.
    module_state *state = state_list;
    while (state) {
        if (state->module) {
            DEBUG_PRINTF(user_context, "    cuModuleUnload %p\n", state->module);
            err = cuModuleUnload(state->module);
            halide_assert(user_context, err == CUDA_SUCCESS || err == CUDA_ERROR_DEINITIALIZED);
            state->module = 0;
        }
        state = state->next;
    }

    // Only destroy the context if we own it
    if (ctx == weak_cuda_ctx) {
        DEBUG_PRINTF(user_context, "    cuCtxDestroy %p\n", weak_cuda_ctx);
        err = cuCtxDestroy(weak_cuda_ctx);
        halide_assert(user_context, err == CUDA_SUCCESS || err == CUDA_ERROR_DEINITIALIZED);
        weak_cuda_ctx = NULL;
    }

    halide_release_cuda_context(user_context);
}
Beispiel #22
0
void
GOMP_OFFLOAD_unload_image (int ord, unsigned version, const void *target_data)
{
  struct ptx_image_data *image, **prev_p;
  struct ptx_device *dev = ptx_devices[ord];

  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    return;
  
  pthread_mutex_lock (&dev->image_lock);
  for (prev_p = &dev->images; (image = *prev_p) != 0; prev_p = &image->next)
    if (image->target_data == target_data)
      {
	*prev_p = image->next;
	cuModuleUnload (image->module);
	free (image->fns);
	free (image);
	break;
      }
  pthread_mutex_unlock (&dev->image_lock);
}
Beispiel #23
0
int main(){
	init_test();
	const std::string source = 
	".version 4.2\n"
	".target sm_20\n"
	".address_size 64\n"
	".visible .entry kernel_4(\n"
	".param .u32 kernel_4_param_0,\n"
	".param .u64 kernel_4_param_1\n"
	")\n"
	"{\n"
	".reg .s32 	%r<3>;\n"
	".reg .s64 	%rd<3>;\n"
	"ld.param.u32 	%r1, [kernel_4_param_0];\n"
	"ld.param.u64 	%rd1, [kernel_4_param_1];\n"
	"cvta.to.global.u64 	%rd2, %rd1;\n"
	"add.s32 	%r2, %r1, 7;\n"
	"st.global.u32 	[%rd2], %r2;\n"
	"ret;\n"
	"}";
	CUmodule modId = 0;
	CUfunction funcHandle = 0;
	cu_assert(cuModuleLoadData(&modId, source.c_str()));
	cu_assert(cuModuleGetFunction(&funcHandle, modId, "kernel_4"));
	CUdeviceptr devValue;
	int hostValue = 10;
	cu_assert(cuMemAlloc(&devValue, sizeof(int)));
	void * params[] = {&hostValue, &devValue};
	cu_assert(cuLaunchKernel(funcHandle, 1,1,1, 1,1,1, 0,0, params, nullptr));
	int result = 0;
	cu_assert(cuMemcpyDtoH(&result, devValue, sizeof(result)));
	assert(result == hostValue + 7);
	std::cout << result << "\n";
	cu_assert(cuMemFree(devValue));
	cu_assert(cuModuleUnload(modId));
	return 0;
}
Beispiel #24
0
/// main - Program entry point
int main(int argc, char** argv) {
  if (argc != 3) {
    printf("Usage: %s dataCount blockSize\n", argv[0]);
    exit(1);
  }

  CUdevice device;
  CUmodule cudaModule;
  CUcontext context;
  CUfunction function;
  CUlinkState linker;
  int devCount;

  // CUDA initialization
  checkCudaErrors(cuInit(0));
  checkCudaErrors(cuDeviceGetCount(&devCount));
  checkCudaErrors(cuDeviceGet(&device, 0));

  char name[128];
  checkCudaErrors(cuDeviceGetName(name, 128, device));
  std::cout << "Using CUDA Device [0]: " << name << "\n";

  int devMajor, devMinor;
  checkCudaErrors(cuDeviceComputeCapability(&devMajor, &devMinor, device));
  std::cout << "Device Compute Capability: " << devMajor << "." << devMinor
            << "\n";
  if (devMajor < 2) {
    std::cerr << "ERROR: Device 0 is not SM 2.0 or greater\n";
    return 1;
  }

  std::ifstream t("kernel.ptx");
  if (!t.is_open()) {
    std::cerr << "kernel.ptx not found\n";
    return 1;
  }
  std::string str((std::istreambuf_iterator<char>(t)),
                  std::istreambuf_iterator<char>());

  // Create driver context
  checkCudaErrors(cuCtxCreate(&context, 0, device));

  // Create module for object
  checkCudaErrors(cuModuleLoadDataEx(&cudaModule, str.c_str(), 0, 0, 0));

  // Get kernel function
  checkCudaErrors(cuModuleGetFunction(&function, cudaModule, "kernel"));

  // Device data
  CUdeviceptr devBufferA;
  CUdeviceptr devBufferB;
  CUdeviceptr devBufferC;
  CUdeviceptr devBufferSMid;

  // Size
  unsigned dataCount = atoi(argv[1]);

  checkCudaErrors(cuMemAlloc(&devBufferA, sizeof(float) * dataCount));
  checkCudaErrors(cuMemAlloc(&devBufferB, sizeof(float) * dataCount));
  checkCudaErrors(cuMemAlloc(&devBufferC, sizeof(float) * dataCount));
  checkCudaErrors(cuMemAlloc(&devBufferSMid, sizeof(int) * dataCount));

  float* hostA = new float[dataCount];
  float* hostB = new float[dataCount];
  float* hostC = new float[dataCount];
  int* hostSMid = new int[dataCount];

  // Populate input
  for (unsigned i = 0; i != dataCount; ++i) {
    hostA[i] = (float)i;
    hostB[i] = (float)(2 * i);
    hostC[i] = 2.0f;
    hostSMid[i] = 0;
  }

  checkCudaErrors(
      cuMemcpyHtoD(devBufferA, &hostA[0], sizeof(float) * dataCount));
  checkCudaErrors(
      cuMemcpyHtoD(devBufferB, &hostB[0], sizeof(float) * dataCount));

  unsigned blockSizeX = atoi(argv[2]);
  unsigned blockSizeY = 1;
  unsigned blockSizeZ = 1;
  unsigned gridSizeX = (dataCount + blockSizeX - 1) / blockSizeX;
  unsigned gridSizeY = 1;
  unsigned gridSizeZ = 1;

  // Kernel parameters
  void* KernelParams[] = {&devBufferA, &devBufferB, &devBufferC,
                          &devBufferSMid};

  std::cout << "Launching kernel\n";

  // Kernel launch
  checkCudaErrors(cuLaunchKernel(function, gridSizeX, gridSizeY, gridSizeZ,
                                 blockSizeX, blockSizeY, blockSizeZ, 0, NULL,
                                 KernelParams, NULL));

  // Retrieve device data
  checkCudaErrors(
      cuMemcpyDtoH(&hostC[0], devBufferC, sizeof(float) * dataCount));
  checkCudaErrors(
      cuMemcpyDtoH(&hostSMid[0], devBufferSMid, sizeof(int) * dataCount));

  std::cout << "Results:\n";
  std::cout << "SM " << hostSMid[0] << ":" << hostA[0] << " + " << hostB[0]
            << " = " << hostC[0] << "\n";
  for (unsigned i = 1; i != dataCount; i++) {
    if (hostSMid[i] != hostSMid[i - 1])
      std::cout << "SM " << hostSMid[i] << ":" << hostA[i] << " + " << hostB[i]
                << " = " << hostC[i] << "\n";
  }

  // Clean up after ourselves
  delete[] hostA;
  delete[] hostB;
  delete[] hostC;
  delete[] hostSMid;

  // Clean-up
  checkCudaErrors(cuMemFree(devBufferA));
  checkCudaErrors(cuMemFree(devBufferB));
  checkCudaErrors(cuMemFree(devBufferC));
  checkCudaErrors(cuMemFree(devBufferSMid));
  checkCudaErrors(cuModuleUnload(cudaModule));
  checkCudaErrors(cuCtxDestroy(context));

  return 0;
}
//------------------------------------------------------------------------------
void mat_mul_test(const std::vector< std::string >& file_paths,
                  const char* kernel_name,
                  int size, 
                  int grid_dim_x,  int grid_dim_y,  int grid_dim_z,
                  int block_dim_x, int block_dim_y, int block_dim_z) {
    const int MATRIX_WIDTH = size; 
    const int MATRIX_HEIGHT = MATRIX_WIDTH; 
    const int VECTOR_SIZE = MATRIX_WIDTH;
    const int MATRIX_SIZE = MATRIX_WIDTH * MATRIX_HEIGHT;
    const int MATRIX_BYTE_SIZE = sizeof(real_t) * MATRIX_SIZE;
    const int VECTOR_BYTE_SIZE = sizeof(real_t) * VECTOR_SIZE;
   
    CCHECK(cuInit(0));
    array_t in_matrix_h(MATRIX_SIZE,  real_t(1));
    array_t in_vector_h(VECTOR_SIZE,  real_t(1));
    array_t out_vector_h(VECTOR_SIZE, real_t(0));
    CUdeviceptr in_matrix_d = 0;
    CUdeviceptr in_vector_d = 0;
    CUdeviceptr out_vector_d = 0;
    CUdevice device = CUdevice();
    CUcontext ctx = CUcontext();
    CCHECK(cuCtxCreate(&ctx, 0, device));
    CCHECK(cuMemAlloc(&in_matrix_d, MATRIX_BYTE_SIZE));
    assert(in_matrix_d);
    CCHECK(cuMemAlloc(&in_vector_d, VECTOR_BYTE_SIZE));
    assert(in_vector_d);
    CCHECK(cuMemAlloc(&out_vector_d, VECTOR_BYTE_SIZE));
    assert(out_vector_d);
    CCHECK(cuMemcpy(in_matrix_d,  CUdeviceptr(&in_matrix_h[0]),
           MATRIX_BYTE_SIZE));
    CCHECK(cuMemcpy(in_vector_d,  CUdeviceptr(&in_vector_h[0]),
           VECTOR_BYTE_SIZE));
    CCHECK(cuMemcpy(out_vector_d, CUdeviceptr(&out_vector_h[0]),
           VECTOR_BYTE_SIZE));
    CUmodule module = CUmodule();
    CUfunction fun = CUfunction(); 
   
    build(module, fun, file_paths, kernel_name);
       
    void* kernel_params[] = {&in_matrix_d,
                             (void *)(&MATRIX_WIDTH),
                             (void *)(&MATRIX_HEIGHT),
                             &in_vector_d,
                             &out_vector_d};
    CCHECK(cuLaunchKernel(fun, 
                    grid_dim_x, grid_dim_y, grid_dim_z,
                    block_dim_x, block_dim_y, block_dim_z,
                    0,//shared_mem_bytes
                    0,//stream
                    kernel_params,
                    0));
    
    CCHECK(cuMemcpy(CUdeviceptr(&out_vector_h[0]), out_vector_d,
                    VECTOR_BYTE_SIZE));
      
    // print first two and last elements
    std::cout << "vector[0]    = " << out_vector_h[ 0 ] << '\n';
    std::cout << "vector[1]    = " << out_vector_h[ 1 ] << '\n';
    std::cout << "vector[last] = " << out_vector_h.back() << std::endl;

    CCHECK(cuMemFree(in_matrix_d));
    CCHECK(cuMemFree(in_vector_d));
    CCHECK(cuMemFree(out_vector_d));
    CCHECK(cuModuleUnload(module));
    CCHECK(cuCtxDestroy(ctx));
}
Beispiel #26
0
int main() {
  CU_ERROR_CHECK(cuInit(0));

  int count;
  CU_ERROR_CHECK(cuDeviceGetCount(&count));

  count = (count > 2) ? 2 : count;

  CUdevice devices[count];
  for (int i = 0; i < count; i++)
    CU_ERROR_CHECK(cuDeviceGet(&devices[i], i));

  // Question 1:  Can you create multiple contexts on the same device?
  {
    fprintf(stderr, "Attempting to create multiple contexts on each device...\n");
    CUcontext contexts[count * N];
    size_t j = 0;
    for (int i = 0; i < count; i++) {
      CUresult error = CUDA_SUCCESS;
      size_t k;
      for (k = 0; k < N && error == CUDA_SUCCESS; k++) {
        error = cuCtxCreate(&contexts[j], CU_CTX_SCHED_AUTO, devices[i]);
        if (error == CUDA_SUCCESS)
          CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[j++]));
      }
      fprintf(stderr, "  created %zu contexts on device %d before cuCtxCreate returned \"%s\"\n", (k - 1), i, cuGetErrorString(error));
    }

    CUresult error = CUDA_SUCCESS;
    size_t k;
    for (k = 0; k < j && error == CUDA_SUCCESS; k++)
      error = cuCtxPushCurrent(contexts[k]);
    if (error == CUDA_SUCCESS)
      fprintf(stderr, "  successfully pushed %zu contexts with cuCtxPushCurrent\n", k);
    else
      fprintf(stderr, "  pushed %zu contexts before cuCtxPushCurrent returned \"%s\"\n", (k - 1), cuGetErrorString(error));

    for (size_t k = 0; k < j; k++)
      CU_ERROR_CHECK(cuCtxDestroy(contexts[k]));

    fprintf(stderr, "\n");
  }

  CUcontext contexts[count][2];
  for (int i = 0; i < count; i++) {
    for (size_t j = 0; j < 2; j++) {
      CU_ERROR_CHECK(cuCtxCreate(&contexts[i][j], CU_CTX_SCHED_AUTO, devices[i]));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[i][j]));
    }
  }

  // Question 2:  Can you access a host pointer in a different context from
  // which it was created?
  // Question 3:  Can you free a host pointer in a different context from which
  // it was created?
  {
    void * hPtr;
    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
    CU_ERROR_CHECK(cuMemAllocHost(&hPtr, 1024));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));

    CUdeviceptr dPtr[count];
    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
    CU_ERROR_CHECK(cuMemAlloc(&dPtr[0], 1024)); // Different context, same device
    fprintf(stderr, "Accessing a host pointer from a different context to which it was allocated (on the same device) returns \"%s\"\n", cuGetErrorString(cuMemcpyHtoD(dPtr[0], hPtr, 1024)));
    CU_ERROR_CHECK(cuMemFree(dPtr[0]));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));
    if (count > 1) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0]));
      CU_ERROR_CHECK(cuMemAlloc(&dPtr[1], 1024)); // Different context, different device
      fprintf(stderr, "Accessing a host pointer from a different context to which it was allocated (on a different device) returns \"%s\"\n", cuGetErrorString(cuMemcpyHtoD(dPtr[1], hPtr, 1024)));
    CU_ERROR_CHECK(cuMemFree(dPtr[1]));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0]));
    }

    fprintf(stderr, "\n");

    CUresult error = CUDA_ERROR_UNKNOWN;
    if (count > 1) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0]));
      error = cuMemFreeHost(hPtr);
      fprintf(stderr, "Freeing a host pointer from a different context to which it was allocated (on a different device) returns \"%s\"\n", cuGetErrorString(error));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
      error = cuMemFreeHost(hPtr);
      fprintf(stderr, "Freeing a host pointer from a different context to which it was allocated (on the same device) returns \"%s\"\n", cuGetErrorString(error));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
      error = cuMemFreeHost(hPtr);
      fprintf(stderr, "Freeing a host pointer from the same context to which it was allocated returns \"%s\"\n", cuGetErrorString(error));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
    }

    fprintf(stderr, "\n");
  }

  // Question 4:  Can you access a device pointer in a different context from
  // which it was created?
  // Question 5:  Can you free a device pointer in a different context from which
  // it was created?
  {
    CUdeviceptr dPtr[count][2];
    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
    CU_ERROR_CHECK(cuMemAlloc(&dPtr[0][0], 1024));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
    CU_ERROR_CHECK(cuMemAlloc(&dPtr[0][1], 1024));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));

    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
    fprintf(stderr, "Accessing a device pointer from a different context to which it was allocated (on the same device) returns \"%s\"\n", cuGetErrorString(cuMemcpyDtoD(dPtr[0][0], dPtr[0][1], 1024)));
    CU_ERROR_CHECK(cuMemFree(dPtr[0][1]));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));

    if (count > 1) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0]));
      CU_ERROR_CHECK(cuMemAlloc(&dPtr[1][0], 1024)); // Different context, different device
      fprintf(stderr, "Accessing a device pointer from a different context to which it was allocated (on a different device) returns \"%s\"\n", cuGetErrorString(cuMemcpyDtoD(dPtr[0][0], dPtr[1][0], 1024)));
      CU_ERROR_CHECK(cuMemFree(dPtr[1][0]));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0]));
    }

    fprintf(stderr, "\n");

    CUresult error = CUDA_ERROR_UNKNOWN;
    if (count > 1) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0]));
      error = cuMemFree(dPtr[0][0]);
      fprintf(stderr, "Freeing a device pointer from a different context to which it was allocated (on a different device) returns \"%s\"\n", cuGetErrorString(error));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
      error = cuMemFree(dPtr[0][0]);
      fprintf(stderr, "Freeing a device pointer from a different context to which it was allocated (on the same device) returns \"%s\"\n", cuGetErrorString(error));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
      error = cuMemFree(dPtr[0][0]);
      fprintf(stderr, "Freeing a device pointer from the same context to which it was allocated returns \"%s\"\n", cuGetErrorString(error));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
    }

    fprintf(stderr, "\n");
  }

  // Question 6:  Can you access a module in a different context from which it
  // was loaded?
  // Question 7:  Can you unload a module in a different context from which it
  // was loaded?
  {
    CUmodule module;
    CUdeviceptr ptr;
    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
    CU_ERROR_CHECK(cuModuleLoad(&module,  "kernel-test.ptx"));
    CU_ERROR_CHECK(cuMemAlloc(&ptr, sizeof(float)));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));

    CUfunction function = 0;
    if (count > 0) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0]));
      fprintf(stderr, "Getting a function pointer from a different context to which it was loaded (on a different device) returns \"%s\"\n", cuGetErrorString(cuModuleGetFunction(&function, module, "kernel")));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0]));
    }
    if (function == 0) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
      fprintf(stderr, "Getting a function pointer from a different context to which it was loaded (on the same device) returns \"%s\"\n", cuGetErrorString(cuModuleGetFunction(&function, module, "kernel")));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));
    }
    if (function == 0) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
      fprintf(stderr, "Getting a function pointer from the same context to which it was loaded returns \"%s\"\n", cuGetErrorString(cuModuleGetFunction(&function, module, "kernel")));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
    }

    fprintf(stderr, "\n");

    CUdeviceptr a, b;
    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
    CU_ERROR_CHECK(cuMemAlloc(&a, sizeof(float)));
    CU_ERROR_CHECK(cuMemAlloc(&b, sizeof(float)));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
    void * params[] = { &a, & b };

    CUresult error = CUDA_ERROR_UNKNOWN;
    if (count > 0) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0]));
      fprintf(stderr, "Launching a function from a different context to which it was loaded (on a different device) returns \"%s\"\n", cuGetErrorString(error = cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL)));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
      fprintf(stderr, "Launching a function from a different context to which it was loaded (on the same device) returns \"%s\"\n", cuGetErrorString(error = cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL)));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
      fprintf(stderr, "Launching a function from the same context to which it was loaded returns \"%s\"\n", cuGetErrorString(error = cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL)));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
    }

    fprintf(stderr, "\n");

    error = CUDA_ERROR_UNKNOWN;
    if (count > 0) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0]));
      fprintf(stderr, "Unloading a module from a different context to which it was loaded (on a different device) returns \"%s\"\n", cuGetErrorString(error = cuModuleUnload(module)));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
      fprintf(stderr, "Unloading a module from a different context to which it was loaded (on the same device) returns \"%s\"\n", cuGetErrorString(error = cuModuleUnload(module)));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
      fprintf(stderr, "Unloading a module from the same context to which it was loaded returns \"%s\"\n", cuGetErrorString(error = cuModuleUnload(module)));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
    }

    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
    CU_ERROR_CHECK(cuMemFree(a));
    CU_ERROR_CHECK(cuMemFree(b));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
  }

  for (int i = 0; i < count; i++) {
    for (size_t j = 0; j < 2; j++)
      CU_ERROR_CHECK(cuCtxDestroy(contexts[i][j]));
  }

  return 0;
}
Beispiel #27
0
int cuda_test_madd_vmmap_hybrid(unsigned int n, char *path)
{
	int i, j, idx;
	CUresult res;
	CUdevice dev;
	CUcontext ctx;
	CUfunction function;
	CUmodule module;
	CUdeviceptr a_dev, b_dev, c_dev;
	unsigned int *a_buf, *b_buf, *c_buf;
	unsigned long long int a_phys, b_phys, c_phys;
	unsigned int *c = (unsigned int *) malloc (n*n * sizeof(unsigned int));
	int block_x, block_y, grid_x, grid_y;
	char fname[256];
	int ret = 0;
	struct timeval tv;
	struct timeval tv_total_start, tv_total_end;
	float total;
	struct timeval tv_h2d_start, tv_h2d_end;
	float h2d;
	struct timeval tv_d2h_start, tv_d2h_end;
	float d2h;
	struct timeval tv_exec_start, tv_exec_end;
	struct timeval tv_mem_alloc_start;
	struct timeval tv_data_init_start;
	float data_init;
	struct timeval tv_conf_kern_start;
	struct timeval tv_close_start;
	float mem_alloc;
	float exec;
	float init_gpu;
	float configure_kernel;
	float close_gpu;
	float data_read;

	unsigned int dummy_b, dummy_c;
		

	/* block_x * block_y should not exceed 512. */
	block_x = n < 16 ? n : 16;
	block_y = n < 16 ? n : 16;
	grid_x = n / block_x;
	if (n % block_x != 0)
		grid_x++;
	grid_y = n / block_y;
	if (n % block_y != 0)
		grid_y++;

	gettimeofday(&tv_total_start, NULL);

	res = cuInit(0);
	if (res != CUDA_SUCCESS) {
		printf("cuInit failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	res = cuDeviceGet(&dev, 0);
	if (res != CUDA_SUCCESS) {
		printf("cuDeviceGet failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	res = cuCtxCreate(&ctx, 0, dev);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxCreate failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	sprintf(fname, "%s/madd_gpu.cubin", path);
	res = cuModuleLoad(&module, fname);
	if (res != CUDA_SUCCESS) {
		printf("cuModuleLoad() failed\n");
		return -1;
	}
	res = cuModuleGetFunction(&function, module, "_Z3addPjS_S_j");
	if (res != CUDA_SUCCESS) {
		printf("cuModuleGetFunction() failed\n");
		return -1;
	}
	res = cuFuncSetBlockShape(function, block_x, block_y, 1);
	if (res != CUDA_SUCCESS) {
		printf("cuFuncSetBlockShape() failed\n");
		return -1;
	}

	gettimeofday(&tv_mem_alloc_start, NULL);

	/* a[] */
	res = cuMemAlloc(&a_dev, n*n * sizeof(unsigned int));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc (a) failed\n");
		return -1;
	}
	res = cuMemMap((void**)&a_buf, a_dev, n*n * sizeof(unsigned int));
	if (res != CUDA_SUCCESS) {
		printf("cuMemMap (a) failed\n");
		return -1;
	}
	res = cuMemGetPhysAddr(&a_phys, (void*)a_buf);
	if (res != CUDA_SUCCESS) {
		printf("cuMemGetPhysAddress (a) failed\n");
		return -1;
	}
	/*printf("a[]: Physical Address 0x%llx\n", a_phys);*/

	/* b[] */
	res = cuMemAlloc(&b_dev, n*n * sizeof(unsigned int));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc (b) failed\n");
		return -1;
	}
	res = cuMemMap((void**)&b_buf, b_dev, n*n * sizeof(unsigned int));
	if (res != CUDA_SUCCESS) {
		printf("cuMemMap (b) failed\n");
		return -1;
	}
	res = cuMemGetPhysAddr(&b_phys, (void*)b_buf);
	if (res != CUDA_SUCCESS) {
		printf("cuMemGetPhysAddress (b) failed\n");
		return -1;
	}
	/*printf("b[]: Physical Address 0x%llx\n", b_phys);*/

	/* c[] */
	res = cuMemAlloc(&c_dev, n*n * sizeof(unsigned int));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc (c) failed\n");
		return -1;
	}
	res = cuMemMap((void**)&c_buf, c_dev, n*n * sizeof(unsigned int));
	if (res != CUDA_SUCCESS) {
		printf("cuMemMap (c) failed\n");
		return -1;
	}
	res = cuMemGetPhysAddr(&c_phys, (void*)c_buf);
	if (res != CUDA_SUCCESS) {
		printf("cuMemGetPhysAddress (c) failed\n");
		return -1;
	}
	/*printf("c[]: Physical Address 0x%llx\n", c_phys);*/

	gettimeofday(&tv_data_init_start, NULL);

	/* initialize A[] & B[] */
	for (i = 0; i < n; i++) {
		idx = i*n;
		for(j = 0; j < n; j++) {			
			a_buf[idx++] = i;
		}
	}
	for (i = 0; i < n; i++) {
		idx = i*n;
		for(j = 0; j < n; j++) {
			b_buf[idx++] = i;
		}
	}

	gettimeofday(&tv_h2d_start, NULL);
	gettimeofday(&tv_h2d_end, NULL);


	gettimeofday(&tv_conf_kern_start, NULL);

	/* set kernel parameters */
	res = cuParamSeti(function, 0, a_dev);	
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (a) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuParamSeti(function, 4, a_dev >> 32);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (a) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuParamSeti(function, 8, b_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (b) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuParamSeti(function, 12, b_dev >> 32);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (b) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuParamSeti(function, 16, c_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuParamSeti(function, 20, c_dev >> 32);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuParamSeti(function, 24, n);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuParamSetSize(function, 28);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSetSize failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	gettimeofday(&tv_exec_start, NULL);
	/* launch the kernel */
	res = cuLaunchGrid(function, grid_x, grid_y);
	if (res != CUDA_SUCCESS) {
		printf("cuLaunchGrid failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	cuCtxSynchronize();
	gettimeofday(&tv_exec_end, NULL);


	gettimeofday(&tv_d2h_start, NULL);
	/* download c[] */


	memcpy(c, c_buf, n*n*sizeof(unsigned int));

	gettimeofday(&tv_d2h_end, NULL);

	/* Read back */
	for (i = 0; i < n; i++) {
		idx = i*n;
		for(j = 0; j < n; j++) {			
			dummy_c = c[idx++];
		}
	}



	gettimeofday(&tv_close_start, NULL);

	res = cuMemUnmap((void*)a_buf);
	if (res != CUDA_SUCCESS) {
		printf("cuMemUnmap (a) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuMemFree(a_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree (a) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuMemUnmap((void*)b_buf);
	if (res != CUDA_SUCCESS) {
		printf("cuMemUnmap (b) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuMemFree(b_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree (b) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuMemUnmap((void*)c_buf);
	if (res != CUDA_SUCCESS) {
		printf("cuMemUnmap (c) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuMemFree(c_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree (c) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	res = cuModuleUnload(module);
	if (res != CUDA_SUCCESS) {
		printf("cuModuleUnload failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	res = cuCtxDestroy(ctx);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxDestroy failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	gettimeofday(&tv_total_end, NULL);




	tvsub(&tv_mem_alloc_start, &tv_total_start, &tv);
	init_gpu = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	tvsub(&tv_data_init_start, &tv_mem_alloc_start, &tv);
	mem_alloc = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	tvsub(&tv_h2d_start, &tv_data_init_start, &tv);
	data_init = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	tvsub(&tv_h2d_end, &tv_h2d_start, &tv);
	h2d = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	tvsub(&tv_exec_start, &tv_conf_kern_start, &tv);
	configure_kernel = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	tvsub(&tv_exec_end, &tv_exec_start, &tv);
	exec = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	tvsub(&tv_d2h_end, &tv_d2h_start, &tv);
	d2h = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	tvsub(&tv_close_start, &tv_d2h_end, &tv);
	data_read = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	tvsub(&tv_total_end, &tv_close_start, &tv);
	close_gpu = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	tvsub(&tv_total_end, &tv_total_start, &tv);
	total = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	printf("Init: %f\n", init_gpu);
	printf("MemAlloc: %f\n", mem_alloc);
	printf("DataInit: %f\n", data_init);
	printf("HtoD: %f\n", h2d);
	printf("KernConf: %f\n", configure_kernel);
	printf("Exec: %f\n", exec);
	printf("DtoH: %f\n", d2h);
	printf("DataRead: %f\n", data_read);
	printf("Close: %f\n", close_gpu);
	printf("Total: %f\n", total);


	return ret;
}
Beispiel #28
0
int cuda_test_fmadd(unsigned int n, char *path)
{
	int i, j, idx;
	CUresult res;
	CUdevice dev;
	CUcontext ctx;
	CUfunction function;
	CUmodule module;
	CUdeviceptr a_dev, b_dev, c_dev;
	float *a = (float *) malloc (n*n * sizeof(float));
	float *b = (float *) malloc (n*n * sizeof(float));
	float *c = (float *) malloc (n*n * sizeof(float));
	int block_x, block_y, grid_x, grid_y;
	int offset;
	char fname[256];
	struct timeval tv;
	struct timeval tv_total_start, tv_total_end;
	float total;
	struct timeval tv_h2d_start, tv_h2d_end;
	float h2d;
	struct timeval tv_d2h_start, tv_d2h_end;
	float d2h;
	struct timeval tv_exec_start, tv_exec_end;
	float exec;

	/* initialize A[] & B[] */
	for (i = 0; i < n; i++) {
		for(j = 0; j < n; j++) {
			idx = i * n + j;
			a[idx] = i + 0.1;
			b[idx] = i + 0.1;
		}
	}

	/* block_x * block_y should not exceed 512. */
	block_x = n < 16 ? n : 16;
	block_y = n < 16 ? n : 16;
	grid_x = n / block_x;
	if (n % block_x != 0)
		grid_x++;
	grid_y = n / block_y;
	if (n % block_y != 0)
		grid_y++;
	printf("block = (%d, %d)\n", block_x, block_y);
	printf("grid = (%d, %d)\n", grid_x, grid_y);

	gettimeofday(&tv_total_start, NULL);

	res = cuInit(0);
	if (res != CUDA_SUCCESS) {
		printf("cuInit failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	res = cuDeviceGet(&dev, 0);
	if (res != CUDA_SUCCESS) {
		printf("cuDeviceGet failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	res = cuCtxCreate(&ctx, 0, dev);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxCreate failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	sprintf(fname, "%s/fmadd_gpu.cubin", path);
	res = cuModuleLoad(&module, fname);
	if (res != CUDA_SUCCESS) {
		printf("cuModuleLoad() failed\n");
		return -1;
	}
	res = cuModuleGetFunction(&function, module, "_Z3addPfS_S_i");
	if (res != CUDA_SUCCESS) {
		printf("cuModuleGetFunction() failed\n");
		return -1;
	}
	res = cuFuncSetSharedSize(function, 0x40); /* just random */
	if (res != CUDA_SUCCESS) {
		printf("cuFuncSetSharedSize() failed\n");
		return -1;
	}
	res = cuFuncSetBlockShape(function, block_x, block_y, 1);
	if (res != CUDA_SUCCESS) {
		printf("cuFuncSetBlockShape() failed\n");
		return -1;
	}

	/* a[] */
	res = cuMemAlloc(&a_dev, n*n * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc (a) failed\n");
		return -1;
	}
	/* b[] */
	res = cuMemAlloc(&b_dev, n*n * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc (b) failed\n");
		return -1;
	}
	/* c[] */
	res = cuMemAlloc(&c_dev, n*n * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc (c) failed\n");
		return -1;
	}

	gettimeofday(&tv_h2d_start, NULL);
	/* upload a[] and b[] */
	res = cuMemcpyHtoD(a_dev, a, n*n * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD (a) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuMemcpyHtoD(b_dev, b, n*n * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD (b) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	gettimeofday(&tv_h2d_end, NULL);

	/* set kernel parameters */
	offset = 0;
	res = cuParamSetv(function, offset, &a_dev, sizeof(a_dev));	
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (a) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	offset += sizeof(a_dev);
	res = cuParamSetv(function, offset, &b_dev, sizeof(b_dev));
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (b) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	offset += sizeof(b_dev);
	res = cuParamSetv(function, offset, &c_dev, sizeof(c_dev));
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	offset += sizeof(c_dev);
	res = cuParamSetv(function, offset, &n, sizeof(n));
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	offset += sizeof(n);
	res = cuParamSetSize(function, offset);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSetSize failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	gettimeofday(&tv_exec_start, NULL);
	/* launch the kernel */
	res = cuLaunchGrid(function, grid_x, grid_y);
	if (res != CUDA_SUCCESS) {
		printf("cuLaunchGrid failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	cuCtxSynchronize();
	gettimeofday(&tv_exec_end, NULL);

	gettimeofday(&tv_d2h_start, NULL);
	/* download c[] */
	res = cuMemcpyDtoH(c, c_dev, n*n * sizeof(float));
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyDtoH (c) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	gettimeofday(&tv_d2h_end, NULL);

	res = cuMemFree(a_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree (a) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuMemFree(b_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree (b) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}
	res = cuMemFree(c_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree (c) failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	res = cuModuleUnload(module);
	if (res != CUDA_SUCCESS) {
		printf("cuModuleUnload failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	res = cuCtxDestroy(ctx);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxDestroy failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	gettimeofday(&tv_total_end, NULL);

	/* check the results */
	i = j = idx = 0;
	while (i < n) {
		while (j < n) {
			idx = i * n + j;
			if (c[idx] != a[idx] + b[idx]) {
				printf("c[%d] = %f\n", idx, c[idx]);
				printf("a[%d]+b[%d] = %f\n", idx, idx, a[idx]+b[idx]);
				return -1;
			}
			j++;
		}
		i++;
	}

	free(a);
	free(b);
	free(c);

	tvsub(&tv_h2d_end, &tv_h2d_start, &tv);
	h2d = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
	tvsub(&tv_d2h_end, &tv_d2h_start, &tv);
	d2h = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
	tvsub(&tv_exec_end, &tv_exec_start, &tv);
	exec = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
	tvsub(&tv_total_end, &tv_total_start, &tv);
	total = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	printf("HtoD: %f\n", h2d);
	printf("DtoH: %f\n", d2h);
	printf("Exec: %f\n", exec);
	printf("Time (Memcpy + Launch): %f\n", h2d + d2h + exec);
	printf("Total: %f\n", total);

	return 0;
}
int main( int argc, char **argv )
{                               
    CUdevice   main_device  = 0;
    CUcontext  main_context = nullptr;
                             
    CUmodule   mod_vectorAdd = nullptr;
    CUfunction fun_vectorAdd = nullptr;

    std::string path_vectorAdd( "D:/devel/vectoradd-cuda-driverAPI/vectorAdd.cu" );
    std::string ptx_vectorAdd ( "D:/devel/vectoradd-cuda-driverAPI/vectorAdd.ptx" );

    CUdeviceptr input_data  = 0u;
    CUdeviceptr output_data = 0u;

    std::size_t problem_size = 1024;
    try
    {
        //Initialize the driver API
        check_error( cuInit( 0u ) );
             
        {
            int device_count = 0u;
            check_error( cuDeviceGetCount( &device_count ) );

            if( ! device_count )
            {
                std::cerr << "No CUDA devices available" << std::endl;
                throw CUDA_ERROR_NO_DEVICE;
            }
        }

        check_error( cuDeviceGet( &main_device, 0 ) );
        check_error( cuCtxCreate( &main_context, 0, main_device ) );

        //Try to manually compile the source file
        {
            std::stringstream build_command;
            build_command <<
                "nvcc "
                "-ptx "
                "-o " << ptx_vectorAdd << " " <<
                path_vectorAdd;

            if( int build_status = system( build_command.str( ).c_str( ) ) )
            {            
                std::cerr << "Failed to compile source cuda file into a ptx assembly" << std::endl;
                throw CUDA_ERROR_UNKNOWN;
            }

            //Find module entry with assembly
            std::string str_assembly;

            {
                std::ifstream fassembly( ptx_vectorAdd ); 
                if( !fassembly.is_open( ) )
                {           
                    std::cerr << "'Vector Add' assembly unavailable" << std::endl;
                    throw CUDA_ERROR_FILE_NOT_FOUND;
                }         


                fassembly.seekg (0, std::ios::end);
                str_assembly.resize( std::string::size_type( fassembly.tellg() ) );

                fassembly.seekg (0, std::ios::beg);
                fassembly.read( &str_assembly[0], str_assembly.size( ) );

                fassembly.close( );
            }

            auto entry_pos = str_assembly.find( ".entry" );
            if( entry_pos == std::string::npos )
            {       
                std::cerr << "No entry point in 'Vector Add'" << std::endl;
                throw CUDA_ERROR_INVALID_SOURCE;
            }
              
            entry_pos += 6u; //".entry".size( )

            auto search_limit = str_assembly.find_first_of( " (", entry_pos );
            if( search_limit == std::string::npos )
            {       
                std::cerr << "No entry point in 'Vector Add'" << std::endl;
                throw CUDA_ERROR_INVALID_SOURCE;
            }
              
            std::string funcName( str_assembly.substr( entry_pos, search_limit ) );

            check_error( cuModuleLoad        ( &mod_vectorAdd, ptx_vectorAdd.c_str( ) ) );
            check_error( cuModuleGetFunction ( &fun_vectorAdd, mod_vectorAdd, funcName.c_str( ) ) );
        }

        //Play with buffer
        cuMemAlloc( &input_data,  problem_size * sizeof( float ) );
        cuMemAlloc( &output_data, problem_size * sizeof( float ) );

        {             
            int threadsPerBlock = 256;
            int blocksPerGrid   = (problem_size + threadsPerBlock - 1) / threadsPerBlock;

            void* args[] = { &input_data, &output_data, &problem_size };

            cuLaunchKernel(
                fun_vectorAdd,
                blocksPerGrid, 1, 1,
                threadsPerBlock, 1, 1,
                0, 
                0, 
                args,
                nullptr);
        }

        float* result = new float[problem_size];

        cuMemcpyDtoH( result, output_data, problem_size * sizeof( float ) );

        std::copy(
            result, result + problem_size,
            std::ostream_iterator<float>(std::cout, ", ") );  

        delete[] result;
                         
        if( output_data )   cuMemFree      ( output_data );
        if( input_data )    cuMemFree      ( input_data );
        if( mod_vectorAdd ) cuModuleUnload ( mod_vectorAdd );
        if( main_context )  cuCtxDestroy   ( main_context );
    } 
    catch( int return_code )
    {
        if( output_data )   cuMemFree      ( output_data );
        if( input_data )    cuMemFree      ( input_data );
        if( mod_vectorAdd ) cuModuleUnload ( mod_vectorAdd );
        if( main_context )  cuCtxDestroy   ( main_context );

        system("PAUSE");
        return return_code;
int main(int argc, char **argv)
{
  //data
  CUdeviceptr  d_data0   = 0;
  CUdeviceptr  d_data1   = 0;
  DataStruct *h_data0  = 0;
  DataStruct *h_data1  = 0;
  DataStruct h_data_reference0;
  DataStruct h_data_reference1;
  unsigned int memSize = sizeof(DataStruct);
  
  //device references
  CUcontext    hContext = 0;
  CUdevice     hDevice  = 0;
  CUmodule     hModule  = 0;
  CUstream     hStream  = 0;

  // Initialize the device and get a handle to the kernel
  CUresult status = initialize(0, &hContext, &hDevice, &hModule, &hStream);
  
  // Allocate memory on host and device
  if ((h_data0 = (DataStruct *)malloc(memSize)) == NULL)
    {
      std::cerr << "Could not allocate host memory" << std::endl;
      exit(-1);
    }
  status = cuMemAlloc(&d_data0, memSize);

  if ((h_data1 = (DataStruct *)malloc(memSize)) == NULL)
    {
      std::cerr << "Could not allocate host memory" << std::endl;
      exit(-1);
    }
  status = cuMemAlloc(&d_data1, memSize);
  if (status != CUDA_SUCCESS)
    printf("ERROR: during cuMemAlloc\n");

  ///////////////////////////////////////////////////////////////////////////////
  //======================= test cases ========================================//
  ///////////////////////////////////////////////////////////////////////////////
  std::string name = "";
  unsigned int testnum=0;
  unsigned int passed=0;

  //++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  /////////////////////// Ralf ///////////////////////////////////////////////////
  //++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

  if(runRalfFunction("test_phi_scalar", test_phi_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
    passed++;
  testnum++;
  if(runRalfFunction("test_phi2_scalar", test_phi2_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
    passed++;
  testnum++;
  if(runRalfFunction("test_phi3_scalar", test_phi3_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
    passed++;
  testnum++;
  if(runRalfFunction("test_phi4_scalar", test_phi4_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
    passed++;
  testnum++;
  if(runRalfFunction("test_phi5_scalar", test_phi5_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
    passed++;
  testnum++;
  if(runRalfFunction("test_phi6_scalar", test_phi6_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
    passed++;
  testnum++;
  if(runRalfFunction("test_phi7_scalar", test_phi7_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
    passed++;
  testnum++;
  if(runRalfFunction("test_phi8_scalar", test_phi8_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
    passed++;
  testnum++;
  if(runRalfFunction("test_phi9_scalar", test_phi9_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
    passed++;
  testnum++;

  if(runRalfFunction("test_loopbad_scalar", test_loopbad_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
    passed++;
  testnum++;
  if(runRalfFunction("test_loop23_scalar", test_loop23_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
    passed++;
  testnum++;
  if(runRalfFunction("test_loop13_scalar", test_loop13_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
    passed++;
  testnum++;

  ////////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_GetElementPointer_constant"; /////////////////////
  setZero(h_data0,&h_data_reference0);

  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_GetElementPointer_constant(&h_data_reference0);                //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;
  
  ///////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_calculate"; /////////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->i = h_data_reference0.i = 3;
  h_data0->f = h_data_reference0.f = 3.2;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_calculate(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

  ///////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_parquetShader"; /////////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->f = h_data_reference0.f = 1;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_parquetShader(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

  ///////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_GetElementPointer_dyn"; /////////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->i = h_data_reference0.i = 3;
  h_data0->u = h_data_reference0.u = 7;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_GetElementPointer_dyn(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

  ///////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_branch_simple"; // Branch 1 /////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->f = h_data_reference0.f = -4;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_branch_simple(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

  ///////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_branch_simple"; // Branch 2 /////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->f = h_data_reference0.f = 8;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_branch_simple(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;
  
  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_branch_simplePHI"; // Branch 1 /////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->f = h_data_reference0.f = -10;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_branch_simplePHI(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_branch_loop"; //////////////////////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->i = h_data_reference0.i = 100;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_branch_loop(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_math"; //////////////////////////////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->f = h_data_reference0.f = 1.4;
  h_data0->i = h_data_reference0.i = 3;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_math(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_signedOperands"; //////////////////////////////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->i = h_data_reference0.i = 3;
  h_data0->f = h_data_reference0.f = -7;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_signedOperands(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

    
  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_constantOperands"; //////////////////////////////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->i = h_data_reference0.i = 3;
  h_data0->f = h_data_reference0.f = -1.44;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_constantOperands(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;
    
  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_branch_loop_semihard"; /////////////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->i = h_data_reference0.i = 10;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_branch_loop_semihard(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_branch_loop_hard"; // Branch 1 /////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->i = h_data_reference0.i = 1;
  h_data0->u = h_data_reference0.u = 3;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_branch_loop_hard(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;
  
  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*////////////*/ name = "test_branch_loop_hard"; // Branch 2 /////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->i = h_data_reference0.i = 7;
  h_data0->u = h_data_reference0.u = 10;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_branch_loop_hard(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;
 
  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_binaryInst"; /////////////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->i = h_data_reference0.i = 5;
  h_data0->f = h_data_reference0.f = -121.23;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_binaryInst(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_selp"; /////////////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->i = h_data_reference0.i = -15;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_selp(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_GetElementPointer_complicated"; /////////////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->i = h_data_reference0.i = 1;
  h_data_reference0.s.s.f = h_data0->s.s.f = 3.11;
  h_data_reference0.s.sa[2].f = h_data0->s.sa[2].f = -4.32;
  h_data_reference0.s.sa[h_data0->i].f = h_data0->s.sa[h_data0->i].f = 111.3;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_GetElementPointer_complicated(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_call"; /////////////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->i = h_data_reference0.i = 10;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_call(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*/////////////*/ name = "test_alloca"; /////////////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->i = h_data_reference0.i = 1;
  h_data0->f = h_data_reference0.f = -3.23;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_alloca(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                     //compare Data
    {passed++; std::cout << " => Test passed!!!\n";}
  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_alloca_complicated"; /////////////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->i = h_data_reference0.i = 1;
  h_data0->f = h_data_reference0.f = 23.213;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_alloca_complicated(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;


  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_globalVariables"; /////////////////////////
  setZero(h_data0,&h_data_reference0);
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_globalVariables(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_specialRegisters_x"; /////////////////////////
  setZero(h_data0,&h_data_reference0);
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize, 2,3,4, 2,3);   //run device function
  runHostTestFunction(test_specialRegisters_x, &h_data_reference0,   2,3,4, 2,3);   //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;


  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_specialRegisters_y"; /////////////////////////
  setZero(h_data0,&h_data_reference0);
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize, 2,3,4, 2,3);   //run device function
  runHostTestFunction(test_specialRegisters_x, &h_data_reference0,   2,3,4, 2,3);   //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_dualArgument"; /////////////////////////
  setZero(h_data0,&h_data_reference0);
  setZero(h_data1,&h_data_reference1);
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunDualTestFunction(&hModule, name, d_data0, d_data1, h_data0, h_data1, memSize);   //run device function

  test_dualArgument(&h_data_reference0,&h_data_reference1);   //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  if(compareData(h_data1,&h_data_reference1))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_vector"; /////////////////////////
  setZero(h_data0,&h_data_reference0);

  h_data0->fa[0] = h_data_reference0.fa[0] = 0.43f;
  h_data0->fa[1] = h_data_reference0.fa[1] = 0.234f;
  h_data0->fa[2] = h_data_reference0.fa[2] = 12893.f;
  h_data0->fa[3] = h_data_reference0.fa[3] = 13.33f;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_vector(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                     //compare Data
    {passed++; std::cout << " => Test passed!!!\n";}
  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_reg2Const"; /////////////////////////
  setZero(h_data0,&h_data_reference0);

  /*
  unsigned int bytes; //size of constant
  CUdeviceptr devptr_const=0; 
  status = cuModuleGetGlobal(&devptr_const,
			     &bytes,
			     hModule, "__ptx_constant_data_global");

  cuMemcpyHtoD(devptr_const, h_data0, memSize);
  */

  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_reg2Const(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                     //compare Data
    {passed++; std::cout << " => Test passed!!!\n";}
  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_constantMemory"; /////////////////////////
  setZero(h_data0,&h_data_reference0);

  h_data0->fa[0] = __ptx_constant_data_global.fa[0] = 0.2348f;
  
  unsigned int bytes; //size of constant
  CUdeviceptr devptr_const=0; 
  status = cuModuleGetGlobal(&devptr_const,
			     &bytes,
			     hModule, "__ptx_constant_data_global");

  cuMemcpyHtoD(devptr_const, h_data0, memSize);

  setZero(h_data0,&h_data_reference0);

  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_constantMemory(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                     //compare Data
    {passed++; std::cout << " => Test passed!!!\n";}
  testnum++;


  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_sharedMemory"; /////////////////////////
  setZero(h_data0,&h_data_reference0);

  for(int i = 0; i < ARRAY_N/2; i++)
    h_data0->fa[i*2] = i;

  for(int i = 0; i < ARRAY_N/2; i++)
    h_data0->fa[i*2+1] = -i;

  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize, 32,1,1, 1,1);   //run device function

  for(int i = 0; i < ARRAY_N/2; i++)
    h_data_reference0.fa[i] = i;
  for(int i = 0; i < ARRAY_N/2; i++)
    h_data_reference0.fa[i+32] = -i;
  //  runHostTestFunction(test_sharedMemory, &h_data_reference0, 16,1,1, 1,1);                                 //run host reference

  if(compareData(h_data0,&h_data_reference0))                     //compare Data
    {passed++; std::cout << " => Test passed!!!\n";}
  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_lightShader"; /////////////////////////
  setZero(h_data0,&h_data_reference0);

  /*
  unsigned int bytes; //size of constant
  CUdeviceptr devptr_const=0; 
  status = cuModuleGetGlobal(&devptr_const,
			     &bytes,
			     hModule, "__ptx_constant_data_global");

  cuMemcpyHtoD(devptr_const, h_data0, memSize);
  */

  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function

  /*
  test_lightShader(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                     //compare Data
    {passed++; std::cout << " => Test passed!!!\n";}
  testnum++;
  */

  ///////////////////////////////////////////////////////////////////////////////
  //======================= test cases END ====================================//
  ///////////////////////////////////////////////////////////////////////////////

  // Check the result
  std::cout << "\nPASSED " << passed << " tests" << std::endl;
  std::cout << "FAILED " << (testnum-passed) << " tests" << std::endl;

  // Cleanup
  if (d_data0)
    {
      cuMemFree(d_data0);
      d_data0 = 0;
    }
  if (d_data1)
    {
      cuMemFree(d_data1);
      d_data1 = 0;
    }
  if (h_data0)
    {
      free(h_data0);
      h_data0 = 0;
    }
  if (h_data1)
    {
      free(h_data1);
      h_data1 = 0;
    }
  if (hModule)
    {
      cuModuleUnload(hModule);
      hModule = 0;
    }
  if (hStream)
    {
      cuStreamDestroy(hStream);
      hStream = 0;
    }
  if (hContext)
    {
      cuCtxDestroy(hContext);
      hContext = 0;
    }
  return 0;
}