Exemple #1
0
	bool load_kernels(bool experimental)
	{
		/* check if cuda init succeeded */
		if(cuContext == 0)
			return false;
		
		/* check if GPU is supported with current feature set */
		if(!support_device(experimental))
			return false;

		/* get kernel */
		string cubin = compile_kernel();

		if(cubin == "")
			return false;

		/* open module */
		cuda_push_context();

		CUresult result = cuModuleLoad(&cuModule, cubin.c_str());
		if(cuda_error_(result, "cuModuleLoad"))
			cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str()));

		cuda_pop_context();

		return (result == CUDA_SUCCESS);
	}
Exemple #2
0
/*
 * Bring up the CUDA driver API on device 0: initialise the driver, create a
 * context in *pctx and load the module file f into *pmod.
 *
 * Returns CUDA_SUCCESS, or the CUresult of the first failing call after
 * printing a diagnostic to stdout.  When the module cannot be loaded the
 * context created here is destroyed before returning.
 */
CUresult cuda_driver_api_init(CUcontext *pctx, CUmodule *pmod, const char *f)
{
	CUdevice device;
	CUresult status;

	if ((status = cuInit(0)) != CUDA_SUCCESS) {
		printf("cuInit failed: res = %lu\n", (unsigned long)status);
		return status;
	}

	if ((status = cuDeviceGet(&device, 0)) != CUDA_SUCCESS) {
		printf("cuDeviceGet failed: res = %lu\n", (unsigned long)status);
		return status;
	}

	if ((status = cuCtxCreate(pctx, 0, device)) != CUDA_SUCCESS) {
		printf("cuCtxCreate failed: res = %lu\n", (unsigned long)status);
		return status;
	}

	status = cuModuleLoad(pmod, f);
	if (status == CUDA_SUCCESS)
		return CUDA_SUCCESS;

	/* module load failed: undo the context creation */
	printf("cuModuleLoad() failed\n");
	cuCtxDestroy(*pctx);
	return status;
}
Exemple #3
0
/*
 * Launch a CUDA kernel over a set of float arrays and return a new
 * CudaFloatArray holding the result.
 *
 * argv[0 .. argcv[0]-1] are the input CudaFloatArray objects and
 * argv[argcv[0]] is a string giving the module file to load.  The kernel
 * name is "block" followed by the module path with its leading "_cuda/"
 * (six characters) skipped and everything from the first '.' removed.
 * The kernel receives (result, input_0, ..., input_n-1, size), where size is
 * the smallest input length.
 *
 * The context and module created here are released before returning, so
 * repeated calls do not accumulate driver resources.
 *
 * NOTE(review): like the rest of this function, the error paths assume
 * raiseError() and a failing errcheck() do not return - confirm.
 */
Object cuda_over_map(Object self, int nparts, int *argcv,
        Object *argv, int flags) {
    int nargs = argcv[0];
    int deviceCount = 0;
    /* A failing cuInit leaves deviceCount at 0 and is reported just below. */
    cuInit(0);
    cuDeviceGetCount(&deviceCount);
    if (deviceCount == 0) {
        raiseError("No CUDA devices found");
    }
    CUdevice cuDevice;
    CUcontext cuContext;
    CUmodule cuModule;
    CUfunction cuFunc;
    errcheck(cuDeviceGet(&cuDevice, 0));
    errcheck(cuCtxCreate(&cuContext, 0, cuDevice));
    CUdeviceptr d_res;
    errcheck(cuModuleLoad(&cuModule, grcstring(argv[nargs])));
    CUdeviceptr dps[nargs];
    void *args[nargs+2];
    int size = INT_MAX;
    for (int i=0; i<nargs; i++) {
        struct CudaFloatArray *a = (struct CudaFloatArray *)argv[i];
        /* size only ever shrinks, so every buffer holds at least the final size */
        if (a->size < size)
            size = a->size;
        errcheck(cuMemAlloc(&dps[i], size * sizeof(float)));
        errcheck(cuMemcpyHtoD(dps[i], &a->data, size * sizeof(float)));
        args[i+1] = &dps[i];
    }
    struct CudaFloatArray *r =
        (struct CudaFloatArray *)(alloc_CudaFloatArray(size));
    /* size_t avoids narrowing the byte count to int */
    size_t fsize = sizeof(float) * (size_t)size;
    errcheck(cuMemAlloc(&d_res, fsize));
    errcheck(cuMemcpyHtoD(d_res, &r->data, fsize));
    args[0] = &d_res;
    args[nargs+1] = &size;

    int threadsPerBlock = 256;
    int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;

    /* Build the kernel name with explicit bounds checks on name[]. */
    char name[256];
    name[0] = 0;
    const char *path = grcstring(argv[nargs]);
    size_t skip = strlen("_cuda/");
    if (strlen(path) < skip
            || strlen("block") + strlen(path + skip) >= sizeof(name)) {
        raiseError("CUDA module path is malformed or too long");
    } else {
        strcpy(name, "block");
        strcat(name, path + skip);
        for (int i=0; name[i] != 0; i++)
            if (name[i] == '.') {
                name[i] = 0;
                break;
            }
    }
    errcheck(cuModuleGetFunction(&cuFunc, cuModule, name));
    errcheck(cuLaunchKernel(cuFunc, blocksPerGrid, 1, 1,
        threadsPerBlock, 1, 1,
        0,
        NULL, args, NULL));
    errcheck(cuMemcpyDtoH(&r->data, d_res, fsize));

    /* Release everything this call created on the device. */
    cuMemFree(d_res);
    for (int i=0; i<nargs; i++)
        cuMemFree(dps[i]);
    cuModuleUnload(cuModule);
    cuCtxDestroy(cuContext);
    return (Object)r;
}
Exemple #4
0
/*
 * Load "videoPP64.ptx" and look up the three colour-conversion /
 * post-processing kernels into the global function handles.
 *
 * Returns CUDA_SUCCESS once every lookup has gone through checkCudaErrors.
 * NOTE(review): checkCudaErrors is defined elsewhere; presumably it aborts
 * on failure, so no other status is ever returned - confirm.
 */
CUresult loadCUDAModules()
{
    CUmodule cuModule_;
    checkCudaErrors(cuModuleLoad(&cuModule_, "videoPP64.ptx"));
    checkCudaErrors(cuModuleGetFunction(&g_kernelNV12toARGB, cuModule_, "NV12ToARGBdrvapi"));
    checkCudaErrors(cuModuleGetFunction(&g_kernelARGBtoNV12, cuModule_, "ARGBToNv12drvapi"));
    checkCudaErrors(cuModuleGetFunction(&g_kernelARGBpostprocess, cuModule_, "ARGBpostprocess"));

    /* The function is declared to return CUresult; falling off the end is
     * undefined behaviour in C++. */
    return CUDA_SUCCESS;
}
/*
 * Create a CUDA context, load the module "./data/<cubin_name>" and fetch
 * kernel_name from it into *mathop.
 *
 * _cuContext is received by value, so the context created here is not
 * visible to the caller through that parameter.
 * Progress is reported on stdout; failures are passed to Error().
 * NOTE(review): Error() is defined elsewhere; the code after each Error()
 * call still runs if it returns - confirm it does not.
 */
static CUresult
initCuda(CUcontext _cuContext, char* executablePath, CUfunction *mathop, 
	int argc, char** argv, const char* cubin_name, const char* kernel_name)
{
    CUdevice cuDevice;
    CUT_DEVICE_INIT_DRV(cuDevice, argc, argv);
    print_GetProperties(cuDevice);

    CUresult status = cuCtxCreate( &_cuContext, 0, cuDevice );
    if ( CUDA_SUCCESS != status ) {
        Error(_cuContext, status);
    }
    else printf("(1) context creation successful\n");

    /* The cutil lookup is only used for diagnostics; the module is loaded
     * from ./data/ below. */
    char* module_path = cutFindFilePath(cubin_name, executablePath);
    printf ("\t cubin:%s, path:%s, mmp_ptr:%p\n", cubin_name, executablePath, (void*)module_path);
    if(module_path != NULL) {
      /* %.4s stops at the terminator, so paths shorter than four
       * characters are not over-read */
      printf ("\t cubin:%s, path:%s, module_path:%.4s\n", cubin_name, executablePath, module_path);
      cutFree(module_path);
    }
    const char* data_path = "./data/";
    size_t len_path = strlen(data_path);
    size_t len_fn = strlen(cubin_name);

    /* +1 for the terminating NUL */
    char* module_path_new = (char*)malloc(len_path + len_fn + 1);
    if (module_path_new == NULL) {
        status = CUDA_ERROR_OUT_OF_MEMORY;
        Error(_cuContext, status);
        return status;
    }
    strcpy(module_path_new, data_path);
    strcat(module_path_new, cubin_name);

    FILE *fp = fopen(module_path_new,"r");
    if( fp ) {
	printf("(2) cubin_File found in modulepath:%s\n", module_path_new);
        fclose(fp);
    } else {
	printf("(2) cubin file not exist: %s\n", module_path_new);
    }
    CUmodule cuModule;
    status = cuModuleLoad(&cuModule, module_path_new);
    /* allocated with malloc above, so released with free */
    free(module_path_new);
    if ( CUDA_SUCCESS != status ) {
        Error(_cuContext, status);
    }
    else printf ("(3) module Load successful\n");

    CUfunction cuFunction = 0;
    status = cuModuleGetFunction(&cuFunction, cuModule, kernel_name);
    if ( CUDA_SUCCESS != status) {
        Error(_cuContext, status);
    }
    else printf ("(4) getFunction successful w/cuFunction\n");

    *mathop = cuFunction;

    return CUDA_SUCCESS;

}
Exemple #6
0
/*
 * Class:     edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2
 * Method:    loadFunction
 * Signature: (JLjava/lang/String;I)V
 *
 * Loads the module named by `filename` into the global cuModule, looks up
 * the entry kernel into the global cuFunction and fills its parameter block:
 * seven CUdeviceptr-sized values followed by the int num_blocks.
 * heap_end_ptr is stored in the global heapEndPtr.
 *
 * NOTE(review): CHECK_STATUS is defined elsewhere; presumably it raises a
 * Java exception and returns when status is an error - confirm.
 */
JNIEXPORT void JNICALL Java_edu_syr_pcpratts_rootbeer_runtime2_cuda_CudaRuntime2_loadFunction
  (JNIEnv *env, jobject this_obj, jlong heap_end_ptr, jstring filename, jint num_blocks){

  int offset;
  CUresult status;
  const char * native_filename;
  heapEndPtr = heap_end_ptr;
  
  native_filename = (*env)->GetStringUTFChars(env, filename, 0);
  if(native_filename == NULL){
    /* GetStringUTFChars failed; per the JNI spec an OutOfMemoryError is pending */
    return;
  }
  status = cuModuleLoad(&cuModule, native_filename);
  /* release before checking, so an early exit from CHECK_STATUS cannot leak the string */
  (*env)->ReleaseStringUTFChars(env, filename, native_filename);
  CHECK_STATUS(env, "error in cuModuleLoad", status);

  status = cuModuleGetFunction(&cuFunction, cuModule, "_Z5entryPcS_PiPxS1_S0_S0_i"); 
  CHECK_STATUS(env,"error in cuModuleGetFunction",status)

  status = cuFuncSetCacheConfig(cuFunction, CU_FUNC_CACHE_PREFER_L1);
  CHECK_STATUS(env,"error in cuFuncSetCacheConfig",status)

  /* total parameter block: 7 device pointers + 1 int */
  status = cuParamSetSize(cuFunction, (7 * sizeof(CUdeviceptr) + sizeof(int))); 
  CHECK_STATUS(env,"error in cuParamSetSize",status)

  /* parameters are packed back to back; offset tracks the running position */
  offset = 0;
  status = cuParamSetv(cuFunction, offset, (void *) &gcInfoSpace, sizeof(CUdeviceptr)); 
  CHECK_STATUS(env,"error in cuParamSetv gcInfoSpace",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuToSpace, sizeof(CUdeviceptr)); 
  CHECK_STATUS(env,"error in cuParamSetv gpuToSpace",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuHandlesMemory, sizeof(CUdeviceptr)); 
  CHECK_STATUS(env,"error in cuParamSetv gpuHandlesMemory",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuHeapEndPtr, sizeof(CUdeviceptr)); 
  CHECK_STATUS(env,"error in cuParamSetv gpuHeapEndPtr",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuBufferSize, sizeof(CUdeviceptr));
  CHECK_STATUS(env,"error in cuParamSetv gpuBufferSize",status)
  offset += sizeof(CUdeviceptr); 

  status = cuParamSetv(cuFunction, offset, (void *) &gpuExceptionsMemory, sizeof(CUdeviceptr)); 
  CHECK_STATUS(env,"error in cuParamSetv gpuExceptionsMemory",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSetv(cuFunction, offset, (void *) &gpuClassMemory, sizeof(CUdeviceptr)); 
  CHECK_STATUS(env,"error in cuParamSetv gpuClassMemory",status)
  offset += sizeof(CUdeviceptr);

  status = cuParamSeti(cuFunction, offset, num_blocks); 
  CHECK_STATUS(env,"error in cuParamSeti num_blocks",status)
}
Exemple #7
0
// Loads the module file `filename` with cuModuleLoad into a newly allocated
// CuModule. On the success path the module records this context as its owner
// and is handed to the caller by swapping it into *ppModule; the function
// then returns CUDA_SUCCESS.
CUresult CuContext::LoadModuleFilename(const std::string& filename, 
	ModulePtr* ppModule) {

	ModulePtr module(new CuModule);
	CUresult result = cuModuleLoad(&module->_module, filename.c_str());
	// NOTE(review): HANDLE_RESULT is defined elsewhere; presumably it returns
	// `result` early when it is not CUDA_SUCCESS - confirm.
	HANDLE_RESULT();

	module->_context = this;
	// swap leaves whatever *ppModule previously held in the local `module`
	ppModule->swap(module);
	return CUDA_SUCCESS;
}
Exemple #8
0
/*
 * Report the failure stored in the global `res` using the given printf
 * format (which takes conv(res) as its single argument) and terminate.
 * Does nothing when `res` is CUDA_SUCCESS.
 */
static void
exit_if_failed(const char *fmt){
  if(res == CUDA_SUCCESS)
    return;
  printf(fmt, conv(res));
  exit(1);
}

/*
 * Initialization required before the kernel program can be used:
 * derives the launch geometry from N, creates a context on device 0,
 * loads ./cuda_main.cubin, looks up "cuda_main" and configures its
 * shared-memory size and block shape.  Any failure exits the process.
 */
void
init_cuda(void){

  /* square thread blocks with an edge of at most 16 */
  thread_num = (N <= 16) ? N : 16 ;  
  /* N divided by the block area, rounded up */
  block_num = N / (thread_num*thread_num)
            + ((N % (thread_num*thread_num) != 0) ? 1 : 0);

  res = cuInit(0);
  exit_if_failed("cuInit failed: res = %s\n");

  res = cuDeviceGet(&dev, 0);
  exit_if_failed("cuDeviceGet failed: res = %s\n");

  res = cuCtxCreate(&ctx, 0, dev);
  exit_if_failed("cuCtxCreate failed: res = %s\n");

  res = cuModuleLoad(&module, "./cuda_main.cubin");
  exit_if_failed("cuModuleLoad() failed: res = %s\n");

  res = cuModuleGetFunction(&function, module, "cuda_main");
  exit_if_failed("cuModuleGetFunction() failed: res = %s\n");

  /* 
   * preparation for kernel launch
   */
  res = cuFuncSetSharedSize(function, 0x40);  /* just random */
  exit_if_failed("cuFuncSetSharedSize() failed: res = %s\n");

  res = cuFuncSetBlockShape(function, thread_num, thread_num, 1);
  exit_if_failed("cuFuncSetBlockShape() failed: res = %s\n");

}
Exemple #9
0
/*
 * R entry point wrapping cuModuleLoad.
 * Takes the first element of the character vector r_fname as the module
 * file name.  Returns R_cudaErrorInfo(status) when the load fails, otherwise
 * a reference to the loaded module tagged "CUmodule".
 */
SEXP
R_auto_cuModuleLoad(SEXP r_fname)
{
    const char *path = CHAR(STRING_ELT(r_fname, 0));
    CUmodule handle;
    CUresult status = cuModuleLoad(&handle, path);

    /* any non-zero CUresult is an error */
    if(status != 0)
       return(R_cudaErrorInfo(status));

    return(R_createRef(handle, "CUmodule"));
}
Exemple #10
0
  // Load an already-built module from `filename` and look up the function
  // named `functionName_` in it, storing both handles in this kernel's data.
  // Returns this kernel so calls can be chained.
  kernel_t<CUDA>* kernel_t<CUDA>::buildFromBinary(const std::string &filename,
                                                 const std::string &functionName_){
    OCCA_EXTRACT_DATA(CUDA, Kernel);

    functionName = functionName_;

    // Common prefix of the two diagnostics handed to OCCA_CUDA_CHECK
    const std::string checkPrefix = "Kernel (" + functionName + ") : ";

    OCCA_CUDA_CHECK(checkPrefix + "Loading Module",
                    cuModuleLoad(&data_.module, filename.c_str()));

    OCCA_CUDA_CHECK(checkPrefix + "Loading Function",
                    cuModuleGetFunction(&data_.function, data_.module, functionName.c_str()));

    return this;
  }
Exemple #11
0
int
main()
{
  CUresult result;
  result = cuInit(0);
  CUdevice device;
  result = cuDeviceGet(&device, 0);
  CUcontext ctx;
  result = cuCtxCreate(&ctx, 0, device);
  CUmodule module;
  result = cuModuleLoad(&module, "cuda-shift-throughput.cubin");
  CUfunction kernel;
  result = cuModuleGetFunction(&kernel, module, "kernel");
  int block;
  result = cuFuncGetAttribute(&block,
                              CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                              kernel);
  int grid = 1024 * 1024;
  CUevent event[2];
  for (ptrdiff_t i = 0; i < 2; ++i) {
    result = cuEventCreate(&event[i], 0);
  }
  result = cuEventRecord(event[0], 0);
  result = cuLaunchKernel(kernel, grid, 1, 1, block, 1, 1, 0, 0, 0, 0);
  result = cuEventRecord(event[1], 0);
  result = cuEventSynchronize(event[1]);
  float time;
  result = cuEventElapsedTime(&time, event[0], event[1]);
  int gpuclock;
  result =
    cuDeviceGetAttribute(&gpuclock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device);
  int gpump;
  result =
    cuDeviceGetAttribute(&gpump, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
                         device);
  std::printf("Clock: %d KHz, # of MPs: %d\n", gpuclock, gpump);
  std::printf("Elapsed Time: %f milliseconds\n", time);
  std::printf("# of Threads: %d, # of SHLs : %lld\n", block,
              1024ll * block * grid);
  std::printf("Throughput: %f\n",
              1024.0 * block * grid / ((double) gpump * gpuclock * time));
  for (ptrdiff_t i = 0; i < 2; ++i) {
    result = cuEventDestroy(event[i]);
  }
  result = cuModuleUnload(module);
  result = cuCtxDestroy(ctx);
  return 0;
}
// Initialize the driver, create a context on device number `device`, load
// "PTXTestFunctions.o.ptx" into *phModule and create a stream in *phStream.
//
// Every step is checked; on failure a message is printed and the process
// exits with a non-zero status.  On return the result is therefore always
// CUDA_SUCCESS.
CUresult initialize(int device, CUcontext *phContext, CUdevice *phDevice, CUmodule *phModule, CUstream *phStream)
{
  // Initialize the device and create the context
  CUresult status = cuInit(0);
  if (status != CUDA_SUCCESS)
    {std::cout << "ERROR: could not initialize the driver: " << status << "\n"; exit(1);}

  status = cuDeviceGet(phDevice, device);
  if (status != CUDA_SUCCESS)
    {std::cout << "ERROR: could not get device: " << status << "\n"; exit(1);}

  status = cuCtxCreate(phContext, 0, *phDevice);
  if (status != CUDA_SUCCESS)
    {std::cout << "ERROR: could not create context\n"; exit(1);}

  status = cuModuleLoad(phModule, "PTXTestFunctions.o.ptx");
  if (status != CUDA_SUCCESS)
    {std::cout << "ERROR: could not load .ptx module: " << status << "\n"; exit(1);}

  // Create stream
  status = cuStreamCreate(phStream, 0);
  if (status != CUDA_SUCCESS)
    {printf("ERROR: during stream creation\n"); exit(1);}

  return status;
}
int main(int argc, char ** argv)
{
	int dev_count = 0;

	CUdevice   device;
	CUcontext  context;
	CUmodule   module;
	CUfunction function;

	cuInit(0);

	cuDeviceGetCount(&dev_count);

	if (dev_count < 1) return -1;

	cuDeviceGet( &device, 0 );
	cuCtxCreate( &context, 0, device );
	
	cuModuleLoad( &module, "hello.cuda_runtime.ptx" );
	cuModuleGetFunction( &function, module, "_Z6kernelPf" );

	int N = 512;
	CUdeviceptr pData;
	cuMemAlloc( &pData, N * sizeof(float) );
	cuFuncSetBlockShape( function, N, 1, 1 );
	cuParamSeti( function, 0, pData );
	cuParamSetSize( function, 4 );

	cuLaunchGrid( function, 1, 1 );

	float * pHostData = new float[N];

	cuMemcpyDtoH( pHostData, pData, N * sizeof( float) );

	cuMemFree( pData );

	delete [] pHostData;

	return 0;
}
Exemple #14
0
/*
 * Initialise the CUDA driver on device 0 and fill device_info with the
 * device, a new context, the module MODULE_FILE_NAME and the kernel
 * KERNEL_NAME looked up in it.
 *
 * Returns 0 on success and -1 on the first failure, after printing a
 * diagnostic.  Anything created before the failing step is released again,
 * so a failed call leaves no context or module behind.
 */
int madd_gpu_init(struct device_info *device_info)
{
	CUresult res;

	/* printf("madd_gpu_init called.\n"); */

	/* Initialization */
	if ((res = cuInit(0)) != CUDA_SUCCESS) {
		printf("cuInit failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	if ((res = cuDeviceGet(&device_info->dev, 0)) != CUDA_SUCCESS) {
		printf("cuDeviceGet failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	if ((res = cuCtxCreate(&device_info->context, 0, device_info->dev)) !=
	 CUDA_SUCCESS) {
		printf("cuCtxCreate failed: res = %lu\n", (unsigned long)res);
		return -1;
	}

	/* binary files are located in the same directory as the source code */
	if ((res = cuModuleLoad(&device_info->module, MODULE_FILE_NAME)) !=
	 CUDA_SUCCESS) {
		printf("cuModuleLoad() failed\n");
		/* undo the context creation */
		cuCtxDestroy(device_info->context);
		return -1;
	}

	if ((res = cuModuleGetFunction(&device_info->kernel, 
	 device_info->module, KERNEL_NAME)) != CUDA_SUCCESS) {
		printf("cuModuleGetFunction() failed\n");
		/* undo module load and context creation, in reverse order */
		cuModuleUnload(device_info->module);
		cuCtxDestroy(device_info->context);
		return -1;
	}

	return 0;
}
Exemple #15
0
  // Build this kernel from the source file `filename`.
  //
  // If the cached binary for (filename, info) already exists on disk it is
  // loaded through buildFromBinary. Otherwise the source is compiled twice
  // with the device compiler (once to an object file with verbose ptxas
  // output, once to PTX written to the cache path), and the resulting PTX is
  // loaded with cuModuleLoad and `functionName_` is looked up in it.
  //
  // Returns this kernel. Throws 1 when the PTX compile command fails.
  kernel_t<CUDA>* kernel_t<CUDA>::buildFromSource(const std::string &filename,
                                                  const std::string &functionName_,
                                                  const kernelInfo &info_){
    OCCA_EXTRACT_DATA(CUDA, Kernel);

    functionName = functionName_;

    kernelInfo info = info_;
    std::string cachedBinary = getCachedBinaryName(filename, info);

    // Fast path: a cached binary is already on disk
    struct stat buffer;
    const bool fileExists = (stat(cachedBinary.c_str(), &buffer) == 0);

    if(fileExists){
      std::cout << "Found cached binary of [" << filename << "] in [" << cachedBinary << "]\n";
      return buildFromBinary(cachedBinary, functionName);
    }

    // NOTE(review): haveFile/waitForFile/releaseFile are defined elsewhere;
    // presumably haveFile claims the right to build cachedBinary and a false
    // result means another builder holds it, so we wait for its output - confirm.
    if(!haveFile(cachedBinary)){
      waitForFile(cachedBinary);

      return buildFromBinary(cachedBinary, functionName);
    }

    std::string iCachedBinary = createIntermediateSource(filename,
                                                         cachedBinary,
                                                         info);

    std::string libPath, soname;
    getFilePrefixAndName(cachedBinary, libPath, soname);

    // Object file produced by the first (check) compile: <libPath>o_<soname>.o
    std::string oCachedBinary = libPath + "o_" + soname + ".o";

    std::string archSM = "";

    // Only add an -arch=sm_XY flag when the user's flags do not already set one
    if(dev->dHandle->compilerFlags.find("-arch=sm_") == std::string::npos){
      std::stringstream archSM_;

      int major, minor;
      OCCA_CUDA_CHECK("Kernel (" + functionName + ") : Getting CUDA Device Arch",
                      cuDeviceComputeCapability(&major, &minor, data_.device) );

      archSM_ << " -arch=sm_" << major << minor << ' ';

      archSM = archSM_.str();
    }

    std::stringstream command;

    //---[ PTX Check Command ]----------
    if(dev->dHandle->compilerEnvScript.size())
      command << dev->dHandle->compilerEnvScript << " && ";

    command << dev->dHandle->compiler
            << ' '          << dev->dHandle->compilerFlags
            << archSM
            << " -Xptxas -v,-dlcm=cg,-abi=no"
            << ' '          << info.flags
            << " -x cu -c " << iCachedBinary
            << " -o "       << oCachedBinary;

    // Binding a const reference keeps the temporary from str() alive for the
    // rest of this scope
    const std::string &ptxCommand = command.str();

    std::cout << "Compiling [" << functionName << "]\n" << ptxCommand << "\n";

    // On the non-Linux/OSX branch the whole command is wrapped in quotes
    // before being handed to system()
#if (OCCA_OS == LINUX_OS) || (OCCA_OS == OSX_OS)
    const int ptxError = system(ptxCommand.c_str());
#else
    const int ptxError = system(("\"" +  ptxCommand + "\"").c_str());
#endif

    // NOTE(review): ptxError is never examined; a failure of the check
    // compile is ignored (the handling below is commented out).
    // Not needed here I guess
    // if(ptxError){
    //   releaseFile(cachedBinary);
    //   throw 1;
    // }

    //---[ Compiling Command ]----------
    command.str("");

    // NOTE(review): unlike the check command above, this one is not prefixed
    // with compilerEnvScript - confirm that is intended.
    command << dev->dHandle->compiler
            << " -o "       << cachedBinary
            << " -ptx -I."
            << ' '          << dev->dHandle->compilerFlags
            << archSM
            << ' '          << info.flags
            << " -x cu "    << iCachedBinary;

    const std::string &sCommand = command.str();

    std::cout << sCommand << '\n';

    const int compileError = system(sCommand.c_str());

    if(compileError){
      releaseFile(cachedBinary);
      throw 1;
    }

    const CUresult moduleLoadError = cuModuleLoad(&data_.module,
                                                  cachedBinary.c_str());

    // releaseFile is called before the check macro on each error path.
    // NOTE(review): if OCCA_CUDA_CHECK returns normally on error, releaseFile
    // would run again at the end of the function - confirm the macro does
    // not return in that case.
    if(moduleLoadError)
      releaseFile(cachedBinary);

    OCCA_CUDA_CHECK("Kernel (" + functionName + ") : Loading Module",
                    moduleLoadError);

    const CUresult moduleGetFunctionError = cuModuleGetFunction(&data_.function,
                                                                data_.module,
                                                                functionName.c_str());

    if(moduleGetFunctionError)
      releaseFile(cachedBinary);

    OCCA_CUDA_CHECK("Kernel (" + functionName + ") : Loading Function",
                    moduleGetFunctionError);

    releaseFile(cachedBinary);

    return this;
  }
Exemple #16
0
/*
 * Launch a CUDA kernel with explicit block and grid dimensions.
 *
 * argv[0 .. argcv[0]-1] are the kernel arguments; argv[argcv[0]] is a
 * descriptor string "<module path> <type> <type> ..." with one
 * space-separated type per argument: "f*" for a CudaFloatArray copied to and
 * from the device, "f..." for a float scalar (read from a double) and "i..."
 * for an int scalar.  The next four argv entries are blockWidth,
 * blockHeight, gridWidth and gridHeight.
 *
 * The kernel name is "block" followed by the module path with its leading
 * "_cuda/" (six characters) skipped and everything from the first '.'
 * removed.  After the launch only the "f*" arguments are copied back and
 * freed.  The context and module created here are released before
 * returning.
 *
 * NOTE(review): the error paths assume raiseError() and a failing
 * errcheck() do not return - confirm.
 */
Object cuda_using_do_blockWidth_blockHeight_gridWidth_gridHeight(Object self, int nparts, int *argcv,
        Object *argv, int flags) {
    int nargs = argcv[0];
    int deviceCount = 0;
    /* A failing cuInit leaves deviceCount at 0 and is reported just below. */
    cuInit(0);
    cuDeviceGetCount(&deviceCount);
    if (deviceCount == 0) {
        raiseError("No CUDA devices found");
    }
    CUdevice cuDevice;
    CUcontext cuContext;
    CUmodule cuModule;
    CUfunction cuFunc;
    errcheck(cuDeviceGet(&cuDevice, 0));
    errcheck(cuCtxCreate(&cuContext, 0, cuDevice));
    // do through gridWidth only have one argument each
    int argOffset = nargs + 1;
    int blockDimX = integerfromAny(argv[argOffset++]);
    int blockDimY = integerfromAny(argv[argOffset++]);
    int gridDimX = integerfromAny(argv[argOffset++]);
    int gridDimY = integerfromAny(argv[argOffset++]);

    char *tmp = grcstring(argv[nargs]);
    size_t descLen = strlen(tmp);
    char argStr[descLen + 1];
    strcpy(argStr, tmp);
    char *tmp2 = strtok(argStr, " ");
    char blockname[128];
    blockname[0] = 0;
    if (tmp2 == NULL || strlen(tmp2) >= sizeof(blockname)) {
        raiseError("CUDA block descriptor has a missing or overlong module path");
    } else {
        strcpy(blockname, tmp2);
        /* Put back the separator strtok overwrote so the descriptor can be
         * tokenised again from the start below; leave the terminator alone
         * when the path was the last token. */
        size_t tokEnd = (size_t)(tmp2 - argStr) + strlen(tmp2);
        if (tokEnd < descLen)
            argStr[tokEnd] = ' ';
    }
    errcheck(cuModuleLoad(&cuModule, blockname));
    CUdeviceptr dps[nargs];
    float floats[nargs];
    void *args[nargs];
    int ints[nargs];
    int onDevice[nargs]; /* 1 where dps[i] holds a live device allocation */
    /* Restart tokenising and skip the module path; the type tokens follow. */
    strtok(argStr, " ");
    for (int i=0; i<nargs; i++) {
        char *argType = strtok(NULL, " ");
        onDevice[i] = 0;
        if (argType == NULL) {
            raiseError("CUDA argument type list is shorter than the argument list");
        } else if (argType[0] == 'f' && argType[1] == '*') {
            struct CudaFloatArray *a = (struct CudaFloatArray *)argv[i];
            errcheck(cuMemAlloc(&dps[i], a->size * sizeof(float)));
            errcheck(cuMemcpyHtoD(dps[i], &a->data, a->size * sizeof(float)));
            args[i] = &dps[i];
            onDevice[i] = 1;
        } else if (argType[0] == 'f') {
            floats[i] = (float)*((double *)(argv[i]->data));
            args[i] = &floats[i];
        } else if (argType[0] == 'i') {
            ints[i] = integerfromAny(argv[i]);
            args[i] = &ints[i];
        } else {
            // Fail
            char buf[256];
            snprintf(buf, sizeof(buf), "CUDA argument cannot be coerced. This shouldn't happen. Argument string: %s\n", argType);
            raiseError(buf);
        }
    }

    /* Build the kernel name with explicit bounds checks on name[]. */
    char name[256];
    name[0] = 0;
    size_t skip = strlen("_cuda/");
    if (strlen(blockname) < skip
            || strlen("block") + strlen(blockname + skip) >= sizeof(name)) {
        raiseError("CUDA module path is malformed or too long");
    } else {
        strcpy(name, "block");
        strcat(name, blockname + skip);
        for (int i=0; name[i] != 0; i++)
            if (name[i] == '.') {
                name[i] = 0;
                break;
            }
    }
    errcheck(cuModuleGetFunction(&cuFunc, cuModule, name));
    errcheck(cuLaunchKernel(cuFunc, gridDimX, gridDimY, 1,
        blockDimX, blockDimY, 1,
        0,
        NULL, args, NULL));
    /* Only array arguments own device memory; scalars were passed by value. */
    for (int i=0; i<nargs; i++) {
        if (!onDevice[i])
            continue;
        struct CudaFloatArray *a = (struct CudaFloatArray *)argv[i];
        errcheck(cuMemcpyDtoH(&a->data, dps[i], a->size * sizeof(float)));
        cuMemFree(dps[i]);
    }
    cuModuleUnload(cuModule);
    cuCtxDestroy(cuContext);
    return alloc_none();
}
Exemple #17
0
/*
 * Multi-process CUDA test.  Creates a context on device 0, allocates `size`
 * bytes on the device, copies a zero-filled host buffer in and loads
 * ./loop_gpu.cubin (the kernel launch itself is currently commented out).
 * While nr_tasks - 1 is non-zero it forks a child that repeats the test
 * with the decremented count, and waits for it.  Finally the device buffer
 * is copied back and every resource is released.
 *
 * Any failure prints a diagnostic and terminates the process with exit(-1).
 */
void test_tasks(unsigned int size, int nr_tasks)
{
	pid_t pid;
	int status;
	CUresult res;
	CUdevice dev;
	CUcontext ctx;
	CUdeviceptr data_addr;
	CUmodule module;
	CUfunction function;
	unsigned int *in, *out;
	unsigned int n = size / 4;

	res = cuInit(0);
	if (res != CUDA_SUCCESS) {
		printf("cuInit failed: res = %u\n", (unsigned int)res);
		exit(-1);
	}

	res = cuDeviceGet(&dev, 0);
	if (res != CUDA_SUCCESS) {
		printf("cuDeviceGet failed: res = %u\n", (unsigned int)res);
		exit(-1);
	}

	res = cuCtxCreate(&ctx, 0, dev);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxCreate failed: res = %u\n", (unsigned int)res);
		exit(-1);
	}

	res = cuMemAlloc(&data_addr, size);
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc failed: res = %u\n", (unsigned int)res);
		exit(-1);
	}

	/* calloc so the upload below never reads uninitialised memory */
	in = (unsigned int *) calloc(1, size);
	out = (unsigned int *) calloc(1, size);
	if (in == NULL || out == NULL) {
		printf("host buffer allocation failed\n");
		exit(-1);
	}
	
	res = cuMemcpyHtoD(data_addr, in, size);
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD failed: res = %u\n", (unsigned int)res);
		exit(-1);
	}

#if 1
	res = cuModuleLoad(&module, "./loop_gpu.cubin");
	if (res != CUDA_SUCCESS) {
		printf("cuModuleLoad() failed\n");
		exit(-1);
	}
	res = cuModuleGetFunction(&function, module, "_Z4loopPjjj");
	if (res != CUDA_SUCCESS) {
		printf("cuModuleGetFunction() failed\n");
		exit(-1);
	}
	
	/* arguments for the (currently disabled) kernel launch */
	void *param1[] = {&data_addr, &size, &n}; 
	//res = cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, (void**)param1, 0);
	if (res != CUDA_SUCCESS) {
		printf("cuLaunchKernel failed: res = %u\n", (unsigned int)res);
		exit(-1);
	}
	//cuCtxSynchronize();
#endif

	if (--nr_tasks) {
		pid = fork();
		if (pid == 0) { /* child */
			test_tasks(size, nr_tasks);
			printf("Child finished\n");
			exit(0);
		}
		else { /* parent */
			waitpid(pid, &status, 0);
		}
	}

#if 0
	res = cuModuleLoad(&module, "./loop_gpu.cubin");
	if (res != CUDA_SUCCESS) {
		printf("cuModuleLoad() failed\n");
		exit(-1);
	}
	res = cuModuleGetFunction(&function, module, "_Z4loopPjjj");
	if (res != CUDA_SUCCESS) {
		printf("cuModuleGetFunction() failed\n");
		exit(-1);
	}
	
	void *param1[] = {&data_addr, &size, &n}; 
	res = cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, (void**)param1, 0);
	if (res != CUDA_SUCCESS) {
		printf("cuLaunchKernel failed: res = %u\n", (unsigned int)res);
		exit(-1);
	}
#endif

	res = cuMemcpyDtoH(out, data_addr, size);
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyDtoH failed: res = %u\n", (unsigned int)res);
		exit(-1);
	}

	res = cuModuleUnload(module);
	if (res != CUDA_SUCCESS) {
		/* the argument is cast to match the %u specifier */
		printf("cuModuleUnload failed: res = %u\n", (unsigned int)res);
		exit(-1);
	}
	
	res = cuMemFree(data_addr);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree failed: res = %u\n", (unsigned int)res);
		exit(-1);
	}
	
	res = cuCtxDestroy(ctx);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxDestroy failed: res = %u\n", (unsigned int)res);
		exit(-1);
	}
	
	free(in);
	free(out);
}
Exemple #18
0
int main() {
  CU_ERROR_CHECK(cuInit(0));

  int count;
  CU_ERROR_CHECK(cuDeviceGetCount(&count));

  count = (count > 2) ? 2 : count;

  CUdevice devices[count];
  for (int i = 0; i < count; i++)
    CU_ERROR_CHECK(cuDeviceGet(&devices[i], i));

  // Question 1:  Can you create multiple contexts on the same device?
  {
    fprintf(stderr, "Attempting to create multiple contexts on each device...\n");
    CUcontext contexts[count * N];
    size_t j = 0;
    for (int i = 0; i < count; i++) {
      CUresult error = CUDA_SUCCESS;
      size_t k;
      for (k = 0; k < N && error == CUDA_SUCCESS; k++) {
        error = cuCtxCreate(&contexts[j], CU_CTX_SCHED_AUTO, devices[i]);
        if (error == CUDA_SUCCESS)
          CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[j++]));
      }
      fprintf(stderr, "  created %zu contexts on device %d before cuCtxCreate returned \"%s\"\n", (k - 1), i, cuGetErrorString(error));
    }

    CUresult error = CUDA_SUCCESS;
    size_t k;
    for (k = 0; k < j && error == CUDA_SUCCESS; k++)
      error = cuCtxPushCurrent(contexts[k]);
    if (error == CUDA_SUCCESS)
      fprintf(stderr, "  successfully pushed %zu contexts with cuCtxPushCurrent\n", k);
    else
      fprintf(stderr, "  pushed %zu contexts before cuCtxPushCurrent returned \"%s\"\n", (k - 1), cuGetErrorString(error));

    for (size_t k = 0; k < j; k++)
      CU_ERROR_CHECK(cuCtxDestroy(contexts[k]));

    fprintf(stderr, "\n");
  }

  CUcontext contexts[count][2];
  for (int i = 0; i < count; i++) {
    for (size_t j = 0; j < 2; j++) {
      CU_ERROR_CHECK(cuCtxCreate(&contexts[i][j], CU_CTX_SCHED_AUTO, devices[i]));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[i][j]));
    }
  }

  // Question 2:  Can you access a host pointer in a different context from
  // which it was created?
  // Question 3:  Can you free a host pointer in a different context from which
  // it was created?
  {
    void * hPtr;
    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
    CU_ERROR_CHECK(cuMemAllocHost(&hPtr, 1024));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));

    CUdeviceptr dPtr[count];
    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
    CU_ERROR_CHECK(cuMemAlloc(&dPtr[0], 1024)); // Different context, same device
    fprintf(stderr, "Accessing a host pointer from a different context to which it was allocated (on the same device) returns \"%s\"\n", cuGetErrorString(cuMemcpyHtoD(dPtr[0], hPtr, 1024)));
    CU_ERROR_CHECK(cuMemFree(dPtr[0]));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));
    if (count > 1) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0]));
      CU_ERROR_CHECK(cuMemAlloc(&dPtr[1], 1024)); // Different context, different device
      fprintf(stderr, "Accessing a host pointer from a different context to which it was allocated (on a different device) returns \"%s\"\n", cuGetErrorString(cuMemcpyHtoD(dPtr[1], hPtr, 1024)));
    CU_ERROR_CHECK(cuMemFree(dPtr[1]));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0]));
    }

    fprintf(stderr, "\n");

    CUresult error = CUDA_ERROR_UNKNOWN;
    if (count > 1) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0]));
      error = cuMemFreeHost(hPtr);
      fprintf(stderr, "Freeing a host pointer from a different context to which it was allocated (on a different device) returns \"%s\"\n", cuGetErrorString(error));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
      error = cuMemFreeHost(hPtr);
      fprintf(stderr, "Freeing a host pointer from a different context to which it was allocated (on the same device) returns \"%s\"\n", cuGetErrorString(error));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
      error = cuMemFreeHost(hPtr);
      fprintf(stderr, "Freeing a host pointer from the same context to which it was allocated returns \"%s\"\n", cuGetErrorString(error));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
    }

    fprintf(stderr, "\n");
  }

  // Question 4:  Can you access a device pointer in a different context from
  // which it was created?
  // Question 5:  Can you free a device pointer in a different context from which
  // it was created?
  {
    CUdeviceptr dPtr[count][2];
    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
    CU_ERROR_CHECK(cuMemAlloc(&dPtr[0][0], 1024));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
    CU_ERROR_CHECK(cuMemAlloc(&dPtr[0][1], 1024));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));

    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
    fprintf(stderr, "Accessing a device pointer from a different context to which it was allocated (on the same device) returns \"%s\"\n", cuGetErrorString(cuMemcpyDtoD(dPtr[0][0], dPtr[0][1], 1024)));
    CU_ERROR_CHECK(cuMemFree(dPtr[0][1]));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));

    if (count > 1) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0]));
      CU_ERROR_CHECK(cuMemAlloc(&dPtr[1][0], 1024)); // Different context, different device
      fprintf(stderr, "Accessing a device pointer from a different context to which it was allocated (on a different device) returns \"%s\"\n", cuGetErrorString(cuMemcpyDtoD(dPtr[0][0], dPtr[1][0], 1024)));
      CU_ERROR_CHECK(cuMemFree(dPtr[1][0]));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0]));
    }

    fprintf(stderr, "\n");

    CUresult error = CUDA_ERROR_UNKNOWN;
    if (count > 1) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0]));
      error = cuMemFree(dPtr[0][0]);
      fprintf(stderr, "Freeing a device pointer from a different context to which it was allocated (on a different device) returns \"%s\"\n", cuGetErrorString(error));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
      error = cuMemFree(dPtr[0][0]);
      fprintf(stderr, "Freeing a device pointer from a different context to which it was allocated (on the same device) returns \"%s\"\n", cuGetErrorString(error));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
      error = cuMemFree(dPtr[0][0]);
      fprintf(stderr, "Freeing a device pointer from the same context to which it was allocated returns \"%s\"\n", cuGetErrorString(error));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
    }

    fprintf(stderr, "\n");
  }

  // Question 6:  Can you access a module in a different context from which it
  // was loaded?
  // Question 7:  Can you unload a module in a different context from which it
  // was loaded?
  {
    CUmodule module;
    CUdeviceptr ptr;
    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
    CU_ERROR_CHECK(cuModuleLoad(&module,  "kernel-test.ptx"));
    CU_ERROR_CHECK(cuMemAlloc(&ptr, sizeof(float)));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));

    CUfunction function = 0;
    if (count > 0) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0]));
      fprintf(stderr, "Getting a function pointer from a different context to which it was loaded (on a different device) returns \"%s\"\n", cuGetErrorString(cuModuleGetFunction(&function, module, "kernel")));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0]));
    }
    if (function == 0) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
      fprintf(stderr, "Getting a function pointer from a different context to which it was loaded (on the same device) returns \"%s\"\n", cuGetErrorString(cuModuleGetFunction(&function, module, "kernel")));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));
    }
    if (function == 0) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
      fprintf(stderr, "Getting a function pointer from the same context to which it was loaded returns \"%s\"\n", cuGetErrorString(cuModuleGetFunction(&function, module, "kernel")));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
    }

    fprintf(stderr, "\n");

    CUdeviceptr a, b;
    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
    CU_ERROR_CHECK(cuMemAlloc(&a, sizeof(float)));
    CU_ERROR_CHECK(cuMemAlloc(&b, sizeof(float)));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
    void * params[] = { &a, & b };

    CUresult error = CUDA_ERROR_UNKNOWN;
    if (count > 0) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0]));
      fprintf(stderr, "Launching a function from a different context to which it was loaded (on a different device) returns \"%s\"\n", cuGetErrorString(error = cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL)));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
      fprintf(stderr, "Launching a function from a different context to which it was loaded (on the same device) returns \"%s\"\n", cuGetErrorString(error = cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL)));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
      fprintf(stderr, "Launching a function from the same context to which it was loaded returns \"%s\"\n", cuGetErrorString(error = cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL)));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
    }

    fprintf(stderr, "\n");

    error = CUDA_ERROR_UNKNOWN;
    if (count > 0) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[1][0]));
      fprintf(stderr, "Unloading a module from a different context to which it was loaded (on a different device) returns \"%s\"\n", cuGetErrorString(error = cuModuleUnload(module)));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[1][0]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][1]));
      fprintf(stderr, "Unloading a module from a different context to which it was loaded (on the same device) returns \"%s\"\n", cuGetErrorString(error = cuModuleUnload(module)));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][1]));
    }
    if (error != CUDA_SUCCESS) {
      CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
      fprintf(stderr, "Unloading a module from the same context to which it was loaded returns \"%s\"\n", cuGetErrorString(error = cuModuleUnload(module)));
      CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
    }

    CU_ERROR_CHECK(cuCtxPushCurrent(contexts[0][0]));
    CU_ERROR_CHECK(cuMemFree(a));
    CU_ERROR_CHECK(cuMemFree(b));
    CU_ERROR_CHECK(cuCtxPopCurrent(&contexts[0][0]));
  }

  for (int i = 0; i < count; i++) {
    for (size_t j = 0; j < 2; j++)
      CU_ERROR_CHECK(cuCtxDestroy(contexts[i][j]));
  }

  return 0;
}
Exemple #19
0
/*
 * Matrix-addition benchmark that touches device buffers through host mappings.
 *
 * Loads "<path>/madd_gpu.cubin", allocates three n x n unsigned-int device
 * buffers, maps each one into the host address space with cuMemMap(), fills
 * a[] and b[] through those mappings (no explicit host-to-device copy is
 * issued, so the "HtoD" phase is empty), launches the kernel "_Z3addPjS_S_j",
 * copies c[] back through its mapping with memcpy() and prints the duration
 * of every phase in milliseconds.
 *
 * n:    matrix dimension; must be non-zero.
 * path: directory that contains madd_gpu.cubin.
 *
 * Returns 0 on success and -1 on any failure, after printing a diagnostic.
 * On failure the host result buffer is freed and, once the context exists,
 * the context is destroyed so that nothing is leaked to the caller.
 */
int cuda_test_madd_vmmap_hybrid(unsigned int n, char *path)
{
	unsigned int i, j;
	size_t idx;
	/* Computed in size_t so that n*n cannot wrap in unsigned int. */
	const size_t bytes = (size_t)n * n * sizeof(unsigned int);
	CUresult res;
	CUdevice dev;
	CUcontext ctx;
	CUfunction function;
	CUmodule module;
	CUdeviceptr a_dev, b_dev, c_dev;
	unsigned int *a_buf, *b_buf, *c_buf;
	unsigned long long int a_phys, b_phys, c_phys;
	unsigned int *c;
	int block_x, block_y, grid_x, grid_y;
	char fname[256];
	int len;
	struct timeval tv;
	struct timeval tv_total_start, tv_total_end;
	float total;
	struct timeval tv_h2d_start, tv_h2d_end;
	float h2d;
	struct timeval tv_d2h_start, tv_d2h_end;
	float d2h;
	struct timeval tv_exec_start, tv_exec_end;
	struct timeval tv_mem_alloc_start;
	struct timeval tv_data_init_start;
	float data_init;
	struct timeval tv_conf_kern_start;
	struct timeval tv_close_start;
	float mem_alloc;
	float exec;
	float init_gpu;
	float configure_kernel;
	float close_gpu;
	float data_read;

	/* volatile: keeps the timed read-back loop from being optimised away. */
	volatile unsigned int dummy_c = 0;

	/* n == 0 would make block_x zero and divide by zero below. */
	if (n == 0) {
		printf("matrix size must be greater than zero\n");
		return -1;
	}

	c = (unsigned int *) malloc (bytes);
	if (c == NULL) {
		printf("malloc (c) failed\n");
		return -1;
	}

	/* block_x * block_y should not exceed 512. */
	block_x = n < 16 ? n : 16;
	block_y = n < 16 ? n : 16;
	/* Round the grid up so that a partial last block is still launched. */
	grid_x = n / block_x;
	if (n % block_x != 0)
		grid_x++;
	grid_y = n / block_y;
	if (n % block_y != 0)
		grid_y++;

	gettimeofday(&tv_total_start, NULL);

	res = cuInit(0);
	if (res != CUDA_SUCCESS) {
		printf("cuInit failed: res = %lu\n", (unsigned long)res);
		goto fail;
	}

	res = cuDeviceGet(&dev, 0);
	if (res != CUDA_SUCCESS) {
		printf("cuDeviceGet failed: res = %lu\n", (unsigned long)res);
		goto fail;
	}

	res = cuCtxCreate(&ctx, 0, dev);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxCreate failed: res = %lu\n", (unsigned long)res);
		goto fail;
	}

	/* Bounded formatting: a long path must not overflow fname. */
	len = snprintf(fname, sizeof(fname), "%s/madd_gpu.cubin", path);
	if (len < 0 || (size_t)len >= sizeof(fname)) {
		printf("module path is too long\n");
		goto fail_ctx;
	}
	res = cuModuleLoad(&module, fname);
	if (res != CUDA_SUCCESS) {
		printf("cuModuleLoad() failed\n");
		goto fail_ctx;
	}
	res = cuModuleGetFunction(&function, module, "_Z3addPjS_S_j");
	if (res != CUDA_SUCCESS) {
		printf("cuModuleGetFunction() failed\n");
		goto fail_ctx;
	}
	res = cuFuncSetBlockShape(function, block_x, block_y, 1);
	if (res != CUDA_SUCCESS) {
		printf("cuFuncSetBlockShape() failed\n");
		goto fail_ctx;
	}

	gettimeofday(&tv_mem_alloc_start, NULL);

	/* a[] */
	res = cuMemAlloc(&a_dev, bytes);
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc (a) failed\n");
		goto fail_ctx;
	}
	res = cuMemMap((void**)&a_buf, a_dev, bytes);
	if (res != CUDA_SUCCESS) {
		printf("cuMemMap (a) failed\n");
		goto fail_ctx;
	}
	res = cuMemGetPhysAddr(&a_phys, (void*)a_buf);
	if (res != CUDA_SUCCESS) {
		printf("cuMemGetPhysAddress (a) failed\n");
		goto fail_ctx;
	}
	/*printf("a[]: Physical Address 0x%llx\n", a_phys);*/

	/* b[] */
	res = cuMemAlloc(&b_dev, bytes);
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc (b) failed\n");
		goto fail_ctx;
	}
	res = cuMemMap((void**)&b_buf, b_dev, bytes);
	if (res != CUDA_SUCCESS) {
		printf("cuMemMap (b) failed\n");
		goto fail_ctx;
	}
	res = cuMemGetPhysAddr(&b_phys, (void*)b_buf);
	if (res != CUDA_SUCCESS) {
		printf("cuMemGetPhysAddress (b) failed\n");
		goto fail_ctx;
	}
	/*printf("b[]: Physical Address 0x%llx\n", b_phys);*/

	/* c[] */
	res = cuMemAlloc(&c_dev, bytes);
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc (c) failed\n");
		goto fail_ctx;
	}
	res = cuMemMap((void**)&c_buf, c_dev, bytes);
	if (res != CUDA_SUCCESS) {
		printf("cuMemMap (c) failed\n");
		goto fail_ctx;
	}
	res = cuMemGetPhysAddr(&c_phys, (void*)c_buf);
	if (res != CUDA_SUCCESS) {
		printf("cuMemGetPhysAddress (c) failed\n");
		goto fail_ctx;
	}
	/*printf("c[]: Physical Address 0x%llx\n", c_phys);*/

	gettimeofday(&tv_data_init_start, NULL);

	/* initialize A[] & B[]: every element of row i holds the value i */
	for (i = 0; i < n; i++) {
		idx = (size_t)i * n;
		for (j = 0; j < n; j++) {
			a_buf[idx++] = i;
		}
	}
	for (i = 0; i < n; i++) {
		idx = (size_t)i * n;
		for (j = 0; j < n; j++) {
			b_buf[idx++] = i;
		}
	}

	/* No explicit upload happens here; the pair only yields a ~0 "HtoD". */
	gettimeofday(&tv_h2d_start, NULL);
	gettimeofday(&tv_h2d_end, NULL);


	gettimeofday(&tv_conf_kern_start, NULL);

	/* set kernel parameters: each device pointer is passed as two 32-bit halves */
	res = cuParamSeti(function, 0, a_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (a) failed: res = %lu\n", (unsigned long)res);
		goto fail_ctx;
	}
	res = cuParamSeti(function, 4, a_dev >> 32);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (a) failed: res = %lu\n", (unsigned long)res);
		goto fail_ctx;
	}
	res = cuParamSeti(function, 8, b_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (b) failed: res = %lu\n", (unsigned long)res);
		goto fail_ctx;
	}
	res = cuParamSeti(function, 12, b_dev >> 32);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (b) failed: res = %lu\n", (unsigned long)res);
		goto fail_ctx;
	}
	res = cuParamSeti(function, 16, c_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res);
		goto fail_ctx;
	}
	res = cuParamSeti(function, 20, c_dev >> 32);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (c) failed: res = %lu\n", (unsigned long)res);
		goto fail_ctx;
	}
	res = cuParamSeti(function, 24, n);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSeti (n) failed: res = %lu\n", (unsigned long)res);
		goto fail_ctx;
	}
	/* 3 pointers * 8 bytes + 4 bytes for n */
	res = cuParamSetSize(function, 28);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSetSize failed: res = %lu\n", (unsigned long)res);
		goto fail_ctx;
	}

	gettimeofday(&tv_exec_start, NULL);
	/* launch the kernel */
	res = cuLaunchGrid(function, grid_x, grid_y);
	if (res != CUDA_SUCCESS) {
		printf("cuLaunchGrid failed: res = %lu\n", (unsigned long)res);
		goto fail_ctx;
	}
	/* The launch is asynchronous; execution errors only surface here. */
	res = cuCtxSynchronize();
	if (res != CUDA_SUCCESS) {
		printf("cuCtxSynchronize failed: res = %lu\n", (unsigned long)res);
		goto fail_ctx;
	}
	gettimeofday(&tv_exec_end, NULL);


	gettimeofday(&tv_d2h_start, NULL);
	/* download c[] through its host mapping */
	memcpy(c, c_buf, bytes);

	gettimeofday(&tv_d2h_end, NULL);

	/* Read back: touch every result element so the read cost is timed. */
	for (i = 0; i < n; i++) {
		idx = (size_t)i * n;
		for (j = 0; j < n; j++) {
			dummy_c = c[idx++];
		}
	}
	(void)dummy_c;

	gettimeofday(&tv_close_start, NULL);

	res = cuMemUnmap((void*)a_buf);
	if (res != CUDA_SUCCESS) {
		printf("cuMemUnmap (a) failed: res = %lu\n", (unsigned long)res);
		goto fail_ctx;
	}
	res = cuMemFree(a_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree (a) failed: res = %lu\n", (unsigned long)res);
		goto fail_ctx;
	}
	res = cuMemUnmap((void*)b_buf);
	if (res != CUDA_SUCCESS) {
		printf("cuMemUnmap (b) failed: res = %lu\n", (unsigned long)res);
		goto fail_ctx;
	}
	res = cuMemFree(b_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree (b) failed: res = %lu\n", (unsigned long)res);
		goto fail_ctx;
	}
	res = cuMemUnmap((void*)c_buf);
	if (res != CUDA_SUCCESS) {
		printf("cuMemUnmap (c) failed: res = %lu\n", (unsigned long)res);
		goto fail_ctx;
	}
	res = cuMemFree(c_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree (c) failed: res = %lu\n", (unsigned long)res);
		goto fail_ctx;
	}

	res = cuModuleUnload(module);
	if (res != CUDA_SUCCESS) {
		printf("cuModuleUnload failed: res = %lu\n", (unsigned long)res);
		goto fail_ctx;
	}

	res = cuCtxDestroy(ctx);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxDestroy failed: res = %lu\n", (unsigned long)res);
		goto fail;
	}

	gettimeofday(&tv_total_end, NULL);

	/* Convert every interval to milliseconds. */
	tvsub(&tv_mem_alloc_start, &tv_total_start, &tv);
	init_gpu = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	tvsub(&tv_data_init_start, &tv_mem_alloc_start, &tv);
	mem_alloc = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	tvsub(&tv_h2d_start, &tv_data_init_start, &tv);
	data_init = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	tvsub(&tv_h2d_end, &tv_h2d_start, &tv);
	h2d = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	tvsub(&tv_exec_start, &tv_conf_kern_start, &tv);
	configure_kernel = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	tvsub(&tv_exec_end, &tv_exec_start, &tv);
	exec = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	tvsub(&tv_d2h_end, &tv_d2h_start, &tv);
	d2h = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	tvsub(&tv_close_start, &tv_d2h_end, &tv);
	data_read = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	tvsub(&tv_total_end, &tv_close_start, &tv);
	close_gpu = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	tvsub(&tv_total_end, &tv_total_start, &tv);
	total = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	printf("Init: %f\n", init_gpu);
	printf("MemAlloc: %f\n", mem_alloc);
	printf("DataInit: %f\n", data_init);
	printf("HtoD: %f\n", h2d);
	printf("KernConf: %f\n", configure_kernel);
	printf("Exec: %f\n", exec);
	printf("DtoH: %f\n", d2h);
	printf("DataRead: %f\n", data_read);
	printf("Close: %f\n", close_gpu);
	printf("Total: %f\n", total);

	free(c);
	return 0;

fail_ctx:
	/* Best-effort teardown; the original failure has already been reported. */
	cuCtxDestroy(ctx);
fail:
	free(c);
	return -1;
}
Exemple #20
0
int
main (int argc, char **argv)
{
  /* Checks that a kernel launched on a CUDA stream registered with the
     OpenACC runtime is reported as "still running" right after the launch
     and as "finished" once enough wall-clock time has passed.  Any failure
     aborts the process; success exits with status 0.  */
  CUdevice device;
  CUfunction delay_fn;
  CUmodule mod;
  CUresult status;
  CUstream strm;
  unsigned long *host_buf, *dev_buf, delay_ticks;
  int buf_bytes;
  float delay_ms;
  void *kernel_args[2];
  int clock_rate;
  int device_index, sm_count;

  acc_init (acc_device_nvidia);

  device_index = acc_get_device_num (acc_device_nvidia);

  status = cuDeviceGet (&device, device_index);
  if (status != CUDA_SUCCESS) {
    fprintf (stderr, "cuDeviceGet failed: %d\n", status);
    abort ();
  }

  /* Device properties used to size the buffer and the delay.  */
  status = cuDeviceGetAttribute (&sm_count,
				 CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
				 device);
  if (status != CUDA_SUCCESS) {
    fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", status);
    abort ();
  }

  status = cuDeviceGetAttribute (&clock_rate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
				 device);
  if (status != CUDA_SUCCESS) {
    fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", status);
    abort ();
  }

  /* Load the PTX module and look up the "delay" kernel.  */
  status = cuModuleLoad (&mod, "subr.ptx");
  if (status != CUDA_SUCCESS) {
    fprintf (stderr, "cuModuleLoad failed: %d\n", status);
    abort ();
  }

  status = cuModuleGetFunction (&delay_fn, mod, "delay");
  if (status != CUDA_SUCCESS) {
    fprintf (stderr, "cuModuleGetFunction failed: %d\n", status);
    abort ();
  }

  /* One unsigned long per multiprocessor.  */
  buf_bytes = sm_count * sizeof (unsigned long);

  /* Requested delay scaled by the reported clock rate; presumably the
     kernel spins for this many ticks -- confirm against subr.ptx.  */
  delay_ms = 200.0;
  delay_ticks = (unsigned long) (delay_ms * clock_rate);

  host_buf = (unsigned long *) malloc (buf_bytes);
  dev_buf = (unsigned long *) acc_malloc (buf_bytes);

  acc_map_data (host_buf, dev_buf, buf_bytes);

  /* The launch API takes an array of pointers to the argument values.  */
  kernel_args[0] = (void *) &dev_buf;
  kernel_args[1] = (void *) &delay_ticks;

  status = cuStreamCreate (&strm, CU_STREAM_DEFAULT);
  if (status != CUDA_SUCCESS) {
    fprintf (stderr, "cuStreamCreate failed: %d\n", status);
    abort ();
  }

  /* Register the stream with the OpenACC runtime as async queue 0.  */
  if (!acc_set_cuda_stream (0, strm))
    abort ();

  status = cuLaunchKernel (delay_fn, 1, 1, 1, 1, 1, 1, 0, strm,
			   kernel_args, 0);
  if (status != CUDA_SUCCESS) {
    fprintf (stderr, "cuLaunchKernel failed: %d\n", status);
    abort ();
  }

  /* Immediately after the launch the work must still be pending.  */
  if (acc_async_test_all () != 0) {
    fprintf (stderr, "asynchronous operation not running\n");
    abort ();
  }

  /* Wait longer than the requested delay (rounded up to whole seconds).  */
  sleep ((int) (delay_ms / 1000.f) + 1);

  /* By now every asynchronous operation must have completed.  */
  if (acc_async_test_all () != 1) {
    fprintf (stderr, "found asynchronous operation still running\n");
    abort ();
  }

  acc_unmap_data (host_buf);

  free (host_buf);
  acc_free (dev_buf);

  acc_shutdown (acc_device_nvidia);

  exit (0);
}
// Constructs a module wrapper by loading the given cubin file into m_module.
// staticInit() runs first; the result of cuModuleLoad is handed to
// checkError together with the name of the call that produced it.
CudaModule::CudaModule(const std::string& cubinFile)
{
  staticInit();

  const char* const modulePath = cubinFile.c_str();
  checkError("cuModuleLoad", cuModuleLoad(&m_module, modulePath));
}
int main( int argc, char **argv )
{                               
    CUdevice   main_device  = 0;
    CUcontext  main_context = nullptr;
                             
    CUmodule   mod_vectorAdd = nullptr;
    CUfunction fun_vectorAdd = nullptr;

    std::string path_vectorAdd( "D:/devel/vectoradd-cuda-driverAPI/vectorAdd.cu" );
    std::string ptx_vectorAdd ( "D:/devel/vectoradd-cuda-driverAPI/vectorAdd.ptx" );

    CUdeviceptr input_data  = 0u;
    CUdeviceptr output_data = 0u;

    std::size_t problem_size = 1024;
    try
    {
        //Initialize the driver API
        check_error( cuInit( 0u ) );
             
        {
            int device_count = 0u;
            check_error( cuDeviceGetCount( &device_count ) );

            if( ! device_count )
            {
                std::cerr << "No CUDA devices available" << std::endl;
                throw CUDA_ERROR_NO_DEVICE;
            }
        }

        check_error( cuDeviceGet( &main_device, 0 ) );
        check_error( cuCtxCreate( &main_context, 0, main_device ) );

        //Try to manually compile the source file
        {
            std::stringstream build_command;
            build_command <<
                "nvcc "
                "-ptx "
                "-o " << ptx_vectorAdd << " " <<
                path_vectorAdd;

            if( int build_status = system( build_command.str( ).c_str( ) ) )
            {            
                std::cerr << "Failed to compile source cuda file into a ptx assembly" << std::endl;
                throw CUDA_ERROR_UNKNOWN;
            }

            //Find module entry with assembly
            std::string str_assembly;

            {
                std::ifstream fassembly( ptx_vectorAdd ); 
                if( !fassembly.is_open( ) )
                {           
                    std::cerr << "'Vector Add' assembly unavailable" << std::endl;
                    throw CUDA_ERROR_FILE_NOT_FOUND;
                }         


                fassembly.seekg (0, std::ios::end);
                str_assembly.resize( std::string::size_type( fassembly.tellg() ) );

                fassembly.seekg (0, std::ios::beg);
                fassembly.read( &str_assembly[0], str_assembly.size( ) );

                fassembly.close( );
            }

            auto entry_pos = str_assembly.find( ".entry" );
            if( entry_pos == std::string::npos )
            {       
                std::cerr << "No entry point in 'Vector Add'" << std::endl;
                throw CUDA_ERROR_INVALID_SOURCE;
            }
              
            entry_pos += 6u; //".entry".size( )

            auto search_limit = str_assembly.find_first_of( " (", entry_pos );
            if( search_limit == std::string::npos )
            {       
                std::cerr << "No entry point in 'Vector Add'" << std::endl;
                throw CUDA_ERROR_INVALID_SOURCE;
            }
              
            std::string funcName( str_assembly.substr( entry_pos, search_limit ) );

            check_error( cuModuleLoad        ( &mod_vectorAdd, ptx_vectorAdd.c_str( ) ) );
            check_error( cuModuleGetFunction ( &fun_vectorAdd, mod_vectorAdd, funcName.c_str( ) ) );
        }

        //Play with buffer
        cuMemAlloc( &input_data,  problem_size * sizeof( float ) );
        cuMemAlloc( &output_data, problem_size * sizeof( float ) );

        {             
            int threadsPerBlock = 256;
            int blocksPerGrid   = (problem_size + threadsPerBlock - 1) / threadsPerBlock;

            void* args[] = { &input_data, &output_data, &problem_size };

            cuLaunchKernel(
                fun_vectorAdd,
                blocksPerGrid, 1, 1,
                threadsPerBlock, 1, 1,
                0, 
                0, 
                args,
                nullptr);
        }

        float* result = new float[problem_size];

        cuMemcpyDtoH( result, output_data, problem_size * sizeof( float ) );

        std::copy(
            result, result + problem_size,
            std::ostream_iterator<float>(std::cout, ", ") );  

        delete[] result;
                         
        if( output_data )   cuMemFree      ( output_data );
        if( input_data )    cuMemFree      ( input_data );
        if( mod_vectorAdd ) cuModuleUnload ( mod_vectorAdd );
        if( main_context )  cuCtxDestroy   ( main_context );
    } 
    catch( int return_code )
    {
        if( output_data )   cuMemFree      ( output_data );
        if( input_data )    cuMemFree      ( input_data );
        if( mod_vectorAdd ) cuModuleUnload ( mod_vectorAdd );
        if( main_context )  cuCtxDestroy   ( main_context );

        system("PAUSE");
        return return_code;
Exemple #23
0
/*
 * Float matrix-addition test.
 *
 * Fills two n x n host matrices, loads "<path>/fmadd_gpu.cubin", uploads the
 * inputs, launches the kernel "_Z3addPfS_S_i", downloads the result, checks
 * every element against a[idx] + b[idx] computed on the host and prints the
 * upload, download, execution and total times in milliseconds.
 *
 * n:    matrix dimension; must be non-zero.
 * path: directory that contains fmadd_gpu.cubin.
 *
 * Returns 0 when every element matches and -1 on any failure or mismatch,
 * after printing a diagnostic.  Host buffers are always released and, once
 * the context exists, it is destroyed on the failure paths as well.
 */
int cuda_test_fmadd(unsigned int n, char *path)
{
	unsigned int i, j;
	size_t idx;
	/* Computed in size_t so that n*n cannot wrap in unsigned int. */
	const size_t bytes = (size_t)n * n * sizeof(float);
	CUresult res;
	CUdevice dev;
	CUcontext ctx;
	CUfunction function;
	CUmodule module;
	CUdeviceptr a_dev, b_dev, c_dev;
	float *a = NULL;
	float *b = NULL;
	float *c = NULL;
	int block_x, block_y, grid_x, grid_y;
	int offset;
	int len;
	char fname[256];
	struct timeval tv;
	struct timeval tv_total_start, tv_total_end;
	float total;
	struct timeval tv_h2d_start, tv_h2d_end;
	float h2d;
	struct timeval tv_d2h_start, tv_d2h_end;
	float d2h;
	struct timeval tv_exec_start, tv_exec_end;
	float exec;

	/* n == 0 would make block_x zero and divide by zero below. */
	if (n == 0) {
		printf("matrix size must be greater than zero\n");
		return -1;
	}

	a = (float *) malloc (bytes);
	b = (float *) malloc (bytes);
	c = (float *) malloc (bytes);
	if (a == NULL || b == NULL || c == NULL) {
		printf("malloc failed\n");
		goto fail;
	}

	/* initialize A[] & B[] */
	for (i = 0; i < n; i++) {
		for (j = 0; j < n; j++) {
			idx = (size_t)i * n + j;
			a[idx] = i + 0.1;
			b[idx] = i + 0.1;
		}
	}

	/* block_x * block_y should not exceed 512. */
	block_x = n < 16 ? n : 16;
	block_y = n < 16 ? n : 16;
	/* Round the grid up so that a partial last block is still launched. */
	grid_x = n / block_x;
	if (n % block_x != 0)
		grid_x++;
	grid_y = n / block_y;
	if (n % block_y != 0)
		grid_y++;
	printf("block = (%d, %d)\n", block_x, block_y);
	printf("grid = (%d, %d)\n", grid_x, grid_y);

	gettimeofday(&tv_total_start, NULL);

	res = cuInit(0);
	if (res != CUDA_SUCCESS) {
		printf("cuInit failed: res = %lu\n", (unsigned long)res);
		goto fail;
	}

	res = cuDeviceGet(&dev, 0);
	if (res != CUDA_SUCCESS) {
		printf("cuDeviceGet failed: res = %lu\n", (unsigned long)res);
		goto fail;
	}

	res = cuCtxCreate(&ctx, 0, dev);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxCreate failed: res = %lu\n", (unsigned long)res);
		goto fail;
	}

	/* Bounded formatting: a long path must not overflow fname. */
	len = snprintf(fname, sizeof(fname), "%s/fmadd_gpu.cubin", path);
	if (len < 0 || (size_t)len >= sizeof(fname)) {
		printf("module path is too long\n");
		goto fail_ctx;
	}
	res = cuModuleLoad(&module, fname);
	if (res != CUDA_SUCCESS) {
		printf("cuModuleLoad() failed\n");
		goto fail_ctx;
	}
	res = cuModuleGetFunction(&function, module, "_Z3addPfS_S_i");
	if (res != CUDA_SUCCESS) {
		printf("cuModuleGetFunction() failed\n");
		goto fail_ctx;
	}
	res = cuFuncSetSharedSize(function, 0x40); /* just random */
	if (res != CUDA_SUCCESS) {
		printf("cuFuncSetSharedSize() failed\n");
		goto fail_ctx;
	}
	res = cuFuncSetBlockShape(function, block_x, block_y, 1);
	if (res != CUDA_SUCCESS) {
		printf("cuFuncSetBlockShape() failed\n");
		goto fail_ctx;
	}

	/* a[] */
	res = cuMemAlloc(&a_dev, bytes);
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc (a) failed\n");
		goto fail_ctx;
	}
	/* b[] */
	res = cuMemAlloc(&b_dev, bytes);
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc (b) failed\n");
		goto fail_ctx;
	}
	/* c[] */
	res = cuMemAlloc(&c_dev, bytes);
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc (c) failed\n");
		goto fail_ctx;
	}

	gettimeofday(&tv_h2d_start, NULL);
	/* upload a[] and b[] */
	res = cuMemcpyHtoD(a_dev, a, bytes);
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD (a) failed: res = %lu\n", (unsigned long)res);
		goto fail_ctx;
	}
	res = cuMemcpyHtoD(b_dev, b, bytes);
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD (b) failed: res = %lu\n", (unsigned long)res);
		goto fail_ctx;
	}
	gettimeofday(&tv_h2d_end, NULL);

	/* set kernel parameters: packed back to back, offset tracks the size */
	offset = 0;
	res = cuParamSetv(function, offset, &a_dev, sizeof(a_dev));
	if (res != CUDA_SUCCESS) {
		printf("cuParamSetv (a) failed: res = %lu\n", (unsigned long)res);
		goto fail_ctx;
	}
	offset += sizeof(a_dev);
	res = cuParamSetv(function, offset, &b_dev, sizeof(b_dev));
	if (res != CUDA_SUCCESS) {
		printf("cuParamSetv (b) failed: res = %lu\n", (unsigned long)res);
		goto fail_ctx;
	}
	offset += sizeof(b_dev);
	res = cuParamSetv(function, offset, &c_dev, sizeof(c_dev));
	if (res != CUDA_SUCCESS) {
		printf("cuParamSetv (c) failed: res = %lu\n", (unsigned long)res);
		goto fail_ctx;
	}
	offset += sizeof(c_dev);
	res = cuParamSetv(function, offset, &n, sizeof(n));
	if (res != CUDA_SUCCESS) {
		printf("cuParamSetv (n) failed: res = %lu\n", (unsigned long)res);
		goto fail_ctx;
	}
	offset += sizeof(n);
	res = cuParamSetSize(function, offset);
	if (res != CUDA_SUCCESS) {
		printf("cuParamSetSize failed: res = %lu\n", (unsigned long)res);
		goto fail_ctx;
	}

	gettimeofday(&tv_exec_start, NULL);
	/* launch the kernel */
	res = cuLaunchGrid(function, grid_x, grid_y);
	if (res != CUDA_SUCCESS) {
		printf("cuLaunchGrid failed: res = %lu\n", (unsigned long)res);
		goto fail_ctx;
	}
	/* The launch is asynchronous; execution errors only surface here. */
	res = cuCtxSynchronize();
	if (res != CUDA_SUCCESS) {
		printf("cuCtxSynchronize failed: res = %lu\n", (unsigned long)res);
		goto fail_ctx;
	}
	gettimeofday(&tv_exec_end, NULL);

	gettimeofday(&tv_d2h_start, NULL);
	/* download c[] */
	res = cuMemcpyDtoH(c, c_dev, bytes);
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyDtoH (c) failed: res = %lu\n", (unsigned long)res);
		goto fail_ctx;
	}
	gettimeofday(&tv_d2h_end, NULL);

	res = cuMemFree(a_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree (a) failed: res = %lu\n", (unsigned long)res);
		goto fail_ctx;
	}
	res = cuMemFree(b_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree (b) failed: res = %lu\n", (unsigned long)res);
		goto fail_ctx;
	}
	res = cuMemFree(c_dev);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree (c) failed: res = %lu\n", (unsigned long)res);
		goto fail_ctx;
	}

	res = cuModuleUnload(module);
	if (res != CUDA_SUCCESS) {
		printf("cuModuleUnload failed: res = %lu\n", (unsigned long)res);
		goto fail_ctx;
	}

	res = cuCtxDestroy(ctx);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxDestroy failed: res = %lu\n", (unsigned long)res);
		goto fail;
	}

	gettimeofday(&tv_total_end, NULL);

	/*
	 * check the results: the column index restarts for every row so that
	 * the whole matrix is verified, not just its first row.
	 */
	for (i = 0; i < n; i++) {
		for (j = 0; j < n; j++) {
			idx = (size_t)i * n + j;
			if (c[idx] != a[idx] + b[idx]) {
				printf("c[%d] = %f\n", (int)idx, c[idx]);
				printf("a[%d]+b[%d] = %f\n", (int)idx, (int)idx, a[idx]+b[idx]);
				goto fail;
			}
		}
	}

	free(a);
	free(b);
	free(c);

	/* Convert every interval to milliseconds. */
	tvsub(&tv_h2d_end, &tv_h2d_start, &tv);
	h2d = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
	tvsub(&tv_d2h_end, &tv_d2h_start, &tv);
	d2h = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
	tvsub(&tv_exec_end, &tv_exec_start, &tv);
	exec = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
	tvsub(&tv_total_end, &tv_total_start, &tv);
	total = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;

	printf("HtoD: %f\n", h2d);
	printf("DtoH: %f\n", d2h);
	printf("Exec: %f\n", exec);
	printf("Time (Memcpy + Launch): %f\n", h2d + d2h + exec);
	printf("Total: %f\n", total);

	return 0;

fail_ctx:
	/* Best-effort teardown; the original failure has already been reported. */
	cuCtxDestroy(ctx);
fail:
	/* free(NULL) is a no-op, so partially allocated sets are handled too. */
	free(a);
	free(b);
	free(c);
	return -1;
}
/*
 * Bitonic-sort driver (this listing is truncated: it ends inside main, after
 * the block-merge phase).
 *
 * Parses the command line, obtains a list of integers (read from stdin,
 * 2^p random values for -p, or a count given as a positional argument),
 * then sorts in two phases on the GPU:
 *   Phase 1: one launch of BITONIC_BLOCK_FN, one thread per element.
 *   Phase 2: repeated launches of MERGE_BLOCKS_FN, swapping the two device
 *            arrays between source and destination on every step.
 *
 * Options: -d GROSS_DEBUG output, -i read from stdin, -O OUTPUT flag,
 *          -p <exp> use 2^exp random integers, -v DEBUG output.
 * Returns FAILURE after printing usage on a bad command line.
 */
int main( int argc, char** argv)
{
	uint num_threads;
	uint num_blocks, block_size;
	uint length;
	uint nBytes;
	int *list;
	int status, verbose, c, i, j, logBlocks;
	int read_stdin;
	struct timeval start_time, end_time;
	unsigned long total_time;
	CUdevice hDevice;
	CUcontext hContext;
	CUmodule hModule;
	CUfunction bitonicBlockFn;
	CUfunction mergeBlocksFn;
	CUdeviceptr pDeviceArrayA;
	CUdeviceptr pDeviceArrayB;

	status = SUCCESS;
	verbose = 0;
	read_stdin = FALSE;
	length = 0;

	while ((c = getopt (argc, argv, "dip:vO")) != -1) {
		switch (c) {
		case 'd':
			verbose |= GROSS_DEBUG;
			break;
		case 'i':
			read_stdin = TRUE;
		/* NOTE(review): no break above, so -i falls through and also sets
		 * OUTPUT -- confirm this is intentional. */
		case 'O':
			verbose |= OUTPUT;
			break;
		case 'p':
			/* -p gives the list length as a power of two */
			length = 1 << atoi(optarg);
			break;
		case 'v':
			verbose |= DEBUG;
			break;
		case '?':
		default:
			print_usage();
			return FAILURE;
		}
	}

	/* NOTE(review): none of the malloc results below are checked. */
	if ( read_stdin == TRUE ) {
		/* Read sequence of integers from stdin */
		list = (int*) malloc (INIT_INPUT_SIZE * sizeof(int) );
		length = readIntegers(list, INIT_INPUT_SIZE);
	} else if ( length > 0 ) {
		/* Length came from -p: fill the list with random integers */
		list = (int*) malloc (length * sizeof(int) );
		randomInts(list, length);
	} else if (optind >= argc) { /* No size was given */
		print_usage();
		return FAILURE;
	} else {
		/* Generate our own integers */
		length = atoi(argv[optind]);
		list = (int*) malloc (length * sizeof(int) );
		randomInts(list, length);
	}

	/*
	* Phase 1:
	* 	There will be one thread for each element to be sorted. Each
	*	block will perform bitonic sort on MAX_THREADS_PER_BLOCK elements.
	*/

	/* Initialize sizes */
	num_threads = _min(length, MAX_THREADS_PER_BLOCK );
	/* Ceiling division: enough blocks to cover every element */
	num_blocks = (length-1) / MAX_THREADS_PER_BLOCK + 1;
	nBytes = length * sizeof(int);

	if (verbose & DEBUG) printf("Initializing GPU.\n");
	
	/* Start timing */
	gettimeofday(&start_time, NULL);

	/* Initialize GPU */
	cutilDrvSafeCall( cuInit(0) 					);
	cutilDrvSafeCall( cuDeviceGet(&hDevice, 0)			); 
	cutilDrvSafeCall( cuCtxCreate(&hContext, 0, hDevice) 		);
	cutilDrvSafeCall( cuModuleLoad(&hModule, MODULE_FILE) 		);
	cutilDrvSafeCall( cuModuleGetFunction(&bitonicBlockFn, hModule, BITONIC_BLOCK_FN) );

	/* Allocate memory on the device */
	cutilDrvSafeCall( cuMemAlloc(&pDeviceArrayA, nBytes)		);
	cutilDrvSafeCall( cuMemAlloc(&pDeviceArrayB, nBytes)		);
	cutilDrvSafeCall( cuMemcpyHtoD(pDeviceArrayA, list, nBytes) 	);
	cutilDrvSafeCall( cuFuncSetBlockShape(bitonicBlockFn, num_threads, 1, 1));
	/* NOTE(review): the device pointer is passed as a single 4-byte integer
	 * parameter; this presumably assumes 32-bit device pointers -- confirm. */
	cutilDrvSafeCall( cuParamSeti(bitonicBlockFn, 0, pDeviceArrayA)	);
	cutilDrvSafeCall( cuParamSetSize(bitonicBlockFn, 4)		);
	
	/* Execute the kernel on the GPU */
	if ( verbose & DEBUG ) printf("Launching bitonic sort kernel with %d blocks and %d threads per block.\n", num_blocks, num_threads);
	cutilDrvSafeCall( cuLaunchGrid(bitonicBlockFn, num_blocks, 1)		);

	/*
	* Phase 2:
	* 	At this point each block is a sorted list. Now it's time to merge them.	
	*/

	/* TODO This should go away after development */
	if ( verbose & GROSS_DEBUG ) {
		/* Dump the per-block contents of array A after phase 1 */
		cuMemcpyDtoH(list, pDeviceArrayA, nBytes);
		for (i=0; i<num_blocks; ++i) {
			printf("### Block %d:\n", i);
			for (j=0; j<num_threads; ++j) {
				printf("%d\n", list[i*num_threads + j]);
			}
		}
	}
	
	i=0;

	/* Do we need to merge blocks? */
	if ( num_blocks > 1 ) {

		/* There will be Log_2(num_blocks) merge steps. */
		/* Counts the doublings needed to reach num_blocks, i.e. the log
		 * rounded up when num_blocks is not a power of two. */
		logBlocks = 0;
		for (i=1; i<num_blocks; i *= 2)	++logBlocks;

		if ( verbose & DEBUG ) printf("There will be %d merge steps.\n", logBlocks);	

		block_size = num_threads; 	/* How big the blocks were in the last grid launch. */
		num_threads = num_blocks >> 1;  /* Start with blocks/2 threads */
		num_blocks = (num_threads-1) / MAX_THREADS_PER_BLOCK  +  1;

		cutilDrvSafeCall( cuModuleGetFunction(&mergeBlocksFn, hModule, MERGE_BLOCKS_FN) );
		/* Merge kernel parameter layout used below: offset 0 = merge level,
		 * 4 = block size, 8 = source array, 12 = destination array.
		 * NOTE(review): the parameter/shape calls in this phase are not
		 * wrapped in cutilDrvSafeCall, so their failures go unnoticed. */
		cuParamSeti(mergeBlocksFn, 4, block_size);
		cuParamSetSize(mergeBlocksFn, 16);

		for (i=0; i < logBlocks; ++i) {
			cuFuncSetBlockShape(mergeBlocksFn, num_threads, 1, 1);
			cuParamSeti(mergeBlocksFn, 0, i); /* set merge level */

			/* Merging uses a source array and destination array, the gpu has 2 arrays allocated
			 * so we swap which is the source and which is the destination for each iteration. */
			if ( i%2 == 0 ) {
				cuParamSeti(mergeBlocksFn, 8, pDeviceArrayA);
				cuParamSeti(mergeBlocksFn, 12, pDeviceArrayB);
			} else {
				cuParamSeti(mergeBlocksFn, 8, pDeviceArrayB);
				cuParamSeti(mergeBlocksFn, 12, pDeviceArrayA);
			}

			if ( verbose & DEBUG ) {
				printf("Launching block merge kernel with %d blocks and %d threads per block\n", 
									num_blocks, num_threads/num_blocks);
			}	
			cutilDrvSafeCall( cuLaunchGrid(mergeBlocksFn, num_blocks, 1) );

			/* Each merge step needs half as many threads as the previous one */
			num_threads = num_threads >> 1;
			num_blocks = (num_threads-1) / MAX_THREADS_PER_BLOCK  +  1;
		}
	}
Exemple #25
0
// Host code
int main()
{
	int N = 3;
	size_t size = N * sizeof(float);
	float* h_A = (float*)malloc(size);
	float* h_B = (float*)malloc(size);
	float* h_C = (float*)malloc(size);

	// Set up vectors.
	for (int i = 0; i < N; ++i)
	{
		h_A[i] = i * 1.0;
		h_B[i] = i * 1.0 + 1;
		h_C[i] = 0;
		printf("i %d A %f B %f C %f\n", i, h_A[i], h_B[i], h_C[i]);
	}

	// Initialize
	if (cuInit(0) != CUDA_SUCCESS)
		exit (0);

	// Get number of devices supporting CUDA
	int deviceCount = 0;
	cuDeviceGetCount(&deviceCount);
	if (deviceCount == 0)
	{
		printf("There is no device supporting CUDA.\n");
		exit (0);
	}

	// Get handle for device 0
	CUdevice cuDevice = 0;
	CUresult r1 = cuDeviceGet(&cuDevice, 0);
	// Create context
	CUcontext cuContext;
	cuCtxCreate(&cuContext, 0, cuDevice);
	// Create module from binary file
	CUmodule cuModule;
	CUresult r2 = cuModuleLoad(&cuModule, "VecAdd.ptx");
	// Get function handle from module
	CUfunction vecAdd;
	CUresult r3 = cuModuleGetFunction(&vecAdd, cuModule, "VecAdd");
	// Allocate vectors in device memory
	CUdeviceptr d_A;
	CUresult r4 = cuMemAlloc(&d_A, size);
	CUdeviceptr d_B;
	CUresult r5 = cuMemAlloc(&d_B, size);
	CUdeviceptr d_C;
	CUresult r6 = cuMemAlloc(&d_C, size);
	// Copy vectors from host memory to device memory
	// h_A and h_B are input vectors stored in host memory
	CUresult r7 = cuMemcpyHtoD(d_A, h_A, size);
	CUresult r8 = cuMemcpyHtoD(d_B, h_B, size);
	// Invoke kernel
#define ALIGN_UP(offset, alignment) (offset) = ((offset) + (alignment) - 1) & ~((alignment) - 1)
	int offset = 0;
	void* ptr;
	ptr = (void*)(size_t)d_A;
	ALIGN_UP(offset, __alignof(ptr));
	CUresult r9 = cuParamSetv(vecAdd, offset, &ptr, sizeof(ptr));
	offset += sizeof(ptr);
	ptr = (void*)(size_t)d_B;
	ALIGN_UP(offset, __alignof(ptr));
	CUresult r10 = cuParamSetv(vecAdd, offset, &ptr, sizeof(ptr));
	offset += sizeof(ptr);
	ptr = (void*)(size_t)d_C;
	ALIGN_UP(offset, __alignof(ptr));
	CUresult r11 = cuParamSetv(vecAdd, offset, &ptr, sizeof(ptr));
	offset += sizeof(ptr);
	ptr = (void*)(int)N;
	ALIGN_UP(offset, __alignof(ptr));
	CUresult r11a = cuParamSetv(vecAdd, offset, &ptr, sizeof(ptr));
	offset += sizeof(ptr);
	CUresult r12 = cuParamSetSize(vecAdd, offset);
	int threadsPerBlock = 256;
	int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
	CUresult r13 = cuFuncSetBlockShape(vecAdd, threadsPerBlock, 1, 1);
	CUresult r14 = cuLaunchGrid(vecAdd, blocksPerGrid, 1);
	// Copy result from device memory to host memory
	// h_C contains the result in host memory
	CUresult r15 = cuMemcpyDtoH(h_C, d_C, size);
	for (int i = 0; i < N; ++i)
	{
		printf("i %d A %f B %f C %f\n", i, h_A[i], h_B[i], h_C[i]);
	}

	// Free device memory
	cuMemFree(d_A);
	cuMemFree(d_B);
	cuMemFree(d_C);
}
// Sets up the CUDA side of the runner: initialises the driver, selects a
// device, creates a blocking-sync context, loads "bitcoinminercuda.ptx" and
// resolves the "cuda_process" kernel.  Every failure terminates the process
// through exit(0) after releasing whatever had been created up to that point.
CUDARunner::CUDARunner():GPURunner<unsigned long,int>(TYPE_CUDA)
{
	const std::string modulePath("bitcoinminercuda.ptx");
	int ccMajor=0;
	int ccMinor=0;
	CUresult status;

	// No host or device buffers exist yet.
	m_in=0;
	m_out=0;
	m_devin=0;
	m_devout=0;

	status=cuInit(0);
	if(status!=CUDA_SUCCESS)
	{
		printf("Unable to initialize CUDA\n");
		exit(0);
	}

	status=cuDeviceGetCount(&m_devicecount);
	printf("%d CUDA GPU devices found\n",m_devicecount);
	if(m_devicecount<=0)
	{
		printf("No CUDA capable devices found\n");
		exit(0);
	}

	// Use the configured device index when it is in range, otherwise fall
	// back to the first device and record that choice in m_deviceindex.
	const bool indexInRange=(m_deviceindex>=0 && m_deviceindex<m_devicecount);
	if(indexInRange)
	{
		printf("Setting CUDA device to device %d\n",m_deviceindex);
	}
	else
	{
		m_deviceindex=0;
		printf("Setting CUDA device to first device found\n");
	}
	status=cuDeviceGet(&m_device,m_deviceindex);
	if(status!=CUDA_SUCCESS)
	{
		exit(0);
	}

	// The compute capability is queried but not used further in this block.
	cuDeviceComputeCapability(&ccMajor, &ccMinor, m_device);

	status=cuCtxCreate(&m_context,CU_CTX_BLOCKING_SYNC,m_device);
	if(status!=CUDA_SUCCESS)
	{
		printf("Unable to create CUDA context\n");
		exit(0);
	}

	printf("Loading module %s\n",modulePath.c_str());
	status=cuModuleLoad(&m_module,modulePath.c_str());
	if(status!=CUDA_SUCCESS)
	{
		printf("Unable to load CUDA module: %i\n", status);
		cuCtxDestroy(m_context);
		exit(0);
	}

	status=cuModuleGetFunction(&m_function,m_module,"cuda_process");
	if(status!=CUDA_SUCCESS)
	{
		printf("Unable to get function cuda_process %d\n",status);
		cuModuleUnload(m_module);
		cuCtxDestroy(m_context);
		exit(0);
	}

	printf("CUDA initialized\n");
}
Exemple #27
0
// Compiles all registered kernels to PTX with nvcc and loads them.
//
// The kernel sources are written to a temporary .cu file (wrapped in
// extern "C" so the kernel names are not mangled), compiled to a temporary
// PTX file, loaded as a CUDA module, and one function handle per kernel is
// stored in _cudaKernels.  The temporary files are removed again.
//
// err: receives a description of the failure.
// Returns GPUIP_ERROR on failure, otherwise the value of _StopTimer().
double CUDAImpl::Build(std::string * err)
{
    _StartTimer();

    // Drop any previously built module first.
    if(!_UnloadModule(err)) {
        return GPUIP_ERROR;
    }

    const char * file_helper_math_h = ".helper_math.h";
    const char * file_temp_cu = ".temp.cu";
    const char * file_temp_ptx = ".temp.ptx";
    
    // Includes vector float operations such as mult, add etc
    std::ofstream out_helper(file_helper_math_h);
    out_helper << get_cuda_helper_math();
    out_helper.close();
    
    // Create temporary file to compile
    std::ofstream out(file_temp_cu);
    out << "#include \"" << file_helper_math_h << "\"\n";
    out << "extern \"C\" { \n"; // To avoid function name mangling 
    for(size_t i = 0; i < _kernels.size(); ++i) {
        out << _kernels[i]->code << "\n";
    }
    out << "}"; // End the extern C bracket
    out.close();

    // Assemble the nvcc command line; CUDA_BIN_PATH overrides the PATH lookup.
    std::stringstream ss;
    const char * cuda_bin_path = getenv("CUDA_BIN_PATH");
    if (cuda_bin_path  != NULL) {
        ss << cuda_bin_path << "/nvcc";
    } else {
        ss << "nvcc";
    }
    ss << " -ptx " << file_temp_cu << " -o " << file_temp_ptx
       << " --Wno-deprecated-gpu-targets"
       << " -include " << file_helper_math_h;
    // Match the pointer width of the host process.
    if(sizeof(void *) == 4) {
        ss << " -m32";
    } else {
        ss << " -m64";
    }
#ifdef _WIN32
    // Optional host compiler location for nvcc on Windows.
    const char * cl_bin_path = getenv("CL_BIN_PATH");
    if (cl_bin_path != NULL) {
        ss << " -ccbin \"" << cl_bin_path << "\"";
    }
#endif
    ss << " 2>&1" << std::endl; // get both standard output and error
    std::string pipe_err;
    int nvcc_exit_status = _execPipe(ss.str().c_str(), &pipe_err);

    // Cleanup temp text file
    _removeFile(file_helper_math_h);
    _removeFile(file_temp_cu);
        
    if (nvcc_exit_status) {
        (*err) = "Cuda error: Could not compile kernels:\n";
        (*err) += pipe_err;
        return GPUIP_ERROR;
    }

    // Load cuda ptx from file.  The same constant names the file that nvcc
    // wrote and that is removed below, so the three uses cannot drift apart.
    CUresult c_err = cuModuleLoad(&_cudaModule, file_temp_ptx);
    _removeFile(file_temp_ptx);
    if (_cudaErrorLoadModule(c_err, err)) {
        return GPUIP_ERROR;
    }

    // Resolve one function handle per kernel, by kernel name.
    _cudaKernels.resize(_kernels.size());
    for(size_t i = 0; i < _kernels.size(); ++i) {
        c_err = cuModuleGetFunction(&_cudaKernels[i], _cudaModule,
                                    _kernels[i]->name.c_str());
        if (_cudaErrorGetFunction(c_err, err, _kernels[i]->name)) {
            return GPUIP_ERROR;
        }
    }

    _cudaBuild = true;
    
    return _StopTimer();
}
Exemple #28
0
// Entry point of the PTX instruction-level test.
//
// Usage: an optional first command-line argument selects the CUDA device
// ordinal (default 0).  The program loads "inst.ptx", runs the "InstBasic"
// kernel as a smoke test, and then runs the remaining instruction tests
// through load_and_test().
//
// Returns 0 when it runs to the end; exits with status 1 if the basic test
// reports failure.
int main(int argc, char *argv[])
{
	// Skip the program name so that *argv is the first real argument.
	argc--; argv++;

	// Instruction-level test of PTX assembly language and emulator.
	// This test should work natively and under emulation.  Many of the
	// instructions tested here stress many poorly documented features
	// of the PTX assembly language.  If the emulator passes these
	// tests, then it can surely pass code that is generated by the
	// nvcc compiler.
	
	test(cuInit(0), "cuInit");

	int deviceCount = 0;
	test(cuDeviceGetCount(&deviceCount), "cuDeviceGetCount");

	int device = 0;
	if (argc)
		device = atoi(*argv);

	CUdevice cuDevice = 0;
	test(cuDeviceGet(&cuDevice, device), "cuDeviceGet");

	// Check context creation like every other driver call; previously the
	// result was stored in an unused variable and silently ignored.
	CUcontext cuContext;
	test(cuCtxCreate(&cuContext, 0, cuDevice), "cuCtxCreate");

	CUmodule cuModule;
	test(cuModuleLoad(&cuModule, "inst.ptx"), "cuModuleLoad");

	// Do basic test.  No sense continuing if we cannot complete this
	// test.
	try
	{
		CUfunction proc;
		test(cuModuleGetFunction(&proc, cuModule, "InstBasic"), "cuModuleGetFunction");

		// Host-side result flag.  A stack variable is sufficient for a
		// single bool and cannot leak on any exit path.
		bool h_R = false;

		CUdeviceptr d_R;
		test(cuMemAlloc(&d_R, sizeof(bool)), "cuMemAlloc");

		test(cuMemcpyHtoD(d_R, &h_R, sizeof(bool)), "cuMemcpyHtoD");

		// Pass the device pointer as the kernel's only parameter.
		int offset = 0;
		void* ptr;
	
		ptr = (void*)(size_t)d_R;
		ALIGN_UP(offset, __alignof(ptr));
		test(cuParamSetv(proc, offset, &ptr, sizeof(ptr)), "cuParamSetv");
		offset += sizeof(ptr);

		test(cuParamSetSize(proc, offset), "cuParamSetSize");

		// One block containing one thread.
		int threadsPerBlock = 1;
		int blocksPerGrid = 1;

		test(cuFuncSetBlockShape(proc, threadsPerBlock, 1, 1), "cuFuncSetBlockShape");

		test(cuLaunchGrid(proc, blocksPerGrid, 1), "cuLaunchGrid");

		test(cuMemcpyDtoH(&h_R, d_R, sizeof(bool)), "cuMemcpyDtoH");

		test(cuMemFree(d_R), "cuMemFree");

		if (h_R == 1)
			std::cout << "Basic test passed.\n";
		else {
			std::cout << "Basic test failed.\n";
			exit(1);
		}

	} catch (...)
	{
		// Report any exception through the same helper, with a
		// nonzero status code.
		test(1, "test crashed.");
	}

	// Do LD, ST, MOV test.
	load_and_test(cuModule, "InstLSMC");

	// Do ADD, SUB test.
	load_and_test(cuModule, "InstAddSub");

	return 0;
}
Exemple #29
0
/* Test interaction between an OpenACC async queue and a CUDA stream.

   Loads the "delay" kernel from "subr.ptx", launches it on a freshly
   created CUDA stream that has been registered with the OpenACC runtime,
   and checks two things:
     - the first acc_wait blocks for at least the requested delay;
     - a second acc_wait, with nothing left to wait for, returns quickly.

   Any failure prints a diagnostic to stderr and aborts.  Returns 0 on
   success.  */
int
main (int argc, char **argv)
{
  CUdevice dev;
  CUfunction delay;
  CUmodule module;
  CUresult r;
  CUstream stream;
  unsigned long *a, *d_a, dticks;
  int nbytes;
  float atime, dtime;
  void *kargs[2];
  int clkrate;
  int devnum, nprocs;

  acc_init (acc_device_nvidia);

  devnum = acc_get_device_num (acc_device_nvidia);

  r = cuDeviceGet (&dev, devnum);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
      abort ();
    }

  r =
    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
			  dev);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
      abort ();
    }

  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
      abort ();
    }

  r = cuModuleLoad (&module, "subr.ptx");
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
      abort ();
    }

  r = cuModuleGetFunction (&delay, module, "delay");
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
      abort ();
    }

  /* One unsigned long per multiprocessor.  */
  nbytes = nprocs * sizeof (unsigned long);

  /* Requested delay; presumably milliseconds, with the clock rate reported
     in kHz, so the product is a tick count -- confirm against the kernel
     in subr.ptx.  */
  dtime = 200.0;

  dticks = (unsigned long) (dtime * clkrate);

  a = (unsigned long *) malloc (nbytes);
  if (a == NULL)
    {
      fprintf (stderr, "malloc failed\n");
      abort ();
    }

  d_a = (unsigned long *) acc_malloc (nbytes);

  acc_map_data (a, d_a, nbytes);

  /* Kernel arguments: device buffer and number of ticks to spin.  */
  kargs[0] = (void *) &d_a;
  kargs[1] = (void *) &dticks;

  r = cuStreamCreate (&stream, CU_STREAM_DEFAULT);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuStreamCreate failed: %d\n", r);
      abort ();
    }

  acc_set_cuda_stream (0, stream);

  init_timers (1);

  start_timer (0);

  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
      abort ();
    }

  /* NOTE(review): the stream above is registered for async id 0, but both
     waits below use async id 1 -- confirm this is intentional.  */
  acc_wait (1);

  atime = stop_timer (0);

  /* The wait must have lasted at least as long as the kernel's delay.  */
  if (atime < dtime)
    {
      fprintf (stderr, "actual time < delay time\n");
      abort ();
    }

  start_timer (0);

  acc_wait (1);

  atime = stop_timer (0);

  /* With nothing pending, the second wait must return almost at once.  */
  if (0.010 < atime)
    {
      fprintf (stderr, "actual time too long\n");
      abort ();
    }

  acc_unmap_data (a);

  fini_timers ();

  free (a);
  acc_free (d_a);

  acc_shutdown (acc_device_nvidia);

  return 0;
}
    void test_driver_api()
    {
        CUdevice cuDevice;
        CUcontext cuContext;
        CUmodule cuModule;
        size_t totalGlobalMem;
        CUfunction matrixMult = 0;
        // cuda driver api intialization
        {
            int major = 0, minor = 0;
            char deviceName[100];

            cuda::Check::CUDAError(cuInit(0), "Error intializing cuda");
            int deviceCount;
            cuda::Check::CUDAError(cuDeviceGetCount(&deviceCount), "Error getting the number of devices");
            if (deviceCount <= 0)
            {
                std::cerr << "No devices found" << std::endl;
                return;
            }

            cuDeviceGet(&cuDevice, 0);

            // get compute capabilities and the devicename
            cuda::Check::CUDAError(cuDeviceComputeCapability(&major, &minor, cuDevice), "Error getting Device compute capability");
            cuda::Check::CUDAError(cuDeviceGetName(deviceName, 256, cuDevice), "Error getting device name");
            std::cout << "> GPU Device has SM " << major << "." << minor << " compute capability" << std::endl;

            cuda::Check::CUDAError(cuDeviceTotalMem(&totalGlobalMem, cuDevice), "Error getting totat global memory");
            std::cout << "  Total amount of global memory:     " << (unsigned long long)totalGlobalMem << " bytes" << std::endl;
            std::string tmp = (totalGlobalMem > (unsigned long long)4 * 1024 * 1024 * 1024L) ? "YES" : "NO";
            std::cout << "  64-bit Memory Address:             " << tmp << std::endl;

            cuda::Check::CUDAError(cuCtxCreate(&cuContext, 0, cuDevice), "Error creating the context");
        }
        // Compile and get the function
        {
            std::string module_path = "MatrixMult.cubin";
            std::cout << "> initCUDA loading module: " << module_path << std::endl;

            cuda::Check::CUDAError(cuModuleLoad(&cuModule, module_path.c_str()), "Error loading module");

            cuda::Check::CUDAError(cuModuleGetFunction(&matrixMult, cuModule, "MatrixMultKernelSimpleDriverAPI"), "Error retrieving the function");
        }
        // Call the kernel
        {
            int WIDTH = BLOCK_SIZE;
            int HEIGHT = BLOCK_SIZE;
            std::stringstream text;
            text << "CUDA Matrix Multiplication (" << WIDTH << "x" << WIDTH << ") Simple method Multiplication time";
            HostMatrix<float> M(WIDTH, HEIGHT); M.fillWithRandomData(); //M.print(std::cout); 
            HostMatrix<float> N(WIDTH, HEIGHT); N.fill_diagonal(2); //N.print(std::cout); 
            HostMatrix<float> C(WIDTH, HEIGHT);
            {
                ScopedTimer t(text.str());

                // allocate device memory
                CUdeviceptr d_M;
                cuda::Check::CUDAError(cuMemAlloc(&d_M, M.sizeInBytes()), "Error allocating memory");
                CUdeviceptr d_N;
                cuda::Check::CUDAError(cuMemAlloc(&d_N, N.sizeInBytes()), "Error allocating memory");

                // copy host memory to device
                cuda::Check::CUDAError(cuMemcpyHtoD(d_M, M, M.sizeInBytes()), "Error uploading memory to device");
                cuda::Check::CUDAError(cuMemcpyHtoD(d_N, N, N.sizeInBytes()), "Error uploading memory to device");

                // allocate device memory for result
                CUdeviceptr d_C;
                cuda::Check::CUDAError(cuMemAlloc(&d_C, C.sizeInBytes()), "Error allocating memory");


                dim3 block(BLOCK_SIZE, BLOCK_SIZE, 1);
                dim3 grid(C.width_ / BLOCK_SIZE, C.height_ / BLOCK_SIZE, 1);
                void *args[6] = { &d_M, &d_N, &d_C, &WIDTH, &WIDTH, &WIDTH};

                // new CUDA 4.0 Driver API Kernel launch call
                cuda::Check::CUDAError(cuLaunchKernel(
                    matrixMult,                                     // Selected kernel function
                    grid.x, grid.y, grid.z,                         // grid config 
                    block.x, block.y, block.z,                      // block config
                    2 * BLOCK_SIZE*BLOCK_SIZE*sizeof(float),        
                    NULL, args, NULL), "Error executing Kernel");

                cuda::Check::CUDAError(cuMemcpyDtoH((void *)C, d_C, C.sizeInBytes()),"Error downloading memory to host");
            }
            C.print(std::cout);
        }

        cuCtxDestroy(cuContext);
    }