Пример #1
0
// Uploads the hue / color-space conversion matrix and the constant alpha value
// into the module globals "constHueColorSpaceMat" and "constAlpha".
//
// module  - loaded module that defines both globals.
// hueCSC  - host matrix; it must provide at least as many bytes as the device
//           symbol "constHueColorSpaceMat" occupies (the symbol size reported
//           by the driver is what gets copied).
//
// Returns CUDA_SUCCESS, or the first driver error encountered. Earlier versions
// dropped every return code and always reported success.
CUresult  updateConstantMemory_drvapi(CUmodule module, float *hueCSC)
{
    CUdeviceptr  d_constHueCSC = 0, d_constAlpha = 0;
    size_t       d_cscBytes = 0, d_alphaBytes = 0;

    // First grab the global device pointers from the CUBIN
    CUresult error = cuModuleGetGlobal(&d_constHueCSC, &d_cscBytes, module, "constHueColorSpaceMat");
    if (error != CUDA_SUCCESS)
    {
        return error;
    }

    error = cuModuleGetGlobal(&d_constAlpha, &d_alphaBytes, module, "constAlpha");
    if (error != CUDA_SUCCESS)
    {
        return error;
    }

    // Copy the constants to video memory
    error = cuMemcpyHtoD(d_constHueCSC,
                         reinterpret_cast<const void *>(hueCSC),
                         d_cscBytes);
    getLastCudaDrvErrorMsg("cuMemcpyHtoD (d_constHueCSC) copy to Constant Memory failed");
    if (error != CUDA_SUCCESS)
    {
        return error;
    }

    uint32 cudaAlpha      = ((uint32)0xff<< 24);

    // Never read past the host variable, even if the device symbol is declared wider.
    const size_t alphaBytes = (d_alphaBytes < sizeof(cudaAlpha)) ? d_alphaBytes : sizeof(cudaAlpha);

    error = cuMemcpyHtoD(d_constAlpha,
                         reinterpret_cast<const void *>(&cudaAlpha),
                         alphaBytes);
    getLastCudaDrvErrorMsg("cuMemcpyHtoD (d_constAlpha) copy to Constant Memory failed");

    return error;
}
Пример #2
0
// Program entry point.
//
// Publishes two host buffers (busylist and array) to the kernel module as
// device-mapped pointers, launches the kernel over a 2D grid sized from
// cnk[RANK], then reads back the module global "changed" and prints the sum
// of its first CACHESIZE entries.
//
// If a first command-line argument is supplied it is compared against the
// compile-time RANK and error() is called on mismatch.
int main(int argc, char ** argv){
	int i;
        if( (argc>=2) && (atoi(argv[1])!=RANK)) error("rank %d mandatory",RANK);
        printf("CUDA RANK=%d\n",RANK);

	kernel.print();
// build busylist: obtain the CNK-entry uint32_t table from malloc_file
// (presumably file-backed storage -- confirm against malloc_file)
        busylist = (uint32_t*)malloc_file(CNK*sizeof(uint32_t),FMODE_RO,BLIST_FORMAT,RANK);

// put busylist: register the host buffer as device-mapped and fetch the
// device-side address that aliases it
        SafeCall(cuMemHostRegister(busylist,CNK*sizeof*busylist,CU_MEMHOSTREGISTER_DEVICEMAP));
        SafeCall(cuMemHostGetDevicePointer(&host_busylist,busylist,0));

        // The module global "busylist" must be exactly pointer-sized: the mapped
        // device pointer itself (not the data) is copied into it.
        SafeCall(cuModuleGetGlobal(&dev_busylist,&bytes,kernel.module[0].module,"busylist"));
        if(bytes!=sizeof(host_busylist)) error("busylist!");
	SafeCall(cuMemcpyHtoD(dev_busylist,&host_busylist,bytes));
// put array: same publishing scheme as busylist; only the second malloc_file
// argument differs between the IN_mk_data build (1) and the default build (0)
#ifdef IN_mk_data
        mkdir(DATADIR,0755); errno=0;
        array = (unsigned char *)malloc_file(abytes(RANK,CNK),1,DATADIR"%d",RANK);
#else
        array = (unsigned char *)malloc_file(abytes(RANK,CNK),0,DATADIR"%d",RANK);
#endif
        SafeCall(cuMemHostRegister(array,abytes(RANK,CNK),CU_MEMHOSTREGISTER_DEVICEMAP));
        SafeCall(cuMemHostGetDevicePointer(&host_array,array,0));

        SafeCall(cuModuleGetGlobal(&dev_array,&bytes,kernel.module[0].module,"array"));
        if(bytes!=sizeof(host_array)) error("array!");
	SafeCall(cuMemcpyHtoD(dev_array,&host_array,bytes));

// Launch geometry: nado is the number of THREADS-sized blocks needed to cover
// cnk[RANK] (ceil division). gridx is capped at MAXG and gridy is the number of
// MAXG-wide rows needed, so gridx*gridy >= nado.
// NOTE(review): gridy/gridx are uint32_t but are printed with %d -- confirm intent.
#define THREADS 512
#define MAXG    65535
uint64_t nado = (cnk[RANK] +(THREADS-1))/THREADS;
uint32_t gridx = nado>MAXG?MAXG:nado;
uint32_t gridy = (nado+(MAXG-1))/MAXG;
printf("gridy=%d gridx=%d THREAD=%d\n",gridy, gridx, THREADS);

	kernel.launch(params,THREADS,gridx,gridy);
	kernel.wait();

	// The host mappings are released only after kernel.wait() has returned.
	SafeCall(cuMemHostUnregister(busylist));
	SafeCall(cuMemHostUnregister(array));

        // Read back the module global "changed"; its size must match the host copy.
        SafeCall(cuModuleGetGlobal(&dev_changed,&bytes,kernel.module[0].module,"changed"));
        if(bytes!=sizeof(changed)) error("changed!");
	SafeCall(cuMemcpyDtoH(changed,dev_changed,bytes));

	// Accumulate the first CACHESIZE entries of changed into total and report it.
	for(i=0;i<CACHESIZE;i++)
		total += changed[i];
	printf("changed=%ju\n",total);

	return 0;
}
Пример #3
0
void setupModuleResource(const char* kernelFileName)
{
    CUmodule newModule = createModuleFromFile(kernelFileName);
    if (newModule != NULL)
    {
        if (module != NULL) cuModuleUnload(module);
        module = newModule;
    }
    checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "mainImage"));

    // TODO: take care of bytes
    size_t bytes;
    checkCudaErrors(cuModuleGetGlobal(&d_iResolution, &bytes, module, "iResolution"));
    checkCudaErrors(cuModuleGetGlobal(&d_iGlobalTime, &bytes, module, "iGlobalTime"));
    checkCudaErrors(cuModuleGetGlobal(&d_iMouse, &bytes, module, "iMouse"));
    checkCudaErrors(cuModuleGetGlobal(&d_fragColor, &d_fragColor_bytes, module, "fragColor"));
}
Пример #4
0
	void const_copy_to(const char *name, void *host, size_t size)
	{
		CUdeviceptr mem;
		size_t bytes;

		cuda_push_context();
		cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name))
		//assert(bytes == size);
		cuda_assert(cuMemcpyHtoD(mem, host, size))
		cuda_pop_context();
	}
Пример #5
0
void swanBindToConstantEx( const char *modname, const  char *constname, size_t len , void *ptr ) {
	CUmodule mod = swanGetModule( modname );
	CUdeviceptr p;
	size_t lenr;
	int err = cuModuleGetGlobal( &p, &lenr, mod, constname );

	if( err != CUDA_SUCCESS ) { error ("swanBindToConstant failed -- no such name" ); }

	if( len != lenr ) {
		if( err != CUDA_SUCCESS ) { error ("swanBindToConstant failed -- size wrong" ); }
	}
	err = cuMemcpyHtoD( p, ptr, len );
	if( err != CUDA_SUCCESS ) { error ("swanBindToConstant failed -- copy failed" ); }
}
// Binds `var` to its device storage.
//
// The registered-variable table is searched for the first entry whose name
// equals var.getName(). For that entry the owning module is fetched, the
// variable's device address and size are resolved with cuModuleGetGlobal, and
// var.init() is called with them. Nothing happens when no entry matches.
//
// The driver result is now checked: a failed lookup asserts in checked builds
// and skips var.init() instead of passing it an indeterminate pointer/size.
void CudaModuleScene::initCudaObj(ApexCudaVar& var)
{
	const char* varName = var.getName();

	for (int j = 0 ; j < numRegisteredVariables ; j++)
	{
		if (nvidia::strcmp(variableTable[j].varName, varName) == 0)
		{
			ApexCudaModule* cudaModule = getCudaModule(variableTable[j].modIndex);
			PX_ASSERT(cudaModule->isValid());

			CUdeviceptr cuDevPtr = 0;
			size_t size = 0;
			const CUresult lookupResult = cuModuleGetGlobal(&cuDevPtr, &size, cudaModule->getCuModule(), varName);
			PX_ASSERT(lookupResult == CUDA_SUCCESS);

			if (lookupResult == CUDA_SUCCESS)
			{
				var.init(this, cudaModule, cuDevPtr, size, getGpuDispatcher()->getCudaContextManager());
			}
			break; // only the first matching table entry is considered
		}
	}
}
Пример #7
0
// Returns the Buffer wrapping the module global `name`.
//
// Buffers are cached in m_globalHash: the first request resolves the symbol
// through cuModuleGetGlobal (reported via checkError), wraps the resulting
// device pointer/size in a newly allocated Buffer and stores it; later
// requests return the stored Buffer.
Buffer& CudaModule::getGlobal(const std::string& name)
{
  if (Buffer* cached = m_globalHash[name]) {
    return *cached;
  }

  CUdeviceptr devicePtr;
  size_t byteCount;

  checkError( "cuModuleGetGlobal", 
              cuModuleGetGlobal(&devicePtr, &byteCount, m_module, name.c_str()));

  Buffer* wrapped = new Buffer;
  wrapped->wrapCuda(devicePtr, byteCount);
  m_globalHash[name] = wrapped;

  return *wrapped;
}
Пример #8
0
// Looks up the module global `name` and stores a handle to it in *ppGlobal.
//
// Globals that were resolved before are served from _globals. Otherwise the
// symbol is resolved with cuModuleGetGlobal, recorded in _globals and returned.
// Returns CUDA_SUCCESS on success; a failed driver lookup is handled by
// HANDLE_RESULT(), which operates on the local `result`.
CUresult CuModule::GetGlobal(const std::string& name, GlobalMemPtr* ppGlobal) {
	// Linear scan of the already-resolved globals.
	for(size_t idx = 0; idx < _globals.size(); ++idx) {
		if(!(name == _globals[idx]->Name()))
			continue;

		*ppGlobal = _globals[idx];
		return CUDA_SUCCESS;
	}

	// Not cached yet: resolve the symbol in the driver.
	GlobalMemPtr mem(new CuGlobalMem);
	CUresult result = cuModuleGetGlobal(&mem->_deviceptr, &mem->_size, 
		_module, name.c_str());
	HANDLE_RESULT();

	// Fill in the bookkeeping fields before publishing the entry.
	mem->_module = this;
	mem->_globalName = name;
	mem->_context = _context;

	_globals.push_back(mem);
	*ppGlobal = mem;
	return CUDA_SUCCESS;
}
Пример #9
0
/*
 * R entry point wrapping cuModuleGetGlobal.
 *
 * r_hmod  reference object from which the CUmodule handle is extracted
 * r_name  character vector; its first element is the symbol name
 *
 * On driver failure the value of R_cudaErrorInfo(status) is returned.
 * On success a named list of length 2 is returned:
 *   dptr   reference object of class "CUdeviceptr" for the device address
 *   bytes  size of the global as a numeric scalar
 */
SEXP
R_auto_cuModuleGetGlobal(SEXP r_hmod, SEXP r_name)
{
    CUmodule hmod = (CUmodule) getRReference(r_hmod);
    const char * name = CHAR(STRING_ELT(r_name, 0));
    CUdeviceptr dptr;
    size_t bytes;
    SEXP r_ans = R_NilValue;
    SEXP r_names;

    CUresult ans = cuModuleGetGlobal(& dptr, & bytes,  hmod,  name);
    if(ans != CUDA_SUCCESS)
       return(R_cudaErrorInfo(ans));

    /* Both containers stay protected while their elements are allocated. */
    PROTECT(r_ans = NEW_LIST(2));
    PROTECT(r_names = NEW_CHARACTER(2));

    SET_VECTOR_ELT(r_ans, 0, R_createRef((void*) dptr, "CUdeviceptr"));
    SET_STRING_ELT(r_names, 0, mkChar("dptr"));

    SET_VECTOR_ELT(r_ans, 1, ScalarReal(bytes));
    SET_STRING_ELT(r_names, 1, mkChar("bytes"));

    SET_NAMES(r_ans, r_names);
    UNPROTECT(2);
    return(r_ans);
}
Пример #10
0
/* Load the offload image TARGET_DATA onto device ORD and build the table of
   target addresses for it.

   VERSION is checked against GOMP_VERSION_NVIDIA_PTX; a newer device version
   is a fatal error.  The image's PTX objects are linked into a module, a
   table with one addr_pair per function followed by one per variable is
   allocated and stored in *TARGET_TABLE, and the image is recorded at the
   head of the device's image list.

   Returns the number of table entries (functions plus variables).  Driver
   lookup failures are reported through GOMP_PLUGIN_fatal.  */
int
GOMP_OFFLOAD_load_image (int ord, unsigned version, const void *target_data,
			 struct addr_pair **target_table)
{
  CUmodule module;
  const char *const *var_names;
  const struct targ_fn_launch *fn_descs;
  unsigned int fn_entries, var_entries, i, j;
  CUresult r;
  struct targ_fn_descriptor *targ_fns;
  struct addr_pair *targ_tbl;
  const nvptx_tdata_t *img_header = (const nvptx_tdata_t *) target_data;
  struct ptx_image_data *new_image;
  struct ptx_device *dev;

  /* Refuse images produced for a newer plugin interface than this one.  */
  if (GOMP_VERSION_DEV (version) > GOMP_VERSION_NVIDIA_PTX)
    GOMP_PLUGIN_fatal ("Offload data incompatible with PTX plugin"
		       " (expected %u, received %u)",
		       GOMP_VERSION_NVIDIA_PTX, GOMP_VERSION_DEV (version));
  
  GOMP_OFFLOAD_init_device (ord);

  dev = ptx_devices[ord];
  
  nvptx_attach_host_thread_to_device (ord);

  /* Link all PTX objects of the image into a single module.  */
  link_ptx (&module, img_header->ptx_objs, img_header->ptx_num);

  /* The mkoffload utility emits a struct of pointers/integers at the
     start of each offload image.  The array of kernel names and the
     functions addresses form a one-to-one correspondence.  */

  var_entries = img_header->var_num;
  var_names = img_header->var_names;
  fn_entries = img_header->fn_num;
  fn_descs = img_header->fn_descs;

  /* One addr_pair per function and per variable; one descriptor per
     function.  */
  targ_tbl = GOMP_PLUGIN_malloc (sizeof (struct addr_pair)
				 * (fn_entries + var_entries));
  targ_fns = GOMP_PLUGIN_malloc (sizeof (struct targ_fn_descriptor)
				 * fn_entries);

  /* Publish the table start now; TARG_TBL itself is advanced while the
     table is filled below.  */
  *target_table = targ_tbl;

  new_image = GOMP_PLUGIN_malloc (sizeof (struct ptx_image_data));
  new_image->target_data = target_data;
  new_image->module = module;
  new_image->fns = targ_fns;

  /* Push the image onto the head of the device's image list under the
     image lock.  */
  pthread_mutex_lock (&dev->image_lock);
  new_image->next = dev->images;
  dev->images = new_image;
  pthread_mutex_unlock (&dev->image_lock);

  /* Function entries: the table stores the address of the host-side
     descriptor, with END one past START.  */
  for (i = 0; i < fn_entries; i++, targ_fns++, targ_tbl++)
    {
      CUfunction function;

      r = cuModuleGetFunction (&function, module, fn_descs[i].fn);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuModuleGetFunction error: %s", cuda_error (r));

      targ_fns->fn = function;
      targ_fns->launch = &fn_descs[i];

      targ_tbl->start = (uintptr_t) targ_fns;
      targ_tbl->end = targ_tbl->start + 1;
    }

  /* Variable entries follow the function entries: START is the device
     address of the global and END is START plus its size in bytes.  */
  for (j = 0; j < var_entries; j++, targ_tbl++)
    {
      CUdeviceptr var;
      size_t bytes;

      r = cuModuleGetGlobal (&var, &bytes, module, var_names[j]);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuModuleGetGlobal error: %s", cuda_error (r));

      targ_tbl->start = (uintptr_t) var;
      targ_tbl->end = targ_tbl->start + bytes;
    }

  return fn_entries + var_entries;
}
Пример #11
0
int main(int argc, char **argv)
{
  //data
  CUdeviceptr  d_data0   = 0;
  CUdeviceptr  d_data1   = 0;
  DataStruct *h_data0  = 0;
  DataStruct *h_data1  = 0;
  DataStruct h_data_reference0;
  DataStruct h_data_reference1;
  unsigned int memSize = sizeof(DataStruct);
  
  //device references
  CUcontext    hContext = 0;
  CUdevice     hDevice  = 0;
  CUmodule     hModule  = 0;
  CUstream     hStream  = 0;

  // Initialize the device and get a handle to the kernel
  CUresult status = initialize(0, &hContext, &hDevice, &hModule, &hStream);
  
  // Allocate memory on host and device
  if ((h_data0 = (DataStruct *)malloc(memSize)) == NULL)
    {
      std::cerr << "Could not allocate host memory" << std::endl;
      exit(-1);
    }
  status = cuMemAlloc(&d_data0, memSize);

  if ((h_data1 = (DataStruct *)malloc(memSize)) == NULL)
    {
      std::cerr << "Could not allocate host memory" << std::endl;
      exit(-1);
    }
  status = cuMemAlloc(&d_data1, memSize);
  if (status != CUDA_SUCCESS)
    printf("ERROR: during cuMemAlloc\n");

  ///////////////////////////////////////////////////////////////////////////////
  //======================= test cases ========================================//
  ///////////////////////////////////////////////////////////////////////////////
  std::string name = "";
  unsigned int testnum=0;
  unsigned int passed=0;

  //++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  /////////////////////// Ralf ///////////////////////////////////////////////////
  //++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

  if(runRalfFunction("test_phi_scalar", test_phi_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
    passed++;
  testnum++;
  if(runRalfFunction("test_phi2_scalar", test_phi2_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
    passed++;
  testnum++;
  if(runRalfFunction("test_phi3_scalar", test_phi3_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
    passed++;
  testnum++;
  if(runRalfFunction("test_phi4_scalar", test_phi4_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
    passed++;
  testnum++;
  if(runRalfFunction("test_phi5_scalar", test_phi5_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
    passed++;
  testnum++;
  if(runRalfFunction("test_phi6_scalar", test_phi6_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
    passed++;
  testnum++;
  if(runRalfFunction("test_phi7_scalar", test_phi7_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
    passed++;
  testnum++;
  if(runRalfFunction("test_phi8_scalar", test_phi8_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
    passed++;
  testnum++;
  if(runRalfFunction("test_phi9_scalar", test_phi9_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
    passed++;
  testnum++;

  if(runRalfFunction("test_loopbad_scalar", test_loopbad_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
    passed++;
  testnum++;
  if(runRalfFunction("test_loop23_scalar", test_loop23_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
    passed++;
  testnum++;
  if(runRalfFunction("test_loop13_scalar", test_loop13_scalar, &hModule, d_data0, h_data0, &h_data_reference0, memSize))
    passed++;
  testnum++;

  ////////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_GetElementPointer_constant"; /////////////////////
  setZero(h_data0,&h_data_reference0);

  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_GetElementPointer_constant(&h_data_reference0);                //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;
  
  ///////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_calculate"; /////////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->i = h_data_reference0.i = 3;
  h_data0->f = h_data_reference0.f = 3.2;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_calculate(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

  ///////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_parquetShader"; /////////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->f = h_data_reference0.f = 1;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_parquetShader(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

  ///////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_GetElementPointer_dyn"; /////////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->i = h_data_reference0.i = 3;
  h_data0->u = h_data_reference0.u = 7;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_GetElementPointer_dyn(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

  ///////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_branch_simple"; // Branch 1 /////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->f = h_data_reference0.f = -4;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_branch_simple(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

  ///////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_branch_simple"; // Branch 2 /////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->f = h_data_reference0.f = 8;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_branch_simple(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;
  
  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_branch_simplePHI"; // Branch 1 /////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->f = h_data_reference0.f = -10;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_branch_simplePHI(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_branch_loop"; //////////////////////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->i = h_data_reference0.i = 100;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_branch_loop(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_math"; //////////////////////////////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->f = h_data_reference0.f = 1.4;
  h_data0->i = h_data_reference0.i = 3;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_math(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_signedOperands"; //////////////////////////////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->i = h_data_reference0.i = 3;
  h_data0->f = h_data_reference0.f = -7;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_signedOperands(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

    
  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_constantOperands"; //////////////////////////////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->i = h_data_reference0.i = 3;
  h_data0->f = h_data_reference0.f = -1.44;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_constantOperands(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;
    
  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_branch_loop_semihard"; /////////////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->i = h_data_reference0.i = 10;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_branch_loop_semihard(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_branch_loop_hard"; // Branch 1 /////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->i = h_data_reference0.i = 1;
  h_data0->u = h_data_reference0.u = 3;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_branch_loop_hard(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;
  
  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*////////////*/ name = "test_branch_loop_hard"; // Branch 2 /////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->i = h_data_reference0.i = 7;
  h_data0->u = h_data_reference0.u = 10;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_branch_loop_hard(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;
 
  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_binaryInst"; /////////////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->i = h_data_reference0.i = 5;
  h_data0->f = h_data_reference0.f = -121.23;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_binaryInst(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_selp"; /////////////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->i = h_data_reference0.i = -15;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_selp(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_GetElementPointer_complicated"; /////////////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->i = h_data_reference0.i = 1;
  h_data_reference0.s.s.f = h_data0->s.s.f = 3.11;
  h_data_reference0.s.sa[2].f = h_data0->s.sa[2].f = -4.32;
  h_data_reference0.s.sa[h_data0->i].f = h_data0->s.sa[h_data0->i].f = 111.3;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_GetElementPointer_complicated(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_call"; /////////////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->i = h_data_reference0.i = 10;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_call(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*/////////////*/ name = "test_alloca"; /////////////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->i = h_data_reference0.i = 1;
  h_data0->f = h_data_reference0.f = -3.23;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_alloca(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                     //compare Data
    {passed++; std::cout << " => Test passed!!!\n";}
  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_alloca_complicated"; /////////////////////////
  setZero(h_data0,&h_data_reference0);
  h_data0->i = h_data_reference0.i = 1;
  h_data0->f = h_data_reference0.f = 23.213;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_alloca_complicated(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;


  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_globalVariables"; /////////////////////////
  setZero(h_data0,&h_data_reference0);
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_globalVariables(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_specialRegisters_x"; /////////////////////////
  setZero(h_data0,&h_data_reference0);
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize, 2,3,4, 2,3);   //run device function
  runHostTestFunction(test_specialRegisters_x, &h_data_reference0,   2,3,4, 2,3);   //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;


  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_specialRegisters_y"; /////////////////////////
  setZero(h_data0,&h_data_reference0);
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize, 2,3,4, 2,3);   //run device function
  runHostTestFunction(test_specialRegisters_x, &h_data_reference0,   2,3,4, 2,3);   //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_dualArgument"; /////////////////////////
  setZero(h_data0,&h_data_reference0);
  setZero(h_data1,&h_data_reference1);
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunDualTestFunction(&hModule, name, d_data0, d_data1, h_data0, h_data1, memSize);   //run device function

  test_dualArgument(&h_data_reference0,&h_data_reference1);   //run host reference
  if(compareData(h_data0,&h_data_reference0))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  if(compareData(h_data1,&h_data_reference1))                      //compare Data
    {passed++;  std::cout << " => Test passed!!!\n";}
  testnum++;  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_vector"; /////////////////////////
  setZero(h_data0,&h_data_reference0);

  h_data0->fa[0] = h_data_reference0.fa[0] = 0.43f;
  h_data0->fa[1] = h_data_reference0.fa[1] = 0.234f;
  h_data0->fa[2] = h_data_reference0.fa[2] = 12893.f;
  h_data0->fa[3] = h_data_reference0.fa[3] = 13.33f;
  
  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_vector(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                     //compare Data
    {passed++; std::cout << " => Test passed!!!\n";}
  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_reg2Const"; /////////////////////////
  setZero(h_data0,&h_data_reference0);

  /*
  unsigned int bytes; //size of constant
  CUdeviceptr devptr_const=0; 
  status = cuModuleGetGlobal(&devptr_const,
			     &bytes,
			     hModule, "__ptx_constant_data_global");

  cuMemcpyHtoD(devptr_const, h_data0, memSize);
  */

  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_reg2Const(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                     //compare Data
    {passed++; std::cout << " => Test passed!!!\n";}
  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_constantMemory"; /////////////////////////
  setZero(h_data0,&h_data_reference0);

  h_data0->fa[0] = __ptx_constant_data_global.fa[0] = 0.2348f;
  
  unsigned int bytes; //size of constant
  CUdeviceptr devptr_const=0; 
  status = cuModuleGetGlobal(&devptr_const,
			     &bytes,
			     hModule, "__ptx_constant_data_global");

  cuMemcpyHtoD(devptr_const, h_data0, memSize);

  setZero(h_data0,&h_data_reference0);

  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function
  test_constantMemory(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                     //compare Data
    {passed++; std::cout << " => Test passed!!!\n";}
  testnum++;


  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_sharedMemory"; /////////////////////////
  setZero(h_data0,&h_data_reference0);

  for(int i = 0; i < ARRAY_N/2; i++)
    h_data0->fa[i*2] = i;

  for(int i = 0; i < ARRAY_N/2; i++)
    h_data0->fa[i*2+1] = -i;

  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize, 32,1,1, 1,1);   //run device function

  for(int i = 0; i < ARRAY_N/2; i++)
    h_data_reference0.fa[i] = i;
  for(int i = 0; i < ARRAY_N/2; i++)
    h_data_reference0.fa[i+32] = -i;
  //  runHostTestFunction(test_sharedMemory, &h_data_reference0, 16,1,1, 1,1);                                 //run host reference

  if(compareData(h_data0,&h_data_reference0))                     //compare Data
    {passed++; std::cout << " => Test passed!!!\n";}
  testnum++;

  //////////////////////////////////////////////////////////////////////////////////////////////////
  /*///////////////*/ name = "test_lightShader"; /////////////////////////
  setZero(h_data0,&h_data_reference0);

  /*
  unsigned int bytes; //size of constant
  CUdeviceptr devptr_const=0; 
  status = cuModuleGetGlobal(&devptr_const,
			     &bytes,
			     hModule, "__ptx_constant_data_global");

  cuMemcpyHtoD(devptr_const, h_data0, memSize);
  */

  std::cout << "=============== Test " << testnum << ": " << name << " ===================\n";
  loadAndRunTestFunction(&hModule, name, d_data0, h_data0, memSize);   //run device function

  /*
  test_lightShader(&h_data_reference0);                                 //run host reference
  if(compareData(h_data0,&h_data_reference0))                     //compare Data
    {passed++; std::cout << " => Test passed!!!\n";}
  testnum++;
  */

  ///////////////////////////////////////////////////////////////////////////////
  //======================= test cases END ====================================//
  ///////////////////////////////////////////////////////////////////////////////

  // Check the result
  std::cout << "\nPASSED " << passed << " tests" << std::endl;
  std::cout << "FAILED " << (testnum-passed) << " tests" << std::endl;

  // Cleanup
  if (d_data0)
    {
      cuMemFree(d_data0);
      d_data0 = 0;
    }
  if (d_data1)
    {
      cuMemFree(d_data1);
      d_data1 = 0;
    }
  if (h_data0)
    {
      free(h_data0);
      h_data0 = 0;
    }
  if (h_data1)
    {
      free(h_data1);
      h_data1 = 0;
    }
  if (hModule)
    {
      cuModuleUnload(hModule);
      hModule = 0;
    }
  if (hStream)
    {
      cuStreamDestroy(hStream);
      hStream = 0;
    }
  if (hContext)
    {
      cuCtxDestroy(hContext);
      hContext = 0;
    }
  return 0;
}
Пример #12
0
/*
 * Virtqueue output handler for the CUDA call-forwarding device.
 *
 * Repeatedly pops elements from vq until virtqueue_pop() returns 0.  For
 * each element the request header (struct param) is read from the first
 * out-buffer, the CUDA driver API call selected by p->syscall_type is
 * executed with arguments taken from the header and the remaining
 * scatter-gather buffers, the outcome is stored in p->result, and the
 * element is pushed back exactly once.  After the loop the frontend is
 * notified.
 *
 * Besides the driver's own return codes, p->result is set to 999 when
 * switching to the request's context fails or a string argument cannot be
 * prepared, and to 9999 when the kernel-argument array cannot be allocated.
 *
 * NOTE(review): all indices, lengths and page counts come from the request
 * buffers and are not validated against the element's buffer counts here.
 */
static void vq_handle_output(VirtIODevice *vdev, VirtQueue *vq)
{
	VirtQueueElement elem;

	while(virtqueue_pop(vq, &elem)) {
		struct param *p = elem.out_sg[0].iov_base;

		//for all library routines: get required arguments from buffer, execute, and push results back in virtqueue
		switch (p->syscall_type) {
		case CUINIT: {
			p->result = cuInit(p->flags);
			break;
		}
		case CUDRIVERGETVERSION: {
			p->result = cuDriverGetVersion(&p->val1);
			break;
		}
		case CUDEVICEGETCOUNT: {
			p->result = cuDeviceGetCount(&p->val1);
			break;
		}
		case CUDEVICEGET: {
			p->result = cuDeviceGet(&p->device, p->val1);
			break;
		}
		case CUDEVICECOMPUTECAPABILITY: {
			p->result = cuDeviceComputeCapability(&p->val1, &p->val2, p->device);
			break;
		}
		case CUDEVICEGETNAME: {
			//the name is written straight into the first in-buffer
			p->result = cuDeviceGetName(elem.in_sg[0].iov_base, p->val1, p->device);
			break;
		}
		case CUDEVICEGETATTRIBUTE: {
			p->result = cuDeviceGetAttribute(&p->val1, p->attrib, p->device);
			break;
		}
		case CUCTXCREATE: {
			p->result = cuCtxCreate(&p->ctx, p->flags, p->device);
			break;
		}
		case CUCTXDESTROY: {
			p->result = cuCtxDestroy(p->ctx);
			break;
		}
		case CUCTXGETCURRENT: {
			p->result = cuCtxGetCurrent(&p->ctx);
			break;
		}
		case CUCTXGETDEVICE: {
			p->result = cuCtxGetDevice(&p->device);
			break;
		}
		case CUCTXPOPCURRENT: {
			p->result = cuCtxPopCurrent(&p->ctx);
			break;
		}
		case CUCTXSETCURRENT: {
			p->result = cuCtxSetCurrent(p->ctx);
			break;
		}
		case CUCTXSYNCHRONIZE: {
			p->result = cuCtxSynchronize();
			break;
		}
		case CUMODULELOAD: {
			//hardcoded path - needs improvement
			//all .cubin files should be stored in $QEMU_NFS_PATH - currently $QEMU_NFS_PATH is shared between host and guest with NFS
			const char *nfs_path = getenv("QEMU_NFS_PATH");
			const char *relname = (const char *)elem.out_sg[1].iov_base;
			char *binname;

			//without the base directory there is nothing to load from
			if (!nfs_path) {
				p->result = 999;
				break;
			}
			//both strings plus the terminating NUL
			binname = malloc(strlen(nfs_path) + strlen(relname) + 1);
			if (!binname) {
				//report a failure; the element is pushed once, after the switch
				p->result = 999;
				break;
			}
			strcpy(binname, nfs_path);
			strcat(binname, relname);
			//change current CUDA context
			//each CUDA context has its own virtual memory space - isolation is ensured by switching contexts
			if (cuCtxSetCurrent(p->ctx) != 0) {
				free(binname);
				p->result = 999;
				break;
			}
			p->result = cuModuleLoad(&p->module, binname);
			free(binname);
			break;
		}
		case CUMODULEGETGLOBAL: {
			//copy the symbol name into a buffer sized to fit it
			const char *src = (const char *)elem.out_sg[1].iov_base;
			char *name = malloc(strlen(src) + 1);
			if (!name) {
				p->result = 999;
				break;
			}
			strcpy(name, src);
			p->result = cuModuleGetGlobal(&p->dptr,&p->size1,p->module,(const char *)name);
			free(name);
			break;
		}
		case CUMODULEUNLOAD: {
			p->result = cuModuleUnload(p->module);
			break;
		}
		case CUMEMALLOC: {
			if (cuCtxSetCurrent(p->ctx) != 0) {
				p->result = 999;
				break;
			}
			p->result = cuMemAlloc(&p->dptr, p->bytesize);
			break;
		}
		case CUMEMALLOCPITCH: {
			if (cuCtxSetCurrent(p->ctx) != 0) {
				p->result = 999;
				break;
			}
			p->result = cuMemAllocPitch(&p->dptr, &p->size3, p->size1, p->size2, p->bytesize);
			break;
		}
		//large buffers are allocated in smaller chunks in guest kernel space
		//gets each chunk separately and copies it to device memory
		case CUMEMCPYHTOD: {
			int i;
			size_t offset;
			unsigned long s, nr_pages = p->nr_pages;
			if (cuCtxSetCurrent(p->ctx) != 0) {
				p->result = 999;
				break;
			}
			offset = 0;
			//out_sg[1+2*i] holds chunk i, out_sg[1+2*i+1] holds its byte count
			for (i=0; i<nr_pages; i++) {
				s = *(long *)elem.out_sg[1+2*i+1].iov_base;
				p->result = cuMemcpyHtoD(p->dptr+offset, elem.out_sg[1+2*i].iov_base, s);
				if (p->result != 0) break;
				offset += s;
			}
			break;
		}
		case CUMEMCPYHTODASYNC: {
			int i;
			size_t offset;
			unsigned long s, nr_pages = p->nr_pages;
			if (cuCtxSetCurrent(p->ctx) != 0) {
				p->result = 999;
				break;
			}
			offset = 0;
			//same chunk layout as CUMEMCPYHTOD, issued on p->stream
			for (i=0; i<nr_pages; i++) {
				s = *(long *)elem.out_sg[1+2*i+1].iov_base;
				p->result = cuMemcpyHtoDAsync(p->dptr+offset, elem.out_sg[1+2*i].iov_base, s, p->stream);
				if (p->result != 0) break;
				offset += s;
			}
			break;
		}
		case CUMEMCPYDTODASYNC: {
			p->result = cuMemcpyDtoDAsync(p->dptr, p->dptr1, p->size1, p->stream);
			break;
		}
		case CUMEMCPYDTOH: {
			int i;
			unsigned long s, nr_pages = p->nr_pages;
			if (cuCtxSetCurrent(p->ctx) != 0) {
				p->result = 999;
				break;
			}
			size_t offset = 0;
			//in_sg[2*i] receives chunk i, in_sg[2*i+1] holds its byte count
			for (i=0; i<nr_pages; i++) {
				s = *(long *)elem.in_sg[0+2*i+1].iov_base;
				p->result = cuMemcpyDtoH(elem.in_sg[0+2*i].iov_base, p->dptr+offset, s);
				if (p->result != 0) break;
				offset += s;
			}
			break;
		}
		case CUMEMCPYDTOHASYNC: {
			int i;
			unsigned long s, nr_pages = p->nr_pages;
			if (cuCtxSetCurrent(p->ctx) != 0) {
				p->result = 999;
				break;
			}
			size_t offset = 0;
			//same chunk layout as CUMEMCPYDTOH, issued on p->stream
			for (i=0; i<nr_pages; i++) {
				s = *(long *)elem.in_sg[0+2*i+1].iov_base;
				p->result = cuMemcpyDtoHAsync(elem.in_sg[0+2*i].iov_base, p->dptr+offset, s, p->stream);
				if (p->result != 0) break;
				offset += s;
			}
			break;
		}
		case CUMEMSETD32: {
			p->result = cuMemsetD32(p->dptr, p->bytecount, p->bytesize);
			break;
		}
		case CUMEMFREE: {
			p->result = cuMemFree(p->dptr);
			break;
		}
		case CUMODULEGETFUNCTION: {
			char *name = (char *)elem.out_sg[1].iov_base;
			//NOTE(review): p->length is not checked against the buffer size
			name[p->length] = '\0';
			if (cuCtxSetCurrent(p->ctx) != 0) {
				p->result = 999;
				break;
			}
			p->result = cuModuleGetFunction(&p->function, p->module, name);
			break;
		}
		case CULAUNCHKERNEL: {
			//p->val1 is the number of kernel arguments; argument i lives in out_sg[1+i]
			void **args = malloc(p->val1*sizeof(void *));
			if (!args) {
				p->result = 9999;
				break;
			}
			int i;
			for (i=0; i<p->val1; i++) {
				args[i] = elem.out_sg[1+i].iov_base;
			}
			if (cuCtxSetCurrent(p->ctx) != 0) {
				free(args);
				p->result = 999;
				break;
			}
			p->result = cuLaunchKernel(p->function,
					p->gridDimX, p->gridDimY, p->gridDimZ,
					p->blockDimX, p->blockDimY, p->blockDimZ,
					p->bytecount, 0, args, 0);
			free(args);
			break;
		}
		case CUEVENTCREATE: {
			p->result = cuEventCreate(&p->event1, p->flags);
			break;
		}
		case CUEVENTDESTROY: {
			p->result = cuEventDestroy(p->event1);
			break;
		}
		case CUEVENTRECORD: {
			p->result = cuEventRecord(p->event1, p->stream);
			break;
		}
		case CUEVENTSYNCHRONIZE: {
			p->result = cuEventSynchronize(p->event1);
			break;
		}
		case CUEVENTELAPSEDTIME: {
			p->result = cuEventElapsedTime(&p->pMilliseconds, p->event1, p->event2);
			break;
		}
		case CUSTREAMCREATE: {
			p->result =  cuStreamCreate(&p->stream, 0);
			break;
		}
		case CUSTREAMSYNCHRONIZE: {
			p->result = cuStreamSynchronize(p->stream);
			break;
		}
		case CUSTREAMQUERY: {
			p->result = cuStreamQuery(p->stream);
			break;
		}
		case CUSTREAMDESTROY: {
			p->result = cuStreamDestroy(p->stream);
			break;
		}

		default:
			printf("Unknown syscall_type\n");
		}
		//single completion point: every request is pushed back exactly once
		virtqueue_push(vq, &elem, 0);
	}
	//notify frontend - trigger virtual interrupt
	virtio_notify(vdev, vq);
	return;
}
// Prices optN options on the GPU using the driver API.
//
// On the first call the kernel source is located, compiled to PTX and loaded
// into `module`; later calls reuse the loaded module.  For every option the
// per-step quantities are precomputed on the host, uploaded to the module
// global "d_OptionData", the kernel "binomialOptionsKernel" is launched with
// one 128-thread block per option, and optN results are read back from the
// module global "d_CallValue" into callValue.
//
//   callValue   - host output array; optN values are written
//   optionData  - host input array of optN option descriptions
//   optN        - number of options; must not exceed MAX_OPTIONS
//   argc, argv  - forwarded to the file lookup and PTX loader
//
// If optN exceeds MAX_OPTIONS the error is reported through checkCudaErrors
// and the function returns without touching callValue.
extern "C" void binomialOptionsGPU(
    real *callValue,
    TOptionData  *optionData,
    int optN,
    int argc,
    char **argv
)
{
    if (!moduleLoaded) {
      kernel_file = sdkFindFilePath("binomialOptions_kernel.cu", argv[0]);
      compileFileToPTX(kernel_file, 0, NULL, &ptx, &ptxSize);
      module = loadPTX(ptx, argc, argv);
      moduleLoaded = true;
    }

    // The host staging array below holds MAX_OPTIONS entries; a larger optN
    // would write past its end, so reject it before filling anything in.
    if (optN > MAX_OPTIONS)
    {
        checkCudaErrors(CUDA_ERROR_INVALID_VALUE);
        return;
    }

    __TOptionData h_OptionData[MAX_OPTIONS];

    for (int i = 0; i < optN; i++)
    {
        const real      T = optionData[i].T;
        const real      R = optionData[i].R;
        const real      V = optionData[i].V;

        const real     dt = T / (real)NUM_STEPS;
        const real    vDt = V * sqrt(dt);
        const real    rDt = R * dt;
        //Per-step interest and discount factors
        const real     If = exp(rDt);
        const real     Df = exp(-rDt);
        //Values and pseudoprobabilities of upward and downward moves
        const real      u = exp(vDt);
        const real      d = exp(-vDt);
        const real     pu = (If - d) / (u - d);
        const real     pd = (real)1.0 - pu;
        const real puByDf = pu * Df;
        const real pdByDf = pd * Df;

        h_OptionData[i].S      = (real)optionData[i].S;
        h_OptionData[i].X      = (real)optionData[i].X;
        h_OptionData[i].vDt    = (real)vDt;
        h_OptionData[i].puByDf = (real)puByDf;
        h_OptionData[i].pdByDf = (real)pdByDf;
    }

    CUfunction kernel_addr;
    checkCudaErrors(cuModuleGetFunction(&kernel_addr, module, "binomialOptionsKernel"));

    // Upload only the optN entries that were filled in above
    CUdeviceptr d_OptionData;
    checkCudaErrors(cuModuleGetGlobal(&d_OptionData, NULL, module, "d_OptionData"));
    checkCudaErrors(cuMemcpyHtoD(d_OptionData, h_OptionData, optN * sizeof(__TOptionData)));

    // One block per option, 128 threads per block
    dim3 cudaBlockSize(128,1,1);
    dim3 cudaGridSize(optN, 1, 1);

    // The kernel takes no launch arguments; it works on module globals
    checkCudaErrors(cuLaunchKernel(kernel_addr,
                                            cudaGridSize.x, cudaGridSize.y, cudaGridSize.z, /* grid dim */
                                            cudaBlockSize.x, cudaBlockSize.y, cudaBlockSize.z, /* block dim */
                                            0,0, /* shared mem, stream */
                                            NULL, /* arguments */
                                            0));

    // Wait for the kernel before reading its output
    checkCudaErrors(cuCtxSynchronize());

    CUdeviceptr d_CallValue;
    checkCudaErrors(cuModuleGetGlobal(&d_CallValue, NULL, module, "d_CallValue"));
    checkCudaErrors(cuMemcpyDtoH(callValue, d_CallValue, optN *sizeof(real)));
}
/* Recovers buffers on the GPU.
 *
 * buffers      - host buffer region; n*buf_size bytes are uploaded (non-mmap
 *                build) and recover_last*buf_size bytes are written back
 *                starting at byte offset n*buf_size
 * buf_size     - size in bytes of one buffer
 * buf_ids      - buffer ids; entries [0, n) select the rows used to build
 *                the matrix, entries [n, n+recover_last) name the buffers to
 *                recover and must each be < n
 * recover_last - number of buffers to recover
 * c            - context whose acc_context holds the GPU state
 *
 * Returns GIB_SUC on success, GIB_ERR if a requested id is >= n, or the
 * result of gib_cpu_recover() when the non-mmap build falls back to the CPU
 * because buf_size exceeds gib_buf_size.  The CUDA context pushed on entry
 * is popped again on every return path.
 */
int gib_recover ( void *buffers, int buf_size, int *buf_ids, int recover_last,
		  gib_context c ) {
  ERROR_CHECK_FAIL(cuCtxPushCurrent(((gpu_context)(c->acc_context))->pCtx));
#if !GIB_USE_MMAP
  /* Too large for the preallocated device buffers: use the CPU path */
  if (buf_size > gib_buf_size) {
    int rc = gib_cpu_recover(buffers, buf_size, buf_ids, recover_last, c);
    ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
    return rc;
  }
#endif

  int i, j;
  int n = c->n;
  int m = c->m;
  unsigned char A[128*128], inv[128*128], modA[128*128];
  for (i = n; i < n+recover_last; i++)
    if (buf_ids[i] >= n) {
      fprintf(stderr, "Attempting to recover a parity buffer, not allowed\n");
      /* Balance the push above before bailing out */
      ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
      return GIB_ERR;
    }

  gib_galois_gen_A(A, m+n, n);

  /* Modify the matrix to have the failed drives reflected */
  for (i = 0; i < n; i++) 
    for (j = 0; j < n; j++) 
      modA[i*n+j] = A[buf_ids[i]*n+j];

  gib_galois_gaussian_elim(modA, inv, n, n);

  /* Copy row buf_ids[i] into row i */
  for (i = n; i < n+recover_last; i++)
    for (j = 0; j < n; j++)
      modA[i*n+j] = inv[buf_ids[i]*n+j];

  /* Grid size is buf_size divided by fetch_size, rounded up */
  int nthreads_per_block = 128;
  int fetch_size = sizeof(int)*nthreads_per_block;
  int nblocks = (buf_size + fetch_size - 1)/fetch_size;
  gpu_context gpu_c = (gpu_context) c->acc_context;

  /* Upload the rows that follow the first n*n entries of modA */
  CUdeviceptr F_d;
  ERROR_CHECK_FAIL(cuModuleGetGlobal(&F_d, NULL, gpu_c->module, "F_d"));
  ERROR_CHECK_FAIL(cuMemcpyHtoD(F_d, modA+n*n, (c->m)*(c->n)));

#if !GIB_USE_MMAP
  ERROR_CHECK_FAIL(cuMemcpyHtoD(gpu_c->buffers, buffers, (c->n)*buf_size));
#endif
  ERROR_CHECK_FAIL(cuFuncSetBlockShape(gpu_c->recover, nthreads_per_block, 
				       1, 1));
  /* Kernel parameters, in order: buffer pointer, buf_size, recover_last */
  int offset = 0;
  void *ptr;
#if GIB_USE_MMAP
  CUdeviceptr cpu_buffers;
  ERROR_CHECK_FAIL(cuMemHostGetDevicePointer(&cpu_buffers, buffers, 0));
  ptr = (void *)cpu_buffers;
#else
  ptr = (void *)gpu_c->buffers;
#endif
  ERROR_CHECK_FAIL(cuParamSetv(gpu_c->recover, offset, &ptr, sizeof(ptr)));
  offset += sizeof(ptr);
  ERROR_CHECK_FAIL(cuParamSetv(gpu_c->recover, offset, &buf_size, 
			       sizeof(buf_size)));
  offset += sizeof(buf_size);
  ERROR_CHECK_FAIL(cuParamSetv(gpu_c->recover, offset, &recover_last, 
			       sizeof(recover_last)));
  offset += sizeof(recover_last);
  ERROR_CHECK_FAIL(cuParamSetSize(gpu_c->recover, offset));
  ERROR_CHECK_FAIL(cuLaunchGrid(gpu_c->recover, nblocks, 1));
#if !GIB_USE_MMAP
  /* Fetch the recovered buffers, which follow the first n buffers */
  CUdeviceptr tmp_d = gpu_c->buffers + c->n*buf_size;
  void *tmp_h = (void *)((unsigned char *)(buffers) + c->n*buf_size);
  ERROR_CHECK_FAIL(cuMemcpyDtoH(tmp_h, tmp_d, recover_last*buf_size));
#else
  /* No copy-back in this build; wait for the kernel and surface its errors */
  ERROR_CHECK_FAIL(cuCtxSynchronize());
#endif
  ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
  return GIB_SUC;
}
/* Generates the m extra buffers for n input buffers on the GPU.
 *
 * buffers  - host buffer region; n*buf_size bytes are uploaded (non-mmap
 *            build) and m*buf_size bytes are written back starting at byte
 *            offset n*buf_size
 * buf_size - size in bytes of one buffer
 * c        - context whose acc_context holds the GPU state
 *
 * Returns GIB_SUC, or the result of gib_generate_nc() when the non-mmap
 * build splits the work because buf_size exceeds gib_buf_size.
 */
int gib_generate ( void *buffers, int buf_size, gib_context c ) {
  /* Make this context's CUDA context current for the duration of the call */
  ERROR_CHECK_FAIL(cuCtxPushCurrent(((gpu_context)(c->acc_context))->pCtx));
  /* Do it all at once if the buffers are small enough */
#if !GIB_USE_MMAP
  /* This is too large to do at once in the GPU memory we have allocated.
   * Split it into several noncontiguous jobs. 
   */
  if (buf_size > gib_buf_size) {
    int rc = gib_generate_nc(buffers, buf_size, buf_size, c);
    ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
    return rc;
  }
#endif

  /* Grid size is buf_size divided by fetch_size, rounded up */
  int nthreads_per_block = 128;
  int fetch_size = sizeof(int)*nthreads_per_block;
  int nblocks = (buf_size + fetch_size - 1)/fetch_size;
  gpu_context gpu_c = (gpu_context) c->acc_context;
  
  /* Build the m-by-n matrix on the host and upload it to the module global */
  unsigned char F[256*256];
  gib_galois_gen_F(F, c->m, c->n);
  CUdeviceptr F_d;
  ERROR_CHECK_FAIL(cuModuleGetGlobal(&F_d, NULL, gpu_c->module, "F_d"));
  ERROR_CHECK_FAIL(cuMemcpyHtoD(F_d, F, (c->m)*(c->n)));
  
#if !GIB_USE_MMAP
  /* Copy the buffers to memory */
  ERROR_CHECK_FAIL(cuMemcpyHtoD(gpu_c->buffers, buffers, 
				(c->n)*buf_size));
#endif
  /* Configure and launch */
  ERROR_CHECK_FAIL(cuFuncSetBlockShape(gpu_c->checksum, nthreads_per_block,
				       1, 1));
  /* Kernel parameters, in order: buffer pointer, buf_size */
  int offset = 0;
  void *ptr;
#if GIB_USE_MMAP
  /* Use the device-side address of the host buffers */
  CUdeviceptr cpu_buffers;
  ERROR_CHECK_FAIL(cuMemHostGetDevicePointer(&cpu_buffers, buffers, 0));
  ptr = (void *)cpu_buffers;
#else
  ptr = (void *)(gpu_c->buffers);
#endif
  ERROR_CHECK_FAIL(cuParamSetv(gpu_c->checksum, offset, &ptr, sizeof(ptr)));
  offset += sizeof(ptr);
  ERROR_CHECK_FAIL(cuParamSetv(gpu_c->checksum, offset, &buf_size,
			       sizeof(buf_size)));
  offset += sizeof(buf_size);
  ERROR_CHECK_FAIL(cuParamSetSize(gpu_c->checksum, offset));
  ERROR_CHECK_FAIL(cuLaunchGrid(gpu_c->checksum, nblocks, 1));

  /* Get the results back */
#if !GIB_USE_MMAP
  /* The m generated buffers follow the first n buffers on both sides */
  CUdeviceptr tmp_d = gpu_c->buffers + c->n*buf_size;
  void *tmp_h = (void *)((unsigned char *)(buffers) + c->n*buf_size);
  ERROR_CHECK_FAIL(cuMemcpyDtoH(tmp_h, tmp_d, (c->m)*buf_size));
#else
  /* No copy-back in this build; wait for the kernel to finish */
  ERROR_CHECK_FAIL(cuCtxSynchronize());
#endif
  ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
  return GIB_SUC; 
}
/* Initializes the CPU and GPU runtimes.
 *
 * n, m - coding parameters; both must be at least 2.  The device buffer
 *        allocated in the non-mmap build holds (n+m)*gib_buf_size bytes.
 * c    - receives the context set up by gib_cpu_init(); its acc_context is
 *        pointed at a newly allocated GPU context.
 *
 * Environment:
 *   GIB_GPU_ID    - optional device ordinal (default 0); honoured on every
 *                   call and validated against the device count
 *   GIB_CACHE_DIR - required directory holding the generated PTX files
 *
 * The PTX file "<GIB_CACHE_DIR>/gib_cuda_<n>+<m>.ptx" is compiled in a child
 * process if it cannot be opened, then loaded as a module.  The lookup
 * tables and the matrix are uploaded to the module globals "gf_log_d",
 * "gf_ilog_d" and "F_d".  The created CUDA context is popped before
 * returning.
 *
 * Returns GIB_SUC.  Every failure detected here terminates the process.
 */
int gib_init ( int n, int m, gib_context *c ) {
  static CUcontext pCtx;
  static CUdevice dev;
  if (m < 2 || n < 2) {
    fprintf(stderr, 
	    "It makes little sense to use Reed-Solomon coding when n or m is\n"
	    "less than two.  Use XOR or replication instead.\n");
    exit(1);
  }
  int rc_i = gib_cpu_init(n,m,c);
  if (rc_i != GIB_SUC) {
    fprintf(stderr, "gib_cpu_init returned %i\n", rc_i);
    exit(EXIT_FAILURE);
  }

  if (!cudaInitialized) {
    /* Initialize the CUDA runtime (once per process) */
    ERROR_CHECK_FAIL(cuInit(0));
  }

  /* Select the device on every call so that later contexts also honour
   * GIB_GPU_ID rather than silently falling back to device 0.
   */
  int gpu_id = 0;
  int device_count;
  ERROR_CHECK_FAIL(cuDeviceGetCount(&device_count));
  const char *gpu_id_env = getenv("GIB_GPU_ID");
  if (gpu_id_env != NULL) {
    gpu_id = atoi(gpu_id_env);
    if (gpu_id < 0 || device_count <= gpu_id) {
      fprintf(stderr,
              "GIB_GPU_ID is set to an invalid value (%i).  There are \n"
              "only %i GPUs in the system.  Please specify another \n"
              "value.\n", gpu_id, device_count);
      exit(-1);
    }
  }
  cudaInitialized = 1;

  ERROR_CHECK_FAIL(cuDeviceGet(&dev, gpu_id));
#if GIB_USE_MMAP
    ERROR_CHECK_FAIL(cuCtxCreate(&pCtx, CU_CTX_MAP_HOST, dev));	
#else
    ERROR_CHECK_FAIL(cuCtxCreate(&pCtx, 0, dev));
#endif

  /* Initialize the Gibraltar context */
  gpu_context gpu_c = (gpu_context) malloc(sizeof(struct gpu_context_t));
  if (gpu_c == NULL) {
    perror("malloc");
    exit(EXIT_FAILURE);
  }
  gpu_c->dev = dev;
  gpu_c->pCtx = pCtx;
  (*c)->acc_context = (void *)gpu_c;
	
  /* Determine whether the PTX has been generated or not by attempting to
   * open it read-only.
   */
  const char *cache_dir = getenv("GIB_CACHE_DIR");
  if (cache_dir == NULL) {
    fprintf(stderr, 
	   "Your environment is not completely set.  Please indicate a \n"
	   "directory where generated files may be placed with the \n"
	   "GIB_CACHE_DIR environment variable.  This directory should\n"
	   "not be publicly accessible and should exist.\n");
    exit(-1);
  }

  /* Try to open the appropriate ptx file.  If it doesn't exist, compile a
   * new one.
   */
  /* Directory + fixed part of the name + decimal digits of n and m + NUL */
  int filename_len = strlen(cache_dir) + 
    strlen("/gib_cuda_+.ptx") + log10(n)+1 + log10(m)+1 + 1;
  char *filename = (char *)malloc(filename_len);
  if (filename == NULL) {
    perror("malloc");
    exit(EXIT_FAILURE);
  }
  sprintf(filename, "%s/gib_cuda_%i+%i.ptx", cache_dir, n, m);

  FILE *fp = fopen(filename, "r");
  if (fp == NULL) {
    /* Compile the ptx and open it */
    int pid = fork();
    if (pid == -1) {
      perror("Forking for nvcc");
      exit(-1);
    }
    if (pid == 0) {
      gib_cuda_compile(n, m, filename); /* never returns */
    }
    /* Wait for the compiler child specifically, not for any child the host
     * application may have started.
     */
    int status;
    if (waitpid(pid, &status, 0) == -1) {
      perror("Waiting for nvcc");
      exit(-1);
    }
    if (status != 0) {
      printf("Waiting for the compiler failed.\n");
      printf("The exit status was %i\n", WEXITSTATUS(status));
      printf("The child did%s exit normally.\n", 
	     (WIFEXITED(status)) ? "" : " NOT");
      
      exit(-1);
    }
    fp = fopen(filename, "r");
    if (fp == NULL) {
      perror(filename);
      exit(-1);
    }
  }
  fclose(fp);

  /* If we got here, the ptx file exists.  Use it. */
  ERROR_CHECK_FAIL(cuModuleLoad(&(gpu_c->module), filename));
  ERROR_CHECK_FAIL(cuModuleGetFunction(&(gpu_c->checksum), (gpu_c->module), 
	       "_Z14gib_checksum_dP11shmem_bytesi"));
  ERROR_CHECK_FAIL(cuModuleGetFunction(&(gpu_c->recover),
	       (gpu_c->module), 
	       "_Z13gib_recover_dP11shmem_bytesii"));
	
  /* Initialize the math libraries */
  gib_galois_init();
  unsigned char F[256*256];
  gib_galois_gen_F(F, m, n);

  /* Initialize/Allocate GPU-side structures */
  CUdeviceptr log_d, ilog_d, F_d;
  ERROR_CHECK_FAIL(cuModuleGetGlobal(&log_d, NULL, gpu_c->module, "gf_log_d"));
  ERROR_CHECK_FAIL(cuMemcpyHtoD(log_d, gib_gf_log, 256));
  ERROR_CHECK_FAIL(cuModuleGetGlobal(&ilog_d, NULL, gpu_c->module, 
				     "gf_ilog_d"));
  ERROR_CHECK_FAIL(cuMemcpyHtoD(ilog_d, gib_gf_ilog, 256));
  ERROR_CHECK_FAIL(cuModuleGetGlobal(&F_d, NULL, gpu_c->module, "F_d"));
  ERROR_CHECK_FAIL(cuMemcpyHtoD(F_d, F, m*n));
#if !GIB_USE_MMAP
  ERROR_CHECK_FAIL(cuMemAlloc(&(gpu_c->buffers), (n+m)*gib_buf_size));
#endif
  /* Leave no context current; the other entry points push it when needed */
  ERROR_CHECK_FAIL(cuCtxPopCurrent((&gpu_c->pCtx)));
  free(filename);
  return GIB_SUC;
}