void GPUInterface::GetDeviceDescription(int deviceNumber,
                                        char* deviceDescription) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\t\t\tEntering GPUInterface::GetDeviceDescription\n");
#endif

    CUdevice tmpCudaDevice;

    SAFE_CUDA(cuDeviceGet(&tmpCudaDevice, (*resourceMap)[deviceNumber]));

#if CUDA_VERSION >= 3020
    size_t totalGlobalMemory = 0;
#else
    unsigned int totalGlobalMemory = 0;
#endif
    int clockSpeed = 0;
    int mpCount = 0;
    int major = 0;
    int minor = 0;

    SAFE_CUDA(cuDeviceComputeCapability(&major, &minor, tmpCudaDevice));
    SAFE_CUDA(cuDeviceTotalMem(&totalGlobalMemory, tmpCudaDevice));
    SAFE_CUDA(cuDeviceGetAttribute(&clockSpeed, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, tmpCudaDevice));
    SAFE_CUDA(cuDeviceGetAttribute(&mpCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, tmpCudaDevice));

    sprintf(deviceDescription,
            "Global memory (MB): %d | Clock speed (Ghz): %1.2f | Number of cores: %d",
            int(totalGlobalMemory / 1024.0 / 1024.0 + 0.5),
            clockSpeed / 1000000.0,
            nGpuArchCoresPerSM[major] * mpCount);

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\t\t\tLeaving  GPUInterface::GetDeviceDescription\n");
#endif
}
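The nGpuArchCoresPerSM table used above is defined elsewhere in the library; a minimal sketch of such a cores-per-SM lookup, indexed by compute-capability major version, might look like the following (illustrative, commonly published figures, not the library's actual definition):

/* Hypothetical sketch of the cores-per-SM table assumed above; indexed by
   compute capability major version, as in nGpuArchCoresPerSM[major]. */
static const int nGpuArchCoresPerSM[] = {
    0,    /* index 0 unused: major versions start at 1 */
    8,    /* 1.x (Tesla)                               */
    32,   /* 2.x (Fermi; 2.1 parts actually have 48)   */
    192,  /* 3.x (Kepler)                              */
    0,    /* 4.x (does not exist)                      */
    128   /* 5.x (Maxwell)                             */
};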
Example #2
static int
get_all_attributes(CUdevice c){
	int attr,n;

	for(n = 1 ; n <= CU_DEVICE_ATTRIBUTE_ECC_ENABLED ; ++n){ /* CUdevice_attribute values start at 1 */
		CUresult cerr;

		if( (cerr = cuDeviceGetAttribute(&attr,n,c)) ){
			fprintf(stderr,"Error acquiring device attr %d (%d)\n",n,cerr);
			return -1;
		}
		printf("Device attribute %d: %d\n",n,attr);
	}
	while(n <= CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID){
		CUresult cerr;

		if( (cerr = cuDeviceGetAttribute(&attr,n,c)) ){
			fprintf(stderr,"Error acquiring device attr %d (%d)\n",n,cerr);
			return -1;
		}
		printf("Device attribute %d: 0x%04x\n",n,attr);
		++n;
	}
	return 0;
}
Example #3
        /// Returns device compute capability as a tuple of major and minor version numbers.
        std::tuple<int, int> compute_capability() const {
            int major, minor;

            cuda_check( cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, d) );
            cuda_check( cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, d) );

            return std::make_tuple(major, minor);
        }
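A possible call site for the helper above (a sketch; dev is assumed to be an instance of the surrounding device wrapper class, which is not shown in this snippet):

int major = 0, minor = 0;
std::tie(major, minor) = dev.compute_capability();               // requires <tuple>
std::cout << "compute capability " << major << "." << minor << "\n";  // requires <iostream>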
Example #4
bool GPUInterface::GetSupportsDoublePrecision(int deviceNumber) {
    CUdevice tmpCudaDevice;
    SAFE_CUDA(cuDeviceGet(&tmpCudaDevice, (*resourceMap)[deviceNumber]));

    int major = 0;
    int minor = 0;
    SAFE_CUDA(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, tmpCudaDevice));
    SAFE_CUDA(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, tmpCudaDevice));
    return (major >= 2 || (major >= 1 && minor >= 3));
}
Example #5
CUDADevice::CUDADevice(const CUdevice device_number)
  : m_cuda_device_number(device_number)
{
    char device_name[256];
    cuDeviceGetName(device_name, 256, m_cuda_device_number);
    m_name = device_name;

    cuDeviceComputeCapability(
        &m_compute_capability.first,
        &m_compute_capability.second,
        m_cuda_device_number);

    cuDeviceGetAttribute(&m_compute_mode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, m_cuda_device_number);

    cuDeviceTotalMem(&m_total_mem, m_cuda_device_number);

    cuDeviceGetAttribute(&m_max_threads_per_block, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, m_cuda_device_number);

    cuDeviceGetAttribute(&m_max_block_dim_x, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, m_cuda_device_number);
    cuDeviceGetAttribute(&m_max_block_dim_y, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, m_cuda_device_number);
    cuDeviceGetAttribute(&m_max_block_dim_z, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, m_cuda_device_number);

    cuDeviceGetAttribute(&m_max_grid_dim_x, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, m_cuda_device_number);
    cuDeviceGetAttribute(&m_max_grid_dim_y, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, m_cuda_device_number);
    cuDeviceGetAttribute(&m_max_grid_dim_z, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, m_cuda_device_number);

    cuDeviceGetAttribute(&m_max_registers, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, m_cuda_device_number);
}
Example #6
static CUresult get_cc(CUdevice dev, int *maj, int *min) {
#if CUDA_VERSION < 6500
  return cuDeviceComputeCapability(maj, min, dev);
#else
  CUresult lerr;
  lerr = cuDeviceGetAttribute(maj,
                              CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                              dev);
  if (lerr != CUDA_SUCCESS)
    return lerr;
  return cuDeviceGetAttribute(min,
                              CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
                              dev);
#endif
}
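A minimal caller sketch for get_cc(); dev is assumed to be a valid CUdevice obtained from cuDeviceGet:

int maj = 0, min = 0;
if (get_cc(dev, &maj, &min) != CUDA_SUCCESS) {
  fprintf(stderr, "could not query compute capability\n");
} else {
  printf("compute capability %d.%d\n", maj, min);
}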
Example #7
#include <cstdio>
#include <cstddef>
#include <cuda.h>

int
main()
{
  CUresult result;
  result = cuInit(0);
  CUdevice device;
  result = cuDeviceGet(&device, 0);
  CUcontext ctx;
  result = cuCtxCreate(&ctx, 0, device);
  CUmodule module;
  result = cuModuleLoad(&module, "cuda-shift-throughput.cubin");
  CUfunction kernel;
  result = cuModuleGetFunction(&kernel, module, "kernel");
  int block;
  result = cuFuncGetAttribute(&block,
                              CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                              kernel);
  int grid = 1024 * 1024;
  CUevent event[2];
  for (ptrdiff_t i = 0; i < 2; ++i) {
    result = cuEventCreate(&event[i], 0);
  }
  result = cuEventRecord(event[0], 0);
  result = cuLaunchKernel(kernel, grid, 1, 1, block, 1, 1, 0, 0, 0, 0);
  result = cuEventRecord(event[1], 0);
  result = cuEventSynchronize(event[1]);
  float time;
  result = cuEventElapsedTime(&time, event[0], event[1]);
  int gpuclock;
  result =
    cuDeviceGetAttribute(&gpuclock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device);
  int gpump;
  result =
    cuDeviceGetAttribute(&gpump, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
                         device);
  std::printf("Clock: %d KHz, # of MPs: %d\n", gpuclock, gpump);
  std::printf("Elapsed Time: %f milliseconds\n", time);
  std::printf("# of Threads: %d, # of SHLs : %lld\n", block,
              1024ll * block * grid);
  std::printf("Throughput: %f\n",
              1024.0 * block * grid / ((double) gpump * gpuclock * time));
  for (ptrdiff_t i = 0; i < 2; ++i) {
    result = cuEventDestroy(event[i]);
  }
  result = cuModuleUnload(module);
  result = cuCtxDestroy(ctx);
  return 0;
}
Example #8
void printout_devices( )
{
  int ndevices;
  cuDeviceGetCount( &ndevices );
  for( int idevice = 0; idevice < ndevices; idevice++ )
    {
      char name[200];
#if CUDA_VERSION > 3010 
      size_t totalMem;
#else
      unsigned int totalMem;
#endif

      int clock;
      CUdevice dev;

      cuDeviceGet( &dev, idevice );
      cuDeviceGetName( name, sizeof(name), dev );
      cuDeviceTotalMem( &totalMem, dev );
      cuDeviceGetAttribute( &clock,
                            CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev );
      printf( "device %d: %s, %.1f MHz clock, %.1f MB memory\n",
              idevice, name, clock/1000.f, totalMem/1024.f/1024.f );
    }
}
Example #9
static void *do_init(CUdevice dev, int flags, int *ret) {
    cuda_context *res;
    CUcontext ctx;
    unsigned int fl = CU_CTX_SCHED_AUTO;
    int i;

    CHKFAIL(NULL);
    if (flags & GA_CTX_SINGLE_THREAD)
      fl = CU_CTX_SCHED_SPIN;
    if (flags & GA_CTX_MULTI_THREAD)
      fl = CU_CTX_SCHED_YIELD;
    err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
    CHKFAIL(NULL);
    if (i != 1)
      FAIL(NULL, GA_UNSUPPORTED_ERROR);
    err = cuCtxCreate(&ctx, fl, dev);
    CHKFAIL(NULL);
    res = cuda_make_ctx(ctx, 0);
    if (res == NULL) {
      cuCtxDestroy(ctx);
      FAIL(NULL, GA_IMPL_ERROR);
    }
    res->flags |= flags;
    /* Don't leave the context on the thread stack */
    cuCtxPopCurrent(NULL);

    return res;
}
Example #10
template <class T>
inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
{
    CUresult error_result = cuDeviceGetAttribute(attribute, device_attribute, device);
    if (error_result != CUDA_SUCCESS) {
        shrLog("cuDeviceGetAttribute returned %d\n-> %s\n", (int)error_result, getCudaDrvErrorString(error_result));
        exit(0);
    }
}
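A possible call site for the template helper above (a sketch; the attribute and variable names are illustrative, and device is assumed to be a handle obtained from cuDeviceGet):

int memoryClockKHz = 0;
getCudaAttribute<int>(&memoryClockKHz, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device);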
Example #11
static int
_gaspi_find_GPU_numa_node(int cudevice)
{
  CUresult cres;
  int domain, bus, dev;
  char path[128];
  FILE *sysfile = NULL;

  domain = 0;

#ifdef CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID
  cres = cuDeviceGetAttribute(&domain, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, cudevice);
  if( CUDA_SUCCESS != cres )
    {
      errno = ENOSYS;
      return -1;
    }
#endif

  cres = cuDeviceGetAttribute(&bus, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, cudevice);
  if( CUDA_SUCCESS != cres )
    {
      return -1;
    }

  cres = cuDeviceGetAttribute(&dev, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, cudevice);
  if( CUDA_SUCCESS != cres )
    {
      return -1;
    }

  sprintf(path, "/sys/bus/pci/devices/%04x:%02x:%02x.0/numa_node", domain, bus, dev);
  sysfile = fopen(path, "r");
  if( !sysfile )
    {
      gaspi_print_error("Failed to open %s.", path);
      return -1;
    }

  int numa_node = -1;
  fscanf (sysfile, "%1d", &numa_node);
  fclose(sysfile);

  return numa_node;
}
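For reference, the sysfs path assembled by the sprintf above has the following shape (illustrative values); the file holds a single integer, and the kernel reports -1 when the NUMA node is unknown:

/* Illustrative example of the path built above:
 *   domain=0x0000, bus=0x02, dev=0x00  ->  /sys/bus/pci/devices/0000:02:00.0/numa_node
 * The file contains a single integer, e.g. "0", or "-1" if the node is unknown. */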
Example #12
void getBestDevice(){
  int num_devices;
  int status;
  int i;
  CUdevice temp_device;
  int curr_multiprocessors;
  int max_multiprocessors = -1;
  int max_i = -1;
  
  status = cuDeviceGetCount(&num_devices);   
  if (CUDA_SUCCESS != status) 
  {
    printf("error in cuDeviceGetCount\n");
  }
  for(i = 0; i < num_devices; ++i){
    status = cuDeviceGet(&temp_device, i);
    if (CUDA_SUCCESS != status) 
    {
      printf("error in cuDeviceGet\n");
    }
    status = cuDeviceGetAttribute(&curr_multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, temp_device);    
    if (CUDA_SUCCESS != status) 
    {
      printf("error in cuDeviceGetAttribute CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT\n");
    }
    if(curr_multiprocessors > max_multiprocessors)
    {
      max_multiprocessors = curr_multiprocessors;
      max_i = i;
    }
  }

  status = cuDeviceGet(&cuDevice, max_i); 
  if (CUDA_SUCCESS != status) 
  {
    printf("error in cuDeviceGetName\n");
  }
  status = cuDeviceGetAttribute(&maxGridDim, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, cuDevice);    
  if (CUDA_SUCCESS != status) 
  {
    printf("error in cuDeviceGetAttribute CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X\n");
  }
  numMultiProcessors = max_multiprocessors;
}
Example #13
template <class T>
inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
{
    CUresult error = cuDeviceGetAttribute(attribute, device_attribute, device);

    if( CUDA_SUCCESS != error) {
        fprintf(stderr, "cuSafeCallNoSync() Driver API error = %04d from file <%s>, line %i.\n",
                error, __FILE__, __LINE__);
        exit(-1);
    }
}
Example #14
  int device_t<CUDA>::simdWidth(){
    if(simdWidth_)
      return simdWidth_;

    OCCA_EXTRACT_DATA(CUDA, Device);

    OCCA_CUDA_CHECK("Device: Get Warp Size",
                    cuDeviceGetAttribute(&simdWidth_,
                                         CU_DEVICE_ATTRIBUTE_WARP_SIZE,
                                         data_.device) );

    return simdWidth_;
  }
Example #15
void getBestDevice(JNIEnv *env){
  int num_devices;
  int status;
  int i;
  CUdevice temp_device;
  int curr_multiprocessors;
  int max_multiprocessors = -1;
  int max_i = -1;
  
  status = cuDeviceGetCount(&num_devices);
  CHECK_STATUS(env,"error in cuDeviceGetCount",status)
          
  if(num_devices == 0)
      throw_cuda_errror_exception(env,"0 Cuda Devices were found",0);
  
  for(i = 0; i < num_devices; ++i){
    status = cuDeviceGet(&temp_device, i);
    CHECK_STATUS(env,"error in cuDeviceGet",status)
            
    status = cuDeviceGetAttribute(&curr_multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, temp_device);    
    CHECK_STATUS(env,"error in cuDeviceGetAttribute",status)
            
    if(curr_multiprocessors > max_multiprocessors)
    {
      max_multiprocessors = curr_multiprocessors;
      max_i = i;
    }
  }

  status = cuDeviceGet(&cuDevice, max_i); 
  CHECK_STATUS(env,"error in cuDeviceGet",status)
          
  status = cuDeviceGetAttribute(&maxGridDim, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, cuDevice);    
  CHECK_STATUS(env,"error in cuDeviceGetAttribute",status)
          
  numMultiProcessors = max_multiprocessors;

}
Example #16
CUdevice CudaModule::selectDevice(void)
{  
  CUresult res = CUDA_SUCCESS;
  
  int numDevices;
  checkError("cuDeviceGetCount", cuDeviceGetCount(&numDevices));

  CUdevice device = 0;
  S32 bestScore = FW_S32_MIN;
  
  for (int i=0; i<numDevices; ++i)
  {
    CUdevice dev;
    checkError("cuDeviceGet", cuDeviceGet(&dev, i));

    int clockRate;
    res = cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
    checkError("cuDeviceGetAttribute", res);

    int numProcessors;
    res = cuDeviceGetAttribute(&numProcessors, 
                               CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
    checkError("cuDeviceGetAttribute", res);
    
    S32 score = clockRate * numProcessors;
    if (score > bestScore)
    {
      device = dev;
      bestScore = score;
    }
  }

  if (bestScore == FW_S32_MIN) {
    fail("No appropriate CUDA device found!");
  }
  
  return device;
}
Example #17
int CudaModule::getDeviceAttribute(CUdevice_attribute attrib)
{
  staticInit();

  if (!s_available) {
    return 0;
  }

  int value;
  checkError( "cuDeviceGetAttribute", 
              cuDeviceGetAttribute(&value, attrib, s_device));
  
  return value;
}
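A hedged usage sketch for the accessor above, assuming it is exposed as a static helper as in the framework this snippet comes from:

int warpSize = CudaModule::getDeviceAttribute(CU_DEVICE_ATTRIBUTE_WARP_SIZE);
int smCount  = CudaModule::getDeviceAttribute(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT);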
Example #18
Vec2i CudaModule::selectGridSize(int numBlocks)
{
  CUresult res = CUDA_SUCCESS;
  int maxWidth;

  res = cuDeviceGetAttribute(&maxWidth, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, s_device);
  checkError("cuDeviceGetAttribute", res);

  Vec2i size(numBlocks, 1);
  while (size.x > maxWidth)
  {
    size.x = (size.x + 1) >> 1;
    size.y <<= 1;
  }
  return size;
}
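Because the loop above folds excess blocks into the y dimension, a kernel launched with this grid must rebuild the flat block index itself; a device-side sketch of the usual pattern (an assumption, since the framework's kernels are not part of this snippet):

__global__ void exampleKernel(int numBlocks /*, ... */)
{
    int blockId = blockIdx.x + blockIdx.y * gridDim.x;  // rebuild the flat index
    if (blockId >= numBlocks)                           // skip the rounded-up tail
        return;
    /* ... per-block work ... */
}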
Example #19
Object cuda_cores(Object self, int nparts, int *argcv,
        Object *argv, int flags) {
    cuInit(0);
    int deviceCount = 0;
    cuDeviceGetCount(&deviceCount);
    if (deviceCount == 0) {
        raiseError("No CUDA devices found");
    }
    CUdevice cuDevice;
    cuDeviceGet(&cuDevice, 0);  /* obtain a device handle before querying attributes */
    int mpcount;
    cuDeviceGetAttribute(&mpcount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
            cuDevice);
    int major, minor;
    cuDeviceComputeCapability(&major, &minor, cuDevice);
    mpcount *= coreMultiplicand(major, minor);
    return alloc_Float64(mpcount);
}
Example #20
template <class T>
inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device)
{
    CUresult error = cuDeviceGetAttribute(attribute, device_attribute, device);

    if (CUDA_SUCCESS != error)
    {
        fprintf(stderr, "cuSafeCallNoSync() Driver API error = %04d from file <%s>, line %i.\n",
                error, __FILE__, __LINE__);

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }
}
Example #21
bool VideoDecoderCUDAPrivate::initCuda()
{
    CUresult result = cuInit(0);
    if (result != CUDA_SUCCESS) {
        available = false;
        qWarning("cuInit(0) faile (%d)", result);
        return false;
    }
    cudev = GetMaxGflopsGraphicsDeviceId();

    int clockRate;
    cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, cudev);
    int major, minor;
    cuDeviceComputeCapability(&major, &minor, cudev);
    char devname[256];
    cuDeviceGetName(devname, 256, cudev);
    description = QString("CUDA device: %1 %2.%3 %4 MHz").arg(devname).arg(major).arg(minor).arg(clockRate/1000);

    //TODO: cuD3DCtxCreate > cuGLCtxCreate > cuCtxCreate
    checkCudaErrors(cuCtxCreate(&cuctx, CU_CTX_SCHED_BLOCKING_SYNC, cudev)); //CU_CTX_SCHED_AUTO?
    CUcontext cuCurrent = NULL;
    result = cuCtxPopCurrent(&cuCurrent);
    if (result != CUDA_SUCCESS) {
        qWarning("cuCtxPopCurrent: %d\n", result);
        return false;
    }
    checkCudaErrors(cuvidCtxLockCreate(&vid_ctx_lock, cuctx));
    {
        AutoCtxLock lock(this, vid_ctx_lock);
        Q_UNUSED(lock);
        //Flags- Parameters for stream creation (must be 0 (CU_STREAM_DEFAULT=0 in cuda5) in cuda 4.2, no CU_STREAM_NON_BLOCKING)
        checkCudaErrors(cuStreamCreate(&stream, 0));//CU_STREAM_NON_BLOCKING)); //CU_STREAM_DEFAULT
        //require compute capability >= 1.1
        //flag: Reserved for future use, must be 0
        //cuStreamAddCallback(stream, CUstreamCallback, this, 0);
    }
    return true;
}
Example #22
int
main (int argc, char **argv)
{
  CUdevice dev;
  CUfunction delay;
  CUmodule module;
  CUresult r;
  CUstream stream;
  unsigned long *a, *d_a, dticks;
  int nbytes;
  float atime, dtime;
  void *kargs[2];
  int clkrate;
  int devnum, nprocs;

  acc_init (acc_device_nvidia);

  devnum = acc_get_device_num (acc_device_nvidia);

  r = cuDeviceGet (&dev, devnum);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
      abort ();
    }

  r =
    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
			  dev);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
      abort ();
    }

  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
      abort ();
    }

  r = cuModuleLoad (&module, "subr.ptx");
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
      abort ();
    }

  r = cuModuleGetFunction (&delay, module, "delay");
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
      abort ();
    }

  nbytes = nprocs * sizeof (unsigned long);

  dtime = 200.0;

  dticks = (unsigned long) (dtime * clkrate);

  a = (unsigned long *) malloc (nbytes);
  d_a = (unsigned long *) acc_malloc (nbytes);

  acc_map_data (a, d_a, nbytes);

  kargs[0] = (void *) &d_a;
  kargs[1] = (void *) &dticks;

  r = cuStreamCreate (&stream, CU_STREAM_DEFAULT);
  if (r != CUDA_SUCCESS)
	{
	  fprintf (stderr, "cuStreamCreate failed: %d\n", r);
	  abort ();
	}

  acc_set_cuda_stream (0, stream);

  init_timers (1);

  start_timer (0);

  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
      abort ();
    }

  acc_wait (1);

  atime = stop_timer (0);

  if (atime < dtime)
    {
      fprintf (stderr, "actual time < delay time\n");
      abort ();
    }

  start_timer (0);

  acc_wait (1);

  atime = stop_timer (0);

  if (0.010 < atime)
    {
      fprintf (stderr, "actual time < delay time\n");
      abort ();
    }

  acc_unmap_data (a);

  fini_timers ();

  free (a);
  acc_free (d_a);

  acc_shutdown (acc_device_nvidia);

  return 0;
}
Example #23
int main(int argc, char *argv[])
{
	char	   *kernel_source;
	char	   *kfunc_names[MAX_KERNEL_FUNCTIONS];
	int			kfunc_index = 0;
	int			target_device = -1;
	long		target_capability = -1;
	int			print_version = 0;
	int			print_devices = 0;
	int			num_devices;
	int			i, opt;
	int			major;
	int			minor;
	CUdevice	device;
	CUcontext	context;
	CUmodule	cuda_module;
	CUresult	rc;

	/* misc initialization */
	cmdname = basename(strdup(argv[0]));
	cuInit(0);
	rc = cuDeviceGetCount(&num_devices);
	if (rc != CUDA_SUCCESS)
		cuda_error(rc, "cuDeviceGetCount");

	while ((opt = getopt(argc, argv, "k:d:c:s:S:vlh")) >= 0)
	{
		switch (opt)
		{
			case 'k':
				if (kfunc_index == MAX_KERNEL_FUNCTIONS)
				{
					fputs("Too much kernel function specified", stderr);
					return 1;
				}
				kfunc_names[kfunc_index++] = strdup(optarg);
				break;
			case 'd':
				if (target_device >= 0)
				{
					fputs("-d is specified twice or more", stderr);
					usage();
				}
				if (target_capability >= 0)
				{
					fputs("-d and -c are exclusive option", stderr);
					usage();
				}
				target_device = atoi(optarg);
				if (target_device < 0 || target_device >= num_devices)
				{
					fprintf(stderr, "invalid device: -d %d\n", target_device);
					usage();
				}
				break;
			case 'c':
				if (target_capability >= 0)
				{
					fputs("-c is specified twice or more", stderr);
					usage();
				}
				if (target_device >= 0)
				{
					fputs("-d and -c are exclusive option", stderr);
					usage();
				}
				if (sscanf(optarg, "%d.%d", &major, &minor) != 2)
				{
					fprintf(stderr, "invalid capability format: -c %s\n",
							optarg);
					usage();
				}
				target_capability = major * 10 + minor;
				break;
			case 's':
				dynamic_shmem_per_thread = atol(optarg);
				if (dynamic_shmem_per_thread < 0)
				{
					fprintf(stderr, "invalid dynamic shmem per thread: %ld\n",
							dynamic_shmem_per_thread);
					usage();
				}
				break;
			case 'S':
				dynamic_shmem_per_block = atol(optarg);
				if (dynamic_shmem_per_block < 0)
				{
					fprintf(stderr, "invalid dynamic shmem per block: %ld",
							dynamic_shmem_per_block);
					usage();
				}
				break;
			case 'v':
				print_version = 1;
				break;
			case 'l':
				print_devices = 1;
				break;
			case 'h':
			default:
				usage();
				break;
		}
	}

	if (optind + 1 != argc)
	{
		if (print_version || print_devices)
		{
			if (print_version)
				print_nvrtc_version();
			if (print_devices)
				print_cuda_devices(num_devices);
			return 0;
		}
		fputs("no kernel source is specified", stderr);
		usage();
	}
	kernel_source = argv[optind];

	if (target_capability < 0)
	{
		CUdevice	dev;

		if (target_device < 0)
			target_device = 0;	/* default device */

		rc = cuDeviceGet(&dev, target_device);
		if (rc != CUDA_SUCCESS)
			cuda_error(rc, "cuDeviceGet");

		rc = cuDeviceGetAttribute(&major,
					CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev);
		if (rc != CUDA_SUCCESS)
			cuda_error(rc, "cuDeviceGetAttribute");
		rc = cuDeviceGetAttribute(&minor,
					CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev);
		if (rc != CUDA_SUCCESS)
			cuda_error(rc, "cuDeviceGetAttribute");

		target_capability = 10 * major + minor;
	}

	if (print_version)
		print_nvrtc_version();
	if (print_devices)
		print_cuda_devices(num_devices);

	/* make a dummy context */
	rc = cuDeviceGet(&device, 0);
	if (rc != CUDA_SUCCESS)
		cuda_error(rc, "cuDeviceGet");
	rc = cuCtxCreate(&context, 0, device);
	if (rc != CUDA_SUCCESS)
		cuda_error(rc, "cuCtxCreate");

	cuda_module = build_kernel_source(kernel_source, target_capability);

	for (i=0; i < kfunc_index; i++)
	{
		if (i > 0)
			putchar('\n');
		print_function_attrs(cuda_module, kfunc_names[i]);
	}

	/* drop a cuda context */
	rc = cuCtxDestroy(context);
	if (rc != CUDA_SUCCESS)
		cuda_error(rc, "cuCtxDestroy");

	return 0;
}
Example #24
int
main (int argc, char **argv)
{
  CUdevice dev;
  CUfunction delay;
  CUmodule module;
  CUresult r;
  CUstream stream;
  unsigned long *a, *d_a, dticks;
  int nbytes;
  float dtime;
  void *kargs[2];
  int clkrate;
  int devnum, nprocs;

  acc_init (acc_device_nvidia);

  devnum = acc_get_device_num (acc_device_nvidia);

  r = cuDeviceGet (&dev, devnum);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
      abort ();
    }

  r =
    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
			  dev);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
      abort ();
    }

  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
      abort ();
    }

  r = cuModuleLoad (&module, "subr.ptx");
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
      abort ();
    }

  r = cuModuleGetFunction (&delay, module, "delay");
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
      abort ();
    }

  nbytes = nprocs * sizeof (unsigned long);

  dtime = 200.0;

  dticks = (unsigned long) (dtime * clkrate);

  a = (unsigned long *) malloc (nbytes);
  d_a = (unsigned long *) acc_malloc (nbytes);

  acc_map_data (a, d_a, nbytes);

  kargs[0] = (void *) &d_a;
  kargs[1] = (void *) &dticks;

  r = cuStreamCreate (&stream, CU_STREAM_DEFAULT);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuStreamCreate failed: %d\n", r);
      abort ();
    }

  if (!acc_set_cuda_stream (0, stream))
    abort ();
    
  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
      abort ();
    }

  if (acc_async_test_all () != 0)
    {
      fprintf (stderr, "asynchronous operation not running\n");
      abort ();
    }

  sleep ((int) (dtime / 1000.f) + 1);

  if (acc_async_test_all () != 1)
    {
      fprintf (stderr, "found asynchronous operation still running\n");
      abort ();
    }

  acc_unmap_data (a);

  free (a);
  acc_free (d_a);

  acc_shutdown (acc_device_nvidia);

  exit (0);
}
Example #25
static int cuda_property(void *c, gpudata *buf, gpukernel *k, int prop_id,
                         void *res) {
  cuda_context *ctx = NULL;
  if (c != NULL) {
    ctx = (cuda_context *)c;
    ASSERT_CTX(ctx);
  } else if (buf != NULL) {
    ASSERT_BUF(buf);
    ctx = buf->ctx;
  } else if (k != NULL) {
    ASSERT_KER(k);
    ctx = k->ctx;
  }
  /* I know that 512 and 1024 are magic numbers.
     There is an indication in buffer.h, though. */
  if (prop_id < 512) {
    if (ctx == NULL)
      return GA_VALUE_ERROR;
  } else if (prop_id < 1024) {
    if (buf == NULL)
      return GA_VALUE_ERROR;
  } else {
    if (k == NULL)
      return GA_VALUE_ERROR;
  }

  switch (prop_id) {
    char *s;
    CUdevice id;
    int i;
    size_t sz;

  case GA_CTX_PROP_DEVNAME:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    /* 256 is what the CUDA API uses so it's good enough for me */
    s = malloc(256);
    if (s == NULL) {
      cuda_exit(ctx);
      return GA_MEMORY_ERROR;
    }
    ctx->err = cuDeviceGetName(s, 256, id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((char **)res) = s;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_MAXLSIZE:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
                                    id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((size_t *)res) = i;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_LMEMSIZE:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
                                    id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((size_t *)res) = i;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_NUMPROCS:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i,
                                    CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
                                    id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((unsigned int *)res) = i;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_MAXGSIZE:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                                    id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((size_t *)res) = i;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_BLAS_OPS:
#ifdef WITH_CUDA_CUBLAS
    *((gpuarray_blas_ops **)res) = &cublas_ops;
    return GA_NO_ERROR;
#else
    *((void **)res) = NULL;
    return GA_DEVSUP_ERROR;
#endif

  case GA_CTX_PROP_BIN_ID:
    *((const char **)res) = ctx->bin_id;
    return GA_NO_ERROR;

  case GA_CTX_PROP_ERRBUF:
    *((gpudata **)res) = ctx->errbuf;
    return GA_NO_ERROR;

  case GA_CTX_PROP_TOTAL_GMEM:
    cuda_enter(ctx);
    ctx->err = cuMemGetInfo(&sz, (size_t *)res);
    cuda_exit(ctx);
    return ctx->err == CUDA_SUCCESS ? GA_NO_ERROR : GA_IMPL_ERROR;

  case GA_CTX_PROP_FREE_GMEM:
    cuda_enter(ctx);
    ctx->err = cuMemGetInfo((size_t *)res, &sz);
    cuda_exit(ctx);
    return ctx->err == CUDA_SUCCESS ? GA_NO_ERROR : GA_IMPL_ERROR;

  case GA_BUFFER_PROP_REFCNT:
    *((unsigned int *)res) = buf->refcnt;
    return GA_NO_ERROR;

  case GA_BUFFER_PROP_SIZE:
    *((size_t *)res) = buf->sz;
    return GA_NO_ERROR;

  case GA_BUFFER_PROP_CTX:
  case GA_KERNEL_PROP_CTX:
    *((void **)res) = (void *)ctx;
    return GA_NO_ERROR;

  case GA_KERNEL_PROP_MAXLSIZE:
    cuda_enter(ctx);
    ctx->err = cuFuncGetAttribute(&i,
                                  CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                                  k->k);
    cuda_exit(ctx);
    if (ctx->err != CUDA_SUCCESS)
      return GA_IMPL_ERROR;
    *((size_t *)res) = i;
    return GA_NO_ERROR;

  case GA_KERNEL_PROP_PREFLSIZE:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_WARP_SIZE, id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    cuda_exit(ctx);
    *((size_t *)res) = i;
    return GA_NO_ERROR;

  case GA_KERNEL_PROP_NUMARGS:
    *((unsigned int *)res) = k->argcount;
    return GA_NO_ERROR;

  case GA_KERNEL_PROP_TYPES:
    *((const int **)res) = k->types;
    return GA_NO_ERROR;

  default:
    return GA_INVALID_ERROR;
  }
}
Example #26
static CUT_THREADPROC dt_thread_func(void *p)
{
	dt_partition *pt = (dt_partition *)p;
	struct timeval tv;
	CUresult res;
	int thread_num_x=0, thread_num_y=0;
	int block_num_x=0, block_num_y=0;

	res = cuCtxSetCurrent(ctx[pt->pid]);
	if(res != CUDA_SUCCESS) {
		printf("cuCtxSetCurrent(ctx[%d]) failed: res = %s\n", pt->pid, cuda_response_to_string(res));
		exit(1);
	}

	/* allocate GPU memory */

	//printf("part_error_array_num = %d\n",part_error_array_num);


	if(pt->pid == 0){
		gettimeofday(&tv_memcpy_start, NULL);
	}

	res = cuMemcpyHtoD(part_C_dev[pt->pid], dst_C, SUM_SIZE_C);
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(part_C_dev) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(part_error_array_dev[pt->pid], part_error_array, part_error_array_num*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(part_error_array_dev) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(pm_size_array_dev[pt->pid], &pt->size_array[0][0], pt->NoP*2*pt->L_MAX*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(pm_size_array_dev) falied: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(def_array_dev[pt->pid], pt->def, sum_size_def_array);
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(def_array_dev) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(numpart_dev[pt->pid], pt->numpart, pt->NoC*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(cuMemcpyHtoD(numpart_dev) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(PIDX_array_dev[pt->pid], pt->dst_PIDX, pt->tmp_array_size);
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(PIDX_array) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(DID_4_array_dev[pt->pid], pt->dst_DID_4, pt->tmp_array_size);
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(DID_4__array) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}


	if(pt->pid == 0){
		gettimeofday(&tv_memcpy_end, NULL);
		tvsub(&tv_memcpy_end, &tv_memcpy_start, &tv);
		time_memcpy += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
	}

	int sharedMemBytes = 0;

	/* get max thread num per block */
	int max_threads_num = 0;
	res = cuDeviceGetAttribute(&max_threads_num, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev[pt->pid]);
	if(res != CUDA_SUCCESS){
		printf("\ncuDeviceGetAttribute() failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	/* prepare for launch inverse_Q */
	void* kernel_args_inverse[] = {
		&part_C_dev[pt->pid],
		&pm_size_array_dev[pt->pid],
		&part_error_array_dev[pt->pid],
		&part_error_array_num,
		(void*)&(pt->NoP),
		&PIDX_array_dev[pt->pid],
		&numpart_dev[pt->pid],
		(void*)&(pt->NoC),
		(void*)&(pt->max_numpart),
		(void*)&(pt->interval),
		(void*)&(pt->L_MAX),
		(void*)&(pt->pid),
		(void*)&(device_num)
	};

	/* define CUDA block shape */
	int upper_limit_th_num_x = max_threads_num/(pt->max_numpart*pt->NoC);
	int upper_limit_th_num_y = max_threads_num/upper_limit_th_num_x;
	if(upper_limit_th_num_x < 1) upper_limit_th_num_x++;
	if(upper_limit_th_num_y < 1) upper_limit_th_num_y++;

	thread_num_x = (pt->max_dim0*pt->max_dim1 < upper_limit_th_num_x) ? (pt->max_dim0*pt->max_dim1) : upper_limit_th_num_x;
	thread_num_y = (pt->max_numpart < upper_limit_th_num_y) ? pt->max_numpart : upper_limit_th_num_y;

	block_num_x = (pt->max_dim0*pt->max_dim1) / thread_num_x;
	block_num_y = (pt->max_numpart) / thread_num_y;
	if((pt->max_dim0*pt->max_dim1) % thread_num_x != 0) block_num_x++;
	if(pt->max_numpart % thread_num_y != 0) block_num_y++;


	int blockDimY = thread_num_y / device_num;
	if(thread_num_y%device_num != 0){
		blockDimY++;
	}

	/* launch inverse_Q */
	if(pt->pid == 0){
		gettimeofday(&tv_kernel_start, NULL);
	}
	res = cuLaunchKernel(
		func_inverse_Q[pt->pid],      // call function
		block_num_x,         // gridDimX
		block_num_y,         // gridDimY
		pt->L_MAX-pt->interval,      // gridDimZ
		thread_num_x,        // blockDimX
		blockDimY,        // blockDimY
		pt->NoC,                 // blockDimZ
		sharedMemBytes,      // sharedMemBytes
		NULL,                // hStream
		kernel_args_inverse, // kernelParams
		NULL                 // extra
		);
	if(res != CUDA_SUCCESS) {
		printf("block_num_x %d, block_num_y %d, thread_num_x %d, thread_num_y %d\n", block_num_x, block_num_y, thread_num_x, thread_num_y);
		printf("cuLaunchKernel(inverse_Q) failed : res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuCtxSynchronize();
	if(res != CUDA_SUCCESS) {
		printf("cuCtxSynchronize(inverse_Q) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	if(pt->pid == 0){
		gettimeofday(&tv_kernel_end, NULL);
		tvsub(&tv_kernel_end, &tv_kernel_start, &tv);
		time_kernel += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
	}


	/* prepare for launch dt1d_x */
	void* kernel_args_x[] = {
		&part_C_dev[pt->pid],                  // FLOAT *src_start
		&tmpM_dev[pt->pid],                    // FLOTA *dst
		&tmpIy_dev[pt->pid],                   // int *ptr
		&DID_4_array_dev[pt->pid],             // int *DID_4_array,
		&def_array_dev[pt->pid],               // FLOAT *def_array,
		&pm_size_array_dev[pt->pid],           // int *size_array
		(void*)&(pt->NoP),                  // int NoP
		&PIDX_array_dev[pt->pid],              // int *PIDX_array
		&part_error_array_dev[pt->pid],        // int *error_array
		(void*)&(part_error_array_num), // int error_array_num
		&numpart_dev[pt->pid],                 // int *numpart
		(void*)&(pt->NoC),                  // int NoC
		(void*)&(pt->max_numpart),          // int max_numpart
		(void*)&(pt->interval),             // int interval
		(void*)&(pt->L_MAX),                 // int L_MAX
		(void*)&(pt->pid),                   // int pid
		(void*)&(device_num)                 // int device_num
	};


	max_threads_num = 64/pt->NoC;
	if(max_threads_num < 1) max_threads_num++;

	thread_num_x = (pt->max_dim1 < max_threads_num) ? pt->max_dim1 : max_threads_num;
	thread_num_y = (pt->max_numpart < max_threads_num) ? pt->max_numpart : max_threads_num;

	block_num_x = pt->max_dim1 / thread_num_x;
	block_num_y = pt->max_numpart / thread_num_y;
	if(pt->max_dim1 % thread_num_x != 0) block_num_x++;
	if(pt->max_numpart % thread_num_y != 0) block_num_y++;

	blockDimY = thread_num_y / device_num;
	if(thread_num_y%device_num != 0){
		blockDimY++;
	}

	/* launch dt1d_x */
	if(pt->pid == 0){
		gettimeofday(&tv_kernel_start, NULL);
	}

	res = cuLaunchKernel(
		func_dt1d_x[pt->pid],    // call function
		block_num_x,    // gridDimX
		block_num_y,    // gridDimY
		pt->L_MAX-pt->interval, // gridDimZ
		thread_num_x,   // blockDimX
		blockDimY,   // blockDimY
		pt->NoC,            // blockDimZ
		sharedMemBytes, // sharedMemBytes
		NULL,           // hStream
		kernel_args_x,  // kernelParams
		NULL            // extra
		);
	if(res != CUDA_SUCCESS) {

		printf("block_num_x %d, block_num_y %d, thread_num_x %d, thread_num_y %d\n", block_num_x, block_num_y, thread_num_x, thread_num_y);

		printf("cuLaunchKernel(dt1d_x) failed : res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuCtxSynchronize();
	if(res != CUDA_SUCCESS) {
		printf("cuCtxSynchronize(dt1d_x) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	if(pt->pid == 0){
		gettimeofday(&tv_kernel_end, NULL);
		tvsub(&tv_kernel_end, &tv_kernel_start, &tv);
		time_kernel += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
	}


	/* prepare for launch dt1d_y */
	void* kernel_args_y[] = {
		&tmpM_dev[pt->pid],                    // FLOAT *src_start
		&M_dev[pt->pid],                       // FLOAT *dst_start
		&tmpIx_dev[pt->pid],                   // int *ptr_start
		&DID_4_array_dev[pt->pid],             // int *DID_4_array,
		&def_array_dev[pt->pid],               // FLOAT *def_array,
		(void*)&(pt->NoP),                  // int NoP
		&pm_size_array_dev[pt->pid],           // int *size_array
		&numpart_dev[pt->pid],                 // int *numpart,
		&PIDX_array_dev[pt->pid],              // int *PIDX_array,
		(void*)&(pt->NoC),                  // int NoC
		(void*)&(pt->max_numpart),          // int max_numpart
		(void*)&(pt->interval),             // int interval
		(void*)&(pt->L_MAX),                // int L_MAX
		&part_error_array_dev[pt->pid],        // int *error_array
		(void*)&(part_error_array_num), // int error_array_num
		(void*)&(pt->pid),                   // int pid
		(void*)&(device_num)                 // int device_num
	};


	thread_num_x = (pt->max_dim0 < max_threads_num) ? pt->max_dim0 : max_threads_num;
	thread_num_y = (pt->max_numpart < max_threads_num) ? pt->max_numpart : max_threads_num;

	block_num_x = pt->max_dim0 / thread_num_x;
	block_num_y = pt->max_numpart / thread_num_y;
	if(pt->max_dim0 % thread_num_x != 0) block_num_x++;
	if(pt->max_numpart % thread_num_y != 0) block_num_y++;

	blockDimY = thread_num_y / device_num;
	if(thread_num_y%device_num != 0){
		blockDimY++;
	}

	/* prepare for launch dt1d_y */
	if(pt->pid == 0){
		gettimeofday(&tv_kernel_start, NULL);
	}

	res = cuLaunchKernel(
		func_dt1d_y[pt->pid],    // call functions
		block_num_x,    // gridDimX
		block_num_y,    // gridDimY
		pt->L_MAX-pt->interval, // gridDimZ
		thread_num_x,   // blockDimX
		blockDimY,   // blockDimY
		pt->NoC,            // blockDimZ
		sharedMemBytes, // sharedMemBytes
		NULL,           // hStream
		kernel_args_y,  // kernelParams
		NULL            // extra
		);
	if(res != CUDA_SUCCESS) {
		printf("cuLaunchKernel(dt1d_y failed : res = %s\n", cuda_response_to_string(res));
		exit(1);
	}


	res = cuCtxSynchronize();
	if(res != CUDA_SUCCESS) {
		printf("cuCtxSynchronize(dt1d_y) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	if(pt->pid == 0){
		gettimeofday(&tv_kernel_end, NULL);
		tvsub(&tv_kernel_end, &tv_kernel_start, &tv);
		time_kernel += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
	}



	/* download data from GPU */

	/* downloads M from GPU */

	int sum_part_size = 0;
	int sum_pointer_size = 0;
	int sum_move_size = 0;
	int part_size = 0;
	int pointer_size = 0;
	int part_y = 0;
	int move_size = 0;
	int start_kk = 0;
	int end_kk = 0;
	int part_end_kk = 0;
	unsigned long long int pointer_dst_M = (unsigned long long int)pt->dst_M;
	unsigned long long int pointer_M_dev = (unsigned long long int)M_dev[pt->pid];

	for(int L=0; L<(pt->L_MAX-pt->interval); L++) {

		/**************************************************************************/
		/* loop condition */
		if( (pt->FSIZE[(L+pt->interval)*2]+2*pt->pady < pt->max_Y) || (pt->FSIZE[(L+pt->interval)*2+1]+2*pt->padx < pt->max_X) )
		{
			continue;
		}
		/* loop condition */
		/**************************************************************************/


		for(int jj=0; jj<pt->NoC; jj++) {

			part_y = pt->numpart[jj] / device_num;
			if(pt->numpart[jj]%device_num != 0){
				part_y++;
			}


			start_kk = part_y * pt->pid;
			end_kk = part_y * (pt->pid + 1);

			if(end_kk > pt->numpart[jj]){
				end_kk = pt->numpart[jj];
			}

			if(pt->pid > 0){
				part_end_kk = part_y * pt->pid;
			}

			for(int kk=0; kk<pt->numpart[jj]; kk++) {

				int PIDX = pt->PIDX_array[L][jj][kk];
				int dims0 = pt->size_array[L][PIDX*2];
				int dims1 = pt->size_array[L][PIDX*2+1];
				if(start_kk <= kk && kk < end_kk){
					part_size += dims0 * dims1;
				}
				//if(pt->pid > 0 && part_start_kk <= kk && kk < part_end_kk){
				if(pt->pid > 0 && 0 <= kk && kk < part_end_kk){
					pointer_size += dims0 * dims1;
				}
				move_size += dims0 * dims1;
			}

			sum_part_size += part_size;
			sum_pointer_size += pointer_size;
			sum_move_size += move_size;

			// error pt->pid == 2 && L == 24 && jj == 1

			if(pt->pid*part_y < pt->numpart[jj]){

				if(pt->pid == 0){
					gettimeofday(&tv_memcpy_start, NULL);
				}


				res = cuMemcpyDtoH((void *)(pointer_dst_M+(unsigned long long int)(pointer_size*sizeof(FLOAT))), (CUdeviceptr)(pointer_M_dev+(unsigned long long int)(pointer_size*sizeof(FLOAT))), part_size*sizeof(FLOAT));
				if(res != CUDA_SUCCESS) {
					printf("error pid = %d\n",pt->pid);
					printf("cuMemcpyDtoH(dst_M) failed: res = %s\n", cuda_response_to_string(res));
					exit(1);
				}

				if(pt->pid == 0){
					gettimeofday(&tv_memcpy_end, NULL);
					tvsub(&tv_memcpy_end, &tv_memcpy_start, &tv);
					time_memcpy += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
				}

			}

			pointer_dst_M += (unsigned long long int)(move_size * sizeof(FLOAT));
			pointer_M_dev += (unsigned long long int)(move_size * sizeof(FLOAT));

			part_size = 0;
			pointer_size = 0;
			move_size = 0;

		}

	}


	/* downloads tmpIx from GPU */

	sum_part_size = 0;
	sum_pointer_size = 0;
	part_size = 0;
	pointer_size = 0;
	part_y = 0;
	move_size = 0;
	start_kk = 0;
	end_kk = 0;
	part_end_kk = 0;
	unsigned long long int pointer_dst_tmpIx = (unsigned long long int)pt->dst_tmpIx;
	unsigned long long int pointer_tmpIx_dev = (unsigned long long int)tmpIx_dev[pt->pid];


	for(int L=0; L<(pt->L_MAX-pt->interval); L++) {

		/**************************************************************************/
		/* loop condition */
		if( (pt->FSIZE[(L+pt->interval)*2]+2*pt->pady < pt->max_Y) || (pt->FSIZE[(L+pt->interval)*2+1]+2*pt->padx < pt->max_X) )
		{
			continue;
		}
		/* loop condition */
		/**************************************************************************/


		for(int jj=0; jj<pt->NoC; jj++) {

			part_y = pt->numpart[jj] / device_num;
			if(pt->numpart[jj]%device_num != 0){
				part_y++;
			}

			start_kk = part_y * pt->pid;
			end_kk = part_y * (pt->pid + 1);

			if(end_kk > pt->numpart[jj]){
				end_kk = pt->numpart[jj];
			}

			if(pt->pid > 0){
				part_end_kk = part_y * pt->pid;
			}

			for(int kk=0; kk<pt->numpart[jj]; kk++) {

				int PIDX = pt->PIDX_array[L][jj][kk];
				int dims0 = pt->size_array[L][PIDX*2];
				int dims1 = pt->size_array[L][PIDX*2+1];
				if(start_kk <= kk && kk < end_kk){
					part_size += dims0 * dims1;
				}
				if(pt->pid > 0){
					if(0 <= kk && kk < part_end_kk){
						pointer_size += dims0 * dims1;
					}
				}
				move_size += dims0 * dims1;
			}

			sum_part_size += part_size;
			sum_pointer_size += pointer_size;


			if(pt->pid*part_y < pt->numpart[jj]){

				if(pt->pid == 0){
					gettimeofday(&tv_memcpy_start, NULL);
				}


				res = cuMemcpyDtoH((void *)(pointer_dst_tmpIx+(unsigned long long int)(pointer_size*sizeof(int))), (CUdeviceptr)(pointer_tmpIx_dev+(unsigned long long int)(pointer_size*sizeof(int))), part_size*sizeof(int));
				if(res != CUDA_SUCCESS) {
					printf("error pid = %d\n",pt->pid);
					printf("cuMemcpyDtoH(tmpIx) failed: res = %s\n", cuda_response_to_string(res));
					exit(1);
				}

				if(pt->pid == 0){
					gettimeofday(&tv_memcpy_end, NULL);
					tvsub(&tv_memcpy_end, &tv_memcpy_start, &tv);
					time_memcpy += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
				}

			}

			pointer_dst_tmpIx += (unsigned long long int)(move_size * sizeof(int));
			pointer_tmpIx_dev += (unsigned long long int)(move_size * sizeof(int));

			part_size = 0;
			pointer_size = 0;
			move_size = 0;

		}

	}


	/* downloads tmpIy from GPU */

	sum_part_size = 0;
	sum_pointer_size = 0;
	part_size = 0;
	pointer_size = 0;
	part_y = 0;
	move_size = 0;
	start_kk = 0;
	end_kk = 0;
	part_end_kk = 0;
	unsigned long long int pointer_dst_tmpIy = (unsigned long long int)pt->dst_tmpIy;
	unsigned long long int pointer_tmpIy_dev = (unsigned long long int)tmpIy_dev[pt->pid];


	for(int L=0; L<(pt->L_MAX-pt->interval); L++) {

		/**************************************************************************/
		/* loop condition */
		if( (pt->FSIZE[(L+pt->interval)*2]+2*pt->pady < pt->max_Y) || (pt->FSIZE[(L+pt->interval)*2+1]+2*pt->padx < pt->max_X) )
		{
			continue;
		}
		/* loop condition */
		/**************************************************************************/


		for(int jj=0; jj<pt->NoC; jj++) {

			part_y = pt->numpart[jj] / device_num;
			if(pt->numpart[jj]%device_num != 0){
				part_y++;
			}

			start_kk = part_y * pt->pid;
			end_kk = part_y * (pt->pid + 1);

			if(end_kk > pt->numpart[jj]){
				end_kk = pt->numpart[jj];
			}

			if(pt->pid > 0){
				part_end_kk = part_y * pt->pid;
			}

			for(int kk=0; kk<pt->numpart[jj]; kk++) {

				int PIDX = pt->PIDX_array[L][jj][kk];
				int dims0 = pt->size_array[L][PIDX*2];
				int dims1 = pt->size_array[L][PIDX*2+1];
				if(start_kk <= kk && kk < end_kk){
					part_size += dims0 * dims1;
				}
				if(pt->pid > 0){
					if(0 <= kk && kk < part_end_kk){
						pointer_size += dims0 * dims1;
					}
				}
				move_size += dims0 * dims1;
			}

			sum_part_size += part_size;
			sum_pointer_size += pointer_size;

			if(pt->pid*part_y < pt->numpart[jj]){

				if(pt->pid == 0){
					gettimeofday(&tv_memcpy_start, NULL);
				}

				res = cuMemcpyDtoH((void *)(pointer_dst_tmpIy+(unsigned long long int)(pointer_size*sizeof(int))), (CUdeviceptr)(pointer_tmpIy_dev+(unsigned long long int)(pointer_size*sizeof(int))), part_size*sizeof(int));
				if(res != CUDA_SUCCESS) {
					printf("error pid = %d\n",pt->pid);
					printf("cuMemcpyDtoH(tmpIy) failed: res = %s\n", cuda_response_to_string(res));
					exit(1);
				}

				if(pt->pid == 0){
					gettimeofday(&tv_memcpy_end, NULL);
					tvsub(&tv_memcpy_end, &tv_memcpy_start, &tv);
					time_memcpy += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
				}

			}

			pointer_dst_tmpIy += (unsigned long long int)(move_size * sizeof(int));
			pointer_tmpIy_dev += (unsigned long long int)(move_size * sizeof(int));

			part_size = 0;
			pointer_size = 0;
			move_size = 0;

		}

	}


	/* end of thread */
	CUT_THREADEND;
}
Example #27
static CUresult create_context(void *user_context, CUcontext *ctx) {
    // Initialize CUDA
    CUresult err = cuInit(0);
    if (err != CUDA_SUCCESS) {
        halide_error_varargs(user_context, "CUDA: cuInit failed (%s)",
                             _get_error_name(err));
        return err;
    }

    // Make sure we have a device
    int deviceCount = 0;
    err = cuDeviceGetCount(&deviceCount);
    if (err != CUDA_SUCCESS) {
        halide_error_varargs(user_context, "CUDA: cuGetDeviceCount failed (%s)",
                             _get_error_name(err));
        return err;
    }
    if (deviceCount <= 0) {
        halide_error(user_context, "CUDA: No devices available");
        return CUDA_ERROR_NO_DEVICE;
    }

    int device = halide_get_gpu_device(user_context);
    if (device == -1) {
        device = deviceCount - 1;
    }

    // Get device
    CUdevice dev;
    CUresult status = cuDeviceGet(&dev, device);
    if (status != CUDA_SUCCESS) {
        halide_error(user_context, "CUDA: Failed to get device\n");
        return status;
    }

    DEBUG_PRINTF( user_context, "    Got device %d\n", dev );

    // Dump device attributes
    #ifdef DEBUG
    {
        char name[256];
        name[0] = 0;
        err = cuDeviceGetName(name, 256, dev);
        DEBUG_PRINTF(user_context, "      %s\n", name);

        if (err != CUDA_SUCCESS) {
            halide_error_varargs(user_context, "CUDA: cuDeviceGetName failed (%s)",
                                 _get_error_name(err));
            return err;
        }

        size_t memory = 0;
        err = cuDeviceTotalMem(&memory, dev);
        DEBUG_PRINTF(user_context, "      total memory: %d MB\n", (int)(memory >> 20));

        if (err != CUDA_SUCCESS) {
            halide_error_varargs(user_context, "CUDA: cuDeviceTotalMem failed (%s)",
                                 _get_error_name(err));
            return err;
        }

        // Declare variables for other state we want to query.
        int max_threads_per_block = 0, warp_size = 0, num_cores = 0;
        int max_block_size[] = {0, 0, 0};
        int max_grid_size[] = {0, 0, 0};
        int max_shared_mem = 0, max_constant_mem = 0;
        int cc_major = 0, cc_minor = 0;

        struct {int *dst; CUdevice_attribute attr;} attrs[] = {
            {&max_threads_per_block, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK},
            {&warp_size,             CU_DEVICE_ATTRIBUTE_WARP_SIZE},
            {&num_cores,             CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT},
            {&max_block_size[0],     CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X},
            {&max_block_size[1],     CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y},
            {&max_block_size[2],     CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z},
            {&max_grid_size[0],      CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X},
            {&max_grid_size[1],      CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y},
            {&max_grid_size[2],      CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z},
            {&max_shared_mem,        CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK},
            {&max_constant_mem,      CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY},
            {&cc_major,              CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR},
            {&cc_minor,              CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR},
            {NULL,                   CU_DEVICE_ATTRIBUTE_MAX}};

        // Do all the queries.
        for (int i = 0; attrs[i].dst; i++) {
            err = cuDeviceGetAttribute(attrs[i].dst, attrs[i].attr, dev);
            if (err != CUDA_SUCCESS) {
                halide_error_varargs(user_context,
                                     "CUDA: cuDeviceGetAttribute failed (%s) for attribute %d",
                                     _get_error_name(err), (int)attrs[i].attr);
                return err;
            }
        }

        // threads per core is a function of the compute capability
        int threads_per_core = (cc_major == 1 ? 8 :
                                cc_major == 2 ? (cc_minor == 0 ? 32 : 48) :
                                cc_major == 3 ? 192 :
                                cc_major == 5 ? 128 : 0);

        DEBUG_PRINTF(user_context,
                     "      max threads per block: %d\n"
                     "      warp size: %d\n"
                     "      max block size: %d %d %d\n"
                     "      max grid size: %d %d %d\n"
                     "      max shared memory per block: %d\n"
                     "      max constant memory per block: %d\n"
                     "      compute capability %d.%d\n"
                     "      cuda cores: %d x %d = %d\n",
                     max_threads_per_block, warp_size,
                     max_block_size[0], max_block_size[1], max_block_size[2],
                     max_grid_size[0], max_grid_size[1], max_grid_size[2],
                     max_shared_mem, max_constant_mem,
                     cc_major, cc_minor,
                     num_cores, threads_per_core, num_cores * threads_per_core);
    }
    #endif

    // Create context
    DEBUG_PRINTF( user_context, "    cuCtxCreate %d -> ", dev );
    err = cuCtxCreate(ctx, 0, dev);
    if (err != CUDA_SUCCESS) {
        DEBUG_PRINTF( user_context, "%s\n", _get_error_name(err) );
        halide_error_varargs(user_context, "CUDA: cuCtxCreate failed (%s)",
                             _get_error_name(err));
        return err;
    } else {
        unsigned int version = 0;
        cuCtxGetApiVersion(*ctx, &version);
        DEBUG_PRINTF( user_context, "%p (%d)\n", *ctx, version);
    }

    return CUDA_SUCCESS;
}
Example #28
static void calc_a_score_GPU(FLOAT *ac_score,  FLOAT **score,
			     int *ssize_start,  Model_info *MI,
			     FLOAT scale, int *size_score_array,
			     int NoC)
{
	CUresult res;

	const int IHEI = MI->IM_HEIGHT;
	const int IWID = MI->IM_WIDTH;
	int pady_n = MI->pady;
	int padx_n = MI->padx;
	int block_pad = (int)(scale/2.0);

	struct timeval tv;

	int *RY_array, *RX_array;
	res = cuMemHostAlloc((void**)&RY_array, NoC*sizeof(int), CU_MEMHOSTALLOC_DEVICEMAP);
	if(res != CUDA_SUCCESS) {
		printf("cuMemHostAlloc(RY_array) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemHostAlloc((void**)&RX_array, NoC*sizeof(int), CU_MEMHOSTALLOC_DEVICEMAP);
	if(res != CUDA_SUCCESS) {
		printf("cuMemHostAlloc(RX_array) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	for(int i = 0; i < NoC; i++) {
		int rsize[2] = {MI->rsize[i*2], MI->rsize[i*2+1]};

		RY_array[i] = (int)((FLOAT)rsize[0]*scale/2.0-1.0+block_pad);
		RX_array[i] = (int)((FLOAT)rsize[1]*scale/2.0-1.0+block_pad);
	}

	CUdeviceptr ac_score_dev, score_dev;
	CUdeviceptr ssize_dev, size_score_dev;
	CUdeviceptr RY_dev, RX_dev;

	int size_score=0;
	for(int i = 0; i < NoC; i++) {
		size_score += size_score_array[i];
	}

	/* allocate GPU memory */
	res = cuMemAlloc(&ac_score_dev, gpu_size_A_SCORE);
	if(res != CUDA_SUCCESS) {
		printf("cuMemAlloc(ac_score) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemAlloc(&score_dev, size_score);
	if(res != CUDA_SUCCESS) {
		printf("cuMemAlloc(score) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemAlloc(&ssize_dev, NoC*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemAlloc(ssize) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemAlloc(&size_score_dev, NoC*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemAlloc(size_score) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemAlloc(&RY_dev, NoC*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemAlloc(RY) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemAlloc(&RX_dev, NoC*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemAlloc(RX) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	gettimeofday(&tv_memcpy_start, nullptr);
	/* upload data to GPU */
	res = cuMemcpyHtoD(ac_score_dev, &ac_score[0], gpu_size_A_SCORE);
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(ac_score) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(score_dev, &score[0][0], size_score);
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(score) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(ssize_dev, &ssize_start[0], NoC*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(ssize) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(size_score_dev, &size_score_array[0], NoC*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(size_score) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(RY_dev, &RY_array[0], NoC*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(RY) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemcpyHtoD(RX_dev, &RX_array[0], NoC*sizeof(int));
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoD(RX) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	gettimeofday(&tv_memcpy_end, nullptr);
	tvsub(&tv_memcpy_end, &tv_memcpy_start, &tv);
	time_memcpy += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;

	void* kernel_args[] = {
		(void*)&IWID,
		(void*)&IHEI,
		(void*)&scale,
		(void*)&padx_n,
		(void*)&pady_n,
		&RX_dev,
		&RY_dev,
		&ac_score_dev,
		&score_dev,
		&ssize_dev,
		(void*)&NoC,
		&size_score_dev
	};

	int sharedMemBytes = 0;

	/* define CUDA block shape */
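	/* the kernel is launched with blockDim.z == NoC (see cuLaunchKernel
	   below), so blockDim.x and blockDim.y are each capped at
	   sqrt(max_threads_per_block / NoC) to stay within the per-block limit */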
	int max_threads_num = 0;
	int thread_num_x, thread_num_y;
	int block_num_x, block_num_y;

	res = cuDeviceGetAttribute(&max_threads_num, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev[0]);
	if(res != CUDA_SUCCESS){
		printf("\ncuDeviceGetAttribute() failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	NR_MAXTHREADS_X[0] = (int)sqrt((double)max_threads_num/NoC);
	NR_MAXTHREADS_Y[0] = (int)sqrt((double)max_threads_num/NoC);

	thread_num_x = (IWID < NR_MAXTHREADS_X[0]) ? IWID : NR_MAXTHREADS_X[0];
	thread_num_y = (IHEI < NR_MAXTHREADS_Y[0]) ? IHEI : NR_MAXTHREADS_Y[0];

	block_num_x = IWID / thread_num_x;
	block_num_y = IHEI / thread_num_y;
	if(IWID % thread_num_x != 0) block_num_x++;
	if(IHEI % thread_num_y != 0) block_num_y++;

	gettimeofday(&tv_kernel_start, nullptr);
	/* launch GPU kernel */
	res = cuLaunchKernel(
		func_calc_a_score[0], // kernel function to launch
		block_num_x,       // gridDimX
		block_num_y,       // gridDimY
		1,                 // gridDimZ
		thread_num_x,      // blockDimX
		thread_num_y,      // blockDimY
		NoC,               // blockDimZ
		sharedMemBytes,    // sharedMemBytes
		nullptr,              // hStream
		kernel_args,       // kernelParams
		nullptr               // extra
		);
	if(res != CUDA_SUCCESS) {
		printf("cuLaunchKernel(calc_a_score) failed : res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuCtxSynchronize();
	if(res != CUDA_SUCCESS) {
		printf("cuCtxSynchronize(calc_a_score) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}
	gettimeofday(&tv_kernel_end, nullptr);
	tvsub(&tv_kernel_end, &tv_kernel_start, &tv);
	time_kernel += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;

	gettimeofday(&tv_memcpy_start, nullptr);
	/* download data from GPU */
	res = cuMemcpyDtoH(ac_score, ac_score_dev, gpu_size_A_SCORE);
	if(res != CUDA_SUCCESS) {
		printf("cuMemcpyDtoH(ac_score) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	gettimeofday(&tv_memcpy_end, nullptr);
	tvsub(&tv_memcpy_end, &tv_memcpy_start, &tv);
	time_memcpy += tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;

	/* free GPU memory */
	res = cuMemFree(ac_score_dev);
	if(res != CUDA_SUCCESS) {
		printf("cuMemFree(ac_score_dev) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemFree(score_dev);
	if(res != CUDA_SUCCESS) {
		printf("cuMemFree(score_dev) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemFree(ssize_dev);
	if(res != CUDA_SUCCESS) {
		printf("cuMemFree(ssize_dev) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemFree(size_score_dev);
	if(res != CUDA_SUCCESS) {
		printf("cuMemFree(size_score_dev) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemFree(RY_dev);
	if(res != CUDA_SUCCESS) {
		printf("cuMemFree(RY_dev) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemFree(RX_dev);
	if(res != CUDA_SUCCESS) {
		printf("cuMemFree(RX_dev) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	/* free CPU memory */
	res = cuMemFreeHost(RY_array);
	if(res != CUDA_SUCCESS) {
		printf("cuMemFreeHost(RY_array) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}

	res = cuMemFreeHost(RX_array);
	if(res != CUDA_SUCCESS) {
		printf("cuMemFreeHost(RX_array) failed: res = %s\n", cuda_response_to_string(res));
		exit(1);
	}
}
Example #29
void
pocl_cuda_init (cl_device_id device, const char *parameters)
{
  CUresult result;

  result = cuInit (0);
  CUDA_CHECK (result, "cuInit");

  if (device->data)
    return;

  pocl_cuda_device_data_t *data = malloc (sizeof (pocl_cuda_device_data_t));
  result = cuDeviceGet (&data->device, 0);
  CUDA_CHECK (result, "cuDeviceGet");

  // Get specific device name
  device->long_name = device->short_name = malloc (256 * sizeof (char));
  cuDeviceGetName (device->long_name, 256, data->device);

  // Get other device properties
  cuDeviceGetAttribute ((int *)&device->max_work_group_size,
                        CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                        data->device);
  cuDeviceGetAttribute ((int *)(device->max_work_item_sizes + 0),
                        CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, data->device);
  cuDeviceGetAttribute ((int *)(device->max_work_item_sizes + 1),
                        CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, data->device);
  cuDeviceGetAttribute ((int *)(device->max_work_item_sizes + 2),
                        CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, data->device);
  cuDeviceGetAttribute (
      (int *)&device->local_mem_size,
      CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR, data->device);
  cuDeviceGetAttribute ((int *)&device->max_compute_units,
                        CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
                        data->device);
  cuDeviceGetAttribute ((int *)&device->max_clock_frequency,
                        CU_DEVICE_ATTRIBUTE_CLOCK_RATE, data->device);
  cuDeviceGetAttribute ((int *)&device->error_correction_support,
                        CU_DEVICE_ATTRIBUTE_ECC_ENABLED, data->device);
  cuDeviceGetAttribute ((int *)&device->host_unified_memory,
                        CU_DEVICE_ATTRIBUTE_INTEGRATED, data->device);
  cuDeviceGetAttribute ((int *)&device->max_constant_buffer_size,
                        CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY,
                        data->device);

  device->preferred_vector_width_char = 1;
  device->preferred_vector_width_short = 1;
  device->preferred_vector_width_int = 1;
  device->preferred_vector_width_long = 1;
  device->preferred_vector_width_float = 1;
  device->preferred_vector_width_double = 1;
  device->preferred_vector_width_half = 0;
  device->native_vector_width_char = 1;
  device->native_vector_width_short = 1;
  device->native_vector_width_int = 1;
  device->native_vector_width_long = 1;
  device->native_vector_width_float = 1;
  device->native_vector_width_double = 1;
  device->native_vector_width_half = 0;

  device->single_fp_config = CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO
                             | CL_FP_ROUND_TO_INF | CL_FP_FMA | CL_FP_INF_NAN
                             | CL_FP_DENORM;
  device->double_fp_config = CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO
                             | CL_FP_ROUND_TO_INF | CL_FP_FMA | CL_FP_INF_NAN
                             | CL_FP_DENORM;

  device->local_mem_type = CL_LOCAL;
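  // report no host-unified memory, overriding the INTEGRATED query above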
  device->host_unified_memory = 0;

  // Get GPU architecture name
  int sm_maj, sm_min;
  cuDeviceGetAttribute (&sm_maj, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                        data->device);
  cuDeviceGetAttribute (&sm_min, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
                        data->device);
  char *gpu_arch = malloc (16 * sizeof (char));
  snprintf (gpu_arch, 16, "sm_%d%d", sm_maj, sm_min);
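  // the POCL_CUDA_GPU_ARCH option can override the detected architecture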
  device->llvm_cpu = pocl_get_string_option ("POCL_CUDA_GPU_ARCH", gpu_arch);
  POCL_MSG_PRINT_INFO ("[CUDA] GPU architecture = %s\n", device->llvm_cpu);

  // Create context
  result = cuCtxCreate (&data->context, CU_CTX_MAP_HOST, data->device);
  CUDA_CHECK (result, "cuCtxCreate");

  // Get global memory size
  size_t memfree, memtotal;
  result = cuMemGetInfo (&memfree, &memtotal);
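  // OpenCL requires CL_DEVICE_MAX_MEM_ALLOC_SIZE to be at least
  // max(global_mem_size / 4, 128 MiB), hence the lower bound here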
  device->max_mem_alloc_size = max (memtotal / 4, 128 * 1024 * 1024);
  device->global_mem_size = memtotal;

  device->data = data;
}
Example #30
static struct ptx_device *
nvptx_open_device (int n)
{
  struct ptx_device *ptx_dev;
  CUdevice dev, ctx_dev;
  CUresult r;
  int async_engines, pi;

  r = cuDeviceGet (&dev, n);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGet error: %s", cuda_error (r));

  ptx_dev = GOMP_PLUGIN_malloc (sizeof (struct ptx_device));

  ptx_dev->ord = n;
  ptx_dev->dev = dev;
  ptx_dev->ctx_shared = false;

  r = cuCtxGetDevice (&ctx_dev);
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));
  
  if (r != CUDA_ERROR_INVALID_CONTEXT && ctx_dev != dev)
    {
      /* The current host thread has an active context for a different device.
         Detach it.  */
      CUcontext old_ctx;
      
      r = cuCtxPopCurrent (&old_ctx);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
    }

  r = cuCtxGetCurrent (&ptx_dev->ctx);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

  if (!ptx_dev->ctx)
    {
      r = cuCtxCreate (&ptx_dev->ctx, CU_CTX_SCHED_AUTO, dev);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuCtxCreate error: %s", cuda_error (r));
    }
  else
    ptx_dev->ctx_shared = true;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->overlap = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->map = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->concur = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->mode = pi;

  r = cuDeviceGetAttribute (&pi, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuDeviceGetAttribute error: %s", cuda_error (r));

  ptx_dev->mkern = pi;

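  /* Number of asynchronous copy engines; if the query fails, fall back to a
     single engine.  The count is passed to init_streams_for_device below.  */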
  r = cuDeviceGetAttribute (&async_engines,
			    CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
  if (r != CUDA_SUCCESS)
    async_engines = 1;

  ptx_dev->images = NULL;
  pthread_mutex_init (&ptx_dev->image_lock, NULL);

  init_streams_for_device (ptx_dev, async_engines);

  return ptx_dev;
}