Example No. 1
/**
 * Throws a CudaErrorException (a RuntimeException) back to the JVM.
 * message - human-readable description of the failure
 * error   - CUDA driver error code; CUDA_SUCCESS means no exception is thrown
 */
void throw_cuda_errror_exception(JNIEnv *env, const char *message, int error) {
	char msg[1024];
	jclass exp;
	jfieldID fid;
  int a = 0;
  int b = 0;
  char name[1024];

	if(error == CUDA_SUCCESS){
		return;
	}

	exp = (*env)->FindClass(env,"edu/syr/pcpratts/rootbeer/runtime2/cuda/CudaErrorException");

    // we truncate the message to 900 characters to stop any buffer overflow
	switch(error){
		case CUDA_ERROR_OUT_OF_MEMORY:
			sprintf(msg, "CUDA_ERROR_OUT_OF_MEMORY: %.900s",message);
			break;
    case CUDA_ERROR_NO_BINARY_FOR_GPU:
      cuDeviceGetName(name,1024,cuDevice);
      cuDeviceComputeCapability(&a, &b, cuDevice);
      sprintf(msg, "No binary for gpu. Selected %s (%d.%d). 2.0 compatibility required.", name, a, b);
      break;
		default:
			sprintf(msg, "ERROR STATUS:%i : %.900s", error, message);
	}

	fid = (*env)->GetFieldID(env,exp, "cudaError_enum", "I");
	(*env)->SetLongField(env,exp,fid, (jint)error);

    (*env)->ThrowNew(env,exp,msg);
}
Example No. 2
void GPUInterface::GetDeviceDescription(int deviceNumber,
                                        char* deviceDescription) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\t\t\tEntering GPUInterface::GetDeviceDescription\n");
#endif

    CUdevice tmpCudaDevice;

    SAFE_CUDA(cuDeviceGet(&tmpCudaDevice, (*resourceMap)[deviceNumber]));

#if CUDA_VERSION >= 3020
    size_t totalGlobalMemory = 0;
#else
    unsigned int totalGlobalMemory = 0;
#endif
    int clockSpeed = 0;
    int mpCount = 0;
    int major = 0;
    int minor = 0;

    SAFE_CUDA(cuDeviceComputeCapability(&major, &minor, tmpCudaDevice));
    SAFE_CUDA(cuDeviceTotalMem(&totalGlobalMemory, tmpCudaDevice));
    SAFE_CUDA(cuDeviceGetAttribute(&clockSpeed, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, tmpCudaDevice));
    SAFE_CUDA(cuDeviceGetAttribute(&mpCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, tmpCudaDevice));

    sprintf(deviceDescription,
            "Global memory (MB): %d | Clock speed (Ghz): %1.2f | Number of cores: %d",
            int(totalGlobalMemory / 1024.0 / 1024.0 + 0.5),
            clockSpeed / 1000000.0,
            nGpuArchCoresPerSM[major] * mpCount);

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\t\t\tLeaving  GPUInterface::GetDeviceDescription\n");
#endif
}
Example No. 3
  deviceIdentifier device_t<CUDA>::getIdentifier() const {
    deviceIdentifier dID;

    dID.mode_ = CUDA;

    const size_t archPos = compilerFlags.find("-arch=sm_");

    if(archPos == std::string::npos){
      OCCA_EXTRACT_DATA(CUDA, Device);

      std::stringstream archSM_;

      int major, minor;
      OCCA_CUDA_CHECK("Getting CUDA Device Arch",
                      cuDeviceComputeCapability(&major, &minor, data_.device) );

      archSM_ << major << minor;

      dID.flagMap["sm_arch"] = archSM_.str();
    }
    else{
      const char *c0 = (compilerFlags.c_str() + archPos);
      const char *c1 = c0;

      while((*c1 != '\0') && (*c1 != ' '))
        ++c1;

      dID.flagMap["sm_arch"] = std::string(c0, c1 - c0);
    }

    return dID;
  }
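
The OCCA example above derives an "sm_arch" string either from an explicit "-arch=sm_" compiler flag or from the device's compute capability. A minimal sketch of the same idea, assuming cuInit() has already been called; archFlagForDevice is a hypothetical helper, not part of OCCA:

#include <cuda.h>
#include <sstream>
#include <string>

// Hedged sketch: builds an nvcc-style "-arch=sm_XY" flag for a given device.
static std::string archFlagForDevice(CUdevice dev) {
  int major = 0, minor = 0;
  cuDeviceComputeCapability(&major, &minor, dev);  // e.g. major=3, minor=5

  std::stringstream ss;
  ss << "-arch=sm_" << major << minor;             // "-arch=sm_35"
  return ss.str();
}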
Example No. 4
int main(void)
{
  /* initialize CUDA */
  CUresult res;
  res = cuInit(0);
  MY_CUDA_CHECK(res, "cuInit()");

  /* check whether any GPU is present */
  int device_num;
  res = cuDeviceGetCount(&device_num);
  MY_CUDA_CHECK(res, "cuDeviceGetCount()");

  if (device_num == 0) {        // no GPU is detected
    fprintf(stderr, "no CUDA capable GPU is detected...\n");
    exit(1);
  }

  printf("%d GPUs are detected\n", device_num);

  for (int i=0; i<device_num; i++)
    {
      /* get device handle of GPU No.i */
      CUdevice dev;
      res = cuDeviceGet(&dev, i);
      MY_CUDA_CHECK(res, "cuDeviceGet()");
      
      /* search compute capability of GPU No.i */
      int major=0, minor=0;
      res = cuDeviceComputeCapability(&major, &minor, dev);
      MY_CUDA_CHECK(res, "cuDeviceComputeCapability()");
     
      printf("GPU[%d] : actual compute capability is : %d%d\n", i, major, minor);
    }
}
Example No. 5
CUDADevice::CUDADevice(const CUdevice device_number)
  : m_cuda_device_number(device_number)
{
    char device_name[256];
    cuDeviceGetName(device_name, 256, m_cuda_device_number);
    m_name = device_name;

    cuDeviceComputeCapability(
        &m_compute_capability.first,
        &m_compute_capability.second,
        m_cuda_device_number);

    cuDeviceGetAttribute(&m_compute_mode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, m_cuda_device_number);

    cuDeviceTotalMem(&m_total_mem, m_cuda_device_number);

    cuDeviceGetAttribute(&m_max_threads_per_block, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, m_cuda_device_number);

    cuDeviceGetAttribute(&m_max_block_dim_x, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, m_cuda_device_number);
    cuDeviceGetAttribute(&m_max_block_dim_y, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, m_cuda_device_number);
    cuDeviceGetAttribute(&m_max_block_dim_z, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, m_cuda_device_number);

    cuDeviceGetAttribute(&m_max_grid_dim_x, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, m_cuda_device_number);
    cuDeviceGetAttribute(&m_max_grid_dim_y, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, m_cuda_device_number);
    cuDeviceGetAttribute(&m_max_grid_dim_z, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, m_cuda_device_number);

    cuDeviceGetAttribute(&m_max_registers, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, m_cuda_device_number);
}
Example No. 6
/*===========================================================================*/
int Device::minorRevision() const
{
    int major_revision = 0;
    int minor_revision = 0;
    KVS_CU_CALL( cuDeviceComputeCapability( &major_revision, &minor_revision, m_handler ) );

    return minor_revision;
}
Example No. 7
int get_suitable_block_num(int device,
			   int *max_block_num,
			   int *mp_num,
			   int word_size,
			   int thread_num,
			   int large_size)
{
#ifdef TODO
    cudaDeviceProp dev;
    CUdevice cuDevice;
    int max_thread_dev;
    int max_block, max_block_mem, max_block_dev;
    int major, minor, ver;
    //int regs, max_block_regs;

    ccudaGetDeviceProperties(&dev, device);
    cuDeviceGet(&cuDevice, device);
    cuDeviceComputeCapability(&major, &minor, cuDevice);
    //cudaFuncGetAttributes()
#if 0
    if (word_size == 4) {
	regs = 14;
    } else {
	regs = 16;
    }
    max_block_regs = dev.regsPerBlock / (regs * thread_num);
#endif
    max_block_mem = dev.sharedMemPerBlock / (large_size * word_size + 16);
    if (major == 9999 && minor == 9999) {
	return -1;
    }
    ver = major * 100 + minor;
    if (ver <= 101) {
	max_thread_dev = 768;
    } else if (ver <= 103) {
	max_thread_dev = 1024;
    } else if (ver <= 200) {
	max_thread_dev = 1536;
    } else {
	max_thread_dev = 1536;
    }
    max_block_dev = max_thread_dev / thread_num;
    if (max_block_mem < max_block_dev) {
	max_block = max_block_mem;
    } else {
	max_block = max_block_dev;
    }
#if 0
    if (max_block_regs < max_block) {
	max_block = max_block_regs;
    }
#endif
    *max_block_num = max_block;
    *mp_num = dev.multiProcessorCount;
    return max_block * dev.multiProcessorCount;
#endif
    return 0;
}
Example No. 8
bool GPUInterface::GetSupportsDoublePrecision(int deviceNumber) {
    CUdevice tmpCudaDevice;
    SAFE_CUDA(cuDeviceGet(&tmpCudaDevice, (*resourceMap)[deviceNumber]));

    int major = 0;
    int minor = 0;
    SAFE_CUDA(cuDeviceComputeCapability(&major, &minor, tmpCudaDevice));
    return (major >= 2 || (major >= 1 && minor >= 3));
}
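
GetSupportsDoublePrecision encodes the rule that double-precision support requires compute capability 1.3 or higher. A standalone sketch of the same check against the driver API; device_supports_double is an illustrative name, not part of BEAGLE:

/* Hedged sketch: non-zero if the device supports double precision (CC >= 1.3). */
static int device_supports_double(CUdevice dev) {
    int major = 0, minor = 0;
    if (cuDeviceComputeCapability(&major, &minor, dev) != CUDA_SUCCESS)
        return 0;
    return (major > 1) || (major == 1 && minor >= 3);
}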
Example No. 9
void print_GetProperties(CUdevice cuDevice)
{
    int count = 0;
    cuDeviceGetCount(&count);
    printf ("cuDevice(%d)GetCount = %d\n", cuDevice, count);

    int len = 1024;
    char* dev_name = (char*)malloc(sizeof(char) * len);
    cuDeviceGetName(dev_name, len, cuDevice);
    printf("cuda-devicename = %s\n", dev_name);
    free(dev_name);

    int mj_v = 0, mn_v = 0;
    cuDeviceComputeCapability(&mj_v, &mn_v, cuDevice);
    printf("compute capability = mj:%d, mn:%d\n", mj_v, mn_v);

    size_t byt_mem = 0;
    cuDeviceTotalMem(&byt_mem, cuDevice);
    printf("total mem = %d\n", byt_mem);
    CUdevprop cp;
    cuDeviceGetProperties(&cp, cuDevice);
    printf("Thd/blk = %d, thrdDim xyz = (%d, %d, %d:threads), GridSz xyz = (%d, %d, %d:blocks), shrdmem/blk = %d, constmem = %d bytes, simdwidth = %d, mempitch = %d, regsPerBlock = %d, clockRate = %d, textureAlign = %d \n",
        cp.maxThreadsPerBlock, cp.maxThreadsDim[0], cp.maxThreadsDim[1], cp.maxThreadsDim[2], cp.maxGridSize[0], cp.maxGridSize[1], cp.maxGridSize[2], cp.sharedMemPerBlock, cp.totalConstantMemory, cp.SIMDWidth, cp.memPitch, cp.regsPerBlock, cp.clockRate, cp.textureAlign);

    int ip;
    cuDeviceGetAttribute(&ip, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuDevice);
    printf ("Attrib - CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = %d\n", ip);
    cuDeviceGetAttribute(&ip, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, cuDevice);
    printf ("Attrib - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = %d\n", ip);
    cuDeviceGetAttribute(&ip, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, cuDevice);
    printf ("Attrib - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = %d\n", ip);
    cuDeviceGetAttribute(&ip, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, cuDevice);
    printf ("Attrib - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = %d\n", ip);
    cuDeviceGetAttribute(&ip, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, cuDevice);
    printf ("Attrib - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = %d\n", ip);
    cuDeviceGetAttribute(&ip, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, cuDevice);
    printf ("Attrib - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = %d\n", ip);
    cuDeviceGetAttribute(&ip, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, cuDevice);
    printf ("Attrib - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = %d\n", ip);
    cuDeviceGetAttribute(&ip, CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK, cuDevice);
    printf ("Attrib - CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = %d\n", ip);
    cuDeviceGetAttribute(&ip, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, cuDevice);
    printf ("Attrib - CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = %d\n", ip);
    cuDeviceGetAttribute(&ip, CU_DEVICE_ATTRIBUTE_WARP_SIZE, cuDevice);
    printf ("Attrib - CU_DEVICE_ATTRIBUTE_WARP_SIZE = %d\n", ip);
    cuDeviceGetAttribute(&ip, CU_DEVICE_ATTRIBUTE_MAX_PITCH, cuDevice);
    printf ("Attrib - CU_DEVICE_ATTRIBUTE_MAX_PITCH = %d\n", ip);
    cuDeviceGetAttribute(&ip, CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK, cuDevice);
    printf ("Attrib - CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = %d\n", ip);
    cuDeviceGetAttribute(&ip, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, cuDevice);
    printf ("Attrib - CU_DEVICE_ATTRIBUTE_CLOCK_RATE = %d\n", ip);
    cuDeviceGetAttribute(&ip, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, cuDevice);
    printf ("Attrib - CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = %d\n", ip);
}
Example No. 10
/*
 * Initialize the CUPTI events data of the given VampirTrace CUPTI context.
 * 
 * @param vtcuptiCtx pointer to the VampirTrace CUPTI context
 */
void vt_cupti_events_initContext(vt_cupti_ctx_t *vtcuptiCtx)
{
  vt_cupti_events_t *vtcuptiEvtCtx = NULL;
  
  vt_cntl_msg(2, "[CUPTI Events] Initializing VampirTrace CUPTI events context");

  /* get a pointer to eventIDArray */
  {
    CUresult cuErr = CUDA_SUCCESS;
    int dev_major, dev_minor;
    vt_cupti_device_t *cuptiDev;

    /* TODO: do not trace this driver API function call */
    cuErr = cuDeviceComputeCapability(&dev_major, &dev_minor, vtcuptiCtx->cuDev);
    VT_CUDRV_CALL(cuErr, "cuDeviceComputeCapability");

    /* check if device capability already listed */
    VT_CUPTI_LOCK();
      cuptiDev = vtcuptievtCapList;
    VT_CUPTI_UNLOCK();
    
    cuptiDev = vt_cupti_checkMetricList(cuptiDev, dev_major, dev_minor);
    if(cuptiDev){
      /* allocate the VampirTrace CUPTI events context */
      vtcuptiEvtCtx = (vt_cupti_events_t *)malloc(sizeof(vt_cupti_events_t));
      if(vtcuptiEvtCtx == NULL)
        vt_error_msg("[CUPTI Events] malloc(sizeof(vt_cupti_events_t)) failed!");
      
      vtcuptiEvtCtx->vtDevCap = cuptiDev;
      vtcuptiEvtCtx->vtGrpList = NULL;
      vtcuptiEvtCtx->counterData = NULL;
      vtcuptiEvtCtx->cuptiEvtIDs = NULL;
      
      vtcuptiCtx->events = vtcuptiEvtCtx;
    }else{
      return;
    }
  }

  /* create and add the VampirTrace CUPTI groups to the context */
  vt_cupti_addEvtGrpsToCtx(vtcuptiCtx);

  /* allocate memory for CUPTI counter reads */
  {
    size_t allocSize = vtcuptiEvtCtx->vtGrpList->evtNum;
    
    vtcuptiEvtCtx->counterData = 
            (uint64_t *)malloc(allocSize*sizeof(uint64_t));
    vtcuptiEvtCtx->cuptiEvtIDs = 
            (CUpti_EventID *)malloc(allocSize*sizeof(CUpti_EventID));
  }
  
  vt_cuptievt_start(vtcuptiEvtCtx);
}
Example No. 11
Handle<Value> CudaDevice::computeCapability(const Arguments& args) {
  HandleScope scope;

  CudaDevice *cu = ObjectWrap::Unwrap<CudaDevice>(args.Holder());
  int major = 0, minor = 0;
  cuDeviceComputeCapability(&major, &minor, cu->m_device);
  
  Local<Array> result = Array::New(2);
  result->Set(0, Integer::New(major));
  result->Set(1, Integer::New(minor));
  return scope.Close(result);
}
Example No. 12
Object cuda_computeCapability(Object self, int nparts, int *argcv,
        Object *argv, int flags) {
    cuInit(0);
    int deviceCount = 0;
    cuDeviceGetCount(&deviceCount);
    if (deviceCount == 0) {
        raiseError("No CUDA devices found");
    }
    CUdevice cuDevice;
    cuDeviceGet(&cuDevice, 0);   // use the first device; cuDevice was previously left uninitialized
    int major, minor;
    cuDeviceComputeCapability(&major, &minor, cuDevice);
    return alloc_Float64(major + minor / 10.0);
}
Example No. 13
int CudaModule::getComputeCapability(void)
{
  staticInit();
  
  if (!s_available) {
    return 0;
  }

  int major, minor;
  checkError( "cuDeviceComputeCapability", 
              cuDeviceComputeCapability(&major, &minor, s_device));
              
  return major * 10 + minor;
}
Example No. 14
	bool support_device(bool experimental)
	{
		if(!experimental) {
			int major, minor;
			cuDeviceComputeCapability(&major, &minor, cuDevId);

			if(major < 2) {
				cuda_error_message(string_printf("CUDA device supported only with compute capability 2.0 or up, found %d.%d.", major, minor));
				return false;
			}
		}

		return true;
	}
Example No. 15
static CUresult get_cc(CUdevice dev, int *maj, int *min) {
#if CUDA_VERSION < 6500
  return cuDeviceComputeCapability(maj, min, dev);
#else
  CUresult lerr;
  lerr = cuDeviceGetAttribute(maj,
                              CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                              dev);
  if (lerr != CUDA_SUCCESS)
    return lerr;
  return cuDeviceGetAttribute(min,
                              CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
                              dev);
#endif
}
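
get_cc hides the API change: cuDeviceComputeCapability was deprecated in newer toolkits in favor of querying the major and minor values as device attributes, which is what the CUDA_VERSION branch above selects. A minimal usage sketch, assuming cuInit(0) has already succeeded:

/* Hedged usage sketch for the get_cc() wrapper above. */
CUdevice dev;
int major = 0, minor = 0;

if (cuDeviceGet(&dev, 0) == CUDA_SUCCESS &&
    get_cc(dev, &major, &minor) == CUDA_SUCCESS)
    printf("device 0: compute capability %d.%d\n", major, minor);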
Example No. 16
Object cuda_cores(Object self, int nparts, int *argcv,
        Object *argv, int flags) {
    cuInit(0);
    int deviceCount = 0;
    cuDeviceGetCount(&deviceCount);
    if (deviceCount == 0) {
        raiseError("No CUDA devices found");
    }
    CUdevice cuDevice;
    cuDeviceGet(&cuDevice, 0);   // use the first device; cuDevice was previously left uninitialized
    int mpcount;
    cuDeviceGetAttribute(&mpcount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
            cuDevice);
    int major, minor;
    cuDeviceComputeCapability(&major, &minor, cuDevice);
    mpcount *= coreMultiplicand(major, minor);
    return alloc_Float64(mpcount);
}
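
coreMultiplicand in the example above maps a compute capability to the number of CUDA cores per multiprocessor, in the spirit of _ConvertSMVer2Cores from the CUDA samples. A hedged sketch of such a lookup, with values for a few well-known architectures only:

/* Hedged sketch of a cores-per-SM lookup; only a few well-known
 * architectures are listed, anything else falls back to 128. */
static int coresPerSM(int major, int minor) {
    if (major == 2) return (minor == 1) ? 48 : 32;   /* Fermi          */
    if (major == 3) return 192;                      /* Kepler         */
    if (major == 5) return 128;                      /* Maxwell        */
    if (major == 6) return (minor == 0) ? 64 : 128;  /* Pascal         */
    if (major == 7) return 64;                       /* Volta / Turing */
    return 128;                                      /* assumed default */
}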
Example No. 17
int GPUInterface::Initialize() {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tEntering GPUInterface::Initialize\n");
#endif

    resourceMap = new std::map<int, int>;

    // Driver init; CUDA manual: "Currently, the Flags parameter must be 0."
    CUresult error = cuInit(0);

    if (error == CUDA_ERROR_NO_DEVICE) {
        return 0;
    } else if (error != CUDA_SUCCESS) {
        fprintf(stderr, "CUDA error: \"%s\" from file <%s>, line %i.\n",
                GetCUDAErrorDescription(error), __FILE__, __LINE__);
        exit(-1);
    }

    int numDevices = 0;
    SAFE_CUDA(cuDeviceGetCount(&numDevices));

    CUdevice tmpCudaDevice;
    int capabilityMajor;
    int capabilityMinor;
    int currentDevice = 0;
    for (int i=0; i < numDevices; i++) {
        SAFE_CUDA(cuDeviceGet(&tmpCudaDevice, i));
        SAFE_CUDA(cuDeviceComputeCapability(&capabilityMajor, &capabilityMinor, tmpCudaDevice));
        if ((capabilityMajor > 1 && capabilityMinor != 9999) || (capabilityMajor == 1 && capabilityMinor > 0)) {
            resourceMap->insert(std::make_pair(currentDevice++, i));
        }
    }

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tLeaving  GPUInterface::Initialize\n");
#endif

    return 1;
}
Example No. 18
bool VideoDecoderCUDAPrivate::initCuda()
{
    CUresult result = cuInit(0);
    if (result != CUDA_SUCCESS) {
        available = false;
        qWarning("cuInit(0) faile (%d)", result);
        return false;
    }
    cudev = GetMaxGflopsGraphicsDeviceId();

    int clockRate;
    cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, cudev);
    int major, minor;
    cuDeviceComputeCapability(&major, &minor, cudev);
    char devname[256];
    cuDeviceGetName(devname, 256, cudev);
    description = QString("CUDA device: %1 %2.%3 %4 MHz").arg(devname).arg(major).arg(minor).arg(clockRate/1000);

    //TODO: cuD3DCtxCreate > cuGLCtxCreate > cuCtxCreate
    checkCudaErrors(cuCtxCreate(&cuctx, CU_CTX_SCHED_BLOCKING_SYNC, cudev)); //CU_CTX_SCHED_AUTO?
    CUcontext cuCurrent = NULL;
    result = cuCtxPopCurrent(&cuCurrent);
    if (result != CUDA_SUCCESS) {
        qWarning("cuCtxPopCurrent: %d\n", result);
        return false;
    }
    checkCudaErrors(cuvidCtxLockCreate(&vid_ctx_lock, cuctx));
    {
        AutoCtxLock lock(this, vid_ctx_lock);
        Q_UNUSED(lock);
        //Flags- Parameters for stream creation (must be 0 (CU_STREAM_DEFAULT=0 in cuda5) in cuda 4.2, no CU_STREAM_NON_BLOCKING)
        checkCudaErrors(cuStreamCreate(&stream, 0));//CU_STREAM_NON_BLOCKING)); //CU_STREAM_DEFAULT
        //require compute capability >= 1.1
        //flag: Reserved for future use, must be 0
        //cuStreamAddCallback(stream, CUstreamCallback, this, 0);
    }
    return true;
}
Example No. 19
int cuda_api::GetMaxGflopsGraphicsDeviceId() {
    CUdevice current_device = 0, max_perf_device = 0;
    int device_count     = 0, sm_per_multiproc = 0;
    int max_compute_perf = 0, best_SM_arch     = 0;
    int major = 0, minor = 0, multiProcessorCount, clockRate;
    int bTCC = 0, version;
    char deviceName[256];

    cuDeviceGetCount(&device_count);
    if (device_count <= 0)
        return -1;

    cuDriverGetVersion(&version);
    // Find the best major SM Architecture GPU device that are graphics devices
    while (current_device < device_count) {
        cuDeviceGetName(deviceName, 256, current_device);
        cuDeviceComputeCapability(&major, &minor, current_device);
        if (version >= 3020) {
            cuDeviceGetAttribute(&bTCC, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, current_device);
        } else {
            // Assume a Tesla GPU is running in TCC if we are running CUDA 3.1
            if (deviceName[0] == 'T')
                bTCC = 1;
        }
        if (!bTCC) {
            if (major > 0 && major < 9999) {
                best_SM_arch = std::max(best_SM_arch, major);
            }
        }
        current_device++;
    }
    // Find the best CUDA capable GPU device
    current_device = 0;
    while (current_device < device_count) {
        cuDeviceGetAttribute(&multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, current_device);
        cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, current_device);
        cuDeviceComputeCapability(&major, &minor, current_device);
        if (version >= 3020) {
            cuDeviceGetAttribute(&bTCC, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, current_device);
        } else {
            // Assume a Tesla GPU is running in TCC if we are running CUDA 3.1
            if (deviceName[0] == 'T')
                bTCC = 1;
        }
        if (major == 9999 && minor == 9999) {
            sm_per_multiproc = 1;
        } else {
            sm_per_multiproc = _ConvertSMVer2Cores(major, minor);
        }
        // If this is a Tesla based GPU and SM 2.0, and TCC is disabled, this is a contender
        if (!bTCC) {// Is this GPU running the TCC driver?  If so we pass on this
            int compute_perf = multiProcessorCount * sm_per_multiproc * clockRate;
            printf("%s @%d compute_perf=%d max_compute_perf=%d\n", __FUNCTION__, __LINE__, compute_perf, max_compute_perf);
            if (compute_perf > max_compute_perf) {
                // If we find GPU with SM major > 2, search only these
                if (best_SM_arch > 2) {
                    printf("%s @%d best_SM_arch=%d\n", __FUNCTION__, __LINE__, best_SM_arch);
                    // If our device = dest_SM_arch, then we pick this one
                    if (major == best_SM_arch) {
                        max_compute_perf = compute_perf;
                        max_perf_device = current_device;
                    }
                } else {
                    max_compute_perf = compute_perf;
                    max_perf_device = current_device;
                }
            }
            cuDeviceGetName(deviceName, 256, current_device);
            printf("CUDA Device: %s, Compute: %d.%d, CUDA Cores: %d, Clock: %d MHz\n", deviceName, major, minor, multiProcessorCount * sm_per_multiproc, clockRate / 1000);
        }
        ++current_device;
    }
    return max_perf_device;
}
Example No. 20
static gpukernel *cuda_newkernel(void *c, unsigned int count,
                                 const char **strings, const size_t *lengths,
                                 const char *fname, unsigned int argcount,
                                 const int *types, int flags, int *ret,
                                 char **err_str) {
    cuda_context *ctx = (cuda_context *)c;
    strb sb = STRB_STATIC_INIT;
    char *bin, *log = NULL;
    srckey k, *ak;
    binval *av;
    gpukernel *res;
    size_t bin_len = 0, log_len = 0;
    CUdevice dev;
    unsigned int i;
    int ptx_mode = 0;
    int binary_mode = 0;
    int major, minor;

    if (count == 0) FAIL(NULL, GA_VALUE_ERROR);

    if (flags & GA_USE_OPENCL)
      FAIL(NULL, GA_DEVSUP_ERROR);

    if (flags & GA_USE_BINARY) {
      // GA_USE_BINARY is exclusive
      if (flags & ~GA_USE_BINARY)
        FAIL(NULL, GA_INVALID_ERROR);
      // We need the length for binary data and there is only one blob.
      if (count != 1 || lengths == NULL || lengths[0] == 0)
        FAIL(NULL, GA_VALUE_ERROR);
    }

    cuda_enter(ctx);

    ctx->err = cuCtxGetDevice(&dev);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      FAIL(NULL, GA_IMPL_ERROR);
    }
    ctx->err = cuDeviceComputeCapability(&major, &minor, dev);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      FAIL(NULL, GA_IMPL_ERROR);
    }

    // GA_USE_CLUDA is done later
    // GA_USE_SMALL will always work
    if (flags & GA_USE_DOUBLE) {
      if (major < 1 || (major == 1 && minor < 3)) {
        cuda_exit(ctx);
        FAIL(NULL, GA_DEVSUP_ERROR);
      }
    }
    if (flags & GA_USE_COMPLEX) {
      // just for now since it is most likely broken
      cuda_exit(ctx);
      FAIL(NULL, GA_DEVSUP_ERROR);
    }
    // GA_USE_HALF should always work

    if (flags & GA_USE_PTX) {
      ptx_mode = 1;
    } else if (flags & GA_USE_BINARY) {
      binary_mode = 1;
    }

    if (binary_mode) {
      bin = memdup(strings[0], lengths[0]);
      bin_len = lengths[0];
      if (bin == NULL) {
        cuda_exit(ctx);
        FAIL(NULL, GA_MEMORY_ERROR);
      }
    } else {
      if (flags & GA_USE_CLUDA) {
        strb_appends(&sb, CUDA_PREAMBLE);
      }

      if (lengths == NULL) {
        for (i = 0; i < count; i++)
          strb_appends(&sb, strings[i]);
      } else {
        for (i = 0; i < count; i++) {
          if (lengths[i] == 0)
            strb_appends(&sb, strings[i]);
          else
            strb_appendn(&sb, strings[i], lengths[i]);
        }
      }

      strb_append0(&sb);

      if (strb_error(&sb)) {
        strb_clear(&sb);
        cuda_exit(ctx);
        return NULL;
      }

      if (ptx_mode) {
        bin = sb.s;
        bin_len = sb.l;
      } else {
        bin = NULL;
        if (compile_cache != NULL) {
          k.src = sb.s;
          k.len = sb.l;
          memcpy(k.arch, ctx->bin_id, BIN_ID_LEN);
          av = cache_get(compile_cache, &k);
          if (av != NULL) {
            bin = memdup(av->bin, av->len);
            bin_len = av->len;
          }
        }
        if (bin == NULL) {
          bin = call_compiler(sb.s, sb.l, ctx->bin_id, &bin_len,
                              &log, &log_len, ret);
        }
        if (bin == NULL) {
          if (err_str != NULL) {
            strb debug_msg = STRB_STATIC_INIT;

            // We're substituting debug_msg for a string with this first line:
            strb_appends(&debug_msg, "CUDA kernel build failure ::\n");

            /* Delete the final NUL */
            sb.l--;
            gpukernel_source_with_line_numbers(1, (const char **)&sb.s,
                                               &sb.l, &debug_msg);

            if (log != NULL) {
              strb_appends(&debug_msg, "\nCompiler log:\n");
              strb_appendn(&debug_msg, log, log_len);
              free(log);
            }
            *err_str = strb_cstr(&debug_msg);
            // *err_str will be free()d by the caller (see docs in kernel.h)
          }
          strb_clear(&sb);
          cuda_exit(ctx);
          return NULL;
        }
        if (compile_cache == NULL)
          compile_cache = cache_twoq(16, 16, 16, 8, src_eq, src_hash, src_free,
                                     bin_free);

        if (compile_cache != NULL) {
          ak = malloc(sizeof(*ak));
          av = malloc(sizeof(*av));
          if (ak == NULL || av == NULL) {
            free(ak);
            free(av);
            goto done;
          }
          ak->src = memdup(sb.s, sb.l);
          if (ak->src == NULL) {
            free(ak);
            free(av);
            goto done;
          }
          ak->len = sb.l;
          memmove(ak->arch, ctx->bin_id, BIN_ID_LEN);
          av->len = bin_len;
          av->bin = memdup(bin, bin_len);
          if (av->bin == NULL) {
            src_free(ak);
            free(av);
            goto done;
          }
          cache_add(compile_cache, ak, av);
        }
      done:
        strb_clear(&sb);
      }
    }

    res = calloc(1, sizeof(*res));
    if (res == NULL) {
      free(bin);
      cuda_exit(ctx);
      FAIL(NULL, GA_SYS_ERROR);
    }

    res->bin_sz = bin_len;
    res->bin = bin;

    res->refcnt = 1;
    res->argcount = argcount;
    res->types = calloc(argcount, sizeof(int));
    if (res->types == NULL) {
      _cuda_freekernel(res);
      cuda_exit(ctx);
      FAIL(NULL, GA_MEMORY_ERROR);
    }
    memcpy(res->types, types, argcount*sizeof(int));
    res->args = calloc(argcount, sizeof(void *));
    if (res->args == NULL) {
      _cuda_freekernel(res);
      cuda_exit(ctx);
      FAIL(NULL, GA_MEMORY_ERROR);
    }

    ctx->err = cuModuleLoadData(&res->m, bin);

    if (ctx->err != CUDA_SUCCESS) {
      _cuda_freekernel(res);
      cuda_exit(ctx);
      FAIL(NULL, GA_IMPL_ERROR);
    }

    ctx->err = cuModuleGetFunction(&res->k, res->m, fname);
    if (ctx->err != CUDA_SUCCESS) {
      _cuda_freekernel(res);
      cuda_exit(ctx);
      FAIL(NULL, GA_IMPL_ERROR);
    }

    res->ctx = ctx;
    ctx->refcnt++;
    cuda_exit(ctx);
    TAG_KER(res);
    return res;
}
Example No. 21
CUDARunner::CUDARunner():GPURunner<unsigned long,int>(TYPE_CUDA)
{
	m_in=0;
	m_devin=0;
	m_out=0;
	m_devout=0;
	CUresult rval;
	int major=0;
	int minor=0;
	std::string cuda_module_path("bitcoinminercuda.ptx");

	rval=cuInit(0);

	if(rval==CUDA_SUCCESS)
	{
		rval=cuDeviceGetCount(&m_devicecount);

		printf("%d CUDA GPU devices found\n",m_devicecount);

		if(m_devicecount>0)
		{
			if(m_deviceindex>=0 && m_deviceindex<m_devicecount)
			{
				printf("Setting CUDA device to device %d\n",m_deviceindex);
				rval=cuDeviceGet(&m_device,m_deviceindex);
				if(rval!=CUDA_SUCCESS)
				{
					exit(0);
				}
			}
			else
			{
				m_deviceindex=0;
				printf("Setting CUDA device to first device found\n");
				rval=cuDeviceGet(&m_device,0);
				if(rval!=CUDA_SUCCESS)
				{
					exit(0);
				}
			}

			cuDeviceComputeCapability(&major, &minor, m_device);

			rval=cuCtxCreate(&m_context,CU_CTX_BLOCKING_SYNC,m_device);
			if(rval!=CUDA_SUCCESS)
			{
				printf("Unable to create CUDA context\n");
				exit(0);
			}

			printf("Loading module %s\n",cuda_module_path.c_str());
			rval=cuModuleLoad(&m_module,cuda_module_path.c_str());
			if(rval!=CUDA_SUCCESS)
			{
				printf("Unable to load CUDA module: %i\n", rval);
				cuCtxDestroy(m_context);
				exit(0);
			}

			rval=cuModuleGetFunction(&m_function,m_module,"cuda_process");
			if(rval!=CUDA_SUCCESS)
			{
				printf("Unable to get function cuda_process %d\n",rval);
				cuModuleUnload(m_module);
				cuCtxDestroy(m_context);
				exit(0);
			}

			printf("CUDA initialized\n");

		}
		else
		{
			printf("No CUDA capable devices found\n");
			exit(0);
		}
	}
	else
	{
		printf("Unable to initialize CUDA\n");
		exit(0);
	}
}
Example No. 22
void CudaModule::printDeviceInfo(CUdevice device)
{
    static const struct
    {
        CUdevice_attribute  attrib;
        const char*         name;
    } attribs[] =
    {
#define A21(ENUM, NAME) { CU_DEVICE_ATTRIBUTE_ ## ENUM, NAME },
#if (CUDA_VERSION >= 4000)
#   define A40(ENUM, NAME) A21(ENUM, NAME)
#else
#   define A40(ENUM, NAME) // TODO: Some of these may exist in earlier versions, too.
#endif

        A21(CLOCK_RATE,                         "Clock rate")
        A40(MEMORY_CLOCK_RATE,                  "Memory clock rate")
        A21(MULTIPROCESSOR_COUNT,               "Number of SMs")
//      A40(GLOBAL_MEMORY_BUS_WIDTH,            "DRAM bus width")
//      A40(L2_CACHE_SIZE,                      "L2 cache size")

        A21(MAX_THREADS_PER_BLOCK,              "Max threads per block")
        A40(MAX_THREADS_PER_MULTIPROCESSOR,     "Max threads per SM")
        A21(REGISTERS_PER_BLOCK,                "Registers per block")
//      A40(MAX_REGISTERS_PER_BLOCK,            "Max registers per block")
        A21(SHARED_MEMORY_PER_BLOCK,            "Shared mem per block")
//      A40(MAX_SHARED_MEMORY_PER_BLOCK,        "Max shared mem per block")
        A21(TOTAL_CONSTANT_MEMORY,              "Constant memory")
//      A21(WARP_SIZE,                          "Warp size")

        A21(MAX_BLOCK_DIM_X,                    "Max blockDim.x")
//      A21(MAX_BLOCK_DIM_Y,                    "Max blockDim.y")
//      A21(MAX_BLOCK_DIM_Z,                    "Max blockDim.z")
        A21(MAX_GRID_DIM_X,                     "Max gridDim.x")
//      A21(MAX_GRID_DIM_Y,                     "Max gridDim.y")
//      A21(MAX_GRID_DIM_Z,                     "Max gridDim.z")
//      A40(MAXIMUM_TEXTURE1D_WIDTH,            "Max tex1D.x")
//      A40(MAXIMUM_TEXTURE2D_WIDTH,            "Max tex2D.x")
//      A40(MAXIMUM_TEXTURE2D_HEIGHT,           "Max tex2D.y")
//      A40(MAXIMUM_TEXTURE3D_WIDTH,            "Max tex3D.x")
//      A40(MAXIMUM_TEXTURE3D_HEIGHT,           "Max tex3D.y")
//      A40(MAXIMUM_TEXTURE3D_DEPTH,            "Max tex3D.z")
//      A40(MAXIMUM_TEXTURE1D_LAYERED_WIDTH,    "Max layerTex1D.x")
//      A40(MAXIMUM_TEXTURE1D_LAYERED_LAYERS,   "Max layerTex1D.y")
//      A40(MAXIMUM_TEXTURE2D_LAYERED_WIDTH,    "Max layerTex2D.x")
//      A40(MAXIMUM_TEXTURE2D_LAYERED_HEIGHT,   "Max layerTex2D.y")
//      A40(MAXIMUM_TEXTURE2D_LAYERED_LAYERS,   "Max layerTex2D.z")
//      A40(MAXIMUM_TEXTURE2D_ARRAY_WIDTH,      "Max array.x")
//      A40(MAXIMUM_TEXTURE2D_ARRAY_HEIGHT,     "Max array.y")
//      A40(MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES,  "Max array.z")

//      A21(MAX_PITCH,                          "Max memcopy pitch")
//      A21(TEXTURE_ALIGNMENT,                  "Texture alignment")
//      A40(SURFACE_ALIGNMENT,                  "Surface alignment")

        A40(CONCURRENT_KERNELS,                 "Concurrent launches supported")
        A21(GPU_OVERLAP,                        "Concurrent memcopy supported")
        A40(ASYNC_ENGINE_COUNT,                 "Max concurrent memcopies")
//      A40(KERNEL_EXEC_TIMEOUT,                "Kernel launch time limited")
//      A40(INTEGRATED,                         "Integrated with host memory")
        A40(UNIFIED_ADDRESSING,                 "Unified addressing supported")
        A40(CAN_MAP_HOST_MEMORY,                "Can map host memory")
        A40(ECC_ENABLED,                        "ECC enabled")

//      A40(TCC_DRIVER,                         "Driver is TCC")
//      A40(COMPUTE_MODE,                       "Compute exclusivity mode")

//      A40(PCI_BUS_ID,                         "PCI bus ID")
//      A40(PCI_DEVICE_ID,                      "PCI device ID")
//      A40(PCI_DOMAIN_ID,                      "PCI domain ID")

#undef A21
#undef A40
    };

    char name[256];
    int major;
    int minor;
    size_t memory;

    checkError("cuDeviceGetName", cuDeviceGetName(name, FW_ARRAY_SIZE(name) - 1, device));
    checkError("cuDeviceComputeCapability", cuDeviceComputeCapability(&major, &minor, device));
    checkError("cuDeviceTotalMem", cuDeviceTotalMem(&memory, device));
    name[FW_ARRAY_SIZE(name) - 1] = '\0';

    printf("\n");
    char deviceIdStr[16];
    sprintf( deviceIdStr, "CUDA device %d", device);
    printf("%-32s%s\n",deviceIdStr, name);
        
    printf("%-32s%s\n", "---", "---");
    
    int version = getDriverVersion();
    printf("%-32s%d.%d\n", "CUDA driver API version", version/10, version%10);
    printf("%-32s%d.%d\n", "Compute capability", major, minor);
    printf("%-32s%.0f megs\n", "Total memory", (F32)memory * exp2(-20));

    for (int i = 0; i < (int)FW_ARRAY_SIZE(attribs); i++)
    {
        int value;
        if (cuDeviceGetAttribute(&value, attribs[i].attrib, device) == CUDA_SUCCESS)
            printf("%-32s%d\n", attribs[i].name, value);
    }
    printf("\n");
}
Example No. 23
    void test_driver_api()
    {
        CUdevice cuDevice;
        CUcontext cuContext;
        CUmodule cuModule;
        size_t totalGlobalMem;
        CUfunction matrixMult = 0;
        // cuda driver api intialization
        {
            int major = 0, minor = 0;
            char deviceName[256];   // must be at least as large as the length passed to cuDeviceGetName below

            cuda::Check::CUDAError(cuInit(0), "Error initializing cuda");
            int deviceCount;
            cuda::Check::CUDAError(cuDeviceGetCount(&deviceCount), "Error getting the number of devices");
            if (deviceCount <= 0)
            {
                std::cerr << "No devices found" << std::endl;
                return;
            }

            cuDeviceGet(&cuDevice, 0);

            // get compute capabilities and the devicename
            cuda::Check::CUDAError(cuDeviceComputeCapability(&major, &minor, cuDevice), "Error getting Device compute capability");
            cuda::Check::CUDAError(cuDeviceGetName(deviceName, 256, cuDevice), "Error getting device name");
            std::cout << "> GPU Device has SM " << major << "." << minor << " compute capability" << std::endl;

            cuda::Check::CUDAError(cuDeviceTotalMem(&totalGlobalMem, cuDevice), "Error getting total global memory");
            std::cout << "  Total amount of global memory:     " << (unsigned long long)totalGlobalMem << " bytes" << std::endl;
            std::string tmp = (totalGlobalMem > (unsigned long long)4 * 1024 * 1024 * 1024L) ? "YES" : "NO";
            std::cout << "  64-bit Memory Address:             " << tmp << std::endl;

            cuda::Check::CUDAError(cuCtxCreate(&cuContext, 0, cuDevice), "Error creating the context");
        }
        // Compile and get the function
        {
            std::string module_path = "MatrixMult.cubin";
            std::cout << "> initCUDA loading module: " << module_path << std::endl;

            cuda::Check::CUDAError(cuModuleLoad(&cuModule, module_path.c_str()), "Error loading module");

            cuda::Check::CUDAError(cuModuleGetFunction(&matrixMult, cuModule, "MatrixMultKernelSimpleDriverAPI"), "Error retrieving the function");
        }
        // Call the kernel
        {
            int WIDTH = BLOCK_SIZE;
            int HEIGHT = BLOCK_SIZE;
            std::stringstream text;
            text << "CUDA Matrix Multiplication (" << WIDTH << "x" << WIDTH << ") Simple method Multiplication time";
            HostMatrix<float> M(WIDTH, HEIGHT); M.fillWithRandomData(); //M.print(std::cout); 
            HostMatrix<float> N(WIDTH, HEIGHT); N.fill_diagonal(2); //N.print(std::cout); 
            HostMatrix<float> C(WIDTH, HEIGHT);
            {
                ScopedTimer t(text.str());

                // allocate device memory
                CUdeviceptr d_M;
                cuda::Check::CUDAError(cuMemAlloc(&d_M, M.sizeInBytes()), "Error allocating memory");
                CUdeviceptr d_N;
                cuda::Check::CUDAError(cuMemAlloc(&d_N, N.sizeInBytes()), "Error allocating memory");

                // copy host memory to device
                cuda::Check::CUDAError(cuMemcpyHtoD(d_M, M, M.sizeInBytes()), "Error uploading memory to device");
                cuda::Check::CUDAError(cuMemcpyHtoD(d_N, N, N.sizeInBytes()), "Error uploading memory to device");

                // allocate device memory for result
                CUdeviceptr d_C;
                cuda::Check::CUDAError(cuMemAlloc(&d_C, C.sizeInBytes()), "Error allocating memory");


                dim3 block(BLOCK_SIZE, BLOCK_SIZE, 1);
                dim3 grid(C.width_ / BLOCK_SIZE, C.height_ / BLOCK_SIZE, 1);
                void *args[6] = { &d_M, &d_N, &d_C, &WIDTH, &WIDTH, &WIDTH};

                // new CUDA 4.0 Driver API Kernel launch call
                cuda::Check::CUDAError(cuLaunchKernel(
                    matrixMult,                                     // Selected kernel function
                    grid.x, grid.y, grid.z,                         // grid config 
                    block.x, block.y, block.z,                      // block config
                    2 * BLOCK_SIZE*BLOCK_SIZE*sizeof(float),        
                    NULL, args, NULL), "Error executing Kernel");

                cuda::Check::CUDAError(cuMemcpyDtoH((void *)C, d_C, C.sizeInBytes()),"Error downloading memory to host");
            }
            C.print(std::cout);
        }

        cuCtxDestroy(cuContext);
    }
Example No. 24
/*
 * Initializes a CUPTI host thread and creates the event group.
 *
 * @param ptid the VampirTrace thread id
 * @param cuCtx optionally given CUDA context
 *
 * @return the created VampirTrace CUPTI host thread structure
 */
static vt_cupti_ctx_t* vt_cupti_initCtx(uint32_t ptid, CUcontext cuCtx)
{
  vt_cupti_ctx_t *vtcuptiCtx = NULL;
  uint64_t time;

  vt_cntl_msg(2, "[CUPTI] Initializing VampirTrace CUPTI context (ptid=%d)",
              ptid);
  
  time = vt_pform_wtime();
  vt_enter(ptid, &time, rid_cupti_init);

  /* do not trace CUDA functions invoked here */
  VT_SUSPEND_CUDA_TRACING(ptid);

  /* initialize CUDA driver API, if necessary and get context handle */
  if(cuCtx == NULL){
#if (defined(CUDA_VERSION) && (CUDA_VERSION < 4000))
    CHECK_CU_ERROR(cuCtxPopCurrent(&cuCtx), "cuCtxPopCurrent");
    CHECK_CU_ERROR(cuCtxPushCurrent(cuCtx), "cuCtxPushCurrent");
#else
    CHECK_CU_ERROR(cuCtxGetCurrent(&cuCtx), "cuCtxGetCurrent");
#endif
  }

  /* get a pointer to eventIDArray */
  {
    CUresult cuErr = CUDA_SUCCESS;
    int dev_major, dev_minor;
    CUdevice cuDev = 0;
    vt_cupti_dev_t *cuptiDev;

    CHECK_CU_ERROR(cuCtxGetDevice(&cuDev), "cuCtxGetDevice");

    cuErr = cuDeviceComputeCapability(&dev_major, &dev_minor, cuDev);
    CHECK_CU_ERROR(cuErr, "cuDeviceComputeCapability");

    /* check if device capability already listed */
    CUPTI_LOCK();
      cuptiDev = vt_cupti_capList;
    CUPTI_UNLOCK();
    
    cuptiDev = vt_cupti_checkMetricList(cuptiDev, dev_major, dev_minor);
    if(cuptiDev){
      vtcuptiCtx = (vt_cupti_ctx_t*)malloc(sizeof(vt_cupti_ctx_t));
      if(vtcuptiCtx == NULL)
        vt_error_msg("malloc(sizeof(VTCUPTIhostThrd)) failed!");
      vtcuptiCtx->cuCtx = cuCtx;
      vtcuptiCtx->vtDevCap = cuptiDev;
      vtcuptiCtx->vtGrpList = NULL;
      vtcuptiCtx->counterData = NULL;
      vtcuptiCtx->cuptiEvtIDs = NULL;
      vtcuptiCtx->next = NULL;
    }else{
      time = vt_pform_wtime();
      vt_exit(ptid, &time);
      VT_RESUME_CUDA_TRACING(ptid);
      return NULL;
    }
  }

  VT_RESUME_CUDA_TRACING(ptid);

  /* create and add the VampirTrace CUPTI groups to the context */
  vt_cupti_addEvtGrpsToCtx(vtcuptiCtx);

  /* allocate memory for CUPTI counter reads */
  {
    size_t allocSize = vtcuptiCtx->vtGrpList->evtNum;
    
    vtcuptiCtx->counterData = (uint64_t *)malloc(allocSize*sizeof(uint64_t));
    vtcuptiCtx->cuptiEvtIDs = (CUpti_EventID *)malloc(allocSize*sizeof(CUpti_EventID));
  }

  /* add VampirTrace CUPTI context entry to list (as first element) */
  CUPTI_LOCK();
    vtcuptiCtx->next = vtcuptiCtxlist;
    vtcuptiCtxlist = vtcuptiCtx;
  CUPTI_UNLOCK();

  time = vt_pform_wtime();
  vt_exit(ptid, &time);

  return vtcuptiCtx;
}
Example No. 25
/*
 * Set up a list of devices with different device capabilities and add the
 * metrics specified by the user.
 * 
 * @return a list of CUDA devices with different device capabilities
 */
static vt_cupti_device_t* vt_cuptievt_setupMetricList(void)
{
  CUresult err;
  int deviceCount, id;
  vt_cupti_device_t *capList = NULL;
  
  VT_SUSPEND_MALLOC_TRACING(VT_CURRENT_THREAD);
  
  /* CUDA initialization */
	VT_CUDRV_CALL(cuInit(0), "cuInit");
  
  /* How many GPGPU devices do we have? */
	err = cuDeviceGetCount( &deviceCount );
	VT_CUDRV_CALL(err, "cuDeviceGetCount");
	if(deviceCount == 0){
		vt_error_msg("[CUPTI Events] There is no device supporting CUDA available.");
	}

  /* create list with available compute capabilities */
  for(id = 0; id < deviceCount; id++){
    CUdevice cuDev;
    vt_cupti_device_t *cuptiDev;
    int dev_major, dev_minor;

    err = cuDeviceGet(&cuDev, id);
		VT_CUDRV_CALL(err, "cuDeviceGet");

    err = cuDeviceComputeCapability(&dev_major, &dev_minor, cuDev);
    VT_CUDRV_CALL(err, "cuDeviceComputeCapability");

    /* check if device capability already listed */
    cuptiDev = vt_cupti_checkMetricList(capList, dev_major, dev_minor);

    if(cuptiDev == NULL){
      /* allocate memory for device list entry */
      cuptiDev = (vt_cupti_device_t *)malloc(sizeof(vt_cupti_device_t));
      cuptiDev->dev_major = dev_major;
      cuptiDev->dev_minor = dev_minor;
      cuptiDev->cuDev = cuDev;
      cuptiDev->vtcuptiEvtList = NULL;
      cuptiDev->evtNum = 0;
      cuptiDev->next = NULL;

      /* prepend to list */
      cuptiDev->next = capList;
      capList = cuptiDev;
    }
  }

  vt_cupti_fillMetricList(capList);

  /* cleanup list: remove entries, which don't have metrics */
  {
    vt_cupti_device_t *curr = capList;
    vt_cupti_device_t *last = capList;

    while(curr != NULL){
      vt_cupti_device_t *freeDev = curr;
      curr = curr->next;

      if(freeDev->evtNum == 0){
        /* first element */
        if(freeDev == capList){
          capList = capList->next;
        }else{
          last->next = freeDev->next;
        }
        free(freeDev);
      }else last = freeDev;
    }
  }
  
  VT_RESUME_MALLOC_TRACING(VT_CURRENT_THREAD);

  return capList;
}
Example No. 26
/**
 * Basically the same functionality as cudaGetDeviceProperties looped over
 * cudaGetDeviceCount. Not sure why the CUDA driver API is used directly. Faster?
 * Basically 95% boilerplate code.
 *
 * Uses JNI to create an ArrayList[GpuDevice] object
 */
JNIEXPORT jobject JNICALL
Java_org_trifort_rootbeer_runtime_CUDARuntime_loadGpuDevices
(
    JNIEnv * env,
    jobject this_ref
)
{
    jclass    gpu_device_class;
    jmethodID gpu_device_init ;
    jobject   gpu_device      ;

    jclass    array_list_class = (*env)->FindClass  ( env, "java/util/ArrayList" );
    jmethodID array_list_init  = (*env)->GetMethodID( env, array_list_class, "<init>", "()V" );
    jmethodID array_list_add   = (*env)->GetMethodID( env, array_list_class, "add", "(Ljava/lang/Object;)Z" );
    jobject   ret              = (*env)->NewObject  ( env, array_list_class, array_list_init );

    gpu_device_class = (*env)->FindClass(env, "org/trifort/rootbeer/runtime/GpuDevice");
    gpu_device_init  = (*env)->GetStaticMethodID(env, gpu_device_class, "newCudaDevice",
      "(IIILjava/lang/String;JJIIIIIIIIZIIIIIIII)Lorg/trifort/rootbeer/runtime/GpuDevice;");
                          /* ^ function signature for constructor arguments */

    int status = cuInit(0);
    if ( status != CUDA_SUCCESS )
        return ret;

    int nDevices = 0;
    cuDeviceGetCount( &nDevices );

    int iDevice = 0;
    for ( iDevice = 0; iDevice < nDevices; ++iDevice )
    {
        CUdevice device;
        status = cuDeviceGet( &device, iDevice );
        if ( status != CUDA_SUCCESS )
            continue;

        int    major_version    ;
        int    minor_version    ;
        char   device_name[4096];
        size_t total_mem        ;

        CE( cuDeviceComputeCapability( &major_version, &minor_version, device ) );
        CE( cuDeviceGetName( device_name, 4096, device ) );
        CE( cuDeviceTotalMem( &total_mem, device ) );
        /* cuCtxCreate and Destroy are VERY expensive (~0.5s) and would only be necessary for free mem */
        // CE( cuCtxCreate ( &context, CU_CTX_MAP_HOST, device ) );
        // CE( cuMemGetInfo( &free_mem, &total_mem ) );
        // CE( cuCtxDestroy( context ) );

        /* makes use of https://gcc.gnu.org/onlinedocs/gcc/Statement-Exprs.html
         * to be able to write the constructor call, variable declaration and
         * call to cuDeviceGetAttribute in one go using a macro.
         * Also seems to work with -std=c99 switch:
         *   f(
         *       ({ int j = 5; j+1; })
         *   );
         */
        #define CUATTR( NAME )                                      \
        ( {                                                         \
            int NAME;                                               \
            CE( cuDeviceGetAttribute( &NAME,                        \
                                      CU_DEVICE_ATTRIBUTE_##NAME,   \
                                      device ) )                    \
            NAME;                                                   \
        } )

        /* @see GpuDevice.java */
        gpu_device = (*env)->CallObjectMethod
        (
            env                                     ,
            gpu_device_class                        ,
            gpu_device_init                         ,
            iDevice                                 ,  // device_id
            major_version                           ,  // major_version
            minor_version                           ,  // minor_version
            (*env)->NewStringUTF(env, device_name)  ,  // device_name
            -1                                      ,  // free_global_mem_size
            (jlong) total_mem                       ,  // total_global_mem_size
            CUATTR( MAX_REGISTERS_PER_BLOCK        ),  // max_registers_per_block
            CUATTR( WARP_SIZE                      ),  // warp_size
            CUATTR( MAX_PITCH                      ),  // max_pitch
            CUATTR( MAX_THREADS_PER_BLOCK          ),  // max_threads_per_block
            CUATTR( MAX_SHARED_MEMORY_PER_BLOCK    ),  // max_shared_memory_per_block
            CUATTR( CLOCK_RATE                     ),  // clock_rate
            CUATTR( MEMORY_CLOCK_RATE              ),  // memory_clock_rate
            CUATTR( TOTAL_CONSTANT_MEMORY          ),  // constant_mem_size
            CUATTR( INTEGRATED                     ),  // integrated
            CUATTR( MAX_THREADS_PER_MULTIPROCESSOR ),  // max_threads_per_multiprocessor
            CUATTR( MULTIPROCESSOR_COUNT           ),  // multiprocessor_count
            CUATTR( MAX_BLOCK_DIM_X                ),  // max_block_dim_x
            CUATTR( MAX_BLOCK_DIM_Y                ),  // max_block_dim_y
            CUATTR( MAX_BLOCK_DIM_Z                ),  // max_block_dim_z
            CUATTR( MAX_GRID_DIM_X                 ),  // max_grid_dim_x
            CUATTR( MAX_GRID_DIM_Y                 ),  // max_grid_dim_y
            CUATTR( MAX_GRID_DIM_Z                 )   // max_grid_dim_z
        );

        #undef CUATTR

        (*env)->CallBooleanMethod( env, ret, array_list_add, gpu_device );
    }

    return ret;
}
Example No. 27
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv) 
{
	CUdevice dev;
	int major = 0, minor = 0;
	int deviceCount = 0;
	char deviceName[256];

	// note your project will need to link with cuda.lib files on windows
	printf("CUDA Device Query (Driver API) statically linked version \n");

	CUresult err = cuInit(0);
    CU_SAFE_CALL_NO_SYNC(cuDeviceGetCount(&deviceCount));
	// This function call returns 0 if there are no CUDA capable devices.
	if (deviceCount == 0) {
        printf("There is no device supporting CUDA\n");
	}
    for (dev = 0; dev < deviceCount; ++dev) {
		CU_SAFE_CALL_NO_SYNC( cuDeviceComputeCapability(&major, &minor, dev) );

        if (dev == 0) {
			// This function call returns 9999 for both major & minor fields, if no CUDA capable devices are present
            if (major == 9999 && minor == 9999)
                printf("There is no device supporting CUDA.\n");
            else if (deviceCount == 1)
                printf("There is 1 device supporting CUDA\n");
            else
                printf("There are %d devices supporting CUDA\n", deviceCount);
        }
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetName(deviceName, 256, dev) );
        printf("\nDevice %d: \"%s\"\n", dev, deviceName);

    #if CUDA_VERSION >= 2020
		int driverVersion = 0;
		cuDriverGetVersion(&driverVersion);
		printf("  CUDA Driver Version:                           %d.%d\n", driverVersion/1000, driverVersion%100);
    #endif
        shrLog("  CUDA Capability Major/Minor version number:    %d.%d\n", major, minor);

		size_t totalGlobalMem;
		CU_SAFE_CALL_NO_SYNC( cuDeviceTotalMem(&totalGlobalMem, dev) );
        printf("  Total amount of global memory:                 %llu bytes\n", (unsigned long long)totalGlobalMem);

    #if CUDA_VERSION >= 2000
	    int multiProcessorCount;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev ) );
        shrLog("  Multiprocessors x Cores/MP = Cores:            %d (MP) x %d (Cores/MP) = %d (Cores)\n", 
			     multiProcessorCount, ConvertSMVer2Cores(major, minor), 
				 ConvertSMVer2Cores(major, minor) * multiProcessorCount);
	#endif

 	    int totalConstantMemory;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &totalConstantMemory, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, dev ) );
        printf("  Total amount of constant memory:               %u bytes\n", totalConstantMemory);
 	    int sharedMemPerBlock;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &sharedMemPerBlock, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, dev ) );
        printf("  Total amount of shared memory per block:       %u bytes\n", sharedMemPerBlock);
 	    int regsPerBlock;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &regsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev ) );
        printf("  Total number of registers available per block: %d\n", regsPerBlock);
 	    int warpSize;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev ) );
        printf("  Warp size:                                     %d\n",	warpSize);
 	    int maxThreadsPerBlock;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &maxThreadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev ) );
		printf("  Maximum number of threads per block:           %d\n",	maxThreadsPerBlock);
 	    int blockDim[3];
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &blockDim[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, dev ) );
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &blockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, dev ) );
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &blockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, dev ) );
        printf("  Maximum sizes of each dimension of a block:    %d x %d x %d\n", blockDim[0], blockDim[1], blockDim[2]);
 	    int gridDim[3];
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &gridDim[0], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, dev ) );
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &gridDim[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, dev ) );
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &gridDim[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, dev ) );
        printf("  Maximum sizes of each dimension of a grid:     %d x %d x %d\n", gridDim[0], gridDim[1], gridDim[2]);
  	    int memPitch;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev ) );
        printf("  Maximum memory pitch:                          %u bytes\n", memPitch);
  	    int textureAlign;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, dev ) );
        printf("  Texture alignment:                             %u bytes\n", textureAlign);
  	    int clockRate;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev ) );
        printf("  Clock rate:                                    %.2f GHz\n", clockRate * 1e-6f);
    #if CUDA_VERSION >= 2000
	    int gpuOverlap;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &gpuOverlap, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev ) );
        printf("  Concurrent copy and execution:                 %s\n",gpuOverlap ? "Yes" : "No");
    #endif

    #if CUDA_VERSION >= 2020
	    int kernelExecTimeoutEnabled;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &kernelExecTimeoutEnabled, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, dev ) );
        printf("  Run time limit on kernels:                     %s\n", kernelExecTimeoutEnabled ? "Yes" : "No");
	    int integrated;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev ) );
        printf("  Integrated:                                    %s\n", integrated ? "Yes" : "No");
	    int canMapHostMemory;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &canMapHostMemory, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev ) );
        printf("  Support host page-locked memory mapping:       %s\n", canMapHostMemory ? "Yes" : "No");
    #endif

    #if CUDA_VERSION >= 3000
	    int concurrentKernels;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &concurrentKernels, CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev ) );
        printf("  Concurrent kernel execution:                   %s\n", concurrentKernels ? "Yes" : "No");
	    int eccEnabled;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &eccEnabled,  CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev ) );
        printf("  Device has ECC support enabled:                %s\n", eccEnabled ? "Yes" : "No");
    #endif

    #if CUDA_VERSION >= 3020
	    int tccDriver ;
		CU_SAFE_CALL_NO_SYNC( cuDeviceGetAttribute( &tccDriver ,  CU_DEVICE_ATTRIBUTE_TCC_DRIVER, dev ) );
		printf("  Device is using TCC driver mode:               %s\n", tccDriver ? "Yes" : "No");
    #endif
	}
    printf("\nPASSED\n");
    CUT_EXIT(argc, argv);
}
Example No. 28
	string compile_kernel()
	{
		/* compute cubin name */
		int major, minor;
		cuDeviceComputeCapability(&major, &minor, cuDevId);

		/* attempt to use kernel provided with blender */
		string cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin", major, minor));
		if(path_exists(cubin))
			return cubin;

		/* not found, try to use locally compiled kernel */
		string kernel_path = path_get("kernel");
		string md5 = path_files_md5_hash(kernel_path);

		cubin = string_printf("cycles_kernel_sm%d%d_%s.cubin", major, minor, md5.c_str());
		cubin = path_user_get(path_join("cache", cubin));

		/* if exists already, use it */
		if(path_exists(cubin))
			return cubin;

#ifdef _WIN32
		if(cuHavePrecompiledKernels()) {
			if(major < 2)
				cuda_error_message(string_printf("CUDA device requires compute capability 2.0 or up, found %d.%d. Your GPU is not supported.", major, minor));
			else
				cuda_error_message(string_printf("CUDA binary kernel for this graphics card compute capability (%d.%d) not found.", major, minor));
			return "";
		}
#endif

		/* if not, find CUDA compiler */
		string nvcc = cuCompilerPath();

		if(nvcc == "") {
			cuda_error_message("CUDA nvcc compiler not found. Install CUDA toolkit in default location.");
			return "";
		}

		/* compile */
		string kernel = path_join(kernel_path, "kernel.cu");
		string include = kernel_path;
		const int machine = system_cpu_bits();
		const int maxreg = 24;

		double starttime = time_dt();
		printf("Compiling CUDA kernel ...\n");

		path_create_directories(cubin);

		string command = string_printf("\"%s\" -arch=sm_%d%d -m%d --cubin \"%s\" "
			"-o \"%s\" --ptxas-options=\"-v\" --maxrregcount=%d --opencc-options -OPT:Olimit=0 -I\"%s\" -DNVCC",
			nvcc.c_str(), major, minor, machine, kernel.c_str(), cubin.c_str(), maxreg, include.c_str());

		if(system(command.c_str()) == -1) {
			cuda_error_message("Failed to execute compilation command, see console for details.");
			return "";
		}

		/* verify if compilation succeeded */
		if(!path_exists(cubin)) {
			cuda_error_message("CUDA kernel compilation failed, see console for details.");
			return "";
		}

		printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);

		return cubin;
	}
Example No. 29
static vt_cupti_dev_t* vt_cupti_setupMetricList(void)
{
  CUresult err;
  int deviceCount, id;
  vt_cupti_dev_t *capList = NULL;

  /* CUDA initialization */
	err = cuInit( 0 );
	if ( err != CUDA_SUCCESS ) {
		printf( "Initialization of CUDA library failed.\n" );
		exit( EXIT_FAILURE );
	}

  /* How many gpgpu devices do we have? */
	err = cuDeviceGetCount( &deviceCount );
	CHECK_CU_ERROR(err, "cuDeviceGetCount");
	if(deviceCount == 0){
		printf("[CUPTI]There is no device supporting CUDA.\n");
		exit(EXIT_FAILURE);
	}

  /* create list with available compute capabilities */
  for(id = 0; id < deviceCount; id++){
    CUdevice cuDev;
    vt_cupti_dev_t *cuptiDev;
    int dev_major, dev_minor;

    err = cuDeviceGet(&cuDev, id);
		CHECK_CU_ERROR(err, "cuDeviceGet");

    err = cuDeviceComputeCapability(&dev_major, &dev_minor, cuDev);
    CHECK_CU_ERROR(err, "cuDeviceComputeCapability");

    /* check if device capability already listed */
    cuptiDev = vt_cupti_checkMetricList(capList, dev_major, dev_minor);

    if(cuptiDev == NULL){
      /* allocate memory for device list entry */
      cuptiDev = (vt_cupti_dev_t *)malloc(sizeof(vt_cupti_dev_t));
      cuptiDev->dev_major = dev_major;
      cuptiDev->dev_minor = dev_minor;
      cuptiDev->cuDev = cuDev;
      cuptiDev->vtcuptiEvtList = NULL;
      cuptiDev->evtNum = 0;
      cuptiDev->next = NULL;

      /* prepend to list */
      cuptiDev->next = capList;
      capList = cuptiDev;
    }
  }

  vt_cupti_fillMetricList(capList);

  /* cleanup list: remove entries, which don't have metrics */
  {
    vt_cupti_dev_t *curr = capList;
    vt_cupti_dev_t *last = capList;

    while(curr != NULL){
      vt_cupti_dev_t *freeDev = curr;
      curr = curr->next;

      if(freeDev->evtNum == 0){
        /* first element */
        if(freeDev == capList){
          capList = capList->next;
        }else{
          last->next = freeDev->next;
        }
        free(freeDev);
      }else last = freeDev;
    }
  }

  return capList;
}
Example No. 30
/// main - Program entry point
int main(int argc, char** argv) {
  if (argc != 3) {
    printf("Usage: %s dataCount blockSize\n", argv[0]);
    exit(1);
  }

  CUdevice device;
  CUmodule cudaModule;
  CUcontext context;
  CUfunction function;
  CUlinkState linker;
  int devCount;

  // CUDA initialization
  checkCudaErrors(cuInit(0));
  checkCudaErrors(cuDeviceGetCount(&devCount));
  checkCudaErrors(cuDeviceGet(&device, 0));

  char name[128];
  checkCudaErrors(cuDeviceGetName(name, 128, device));
  std::cout << "Using CUDA Device [0]: " << name << "\n";

  int devMajor, devMinor;
  checkCudaErrors(cuDeviceComputeCapability(&devMajor, &devMinor, device));
  std::cout << "Device Compute Capability: " << devMajor << "." << devMinor
            << "\n";
  if (devMajor < 2) {
    std::cerr << "ERROR: Device 0 is not SM 2.0 or greater\n";
    return 1;
  }

  std::ifstream t("kernel.ptx");
  if (!t.is_open()) {
    std::cerr << "kernel.ptx not found\n";
    return 1;
  }
  std::string str((std::istreambuf_iterator<char>(t)),
                  std::istreambuf_iterator<char>());

  // Create driver context
  checkCudaErrors(cuCtxCreate(&context, 0, device));

  // Create module for object
  checkCudaErrors(cuModuleLoadDataEx(&cudaModule, str.c_str(), 0, 0, 0));

  // Get kernel function
  checkCudaErrors(cuModuleGetFunction(&function, cudaModule, "kernel"));

  // Device data
  CUdeviceptr devBufferA;
  CUdeviceptr devBufferB;
  CUdeviceptr devBufferC;
  CUdeviceptr devBufferSMid;

  // Size
  unsigned dataCount = atoi(argv[1]);

  checkCudaErrors(cuMemAlloc(&devBufferA, sizeof(float) * dataCount));
  checkCudaErrors(cuMemAlloc(&devBufferB, sizeof(float) * dataCount));
  checkCudaErrors(cuMemAlloc(&devBufferC, sizeof(float) * dataCount));
  checkCudaErrors(cuMemAlloc(&devBufferSMid, sizeof(int) * dataCount));

  float* hostA = new float[dataCount];
  float* hostB = new float[dataCount];
  float* hostC = new float[dataCount];
  int* hostSMid = new int[dataCount];

  // Populate input
  for (unsigned i = 0; i != dataCount; ++i) {
    hostA[i] = (float)i;
    hostB[i] = (float)(2 * i);
    hostC[i] = 2.0f;
    hostSMid[i] = 0;
  }

  checkCudaErrors(
      cuMemcpyHtoD(devBufferA, &hostA[0], sizeof(float) * dataCount));
  checkCudaErrors(
      cuMemcpyHtoD(devBufferB, &hostB[0], sizeof(float) * dataCount));

  unsigned blockSizeX = atoi(argv[2]);
  unsigned blockSizeY = 1;
  unsigned blockSizeZ = 1;
  unsigned gridSizeX = (dataCount + blockSizeX - 1) / blockSizeX;
  unsigned gridSizeY = 1;
  unsigned gridSizeZ = 1;

  // Kernel parameters
  void* KernelParams[] = {&devBufferA, &devBufferB, &devBufferC,
                          &devBufferSMid};

  std::cout << "Launching kernel\n";

  // Kernel launch
  checkCudaErrors(cuLaunchKernel(function, gridSizeX, gridSizeY, gridSizeZ,
                                 blockSizeX, blockSizeY, blockSizeZ, 0, NULL,
                                 KernelParams, NULL));

  // Retrieve device data
  checkCudaErrors(
      cuMemcpyDtoH(&hostC[0], devBufferC, sizeof(float) * dataCount));
  checkCudaErrors(
      cuMemcpyDtoH(&hostSMid[0], devBufferSMid, sizeof(int) * dataCount));

  std::cout << "Results:\n";
  std::cout << "SM " << hostSMid[0] << ":" << hostA[0] << " + " << hostB[0]
            << " = " << hostC[0] << "\n";
  for (unsigned i = 1; i != dataCount; i++) {
    if (hostSMid[i] != hostSMid[i - 1])
      std::cout << "SM " << hostSMid[i] << ":" << hostA[i] << " + " << hostB[i]
                << " = " << hostC[i] << "\n";
  }

  // Clean up after ourselves
  delete[] hostA;
  delete[] hostB;
  delete[] hostC;
  delete[] hostSMid;

  // Clean-up
  checkCudaErrors(cuMemFree(devBufferA));
  checkCudaErrors(cuMemFree(devBufferB));
  checkCudaErrors(cuMemFree(devBufferC));
  checkCudaErrors(cuMemFree(devBufferSMid));
  checkCudaErrors(cuModuleUnload(cudaModule));
  checkCudaErrors(cuCtxDestroy(context));

  return 0;
}