Example no. 1
// For internal use.
// If currently uninitialized, allocate memory directly on the GPU.
// If the data is currently on the CPU, allocate GPU memory and copy the data over.
// If the data is already on the GPU, do nothing.
inline void SyncedMemory::to_gpu() {
#ifndef CPU_ONLY
  switch (head_) {
  case UNINITIALIZED:
    // Get the current device
    CUDA_CHECK(cudaGetDevice(&gpu_device_));
    // Allocate memory on the device
    CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
    // Initialize to zero
    caffe_gpu_memset(size_, 0, gpu_ptr_);
    head_ = HEAD_AT_GPU;
    own_gpu_data_ = true;
    break;
  case HEAD_AT_CPU:
    if (gpu_ptr_ == NULL) {
      CUDA_CHECK(cudaGetDevice(&gpu_device_));
      CUDA_CHECK(cudaMalloc(&gpu_ptr_, size_));
      own_gpu_data_ = true;
    }
    caffe_gpu_memcpy(size_, cpu_ptr_, gpu_ptr_);
    head_ = SYNCED;
    break;
  case HEAD_AT_GPU:
  case SYNCED:
    break;
  }
#else
  NO_GPU;
#endif
}
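For context, here is a minimal standalone sketch of the same lazy allocate-then-copy pattern using only CUDA runtime calls; the SyncState enum and to_gpu_plain helper are hypothetical stand-ins for SyncedMemory's head_ state machine, not Caffe code:

#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

enum SyncState { UNINIT, AT_CPU, AT_GPU, IN_SYNC };  // hypothetical mirror of head_

static void check(cudaError_t e) {
  if (e != cudaSuccess) { fprintf(stderr, "%s\n", cudaGetErrorString(e)); exit(1); }
}

// Allocate device memory only when first needed, and copy host data at most once.
void to_gpu_plain(SyncState& state, const void* host, void** device, size_t size) {
  switch (state) {
  case UNINIT:
    check(cudaMalloc(device, size));
    check(cudaMemset(*device, 0, size));   // fresh buffer starts zeroed, like caffe_gpu_memset
    state = AT_GPU;
    break;
  case AT_CPU:
    if (*device == NULL) check(cudaMalloc(device, size));
    check(cudaMemcpy(*device, host, size, cudaMemcpyHostToDevice));
    state = IN_SYNC;
    break;
  case AT_GPU:
  case IN_SYNC:
    break;  // already on the device, nothing to do
  }
}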
Example no. 2
        ~cuda_pattern_data()
        {
            int current_id;
            cuda_assert( cudaGetDevice(&current_id) );
            if ( current_id != device_id ) cuda_assert( cudaSetDevice( device_id ) );

            if ( ar ) cuda_assert( cudaFree(ar) );
            if ( dim ) cuda_assert( cudaFree(dim) );
            if ( I_diff ) cuda_assert( cudaFree(I_diff) );
            if ( I_exp ) cuda_assert( cudaFree(I_exp) );
            if ( I_zigmoid ) cuda_assert( cudaFree(I_zigmoid) );
            if ( diag ) cuda_assert( cudaFree(diag) );
            if ( ug ) cuda_assert( cudaFree(ug) );
            if ( cache ) cuda_assert( cudaFree(cache) );
            if ( beams ) cuda_assert( cudaFree(beams) );
            if ( kt_factor ) cuda_assert( cudaFree(kt_factor) );
            if ( gvec ) cuda_assert( cudaFree(gvec) );
            if ( tilt ) cuda_assert( cudaFree(tilt) );

            ar = 0;
            dim = 0;
            I_diff = 0;
            I_exp = 0;
            I_zigmoid = 0;
            diag = 0;
            ug = 0;
            cache = 0;
            beams = 0;
            kt_factor = 0;
            gvec = 0;
            tilt = 0;
        }
Example no. 3
struct cudaDeviceProp* THCState_getCurrentDeviceProperties(THCState* state)
{
  int curDev = -1;
  THCudaCheck(cudaGetDevice(&curDev));

  return &(state->deviceProperties[curDev]);
}
Example no. 4
void Engine::DeviceQuery() {
  cudaDeviceProp prop;
  int device;
  if (cudaSuccess != cudaGetDevice(&device)) {
    printf("No cuda device present.\n");
    return;
  }
  CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
  LOG(INFO) << "Device id:                     " << device;
  LOG(INFO) << "Major revision number:         " << prop.major;
  LOG(INFO) << "Minor revision number:         " << prop.minor;
  LOG(INFO) << "Name:                          " << prop.name;
  LOG(INFO) << "Total global memory:           " << prop.totalGlobalMem;
  LOG(INFO) << "Total shared memory per block: " << prop.sharedMemPerBlock;
  LOG(INFO) << "Total registers per block:     " << prop.regsPerBlock;
  LOG(INFO) << "Warp size:                     " << prop.warpSize;
  LOG(INFO) << "Maximum memory pitch:          " << prop.memPitch;
  LOG(INFO) << "Maximum threads per block:     " << prop.maxThreadsPerBlock;
  LOG(INFO) << "Maximum dimension of block:    "
      << prop.maxThreadsDim[0] << ", " << prop.maxThreadsDim[1] << ", "
      << prop.maxThreadsDim[2];
  LOG(INFO) << "Maximum dimension of grid:     "
      << prop.maxGridSize[0] << ", " << prop.maxGridSize[1] << ", "
      << prop.maxGridSize[2];
  LOG(INFO) << "Clock rate:                    " << prop.clockRate;
  LOG(INFO) << "Total constant memory:         " << prop.totalConstMem;
  LOG(INFO) << "Texture alignment:             " << prop.textureAlignment;
  LOG(INFO) << "Concurrent copy and execution: "
      << (prop.deviceOverlap ? "Yes" : "No");
  LOG(INFO) << "Number of multiprocessors:     " << prop.multiProcessorCount;
  LOG(INFO) << "Kernel execution timeout:      "
      << (prop.kernelExecTimeoutEnabled ? "Yes" : "No");
  return;
}
Example no. 5
cudaError_t THCudaMemGetInfoCached(THCState *state,  size_t* freeBytes, size_t* totalBytes, size_t* largestBlock)
{
  size_t cachedBytes = 0;
  THCDeviceAllocator* allocator = state->cudaDeviceAllocator;

  *largestBlock = 0;
  /* get info from CUDA first */
  cudaError_t ret = cudaMemGetInfo(freeBytes, totalBytes);
  if (ret != cudaSuccess)
    return ret;

  int device;
  ret = cudaGetDevice(&device);
  if (ret != cudaSuccess)
    return ret;

  /* not always true - our optimistic guess here */
  *largestBlock = *freeBytes;

  if (allocator->cacheInfo != NULL)
    allocator->cacheInfo(allocator->state, device, &cachedBytes, largestBlock);

  /* Adjust the resulting free-bytes count. largestBlock is unused for now */
  *freeBytes += cachedBytes;
  return cudaSuccess;
}
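The uncached half of this query is plain cudaMemGetInfo; a minimal standalone sketch (not THC code), assuming the current device is already set:

#include <cuda_runtime.h>
#include <cstdio>

int main() {
  size_t freeBytes = 0, totalBytes = 0;
  if (cudaMemGetInfo(&freeBytes, &totalBytes) != cudaSuccess) return 1;
  // These numbers come from the driver and know nothing about blocks a caching
  // allocator may be holding, which is exactly why THC adds cachedBytes above.
  printf("free: %zu of %zu bytes\n", freeBytes, totalBytes);
  return 0;
}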
Example no. 6
void Caffe::DeviceQuery() {
  cudaDeviceProp prop;
  int device;
  if (cudaSuccess != cudaGetDevice(&device)) {
    printf("No cuda device present.\n");
    return;
  }
  CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
  printf("Device id:                     %d\n", device);
  printf("Major revision number:         %d\n", prop.major);
  printf("Minor revision number:         %d\n", prop.minor);
  printf("Name:                          %s\n", prop.name);
  printf("Total global memory:           %lu\n", prop.totalGlobalMem);
  printf("Total shared memory per block: %lu\n", prop.sharedMemPerBlock);
  printf("Total registers per block:     %d\n", prop.regsPerBlock);
  printf("Warp size:                     %d\n", prop.warpSize);
  printf("Maximum memory pitch:          %lu\n", prop.memPitch);
  printf("Maximum threads per block:     %d\n", prop.maxThreadsPerBlock);
  printf("Maximum dimension of block:    %d, %d, %d\n",
      prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
  printf("Maximum dimension of grid:     %d, %d, %d\n",
      prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
  printf("Clock rate:                    %d\n", prop.clockRate);
  printf("Total constant memory:         %lu\n", prop.totalConstMem);
  printf("Texture alignment:             %lu\n", prop.textureAlignment);
  printf("Concurrent copy and execution: %s\n",
      (prop.deviceOverlap ? "Yes" : "No"));
  printf("Number of multiprocessors:     %d\n", prop.multiProcessorCount);
  printf("Kernel execution timeout:      %s\n",
      (prop.kernelExecTimeoutEnabled ? "Yes" : "No"));
  return;
}
Example no. 7
void GpuDeviceInformationDialog::setupGpuDeviceTabPages()
{
	
	int numDevs = 0;
	cudaGetDeviceCount(&numDevs);

	this->setWindowTitle(QString("GPU Device Information (") + QString::number(numDevs) + QString(" devices found)"));

	for(int i = 0; i < numDevs; i++)
	{
		cudaDeviceProp devProp;
		cudaGetDeviceProperties(&devProp, i);

		QWidget* deviceTabPage = new GpuDeviceInformationDialogTabPage(devProp, i);
		
		this->tabWidget->addTab(deviceTabPage, devProp.name);

		connect(deviceTabPage, SIGNAL(setMainComputeDevice(int)), this, SLOT(on_setMainComputeDevice(int)));
		connect(this, SIGNAL(hasChangedMainComputeDevice(int)), deviceTabPage, SLOT(on_hasChangedMainComputeDevice(int)));

	}

	int currentComputeDevice;
	cudaGetDevice(&currentComputeDevice);

	emit hasChangedMainComputeDevice(currentComputeDevice);
}
Example no. 8
__declspec(dllexport) int __stdcall GetDevice()
{
    int device = 0;
    cudaGetDevice(&device);

    return device;
}
Example no. 9
// start processing of jobs
void rcrackiThread::rcrackiThreadEntryPoint()
{
#if GPU
	if(gpu != 0 && cudaGetDevice(&cudaDevId) == CUDA_SUCCESS) {
		cudaBuffCount = 0x2000;
		cudaChainSize = 100;

		cudaDeviceProp deviceProp;
		if(cudaGetDeviceProperties(&deviceProp, cudaDevId) == CUDA_SUCCESS) {
			switch(deviceProp.major) {
			case 1: ; break;
			case 2:
				cudaBuffCount = 0x4000;
				cudaChainSize = 200;
				break;
			}
		}
		cudaBuffCount = rcuda::GetChainsBufferSize(cudaBuffCount);
	}
	else
#endif
		cudaDevId = -1;

	if (falseAlarmChecker) {
		if (falseAlarmCheckerO) {
			CheckAlarmO();
		}
		else {
			CheckAlarm();
		}
	}
	else {
		PreCalculate();
	}
}
Example no. 10
bool TryToAddSingleFitStream(void * vpsM, WorkerInfoQueue* q){
#ifdef ION_COMPILE_CUDA
  int dev_id = 0;
  cudaStreamManager * psM = (cudaStreamManager *) vpsM;
  SingleFitStream * temp;
  cudaGetDevice( &dev_id );
  int i;
    try { // exception handling to allow fallback to CPU Fit if not a single stream could be created
      temp =  new SingleFitStream(q);
      i = psM->addStreamUnit( temp);
      std::cout <<"CUDA: Device " <<  dev_id <<  " Single Fit stream " << i <<" created " << std::endl;
      psM->printMemoryUsage();
    }
    catch(cudaException& e)
    {
      cout << e.what() << endl;
      if(psM->getNumStreams() > 0){ 
        cout << "CUDA: Device " << dev_id<< " could not create more than " << psM->getNumStreams() << " Single Fit streams" << std::endl;       
        psM->printMemoryUsage();
      }else{
        std::cout << "CUDA: Device " << dev_id << " no Single Fit streams could be created >>>>>>>>>>>>>>>>> FALLING BACK TO CPU!"<< std::endl;
        return false;
      }
    }

#endif
  return true;
}
Example no. 11
const cudaDeviceProp& getCurrentDeviceProperties() {
  int device = 0;
  auto err = cudaGetDevice(&device);
  checkCuda(err, std::string("CUDA ERROR: cudaGetDevice "));

  return getDeviceProperties(device);
}
  GpuSurfDetectorInternal::GpuSurfDetectorInternal(GpuSurfConfiguration config) : 
    m_initialized(false),
    m_config(config)
  {
    int deviceCount;
    int device;
    cudaError_t err;
    cudaGetDeviceCount(&deviceCount);
    ASRL_ASSERT_GT(deviceCount,0,"There are no CUDA capable devices present");
    
	
    err = cudaGetDevice(&device);
    ASRL_ASSERT_EQ(err,cudaSuccess, "Unable to get the CUDA device: " << cudaGetErrorString(err));		
    //std::cout << "Found device " << device << std::endl;
    err = cudaGetDeviceProperties(&m_deviceProp,device);
    ASRL_ASSERT_EQ(err,cudaSuccess, "Unable to get the CUDA device properties: " << cudaGetErrorString(err));		

    // Some more checking...
    ASRL_ASSERT_GE(m_deviceProp.major,1,"Minimum compute capability 1.1 is necessary");
    ASRL_ASSERT_GE(m_deviceProp.minor,1,"Minimum compute capability 1.1 is necessary");

    m_maxmin.init(ASRL_SURF_MAX_CANDIDATES,false);
    m_maxmin.memset(0);

  }
int CUDADevicesService::getMaximumTexture2DHeight() {
	int device;
	cudaGetDevice(&device);
	cudaDeviceProp devProperties;
	cudaGetDeviceProperties(&devProperties, device);
	return devProperties.maxTexture2D[1];
}
int CUDADevicesService::getSharedMemoryPerBlock() {
	int device;
	cudaGetDevice(&device);
	cudaDeviceProp devProperties;
	cudaGetDeviceProperties(&devProperties, device);
	return devProperties.sharedMemPerBlock;
}
Example no. 15
/*
   Usage:
   cutorch.streamWaitFor(waiterStream, {waitForStream1, ..., waitForStreamN})
   for streams on the current device. Creates a one-way barrier where
   waiterStream waits for waitForStream1-N to reach the current point.
*/
static int cutorch_streamWaitFor(lua_State *L)
{
  THCState *state = cutorch_getstate(L);

  int curDev = -1;
  THCudaCheck(cudaGetDevice(&curDev));

  /* Check that the waiting stream is in bounds; this will error out if not */
  int waitingId = (int) luaL_checknumber(L, 1);
  cudaStream_t streamWaiting =
    THCState_getDeviceStream(state, curDev, waitingId);

  /* Validate the streams that we are waiting on */
  int streams = checkAndCountListOfStreams(L, state, 2, curDev);

  if (streams < 1) {
    /* nothing to synchronize */
    return 0;
  }
  /* One-way dependency; streamWaiting will wait for the list of streams to
     wait on to complete execution of pending scheduled kernels/events */
  cudaEvent_t * events = (cudaEvent_t*)malloc(sizeof(cudaEvent_t) * streams);
  createSingleDeviceEvents(L, state, 2, curDev, events);
  /* Then, wait on them */
  for (int i = 0; i < streams; i++) {
    THCudaCheck(cudaStreamWaitEvent(streamWaiting, events[i], 0));
    THCudaCheck(cudaEventDestroy(events[i]));
  }
  free(events);
  return 0;
}
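The same one-way barrier can be written against the raw CUDA runtime; stream_wait_for below is a hypothetical helper, not part of cutorch, shown for a single producer stream:

#include <cuda_runtime.h>

// Make `waiter` wait for all work currently queued on `producer`,
// without blocking the host (record an event, then wait on it).
cudaError_t stream_wait_for(cudaStream_t waiter, cudaStream_t producer) {
  cudaEvent_t ev;
  cudaError_t err = cudaEventCreateWithFlags(&ev, cudaEventDisableTiming);
  if (err != cudaSuccess) return err;
  err = cudaEventRecord(ev, producer);            // marks the current tail of `producer`
  if (err == cudaSuccess)
    err = cudaStreamWaitEvent(waiter, ev, 0);     // `waiter` stalls only until that point
  cudaEventDestroy(ev);                           // safe: destruction is deferred until the event completes
  return err;
}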
Example no. 16
void gpu_print_properties(FILE* out){
  int device = -1;
  gpu_safe( cudaGetDevice(&device) );
  
  cudaDeviceProp prop;
  gpu_safe( cudaGetDeviceProperties(&prop, device) ); 
  
  int MiB = 1024 * 1024;
  int kiB = 1024;
  
  fprintf(out, "     Device number: %d\n", device);
  fprintf(out, "       Device name: %s\n", prop.name);
  fprintf(out, "     Global Memory: %d MiB\n", (int)(prop.totalGlobalMem/MiB));
  fprintf(out, "     Shared Memory: %d kiB/block\n", (int)(prop.sharedMemPerBlock/kiB));
  fprintf(out, "   Constant memory: %d kiB\n", (int)(prop.totalConstMem/kiB));
  fprintf(out, "         Registers: %d per block\n", (int)(prop.regsPerBlock/kiB));
  fprintf(out, "         Warp size: %d threads\n", (int)(prop.warpSize));
  //fprintf(out, "  Max memory pitch: %d bytes\n", (int)(prop.memPitch));
  fprintf(out, " Texture alignment: %d bytes\n", (int)(prop.textureAlignment));
  fprintf(out, " Max threads/block: %d\n", prop.maxThreadsPerBlock);
  fprintf(out, "    Max block size: %d x %d x %d threads\n", prop.maxThreadsDim[X], prop.maxThreadsDim[Y], prop.maxThreadsDim[Z]);
  fprintf(out, "     Max grid size: %d x %d x %d blocks\n", prop.maxGridSize[X], prop.maxGridSize[Y], prop.maxGridSize[Z]);
  fprintf(out, "Compute capability: %d.%d\n", prop.major, prop.minor);
  fprintf(out, "        Clock rate: %d MHz\n", prop.clockRate/1000);
  fprintf(out, "   Multiprocessors: %d\n", prop.multiProcessorCount);
  fprintf(out, "   Timeout enabled: %d\n", prop.kernelExecTimeoutEnabled);
  fprintf(out, "      Compute mode: %d\n", prop.computeMode);
  fprintf(out, "    Device overlap: %d\n", prop.deviceOverlap);
  fprintf(out, "Concurrent kernels: %d\n", prop.concurrentKernels);
  fprintf(out, "        Integrated: %d\n", prop.integrated);
  fprintf(out, "  Can map host mem: %d\n", prop.canMapHostMemory);
  
}
Example no. 17
/*
   Usage:
   cutorch.streamBarrier({stream1, stream2, ..., streamN})
   applies to streams for the current device. Creates a N-way barrier
   to synchronize all of the streams given
*/
static int cutorch_streamBarrier(lua_State *L)
{
  THCState *state = cutorch_getstate(L);

  int curDev = -1;
  THCudaCheck(cudaGetDevice(&curDev));

  int streams = checkAndCountListOfStreams(L, state, 1, curDev);

  if (streams < 2) {
    /* nothing to synchronize together */
    return 0;
  }
  /* Multi-way dependency (barrier); all streams must complete execution
     of pending scheduled kernels/events */
  cudaEvent_t * events = (cudaEvent_t*)malloc(sizeof(cudaEvent_t) * streams);
  /* First, create an event and record them for all streams */
  int eventsCreated =  createSingleDeviceEvents(L, state, 1, curDev, events);

  /* Then, wait on the event. Each stream is actually waiting on itself here
     too, but that's harmless and isn't worth weeding out. */
  waitSingleDeviceEvents(L, state, 1, curDev, events, eventsCreated);
  for (int i = 0; i < eventsCreated; i++)
    THCudaCheck(cudaEventDestroy(events[i]));

  free(events);
  return 0;
}
Example no. 18
/**
 * @brief Performs the Move-to-Front Transform
 *
 * Performs a parallel move-to-front transform on 1,048,576 elements.
 * The MTF uses a scan-based algorithm, described in our paper "Parallel
 * Lossless Data Compression on the GPU", to parallelize the computation.
 * (See the \ref references bibliography.)
 *
 * - Currently, the MTF can only be performed on 1,048,576 (uchar) elements.
 * - The transformed string is written to \a d_out.
 *
 * @param[in] planHandle Handle to plan for MTF
 * @param[out] d_out Output data
 * @param[in] d_in Input data
 * @param[in] numElements Number of elements
 * @returns CUDPPResult indicating success or error condition
 *
 * @see cudppPlan, CUDPPConfiguration, CUDPPAlgorithm
 */
CUDPP_DLL
CUDPPResult cudppMoveToFrontTransform(CUDPPHandle planHandle,
                                      unsigned char *d_in,
                                      unsigned char *d_out,
                                      size_t numElements)
{
    // first check: is this device >= 2.0? if not, return error
    int dev;
    cudaGetDevice(&dev);

    cudaDeviceProp devProps;
    cudaGetDeviceProperties(&devProps, dev);

    if((int)devProps.major < 2) {
        // Only supported on devices with compute
        // capability 2.0 or greater
        return CUDPP_ERROR_ILLEGAL_CONFIGURATION;
    }

    CUDPPMtfPlan * plan = 
        (CUDPPMtfPlan *) getPlanPtrFromHandle<CUDPPMtfPlan>(planHandle);
    
    if(plan != NULL)
    {
        if (plan->m_config.algorithm != CUDPP_MTF)
            return CUDPP_ERROR_INVALID_PLAN;
        if (plan->m_config.datatype != CUDPP_UCHAR)
            return CUDPP_ERROR_ILLEGAL_CONFIGURATION;

        cudppMtfDispatch(d_in, d_out, numElements, plan);
        return CUDPP_SUCCESS;
    }
    else
        return CUDPP_ERROR_INVALID_HANDLE;
}
Example no. 19
/**
 * Returns the compute capability of the selected GPU.
 * @return the compute capability in the integer format (210 means
 * version 2.1)
 */
int getDevCapability() {
    cudaDeviceProp devProp;
    int dev;
    cudaGetDevice(&dev);
    cutilSafeCall(cudaGetDeviceProperties(&devProp, dev));
    return devProp.major*100+devProp.minor*10;
}
Example no. 20
void P2PSync<Dtype>::on_gradients_ready(Timer* timer, ostringstream* timing) {
#ifndef CPU_ONLY
#ifdef DEBUG
  int device;
  CUDA_CHECK(cudaGetDevice(&device));
  CHECK(device == solver_->param().device_id());
#endif

  // Sum children gradients as they appear in the queue
  for (int i = 0; i < children_.size(); ++i) {
    timer->Start();
    P2PSync<Dtype> *child = queue_.pop();
    Dtype* src = child->parent_grads_;
    Dtype* dst = diff_;

#ifdef DEBUG
    bool ok = false;
    for (int j = 0; j < children_.size(); ++j) {
      if (child == children_[j]) {
        ok = true;
      }
    }
    CHECK(ok);
    cudaPointerAttributes attributes;
    CUDA_CHECK(cudaPointerGetAttributes(&attributes, src));
    CHECK(attributes.device == device);
    CUDA_CHECK(cudaPointerGetAttributes(&attributes, dst));
    CHECK(attributes.device == device);
#endif

    caffe_gpu_add(size_, src, dst, dst);
    *timing << " add_grad: " << timer->MilliSeconds();
  }

  // Send gradients to parent
  if (parent_) {
    timer->Start();
    Dtype* src = diff_;
    Dtype* dst = parent_grads_;

#ifdef DEBUG
    cudaPointerAttributes attributes;
    CUDA_CHECK(cudaPointerGetAttributes(&attributes, src));
    CHECK(attributes.device == device);
    CUDA_CHECK(cudaPointerGetAttributes(&attributes, dst));
    CHECK(attributes.device == parent_->solver_->param().device_id());
#endif

    CUDA_CHECK(cudaMemcpyAsync(dst, src, size_ * sizeof(Dtype),  //
        cudaMemcpyDeviceToDevice, cudaStreamDefault));
    CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault));
    parent_->queue_.push(this);
    *timing << " send_grad: " << timer->MilliSeconds();
  } else {
    // Loss functions divide gradients by the batch size, so to compensate
    // for split batch, the root solver divides by number of solvers.
    caffe_gpu_scal(size_, Dtype(1.0 / Caffe::solver_count()), diff_);
  }
#endif
}
Example no. 21
void checkDeviceMeetComputeSpec(int argc, char **argv)
{
    int device = 0;
    cudaGetDevice(&device);

    if (checkCUDAProfile(device, MIN_RUNTIME_VERSION, MIN_COMPUTE_VERSION))
    {
        fprintf(stderr,"\nCUDA Capable Device %d, meets minimum required specs.\n", device);
    }
    else
    {
        fprintf(stderr, "\nNo configuration with minimum compute capabilities found.  Exiting...\n");
        fprintf(stderr, "This sample requires:\n");
        fprintf(stderr, "\tCUDA Compute Capability >= %d.%d is required\n", MIN_COMPUTE_VERSION/16, MIN_COMPUTE_VERSION%16);
        fprintf(stderr, "\tCUDA Runtime Version    >= %d.%d is required\n", MIN_RUNTIME_VERSION/1000, (MIN_RUNTIME_VERSION%100)/10);

        // cudaDeviceReset causes the driver to clean up all state. While
        // not mandatory in normal operation, it is good practice.  It is also
        // needed to ensure correct operation when the application is being
        // profiled. Calling cudaDeviceReset causes all profile data to be
        // flushed before the application exits
        cudaDeviceReset();
        exit(EXIT_SUCCESS);
    }
}
Example no. 22
    int PTXInstrumentor::sendKernelProfile(mqd_t messageQueue) {
    
        int err = 0;
        
        _profile.pid = getpid();

        cudaGetDevice(&_profile.device);

        int len = kernelName.size() > MAX_KERNEL_NAME_SIZE - 1 
            ? MAX_KERNEL_NAME_SIZE - 1 : kernelName.size();
        
        std::strncpy(_profile.name, kernelName.data(), len);
        _profile.name[len] = '\0';
        
        do {
            err = mq_send(messageQueue, (char *)&_profile, sizeof(kernel_profile), MQ_DFT_PRIO);
            
            if(err == 0)
                break;
            else {
                /* if O_NONBLOCK is set and the message queue is full, EAGAIN is returned. In this
                case, we do nothing so that if the messages we send are not consumed by anyone, 
                we still function normally */
                if(errno == EAGAIN)
                    break;
            
            }
        } 
        /* keep sending message while a signal interrupted call */
        while(errno == EINTR);
        
        return err;
    }
Example no. 23
int THCState_getPeerToPeerAccess(THCState* state, int dev, int devToAccess)
{
  if (dev < 0 || dev >= state->numDevices) {
    THError("%d is not a device", dev);
  }
  if (devToAccess < 0 || devToAccess >= state->numDevices) {
    THError("%d is not a device", devToAccess);
  }
  if (state->p2pAccessEnabled[dev][devToAccess] == -1) {
    int prevDev = 0;
    THCudaCheck(cudaGetDevice(&prevDev));
    THCudaCheck(cudaSetDevice(dev));

    int access = 0;
    THCudaCheck(cudaDeviceCanAccessPeer(&access, dev, devToAccess));
    if (access) {
      cudaError_t err = cudaDeviceEnablePeerAccess(devToAccess, 0);
      if (err == cudaErrorPeerAccessAlreadyEnabled) {
        // ignore and clear the error if access was already enabled
        cudaGetLastError();
      } else {
        THCudaCheck(err);
      }
      state->p2pAccessEnabled[dev][devToAccess] = 1;
    } else {
      state->p2pAccessEnabled[dev][devToAccess] = 0;
    }

    THCudaCheck(cudaSetDevice(prevDev));
  }
  return state->p2pAccessEnabled[dev][devToAccess];
}
Example no. 24
void THCudaShutdown(THCState* state)
{
  THCRandom_shutdown(state);
  THCudaBlas_shutdown(state);
  free(state->blasState);
  free(state->rngState);
  free(state->deviceProperties);

  int prevDev = -1;
  THCudaCheck(cudaGetDevice(&prevDev));

  for (int dev = 0; dev < state->numDevices; ++dev) {
    THCudaCheck(cudaSetDevice(dev));

    /* Free Torch-defined streams (0 is the default stream) */
    for (int stream = 1; stream <= state->numUserStreams; ++stream) {
      THCudaCheck(cudaStreamDestroy(state->streamsPerDevice[dev][stream]));
    }

    free(state->streamsPerDevice[dev]);
  }

  free(state->streamsPerDevice);
  THCudaCheck(cudaSetDevice(prevDev));
}
Example no. 25
void THTensor_(copyAsyncCuda)(THCState *state, struct THTensor *self, struct THCTensor *src)
{
  THArgCheck(THTensor_(nElement)(self) == THCTensor_(nElement)(state, src), 2, "sizes do not match");
  THArgCheck(THTensor_(isContiguous)(self), 2, "Target tensor must be contiguous");
  THArgCheck(THCTensor_(isContiguous)(state, src), 3, "Source tensor must be contiguous");

  if (THTensor_(nElement)(self) == 0) return;

  // Perform the copy wrt the current stream on the CudaTensor's device.
  int tensorDevice = THCTensor_(getDevice)(state, src);
  int currentDevice;
  THCudaCheck(cudaGetDevice(&currentDevice));

  if (currentDevice != tensorDevice) {
    THCudaCheck(cudaSetDevice(tensorDevice));
  }

  THCudaCheck(cudaMemcpyAsync(THTensor_(data)(self),
                              THCTensor_(data)(state, src),
                              THCTensor_(nElement)(state, src) * sizeof(real),
                              cudaMemcpyDeviceToHost,
                              THCState_getDeviceStream(state, tensorDevice,
                                                       THCState_getCurrentStreamIndex(state))));

  if (currentDevice != tensorDevice) {
    THCudaCheck(cudaSetDevice(currentDevice));
  }
}
Example no. 26
void THCTensor_(copyAsyncCPU)(THCState *state, THCTensor *self, struct THTensor *src)
{
  THArgCheck(THCTensor_(nElement)(state, self) == THTensor_(nElement)(src), 2, "sizes do not match");
  THArgCheck(THCTensor_(isContiguous)(state, self), 2, "Target tensor must be contiguous");
  THArgCheck(THTensor_(isContiguous)(src), 3, "Source tensor must be contiguous");

  if (THCTensor_(nElement)(state, self) == 0) return;

  // Perform the copy wrt the current stream on the CudaTensor's device.
  int tensorDevice = THCTensor_(getDevice)(state, self);
  int currentDevice;
  THCudaCheck(cudaGetDevice(&currentDevice));

  if (currentDevice != tensorDevice) {
    THCudaCheck(cudaSetDevice(tensorDevice));
  }

  THCStream *stream  = THCState_getStream(state);
  THCudaCheck(cudaMemcpyAsync(THCTensor_(data)(state, self),
                              THTensor_(data)(src),
                              THTensor_(nElement)(src) * sizeof(real),
                              cudaMemcpyHostToDevice,
                              stream->stream));

  THCudaCheck(THCCachingHostAllocator_recordEvent(THStorage_(data)(src->storage), stream));

  if (currentDevice != tensorDevice) {
    THCudaCheck(cudaSetDevice(currentDevice));
  }
}
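Stripped of the THC bookkeeping (device switching, caching-host-allocator events), the async-copy discipline in the two copyAsync examples above reduces to pinned host memory plus cudaMemcpyAsync on a stream. A minimal sketch, not Torch code:

#include <cuda_runtime.h>
#include <cstdio>

int main() {
  const size_t n = 1 << 20;
  float *host = NULL, *dev = NULL;
  cudaStream_t stream;

  cudaMallocHost((void**)&host, n * sizeof(float));  // pinned memory makes the copy truly async
  cudaMalloc((void**)&dev, n * sizeof(float));
  cudaStreamCreate(&stream);

  for (size_t i = 0; i < n; ++i) host[i] = 1.0f;

  // Queue the transfer, leave the host free to do other work, sync only when the data is needed.
  cudaMemcpyAsync(dev, host, n * sizeof(float), cudaMemcpyHostToDevice, stream);
  cudaStreamSynchronize(stream);

  printf("copied %zu floats\n", n);
  cudaStreamDestroy(stream);
  cudaFree(dev);
  cudaFreeHost(host);
  return 0;
}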
Example no. 27
void THCState_setPeerToPeerAccess(THCState* state, int dev, int devToAccess,
                                  int enable)
{
  /* This will perform device bounds checking for us */
  int prevEnabled = THCState_getPeerToPeerAccess(state, dev, devToAccess);

  if (enable != prevEnabled) {
    /* If we're attempting to enable p2p access but p2p access isn't */
    /* supported, throw an error */
    if (enable) {
      int access = 0;
      THCudaCheck(cudaDeviceCanAccessPeer(&access, dev, devToAccess));

      if (!access) {
        THError("p2p access not supported for %d accessing %d",
                dev, devToAccess);
      }
    }

    state->p2pAccessEnabled[dev][devToAccess] = enable;

    int prevDev = 0;
    THCudaCheck(cudaGetDevice(&prevDev));
    THCudaCheck(cudaSetDevice(dev));

    /* This should be in sync with the current access state */
    if (enable) {
      THCudaCheck(cudaDeviceEnablePeerAccess(devToAccess, 0));
    } else {
      THCudaCheck(cudaDeviceDisablePeerAccess(devToAccess));
    }

    THCudaCheck(cudaSetDevice(prevDev));
  }
}
Example no. 28
int cuda_get_device()
{
    int n = 0;
    cudaError_t status = cudaGetDevice(&n);
    check_error(status);
    return n;
}
Example no. 29
TEST(HostAlloc, MappedPointer) {
    cudaError_t ret;
    int device;

    ret = cudaGetDevice(&device);
    ASSERT_EQ(cudaSuccess, ret);

    struct cudaDeviceProp prop;
    ret = cudaGetDeviceProperties(&prop, device);
    ASSERT_EQ(cudaSuccess, ret);

    void * ptr;
    ret = cudaHostAlloc(&ptr, 4, cudaHostAllocMapped);
    ASSERT_EQ(cudaSuccess, ret);

    /*
     * Try to retrieve the device pointer, expecting a result according to
     * prop.canMapHostMemory.
     */
    void * device_ptr;
    ret = cudaHostGetDevicePointer(&device_ptr, ptr, 0);
    if (prop.canMapHostMemory) {
        EXPECT_EQ(cudaSuccess, ret);
        EXPECT_FALSE(device_ptr == NULL);
    } else {
        EXPECT_EQ(cudaErrorMemoryAllocation, ret);
    }

    ret = cudaFreeHost(ptr);
    ASSERT_EQ(cudaSuccess, ret);
}
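What the mapped pointer in this test is actually for: a kernel can read and write the host allocation directly through it. A minimal sketch, assuming a device that reports canMapHostMemory (on older toolkits cudaSetDeviceFlags(cudaDeviceMapHost) must be called before the first allocation); this is hypothetical and not part of the test suite above:

#include <cuda_runtime.h>
#include <cstdio>

__global__ void touch(int* p) { *p = 42; }     // writes through the mapped device pointer

int main() {
  cudaSetDeviceFlags(cudaDeviceMapHost);       // needed on older setups, before any allocation
  int* host = NULL;
  if (cudaHostAlloc((void**)&host, sizeof(int), cudaHostAllocMapped) != cudaSuccess) return 1;

  int* dev = NULL;
  if (cudaHostGetDevicePointer((void**)&dev, host, 0) == cudaSuccess) {
    touch<<<1, 1>>>(dev);
    cudaDeviceSynchronize();                   // make the kernel's write visible to the host
    printf("host sees %d\n", *host);           // 42 on a device that can map host memory
  }
  cudaFreeHost(host);
  return 0;
}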
Example no. 30
        ~cuda_xpattern_data()
        {
            int current_id;
            cuda_assert( cudaGetDevice(&current_id) );
            if ( current_id != device_id ) cuda_assert( cudaSetDevice( device_id ) );

            if ( ar ) cuda_assert( cudaFree(ar) );
            if ( dim ) cuda_assert( cudaFree(dim) );
            if ( I_diff ) cuda_assert( cudaFree(I_diff) );
            if ( I_exp ) cuda_assert( cudaFree(I_exp) );
            if ( I_zigmoid ) cuda_assert( cudaFree(I_zigmoid) );
            if ( diag ) cuda_assert( cudaFree(diag) );
            if ( ug ) cuda_assert( cudaFree(ug) );
            if ( thickness_array ) cuda_assert( cudaFree( thickness_array ) );
            if ( cache ) cuda_assert( cudaFree(cache) );

            ar = 0;
            dim = 0;
            I_diff = 0;
            I_exp = 0;
            I_zigmoid = 0;
            diag = 0;
            ug = 0;
            thickness_array = 0;
            cache = 0;
        }