Пример #1
0
int cuda_init_memopt(void) 
{
	int num_devices = cuda_devices();
	int device;
	int max_device = 0;

	if (num_devices > 1) {

		size_t mem_max = 0;
		size_t mem_free;
		size_t mem_total;

		for (device = 0; device < num_devices; device++) {

			cuda_init(device);
			CUDA_ERROR(cudaMemGetInfo(&mem_free, &mem_total));
			//printf(" device (%d): %d\n", device, mem_available);

			if (mem_max < mem_free) {

				mem_max = mem_free;
				max_device = device;
			}
		}
		//printf(" max device: %d\n", max_device);
		CUDA_ERROR(cudaSetDevice(max_device));
		// FIXME: we should set last_init
	}

	return max_device;
}
Пример #2
0
    /**
     * Returns information about device memory.
     *
     * @param free amount of free memory in bytes. can be NULL
     * @param total total amount of memory in bytes. can be NULL. (NULL by default)
     */
    void getMemoryInfo(size_t *free, size_t *total = NULL)
    {
        size_t freeInternal = 0;
        size_t totalInternal = 0;

        CUDA_CHECK(cudaMemGetInfo(&freeInternal, &totalInternal));

        if (free != NULL)
        {
            if (reservedMem > freeInternal)
                freeInternal = 0;
            else
                freeInternal -= reservedMem;

            *free = freeInternal;
        }
        if (total != NULL)
        {
            if (reservedMem > totalInternal)
                totalInternal = 0;
            else
                totalInternal -= reservedMem;

            *total = totalInternal;
        }
    }
Пример #3
0
cudaError_t THCudaMemGetInfoCached(THCState *state,  size_t* freeBytes, size_t* totalBytes, size_t* largestBlock)
{
  size_t cachedBytes = 0;
  THCDeviceAllocator* allocator = state->cudaDeviceAllocator;

  *largestBlock = 0;
  /* get info from CUDA first */
  cudaError_t ret = cudaMemGetInfo(freeBytes, totalBytes);
  if (ret!= cudaSuccess)
    return ret;

  int device;
  ret = cudaGetDevice(&device);
  if (ret!= cudaSuccess)
    return ret;

  /* not always true - our optimistic guess here */
  *largestBlock = *freeBytes;

  if (allocator->cacheInfo != NULL)
    allocator->cacheInfo(allocator->state, device, &cachedBytes, largestBlock);

  /* Adjust resulting free bytes number. largesBlock unused for now */
  *freeBytes += cachedBytes;
  return cudaSuccess;
}
Пример #4
0
// return free memory in megabytes
int cuda_available_memory(int thr_id)
{
	int dev_id = device_map[thr_id % MAX_GPUS];
	size_t mtotal, mfree = 0;
	cudaSetDevice(dev_id);
	cudaMemGetInfo(&mfree, &mtotal);
	return (int) (mfree / (1024 * 1024));
}
Пример #5
0
void getDeviceMemoryInfoInMb(int device, size_t *total, size_t *free) {
  static const int bytesInMb = 1024 * 1024;
  size_t freeInBytes;
  size_t totalInBytes;
  CHECK_ERR(cudaGetDevice(&device));
  CHECK_ERR(cudaMemGetInfo(&freeInBytes, &totalInBytes));
  *total = totalInBytes / bytesInMb;
  *free = freeInBytes / bytesInMb;
}
Пример #6
0
/** \brief Debug short device memory information (free/total) to stream if DEBUG
 *flag is set to true.
 *
 * @param force always print output
 * @param stream output stream
 */
inline void showMemoryInfo(bool force, FILE *stream)
{
  size_t free_mem = 0;
  size_t total_mem = 0;
  cudaMemGetInfo(&free_mem, &total_mem);
  if (DEBUG || force)
    fprintf(stream, "memory usage, free: %lu total: %lu\n", free_mem,
            total_mem);
}
Пример #7
0
static int cutorch_getMemoryUsage(lua_State *L) {
  size_t freeBytes = 0;
  size_t totalBytes = 0;
  int curDevice;
  THCudaCheck(cudaGetDevice(&curDevice));

  int device = luaL_optint(L, 1, -10);
  if (device == -10) { /* no argument passed, current device mem usage */
    THCudaCheck(cudaMemGetInfo(&freeBytes, &totalBytes));
  } else { /* argument was given, particular device's memory usage */
    THCudaCheck(cudaSetDevice(device-1)); /* zero indexed */
    THCudaCheck(cudaMemGetInfo(&freeBytes, &totalBytes));
    THCudaCheck(cudaSetDevice(curDevice));
  }
  lua_pushnumber(L, freeBytes);
  lua_pushnumber(L, totalBytes);
  return 2;
}
Пример #8
0
void oskar_device_mem_info(size_t* mem_free, size_t* mem_total)
{
    if (!mem_free || !mem_total) return;
#ifdef OSKAR_HAVE_CUDA
    cudaMemGetInfo(mem_free, mem_total);
#else
    (void) mem_free;
    (void) mem_total;
#endif
}
Пример #9
0
static KMCUDAResult print_memory_stats() {
  size_t free_bytes, total_bytes;
  if (cudaMemGetInfo(&free_bytes, &total_bytes) != cudaSuccess) {
    return kmcudaRuntimeError;
  }
  printf("GPU memory: used %zu bytes (%.1f%%), free %zu bytes, total %zu bytes\n",
         total_bytes - free_bytes, (total_bytes - free_bytes) * 100.0 / total_bytes,
         free_bytes, total_bytes);
  return kmcudaSuccess;
}
Пример #10
0
inline void check_device_memory( const char* filename, const int line_number)
{
#ifdef CUDA_DEBUG
   size_t avail;
   size_t total;
   cudaMemGetInfo( &avail, &total);
   size_t used = total - avail;
   printf( "CUDA device memory usage at %s:%i: Used: %f Mb, Free %f Mb\n", filename, line_number, float(used)/(1024*1024), float(avail)/(1024*1024));
#endif
}
Пример #11
0
AllocPtr CudaDevice::CreateDefaultAlloc() {
	// Create the allocator. Use a bucket allocator with a capacity limit at
	// 80% of free mem.
	intrusive_ptr<CudaAllocBuckets> alloc(new CudaAllocBuckets(this));
	size_t freeMem, totalMem;

	cudaMemGetInfo(&freeMem, &totalMem);
	alloc->SetCapacity((size_t)(.80 * freeMem));
	
	return AllocPtr(alloc.get());
}
Пример #12
0
		std::ostream& operator<< (std::ostream& out, const cuda_running_configuration& running_configuration)
		{
			out << "--- CUDA versions ---" << std::endl;
			out << "Driver version = " << running_configuration.driver_version / 1000 << "." << (running_configuration.driver_version % 100) / 10 << std::endl;
			out << "Runtime version = " << running_configuration.runtime_version / 1000 << "." << (running_configuration.runtime_version % 100) / 10 << std::endl;

			out << "--- Device ---" << std::endl;

			out << "Device Id = " << running_configuration.device_id << std::endl;
			out << "Device name = " << running_configuration.device_name << std::endl;
			out << "Compute capability = " << running_configuration.compute_capability_major << "." << running_configuration.compute_capability_minor << std::endl;
			out << "Clock rate = " << (running_configuration.clock_rate / 1000) << " MHz" << std::endl;
			out << "Memory clock rate = " << (running_configuration.memory_clock_rate / 1000) << " MHz" << std::endl;
			out << "Memory bus width = " << running_configuration.memory_bus_width << " bits" << std::endl;
			out << "Global memory size = " << running_configuration.global_memory_size / (1024 * 1024) << " MB" << std::endl;
			out << "ECC support = " << (running_configuration.ecc_enabled ? "Enabled" : "Disabled") << std::endl;
			out << "L2 cache size = " << running_configuration.l2_cache_size << " bytes" << std::endl;
			out << "Multiprocessor count = " << running_configuration.multiprocessor_count << std::endl;
			out << "Shared memory per block size = " << running_configuration.smem_per_block << " bytes" << std::endl;
			out << "Maximum number of threads per multiprocessor = " << running_configuration.max_threads_per_multiprocessor << std::endl;
			out << "Maximum number of threads per block = " << running_configuration.max_threads_per_block << std::endl;
			out << "Maximum sizes of each dimension of a block = "
				<< running_configuration.max_threads_dim[0] << " x "
				<< running_configuration.max_threads_dim[1] << " x "
				<< running_configuration.max_threads_dim[2] << std::endl;
			out << "Maximum sizes of each dimension of a grid = "
				<< running_configuration.max_grid_size[0] << " x "
				<< running_configuration.max_grid_size[1] << " x "
				<< running_configuration.max_grid_size[2] << std::endl;
			out << "Maximum size of 1D texture bound to linear memory = " << running_configuration.max_texture_1d_linear << std::endl;
			out << "Texture alignment = " << running_configuration.texture_alignment << " bytes" << std::endl;
			out << "PCI Bus ID = " << running_configuration.pci_bus_id << std::endl;
			out << "PCI Location ID = " << running_configuration.pci_device_id << std::endl;
			#ifdef WIN32
				out << "Driver mode = " << (running_configuration.tcc_mode ? "TCC" : "WDDM") << std::endl;
			#endif

			out << "--- Settings ---" << std::endl;

			out << "Max global memory usage ratio = " << running_configuration.max_global_memory_usage_ratio << std::endl;

			out << "--- Status ---" << std::endl;

			size_t free_memory;
			size_t total_memory;
			cuda_safe_call(cudaMemGetInfo(&free_memory, &total_memory));

			out << "Free memory = " << free_memory / (1024 * 1024) << " MB" << std::endl;
			out << "Total memory = " << total_memory / (1024 * 1024) << " MB" << std::endl;

			return out;
		}
Пример #13
0
inline std::string
cudaMemoryInfoText()
{
    size_t free;
    size_t total;
    CUGIP_CHECK_RESULT(cudaMemGetInfo( &free, &total));

    return boost::str( boost::format("Free GPU memory: %1% MB; Total GPU memory %2% MB; Occupied %3%%%")
                       % (float(free) / (1024*1024))
                       % (float(total) / (1024*1024))
                       % (100.0f * float(total - free)/total)
                     );
}
Пример #14
0
int getMemUsage(const int &myRank)
{
    size_t free_byte,
           total_byte;

    CHECK(cudaMemGetInfo(&free_byte, &total_byte));

    std::cout << "myRank: " << myRank << " "
              << free_byte / 1024.0 / 1024.0 
              << " / " << total_byte / 1024.0 / 1024.0 << std::endl;

    return 0;
}
Пример #15
0
void memoryInfo(void)
{
	size_t free;
	size_t total;
	
	cudaCheck(cudaMemGetInfo (&free,&total),"MemInfo11");
	
	printf("\n");
	printf("\nRANK=%d\n",RANK);
	printf("\nGPU total memory = % .2f MB\n",(float)total/1e6);
	printf("\nGPU free  memory = % .2f MB\n",(float)free/1e6);

}
Пример #16
0
void AboutDialog::
        showEvent(QShowEvent *)
{
    ui->labelVersion->setText( QString::fromStdString( Sawe::Configuration::version_string() ) );
    ui->labelTimestamp->setText( QString("Built on %1 at %2 from revision %3.")
                                 .arg(Sawe::Configuration::build_date().c_str())
                                 .arg(Sawe::Configuration::build_time().c_str())
                                 .arg(Sawe::Configuration::revision().c_str()) );
    ui->labelLicense->setText( Sawe::Reader::reader_text().c_str() );
    if (Sawe::Reader::reader_title() == Sawe::Reader::reader_text() )
        ui->labelLicense->clear();

    int cores = Sawe::Configuration::cpuCores();

#ifdef USE_CUDA
    size_t free=0, total=0;
    cudaMemGetInfo(&free, &total);
    cudaDeviceProp prop = CudaProperties::getCudaDeviceProp();

    ui->labelSystem->setText(QString(
            "Using CPU with %13 core%14.\n"
            "Using GPU (%1 of %2) %3.\n"
            "%4 free of %5 total graphics memory.\n"
            "Gpu Gflops: %6\n"
            "Gpu memory speed: %7/s (estimated)\n"
            "Cpu memory speed: %8/s (estimated)\n"
            "Cuda compute capability: %9.%10\n"
            "Cuda driver version: %11\n"
            "Cuda runtime version: %12\n")
                             .arg(1+CudaProperties::getCudaCurrentDevice())
                             .arg(CudaProperties::getCudaDeviceCount())
                             .arg(prop.name)
                             .arg(DataStorageVoid::getMemorySizeText( free, 4 ).c_str())
                             .arg(DataStorageVoid::getMemorySizeText( total, 4 ).c_str())
                             .arg(CudaProperties::flops(prop)*1e-9, 0, 'f', 0)
                             .arg(DataStorageVoid::getMemorySizeText( CudaProperties::gpu_memory_speed(), 1, 'f' ).c_str())
                             .arg(DataStorageVoid::getMemorySizeText( CpuProperties::cpu_memory_speed(), 1, 'f' ).c_str())
                             .arg(prop.major).arg(prop.minor)
                             .arg(CudaProperties::getCudaDriverVersion())
                             .arg(CudaProperties::getCudaRuntimeVersion())
                             .arg(cores).arg(cores==1?"":"s")
                             );
#else
    ui->labelSystem->setText(QString(
            "Using CPU with %2 core%3.\n"
            "Cpu memory speed: %1 GB/s (estimated)\n")
                             .arg(CpuProperties::cpu_memory_speed()*1e-9, 0, 'f', 1)
                             .arg(cores).arg(cores==1?"":"s")
                             );
#endif
}
Пример #17
0
__declspec(dllexport) long __stdcall GetTotalMemory(int device)
{
    int currentdevice = 0;
    cudaGetDevice(&currentdevice);

    cudaSetDevice(device);

    size_t freemem = 0, totalmem = 0;
    cudaMemGetInfo(&freemem, &totalmem);

    cudaSetDevice(currentdevice);

    return (long)(totalmem >> 20);
}
Пример #18
0
/**
 * \brief Creates and initializes the working data for the plan
 * \param [in] plan The data and memory location for the plan.
 * \return int Error flag value
 * \sa parseCUDAMEMPlan 
 * \sa makeCUDAMEMPlan
 * \sa execCUDAMEMPlan
 * \sa perfCUDAMEMPlan
 * \sa killCUDAMEMPlan
*/
int   initCUDAMEMPlan(void *plan) {
	size_t avail, total, arraybytes;
	int M,i;
	int ret = make_error(ALLOC,generic_err);
	double gputhreads;
	cudaError_t cudaStat;
	struct cudaDeviceProp prop;
	Plan *p;
	CUDAMEMdata *d = NULL;
	p = (Plan *)plan;
	if (p) {
		d = (CUDAMEMdata*)p->vptr;
		p->exec_count = 0;
		perftimer_init(&p->timers, NUM_TIMERS);
	}
	if(d) {
		CUDA_CALL( cudaSetDevice(d->device) );
		CUDA_CALL( cudaMemGetInfo(&avail, &total) );
		CUDA_CALL( cudaGetDeviceProperties(&prop, d->device) );
		if (d->nGpuThreads != 0) {	// use the user spec'd number of threads or default to warp*cores
			gputhreads = (double)(d->nGpuThreads);
		} else {
			gputhreads = d->nGpuThreads = prop.warpSize * prop.multiProcessorCount;
		}
		if (prop.major < 2) {	// check results on older devices
			d->resultCheck = 1;
		} else {
			d->resultCheck = 0;
		}
		// calculate M for 6 M*M arrays to fill 100%/75%/50% of GPU free memory 
		// M = (d->nGpuThreads) * (int)(sqrt(0.75*avail/(6.0*sizeof(double)*gputhreads*gputhreads)));
		// M = (d->nGpuThreads) * (int)(sqrt(0.50*avail/(6.0*sizeof(double)*gputhreads*gputhreads)));
		M = (d->nGpuThreads) * (int)(sqrt(1.00*avail/(6.0*sizeof(double)*gputhreads*gputhreads)));
		// assume one will fit in host memory
		d->M = M;
		arraybytes = (size_t)(0.99*avail);
		d->arraybytes = arraybytes;
                d->arrayelems = arraybytes / sizeof(int);
		// host array and device arrays
		CUDA_CALL( cudaMallocHost((void **)(&(d->hostarray)), arraybytes) );
		CUDA_CALL( cudaMalloc    ((void **)(&(d->devicearray)), arraybytes) );
		// initialize so that results are M*PI**2/100
		//for(i=0; i<3*M*M; i++) d->HA[i] = (double)0.31415926535;
		//CUDA_CALL( cudaMemcpy( (d->DA), (d->HA), arraybytes, cudaMemcpyHostToDevice) );
		//CUDA_CALL( cudaMemcpy( (d->DB), (d->DA), arraybytes, cudaMemcpyDeviceToDevice) );
		ret = ERR_CLEAN;
	}
	return ret;
}
Пример #19
0
static int cutorch_getDeviceProperties(lua_State *L)
{
  int device = (int)luaL_checknumber(L, 1)-1;

  // switch context to given device so the call to cudaMemGetInfo is for the correct device
  int oldDevice;
  THCudaCheck(cudaGetDevice(&oldDevice));
  THCudaCheck(cudaSetDevice(device));

  struct cudaDeviceProp prop;
  THCudaCheck(cudaGetDeviceProperties(&prop, device));
  lua_newtable(L);
  SET_DEVN_PROP(canMapHostMemory);
  SET_DEVN_PROP(clockRate);
  SET_DEVN_PROP(computeMode);
  SET_DEVN_PROP(deviceOverlap);
  SET_DEVN_PROP(integrated);
  SET_DEVN_PROP(kernelExecTimeoutEnabled);
  SET_DEVN_PROP(major);
  SET_DEVN_PROP(maxThreadsPerBlock);
  SET_DEVN_PROP(memPitch);
  SET_DEVN_PROP(minor);
  SET_DEVN_PROP(multiProcessorCount);
  SET_DEVN_PROP(regsPerBlock);
  SET_DEVN_PROP(sharedMemPerBlock);
  SET_DEVN_PROP(textureAlignment);
  SET_DEVN_PROP(totalConstMem);
  SET_DEVN_PROP(totalGlobalMem);
  SET_DEVN_PROP(warpSize);
  SET_DEVN_PROP(pciBusID);
  SET_DEVN_PROP(pciDeviceID);
  SET_DEVN_PROP(pciDomainID);
  SET_DEVN_PROP(maxTexture1D);
  SET_DEVN_PROP(maxTexture1DLinear);

  size_t freeMem;
  THCudaCheck(cudaMemGetInfo (&freeMem, NULL));
  lua_pushnumber(L, freeMem);
  lua_setfield(L, -2, "freeGlobalMem");

  lua_pushstring(L, prop.name);
  lua_setfield(L, -2, "name");

  // restore context
  THCudaCheck(cudaSetDevice(oldDevice));

  return 1;
}
Пример #20
0
int main(int argc, char **argv)
{
  cudaError_t err = cudaSuccess;
  int deviceCount = 0;
  size_t totalDevMem, freeDevMem;
  size_t lastLineLength = 0; // MUST be initialized to zero

  signal(SIGTERM, signalHandler);
  signal(SIGQUIT, signalHandler);
  signal(SIGINT, signalHandler);
  signal(SIGHUP, signalHandler);

  writeLine(lastLineLength, "Preparing...");

  err = cudaGetDeviceCount(&deviceCount);

  if (err != cudaSuccess) {
   std::cerr << "ERROR: " << cudaGetErrorString(err) << std::endl; 
  }

  while (err == cudaSuccess && gRun) {
    
    std::ostringstream stream;

    for (int i=0; i < deviceCount; ++i) {
      if (err == cudaSuccess) {
	err = cudaSetDevice(i);
	if (err == cudaSuccess) {
	  cudaMemGetInfo(&freeDevMem, &totalDevMem);
	  if (i != 0)
	    stream << " : ";
	  stream << "Dev " << i << " (" << (freeDevMem/1024) << " KB of " << (totalDevMem/1048576) << " MB free)";
	}
      }
    }
    if (err == cudaSuccess) {
      writeLine(lastLineLength, stream.str());
    }
    
    sleep(5); // TODO - make the cycle time an optional command line flag...
  }

  cudaThreadExit();

  std::cout << std::endl;

  return 0;
}
Пример #21
0
void printMemInfo(const char *header)
{
  int bytesInMb = 1024 * 1024;
  int device;
  size_t free;
  size_t total;
  CHECK_ERR(cudaGetDevice(&device));
  CHECK_ERR(cudaMemGetInfo(&free, &total));

  long freeMb = free / bytesInMb;
  long usedMb = (total - free) / bytesInMb;
  long totalMb = total / bytesInMb;

  printf("--%-50s GPU [%d] Mem Used: %-6ld MB. Free: %-6ld MB. Total: %-6ld MB\n", header, device, usedMb, freeMb,
      totalMb);
}
Пример #22
0
        // Returns the maximum theoretically possible value of n, for which the
        // call allocate(n, 0) could succeed. In most implementations, this
        // returns std::numeric_limits<size_type>::max() / sizeof(value_type).
        size_type max_size() const noexcept
        {
            detail::scoped_active_target active(target_);
            std::size_t free = 0;
            std::size_t total = 0;
            cudaError_t error = cudaMemGetInfo(&free, &total);
            if (error != cudaSuccess)
            {
                HPX_THROW_EXCEPTION(kernel_error,
                    "cuda::allocator<T>::max_size()",
                    std::string("cudaMemGetInfo failed: ") +
                        cudaGetErrorString(error));
            }

            return total / sizeof(value_type);
        }
Пример #23
0
// Print free and used memory on GPU.
void print_cuda_meminfo(void)
{
    size_t byte_tot;
    size_t byte_free;
    cudaError_t cuda_status = cudaMemGetInfo(&byte_free, &byte_tot);

    if (cuda_status != cudaSuccess)
	    error("ERROR: cudaMemGetInfo failed. %s\n", cudaGetErrorString(cuda_status));


    double dbyte_tot = (double)byte_tot;
    double dbyte_free = (double)byte_free;
    double dbyte_used = dbyte_tot - dbyte_free;

    debug_printf(DP_INFO , "GPU memory usage: used = %.4f MiB, free = %.4f MiB, total = %.4f MiB\n", dbyte_used/MiBYTE, dbyte_free/MiBYTE, dbyte_tot/MiBYTE);
}
Пример #24
0
std::string CudaDevice::DeviceString() const {
	size_t freeMem, totalMem;
	cudaMemGetInfo(&freeMem, &totalMem);
	double memBandwidth = (_prop.memoryClockRate * 1000.0) *
		(_prop.memoryBusWidth / 8 * 2) / 1.0e9;

	std::string s = stringprintf(
		"%s : %8.3lf Mhz   (Ordinal %d)\n"
		"%d SMs enabled. Compute Capability sm_%d%d\n"
		"FreeMem: %6dMB   TotalMem: %6dMB.\n"
		"Mem Clock: %8.3lf Mhz x %d bits   (%1.3lf GB/s)\n"
		"ECC %s\n\n",
		_prop.name, _prop.clockRate / 1000.0, _ordinal,
		_prop.multiProcessorCount, _prop.major, _prop.minor,
		(int)(freeMem / (1<< 20)), (int)(totalMem / (1<< 20)),
		_prop.memoryClockRate / 1000.0, _prop.memoryBusWidth, memBandwidth,
		_prop.ECCEnabled ? "Enabled" : "Disabled");
	return s;
}
Пример #25
0
SEXP
R_auto_cudaMemGetInfo()
{
    SEXP r_ans = R_NilValue;
    size_t free;
    size_t total;
    cudaError_t ans;
    ans = cudaMemGetInfo(& free, & total);
    if(ans)
       return(R_cudaError_t_Info(ans));
    PROTECT(r_ans = NEW_LIST(2));
    SEXP r_names;
    PROTECT(r_names = NEW_CHARACTER(2));
    SET_VECTOR_ELT(r_ans, 0, ScalarReal(free));
    SET_VECTOR_ELT(r_ans, 1, ScalarReal(total));
    SET_STRING_ELT(r_names, 0, mkChar("free"));
    SET_STRING_ELT(r_names, 1, mkChar("total"));
    SET_NAMES(r_ans, r_names);
    UNPROTECT(2);
    return(r_ans);
}
Пример #26
0
void showCudaUsage() {
size_t free_byte;
size_t total_byte;
cudaError_t cuda_status = cudaMemGetInfo( &free_byte, &total_byte ) ;
if ( cudaSuccess != cuda_status ){
fprintf(stderr, "Error: cudaMemGetInfo fails, %s \n", cudaGetErrorString(cuda_status) );
exit(1);
}
double free_db = (double)free_byte ;
double total_db = (double)total_byte ;
double used_db = total_db - free_db ;
//printf("GPU memory usage: used = %f, free = %f MB, total = %f MB\n",
// used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0);
std::cerr << "GPU memory usage: used = "
<< used_db/1024.0/1024.0
<< ", free = "
<< free_db/1024.0/1024.0
<< " MB, total = "
<< total_db/1024.0/1024.0
<< " MB\n";
}
Пример #27
0
void oskar_device_get_info_cuda(oskar_Device* device)
{
#ifdef OSKAR_HAVE_CUDA
    struct cudaDeviceProp prop;
    cudaDriverGetVersion(&device->cuda_driver_version);
    cudaRuntimeGetVersion(&device->cuda_runtime_version);
    cudaGetDeviceProperties(&prop, device->index);
    device->name = (char*) realloc(device->name, 1 + strlen(prop.name));
    device->vendor = (char*) realloc(device->vendor, 1 + strlen("NVIDIA"));
    strcpy(device->name, prop.name);
    strcpy(device->vendor, "NVIDIA");
    device->is_nv = 1;
    device->platform_type = 'C';
    device->device_type = 'G';
    device->compute_capability[0] = prop.major;
    device->compute_capability[1] = prop.minor;
    device->supports_double = 0;
    if (prop.major >= 2 || prop.minor >= 3)
        device->supports_double = 1;
    device->supports_atomic32 = 1;
    device->supports_atomic64 = 1;
    device->global_mem_cache_size = (size_t) prop.l2CacheSize;
    device->local_mem_size = prop.sharedMemPerBlock;
    device->max_work_group_size = (size_t) prop.maxThreadsPerBlock;
    device->max_local_size[0] = prop.maxThreadsDim[0];
    device->max_local_size[1] = prop.maxThreadsDim[1];
    device->max_local_size[2] = prop.maxThreadsDim[2];
    device->max_compute_units = prop.multiProcessorCount;
    device->max_clock_freq_kHz = prop.clockRate;
    device->memory_clock_freq_kHz = prop.memoryClockRate;
    device->memory_bus_width = prop.memoryBusWidth;
    device->num_registers = (unsigned int) prop.regsPerBlock;
    device->warp_size = prop.warpSize;
    cudaMemGetInfo(&device->global_mem_free_size, &device->global_mem_size);
#endif
    device->num_cores = device->max_compute_units * oskar_get_num_cuda_cores(
            device->compute_capability[0], device->compute_capability[1]);
    device->init = 1;
}
Пример #28
0
void cudaUtils::gpu_memory_usage()
{
  std::ios_base::fmtflags old_flags = std::cout.flags();
  std::streamsize old_precision = std::cout.precision();
  
  std::cout.precision(2);
  
  int device_index = -1;
  checkCudaErrors(cudaGetDevice(&device_index));
  
  size_t free_byte = 0;
  size_t total_byte = 0;
  checkCudaErrors(cudaMemGetInfo(&free_byte, &total_byte));
  
  std::cout << " Device: " << device_index
	    << " GPU memory usage:" 
	    << " used = " << std::fixed << (total_byte-free_byte)/1024.0/1024.0 << "MB,"
	    << " free = " << free_byte/1024.0/1024.0 << "MB,"
	    << " total = " << total_byte/1024.0/1024.0 << "MB" << std::endl;

  std::cout.flags(old_flags);
  std::cout.precision(old_precision);
}
Пример #29
0
void calcula_parametros_execucao(parametros_exec *param_exec, int NP, int NT){

	int npos_por_ciclo, total_ciclos;
	size_t tam_por_ciclo;

	total_ciclos=1;
	npos_por_ciclo=NP;

	cudaMemGetInfo(&param_exec->mem_free, &param_exec->mem_total);

	//FOI DECREMENTADO UMA QUANTIA DO VALOR TOTAL DE MEMORIA LIVRE
	//PARA QUE NAO SEJA USADO 100% DA MEMORIA DISPONIVEL, POR
	//UMA QUESTAO DE ESTABILIDADE. ISSO PODE CAUSAR TRAVAMENTOS.
	//ESSE VALOR PODE SER ALTERADO.
	tam_por_ciclo=(npos_por_ciclo*NT)*sizeof(float);
	while ( tam_por_ciclo > (param_exec->mem_free-50) ){
		total_ciclos=total_ciclos+1;
		npos_por_ciclo=ceil(NP/total_ciclos);
		tam_por_ciclo=(npos_por_ciclo*NT)*sizeof(float);
	}

	param_exec->npos_por_ciclo=npos_por_ciclo;
	param_exec->total_ciclos=total_ciclos;
	param_exec->tam_por_ciclo=tam_por_ciclo;
	param_exec->threads_por_bloco=THREADS_POR_BLOCO;
	param_exec->blocos_por_grid=(npos_por_ciclo+THREADS_POR_BLOCO-1)/THREADS_POR_BLOCO;

	//CALCULA O NUMERO TOTAL DE POSICOES QUE SERAO CALCULADAS
	//ESSE NUMERO PROVAVELMENTE SERA MAIOR QUE O NP,
	//POR ISSO NAO SE PODE ALOCAR EXATAMENTE O TAMANHO DA SAIDA ESPERADO.
	//TEM QUE ALOCAR ESSA MEMORIA A MAIS PARA GARANTIR
	//QUE NO ULTIMO CICLO NAO SEJA COPIADO DADOS ALEM DO QUE FOI ALOCADO
	param_exec->total_npos=param_exec->npos_por_ciclo*param_exec->total_ciclos;

	return;
}
Пример #30
0
/**
    Purpose
    -------
    CGEQRF_OOC computes a QR factorization of a COMPLEX M-by-N matrix A:
    A = Q * R. This version does not require work space on the GPU
    passed as input. GPU memory is allocated in the routine.
    This is an out-of-core (ooc) version that is similar to magma_cgeqrf but
    the difference is that this version can use a GPU even if the matrix
    does not fit into the GPU memory at once.

    Arguments
    ---------
    @param[in]
    m       INTEGER
            The number of rows of the matrix A.  M >= 0.

    @param[in]
    n       INTEGER
            The number of columns of the matrix A.  N >= 0.

    @param[in,out]
    A       COMPLEX array, dimension (LDA,N)
            On entry, the M-by-N matrix A.
            On exit, the elements on and above the diagonal of the array
            contain the min(M,N)-by-N upper trapezoidal matrix R (R is
            upper triangular if m >= n); the elements below the diagonal,
            with the array TAU, represent the orthogonal matrix Q as a
            product of min(m,n) elementary reflectors (see Further
            Details).
    \n
            Higher performance is achieved if A is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    @param[in]
    lda     INTEGER
            The leading dimension of the array A.  LDA >= max(1,M).

    @param[out]
    tau     COMPLEX array, dimension (min(M,N))
            The scalar factors of the elementary reflectors (see Further
            Details).

    @param[out]
    work    (workspace) COMPLEX array, dimension (MAX(1,LWORK))
            On exit, if INFO = 0, WORK[0] returns the optimal LWORK.
    \n
            Higher performance is achieved if WORK is in pinned memory, e.g.
            allocated using magma_malloc_pinned.

    @param[in]
    lwork   INTEGER
            The dimension of the array WORK.  LWORK >= N*NB,
            where NB can be obtained through magma_get_cgeqrf_nb( M, N ).
    \n
            If LWORK = -1, then a workspace query is assumed; the routine
            only calculates the optimal size of the WORK array, returns
            this value as the first entry of the WORK array, and no error
            message related to LWORK is issued.

    @param[out]
    info    INTEGER
      -     = 0:  successful exit
      -     < 0:  if INFO = -i, the i-th argument had an illegal value
                  or another error occured, such as memory allocation failed.

    Further Details
    ---------------
    The matrix Q is represented as a product of elementary reflectors

        Q = H(1) H(2) . . . H(k), where k = min(m,n).

    Each H(i) has the form

        H(i) = I - tau * v * v'

    where tau is a complex scalar, and v is a complex vector with
    v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
    and tau in TAU(i).

    @ingroup magma_cgeqrf_comp
    ********************************************************************/
extern "C" magma_int_t
magma_cgeqrf_ooc(
    magma_int_t m, magma_int_t n,
    magmaFloatComplex *A,    magma_int_t lda, magmaFloatComplex *tau,
    magmaFloatComplex *work, magma_int_t lwork,
    magma_int_t *info )
{
    #define  A(i_,j_) ( A + (i_) + (j_)*lda )
    #define dA(i_,j_) (dA + (i_) + (j_)*ldda)

    /* Constants */
    const magmaFloatComplex c_one = MAGMA_C_ONE;
    
    /* Local variables */
    magmaFloatComplex_ptr dA, dwork;
    magma_int_t i, ib, IB, j, min_mn, lddwork, ldda, rows;

    magma_int_t nb = magma_get_cgeqrf_nb( m, n );

    magma_int_t lwkopt = n * nb;
    work[0] = magma_cmake_lwork( lwkopt );
    bool lquery = (lwork == -1);
    *info = 0;
    if (m < 0) {
        *info = -1;
    } else if (n < 0) {
        *info = -2;
    } else if (lda < max(1,m)) {
        *info = -4;
    } else if (lwork < max(1,n) && ! lquery) {
        *info = -7;
    }
    if (*info != 0) {
        magma_xerbla( __func__, -(*info) );
        return *info;
    }
    else if (lquery) {
        return *info;
    }

    /* Check how much memory do we have */
    size_t freeMem, totalMem;
    cudaMemGetInfo( &freeMem, &totalMem );
    freeMem /= sizeof(magmaFloatComplex);
    
    magma_int_t NB = magma_int_t(0.8*freeMem/m);
    NB = (NB / nb) * nb;

    if (NB >= n)
        return magma_cgeqrf(m, n, A, lda, tau, work, lwork, info);

    min_mn = min(m,n);
    if (min_mn == 0) {
        work[0] = c_one;
        return *info;
    }

    lddwork = magma_roundup( NB, 32 ) + nb;
    ldda    = magma_roundup( m, 32 );

    if (MAGMA_SUCCESS != magma_cmalloc( &dA, (NB + nb)*ldda + nb*lddwork )) {
        *info = MAGMA_ERR_DEVICE_ALLOC;
        return *info;
    }

    magma_queue_t queues[2];
    magma_device_t cdev;
    magma_getdevice( &cdev );
    magma_queue_create( cdev, &queues[0] );
    magma_queue_create( cdev, &queues[1] );

    magmaFloatComplex_ptr ptr = dA + ldda*NB;
    dwork = dA + ldda*(NB + nb);

    /* start the main loop over the blocks that fit in the GPU memory */
    for (i=0; i < n; i += NB) {
        IB = min( n-i, NB );
        //printf("Processing %5d columns -- %5d to %5d ... \n", IB, i, i+IB);

        /* 1. Copy the next part of the matrix to the GPU */
        magma_csetmatrix_async( m, IB,
                                A(0,i),  lda,
                                dA(0,0), ldda, queues[0] );
        magma_queue_sync( queues[0] );

        /* 2. Update it with the previous transformations */
        for (j=0; j < min(i,min_mn); j += nb) {
            ib = min( min_mn-j, nb );

            /* Get a panel in ptr.                                           */
            //   1. Form the triangular factor of the block reflector
            //   2. Send it to the GPU.
            //   3. Put 0s in the upper triangular part of V.
            //   4. Send V to the GPU in ptr.
            //   5. Update the matrix.
            //   6. Restore the upper part of V.
            rows = m-j;
            lapackf77_clarft( MagmaForwardStr, MagmaColumnwiseStr,
                              &rows, &ib, A(j,j), &lda, tau+j, work, &ib);
            magma_csetmatrix_async( ib, ib,
                                    work,  ib,
                                    dwork, lddwork, queues[1] );

            magma_cpanel_to_q( MagmaUpper, ib, A(j,j), lda, work+ib*ib );
            magma_csetmatrix_async( rows, ib,
                                    A(j,j), lda,
                                    ptr,    rows, queues[1] );
            magma_queue_sync( queues[1] );

            magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
                              rows, IB, ib,
                              ptr, rows, dwork,    lddwork,
                              dA(j, 0), ldda, dwork+ib, lddwork, queues[1] );

            magma_cq_to_panel( MagmaUpper, ib, A(j,j), lda, work+ib*ib );
        }

        /* 3. Do a QR on the current part */
        if (i < min_mn)
            magma_cgeqrf2_gpu( m-i, IB, dA(i,0), ldda, tau+i, info );

        /* 4. Copy the current part back to the CPU */
        magma_cgetmatrix_async( m, IB,
                                dA(0,0), ldda,
                                A(0,i),  lda, queues[0] );
    }

    magma_queue_sync( queues[0] );

    magma_queue_destroy( queues[0] );
    magma_queue_destroy( queues[1] );
    magma_free( dA );
    
    return *info;
} /* magma_cgeqrf_ooc */