void COPROC_NVIDIA::get( vector<string>& warnings ) { int cuda_ndevs, retval; char buf[256]; #ifdef _WIN32 HMODULE cudalib = LoadLibrary("nvcuda.dll"); if (!cudalib) { warnings.push_back("No NVIDIA library found"); return; } __cuDeviceGetCount = (CUDA_GDC)GetProcAddress( cudalib, "cuDeviceGetCount" ); __cuDriverGetVersion = (CUDA_GDV)GetProcAddress( cudalib, "cuDriverGetVersion" ); __cuInit = (CUDA_GDI)GetProcAddress( cudalib, "cuInit" ); __cuDeviceGet = (CUDA_GDG)GetProcAddress( cudalib, "cuDeviceGet" ); __cuDeviceGetAttribute = (CUDA_GDA)GetProcAddress( cudalib, "cuDeviceGetAttribute" ); __cuDeviceGetName = (CUDA_GDN)GetProcAddress( cudalib, "cuDeviceGetName" ); __cuDeviceTotalMem = (CUDA_GDM)GetProcAddress( cudalib, "cuDeviceTotalMem" ); __cuDeviceComputeCapability = (CUDA_GDCC)GetProcAddress( cudalib, "cuDeviceComputeCapability" ); __cuCtxCreate = (CUDA_CC)GetProcAddress( cudalib, "cuCtxCreate" ); __cuCtxDestroy = (CUDA_CD)GetProcAddress( cudalib, "cuCtxDestroy" ); __cuMemAlloc = (CUDA_MA)GetProcAddress( cudalib, "cuMemAlloc" ); __cuMemFree = (CUDA_MF)GetProcAddress( cudalib, "cuMemFree" ); __cuMemGetInfo = (CUDA_MGI)GetProcAddress( cudalib, "cuMemGetInfo" ); #ifndef SIM NvAPI_Initialize(); NvAPI_ShortString ss; NvU32 Version = 0; NvAPI_SYS_GetDriverAndBranchVersion(&Version, ss); #if 0 // NvAPI now provides an API for getting #cores :-) // But not FLOPs per clock cycle :-( // Anyway, don't use this for now because server code estimates FLOPS // based on compute capability, so we may as well do the same // See http://docs.nvidia.com/gameworks/content/gameworkslibrary/coresdk/nvapi/ // NvPhysicalGpuHandle GPUHandle[NVAPI_MAX_PHYSICAL_GPUS]; NvU32 GpuCount, nc; NvAPI_EnumPhysicalGPUs(GPUHandle, &GpuCount); for (unsigned int i=0; i<GpuCount; i++) { NvAPI_GPU_GetGpuCoreCount(GPUHandle[i], &nc); } #endif #endif #else #ifdef __APPLE__ cudalib = dlopen("/usr/local/cuda/lib/libcuda.dylib", RTLD_NOW); #else cudalib = dlopen("libcuda.so", RTLD_NOW); #endif if (!cudalib) { warnings.push_back("No NVIDIA library found"); return; } __cuDeviceGetCount = (int(*)(int*)) dlsym(cudalib, "cuDeviceGetCount"); __cuDriverGetVersion = (int(*)(int*)) dlsym( cudalib, "cuDriverGetVersion" ); __cuInit = (int(*)(unsigned int)) dlsym( cudalib, "cuInit" ); __cuDeviceGet = (int(*)(int*, int)) dlsym( cudalib, "cuDeviceGet" ); __cuDeviceGetAttribute = (int(*)(int*, int, int)) dlsym( cudalib, "cuDeviceGetAttribute" ); __cuDeviceGetName = (int(*)(char*, int, int)) dlsym( cudalib, "cuDeviceGetName" ); __cuDeviceTotalMem = (int(*)(size_t*, int)) dlsym( cudalib, "cuDeviceTotalMem" ); __cuDeviceComputeCapability = (int(*)(int*, int*, int)) dlsym( cudalib, "cuDeviceComputeCapability" ); __cuCtxCreate = (int(*)(void**, unsigned int, unsigned int)) dlsym( cudalib, "cuCtxCreate" ); __cuCtxDestroy = (int(*)(void*)) dlsym( cudalib, "cuCtxDestroy" ); __cuMemAlloc = (int(*)(unsigned int*, size_t)) dlsym( cudalib, "cuMemAlloc" ); __cuMemFree = (int(*)(unsigned int)) dlsym( cudalib, "cuMemFree" ); __cuMemGetInfo = (int(*)(size_t*, size_t*)) dlsym( cudalib, "cuMemGetInfo" ); dlclose(cudalib); #endif if (!__cuDriverGetVersion) { warnings.push_back("cuDriverGetVersion() missing from NVIDIA library"); return; } if (!__cuInit) { warnings.push_back("cuInit() missing from NVIDIA library"); return; } if (!__cuDeviceGetCount) { warnings.push_back("cuDeviceGetCount() missing from NVIDIA library"); return; } if (!__cuDeviceGet) { warnings.push_back("cuDeviceGet() missing from NVIDIA library"); return; } if (!__cuDeviceGetAttribute) { warnings.push_back("cuDeviceGetAttribute() missing from NVIDIA library"); return; } if (!__cuDeviceTotalMem) { warnings.push_back("cuDeviceTotalMem() missing from NVIDIA library"); return; } if (!__cuDeviceComputeCapability) { warnings.push_back("cuDeviceComputeCapability() missing from NVIDIA library"); return; } if (!__cuMemAlloc) { warnings.push_back("cuMemAlloc() missing from NVIDIA library"); return; } if (!__cuMemFree) { warnings.push_back("cuMemFree() missing from NVIDIA library"); return; } #ifdef __APPLE__ // If system is just booting, CUDA driver may not be ready yet for (int retryCount=0; retryCount<45; retryCount++) { #endif retval = (*__cuInit)(0); #ifdef __APPLE__ if (!retval) break; if (TickCount() > (120*60)) break; // Don't retry if system has been up for over 2 minutes boinc_sleep(1.); continue; } #endif if (retval) { sprintf(buf, "NVIDIA drivers present but no GPUs found"); warnings.push_back(buf); return; } retval = (*__cuDriverGetVersion)(&cuda_version); if (retval) { sprintf(buf, "cuDriverGetVersion() returned %d", retval); warnings.push_back(buf); return; } have_cuda = true; retval = (*__cuDeviceGetCount)(&cuda_ndevs); if (retval) { sprintf(buf, "cuDeviceGetCount() returned %d", retval); warnings.push_back(buf); return; } sprintf(buf, "NVIDIA library reports %d GPU%s", cuda_ndevs, (cuda_ndevs==1)?"":"s"); warnings.push_back(buf); int j, itemp; size_t global_mem = 0; COPROC_NVIDIA cc; string s; for (j=0; j<cuda_ndevs; j++) { memset(&cc.prop, 0, sizeof(cc.prop)); CUdevice device; retval = (*__cuDeviceGet)(&device, j); if (retval) { sprintf(buf, "cuDeviceGet(%d) returned %d", j, retval); warnings.push_back(buf); return; } (*__cuDeviceGetName)(cc.prop.name, 256, device); if (retval) { sprintf(buf, "cuDeviceGetName(%d) returned %d", j, retval); warnings.push_back(buf); return; } (*__cuDeviceComputeCapability)(&cc.prop.major, &cc.prop.minor, device); (*__cuDeviceTotalMem)(&global_mem, device); cc.prop.totalGlobalMem = (double) global_mem; (*__cuDeviceGetAttribute)(&itemp, CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK, device); cc.prop.sharedMemPerBlock = (double) itemp; (*__cuDeviceGetAttribute)(&cc.prop.regsPerBlock, CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK, device); (*__cuDeviceGetAttribute)(&cc.prop.warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, device); (*__cuDeviceGetAttribute)(&itemp, CU_DEVICE_ATTRIBUTE_MAX_PITCH, device); cc.prop.memPitch = (double) itemp; retval = (*__cuDeviceGetAttribute)(&cc.prop.maxThreadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, device); retval = (*__cuDeviceGetAttribute)(&cc.prop.maxThreadsDim[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, device); (*__cuDeviceGetAttribute)(&cc.prop.maxThreadsDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, device); (*__cuDeviceGetAttribute)(&cc.prop.maxThreadsDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, device); (*__cuDeviceGetAttribute)(&cc.prop.maxGridSize[0], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, device); (*__cuDeviceGetAttribute)(&cc.prop.maxGridSize[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, device); (*__cuDeviceGetAttribute)(&cc.prop.maxGridSize[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, device); (*__cuDeviceGetAttribute)(&cc.prop.clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device); (*__cuDeviceGetAttribute)(&itemp, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, device); cc.prop.totalConstMem = (double) itemp; (*__cuDeviceGetAttribute)(&itemp, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, device); cc.prop.textureAlignment = (double) itemp; (*__cuDeviceGetAttribute)(&cc.prop.deviceOverlap, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, device); (*__cuDeviceGetAttribute)(&cc.prop.multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device); (*__cuDeviceGetAttribute)(&cc.pci_info.bus_id, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, device); (*__cuDeviceGetAttribute)(&cc.pci_info.device_id, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, device); (*__cuDeviceGetAttribute)(&cc.pci_info.domain_id, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, device); if (cc.prop.major <= 0) continue; // major == 0 means emulation if (cc.prop.major > 100) continue; // e.g. 9999 is an error #if defined(_WIN32) && !defined(SIM) cc.display_driver_version = Version; #elif defined(__APPLE__) cc.display_driver_version = NSVersionOfRunTimeLibrary("cuda"); #else cc.display_driver_version = nvidia_driver_version(); #endif cc.have_cuda = true; cc.cuda_version = cuda_version; cc.device_num = j; cc.set_peak_flops(); get_available_nvidia_ram(cc, warnings); nvidia_gpus.push_back(cc); } if (!nvidia_gpus.size()) { warnings.push_back("No CUDA-capable NVIDIA GPUs found"); } }
void COPROC_NVIDIA::get( bool use_all, // if false, use only those equivalent to most capable vector<string>& warnings, vector<int>& ignore_devs ) { int cuda_ndevs, retval; char buf[256]; #ifdef _WIN32 HMODULE cudalib = LoadLibrary("nvcuda.dll"); if (!cudalib) { warnings.push_back("No NVIDIA library found"); return; } __cuDeviceGetCount = (CUDA_GDC)GetProcAddress( cudalib, "cuDeviceGetCount" ); __cuDriverGetVersion = (CUDA_GDV)GetProcAddress( cudalib, "cuDriverGetVersion" ); __cuInit = (CUDA_GDI)GetProcAddress( cudalib, "cuInit" ); __cuDeviceGet = (CUDA_GDG)GetProcAddress( cudalib, "cuDeviceGet" ); __cuDeviceGetAttribute = (CUDA_GDA)GetProcAddress( cudalib, "cuDeviceGetAttribute" ); __cuDeviceGetName = (CUDA_GDN)GetProcAddress( cudalib, "cuDeviceGetName" ); __cuDeviceTotalMem = (CUDA_GDM)GetProcAddress( cudalib, "cuDeviceTotalMem" ); __cuDeviceComputeCapability = (CUDA_GDCC)GetProcAddress( cudalib, "cuDeviceComputeCapability" ); __cuCtxCreate = (CUDA_CC)GetProcAddress( cudalib, "cuCtxCreate" ); __cuCtxDestroy = (CUDA_CD)GetProcAddress( cudalib, "cuCtxDestroy" ); __cuMemAlloc = (CUDA_MA)GetProcAddress( cudalib, "cuMemAlloc" ); __cuMemFree = (CUDA_MF)GetProcAddress( cudalib, "cuMemFree" ); __cuMemGetInfo = (CUDA_MGI)GetProcAddress( cudalib, "cuMemGetInfo" ); #ifndef SIM NvAPI_Status nvapiStatus; NV_DISPLAY_DRIVER_VERSION Version; memset(&Version, 0, sizeof(Version)); Version.version = NV_DISPLAY_DRIVER_VERSION_VER; NvAPI_Initialize(); nvapiStatus = NvAPI_GetDisplayDriverVersion(NULL, &Version); #endif #else #ifdef __APPLE__ cudalib = dlopen("/usr/local/cuda/lib/libcuda.dylib", RTLD_NOW); #else cudalib = dlopen("libcuda.so", RTLD_NOW); #endif if (!cudalib) { warnings.push_back("No NVIDIA library found"); return; } __cuDeviceGetCount = (int(*)(int*)) dlsym(cudalib, "cuDeviceGetCount"); __cuDriverGetVersion = (int(*)(int*)) dlsym( cudalib, "cuDriverGetVersion" ); __cuInit = (int(*)(unsigned int)) dlsym( cudalib, "cuInit" ); __cuDeviceGet = (int(*)(int*, int)) dlsym( cudalib, "cuDeviceGet" ); __cuDeviceGetAttribute = (int(*)(int*, int, int)) dlsym( cudalib, "cuDeviceGetAttribute" ); __cuDeviceGetName = (int(*)(char*, int, int)) dlsym( cudalib, "cuDeviceGetName" ); __cuDeviceTotalMem = (int(*)(size_t*, int)) dlsym( cudalib, "cuDeviceTotalMem" ); __cuDeviceComputeCapability = (int(*)(int*, int*, int)) dlsym( cudalib, "cuDeviceComputeCapability" ); __cuCtxCreate = (int(*)(void**, unsigned int, unsigned int)) dlsym( cudalib, "cuCtxCreate" ); __cuCtxDestroy = (int(*)(void*)) dlsym( cudalib, "cuCtxDestroy" ); __cuMemAlloc = (int(*)(unsigned int*, size_t)) dlsym( cudalib, "cuMemAlloc" ); __cuMemFree = (int(*)(unsigned int)) dlsym( cudalib, "cuMemFree" ); __cuMemGetInfo = (int(*)(size_t*, size_t*)) dlsym( cudalib, "cuMemGetInfo" ); #endif if (!__cuDriverGetVersion) { warnings.push_back("cuDriverGetVersion() missing from NVIDIA library"); return; } if (!__cuInit) { warnings.push_back("cuInit() missing from NVIDIA library"); return; } if (!__cuDeviceGetCount) { warnings.push_back("cuDeviceGetCount() missing from NVIDIA library"); return; } if (!__cuDeviceGet) { warnings.push_back("cuDeviceGet() missing from NVIDIA library"); return; } if (!__cuDeviceGetAttribute) { warnings.push_back("cuDeviceGetAttribute() missing from NVIDIA library"); return; } if (!__cuDeviceTotalMem) { warnings.push_back("cuDeviceTotalMem() missing from NVIDIA library"); return; } if (!__cuDeviceComputeCapability) { warnings.push_back("cuDeviceComputeCapability() missing from NVIDIA library"); return; } if (!__cuCtxCreate) { warnings.push_back("cuCtxCreate() missing from NVIDIA library"); return; } if (!__cuCtxDestroy) { warnings.push_back("cuCtxDestroy() missing from NVIDIA library"); return; } if (!__cuMemAlloc) { warnings.push_back("cuMemAlloc() missing from NVIDIA library"); return; } if (!__cuMemFree) { warnings.push_back("cuMemFree() missing from NVIDIA library"); return; } if (!__cuMemGetInfo) { warnings.push_back("cuMemGetInfo() missing from NVIDIA library"); return; } retval = (*__cuInit)(0); if (retval) { sprintf(buf, "NVIDIA drivers present but no GPUs found"); warnings.push_back(buf); return; } retval = (*__cuDriverGetVersion)(&cuda_version); if (retval) { sprintf(buf, "cuDriverGetVersion() returned %d", retval); warnings.push_back(buf); return; } retval = (*__cuDeviceGetCount)(&cuda_ndevs); if (retval) { sprintf(buf, "cuDeviceGetCount() returned %d", retval); warnings.push_back(buf); return; } sprintf(buf, "NVIDIA library reports %d GPU%s", cuda_ndevs, (cuda_ndevs==1)?"":"s"); warnings.push_back(buf); int j, itemp; unsigned int i; size_t global_mem; COPROC_NVIDIA cc; string s; for (j=0; j<cuda_ndevs; j++) { memset(&cc.prop, 0, sizeof(cc.prop)); CUdevice device; retval = (*__cuDeviceGet)(&device, j); if (retval) { sprintf(buf, "cuDeviceGet(%d) returned %d", j, retval); warnings.push_back(buf); return; } (*__cuDeviceGetName)(cc.prop.name, 256, device); if (retval) { sprintf(buf, "cuDeviceGetName(%d) returned %d", j, retval); warnings.push_back(buf); return; } (*__cuDeviceComputeCapability)(&cc.prop.major, &cc.prop.minor, device); (*__cuDeviceTotalMem)(&global_mem, device); cc.prop.totalGlobalMem = (double) global_mem; (*__cuDeviceGetAttribute)(&itemp, CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK, device); cc.prop.sharedMemPerBlock = (double) itemp; (*__cuDeviceGetAttribute)(&cc.prop.regsPerBlock, CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK, device); (*__cuDeviceGetAttribute)(&cc.prop.warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, device); (*__cuDeviceGetAttribute)(&itemp, CU_DEVICE_ATTRIBUTE_MAX_PITCH, device); cc.prop.memPitch = (double) itemp; retval = (*__cuDeviceGetAttribute)(&cc.prop.maxThreadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, device); retval = (*__cuDeviceGetAttribute)(&cc.prop.maxThreadsDim[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, device); (*__cuDeviceGetAttribute)(&cc.prop.maxThreadsDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, device); (*__cuDeviceGetAttribute)(&cc.prop.maxThreadsDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, device); (*__cuDeviceGetAttribute)(&cc.prop.maxGridSize[0], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, device); (*__cuDeviceGetAttribute)(&cc.prop.maxGridSize[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, device); (*__cuDeviceGetAttribute)(&cc.prop.maxGridSize[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, device); (*__cuDeviceGetAttribute)(&cc.prop.clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device); (*__cuDeviceGetAttribute)(&itemp, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, device); cc.prop.totalConstMem = (double) itemp; (*__cuDeviceGetAttribute)(&itemp, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, device); cc.prop.textureAlignment = (double) itemp; (*__cuDeviceGetAttribute)(&cc.prop.deviceOverlap, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, device); (*__cuDeviceGetAttribute)(&cc.prop.multiProcessorCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device); (*__cuDeviceGetAttribute)(&cc.pci_info.bus_id, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, device); (*__cuDeviceGetAttribute)(&cc.pci_info.device_id, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, device); (*__cuDeviceGetAttribute)(&cc.pci_info.domain_id, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, device); if (cc.prop.major <= 0) continue; // major == 0 means emulation if (cc.prop.major > 100) continue; // e.g. 9999 is an error #if defined(_WIN32) && !defined(SIM) cc.display_driver_version = Version.drvVersion; #elif defined(__APPLE__) cc.display_driver_version = NSVersionOfRunTimeLibrary("cuda"); #else cc.display_driver_version = 0; #endif cc.have_cuda = true; cc.cuda_version = cuda_version; cc.device_num = j; cc.set_peak_flops(); cc.get_available_ram(); nvidia_gpus.push_back(cc); } if (!nvidia_gpus.size()) { warnings.push_back("No CUDA-capable NVIDIA GPUs found"); return; } // identify the most capable non-ignored instance // bool first = true; for (i=0; i<nvidia_gpus.size(); i++) { if (in_vector(nvidia_gpus[i].device_num, ignore_devs)) continue; if (first) { *this = nvidia_gpus[i]; first = false; } else if (nvidia_compare(nvidia_gpus[i], *this, false) > 0) { *this = nvidia_gpus[i]; } } // see which other instances are equivalent, // and set "count", "device_nums", and "pci_infos" // count = 0; for (i=0; i<nvidia_gpus.size(); i++) { if (in_vector(nvidia_gpus[i].device_num, ignore_devs)) { nvidia_gpus[i].is_used = COPROC_IGNORED; } else if (use_all || !nvidia_compare(nvidia_gpus[i], *this, true)) { device_nums[count] = nvidia_gpus[i].device_num; pci_infos[count] = nvidia_gpus[i].pci_info; count++; nvidia_gpus[i].is_used = COPROC_USED; } else { nvidia_gpus[i].is_used = COPROC_UNUSED; } } }
void COPROCS::get_opencl( vector<string>& warnings ) { cl_int ciErrNum; cl_platform_id platforms[MAX_OPENCL_PLATFORMS]; cl_uint num_platforms, platform_index, num_devices, device_index; cl_device_id devices[MAX_COPROC_INSTANCES]; char platform_version[256]; char platform_vendor[256]; char buf[256]; OPENCL_DEVICE_PROP prop; int current_CUDA_index; int current_CAL_index; int min_CAL_target; int num_CAL_devices = (int)ati_gpus.size(); vector<int>devnums_pci_slot_sort; vector<OPENCL_DEVICE_PROP>::iterator it; #ifdef _WIN32 opencl_lib = LoadLibrary("OpenCL.dll"); if (!opencl_lib) { warnings.push_back("No OpenCL library found"); return; } __clGetPlatformIDs = (CL_PLATFORMIDS)GetProcAddress( opencl_lib, "clGetPlatformIDs" ); __clGetPlatformInfo = (CL_PLATFORMINFO)GetProcAddress( opencl_lib, "clGetPlatformInfo" ); __clGetDeviceIDs = (CL_DEVICEIDS)GetProcAddress( opencl_lib, "clGetDeviceIDs" ); __clGetDeviceInfo = (CL_INFO)GetProcAddress( opencl_lib, "clGetDeviceInfo" ); #else #ifdef __APPLE__ opencl_lib = dlopen("/System/Library/Frameworks/OpenCL.framework/Versions/Current/OpenCL", RTLD_NOW); #else //TODO: Is this correct? opencl_lib = dlopen("libOpenCL.so", RTLD_NOW); #endif if (!opencl_lib) { warnings.push_back("No OpenCL library found"); return; } __clGetPlatformIDs = (cl_int(*)(cl_uint, cl_platform_id*, cl_uint*)) dlsym( opencl_lib, "clGetPlatformIDs" ); __clGetPlatformInfo = (cl_int(*)(cl_platform_id, cl_platform_info, size_t, void*, size_t*)) dlsym( opencl_lib, "clGetPlatformInfo" ); __clGetDeviceIDs = (cl_int(*)(cl_platform_id, cl_device_type, cl_uint, cl_device_id*, cl_uint*)) dlsym( opencl_lib, "clGetDeviceIDs" ); __clGetDeviceInfo = (cl_int(*)(cl_device_id, cl_device_info, size_t, void*, size_t*)) dlsym( opencl_lib, "clGetDeviceInfo" ); #endif if (!__clGetPlatformIDs) { warnings.push_back("clGetPlatformIDs() missing from OpenCL library"); return; } if (!__clGetPlatformInfo) { warnings.push_back("clGetPlatformInfo() missing from OpenCL library"); return; } if (!__clGetDeviceIDs) { warnings.push_back("clGetDeviceIDs() missing from OpenCL library"); return; } if (!__clGetDeviceInfo) { warnings.push_back("clGetDeviceInfo() missing from OpenCL library"); return; } ciErrNum = (*__clGetPlatformIDs)(MAX_OPENCL_PLATFORMS, platforms, &num_platforms); if ((ciErrNum != CL_SUCCESS) || (num_platforms == 0)) { warnings.push_back("clGetPlatformIDs() failed to return any OpenCL platforms"); return; } if (nvidia_gpus.size()) { for (int i=0; i<(int)nvidia_gpus.size(); ++i) { devnums_pci_slot_sort.push_back(i); } #ifdef __APPLE__ std::stable_sort( devnums_pci_slot_sort.begin(), devnums_pci_slot_sort.end(), compare_pci_slots ); #endif } for (platform_index=0; platform_index<num_platforms; ++platform_index) { ciErrNum = (*__clGetPlatformInfo)( platforms[platform_index], CL_PLATFORM_VERSION, sizeof(platform_version), &platform_version, NULL ); if (ciErrNum != CL_SUCCESS) { snprintf(buf, sizeof(buf), "Couldn't get PLATFORM_VERSION for platform #%d; error %d", platform_index, ciErrNum ); warnings.push_back(buf); continue; } ciErrNum = (*__clGetPlatformInfo)( platforms[platform_index], CL_PLATFORM_VENDOR, sizeof(platform_vendor), &platform_vendor, NULL ); if (ciErrNum != CL_SUCCESS) { snprintf(buf, sizeof(buf), "Couldn't get PLATFORM_VENDOR for platform #%d; error %d", platform_index, ciErrNum ); warnings.push_back(buf); } //////////// CPU ////////////// ciErrNum = (*__clGetDeviceIDs)( platforms[platform_index], (CL_DEVICE_TYPE_CPU), MAX_COPROC_INSTANCES, devices, &num_devices ); if ((ciErrNum != CL_SUCCESS) && (num_devices != 0)) { num_devices = 0; // No devices if (ciErrNum != CL_DEVICE_NOT_FOUND) { snprintf(buf, sizeof(buf), "Couldn't get CPU Device IDs for platform #%d: error %d", platform_index, ciErrNum ); warnings.push_back(buf); } } for (device_index=0; device_index<num_devices; ++device_index) { memset(&prop, 0, sizeof(prop)); prop.device_id = devices[device_index]; strncpy( prop.opencl_platform_version, platform_version, sizeof(prop.opencl_platform_version)-1 ); ciErrNum = get_opencl_info(prop, device_index, warnings); if (ciErrNum != CL_SUCCESS) continue; prop.is_used = COPROC_UNUSED; prop.get_device_version_int(); OPENCL_CPU_PROP c; strlcpy(c.platform_vendor, platform_vendor, sizeof(c.platform_vendor)); c.opencl_prop = prop; cpu_opencls.push_back(c); } //////////// GPUs ////////////// ciErrNum = (*__clGetDeviceIDs)( platforms[platform_index], (CL_DEVICE_TYPE_GPU), MAX_COPROC_INSTANCES, devices, &num_devices ); if (ciErrNum == CL_DEVICE_NOT_FOUND) continue; // No devices if (num_devices == 0) continue; // No devices if (ciErrNum != CL_SUCCESS) { snprintf(buf, sizeof(buf), "Couldn't get Device IDs for platform #%d: error %d", platform_index, ciErrNum ); warnings.push_back(buf); continue; } // Mac OpenCL does not recognize all NVIDIA GPUs returned by CUDA // Fortunately, CUDA and OpenCL return the same GPU model name on // the Mac, so we can use this to match OpenCL devices with CUDA. // current_CUDA_index = 0; // ATI/AMD OpenCL does not always recognize all GPUs returned by CAL. // This is complicated for several reasons: // * CAL returns only an enum (CALtargetEnum) for the GPU's family, // not specific model information. // * OpenCL return only the GPU family name // * Which GPUs support OpenCL varies with different versions of the // AMD Catalyst drivers. // // To deal with this, we make some (probably imperfect) assumptions: // * AMD drivers eliminate OpenCL support for older GPU families first. // * Lower values of CALtargetEnum represent older GPU families. // * All ATI/AMD GPUs reported by OpenCL are also reported by CAL (on // systems where CAL is available) though the converse may not be true. // current_CAL_index = 0; min_CAL_target = 0; if (is_AMD(platform_vendor) && (num_CAL_devices > 0)) { while (1) { int numToMatch = 0; for (int i=0; i<num_CAL_devices; ++i) { if ((int)ati_gpus[i].attribs.target >= min_CAL_target) { ++numToMatch; } } if (numToMatch == (int)num_devices) break; if (numToMatch < (int)num_devices) { warnings.push_back( "Could not match ATI OpenCL and CAL GPUs: ignoring CAL." ); // If we can't match ATI OpenCL and CAL GPUs, ignore CAL // and keep OpenCL because AMD has deprecated CAL. ati_gpus.clear(); ati.have_cal = false; num_CAL_devices = 0; break; } ++min_CAL_target; } } for (device_index=0; device_index<num_devices; ++device_index) { memset(&prop, 0, sizeof(prop)); prop.device_id = devices[device_index]; strncpy( prop.opencl_platform_version, platform_version, sizeof(prop.opencl_platform_version)-1 ); //TODO: Should we store the platform(s) for each GPU found? //TODO: Must we check if multiple platforms found the same GPU and merge the records? ciErrNum = get_opencl_info(prop, device_index, warnings); if (ciErrNum != CL_SUCCESS) continue; prop.is_used = COPROC_UNUSED; prop.get_device_version_int(); //////////// NVIDIA ////////////// if (is_NVIDIA(prop.vendor)) { if (nvidia.have_cuda) { // Mac OpenCL does not recognize all NVIDIA GPUs returned by // CUDA but we assume that OpenCL and CUDA return devices // with identical model name strings and that OpenCL returns // devices in order of acending PCI slot. // // On other systems, assume OpenCL and CUDA return devices // in the same order. // while (1) { if (current_CUDA_index >= (int)(nvidia_gpus.size())) { snprintf(buf, sizeof(buf), "OpenCL NVIDIA index #%d does not match any CUDA device", device_index ); warnings.push_back(buf); return; // Should never happen } if (!strcmp(prop.name, nvidia_gpus[devnums_pci_slot_sort[current_CUDA_index]].prop.name) ) { break; // We have a match } // This CUDA GPU is not recognized by OpenCL, // so try the next // ++current_CUDA_index; } prop.device_num = devnums_pci_slot_sort[current_CUDA_index]; } else { prop.device_num = (int)(nvidia_opencls.size()); } prop.opencl_device_index = device_index; if (nvidia.have_cuda) { prop.peak_flops = nvidia_gpus[prop.device_num].peak_flops; } else { COPROC_NVIDIA c; c.opencl_prop = prop; c.set_peak_flops(); prop.peak_flops = c.peak_flops; } if (nvidia_gpus.size()) { // Assumes OpenCL device_num and CUDA device_num now match // prop.opencl_available_ram = nvidia_gpus[prop.device_num].available_ram; } else { prop.opencl_available_ram = prop.global_mem_size; } // Build nvidia_opencls vector in device_num order for (it=nvidia_opencls.begin(); it<nvidia_opencls.end(); it++) { if (it->device_num > prop.device_num) break; } nvidia_opencls.insert(it, prop); ++current_CUDA_index; } //////////// AMD / ATI ////////////// if (is_AMD(prop.vendor)) { prop.opencl_device_index = device_index; if (ati.have_cal) { // AMD OpenCL does not recognize all AMD GPUs returned by // CAL but we assume that OpenCL and CAL return devices in // the same order. See additional comments earlier in // this source file for more details. // while (1) { if (current_CAL_index >= num_CAL_devices) { snprintf(buf, sizeof(buf), "OpenCL ATI device #%d does not match any CAL device", device_index ); warnings.push_back(buf); return; // Should never happen } if ((int)ati_gpus[current_CAL_index].attribs.target >= min_CAL_target) { break; // We have a match } // This CAL GPU is not recognized by OpenCL, // so try the next // ++current_CAL_index; } prop.device_num = current_CAL_index++; // Always use GPU model name from CAL if // available for ATI / AMD GPUs because // (we believe) it is more user-friendly. // safe_strcpy(prop.name, ati_gpus[prop.device_num].name); // Work around a bug in OpenCL which returns only // 1/2 of total global RAM size: use the value from CAL. // This bug applies only to ATI GPUs, not to NVIDIA // See also further workaround code for Macs. // prop.global_mem_size = ati_gpus[prop.device_num].attribs.localRAM * MEGA; prop.peak_flops = ati_gpus[prop.device_num].peak_flops; } else { // ! ati.have_cal prop.device_num = (int)(ati_opencls.size()); COPROC_ATI c; c.opencl_prop = prop; c.set_peak_flops(); prop.peak_flops = c.peak_flops; } if (ati_gpus.size()) { prop.opencl_available_ram = ati_gpus[prop.device_num].available_ram; } else { prop.opencl_available_ram = prop.global_mem_size; } ati_opencls.push_back(prop); } //////////// INTEL GPU ////////////// // if (is_intel(prop.vendor)) { prop.device_num = (int)(intel_gpu_opencls.size()); prop.opencl_device_index = device_index; COPROC_INTEL c; c.opencl_prop = prop; c.is_used = COPROC_UNUSED; c.available_ram = prop.global_mem_size; safe_strcpy(c.name, prop.name); safe_strcpy(c.version, prop.opencl_driver_version); c.set_peak_flops(); prop.peak_flops = c.peak_flops; prop.opencl_available_ram = prop.global_mem_size; intel_gpu_opencls.push_back(prop); // At present Intel GPUs only support OpenCL // and do not have a native GPGPU framework, // so treat each detected Intel OpenCL GPU device as // a native device. // intel_gpus.push_back(c); } } } #ifdef __APPLE__ // Work around a bug in OpenCL which returns only // 1/2 of total global RAM size. // This bug applies only to ATI GPUs, not to NVIDIA // This has already been fixed on latest Catalyst // drivers, but Mac does not use Catalyst drivers. if (ati_opencls.size() > 0) { opencl_get_ati_mem_size_from_opengl(warnings); } #endif if ((nvidia_opencls.size() == 0) && (ati_opencls.size() == 0) && (intel_gpu_opencls.size() == 0) ) { warnings.push_back( "OpenCL library present but no OpenCL-capable GPUs found" ); } }