/*
 * Estimate the peak GFLOPS of a device.
 *
 * di:        device description; devType, vendor and vendorID are read here.
 * useDouble: forwarded unchanged to the vendor-specific estimators.
 *
 * Returns the vendor-specific estimate for Nvidia and AMD GPUs, 100.0 for a
 * GPU from any other vendor, and 1.0 for every non-GPU device type. The two
 * fallback cases also print a diagnostic message.
 */
cl_double mwDeviceEstimateGFLOPs(const DevInfo* di, cl_bool useDouble)
{
    /* Only GPUs have an estimate; everything else gets a token value. */
    if (di->devType != CL_DEVICE_TYPE_GPU)
    {
        mw_printf("Missing flops estimate for device type %s\n", showCLDeviceType(di->devType));
        return 1.0;
    }

    if (mwIsNvidiaGPUDevice(di))
    {
        return mwCUDAEstimateGFLOPs(di, useDouble);
    }

    if (mwIsAMDGPUDevice(di))
    {
        return mwAMDEstimateGFLOPs(di, useDouble);
    }

    /* A GPU, but from a vendor without an estimator. */
    mw_printf("Unhandled GPU vendor '%s' (0x%x)\n", di->vendor, di->vendorID);
    return 100.0;
}
/*
 * Report whether the driver version string of the device in ci falls inside
 * the version ranges that this function flags for the high CPU wait issue.
 *
 * ci: CL state; only ci->di (vendor checks and the driver string) is read.
 *
 * Returns CL_FALSE when the device is neither an Nvidia nor an AMD GPU, or
 * when the driver string cannot be parsed in the expected format.
 */
cl_bool mwDriverHasHighCPUWaitIssue(CLInfo* ci)
{
    const DevInfo* di = &ci->di;

    if (mwIsNvidiaGPUDevice(di))
    {
        /* %u requires unsigned int targets; passing int* here is a
         * format/argument mismatch. */
        unsigned int nvMajor = 0;
        unsigned int nvMinor = 0;

        if (sscanf(di->driver, "%u.%u", &nvMajor, &nvMinor) != 2)
        {
            return CL_FALSE;
        }

        /* Issue started around 270.xx and hasn't been fixed yet. */
        return (nvMajor >= 270);
    }
    else if (mwIsAMDGPUDevice(di))
    {
        int major = 0;
        int minor = 0;
        int patchLevel = 0;

        /* Sometimes it has other stuff like (VM) after. Not sure what that means. */
        if (sscanf(di->driver, "CAL %d.%d.%d", &major, &minor, &patchLevel) != 3)
        {
            return CL_FALSE;
        }

        /* I think it happened in 11.7 and 11.8 */
        return (major == 1 && minor == 4 && patchLevel >= 1457 && patchLevel < 1546);
    }
    else
    {
        return CL_FALSE;
    }
}
/*
 * Check whether an Nvidia GPU's driver is at least version minMajor.minMinor.
 *
 * di:       device description; the driver string is parsed as "major.minor".
 * minMajor: lowest acceptable major version.
 * minMinor: lowest acceptable minor version when the major version is equal.
 *
 * Returns CL_FALSE when the device is not an Nvidia GPU or when the driver
 * string does not start with two dot-separated unsigned numbers.
 */
cl_bool mwNvidiaDriverVersionGreaterEqual(const DevInfo* di, cl_uint minMajor, cl_uint minMinor)
{
    cl_uint drvMajor = 0;
    cl_uint drvMinor = 0;

    if (!mwIsNvidiaGPUDevice(di))
    {
        return CL_FALSE;
    }

    if (sscanf(di->driver, "%u.%u", &drvMajor, &drvMinor) != 2)
    {
        return CL_FALSE;
    }

    /* The major version decides unless it ties; then the minor version does. */
    if (drvMajor != minMajor)
    {
        return drvMajor > minMajor;
    }

    return drvMinor >= minMinor;
}
/*
 * Prepare the OpenCL side of an N-body state: validate preconditions, size
 * the work, load kernels, create buffers, set kernel arguments and upload
 * the initial bodies.
 *
 * st:  state to initialize; must already have usesCL set and bodytab filled.
 * ctx: simulation context; potentialType and useQuad are read here and the
 *      context is passed on to the helpers.
 *
 * Returns NBODY_SUCCESS on success, otherwise:
 *   NBODY_CONSISTENCY_ERROR - CL not set up, or bodies not set
 *   NBODY_UNSUPPORTED       - a custom Lua potential was requested
 *   NBODY_CAPABILITY_ERROR  - nbCheckDevCapabilities() rejected the device
 *   NBODY_ERROR             - thread counts or work sizes could not be set
 *   NBODY_CL_ERROR          - any of the later OpenCL setup steps failed
 */
NBodyStatus nbInitNBodyStateCL(NBodyState* st, const NBodyCtx* ctx)
{
    const DevInfo* dev;
    cl_int rc;
    int nvidiaWithInlinePTX;

    if (!st->usesCL)
    {
        mw_printf("CL not setup for CL state initialization\n");
        return NBODY_CONSISTENCY_ERROR;
    }

    /* Bodies must be set before trying to use this */
    if (!st->bodytab)
    {
        mw_printf("Bodies not set for CL state initialization\n");
        return NBODY_CONSISTENCY_ERROR;
    }

    if (ctx->potentialType == EXTERNAL_POTENTIAL_CUSTOM_LUA)
    {
        mw_printf("Cannot use Lua potential with OpenCL\n");
        return NBODY_UNSUPPORTED;
    }

    dev = &st->ci->di;

    if (!nbCheckDevCapabilities(dev, ctx, st->nbody))
    {
        return NBODY_CAPABILITY_ERROR;
    }

    /* Work sizes are only attempted once the thread counts were accepted. */
    if (nbSetThreadCounts(st->workSizes, dev, ctx))
    {
        return NBODY_ERROR;
    }

    if (nbSetWorkSizes(st->workSizes, dev, st->nbody, st->ignoreResponsive))
    {
        return NBODY_ERROR;
    }

    st->effNBody = nbFindEffectiveNBody(st->workSizes, st->usesExact, st->nbody);
    st->maxDepth = nbFindMaxDepthForDevice(dev, st->workSizes, ctx->useQuad);

    /* Nvidia with inline PTX counts as consistent; otherwise ask the device. */
    nvidiaWithInlinePTX = mwIsNvidiaGPUDevice(dev) && mwNvidiaInlinePTXAvailable(st->ci->plat);
    st->usesConsistentMemory = nvidiaWithInlinePTX || mwDeviceHasConsistentMemory(dev);

    if (nbLoadKernels(ctx, st))
    {
        return NBODY_CL_ERROR;
    }

    rc = nbCreateBuffers(ctx, st);
    if (rc != CL_SUCCESS)
    {
        return NBODY_CL_ERROR;
    }

    rc = nbSetInitialTreeStatus(st);
    if (rc != CL_SUCCESS)
    {
        return NBODY_CL_ERROR;
    }

    rc = nbSetAllKernelArguments(st);
    if (rc != CL_SUCCESS)
    {
        return NBODY_CL_ERROR;
    }

    rc = nbMarshalBodies(st, CL_TRUE);
    if (rc != CL_SUCCESS)
    {
        mw_printf("Error marshalling initial bodies\n");
        return NBODY_CL_ERROR;
    }

    return NBODY_SUCCESS;
}
/*
 * Store rc in *firstErr unless an earlier failure is already recorded there.
 * OR-ing OpenCL status codes together yields a value that can name an error
 * which never occurred, so only the first failing status is kept.
 */
static void mwKeepFirstCLError(cl_int* firstErr, cl_int rc)
{
    if (*firstErr == CL_SUCCESS)
    {
        *firstErr = rc;
    }
}

/*
 * Fill di with the properties of the OpenCL device dev.
 *
 * di:  output structure; di->devID is set to dev and the remaining fields
 *      are filled from clGetDeviceInfo() and the vendor-specific helpers.
 * dev: device to query.
 *
 * Returns CL_SUCCESS when every query succeeded, otherwise the status of the
 * first query that failed (also reported through mwPerrorCL()). All base
 * queries are always attempted. If any of them fails, the derived fields
 * (nonOutput, hasGraphicsQOS, aluPerCU, doubleFrac, calTarget, doubleExts)
 * are not computed, because they would be based on fields that may not have
 * been written; warpSize and the compute capability are left at 0.
 */
cl_int mwGetDevInfo(DevInfo* di, cl_device_id dev)
{
    const AMDGPUData* amdData;
    cl_int err = CL_SUCCESS;

    di->devID = dev;

    mwKeepFirstCLError(&err, clGetDeviceInfo(dev, CL_DEVICE_TYPE, sizeof(di->devType), &di->devType, NULL));
    mwKeepFirstCLError(&err, clGetDeviceInfo(dev, CL_DEVICE_NAME, sizeof(di->devName), di->devName, NULL));
    mwKeepFirstCLError(&err, clGetDeviceInfo(dev, CL_DEVICE_VENDOR, sizeof(di->vendor), di->vendor, NULL));
    mwKeepFirstCLError(&err, clGetDeviceInfo(dev, CL_DEVICE_VENDOR_ID, sizeof(cl_uint), &di->vendorID, NULL));
    mwKeepFirstCLError(&err, clGetDeviceInfo(dev, CL_DRIVER_VERSION, sizeof(di->driver), di->driver, NULL));
    mwKeepFirstCLError(&err, clGetDeviceInfo(dev, CL_DEVICE_VERSION, sizeof(di->version), di->version, NULL));
    /* Disabled query: CL_DEVICE_OPENCL_C_VERSION into di->clCVer */
    mwKeepFirstCLError(&err, clGetDeviceInfo(dev, CL_DEVICE_ENDIAN_LITTLE, sizeof(cl_bool), &di->littleEndian, NULL));
    mwKeepFirstCLError(&err, clGetDeviceInfo(dev, CL_DEVICE_ERROR_CORRECTION_SUPPORT, sizeof(cl_bool), &di->errCorrect, NULL));
    mwKeepFirstCLError(&err, clGetDeviceInfo(dev, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool), &di->imgSupport, NULL));
    mwKeepFirstCLError(&err, clGetDeviceInfo(dev, CL_DEVICE_ADDRESS_BITS, sizeof(cl_uint), &di->addrBits, NULL));
    mwKeepFirstCLError(&err, clGetDeviceInfo(dev, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &di->maxCompUnits, NULL));
    mwKeepFirstCLError(&err, clGetDeviceInfo(dev, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(cl_uint), &di->clockFreq, NULL));
    mwKeepFirstCLError(&err, clGetDeviceInfo(dev, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &di->memSize, NULL));
    mwKeepFirstCLError(&err, clGetDeviceInfo(dev, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &di->maxMemAlloc, NULL));
    mwKeepFirstCLError(&err, clGetDeviceInfo(dev, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, sizeof(cl_ulong), &di->gMemCache, NULL));
    mwKeepFirstCLError(&err, clGetDeviceInfo(dev, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof(cl_uint), &di->cachelineSize, NULL));
    /* Disabled query: CL_DEVICE_HOST_UNIFIED_MEMORY */
    mwKeepFirstCLError(&err, clGetDeviceInfo(dev, CL_DEVICE_LOCAL_MEM_TYPE, sizeof(cl_device_local_mem_type), &di->localMemType, NULL));
    mwKeepFirstCLError(&err, clGetDeviceInfo(dev, CL_DEVICE_DOUBLE_FP_CONFIG, sizeof(cl_device_fp_config), &di->doubleFPConfig, NULL));
    mwKeepFirstCLError(&err, clGetDeviceInfo(dev, CL_DEVICE_SINGLE_FP_CONFIG, sizeof(cl_device_fp_config), &di->floatFPConfig, NULL));
    mwKeepFirstCLError(&err, clGetDeviceInfo(dev, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &di->localMemSize, NULL));
    mwKeepFirstCLError(&err, clGetDeviceInfo(dev, CL_DEVICE_MAX_CONSTANT_ARGS, sizeof(cl_uint), &di->maxConstArgs, NULL));
    mwKeepFirstCLError(&err, clGetDeviceInfo(dev, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(cl_ulong), &di->maxConstBufSize, NULL));
    mwKeepFirstCLError(&err, clGetDeviceInfo(dev, CL_DEVICE_MAX_PARAMETER_SIZE, sizeof(size_t), &di->maxParamSize, NULL));
    mwKeepFirstCLError(&err, clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &di->maxWorkGroupSize, NULL));
    mwKeepFirstCLError(&err, clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), &di->maxWorkItemDim, NULL));
    mwKeepFirstCLError(&err, clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(di->maxWorkItemSizes), di->maxWorkItemSizes, NULL));
    mwKeepFirstCLError(&err, clGetDeviceInfo(dev, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &di->memBaseAddrAlign, NULL));
    mwKeepFirstCLError(&err, clGetDeviceInfo(dev, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, sizeof(cl_uint), &di->minAlignSize, NULL));
    mwKeepFirstCLError(&err, clGetDeviceInfo(dev, CL_DEVICE_PROFILING_TIMER_RESOLUTION, sizeof(size_t), &di->timerRes, NULL));
    mwKeepFirstCLError(&err, clGetDeviceInfo(dev, CL_DEVICE_EXTENSIONS, sizeof(di->exts), di->exts, NULL));

    di->computeCapabilityMajor = di->computeCapabilityMinor = 0;
    di->warpSize = 0;

    /* Everything below reads fields filled in above (exts, devType, vendor
     * data), so stop here if any of those queries failed. */
    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error getting device information");
        return err;
    }

    if (strstr(di->exts, "cl_nv_device_attribute_query") != NULL)
    {
        /* These attributes are only queried when the extension is advertised. */
        mwKeepFirstCLError(&err, clGetDeviceInfo(dev, CL_DEVICE_WARP_SIZE_NV, sizeof(di->warpSize), &di->warpSize, NULL));
        mwKeepFirstCLError(&err, clGetDeviceInfo(dev, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof(cl_uint), &di->computeCapabilityMajor, NULL));
        mwKeepFirstCLError(&err, clGetDeviceInfo(dev, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, sizeof(cl_uint), &di->computeCapabilityMinor, NULL));
    }
    else if (di->devType == CL_DEVICE_TYPE_CPU)
    {
        di->warpSize = 1;
    }
    else if (di->devType == CL_DEVICE_TYPE_GPU)
    {
        /* FIXME: How do I get this on AMD? It's 64 for all of
         * the high end stuff, but 32 for lower. I think it's
         * 64 for all the GPUs that do have doubles */
        di->warpSize = 64;
    }
    else
    {
        mw_printf("Unknown device type, using warp size = 1\n");
        di->warpSize = 1;
    }

    di->nonOutput = mwDeviceIsNonOutput(di);
    di->hasGraphicsQOS = mwDeviceHasGraphicsQOS(di);

    if (mwIsNvidiaGPUDevice(di))
    {
        /* The warp size and compute capability were already fetched above
         * when the NV attribute query extension is present. */
        di->aluPerCU = mwCUDACoresPerComputeUnit(di);
        di->doubleFrac = mwCUDAEstimateDoubleFrac(di);
        di->calTarget = MW_CAL_TARGET_INVALID;
    }
    else if (mwIsAMDGPUDevice(di))
    {
        /* The AMD table entry overrides the generic GPU warp size guess. */
        amdData = mwLookupAMDGPUInfo(di);

        di->aluPerCU = amdData->aluPerCU;
        di->doubleFrac = amdData->doubleFrac;
        di->calTarget = amdData->target;
        di->warpSize = amdData->wavefrontSize;
    }

    /* A zero can still arrive from the NV query or the AMD table entry. */
    if (di->warpSize == 0)
    {
        mw_printf("Unknown device type, using warp size = 1\n");
        di->warpSize = 1;
    }

    if (err != CL_SUCCESS)
    {
        mwPerrorCL(err, "Error getting device information");
    }
    else
    {
        di->doubleExts = mwGetDoubleExts(di->exts);
    }

    return err;
}