/*! \brief Clears nonbonded shift force output array and energy outputs on the GPU. */ static void nbnxn_ocl_clear_e_fshift(gmx_nbnxn_ocl_t *nb) { cl_int cl_error; cl_atomdata_t * adat = nb->atdat; cl_command_queue ls = nb->stream[eintLocal]; size_t local_work_size[3] = {1, 1, 1}; size_t global_work_size[3] = {1, 1, 1}; cl_int shifts = SHIFTS*3; cl_int arg_no; cl_kernel zero_e_fshift = nb->kernel_zero_e_fshift; local_work_size[0] = 64; // Round the total number of threads up from the array size global_work_size[0] = ((shifts + local_work_size[0] - 1)/local_work_size[0])*local_work_size[0]; arg_no = 0; cl_error = clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->fshift)); cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->e_lj)); cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_mem), &(adat->e_el)); cl_error |= clSetKernelArg(zero_e_fshift, arg_no++, sizeof(cl_uint), &shifts); GMX_ASSERT(cl_error == CL_SUCCESS, ocl_get_error_string(cl_error).c_str()); cl_error = clEnqueueNDRangeKernel(ls, zero_e_fshift, 3, nullptr, global_work_size, local_work_size, 0, nullptr, nullptr); GMX_ASSERT(cl_error == CL_SUCCESS, ocl_get_error_string(cl_error).c_str()); }
//! This function is documented in the header file bool canDetectGpus(std::string *errorMessage) { cl_uint numPlatforms; cl_int status = clGetPlatformIDs(0, nullptr, &numPlatforms); GMX_ASSERT(status != CL_INVALID_VALUE, "Incorrect call of clGetPlatformIDs detected"); #ifdef cl_khr_icd if (status == CL_PLATFORM_NOT_FOUND_KHR) { // No valid ICDs found if (errorMessage != nullptr) { errorMessage->assign("No valid OpenCL driver found"); } return false; } #endif GMX_RELEASE_ASSERT(status == CL_SUCCESS, gmx::formatString("An unexpected value was returned from clGetPlatformIDs %d: %s", status, ocl_get_error_string(status).c_str()).c_str()); bool foundPlatform = (numPlatforms > 0); if (!foundPlatform && errorMessage != nullptr) { errorMessage->assign("No OpenCL platforms found even though the driver was valid"); } return foundPlatform; }
/*! \brief Creates context for OpenCL GPU given by \p mygpu * * A fatal error results if creation fails. * * \param[inout] runtimeData runtime data including program and context * \param[in] devInfo device info struct * \param[in] rank MPI rank (for error reporting) */ static void nbnxn_gpu_create_context(gmx_device_runtime_data_t *runtimeData, const gmx_device_info_t *devInfo, int rank) { cl_context_properties context_properties[3]; cl_platform_id platform_id; cl_device_id device_id; cl_context context; cl_int cl_error; assert(runtimeData != nullptr); assert(devInfo != nullptr); platform_id = devInfo->ocl_gpu_id.ocl_platform_id; device_id = devInfo->ocl_gpu_id.ocl_device_id; context_properties[0] = CL_CONTEXT_PLATFORM; context_properties[1] = reinterpret_cast<cl_context_properties>(platform_id); context_properties[2] = 0; /* Terminates the list of properties */ context = clCreateContext(context_properties, 1, &device_id, nullptr, nullptr, &cl_error); if (CL_SUCCESS != cl_error) { gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s:\n OpenCL error %d: %s", rank, devInfo->device_name, cl_error, ocl_get_error_string(cl_error).c_str()); } runtimeData->context = context; }
/*! \brief Get the warp size reported by device * * This is platform implementation dependant and seems to only work on the Nvidia and AMD platforms! * Nvidia reports 32, AMD for GPU 64. Ignore the rest * * \param context Current OpenCL context * \param deviceId OpenCL device with the context * \return cl_int value of the warp size * * \throws InternalError if an OpenCL error was encountered */ static size_t getWarpSize(cl_context context, cl_device_id deviceId) { cl_int cl_error; const char *warpSizeKernel = "__kernel void test(__global int* test){test[get_local_id(0)] = 0;}"; cl_program program = clCreateProgramWithSource(context, 1, (const char**)&warpSizeKernel, NULL, &cl_error); if (cl_error != CL_SUCCESS) { GMX_THROW(InternalError("Could not create OpenCL program to determine warp size, error was " + ocl_get_error_string(cl_error))); } cl_error = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (cl_error != CL_SUCCESS) { GMX_THROW(InternalError("Could not build OpenCL program to determine warp size, error was " + ocl_get_error_string(cl_error))); } cl_kernel kernel = clCreateKernel(program, "test", &cl_error); if (cl_error != CL_SUCCESS) { GMX_THROW(InternalError("Could not create OpenCL kernel to determine warp size, error was " + ocl_get_error_string(cl_error))); } size_t warpSize = 0; cl_error = clGetKernelWorkGroupInfo(kernel, deviceId, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(warpSize), &warpSize, NULL); if (cl_error != CL_SUCCESS) { GMX_THROW(InternalError("Could not measure OpenCL warp size, error was " + ocl_get_error_string(cl_error))); } if (warpSize == 0) { GMX_THROW(InternalError(formatString("Did not measure a valid OpenCL warp size"))); } cl_error = clReleaseKernel(kernel); if (cl_error != CL_SUCCESS) { GMX_THROW(InternalError("Could not release OpenCL warp-size kernel, error was " + ocl_get_error_string(cl_error))); } cl_error = clReleaseProgram(program); if (cl_error != CL_SUCCESS) { GMX_THROW(InternalError("Could not release OpenCL warp-size program, error was " + ocl_get_error_string(cl_error))); } return warpSize; }
/*! \brief Clears the first natoms_clear elements of the GPU nonbonded force output array. */ static void nbnxn_ocl_clear_f(gmx_nbnxn_ocl_t *nb, int natoms_clear) { if (natoms_clear == 0) { return; } cl_int gmx_used_in_debug cl_error; cl_atomdata_t *atomData = nb->atdat; cl_command_queue ls = nb->stream[eintLocal]; cl_float value = 0.0f; cl_error = clEnqueueFillBuffer(ls, atomData->f, &value, sizeof(cl_float), 0, natoms_clear*sizeof(rvec), 0, nullptr, nullptr); GMX_RELEASE_ASSERT(cl_error == CL_SUCCESS, ("nbnxn_ocl_clear_f failed: " + ocl_get_error_string(cl_error)).c_str()); }
/*! \brief Handles writing the OpenCL JIT compilation log to \c fplog. * * If \c fplog is non-null and either the GMX_OCL_DUMP_LOG environment * variable is set or the compilation failed, then the OpenCL * compilation log is written. * * \param fplog Open file pointer to log file * \param program OpenCL program that was compiled * \param deviceId Id of the device for which compilation took place * \param kernelFilename File name containing the kernel * \param preprocessorOptions String containing the preprocessor command-line options used for the build * \param buildFailed Whether the OpenCL build succeeded * * \throws std::bad_alloc if out of memory */ static void writeOclBuildLog(FILE *fplog, cl_program program, cl_device_id deviceId, const std::string &kernelFilename, const std::string &preprocessorOptions, bool buildFailed) { bool writeOutput = ((fplog != nullptr) && (buildFailed || (getenv("GMX_OCL_DUMP_LOG") != nullptr))); if (!writeOutput) { return; } // Get build log string size size_t buildLogSize; cl_int cl_error = clGetProgramBuildInfo(program, deviceId, CL_PROGRAM_BUILD_LOG, 0, NULL, &buildLogSize); if (cl_error != CL_SUCCESS) { GMX_THROW(InternalError("Could not get OpenCL program build log size, error was " + ocl_get_error_string(cl_error))); } char *buildLog = nullptr; scoped_cptr<char> buildLogGuard; if (buildLogSize != 0) { /* Allocate memory to fit the build log, it can be very large in case of errors */ snew(buildLog, buildLogSize); buildLogGuard.reset(buildLog); /* Get the actual compilation log */ cl_error = clGetProgramBuildInfo(program, deviceId, CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL); if (cl_error != CL_SUCCESS) { GMX_THROW(InternalError("Could not get OpenCL program build log, error was " + ocl_get_error_string(cl_error))); } } std::string message; if (buildFailed) { message += "Compilation of source file " + kernelFilename + " failed!\n"; } else { message += "Compilation of source file " + kernelFilename + " was successful!\n"; } message += "-- Used build options: " + preprocessorOptions + "\n"; message += "--------------LOG START---------------\n"; message += buildLog; message += "---------------LOG END----------------\n";; fputs(message.c_str(), fplog); }
cl_program compileProgram(FILE *fplog, const std::string &kernelBaseFilename, const std::string &extraDefines, cl_context context, cl_device_id deviceId, ocl_vendor_id_t deviceVendorId) { cl_int cl_error; std::string kernelRootPath = getKernelRootPath(); GMX_RELEASE_ASSERT(fplog != nullptr, "Need a valid log file for building OpenCL programs"); /* Load OpenCL source files */ std::string kernelFilename = Path::join(kernelRootPath, kernelBaseFilename); /* Make the build options */ std::string preprocessorOptions = makePreprocessorOptions(kernelRootPath, getWarpSize(context, deviceId), deviceVendorId, extraDefines); bool buildCacheWasRead = false; std::string cacheFilename; if (useBuildCache) { cacheFilename = makeBinaryCacheFilename(kernelBaseFilename, deviceId); } /* Create OpenCL program */ cl_program program = nullptr; if (useBuildCache) { if (File::exists(cacheFilename, File::returnFalseOnError)) { /* Check if there's a valid cache available */ try { program = makeProgramFromCache(cacheFilename, context, deviceId); buildCacheWasRead = true; } catch (FileIOError &e) { // Failing to read from the cache is not a critical error formatExceptionMessageToFile(fplog, e); } } else { fprintf(fplog, "No OpenCL binary cache file was present, so will compile kernels normally.\n"); } } if (program == nullptr) { // Compile OpenCL program from source std::string kernelSource = TextReader::readFileToString(kernelFilename); if (kernelSource.empty()) { GMX_THROW(FileIOError("Error loading OpenCL code " + kernelFilename)); } const char *kernelSourcePtr = kernelSource.c_str(); size_t kernelSourceSize = kernelSource.size(); /* Create program from source code */ program = clCreateProgramWithSource(context, 1, &kernelSourcePtr, &kernelSourceSize, &cl_error); if (cl_error != CL_SUCCESS) { GMX_THROW(InternalError("Could not create OpenCL program, error was " + ocl_get_error_string(cl_error))); } } /* Build the OpenCL program, keeping the status to potentially write to the simulation log file. */ cl_int buildStatus = clBuildProgram(program, 0, NULL, preprocessorOptions.c_str(), NULL, NULL); /* Write log first, and then throw exception that the user know what is the issue even if the build fails. */ writeOclBuildLog(fplog, program, deviceId, kernelFilename, preprocessorOptions, buildStatus != CL_SUCCESS); if (buildStatus != CL_SUCCESS) { GMX_THROW(InternalError("Could not build OpenCL program, error was " + ocl_get_error_string(buildStatus))); } if (useBuildCache) { if (!buildCacheWasRead) { /* If OpenCL caching is ON, but the current cache is not valid => update it */ try { writeBinaryToCache(program, cacheFilename); } catch (GromacsException &e) { // Failing to write the cache is not a critical error formatExceptionMessageToFile(fplog, e); } } } if ((OCL_VENDOR_NVIDIA == deviceVendorId) && getenv("GMX_OCL_DUMP_INTERM_FILES")) { /* If dumping intermediate files has been requested and this is an NVIDIA card => write PTX to file */ char buffer[STRLEN]; cl_error = clGetDeviceInfo(deviceId, CL_DEVICE_NAME, sizeof(buffer), buffer, NULL); if (cl_error != CL_SUCCESS) { GMX_THROW(InternalError("Could not get OpenCL device info, error was " + ocl_get_error_string(cl_error))); } std::string ptxFilename = buffer; ptxFilename += ".ptx"; try { writeBinaryToCache(program, ptxFilename); } catch (GromacsException &e) { // Failing to write the cache is not a critical error formatExceptionMessageToFile(fplog, e); } } return program; }
//! This function is documented in the header file void findGpus(gmx_gpu_info_t *gpu_info) { cl_uint ocl_platform_count; cl_platform_id *ocl_platform_ids; cl_device_type req_dev_type = CL_DEVICE_TYPE_GPU; ocl_platform_ids = nullptr; if (getenv("GMX_OCL_FORCE_CPU") != nullptr) { req_dev_type = CL_DEVICE_TYPE_CPU; } while (true) { cl_int status = clGetPlatformIDs(0, nullptr, &ocl_platform_count); if (CL_SUCCESS != status) { GMX_THROW(gmx::InternalError(gmx::formatString("An unexpected value %d was returned from clGetPlatformIDs: ", status) + ocl_get_error_string(status))); } if (1 > ocl_platform_count) { // TODO this should have a descriptive error message that we only support one OpenCL platform break; } snew(ocl_platform_ids, ocl_platform_count); status = clGetPlatformIDs(ocl_platform_count, ocl_platform_ids, nullptr); if (CL_SUCCESS != status) { GMX_THROW(gmx::InternalError(gmx::formatString("An unexpected value %d was returned from clGetPlatformIDs: ", status) + ocl_get_error_string(status))); } for (unsigned int i = 0; i < ocl_platform_count; i++) { cl_uint ocl_device_count; /* If requesting req_dev_type devices fails, just go to the next platform */ if (CL_SUCCESS != clGetDeviceIDs(ocl_platform_ids[i], req_dev_type, 0, nullptr, &ocl_device_count)) { continue; } if (1 <= ocl_device_count) { gpu_info->n_dev += ocl_device_count; } } if (1 > gpu_info->n_dev) { break; } snew(gpu_info->gpu_dev, gpu_info->n_dev); { int device_index; cl_device_id *ocl_device_ids; snew(ocl_device_ids, gpu_info->n_dev); device_index = 0; for (unsigned int i = 0; i < ocl_platform_count; i++) { cl_uint ocl_device_count; /* If requesting req_dev_type devices fails, just go to the next platform */ if (CL_SUCCESS != clGetDeviceIDs(ocl_platform_ids[i], req_dev_type, gpu_info->n_dev, ocl_device_ids, &ocl_device_count)) { continue; } if (1 > ocl_device_count) { break; } for (unsigned int j = 0; j < ocl_device_count; j++) { gpu_info->gpu_dev[device_index].ocl_gpu_id.ocl_platform_id = ocl_platform_ids[i]; gpu_info->gpu_dev[device_index].ocl_gpu_id.ocl_device_id = ocl_device_ids[j]; gpu_info->gpu_dev[device_index].device_name[0] = 0; clGetDeviceInfo(ocl_device_ids[j], CL_DEVICE_NAME, sizeof(gpu_info->gpu_dev[device_index].device_name), gpu_info->gpu_dev[device_index].device_name, nullptr); gpu_info->gpu_dev[device_index].device_version[0] = 0; clGetDeviceInfo(ocl_device_ids[j], CL_DEVICE_VERSION, sizeof(gpu_info->gpu_dev[device_index].device_version), gpu_info->gpu_dev[device_index].device_version, nullptr); gpu_info->gpu_dev[device_index].device_vendor[0] = 0; clGetDeviceInfo(ocl_device_ids[j], CL_DEVICE_VENDOR, sizeof(gpu_info->gpu_dev[device_index].device_vendor), gpu_info->gpu_dev[device_index].device_vendor, nullptr); gpu_info->gpu_dev[device_index].compute_units = 0; clGetDeviceInfo(ocl_device_ids[j], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(gpu_info->gpu_dev[device_index].compute_units), &(gpu_info->gpu_dev[device_index].compute_units), nullptr); gpu_info->gpu_dev[device_index].adress_bits = 0; clGetDeviceInfo(ocl_device_ids[j], CL_DEVICE_ADDRESS_BITS, sizeof(gpu_info->gpu_dev[device_index].adress_bits), &(gpu_info->gpu_dev[device_index].adress_bits), nullptr); gpu_info->gpu_dev[device_index].vendor_e = get_vendor_id(gpu_info->gpu_dev[device_index].device_vendor); gpu_info->gpu_dev[device_index].stat = is_gmx_supported_gpu_id(gpu_info->gpu_dev + device_index); if (egpuCompatible == gpu_info->gpu_dev[device_index].stat) { gpu_info->n_dev_compatible++; } device_index++; } } gpu_info->n_dev = device_index; /* Dummy sort of devices - AMD first, then NVIDIA, then Intel */ // TODO: Sort devices based on performance. if (0 < gpu_info->n_dev) { int last = -1; for (int i = 0; i < gpu_info->n_dev; i++) { if (OCL_VENDOR_AMD == gpu_info->gpu_dev[i].vendor_e) { last++; if (last < i) { gmx_device_info_t ocl_gpu_info; ocl_gpu_info = gpu_info->gpu_dev[i]; gpu_info->gpu_dev[i] = gpu_info->gpu_dev[last]; gpu_info->gpu_dev[last] = ocl_gpu_info; } } } /* if more than 1 device left to be sorted */ if ((gpu_info->n_dev - 1 - last) > 1) { for (int i = 0; i < gpu_info->n_dev; i++) { if (OCL_VENDOR_NVIDIA == gpu_info->gpu_dev[i].vendor_e) { last++; if (last < i) { gmx_device_info_t ocl_gpu_info; ocl_gpu_info = gpu_info->gpu_dev[i]; gpu_info->gpu_dev[i] = gpu_info->gpu_dev[last]; gpu_info->gpu_dev[last] = ocl_gpu_info; } } } } } sfree(ocl_device_ids); } break; } sfree(ocl_platform_ids); }