bool valid() const { EGLWindow *window = getEGLWindow(); EGLDisplay display = window->getDisplay(); if (!eglDisplayExtensionEnabled(display, "EGL_ANGLE_d3d_texture_client_buffer")) { std::cout << "Test skipped due to missing EGL_ANGLE_d3d_texture_client_buffer" << std::endl; return false; } if (!mD3D11Device && !mD3D9Device) { std::cout << "Test skipped due to no D3D devices being available." << std::endl; return false; } if (IsWindows() && IsAMD() && IsOpenGL()) { std::cout << "Test skipped on Windows AMD OpenGL." << std::endl; return false; } if (IsWindows() && IsIntel() && IsOpenGL()) { std::cout << "Test skipped on Windows Intel OpenGL." << std::endl; return false; } return true; }
// todo(jonahr): Eventually could add support for all conditions/operating // systems, but these are the ones in use for now void validateConfigBase(const GPUTestConfig &config) { EXPECT_EQ(IsWindows(), config.getConditions()[GPUTestConfig::kConditionWin]); EXPECT_EQ(IsOSX(), config.getConditions()[GPUTestConfig::kConditionMac]); EXPECT_EQ(IsLinux(), config.getConditions()[GPUTestConfig::kConditionLinux]); EXPECT_EQ(IsAndroid(), config.getConditions()[GPUTestConfig::kConditionAndroid]); EXPECT_EQ(IsNexus5X(), config.getConditions()[GPUTestConfig::kConditionNexus5X]); EXPECT_EQ(IsPixel2(), config.getConditions()[GPUTestConfig::kConditionPixel2]); EXPECT_EQ(IsIntel(), config.getConditions()[GPUTestConfig::kConditionIntel]); EXPECT_EQ(IsAMD(), config.getConditions()[GPUTestConfig::kConditionAMD]); EXPECT_EQ(IsNVIDIA(), config.getConditions()[GPUTestConfig::kConditionNVIDIA]); EXPECT_EQ(IsDebug(), config.getConditions()[GPUTestConfig::kConditionDebug]); EXPECT_EQ(IsRelease(), config.getConditions()[GPUTestConfig::kConditionRelease]); }
void StateManager9::initialize() { mUsingZeroColorMaskWorkaround = IsAMD(mRenderer9->getVendorId()); }
void GenerateCaps(IDirect3D9 *d3d9, IDirect3DDevice9 *device, D3DDEVTYPE deviceType, UINT adapter, gl::Caps *caps, gl::TextureCapsMap *textureCapsMap, gl::Extensions *extensions, gl::Limitations *limitations) { D3DCAPS9 deviceCaps; if (FAILED(d3d9->GetDeviceCaps(adapter, deviceType, &deviceCaps))) { // Can't continue with out device caps return; } D3DDISPLAYMODE currentDisplayMode; d3d9->GetAdapterDisplayMode(adapter, ¤tDisplayMode); GLuint maxSamples = 0; const gl::FormatSet &allFormats = gl::GetAllSizedInternalFormats(); for (gl::FormatSet::const_iterator internalFormat = allFormats.begin(); internalFormat != allFormats.end(); ++internalFormat) { gl::TextureCaps textureCaps = GenerateTextureFormatCaps(*internalFormat, d3d9, deviceType, adapter, currentDisplayMode.Format); textureCapsMap->insert(*internalFormat, textureCaps); maxSamples = std::max(maxSamples, textureCaps.getMaxSamples()); if (gl::GetInternalFormatInfo(*internalFormat).compressed) { caps->compressedTextureFormats.push_back(*internalFormat); } } // GL core feature limits caps->maxElementIndex = static_cast<GLint64>(std::numeric_limits<unsigned int>::max()); // 3D textures are unimplemented in D3D9 caps->max3DTextureSize = 1; // Only one limit in GL, use the minimum dimension caps->max2DTextureSize = std::min(deviceCaps.MaxTextureWidth, deviceCaps.MaxTextureHeight); // D3D treats cube maps as a special case of 2D textures caps->maxCubeMapTextureSize = caps->max2DTextureSize; // Array textures are not available in D3D9 caps->maxArrayTextureLayers = 1; // ES3-only feature caps->maxLODBias = 0.0f; // No specific limits on render target size, maximum 2D texture size is equivalent caps->maxRenderbufferSize = caps->max2DTextureSize; // Draw buffers are not supported in D3D9 caps->maxDrawBuffers = 1; caps->maxColorAttachments = 1; // No specific limits on viewport size, maximum 2D texture size is equivalent caps->maxViewportWidth = caps->max2DTextureSize; caps->maxViewportHeight = caps->maxViewportWidth; // Point size is clamped to 1.0f when the shader model is less than 3 caps->minAliasedPointSize = 1.0f; caps->maxAliasedPointSize = ((D3DSHADER_VERSION_MAJOR(deviceCaps.PixelShaderVersion) >= 3) ? deviceCaps.MaxPointSize : 1.0f); // Wide lines not supported caps->minAliasedLineWidth = 1.0f; caps->maxAliasedLineWidth = 1.0f; // Primitive count limits (unused in ES2) caps->maxElementsIndices = 0; caps->maxElementsVertices = 0; // Program and shader binary formats (no supported shader binary formats) caps->programBinaryFormats.push_back(GL_PROGRAM_BINARY_ANGLE); caps->vertexHighpFloat.setIEEEFloat(); caps->vertexMediumpFloat.setIEEEFloat(); caps->vertexLowpFloat.setIEEEFloat(); caps->fragmentHighpFloat.setIEEEFloat(); caps->fragmentMediumpFloat.setIEEEFloat(); caps->fragmentLowpFloat.setIEEEFloat(); // Some (most) hardware only supports single-precision floating-point numbers, // which can accurately represent integers up to +/-16777216 caps->vertexHighpInt.setSimulatedInt(24); caps->vertexMediumpInt.setSimulatedInt(24); caps->vertexLowpInt.setSimulatedInt(24); caps->fragmentHighpInt.setSimulatedInt(24); caps->fragmentMediumpInt.setSimulatedInt(24); caps->fragmentLowpInt.setSimulatedInt(24); // WaitSync is ES3-only, set to zero caps->maxServerWaitTimeout = 0; // Vertex shader limits caps->maxVertexAttributes = 16; const size_t MAX_VERTEX_CONSTANT_VECTORS_D3D9 = 256; caps->maxVertexUniformVectors = MAX_VERTEX_CONSTANT_VECTORS_D3D9 - GetReservedVertexUniformVectors(); caps->maxVertexUniformComponents = caps->maxVertexUniformVectors * 4; caps->maxVertexUniformBlocks = 0; // SM3 only supports 11 output variables, with a special 12th register for PSIZE. const size_t MAX_VERTEX_OUTPUT_VECTORS_SM3 = 9; const size_t MAX_VERTEX_OUTPUT_VECTORS_SM2 = 7; caps->maxVertexOutputComponents = ((deviceCaps.VertexShaderVersion >= D3DVS_VERSION(3, 0)) ? MAX_VERTEX_OUTPUT_VECTORS_SM3 : MAX_VERTEX_OUTPUT_VECTORS_SM2) * 4; // Only Direct3D 10 ready devices support all the necessary vertex texture formats. // We test this using D3D9 by checking support for the R16F format. if (deviceCaps.VertexShaderVersion >= D3DVS_VERSION(3, 0) && SUCCEEDED(d3d9->CheckDeviceFormat(adapter, deviceType, currentDisplayMode.Format, D3DUSAGE_QUERY_VERTEXTEXTURE, D3DRTYPE_TEXTURE, D3DFMT_R16F))) { const size_t MAX_TEXTURE_IMAGE_UNITS_VTF_SM3 = 4; caps->maxVertexTextureImageUnits = MAX_TEXTURE_IMAGE_UNITS_VTF_SM3; } else { caps->maxVertexTextureImageUnits = 0; } // Fragment shader limits const size_t MAX_PIXEL_CONSTANT_VECTORS_SM3 = 224; const size_t MAX_PIXEL_CONSTANT_VECTORS_SM2 = 32; caps->maxFragmentUniformVectors = ((deviceCaps.PixelShaderVersion >= D3DPS_VERSION(3, 0)) ? MAX_PIXEL_CONSTANT_VECTORS_SM3 : MAX_PIXEL_CONSTANT_VECTORS_SM2) - GetReservedFragmentUniformVectors(); caps->maxFragmentUniformComponents = caps->maxFragmentUniformVectors * 4; caps->maxFragmentUniformBlocks = 0; caps->maxFragmentInputComponents = caps->maxVertexOutputComponents; caps->maxTextureImageUnits = 16; caps->minProgramTexelOffset = 0; caps->maxProgramTexelOffset = 0; // Aggregate shader limits (unused in ES2) caps->maxUniformBufferBindings = 0; caps->maxUniformBlockSize = 0; caps->uniformBufferOffsetAlignment = 0; caps->maxCombinedUniformBlocks = 0; caps->maxCombinedVertexUniformComponents = 0; caps->maxCombinedFragmentUniformComponents = 0; caps->maxVaryingComponents = 0; // Aggregate shader limits caps->maxVaryingVectors = caps->maxVertexOutputComponents / 4; caps->maxCombinedTextureImageUnits = caps->maxVertexTextureImageUnits + caps->maxTextureImageUnits; // Transform feedback limits caps->maxTransformFeedbackInterleavedComponents = 0; caps->maxTransformFeedbackSeparateAttributes = 0; caps->maxTransformFeedbackSeparateComponents = 0; // Multisample limits caps->maxSamples = maxSamples; // GL extension support extensions->setTextureExtensionSupport(*textureCapsMap); extensions->elementIndexUint = deviceCaps.MaxVertexIndex >= (1 << 16); extensions->getProgramBinary = true; extensions->rgb8rgba8 = true; extensions->readFormatBGRA = true; extensions->pixelBufferObject = false; extensions->mapBuffer = false; extensions->mapBufferRange = false; // textureRG is emulated and not performant. extensions->textureRG = false; D3DADAPTER_IDENTIFIER9 adapterId = {}; if (SUCCEEDED(d3d9->GetAdapterIdentifier(adapter, 0, &adapterId))) { // ATI cards on XP have problems with non-power-of-two textures. extensions->textureNPOT = !(deviceCaps.TextureCaps & D3DPTEXTURECAPS_POW2) && !(deviceCaps.TextureCaps & D3DPTEXTURECAPS_CUBEMAP_POW2) && !(deviceCaps.TextureCaps & D3DPTEXTURECAPS_NONPOW2CONDITIONAL) && !(!isWindowsVistaOrGreater() && IsAMD(adapterId.VendorId)); // Disable depth texture support on AMD cards (See ANGLE issue 839) if (IsAMD(adapterId.VendorId)) { extensions->depthTextures = false; } } else { extensions->textureNPOT = false; } extensions->drawBuffers = false; extensions->textureStorage = true; // Must support a minimum of 2:1 anisotropy for max anisotropy to be considered supported, per the spec extensions->textureFilterAnisotropic = (deviceCaps.RasterCaps & D3DPRASTERCAPS_ANISOTROPY) != 0 && deviceCaps.MaxAnisotropy >= 2; extensions->maxTextureAnisotropy = static_cast<GLfloat>(deviceCaps.MaxAnisotropy); // Check occlusion query support by trying to create one IDirect3DQuery9 *occlusionQuery = NULL; extensions->occlusionQueryBoolean = SUCCEEDED(device->CreateQuery(D3DQUERYTYPE_OCCLUSION, &occlusionQuery)) && occlusionQuery; SafeRelease(occlusionQuery); // Check event query support by trying to create one IDirect3DQuery9 *eventQuery = NULL; extensions->fence = SUCCEEDED(device->CreateQuery(D3DQUERYTYPE_EVENT, &eventQuery)) && eventQuery; SafeRelease(eventQuery); extensions->timerQuery = false; // Unimplemented extensions->disjointTimerQuery = false; extensions->robustness = true; extensions->blendMinMax = true; extensions->framebufferBlit = true; extensions->framebufferMultisample = true; extensions->instancedArrays = deviceCaps.PixelShaderVersion >= D3DPS_VERSION(3, 0); extensions->packReverseRowOrder = true; extensions->standardDerivatives = (deviceCaps.PS20Caps.Caps & D3DPS20CAPS_GRADIENTINSTRUCTIONS) != 0; extensions->shaderTextureLOD = true; extensions->fragDepth = true; extensions->textureUsage = true; extensions->translatedShaderSource = true; extensions->fboRenderMipmap = false; extensions->discardFramebuffer = false; // It would be valid to set this to true, since glDiscardFramebufferEXT is just a hint extensions->colorBufferFloat = false; extensions->debugMarker = true; extensions->eglImage = true; extensions->eglImageExternal = true; extensions->unpackSubimage = true; extensions->packSubimage = true; extensions->syncQuery = extensions->fence; // D3D9 has no concept of separate masks and refs for front and back faces in the depth stencil // state. limitations->noSeparateStencilRefsAndMasks = true; // D3D9 shader models have limited support for looping, so the Appendix A // index/loop limitations are necessary. Workarounds that are needed to // support dynamic indexing of vectors on HLSL also don't work on D3D9. limitations->shadersRequireIndexedLoopValidation = true; // D3D9 cannot support constant color and alpha blend funcs together limitations->noSimultaneousConstantColorAndAlphaBlendFunc = true; }
void Tuner(int argc, char* argv[]) { constexpr auto kSeed = 42; // fixed seed for reproducibility // Sets the parameters and platform/device for which to tune (command-line options) auto command_line_args = RetrieveCommandLineArguments(argc, argv); auto help = std::string{"* Options given/available:\n"}; auto args = Arguments<T>{}; args.platform_id = GetArgument(command_line_args, help, kArgPlatform, ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0})); args.device_id = GetArgument(command_line_args, help, kArgDevice, ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0})); args.precision = GetArgument(command_line_args, help, kArgPrecision, Precision::kSingle); for (auto &o: C::GetOptions()) { if (o == kArgM) { args.m = GetArgument(command_line_args, help, kArgM, C::DefaultM()); } if (o == kArgN) { args.n = GetArgument(command_line_args, help, kArgN, C::DefaultN()); } if (o == kArgK) { args.k = GetArgument(command_line_args, help, kArgK, C::DefaultK()); } if (o == kArgAlpha) { args.alpha = GetArgument(command_line_args, help, kArgAlpha, GetScalar<T>()); } if (o == kArgBeta) { args.beta = GetArgument(command_line_args, help, kArgBeta, GetScalar<T>()); } if (o == kArgFraction) { args.fraction = GetArgument(command_line_args, help, kArgFraction, C::DefaultFraction()); } if (o == kArgBatchCount) { args.batch_count = GetArgument(command_line_args, help, kArgBatchCount, C::DefaultBatchCount()); } if (o == tStrategy) {args.tStrategy = GetArgument(command_line_args, help, tStrategy, DEFAULT_STRATEGY); } if (o == psoSwarmSize) {args.psoSwarmSize = GetArgument(command_line_args, help, psoSwarmSize, DEFAULT_PSO_SWARM); } if (o == psoInfG) {args.psoInfG = GetArgument(command_line_args, help, psoInfG, DEFAULT_PSO_G); } if (o == psoInfL) {args.psoInfL = GetArgument(command_line_args, help, psoInfL, DEFAULT_PSO_L); } if (o == psoInfR) {args.psoInfR = GetArgument(command_line_args, help, psoInfR, DEFAULT_PSO_R); } } const auto num_runs = GetArgument(command_line_args, help, kArgNumRuns, C::DefaultNumRuns()); fprintf(stdout, "%s\n", help.c_str()); // Tests validity of the given arguments C::TestValidArguments(args); // Tests for validity of the precision and retrieves properties auto isAMD = false; auto isARM = false; auto isGPU = false; { const auto platform = Platform(args.platform_id); const auto device = Device(platform, args.device_id); if (!PrecisionSupported<T>(device)) { printf("* Unsupported precision, skipping this tuning run\n\n"); return; } isAMD = device.IsAMD(); isARM = device.IsARM(); isGPU = device.IsGPU(); } // Creates input buffers with random data auto x_vec = std::vector<T>(C::GetSizeX(args)); auto y_vec = std::vector<T>(C::GetSizeY(args)); auto a_mat = std::vector<T>(C::GetSizeA(args)); auto b_mat = std::vector<T>(C::GetSizeB(args)); auto c_mat = std::vector<T>(C::GetSizeC(args)); auto temp = std::vector<T>(C::GetSizeTemp(args)); std::mt19937 mt(kSeed); std::uniform_real_distribution<double> dist(kTestDataLowerLimit, kTestDataUpperLimit); PopulateVector(x_vec, mt, dist); PopulateVector(y_vec, mt, dist); PopulateVector(a_mat, mt, dist); PopulateVector(b_mat, mt, dist); PopulateVector(c_mat, mt, dist); PopulateVector(temp, mt, dist); // Initializes the tuner for the chosen device cltune::Tuner tuner(args.platform_id, args.device_id); // Use full-search to explore all parameter combinations or random-search to search only a part of // the parameter values. The fraction is set as a command-line argument. #ifdef XGEMM_EXEC if(tStrategyFlag) { auto localtStrategy = args.tStrategy; if (args.fraction == 1.0 || args.fraction == 0.0) { localtStrategy = FULL_SEARCH_STRATEGY; } switch (localtStrategy) { case FULL_SEARCH_STRATEGY: tuner.UseFullSearch(); break; case RANDOM_SEARCH_STRATEGY: tuner.UseRandomSearch(1.0/args.fraction); break; case PSO_STRATEGY: tuner.UsePSO(1.0/args.fraction, args.psoSwarmSize, args.psoInfG, args.psoInfL, args.psoInfR); break; case DVDT_STRATEGY: default: tuner.UseFullSearch(); } } #else if (args.fraction == 1.0 || args.fraction == 0.0) { tuner.UseFullSearch(); } else { tuner.UseRandomSearch(1.0/args.fraction); } #endif // Set extra settings for specific defines. This mimics src/routine.cc. auto defines = std::string{""}; if (isAMD && isGPU) { defines += "#define USE_CL_MAD 1\n"; defines += "#define USE_STAGGERED_INDICES 1\n"; } if (isARM && isGPU) { defines += "#define GLOBAL_MEM_FENCE 1\n"; } // Loads the kernel sources and defines the kernel to tune auto sources = defines + C::GetSources(); auto id = tuner.AddKernelFromString(sources, C::KernelName(), C::GlobalSize(args), C::LocalSize()); tuner.SetReferenceFromString(sources, C::KernelName(), C::GlobalSizeRef(args), C::LocalSizeRef()); // Sets the tunable parameters and their possible values C::SetParameters(tuner, id); C::SetConstraints(tuner, id); C::SetLocalMemorySize(tuner, id, args); // Tests for a specific precision tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)}); tuner.AddParameterReference("PRECISION", static_cast<size_t>(args.precision)); // Modifies the thread-sizes (both global and local) based on the parameters for (auto ¶meters: C::MulLocal()) { tuner.MulLocalSize(id, parameters); } for (auto ¶meters: C::DivLocal()) { tuner.DivLocalSize(id, parameters); } for (auto ¶meters: C::MulGlobal()) { tuner.MulGlobalSize(id, parameters); } for (auto ¶meters: C::DivGlobal()) { tuner.DivGlobalSize(id, parameters); } // Sets the function's arguments C::SetArguments(tuner, args, x_vec, y_vec, a_mat, b_mat, c_mat, temp); // Starts the tuning process tuner.SetNumRuns(num_runs); tuner.Tune(); // Prints the results to screen auto time_ms = tuner.PrintToScreen(); tuner.PrintFormatted(); // Also prints the performance of the best-case in terms of GB/s or GFLOPS if (time_ms != 0.0) { printf("[ -------> ] %.2lf ms", time_ms); printf(" or %.1lf %s\n", C::GetMetric(args)/(time_ms*1.0e6), C::PerformanceUnit().c_str()); } // Outputs the results as JSON to disk, including some meta-data auto precision_string = std::to_string(static_cast<size_t>(args.precision)); auto metadata = std::vector<std::pair<std::string,std::string>>{ {"kernel_family", C::KernelFamily()}, {"precision", precision_string} }; for (auto &o: C::GetOptions()) { if (o == kArgM) { metadata.push_back({"arg_m", std::to_string(args.m)}); } if (o == kArgN) { metadata.push_back({"arg_n", std::to_string(args.n)}); } if (o == kArgK) { metadata.push_back({"arg_k", std::to_string(args.k)}); } if (o == kArgAlpha) { metadata.push_back({"arg_alpha", ToString(args.alpha)}); } if (o == kArgBeta) { metadata.push_back({"arg_beta", ToString(args.beta)}); } if (o == kArgBatchCount) { metadata.push_back({"arg_batch_count", ToString(args.batch_count)}); } } tuner.PrintJSON("clblast_"+C::KernelFamily()+"_"+precision_string+".json", metadata); }