예제 #1
0
    bool valid() const
    {
        EGLWindow *window  = getEGLWindow();
        EGLDisplay display = window->getDisplay();
        if (!eglDisplayExtensionEnabled(display, "EGL_ANGLE_d3d_texture_client_buffer"))
        {
            std::cout << "Test skipped due to missing EGL_ANGLE_d3d_texture_client_buffer"
                      << std::endl;
            return false;
        }

        if (!mD3D11Device && !mD3D9Device)
        {
            std::cout << "Test skipped due to no D3D devices being available." << std::endl;
            return false;
        }

        if (IsWindows() && IsAMD() && IsOpenGL())
        {
            std::cout << "Test skipped on Windows AMD OpenGL." << std::endl;
            return false;
        }

        if (IsWindows() && IsIntel() && IsOpenGL())
        {
            std::cout << "Test skipped on Windows Intel OpenGL." << std::endl;
            return false;
        }
        return true;
    }
예제 #2
0
 // todo(jonahr): Eventually could add support for all conditions/operating
 // systems, but these are the ones in use for now
 void validateConfigBase(const GPUTestConfig &config)
 {
     EXPECT_EQ(IsWindows(), config.getConditions()[GPUTestConfig::kConditionWin]);
     EXPECT_EQ(IsOSX(), config.getConditions()[GPUTestConfig::kConditionMac]);
     EXPECT_EQ(IsLinux(), config.getConditions()[GPUTestConfig::kConditionLinux]);
     EXPECT_EQ(IsAndroid(), config.getConditions()[GPUTestConfig::kConditionAndroid]);
     EXPECT_EQ(IsNexus5X(), config.getConditions()[GPUTestConfig::kConditionNexus5X]);
     EXPECT_EQ(IsPixel2(), config.getConditions()[GPUTestConfig::kConditionPixel2]);
     EXPECT_EQ(IsIntel(), config.getConditions()[GPUTestConfig::kConditionIntel]);
     EXPECT_EQ(IsAMD(), config.getConditions()[GPUTestConfig::kConditionAMD]);
     EXPECT_EQ(IsNVIDIA(), config.getConditions()[GPUTestConfig::kConditionNVIDIA]);
     EXPECT_EQ(IsDebug(), config.getConditions()[GPUTestConfig::kConditionDebug]);
     EXPECT_EQ(IsRelease(), config.getConditions()[GPUTestConfig::kConditionRelease]);
 }
예제 #3
0
void StateManager9::initialize()
{
    mUsingZeroColorMaskWorkaround = IsAMD(mRenderer9->getVendorId());
}
예제 #4
0
void GenerateCaps(IDirect3D9 *d3d9,
                  IDirect3DDevice9 *device,
                  D3DDEVTYPE deviceType,
                  UINT adapter,
                  gl::Caps *caps,
                  gl::TextureCapsMap *textureCapsMap,
                  gl::Extensions *extensions,
                  gl::Limitations *limitations)
{
    D3DCAPS9 deviceCaps;
    if (FAILED(d3d9->GetDeviceCaps(adapter, deviceType, &deviceCaps)))
    {
        // Can't continue with out device caps
        return;
    }

    D3DDISPLAYMODE currentDisplayMode;
    d3d9->GetAdapterDisplayMode(adapter, &currentDisplayMode);

    GLuint maxSamples = 0;
    const gl::FormatSet &allFormats = gl::GetAllSizedInternalFormats();
    for (gl::FormatSet::const_iterator internalFormat = allFormats.begin(); internalFormat != allFormats.end(); ++internalFormat)
    {
        gl::TextureCaps textureCaps = GenerateTextureFormatCaps(*internalFormat, d3d9, deviceType, adapter,
                                                                currentDisplayMode.Format);
        textureCapsMap->insert(*internalFormat, textureCaps);

        maxSamples = std::max(maxSamples, textureCaps.getMaxSamples());

        if (gl::GetInternalFormatInfo(*internalFormat).compressed)
        {
            caps->compressedTextureFormats.push_back(*internalFormat);
        }
    }

    // GL core feature limits
    caps->maxElementIndex = static_cast<GLint64>(std::numeric_limits<unsigned int>::max());

    // 3D textures are unimplemented in D3D9
    caps->max3DTextureSize = 1;

    // Only one limit in GL, use the minimum dimension
    caps->max2DTextureSize = std::min(deviceCaps.MaxTextureWidth, deviceCaps.MaxTextureHeight);

    // D3D treats cube maps as a special case of 2D textures
    caps->maxCubeMapTextureSize = caps->max2DTextureSize;

    // Array textures are not available in D3D9
    caps->maxArrayTextureLayers = 1;

    // ES3-only feature
    caps->maxLODBias = 0.0f;

    // No specific limits on render target size, maximum 2D texture size is equivalent
    caps->maxRenderbufferSize = caps->max2DTextureSize;

    // Draw buffers are not supported in D3D9
    caps->maxDrawBuffers = 1;
    caps->maxColorAttachments = 1;

    // No specific limits on viewport size, maximum 2D texture size is equivalent
    caps->maxViewportWidth = caps->max2DTextureSize;
    caps->maxViewportHeight = caps->maxViewportWidth;

    // Point size is clamped to 1.0f when the shader model is less than 3
    caps->minAliasedPointSize = 1.0f;
    caps->maxAliasedPointSize = ((D3DSHADER_VERSION_MAJOR(deviceCaps.PixelShaderVersion) >= 3) ? deviceCaps.MaxPointSize : 1.0f);

    // Wide lines not supported
    caps->minAliasedLineWidth = 1.0f;
    caps->maxAliasedLineWidth = 1.0f;

    // Primitive count limits (unused in ES2)
    caps->maxElementsIndices = 0;
    caps->maxElementsVertices = 0;

    // Program and shader binary formats (no supported shader binary formats)
    caps->programBinaryFormats.push_back(GL_PROGRAM_BINARY_ANGLE);

    caps->vertexHighpFloat.setIEEEFloat();
    caps->vertexMediumpFloat.setIEEEFloat();
    caps->vertexLowpFloat.setIEEEFloat();
    caps->fragmentHighpFloat.setIEEEFloat();
    caps->fragmentMediumpFloat.setIEEEFloat();
    caps->fragmentLowpFloat.setIEEEFloat();

    // Some (most) hardware only supports single-precision floating-point numbers,
    // which can accurately represent integers up to +/-16777216
    caps->vertexHighpInt.setSimulatedInt(24);
    caps->vertexMediumpInt.setSimulatedInt(24);
    caps->vertexLowpInt.setSimulatedInt(24);
    caps->fragmentHighpInt.setSimulatedInt(24);
    caps->fragmentMediumpInt.setSimulatedInt(24);
    caps->fragmentLowpInt.setSimulatedInt(24);

    // WaitSync is ES3-only, set to zero
    caps->maxServerWaitTimeout = 0;

    // Vertex shader limits
    caps->maxVertexAttributes = 16;

    const size_t MAX_VERTEX_CONSTANT_VECTORS_D3D9 = 256;
    caps->maxVertexUniformVectors =
        MAX_VERTEX_CONSTANT_VECTORS_D3D9 - GetReservedVertexUniformVectors();
    caps->maxVertexUniformComponents = caps->maxVertexUniformVectors * 4;

    caps->maxVertexUniformBlocks = 0;

    // SM3 only supports 11 output variables, with a special 12th register for PSIZE.
    const size_t MAX_VERTEX_OUTPUT_VECTORS_SM3 = 9;
    const size_t MAX_VERTEX_OUTPUT_VECTORS_SM2 = 7;
    caps->maxVertexOutputComponents = ((deviceCaps.VertexShaderVersion >= D3DVS_VERSION(3, 0)) ? MAX_VERTEX_OUTPUT_VECTORS_SM3
                                                                                               : MAX_VERTEX_OUTPUT_VECTORS_SM2) * 4;

    // Only Direct3D 10 ready devices support all the necessary vertex texture formats.
    // We test this using D3D9 by checking support for the R16F format.
    if (deviceCaps.VertexShaderVersion >= D3DVS_VERSION(3, 0) &&
        SUCCEEDED(d3d9->CheckDeviceFormat(adapter, deviceType, currentDisplayMode.Format,
                                          D3DUSAGE_QUERY_VERTEXTEXTURE, D3DRTYPE_TEXTURE, D3DFMT_R16F)))
    {
        const size_t MAX_TEXTURE_IMAGE_UNITS_VTF_SM3 = 4;
        caps->maxVertexTextureImageUnits = MAX_TEXTURE_IMAGE_UNITS_VTF_SM3;
    }
    else
    {
        caps->maxVertexTextureImageUnits = 0;
    }

    // Fragment shader limits
    const size_t MAX_PIXEL_CONSTANT_VECTORS_SM3 = 224;
    const size_t MAX_PIXEL_CONSTANT_VECTORS_SM2 = 32;
    caps->maxFragmentUniformVectors =
        ((deviceCaps.PixelShaderVersion >= D3DPS_VERSION(3, 0)) ? MAX_PIXEL_CONSTANT_VECTORS_SM3
                                                                : MAX_PIXEL_CONSTANT_VECTORS_SM2) -
        GetReservedFragmentUniformVectors();
    caps->maxFragmentUniformComponents = caps->maxFragmentUniformVectors * 4;
    caps->maxFragmentUniformBlocks = 0;
    caps->maxFragmentInputComponents = caps->maxVertexOutputComponents;
    caps->maxTextureImageUnits = 16;
    caps->minProgramTexelOffset = 0;
    caps->maxProgramTexelOffset = 0;

    // Aggregate shader limits (unused in ES2)
    caps->maxUniformBufferBindings = 0;
    caps->maxUniformBlockSize = 0;
    caps->uniformBufferOffsetAlignment = 0;
    caps->maxCombinedUniformBlocks = 0;
    caps->maxCombinedVertexUniformComponents = 0;
    caps->maxCombinedFragmentUniformComponents = 0;
    caps->maxVaryingComponents = 0;

    // Aggregate shader limits
    caps->maxVaryingVectors = caps->maxVertexOutputComponents / 4;
    caps->maxCombinedTextureImageUnits = caps->maxVertexTextureImageUnits + caps->maxTextureImageUnits;

    // Transform feedback limits
    caps->maxTransformFeedbackInterleavedComponents = 0;
    caps->maxTransformFeedbackSeparateAttributes = 0;
    caps->maxTransformFeedbackSeparateComponents = 0;

    // Multisample limits
    caps->maxSamples = maxSamples;

    // GL extension support
    extensions->setTextureExtensionSupport(*textureCapsMap);
    extensions->elementIndexUint = deviceCaps.MaxVertexIndex >= (1 << 16);
    extensions->getProgramBinary = true;
    extensions->rgb8rgba8 = true;
    extensions->readFormatBGRA = true;
    extensions->pixelBufferObject = false;
    extensions->mapBuffer = false;
    extensions->mapBufferRange = false;

    // textureRG is emulated and not performant.
    extensions->textureRG = false;

    D3DADAPTER_IDENTIFIER9 adapterId = {};
    if (SUCCEEDED(d3d9->GetAdapterIdentifier(adapter, 0, &adapterId)))
    {
        // ATI cards on XP have problems with non-power-of-two textures.
        extensions->textureNPOT = !(deviceCaps.TextureCaps & D3DPTEXTURECAPS_POW2) &&
                                  !(deviceCaps.TextureCaps & D3DPTEXTURECAPS_CUBEMAP_POW2) &&
                                  !(deviceCaps.TextureCaps & D3DPTEXTURECAPS_NONPOW2CONDITIONAL) &&
                                  !(!isWindowsVistaOrGreater() && IsAMD(adapterId.VendorId));

        // Disable depth texture support on AMD cards (See ANGLE issue 839)
        if (IsAMD(adapterId.VendorId))
        {
            extensions->depthTextures = false;
        }
    }
    else
    {
        extensions->textureNPOT = false;
    }

    extensions->drawBuffers = false;
    extensions->textureStorage = true;

    // Must support a minimum of 2:1 anisotropy for max anisotropy to be considered supported, per the spec
    extensions->textureFilterAnisotropic = (deviceCaps.RasterCaps & D3DPRASTERCAPS_ANISOTROPY) != 0 && deviceCaps.MaxAnisotropy >= 2;
    extensions->maxTextureAnisotropy = static_cast<GLfloat>(deviceCaps.MaxAnisotropy);

    // Check occlusion query support by trying to create one
    IDirect3DQuery9 *occlusionQuery = NULL;
    extensions->occlusionQueryBoolean = SUCCEEDED(device->CreateQuery(D3DQUERYTYPE_OCCLUSION, &occlusionQuery)) && occlusionQuery;
    SafeRelease(occlusionQuery);

    // Check event query support by trying to create one
    IDirect3DQuery9 *eventQuery = NULL;
    extensions->fence = SUCCEEDED(device->CreateQuery(D3DQUERYTYPE_EVENT, &eventQuery)) && eventQuery;
    SafeRelease(eventQuery);

    extensions->timerQuery = false; // Unimplemented
    extensions->disjointTimerQuery     = false;
    extensions->robustness = true;
    extensions->blendMinMax = true;
    extensions->framebufferBlit = true;
    extensions->framebufferMultisample = true;
    extensions->instancedArrays = deviceCaps.PixelShaderVersion >= D3DPS_VERSION(3, 0);
    extensions->packReverseRowOrder = true;
    extensions->standardDerivatives = (deviceCaps.PS20Caps.Caps & D3DPS20CAPS_GRADIENTINSTRUCTIONS) != 0;
    extensions->shaderTextureLOD = true;
    extensions->fragDepth = true;
    extensions->textureUsage = true;
    extensions->translatedShaderSource = true;
    extensions->fboRenderMipmap = false;
    extensions->discardFramebuffer = false; // It would be valid to set this to true, since glDiscardFramebufferEXT is just a hint
    extensions->colorBufferFloat = false;
    extensions->debugMarker = true;
    extensions->eglImage               = true;
    extensions->eglImageExternal       = true;
    extensions->unpackSubimage         = true;
    extensions->packSubimage           = true;
    extensions->syncQuery              = extensions->fence;

    // D3D9 has no concept of separate masks and refs for front and back faces in the depth stencil
    // state.
    limitations->noSeparateStencilRefsAndMasks = true;

    // D3D9 shader models have limited support for looping, so the Appendix A
    // index/loop limitations are necessary. Workarounds that are needed to
    // support dynamic indexing of vectors on HLSL also don't work on D3D9.
    limitations->shadersRequireIndexedLoopValidation = true;

    // D3D9 cannot support constant color and alpha blend funcs together
    limitations->noSimultaneousConstantColorAndAlphaBlendFunc = true;
}
예제 #5
0
파일: tuning.hpp 프로젝트: ctuning/ck-math
void Tuner(int argc, char* argv[]) {
  constexpr auto kSeed = 42; // fixed seed for reproducibility

  // Sets the parameters and platform/device for which to tune (command-line options)
  auto command_line_args = RetrieveCommandLineArguments(argc, argv);
  auto help = std::string{"* Options given/available:\n"};
  auto args = Arguments<T>{};
  args.platform_id = GetArgument(command_line_args, help, kArgPlatform, ConvertArgument(std::getenv("CLBLAST_PLATFORM"), size_t{0}));
  args.device_id   = GetArgument(command_line_args, help, kArgDevice, ConvertArgument(std::getenv("CLBLAST_DEVICE"), size_t{0}));
  args.precision   = GetArgument(command_line_args, help, kArgPrecision, Precision::kSingle);
  for (auto &o: C::GetOptions()) {
    if (o == kArgM)        { args.m        = GetArgument(command_line_args, help, kArgM, C::DefaultM()); }
    if (o == kArgN)        { args.n        = GetArgument(command_line_args, help, kArgN, C::DefaultN()); }
    if (o == kArgK)        { args.k        = GetArgument(command_line_args, help, kArgK, C::DefaultK()); }
    if (o == kArgAlpha)    { args.alpha    = GetArgument(command_line_args, help, kArgAlpha, GetScalar<T>()); }
    if (o == kArgBeta)     { args.beta     = GetArgument(command_line_args, help, kArgBeta, GetScalar<T>()); }
    if (o == kArgFraction) { args.fraction = GetArgument(command_line_args, help, kArgFraction, C::DefaultFraction()); }
    if (o == kArgBatchCount) { args.batch_count = GetArgument(command_line_args, help, kArgBatchCount, C::DefaultBatchCount()); }
    if (o == tStrategy)   {args.tStrategy   = GetArgument(command_line_args, help, tStrategy, DEFAULT_STRATEGY);  }
    if (o == psoSwarmSize)   {args.psoSwarmSize   = GetArgument(command_line_args, help, psoSwarmSize, DEFAULT_PSO_SWARM);  }
    if (o == psoInfG)   {args.psoInfG   = GetArgument(command_line_args, help, psoInfG, DEFAULT_PSO_G);  }
    if (o == psoInfL)   {args.psoInfL   = GetArgument(command_line_args, help, psoInfL, DEFAULT_PSO_L);  }
    if (o == psoInfR)   {args.psoInfR   = GetArgument(command_line_args, help, psoInfR, DEFAULT_PSO_R);  }
  }
  const auto num_runs = GetArgument(command_line_args, help, kArgNumRuns, C::DefaultNumRuns());

  fprintf(stdout, "%s\n", help.c_str());

  // Tests validity of the given arguments
  C::TestValidArguments(args);

  // Tests for validity of the precision and retrieves properties
  auto isAMD = false;
  auto isARM = false;
  auto isGPU = false;
  {
    const auto platform = Platform(args.platform_id);
    const auto device = Device(platform, args.device_id);
    if (!PrecisionSupported<T>(device)) {
      printf("* Unsupported precision, skipping this tuning run\n\n");
      return;
    }
    isAMD = device.IsAMD();
    isARM = device.IsARM();
    isGPU = device.IsGPU();
  }

  // Creates input buffers with random data
  auto x_vec = std::vector<T>(C::GetSizeX(args));
  auto y_vec = std::vector<T>(C::GetSizeY(args));
  auto a_mat = std::vector<T>(C::GetSizeA(args));
  auto b_mat = std::vector<T>(C::GetSizeB(args));
  auto c_mat = std::vector<T>(C::GetSizeC(args));
  auto temp = std::vector<T>(C::GetSizeTemp(args));
  std::mt19937 mt(kSeed);
  std::uniform_real_distribution<double> dist(kTestDataLowerLimit, kTestDataUpperLimit);
  PopulateVector(x_vec, mt, dist);
  PopulateVector(y_vec, mt, dist);
  PopulateVector(a_mat, mt, dist);
  PopulateVector(b_mat, mt, dist);
  PopulateVector(c_mat, mt, dist);
  PopulateVector(temp, mt, dist);

  // Initializes the tuner for the chosen device
  cltune::Tuner tuner(args.platform_id, args.device_id);

  // Use full-search to explore all parameter combinations or random-search to search only a part of
  // the parameter values. The fraction is set as a command-line argument.
  #ifdef XGEMM_EXEC
  
  if(tStrategyFlag)
  {
   auto localtStrategy = args.tStrategy;  

    if (args.fraction == 1.0 || args.fraction == 0.0) 
    { 
     localtStrategy = FULL_SEARCH_STRATEGY; 
    }
    switch (localtStrategy)
    {
      case FULL_SEARCH_STRATEGY: 
        tuner.UseFullSearch();
        break;

      case RANDOM_SEARCH_STRATEGY: 
          tuner.UseRandomSearch(1.0/args.fraction);
        break;
      case PSO_STRATEGY: 
          tuner.UsePSO(1.0/args.fraction, args.psoSwarmSize, args.psoInfG, args.psoInfL, args.psoInfR);
        break;
      case DVDT_STRATEGY:
      default: 
        tuner.UseFullSearch();
    }
  }

  #else

  if (args.fraction == 1.0 || args.fraction == 0.0) 
  {
    tuner.UseFullSearch();
  }
  else 
  {
    tuner.UseRandomSearch(1.0/args.fraction);
  }

  #endif
  // Set extra settings for specific defines. This mimics src/routine.cc.
  auto defines = std::string{""};
  if (isAMD && isGPU) {
    defines += "#define USE_CL_MAD 1\n";
    defines += "#define USE_STAGGERED_INDICES 1\n";
  }
  if (isARM && isGPU) {
    defines += "#define GLOBAL_MEM_FENCE 1\n";
  }

  // Loads the kernel sources and defines the kernel to tune
  auto sources = defines + C::GetSources();
  auto id = tuner.AddKernelFromString(sources, C::KernelName(), C::GlobalSize(args), C::LocalSize());
  tuner.SetReferenceFromString(sources, C::KernelName(), C::GlobalSizeRef(args), C::LocalSizeRef());

  // Sets the tunable parameters and their possible values
  C::SetParameters(tuner, id);
  C::SetConstraints(tuner, id);
  C::SetLocalMemorySize(tuner, id, args);

  // Tests for a specific precision
  tuner.AddParameter(id, "PRECISION", {static_cast<size_t>(args.precision)});
  tuner.AddParameterReference("PRECISION", static_cast<size_t>(args.precision));

  // Modifies the thread-sizes (both global and local) based on the parameters
  for (auto &parameters: C::MulLocal()) { tuner.MulLocalSize(id, parameters); }
  for (auto &parameters: C::DivLocal()) { tuner.DivLocalSize(id, parameters); }
  for (auto &parameters: C::MulGlobal()) { tuner.MulGlobalSize(id, parameters); }
  for (auto &parameters: C::DivGlobal()) { tuner.DivGlobalSize(id, parameters); }

  // Sets the function's arguments
  C::SetArguments(tuner, args, x_vec, y_vec, a_mat, b_mat, c_mat, temp);

  // Starts the tuning process
  tuner.SetNumRuns(num_runs);
  tuner.Tune();

  // Prints the results to screen
  auto time_ms = tuner.PrintToScreen();
  tuner.PrintFormatted();

  // Also prints the performance of the best-case in terms of GB/s or GFLOPS
  if (time_ms != 0.0) {
    printf("[ -------> ] %.2lf ms", time_ms);
    printf(" or %.1lf %s\n", C::GetMetric(args)/(time_ms*1.0e6), C::PerformanceUnit().c_str());
  }

  // Outputs the results as JSON to disk, including some meta-data
  auto precision_string = std::to_string(static_cast<size_t>(args.precision));
  auto metadata = std::vector<std::pair<std::string,std::string>>{
    {"kernel_family", C::KernelFamily()},
    {"precision", precision_string}
  };
  for (auto &o: C::GetOptions()) {
    if (o == kArgM)     { metadata.push_back({"arg_m", std::to_string(args.m)}); }
    if (o == kArgN)     { metadata.push_back({"arg_n", std::to_string(args.n)}); }
    if (o == kArgK)     { metadata.push_back({"arg_k", std::to_string(args.k)}); }
    if (o == kArgAlpha) { metadata.push_back({"arg_alpha", ToString(args.alpha)}); }
    if (o == kArgBeta)  { metadata.push_back({"arg_beta", ToString(args.beta)}); }
    if (o == kArgBatchCount) { metadata.push_back({"arg_batch_count", ToString(args.batch_count)}); }
  }
  tuner.PrintJSON("clblast_"+C::KernelFamily()+"_"+precision_string+".json", metadata);
 

}