void intel_img_view_init(struct intel_dev *dev,
                         const VkImageViewCreateInfo *info,
                         struct intel_img_view *view)
{
    VkComponentMapping state_swizzles;
    uint32_t mip_levels, array_size;
    struct intel_img *img = intel_img(info->image);

    mip_levels = info->subresourceRange.levelCount;
    if (mip_levels > img->mip_levels - info->subresourceRange.baseMipLevel)
        mip_levels = img->mip_levels - info->subresourceRange.baseMipLevel;

    array_size = info->subresourceRange.layerCount;
    if (array_size > img->array_size - info->subresourceRange.baseArrayLayer)
        array_size = img->array_size - info->subresourceRange.baseArrayLayer;

    view->obj.destroy = img_view_destroy;
    view->img = img;

    if (!(img->usage & VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT)) {
        if (intel_gpu_gen(dev->gpu) >= INTEL_GEN(7.5)) {
            state_swizzles = info->components;
            view->shader_swizzles.r = VK_COMPONENT_SWIZZLE_R;
            view->shader_swizzles.g = VK_COMPONENT_SWIZZLE_G;
            view->shader_swizzles.b = VK_COMPONENT_SWIZZLE_B;
            view->shader_swizzles.a = VK_COMPONENT_SWIZZLE_A;
        } else {
            state_swizzles.r = VK_COMPONENT_SWIZZLE_R;
            state_swizzles.g = VK_COMPONENT_SWIZZLE_G;
            state_swizzles.b = VK_COMPONENT_SWIZZLE_B;
            state_swizzles.a = VK_COMPONENT_SWIZZLE_A;
            view->shader_swizzles = info->components;
        }

        /* shader_swizzles is ignored by the compiler */
        if (view->shader_swizzles.r != VK_COMPONENT_SWIZZLE_R ||
            view->shader_swizzles.g != VK_COMPONENT_SWIZZLE_G ||
            view->shader_swizzles.b != VK_COMPONENT_SWIZZLE_B ||
            view->shader_swizzles.a != VK_COMPONENT_SWIZZLE_A) {
            intel_dev_log(dev, VK_DEBUG_REPORT_WARNING_BIT_EXT,
                          (struct intel_base*)view, 0, 0,
                          "image data swizzling is ignored");
        }

        if (intel_gpu_gen(dev->gpu) >= INTEL_GEN(7)) {
            surface_state_tex_gen7(dev->gpu, img, info->viewType, info->format,
                                   info->subresourceRange.baseMipLevel,
                                   mip_levels,
                                   info->subresourceRange.baseArrayLayer,
                                   array_size, state_swizzles, false,
                                   view->cmd);
            view->cmd_len = 8;
        } else {
            surface_state_tex_gen6(dev->gpu, img, info->viewType, info->format,
                                   info->subresourceRange.baseMipLevel,
                                   mip_levels,
                                   info->subresourceRange.baseArrayLayer,
                                   array_size, false, view->cmd);
            view->cmd_len = 6;
        }
    }
}
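/*
 * Example (editor's sketch, not part of the driver): the clamping above
 * silently narrows views that ask for more subresources than the image
 * has.  For an image with 10 mip levels, a view created with
 * baseMipLevel = 8 and levelCount = 4 ends up with
 * mip_levels = 10 - 8 = 2.
 */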
static void layout_init_layer_height(struct intel_layout *layout,
                                     struct intel_layout_params *params)
{
    const VkImageCreateInfo *info = params->info;
    unsigned num_layers;

    if (layout->walk != INTEL_LAYOUT_WALK_LAYER)
        return;

    num_layers = layout_get_num_layers(layout, params);
    if (num_layers <= 1)
        return;

    /*
     * From the Sandy Bridge PRM, volume 1 part 1, page 115:
     *
     *     "The following equation is used for surface formats other than
     *      compressed textures:
     *
     *        QPitch = (h0 + h1 + 11j)"
     *
     *     "The equation for compressed textures (BC* and FXT1 surface
     *      formats) follows:
     *
     *        QPitch = (h0 + h1 + 11j) / 4"
     *
     *     "[DevSNB] Errata: Sampler MSAA Qpitch will be 4 greater than
     *      the value calculated in the equation above, for every other
     *      odd Surface Height starting from 1 i.e. 1,5,9,13"
     *
     * From the Ivy Bridge PRM, volume 1 part 1, page 111-112:
     *
     *     "If Surface Array Spacing is set to ARYSPC_FULL (note that the
     *      depth buffer and stencil buffer have an implied value of
     *      ARYSPC_FULL):
     *
     *        QPitch = (h0 + h1 + 12j)
     *        QPitch = (h0 + h1 + 12j) / 4 (compressed)
     *
     *      (There are many typos or missing words here...)"
     *
     * To access the N-th slice, an offset of (Stride * QPitch * N) is
     * added to the base address.  The PRM divides QPitch by 4 for
     * compressed formats because the block height for those formats is 4,
     * and it wants QPitch to mean the number of memory rows, as opposed
     * to texel rows, between slices.  Since we use texel rows everywhere,
     * we do not need to divide QPitch by 4.
     */
    layout->layer_height = params->h0 + params->h1 +
        ((intel_gpu_gen(params->gpu) >= INTEL_GEN(7)) ? 12 : 11) *
        layout->align_j;

    if (intel_gpu_gen(params->gpu) == INTEL_GEN(6) &&
        info->samples != VK_SAMPLE_COUNT_1_BIT &&
        layout->height0 % 4 == 1)
        layout->layer_height += 4;

    params->max_y += layout->layer_height * (num_layers - 1);
}
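/*
 * Worked example (editor's sketch): with hypothetical values h0 = 64,
 * h1 = 32, and align_j = 4 on Gen7, the expression above gives
 *
 *   layer_height = 64 + 32 + 12 * 4 = 144
 *
 * texel rows between array layers, matching the ARYSPC_FULL QPitch
 * equation quoted from the Ivy Bridge PRM.
 */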
static void att_view_init_for_input(struct intel_att_view *view,
                                    const struct intel_gpu *gpu,
                                    const struct intel_img *img,
                                    VkImageViewType view_type,
                                    VkFormat format, unsigned level,
                                    unsigned first_layer, unsigned num_layers)
{
    /*
     * The texture path below is compiled out with "if (false)": input
     * attachment views are always programmed as null surface states.
     */
    if (intel_gpu_gen(gpu) >= INTEL_GEN(7)) {
        if (false) {
            surface_state_tex_gen7(gpu, img, view_type, format,
                                   level, 1, first_layer, num_layers,
                                   identity_channel_mapping, false,
                                   view->cmd);
        } else {
            surface_state_null_gen7(gpu, view->cmd);
        }

        view->cmd_len = 8;
    } else {
        if (false) {
            surface_state_tex_gen6(gpu, img, view_type, format,
                                   level, 1, first_layer, num_layers,
                                   false, view->cmd);
        } else {
            surface_state_null_gen6(gpu, view->cmd);
        }

        view->cmd_len = 6;
    }
}
static bool layout_want_hiz(const struct intel_layout *layout,
                            const struct intel_layout_params *params)
{
    const VkImageCreateInfo *info = params->info;

    if (intel_debug & INTEL_DEBUG_NOHIZ)
        return false;

    if (!(info->usage & VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT))
        return false;

    if (!intel_format_has_depth(params->gpu, info->format))
        return false;

    /*
     * HiZ implies separate stencil on Gen6.  We do not want to copy
     * stencil values between combined and separate stencil buffers when
     * HiZ is enabled or disabled.
     */
    if (intel_gpu_gen(params->gpu) == INTEL_GEN(6))
        return false;

    return true;
}
static bool layout_want_mcs(struct intel_layout *layout,
                            struct intel_layout_params *params)
{
    const VkImageCreateInfo *info = params->info;
    bool want_mcs = false;

    /* MCS is for RT on GEN7+ */
    if (intel_gpu_gen(params->gpu) < INTEL_GEN(7))
        return false;

    if (info->imageType != VK_IMAGE_TYPE_2D ||
        !(info->usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT))
        return false;

    /*
     * From the Ivy Bridge PRM, volume 4 part 1, page 77:
     *
     *     "For Render Target and Sampling Engine Surfaces: If the surface
     *      is multisampled (Number of Multisamples any value other than
     *      MULTISAMPLECOUNT_1), this field (MCS Enable) must be enabled."
     *
     *     "This field must be set to 0 for all SINT MSRTs when all RT
     *      channels are not written"
     */
    if (info->samples != VK_SAMPLE_COUNT_1_BIT &&
        !icd_format_is_int(info->format)) {
        want_mcs = true;
    } else if (info->samples == VK_SAMPLE_COUNT_1_BIT) {
        /*
         * From the Ivy Bridge PRM, volume 2 part 1, page 326:
         *
         *     "When MCS is buffer is used for color clear of
         *      non-multisampler render target, the following restrictions
         *      apply.
         *        - Support is limited to tiled render targets.
         *        - Support is for non-mip-mapped and non-array surface
         *          types only.
         *        - Clear is supported only on the full RT; i.e., no
         *          partial clear or overlapping clears.
         *        - MCS buffer for non-MSRT is supported only for RT
         *          formats 32bpp, 64bpp and 128bpp.
         *      ..."
         */
        if (layout->tiling != GEN6_TILING_NONE &&
            info->mipLevels == 1 && info->arrayLayers == 1) {
            switch (layout->block_size) {
            case 4:
            case 8:
            case 16:
                want_mcs = true;
                break;
            default:
                break;
            }
        }
    }

    return want_mcs;
}
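/*
 * Summary of the checks above (editor's note): on Gen7+, MCS is wanted
 * for a 2D color attachment that is either multisampled with a
 * non-integer format, or single-sampled and a fast-clear candidate:
 * tiled, non-mipmapped, non-arrayed, and 32/64/128 bpp (block_size of
 * 4, 8, or 16 bytes).
 */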
static void layout_init_walk(struct intel_layout *layout,
                             struct intel_layout_params *params)
{
    if (intel_gpu_gen(params->gpu) >= INTEL_GEN(7))
        layout_init_walk_gen7(layout, params);
    else
        layout_init_walk_gen6(layout, params);
}
void intel_buf_view_init(const struct intel_dev *dev,
                         const VkBufferViewCreateInfo *info,
                         struct intel_buf_view *view,
                         bool raw)
{
    struct intel_buf *buf = intel_buf(info->buffer);
    /* TODO: Is transfer destination the only shader write operation? */
    const bool will_write = (buf->usage &
            (VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT |
             VK_BUFFER_USAGE_STORAGE_BUFFER_BIT));
    VkFormat format;
    VkDeviceSize stride;
    uint32_t *cmd;
    int i;

    view->obj.destroy = buf_view_destroy;
    view->buf = buf;

    /*
     * The compiler expects uniform buffers to have a pitch of 4 for
     * fragment shaders, but 16 for other stages.  The format must be
     * VK_FORMAT_R32G32B32A32_SFLOAT.
     */
    if (raw) {
        format = VK_FORMAT_R32G32B32A32_SFLOAT;
        stride = 16;
    } else {
        format = info->format;
        stride = icd_format_get_size(format);
    }

    cmd = view->cmd;

    for (i = 0; i < 2; i++) {
        if (intel_gpu_gen(dev->gpu) >= INTEL_GEN(7)) {
            surface_state_buf_gen7(dev->gpu, info->offset, info->range,
                                   stride, format, will_write, will_write,
                                   cmd);
            view->cmd_len = 8;
        } else {
            surface_state_buf_gen6(dev->gpu, info->offset, info->range,
                                   stride, format, will_write, will_write,
                                   cmd);
            view->cmd_len = 6;
        }

        /* switch to view->fs_cmd */
        if (raw) {
            cmd = view->fs_cmd;
            stride = 4;
        } else {
            memcpy(view->fs_cmd, view->cmd,
                   sizeof(uint32_t) * view->cmd_len);
            break;
        }
    }
}
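/*
 * Example (editor's sketch): for a raw view the loop above runs twice.
 * The first pass fills view->cmd with a stride of 16
 * (VK_FORMAT_R32G32B32A32_SFLOAT) for non-fragment stages; the second
 * pass fills view->fs_cmd with a stride of 4 for fragment shaders.  For
 * a formatted view, a single pass fills view->cmd, which is then copied
 * to view->fs_cmd verbatim.
 */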
void intel_null_view_init(struct intel_null_view *view,
                          struct intel_dev *dev)
{
    if (intel_gpu_gen(dev->gpu) >= INTEL_GEN(7)) {
        surface_state_null_gen7(dev->gpu, view->cmd);
        view->cmd_len = 8;
    } else {
        surface_state_null_gen6(dev->gpu, view->cmd);
        view->cmd_len = 6;
    }
}
static void layout_init_size_and_format(struct intel_layout *layout,
                                        struct intel_layout_params *params)
{
    const VkImageCreateInfo *info = params->info;
    VkFormat format = info->format;
    bool require_separate_stencil = false;

    layout->width0 = info->extent.width;
    layout->height0 = info->extent.height;

    /*
     * From the Sandy Bridge PRM, volume 2 part 1, page 317:
     *
     *     "This field (Separate Stencil Buffer Enable) must be set to the
     *      same value (enabled or disabled) as Hierarchical Depth Buffer
     *      Enable."
     *
     * GEN7+ requires separate stencil buffers.
     */
    if (info->usage & VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) {
        if (intel_gpu_gen(params->gpu) >= INTEL_GEN(7))
            require_separate_stencil = true;
        else
            require_separate_stencil = (layout->aux == INTEL_LAYOUT_AUX_HIZ);
    }

    switch (format) {
    case VK_FORMAT_D24_UNORM_S8_UINT:
        if (require_separate_stencil) {
            format = VK_FORMAT_X8_D24_UNORM_PACK32;
            layout->separate_stencil = true;
        }
        break;
    case VK_FORMAT_D32_SFLOAT_S8_UINT:
        if (require_separate_stencil) {
            format = VK_FORMAT_D32_SFLOAT;
            layout->separate_stencil = true;
        }
        break;
    default:
        break;
    }

    layout->format = format;
    layout->block_width = icd_format_get_block_width(format);
    layout->block_height = layout->block_width;
    layout->block_size = icd_format_get_size(format);

    params->compressed = icd_format_is_compressed(format);
}
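/*
 * Example (editor's sketch): a VK_FORMAT_D24_UNORM_S8_UINT depth/stencil
 * attachment on Gen7 takes the first case above: the depth layout is
 * computed as VK_FORMAT_X8_D24_UNORM_PACK32, and separate_stencil marks
 * the stencil data for its own S8 layout (W-tiled, per
 * layout_get_valid_tilings()).
 */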
static void query_init_pipeline_statistics(struct intel_dev *dev,
                                           const VkQueryPoolCreateInfo *info,
                                           struct intel_query *query)
{
    /*
     * Note: order defined by Vulkan spec.
     */
    const uint32_t regs[][2] = {
        {VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT,
            GEN6_REG_IA_PRIMITIVES_COUNT},
        {VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT,
            GEN6_REG_VS_INVOCATION_COUNT},
        {VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT,
            GEN6_REG_GS_INVOCATION_COUNT},
        {VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT,
            GEN6_REG_GS_PRIMITIVES_COUNT},
        {VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT,
            GEN6_REG_CL_INVOCATION_COUNT},
        {VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT,
            GEN6_REG_CL_PRIMITIVES_COUNT},
        {VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT,
            GEN6_REG_PS_INVOCATION_COUNT},
        {VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT,
            (intel_gpu_gen(dev->gpu) >= INTEL_GEN(7)) ?
                GEN7_REG_HS_INVOCATION_COUNT : 0},
        {VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT,
            (intel_gpu_gen(dev->gpu) >= INTEL_GEN(7)) ?
                GEN7_REG_DS_INVOCATION_COUNT : 0},
        {VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT, 0}
    };
    STATIC_ASSERT(ARRAY_SIZE(regs) < 32);
    uint32_t i;
    uint32_t reg_count = 0;

    /*
     * Only query registers indicated via pipeline statistics flags.
     * If HW does not support a flag, fill value with 0.
     */
    for (i = 0; i < ARRAY_SIZE(regs); i++) {
        if ((regs[i][0] & info->pipelineStatistics)) {
            query->regs[reg_count] = regs[i][1];
            reg_count++;
        }
    }

    query->reg_count = reg_count;
    query->slot_stride = u_align(reg_count * sizeof(uint64_t) * 2, 64);
}
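/*
 * Worked example (editor's sketch): a pool created with the three flags
 * INPUT_ASSEMBLY_PRIMITIVES, VERTEX_SHADER_INVOCATIONS, and
 * CLIPPING_INVOCATIONS has reg_count = 3.  Each slot stores a begin/end
 * pair of 64-bit counters per register:
 *
 *   3 * sizeof(uint64_t) * 2 = 48 bytes  ->  u_align(48, 64) = 64
 *
 * so slot_stride is 64 bytes.
 */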
static unsigned layout_get_valid_tilings(const struct intel_layout *layout,
                                         const struct intel_layout_params *params)
{
    const VkImageCreateInfo *info = params->info;
    const VkFormat format = layout->format;
    unsigned valid_tilings = LAYOUT_TILING_ALL;

    /*
     * From the Sandy Bridge PRM, volume 1 part 2, page 32:
     *
     *     "Display/Overlay Y-Major not supported.
     *      X-Major required for Async Flips"
     */
    if (params->scanout)
        valid_tilings &= LAYOUT_TILING_X;

    if (info->tiling == VK_IMAGE_TILING_LINEAR)
        valid_tilings &= LAYOUT_TILING_NONE;

    /*
     * From the Sandy Bridge PRM, volume 2 part 1, page 318:
     *
     *     "[DevSNB+]: This field (Tiled Surface) must be set to TRUE.
     *      Linear Depth Buffer is not supported."
     *
     *     "The Depth Buffer, if tiled, must use Y-Major tiling."
     *
     * From the Sandy Bridge PRM, volume 1 part 2, page 22:
     *
     *     "W-Major Tile Format is used for separate stencil."
     */
    if (info->usage & VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) {
        switch (format) {
        case VK_FORMAT_S8_UINT:
            valid_tilings &= LAYOUT_TILING_W;
            break;
        default:
            valid_tilings &= LAYOUT_TILING_Y;
            break;
        }
    }

    if (info->usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) {
        /*
         * From the Sandy Bridge PRM, volume 1 part 2, page 32:
         *
         *     "NOTE: 128BPE Format Color buffer ( render target ) MUST be
         *      either TileX or Linear."
         *
         * From the Haswell PRM, volume 5, page 32:
         *
         *     "NOTE: 128 BPP format color buffer (render target) supports
         *      Linear, TiledX and TiledY."
         */
        if (intel_gpu_gen(params->gpu) < INTEL_GEN(7.5) &&
            layout->block_size == 16)
            valid_tilings &= ~LAYOUT_TILING_Y;

        /*
         * From the Ivy Bridge PRM, volume 4 part 1, page 63:
         *
         *     "This field (Surface Vertical Aligment) must be set to
         *      VALIGN_4 for all tiled Y Render Target surfaces."
         *
         *     "VALIGN_4 is not supported for surface format
         *      R32G32B32_FLOAT."
         */
        if (intel_gpu_gen(params->gpu) >= INTEL_GEN(7) &&
            intel_gpu_gen(params->gpu) <= INTEL_GEN(7.5) &&
            layout->format == VK_FORMAT_R32G32B32_SFLOAT)
            valid_tilings &= ~LAYOUT_TILING_Y;

        valid_tilings &= ~LAYOUT_TILING_W;
    }

    if (info->usage & VK_IMAGE_USAGE_SAMPLED_BIT) {
        if (intel_gpu_gen(params->gpu) < INTEL_GEN(8))
            valid_tilings &= ~LAYOUT_TILING_W;
    }

    /* no conflicting binding flags */
    assert(valid_tilings);

    return valid_tilings;
}
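/*
 * Example (editor's sketch): a VK_FORMAT_S8_UINT depth/stencil image
 * starts with LAYOUT_TILING_ALL and is narrowed to LAYOUT_TILING_W by
 * the switch above.  A hypothetical image that also claimed color
 * attachment usage would then lose LAYOUT_TILING_W as well and trip the
 * "no conflicting binding flags" assertion.
 */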
static void layout_init_alignments(struct intel_layout *layout,
                                   struct intel_layout_params *params)
{
    const VkImageCreateInfo *info = params->info;

    /*
     * From the Sandy Bridge PRM, volume 1 part 1, page 113:
     *
     *     "surface format           align_i     align_j
     *      YUV 4:2:2 formats        4           *see below
     *      BC1-5                    4           4
     *      FXT1                     8           4
     *      all other formats        4           *see below"
     *
     *     "- align_j = 4 for any depth buffer
     *      - align_j = 2 for separate stencil buffer
     *      - align_j = 4 for any render target surface is multisampled
     *        (4x)
     *      - align_j = 4 for any render target surface with Surface
     *        Vertical Alignment = VALIGN_4
     *      - align_j = 2 for any render target surface with Surface
     *        Vertical Alignment = VALIGN_2
     *      - align_j = 2 for all other render target surface
     *      - align_j = 2 for any sampling engine surface with Surface
     *        Vertical Alignment = VALIGN_2
     *      - align_j = 4 for any sampling engine surface with Surface
     *        Vertical Alignment = VALIGN_4"
     *
     * From the Sandy Bridge PRM, volume 4 part 1, page 86:
     *
     *     "This field (Surface Vertical Alignment) must be set to
     *      VALIGN_2 if the Surface Format is 96 bits per element (BPE)."
     *
     * They can be rephrased as
     *
     *                                   align_i       align_j
     *   compressed formats              block width   block height
     *   PIPE_FORMAT_S8_UINT             4             2
     *   other depth/stencil formats     4             4
     *   4x multisampled                 4             4
     *   bpp 96                          4             2
     *   others                          4             2 or 4
     */

    /*
     * From the Ivy Bridge PRM, volume 1 part 1, page 110:
     *
     *     "surface defined by       surface format     align_i   align_j
     *      3DSTATE_DEPTH_BUFFER     D16_UNORM          8         4
     *                               not D16_UNORM      4         4
     *      3DSTATE_STENCIL_BUFFER   N/A                8         8
     *      SURFACE_STATE            BC*, ETC*, EAC*    4         4
     *                               FXT1               8         4
     *                               all others         (set by
     *                                                   SURFACE_STATE)"
     *
     * From the Ivy Bridge PRM, volume 4 part 1, page 63:
     *
     *     "- This field (Surface Vertical Aligment) is intended to be set
     *        to VALIGN_4 if the surface was rendered as a depth buffer,
     *        for a multisampled (4x) render target, or for a multisampled
     *        (8x) render target, since these surfaces support only
     *        alignment of 4.
     *      - Use of VALIGN_4 for other surfaces is supported, but uses
     *        more memory.
     *      - This field must be set to VALIGN_4 for all tiled Y Render
     *        Target surfaces.
     *      - Value of 1 is not supported for format YCRCB_NORMAL (0x182),
     *        YCRCB_SWAPUVY (0x183), YCRCB_SWAPUV (0x18f), YCRCB_SWAPY
     *        (0x190)
     *      - If Number of Multisamples is not MULTISAMPLECOUNT_1, this
     *        field must be set to VALIGN_4.
     *      - VALIGN_4 is not supported for surface format
     *        R32G32B32_FLOAT."
     *
     *     "- This field (Surface Horizontal Aligment) is intended to be
     *        set to HALIGN_8 only if the surface was rendered as a depth
     *        buffer with Z16 format or a stencil buffer, since these
     *        surfaces support only alignment of 8.
     *      - Use of HALIGN_8 for other surfaces is supported, but uses
     *        more memory.
     *      - This field must be set to HALIGN_4 if the Surface Format
     *        is BC*.
     *      - This field must be set to HALIGN_8 if the Surface Format is
     *        FXT1."
     *
     * They can be rephrased as
     *
     *                                   align_i       align_j
     *   compressed formats              block width   block height
     *   PIPE_FORMAT_Z16_UNORM           8             4
     *   PIPE_FORMAT_S8_UINT             8             8
     *   other depth/stencil formats     4             4
     *   2x or 4x multisampled           4 or 8        4
     *   tiled Y                         4 or 8        4 (if rt)
     *   PIPE_FORMAT_R32G32B32_FLOAT     4 or 8        2
     *   others                          4 or 8        2 or 4
     */

    if (params->compressed) {
        /* this happens to be the case */
        layout->align_i = layout->block_width;
        layout->align_j = layout->block_height;
    } else if (info->usage & VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) {
        if (intel_gpu_gen(params->gpu) >= INTEL_GEN(7)) {
            switch (layout->format) {
            case VK_FORMAT_D16_UNORM:
                layout->align_i = 8;
                layout->align_j = 4;
                break;
            case VK_FORMAT_S8_UINT:
                layout->align_i = 8;
                layout->align_j = 8;
                break;
            default:
                layout->align_i = 4;
                layout->align_j = 4;
                break;
            }
        } else {
            switch (layout->format) {
            case VK_FORMAT_S8_UINT:
                layout->align_i = 4;
                layout->align_j = 2;
                break;
            default:
                layout->align_i = 4;
                layout->align_j = 4;
                break;
            }
        }
    } else {
        const bool valign_4 =
            (info->samples != VK_SAMPLE_COUNT_1_BIT) ||
            (intel_gpu_gen(params->gpu) >= INTEL_GEN(8)) ||
            (intel_gpu_gen(params->gpu) >= INTEL_GEN(7) &&
             layout->tiling == GEN6_TILING_Y &&
             (info->usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT));

        if (intel_gpu_gen(params->gpu) >= INTEL_GEN(7) &&
            intel_gpu_gen(params->gpu) <= INTEL_GEN(7.5) && valign_4)
            assert(layout->format != VK_FORMAT_R32G32B32_SFLOAT);

        layout->align_i = 4;
        layout->align_j = (valign_4) ? 4 : 2;
    }

    /*
     * The fact that align_i and align_j are multiples of the block width
     * and height, respectively, is what makes the size of the bo a
     * multiple of the block size, makes slices start at block boundaries,
     * and makes many of the computations work.
     */
    assert(layout->align_i % layout->block_width == 0);
    assert(layout->align_j % layout->block_height == 0);

    /* make sure u_align() works */
    assert(u_is_pow2(layout->align_i) &&
           u_is_pow2(layout->align_j));
    assert(u_is_pow2(layout->block_width) &&
           u_is_pow2(layout->block_height));
}
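/*
 * Example (editor's sketch): a 4x multisampled color attachment on Gen7
 * takes the final branch above with valign_4 true, giving align_i = 4
 * and align_j = 4; a single-sampled sampled image on Gen6 gets
 * align_i = 4 and align_j = 2.
 */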
int intel_gpu_get_max_threads(const struct intel_gpu *gpu,
                              VkShaderStageFlagBits stage)
{
    switch (intel_gpu_gen(gpu)) {
    case INTEL_GEN(7.5):
        switch (stage) {
        case VK_SHADER_STAGE_VERTEX_BIT:
            return (gpu->gt >= 2) ? 280 : 70;
        case VK_SHADER_STAGE_GEOMETRY_BIT:
            /* values from ilo_gpe_init_gs_cso_gen7 */
            return (gpu->gt >= 2) ? 256 : 70;
        case VK_SHADER_STAGE_FRAGMENT_BIT:
            return (gpu->gt == 3) ? 408 :
                   (gpu->gt == 2) ? 204 : 102;
        default:
            break;
        }
        break;
    case INTEL_GEN(7):
        switch (stage) {
        case VK_SHADER_STAGE_VERTEX_BIT:
            return (gpu->gt == 2) ? 128 : 36;
        case VK_SHADER_STAGE_GEOMETRY_BIT:
            /* values from ilo_gpe_init_gs_cso_gen7 */
            return (gpu->gt == 2) ? 128 : 36;
        case VK_SHADER_STAGE_FRAGMENT_BIT:
            return (gpu->gt == 2) ? 172 : 48;
        default:
            break;
        }
        break;
    case INTEL_GEN(6):
        switch (stage) {
        case VK_SHADER_STAGE_VERTEX_BIT:
            return (gpu->gt == 2) ? 60 : 24;
        case VK_SHADER_STAGE_GEOMETRY_BIT:
            /* values from ilo_gpe_init_gs_cso_gen6 */
            return (gpu->gt == 2) ? 28 : 21;
        case VK_SHADER_STAGE_FRAGMENT_BIT:
            return (gpu->gt == 2) ? 80 : 40;
        default:
            break;
        }
        break;
    default:
        break;
    }

    intel_log(gpu, VK_DEBUG_REPORT_ERROR_BIT_EXT, 0,
              VK_NULL_HANDLE, 0, 0, "unknown Gen or shader stage");

    switch (stage) {
    case VK_SHADER_STAGE_VERTEX_BIT:
        return 1;
    case VK_SHADER_STAGE_GEOMETRY_BIT:
        return 1;
    case VK_SHADER_STAGE_FRAGMENT_BIT:
        return 4;
    default:
        return 1;
    }
}
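/*
 * Usage sketch (editor's addition): callers use this when sizing
 * fixed-function thread fields, e.g.
 *
 *   max_vs = intel_gpu_get_max_threads(gpu, VK_SHADER_STAGE_VERTEX_BIT);
 *
 * which, per the table above, would return 280 on a Haswell GT2 and
 * fall back to 1 with an error log on an unknown Gen.
 */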
/* note that this may force the texture to be linear */
static void layout_calculate_bo_size(struct intel_layout *layout,
                                     struct intel_layout_params *params)
{
    assert(params->max_x % layout->block_width == 0);
    assert(params->max_y % layout->block_height == 0);
    assert(layout->layer_height % layout->block_height == 0);

    layout->bo_stride =
        (params->max_x / layout->block_width) * layout->block_size;
    layout->bo_height = params->max_y / layout->block_height;

    while (true) {
        unsigned w = layout->bo_stride, h = layout->bo_height;
        unsigned align_w, align_h;

        /*
         * From the Haswell PRM, volume 5, page 163:
         *
         *     "For linear surfaces, additional padding of 64 bytes is
         *      required at the bottom of the surface.  This is in
         *      addition to the padding required above."
         */
        if (intel_gpu_gen(params->gpu) >= INTEL_GEN(7.5) &&
            (params->info->usage & VK_IMAGE_USAGE_SAMPLED_BIT) &&
            layout->tiling == GEN6_TILING_NONE)
            h += (64 + layout->bo_stride - 1) / layout->bo_stride;

        /*
         * From the Sandy Bridge PRM, volume 4 part 1, page 81:
         *
         *     "- For linear render target surfaces, the pitch must be a
         *        multiple of the element size for non-YUV surface
         *        formats.  Pitch must be a multiple of 2 * element size
         *        for YUV surface formats.
         *      - For other linear surfaces, the pitch can be any multiple
         *        of bytes.
         *      - For tiled surfaces, the pitch must be a multiple of the
         *        tile width."
         *
         * Different requirements may exist when the bo is used in
         * different places, but our alignments here should be good enough
         * that we do not need to check layout->info->usage.
         */
        switch (layout->tiling) {
        case GEN6_TILING_X:
            align_w = 512;
            align_h = 8;
            break;
        case GEN6_TILING_Y:
            align_w = 128;
            align_h = 32;
            break;
        case GEN8_TILING_W:
            /*
             * From the Sandy Bridge PRM, volume 1 part 2, page 22:
             *
             *     "A 4KB tile is subdivided into 8-high by 8-wide array
             *      of Blocks for W-Major Tiles (W Tiles).  Each Block is
             *      8 rows by 8 bytes."
             */
            align_w = 64;
            align_h = 64;
            break;
        default:
            assert(layout->tiling == GEN6_TILING_NONE);
            /* some good enough values */
            align_w = 64;
            align_h = 2;
            break;
        }

        w = u_align(w, align_w);
        h = u_align(h, align_h);

        /* make sure the bo is mappable */
        if (layout->tiling != GEN6_TILING_NONE) {
            /*
             * Usually only the first 256MB of the GTT is mappable.
             *
             * See also how intel_context::max_gtt_map_object_size is
             * calculated.
             */
            const size_t mappable_gtt_size = 256 * 1024 * 1024;

            /*
             * Be conservative.  We may be able to switch from VALIGN_4 to
             * VALIGN_2 if the layout was Y-tiled, but let's keep it
             * simple.
             */
            if (mappable_gtt_size / w / 4 < h) {
                if (layout->valid_tilings & LAYOUT_TILING_NONE) {
                    layout->tiling = GEN6_TILING_NONE;
                    /* MCS support for non-MSRTs is limited to tiled RTs */
                    if (layout->aux == INTEL_LAYOUT_AUX_MCS &&
                        params->info->samples == VK_SAMPLE_COUNT_1_BIT)
                        layout->aux = INTEL_LAYOUT_AUX_NONE;

                    continue;
                } else {
                    /* mapping will fail */
                }
            }
        }

        layout->bo_stride = w;
        layout->bo_height = h;
        break;
    }
}
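/*
 * Worked example (editor's sketch): the mappable-GTT guard above keeps
 * roughly w * 4 * h bytes under 256 MB.  A hypothetical 16384x16384
 * Y-tiled RGBA8 target has w = 65536 and h = 16384, and
 * 256 MB / 65536 / 4 = 1024 < 16384, so the layout drops to
 * GEN6_TILING_NONE when LAYOUT_TILING_NONE is among the valid tilings.
 */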
static void ds_init_info(const struct intel_gpu *gpu,
                         const struct intel_img *img,
                         VkImageViewType view_type,
                         VkFormat format, unsigned level,
                         unsigned first_layer, unsigned num_layers,
                         struct ds_surface_info *info)
{
    bool separate_stencil;

    INTEL_GPU_ASSERT(gpu, 6, 7.5);

    memset(info, 0, sizeof(*info));

    info->surface_type = view_type_to_surface_type(view_type);

    if (info->surface_type == GEN6_SURFTYPE_CUBE) {
        /*
         * From the Sandy Bridge PRM, volume 2 part 1, page 325-326:
         *
         *     "For Other Surfaces (Cube Surfaces):
         *      This field (Minimum Array Element) is ignored."
         *
         *     "For Other Surfaces (Cube Surfaces):
         *      This field (Render Target View Extent) is ignored."
         *
         * As such, we cannot set first_layer and num_layers on cube
         * surfaces.  To work around that, treat it as a 2D surface.
         */
        info->surface_type = GEN6_SURFTYPE_2D;
    }

    if (intel_gpu_gen(gpu) >= INTEL_GEN(7)) {
        separate_stencil = true;
    } else {
        /*
         * From the Sandy Bridge PRM, volume 2 part 1, page 317:
         *
         *     "This field (Separate Stencil Buffer Enable) must be set to
         *      the same value (enabled or disabled) as Hierarchical Depth
         *      Buffer Enable."
         */
        separate_stencil = intel_img_can_enable_hiz(img, level);
    }

    /*
     * From the Sandy Bridge PRM, volume 2 part 1, page 317:
     *
     *     "If this field (Hierarchical Depth Buffer Enable) is enabled,
     *      the Surface Format of the depth buffer cannot be
     *      D32_FLOAT_S8X24_UINT or D24_UNORM_S8_UINT.  Use of stencil
     *      requires the separate stencil buffer."
     *
     * From the Ironlake PRM, volume 2 part 1, page 330:
     *
     *     "If this field (Separate Stencil Buffer Enable) is disabled,
     *      the Surface Format of the depth buffer cannot be
     *      D24_UNORM_X8_UINT."
     *
     * There is no similar restriction for GEN6.  But when
     * D24_UNORM_X8_UINT is indeed used, the depth values output by the
     * fragment shaders will be different when read back.
     *
     * As for GEN7+, separate_stencil is always true.
     */
    switch (format) {
    case VK_FORMAT_D16_UNORM:
        info->format = GEN6_ZFORMAT_D16_UNORM;
        break;
    case VK_FORMAT_D32_SFLOAT:
        info->format = GEN6_ZFORMAT_D32_FLOAT;
        break;
    case VK_FORMAT_D32_SFLOAT_S8_UINT:
        info->format = (separate_stencil) ?
            GEN6_ZFORMAT_D32_FLOAT :
            GEN6_ZFORMAT_D32_FLOAT_S8X24_UINT;
        break;
    case VK_FORMAT_S8_UINT:
        if (separate_stencil) {
            info->format = GEN6_ZFORMAT_D32_FLOAT;
            break;
        }
        /* fall through */
    default:
        assert(!"unsupported depth/stencil format");
        ds_init_info_null(gpu, info);
        return;
        break;
    }

    if (format != VK_FORMAT_S8_UINT)
        info->zs.stride = img->layout.bo_stride;

    if (img->s8_layout) {
        /*
         * From the Sandy Bridge PRM, volume 2 part 1, page 329:
         *
         *     "The pitch must be set to 2x the value computed based on
         *      width, as the stencil buffer is stored with two rows
         *      interleaved."
         *
         * According to the classic driver, we need to do the same for
         * GEN7+ even though the Ivy Bridge PRM does not say anything
         * about it.
         */
        info->stencil.stride = img->s8_layout->bo_stride * 2;

        if (intel_gpu_gen(gpu) == INTEL_GEN(6)) {
            unsigned x, y;

            assert(img->s8_layout->walk == INTEL_LAYOUT_WALK_LOD);

            /* offset to the level */
            intel_layout_get_slice_pos(img->s8_layout, level, 0, &x, &y);
            intel_layout_pos_to_mem(img->s8_layout, x, y, &x, &y);
            info->stencil.offset =
                intel_layout_mem_to_raw(img->s8_layout, x, y);
        }
    } else if (format == VK_FORMAT_S8_UINT) {
        info->stencil.stride = img->layout.bo_stride * 2;
    }

    if (intel_img_can_enable_hiz(img, level)) {
        info->hiz.stride = img->layout.aux_stride;

        /* offset to the level */
        if (intel_gpu_gen(gpu) == INTEL_GEN(6))
            info->hiz.offset = img->layout.aux_offsets[level];
    }

    info->width = img->layout.width0;
    info->height = img->layout.height0;
    info->depth = (img->type == VK_IMAGE_TYPE_3D) ?
        img->depth : num_layers;

    info->lod = level;
    info->first_layer = first_layer;
    info->num_layers = num_layers;
}
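/*
 * Example (editor's sketch): the interleaving rule above doubles the
 * stencil pitch, so an S8 layout whose bo_stride is 512 bytes is
 * programmed with a 1024-byte pitch in the stencil buffer state.
 */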
static void surface_state_tex_gen7(const struct intel_gpu *gpu,
                                   const struct intel_img *img,
                                   VkImageViewType type,
                                   VkFormat format,
                                   unsigned first_level,
                                   unsigned num_levels,
                                   unsigned first_layer,
                                   unsigned num_layers,
                                   VkComponentMapping swizzles,
                                   bool is_rt,
                                   uint32_t dw[8])
{
    int surface_type, surface_format;
    int width, height, depth, pitch, lod;

    INTEL_GPU_ASSERT(gpu, 7, 7.5);

    surface_type = view_type_to_surface_type(type);
    assert(surface_type != GEN6_SURFTYPE_BUFFER);

    surface_format = intel_format_translate_color(gpu, format);
    assert(surface_format >= 0);

    width = img->layout.width0;
    height = img->layout.height0;
    depth = (type == VK_IMAGE_VIEW_TYPE_3D) ?
        img->depth : num_layers;
    pitch = img->layout.bo_stride;

    if (surface_type == GEN6_SURFTYPE_CUBE) {
        /*
         * From the Ivy Bridge PRM, volume 4 part 1, page 70:
         *
         *     "For SURFTYPE_CUBE: For Sampling Engine Surfaces, the range
         *      of this field is [0,340], indicating the number of cube
         *      array elements (equal to the number of underlying 2D array
         *      elements divided by 6).  For other surfaces, this field
         *      must be zero."
         *
         * When is_rt is true, we treat the texture as a 2D one to avoid
         * the restriction.
         */
        if (is_rt) {
            surface_type = GEN6_SURFTYPE_2D;
        } else {
            assert(num_layers % 6 == 0);
            depth = num_layers / 6;
        }
    }

    /* sanity check the size */
    assert(width >= 1 && height >= 1 && depth >= 1 && pitch >= 1);
    assert(first_layer < 2048 && num_layers <= 2048);
    switch (surface_type) {
    case GEN6_SURFTYPE_1D:
        assert(width <= 16384 && height == 1 && depth <= 2048);
        break;
    case GEN6_SURFTYPE_2D:
        assert(width <= 16384 && height <= 16384 && depth <= 2048);
        break;
    case GEN6_SURFTYPE_3D:
        assert(width <= 2048 && height <= 2048 && depth <= 2048);
        if (!is_rt)
            assert(first_layer == 0);
        break;
    case GEN6_SURFTYPE_CUBE:
        assert(width <= 16384 && height <= 16384 && depth <= 86);
        assert(width == height);
        if (is_rt)
            assert(first_layer == 0);
        break;
    default:
        assert(!"unexpected surface type");
        break;
    }

    if (is_rt) {
        assert(num_levels == 1);
        lod = first_level;
    } else {
        lod = num_levels - 1;
    }

    /*
     * From the Ivy Bridge PRM, volume 4 part 1, page 68:
     *
     *     "The Base Address for linear render target surfaces and
     *      surfaces accessed with the typed surface read/write data port
     *      messages must be element-size aligned, for non-YUV surface
     *      formats, or a multiple of 2 element-sizes for YUV surface
     *      formats.  Other linear surfaces have no alignment requirements
     *      (byte alignment is sufficient)."
     *
     * From the Ivy Bridge PRM, volume 4 part 1, page 70:
     *
     *     "For linear render target surfaces and surfaces accessed with
     *      the typed data port messages, the pitch must be a multiple of
     *      the element size for non-YUV surface formats.  Pitch must be a
     *      multiple of 2 * element size for YUV surface formats.  For
     *      linear surfaces with Surface Type of SURFTYPE_STRBUF, the
     *      pitch must be a multiple of 4 bytes.  For other linear
     *      surfaces, the pitch can be any multiple of bytes."
     *
     * From the Ivy Bridge PRM, volume 4 part 1, page 74:
     *
     *     "For linear surfaces, this field (X Offset) must be zero."
     */
    if (img->layout.tiling == GEN6_TILING_NONE) {
        if (is_rt) {
            const int elem_size U_ASSERT_ONLY = icd_format_get_size(format);
            assert(pitch % elem_size == 0);
        }
    }

    assert(img->layout.tiling != GEN8_TILING_W);
    dw[0] = surface_type << GEN7_SURFACE_DW0_TYPE__SHIFT |
            surface_format << GEN7_SURFACE_DW0_FORMAT__SHIFT |
            img->layout.tiling << 13;

    /*
     * From the Ivy Bridge PRM, volume 4 part 1, page 63:
     *
     *     "If this field (Surface Array) is enabled, the Surface Type
     *      must be SURFTYPE_1D, SURFTYPE_2D, or SURFTYPE_CUBE.  If this
     *      field is disabled and Surface Type is SURFTYPE_1D,
     *      SURFTYPE_2D, or SURFTYPE_CUBE, the Depth field must be set to
     *      zero."
     *
     * For non-3D sampler surfaces, resinfo (the sampler message) always
     * returns zero for the number of layers when this field is not set.
     */
    if (surface_type != GEN6_SURFTYPE_3D) {
        if (num_layers > 1)
            dw[0] |= GEN7_SURFACE_DW0_IS_ARRAY;
        else
            assert(depth == 1);
    }

    assert(img->layout.align_i == 4 || img->layout.align_i == 8);
    assert(img->layout.align_j == 2 || img->layout.align_j == 4);

    if (img->layout.align_j == 4)
        dw[0] |= GEN7_SURFACE_DW0_VALIGN_4;

    if (img->layout.align_i == 8)
        dw[0] |= GEN7_SURFACE_DW0_HALIGN_8;

    if (img->layout.walk == INTEL_LAYOUT_WALK_LOD)
        dw[0] |= GEN7_SURFACE_DW0_ARYSPC_LOD0;
    else
        dw[0] |= GEN7_SURFACE_DW0_ARYSPC_FULL;

    if (is_rt)
        dw[0] |= GEN7_SURFACE_DW0_RENDER_CACHE_RW;

    if (surface_type == GEN6_SURFTYPE_CUBE && !is_rt)
        dw[0] |= GEN7_SURFACE_DW0_CUBE_FACE_ENABLES__MASK;

    dw[1] = 0;

    dw[2] = (height - 1) << GEN7_SURFACE_DW2_HEIGHT__SHIFT |
            (width - 1) << GEN7_SURFACE_DW2_WIDTH__SHIFT;

    dw[3] = (depth - 1) << GEN7_SURFACE_DW3_DEPTH__SHIFT |
            (pitch - 1);

    dw[4] = first_layer << 18 |
            (num_layers - 1) << 7;

    /*
     * MSFMT_MSS means the samples are not interleaved and
     * MSFMT_DEPTH_STENCIL means the samples are interleaved.  The layouts
     * are the same when the number of samples is 1.
     */
    if (img->layout.interleaved_samples && img->sample_count > 1) {
        assert(!is_rt);
        dw[4] |= GEN7_SURFACE_DW4_MSFMT_DEPTH_STENCIL;
    } else {
        dw[4] |= GEN7_SURFACE_DW4_MSFMT_MSS;
    }

    if (img->sample_count > 4)
        dw[4] |= GEN7_SURFACE_DW4_MULTISAMPLECOUNT_8;
    else if (img->sample_count > 2)
        dw[4] |= GEN7_SURFACE_DW4_MULTISAMPLECOUNT_4;
    else
        dw[4] |= GEN7_SURFACE_DW4_MULTISAMPLECOUNT_1;

    dw[5] = GEN7_MOCS_L3_WB << GEN7_SURFACE_DW5_MOCS__SHIFT |
            (first_level) << GEN7_SURFACE_DW5_MIN_LOD__SHIFT |
            lod;

    dw[6] = 0;
    dw[7] = 0;

    if (intel_gpu_gen(gpu) >= INTEL_GEN(7.5)) {
        dw[7] |=
            channel_swizzle_to_scs((swizzles.r == VK_COMPONENT_SWIZZLE_IDENTITY) ?
                VK_COMPONENT_SWIZZLE_R : swizzles.r) << GEN75_SURFACE_DW7_SCS_R__SHIFT |
            channel_swizzle_to_scs((swizzles.g == VK_COMPONENT_SWIZZLE_IDENTITY) ?
                VK_COMPONENT_SWIZZLE_G : swizzles.g) << GEN75_SURFACE_DW7_SCS_G__SHIFT |
            channel_swizzle_to_scs((swizzles.b == VK_COMPONENT_SWIZZLE_IDENTITY) ?
                VK_COMPONENT_SWIZZLE_B : swizzles.b) << GEN75_SURFACE_DW7_SCS_B__SHIFT |
            channel_swizzle_to_scs((swizzles.a == VK_COMPONENT_SWIZZLE_IDENTITY) ?
                VK_COMPONENT_SWIZZLE_A : swizzles.a) << GEN75_SURFACE_DW7_SCS_A__SHIFT;
    } else {
        assert(((swizzles.r == VK_COMPONENT_SWIZZLE_R) ||
                (swizzles.r == VK_COMPONENT_SWIZZLE_IDENTITY)) &&
               ((swizzles.g == VK_COMPONENT_SWIZZLE_G) ||
                (swizzles.g == VK_COMPONENT_SWIZZLE_IDENTITY)) &&
               ((swizzles.b == VK_COMPONENT_SWIZZLE_B) ||
                (swizzles.b == VK_COMPONENT_SWIZZLE_IDENTITY)) &&
               ((swizzles.a == VK_COMPONENT_SWIZZLE_A) ||
                (swizzles.a == VK_COMPONENT_SWIZZLE_IDENTITY)));
    }
}
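/*
 * Example (editor's sketch): a 512x512 2D texture with 10 mip levels
 * sampled from level 0 encodes height - 1 = 511 and width - 1 = 511 in
 * dw[2] and lod = num_levels - 1 = 9 in dw[5]; when is_rt is set, lod
 * instead selects the single rendered level via first_level.
 */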
static void surface_state_buf_gen7(const struct intel_gpu *gpu,
                                   unsigned offset, unsigned size,
                                   unsigned struct_size,
                                   VkFormat elem_format,
                                   bool is_rt, bool render_cache_rw,
                                   uint32_t dw[8])
{
    const bool typed = !icd_format_is_undef(elem_format);
    const bool structured = (!typed && struct_size > 1);
    const int elem_size = (typed) ?
        icd_format_get_size(elem_format) : 1;
    int width, height, depth, pitch;
    int surface_type, surface_format, num_entries;

    INTEL_GPU_ASSERT(gpu, 7, 7.5);

    surface_type = (structured) ? GEN7_SURFTYPE_STRBUF : GEN6_SURFTYPE_BUFFER;

    surface_format = (typed) ?
        intel_format_translate_color(gpu, elem_format) : GEN6_FORMAT_RAW;

    /*
     * It's possible that the buffer view being used is smaller than the
     * format element size (required to be 16 for non-fragment shaders).
     * Make certain that size is at least struct_size to keep the HW
     * happy.
     */
    if (size < struct_size)
        size = struct_size;

    num_entries = size / struct_size;
    /* see if there is enough space to fit another element */
    if (size % struct_size >= elem_size && !structured)
        num_entries++;

    /*
     * From the Ivy Bridge PRM, volume 4 part 1, page 67:
     *
     *     "For SURFTYPE_BUFFER render targets, this field (Surface Base
     *      Address) specifies the base address of first element of the
     *      surface.  The surface is interpreted as a simple array of that
     *      single element type.  The address must be naturally-aligned to
     *      the element size (e.g., a buffer containing R32G32B32A32_FLOAT
     *      elements must be 16-byte aligned)
     *
     *      For SURFTYPE_BUFFER non-rendertarget surfaces, this field
     *      specifies the base address of the first element of the
     *      surface, computed in software by adding the surface base
     *      address to the byte offset of the element in the buffer."
     */
    if (is_rt)
        assert(offset % elem_size == 0);

    /*
     * From the Ivy Bridge PRM, volume 4 part 1, page 68:
     *
     *     "For typed buffer and structured buffer surfaces, the number of
     *      entries in the buffer ranges from 1 to 2^27.  For raw buffer
     *      surfaces, the number of entries in the buffer is the number of
     *      bytes which can range from 1 to 2^30."
     */
    assert(num_entries >= 1 &&
           num_entries <= 1 << ((typed || structured) ? 27 : 30));

    /*
     * From the Ivy Bridge PRM, volume 4 part 1, page 69:
     *
     *     "For SURFTYPE_BUFFER: The low two bits of this field (Width)
     *      must be 11 if the Surface Format is RAW (the size of the
     *      buffer must be a multiple of 4 bytes)."
     *
     * From the Ivy Bridge PRM, volume 4 part 1, page 70:
     *
     *     "For surfaces of type SURFTYPE_BUFFER and SURFTYPE_STRBUF, this
     *      field (Surface Pitch) indicates the size of the structure."
     *
     *     "For linear surfaces with Surface Type of SURFTYPE_STRBUF, the
     *      pitch must be a multiple of 4 bytes."
     */
    if (structured)
        assert(struct_size % 4 == 0);
    else if (!typed)
        assert(num_entries % 4 == 0);

    pitch = struct_size;

    pitch--;
    num_entries--;
    /* bits [6:0] */
    width = (num_entries & 0x0000007f);
    /* bits [20:7] */
    height = (num_entries & 0x001fff80) >> 7;
    /* bits [30:21] */
    depth = (num_entries & 0x7fe00000) >> 21;
    /* limit to [26:21] */
    if (typed || structured)
        depth &= 0x3f;

    dw[0] = surface_type << GEN7_SURFACE_DW0_TYPE__SHIFT |
            surface_format << GEN7_SURFACE_DW0_FORMAT__SHIFT;
    if (render_cache_rw)
        dw[0] |= GEN7_SURFACE_DW0_RENDER_CACHE_RW;

    dw[1] = offset;

    dw[2] = height << GEN7_SURFACE_DW2_HEIGHT__SHIFT |
            width << GEN7_SURFACE_DW2_WIDTH__SHIFT;

    dw[3] = depth << GEN7_SURFACE_DW3_DEPTH__SHIFT |
            pitch;

    dw[4] = 0;

    dw[5] = GEN7_MOCS_L3_WB << GEN7_SURFACE_DW5_MOCS__SHIFT;

    dw[6] = 0;
    dw[7] = 0;

    if (intel_gpu_gen(gpu) >= INTEL_GEN(7.5)) {
        dw[7] |= GEN75_SCS_RED << GEN75_SURFACE_DW7_SCS_R__SHIFT |
                 GEN75_SCS_GREEN << GEN75_SURFACE_DW7_SCS_G__SHIFT |
                 GEN75_SCS_BLUE << GEN75_SURFACE_DW7_SCS_B__SHIFT |
                 GEN75_SCS_ALPHA << GEN75_SURFACE_DW7_SCS_A__SHIFT;
    }
}
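/*
 * Worked example (editor's sketch): a raw (GEN6_FORMAT_RAW) view of 256
 * bytes has struct_size = 1 and num_entries = 256, encoded above as
 * num_entries - 1 = 255: width = 127 (bits 6:0) and height = 1 (bits
 * 20:7), with the low two width bits set to 11b as the PRM requires for
 * RAW surfaces.
 */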
static void att_view_init_for_ds(struct intel_att_view *view,
                                 const struct intel_gpu *gpu,
                                 const struct intel_img *img,
                                 VkImageViewType view_type,
                                 VkFormat format, unsigned level,
                                 unsigned first_layer, unsigned num_layers)
{
    const int max_2d_size U_ASSERT_ONLY =
        (intel_gpu_gen(gpu) >= INTEL_GEN(7)) ? 16384 : 8192;
    const int max_array_size U_ASSERT_ONLY =
        (intel_gpu_gen(gpu) >= INTEL_GEN(7)) ? 2048 : 512;
    struct ds_surface_info info;
    uint32_t dw1, dw2, dw3, dw4, dw5, dw6;
    uint32_t *dw;

    INTEL_GPU_ASSERT(gpu, 6, 7.5);

    if (img) {
        ds_init_info(gpu, img, view_type, format, level,
                     first_layer, num_layers, &info);
    } else {
        ds_init_info_null(gpu, &info);
    }

    switch (info.surface_type) {
    case GEN6_SURFTYPE_NULL:
        break;
    case GEN6_SURFTYPE_1D:
        assert(info.width <= max_2d_size && info.height == 1 &&
               info.depth <= max_array_size);
        assert(info.first_layer < max_array_size - 1 &&
               info.num_layers <= max_array_size);
        break;
    case GEN6_SURFTYPE_2D:
        assert(info.width <= max_2d_size && info.height <= max_2d_size &&
               info.depth <= max_array_size);
        assert(info.first_layer < max_array_size - 1 &&
               info.num_layers <= max_array_size);
        break;
    case GEN6_SURFTYPE_3D:
        assert(info.width <= 2048 && info.height <= 2048 &&
               info.depth <= 2048);
        assert(info.first_layer < 2048 &&
               info.num_layers <= max_array_size);
        break;
    case GEN6_SURFTYPE_CUBE:
        assert(info.width <= max_2d_size && info.height <= max_2d_size &&
               info.depth == 1);
        assert(info.first_layer == 0 && info.num_layers == 1);
        assert(info.width == info.height);
        break;
    default:
        assert(!"unexpected depth surface type");
        break;
    }

    dw1 = info.surface_type << 29 |
          info.format << 18;

    if (info.zs.stride) {
        /* required for GEN6+ */
        assert(info.zs.stride > 0 && info.zs.stride < 128 * 1024 &&
               info.zs.stride % 128 == 0);
        assert(info.width <= info.zs.stride);

        dw1 |= (info.zs.stride - 1);
    }

    dw2 = 0;

    if (intel_gpu_gen(gpu) >= INTEL_GEN(7)) {
        if (info.zs.stride)
            dw1 |= 1 << 28;

        if (info.stencil.stride)
            dw1 |= 1 << 27;

        if (info.hiz.stride)
            dw1 |= 1 << 22;

        dw3 = (info.height - 1) << 18 |
              (info.width - 1) << 4 |
              info.lod;

        dw4 = (info.depth - 1) << 21 |
              info.first_layer << 10 |
              GEN7_MOCS_L3_WB;

        dw5 = 0;

        dw6 = (info.num_layers - 1) << 21;
    } else {
        /* always Y-tiled */
        dw1 |= 1 << 27 |
               1 << 26;

        if (info.hiz.stride) {
            dw1 |= 1 << 22 |
                   1 << 21;
        }

        dw3 = (info.height - 1) << 19 |
              (info.width - 1) << 6 |
              info.lod << 2 |
              GEN6_DEPTH_DW3_MIPLAYOUT_BELOW;

        dw4 = (info.depth - 1) << 21 |
              info.first_layer << 10 |
              (info.num_layers - 1) << 1;

        dw5 = 0;

        dw6 = 0;
    }

    STATIC_ASSERT(ARRAY_SIZE(view->att_cmd) >= 10);
    dw = view->att_cmd;
    dw[0] = dw1;
    dw[1] = dw2;
    dw[2] = dw3;
    dw[3] = dw4;
    dw[4] = dw5;
    dw[5] = dw6;

    /* separate stencil */
    if (info.stencil.stride) {
        assert(info.stencil.stride > 0 &&
               info.stencil.stride < 128 * 1024 &&
               info.stencil.stride % 128 == 0);

        dw[6] = info.stencil.stride - 1;
        dw[7] = img->s8_offset;

        if (intel_gpu_gen(gpu) >= INTEL_GEN(7))
            dw[6] |= GEN7_MOCS_L3_WB << GEN6_STENCIL_DW1_MOCS__SHIFT;
        if (intel_gpu_gen(gpu) >= INTEL_GEN(7.5))
            dw[6] |= GEN75_STENCIL_DW1_STENCIL_BUFFER_ENABLE;
    } else {
        dw[6] = 0;
        dw[7] = 0;
    }

    /* hiz */
    if (info.hiz.stride) {
        dw[8] = info.hiz.stride - 1;
        dw[9] = img->aux_offset;

        if (intel_gpu_gen(gpu) >= INTEL_GEN(7))
            dw[8] |= GEN7_MOCS_L3_WB << GEN6_HIZ_DW1_MOCS__SHIFT;
    } else {
        dw[8] = 0;
        dw[9] = 0;
    }

    view->has_stencil = info.stencil.stride;
    view->has_hiz = info.hiz.stride;
}
static void layout_calculate_hiz_size(struct intel_layout *layout,
                                      struct intel_layout_params *params)
{
    const VkImageCreateInfo *info = params->info;
    const unsigned hz_align_j = 8;
    enum intel_layout_walk_type hz_walk;
    unsigned hz_width, hz_height, lv;
    unsigned hz_clear_w, hz_clear_h;

    assert(layout->aux == INTEL_LAYOUT_AUX_HIZ);

    assert(layout->walk == INTEL_LAYOUT_WALK_LAYER ||
           layout->walk == INTEL_LAYOUT_WALK_3D);

    /*
     * From the Sandy Bridge PRM, volume 2 part 1, page 312:
     *
     *     "The hierarchical depth buffer does not support the LOD field,
     *      it is assumed by hardware to be zero.  A separate
     *      hierarachical depth buffer is required for each LOD used, and
     *      the corresponding buffer's state delivered to hardware each
     *      time a new depth buffer state with modified LOD is delivered."
     *
     * We will put all LODs in a single bo with INTEL_LAYOUT_WALK_LOD.
     */
    if (intel_gpu_gen(params->gpu) >= INTEL_GEN(7))
        hz_walk = layout->walk;
    else
        hz_walk = INTEL_LAYOUT_WALK_LOD;

    /*
     * See the Sandy Bridge PRM, volume 2 part 1, page 312, and the Ivy
     * Bridge PRM, volume 2 part 1, page 312-313.
     *
     * It seems the HiZ buffer is aligned to 8x8, with every two rows
     * packed into a memory row.
     */
    switch (hz_walk) {
    case INTEL_LAYOUT_WALK_LOD:
        {
            unsigned lod_tx[INTEL_LAYOUT_MAX_LEVELS];
            unsigned lod_ty[INTEL_LAYOUT_MAX_LEVELS];
            unsigned cur_tx, cur_ty;

            /* figure out the tile offsets of LODs */
            hz_width = 0;
            hz_height = 0;
            cur_tx = 0;
            cur_ty = 0;
            for (lv = 0; lv < info->mipLevels; lv++) {
                unsigned tw, th;

                lod_tx[lv] = cur_tx;
                lod_ty[lv] = cur_ty;

                tw = u_align(layout->lods[lv].slice_width, 16);
                th = u_align(layout->lods[lv].slice_height, hz_align_j) *
                    info->arrayLayers / 2;
                /* convert to Y-tiles */
                tw = u_align(tw, 128) / 128;
                th = u_align(th, 32) / 32;

                if (hz_width < cur_tx + tw)
                    hz_width = cur_tx + tw;
                if (hz_height < cur_ty + th)
                    hz_height = cur_ty + th;

                if (lv == 1)
                    cur_tx += tw;
                else
                    cur_ty += th;
            }

            /* convert tile offsets to memory offsets */
            for (lv = 0; lv < info->mipLevels; lv++) {
                layout->aux_offsets[lv] =
                    (lod_ty[lv] * hz_width + lod_tx[lv]) * 4096;
            }

            hz_width *= 128;
            hz_height *= 32;
        }
        break;
    case INTEL_LAYOUT_WALK_LAYER:
        {
            const unsigned h0 = u_align(params->h0, hz_align_j);
            const unsigned h1 = u_align(params->h1, hz_align_j);
            const unsigned htail =
                ((intel_gpu_gen(params->gpu) >= INTEL_GEN(7)) ? 12 : 11) *
                hz_align_j;
            const unsigned hz_qpitch = h0 + h1 + htail;

            hz_width = u_align(layout->lods[0].slice_width, 16);

            hz_height = hz_qpitch * info->arrayLayers / 2;
            if (intel_gpu_gen(params->gpu) >= INTEL_GEN(7))
                hz_height = u_align(hz_height, 8);

            layout->aux_layer_height = hz_qpitch;
        }
        break;
    case INTEL_LAYOUT_WALK_3D:
        hz_width = u_align(layout->lods[0].slice_width, 16);

        hz_height = 0;
        for (lv = 0; lv < info->mipLevels; lv++) {
            const unsigned h =
                u_align(layout->lods[lv].slice_height, hz_align_j);
            /* according to the formula, slices are packed together
             * vertically */
            hz_height += h * u_minify(info->extent.depth, lv);
        }
        hz_height /= 2;
        break;
    default:
        assert(!"unknown layout walk");
        hz_width = 0;
        hz_height = 0;
        break;
    }

    /*
     * In hiz_align_fb(), we will align the LODs to 8x4 sample blocks.
     * Experiments on Haswell show that aligning the RECTLIST primitive
     * and 3DSTATE_DRAWING_RECTANGLE alone are not enough.  The LOD sizes
     * must be aligned.
     */
    hz_clear_w = 8;
    hz_clear_h = 4;
    switch (info->samples) {
    case VK_SAMPLE_COUNT_1_BIT:
    default:
        break;
    case VK_SAMPLE_COUNT_2_BIT:
        hz_clear_w /= 2;
        break;
    case VK_SAMPLE_COUNT_4_BIT:
        hz_clear_w /= 2;
        hz_clear_h /= 2;
        break;
    case VK_SAMPLE_COUNT_8_BIT:
        hz_clear_w /= 4;
        hz_clear_h /= 2;
        break;
    case VK_SAMPLE_COUNT_16_BIT:
        hz_clear_w /= 4;
        hz_clear_h /= 4;
        break;
    }

    for (lv = 0; lv < info->mipLevels; lv++) {
        if (u_minify(layout->width0, lv) % hz_clear_w ||
            u_minify(layout->height0, lv) % hz_clear_h)
            break;
        layout->aux_enables |= 1 << lv;
    }

    /* we padded to allow this in layout_align() */
    if (info->mipLevels == 1 && info->arrayLayers == 1 &&
        info->extent.depth == 1)
        layout->aux_enables |= 0x1;

    /* align to Y-tile */
    layout->aux_stride = u_align(hz_width, 128);
    layout->aux_height = u_align(hz_height, 32);
}
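/*
 * Example (editor's sketch): with VK_SAMPLE_COUNT_4_BIT the loop above
 * tests 4x2 clear blocks, so a 640x360 level keeps HiZ enabled
 * (640 % 4 == 0 and 360 % 2 == 0), while a 642x361 level would break
 * out and leave that LOD and all smaller ones out of aux_enables.
 */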
VkResult intel_gpu_create(const struct intel_instance *instance,
                          int devid, const char *primary_node,
                          const char *render_node,
                          struct intel_gpu **gpu_ret)
{
    const int gen = devid_to_gen(devid);
    size_t primary_len, render_len;
    struct intel_gpu *gpu;

    if (gen < 0) {
        intel_log(instance, VK_DEBUG_REPORT_WARNING_BIT_EXT, 0,
                  VK_NULL_HANDLE, 0, 0,
                  "unsupported device id 0x%04x", devid);
        return VK_ERROR_INITIALIZATION_FAILED;
    }

    gpu = intel_alloc(instance, sizeof(*gpu), sizeof(int),
                      VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
    if (!gpu)
        return VK_ERROR_OUT_OF_HOST_MEMORY;

    memset(gpu, 0, sizeof(*gpu));

    /* there is no VK_DBG_OBJECT_GPU */
    intel_handle_init(&gpu->handle,
                      VK_DEBUG_REPORT_OBJECT_TYPE_PHYSICAL_DEVICE_EXT,
                      instance);

    gpu->devid = devid;

    primary_len = strlen(primary_node);
    render_len = (render_node) ? strlen(render_node) : 0;

    gpu->primary_node = intel_alloc(gpu, primary_len + 1 +
            ((render_len) ? (render_len + 1) : 0),
            sizeof(int), VK_SYSTEM_ALLOCATION_SCOPE_INSTANCE);
    if (!gpu->primary_node) {
        intel_free(instance, gpu);
        return VK_ERROR_OUT_OF_HOST_MEMORY;
    }

    memcpy(gpu->primary_node, primary_node, primary_len + 1);

    if (render_node) {
        gpu->render_node = gpu->primary_node + primary_len + 1;
        memcpy(gpu->render_node, render_node, render_len + 1);
    } else {
        gpu->render_node = gpu->primary_node;
    }

    gpu->gen_opaque = gen;

    switch (intel_gpu_gen(gpu)) {
    case INTEL_GEN(7.5):
        gpu->gt = gen_get_hsw_gt(devid);
        break;
    case INTEL_GEN(7):
        gpu->gt = gen_get_ivb_gt(devid);
        break;
    case INTEL_GEN(6):
        gpu->gt = gen_get_snb_gt(devid);
        break;
    }

    /* 150K dwords */
    gpu->max_batch_buffer_size = sizeof(uint32_t) * 150 * 1024;

    /* the winsys is prepared for one reloc every two dwords, then minus 2 */
    gpu->batch_buffer_reloc_count =
        gpu->max_batch_buffer_size / sizeof(uint32_t) / 2 - 2;

    gpu->primary_fd_internal = -1;
    gpu->render_fd_internal = -1;

    *gpu_ret = gpu;

    return VK_SUCCESS;
}
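/*
 * Usage sketch (editor's addition; the device id and node paths are
 * hypothetical): enumeration code is expected to call
 *
 *   struct intel_gpu *gpu;
 *   VkResult res = intel_gpu_create(instance, 0x0416,
 *           "/dev/dri/card0", "/dev/dri/renderD128", &gpu);
 *
 * where 0x0416 would map to Gen 7.5 (Haswell); an id that
 * devid_to_gen() does not know fails with
 * VK_ERROR_INITIALIZATION_FAILED.
 */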