VkResult intel_query_create(struct intel_dev *dev, const VkQueryPoolCreateInfo *info, struct intel_query **query_ret) { struct intel_query *query; query = (struct intel_query *) intel_base_create(&dev->base.handle, sizeof(*query), dev->base.dbg, VK_DEBUG_REPORT_OBJECT_TYPE_QUEUE_EXT, info, 0); if (!query) return VK_ERROR_OUT_OF_HOST_MEMORY; query->type = info->queryType; query->slot_count = info->queryCount; /* * For each query type, the GPU will be asked to write the values of some * registers to a buffer before and after a sequence of commands. We will * compare the differences to get the query results. */ switch (info->queryType) { case VK_QUERY_TYPE_OCCLUSION: query->slot_stride = u_align(sizeof(uint64_t) * 2, 64); break; case VK_QUERY_TYPE_PIPELINE_STATISTICS: query_init_pipeline_statistics(dev, info, query); break; case VK_QUERY_TYPE_TIMESTAMP: query->slot_stride = u_align(sizeof(uint64_t), 64); break; default: assert(!"unknown query type"); break; } VkMemoryAllocateInfo mem_reqs; mem_reqs.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; mem_reqs.allocationSize = query->slot_stride * query->slot_count; mem_reqs.pNext = NULL; mem_reqs.memoryTypeIndex = 0; intel_mem_alloc(dev, &mem_reqs, &query->obj.mem); query->obj.destroy = query_destroy; *query_ret = query; return VK_SUCCESS; }
/** * Grow a mapped writer to at least \p new_size. Failures are handled * silently. */ void cmd_writer_grow(struct intel_cmd *cmd, enum intel_cmd_writer_type which, size_t new_size) { struct intel_cmd_writer *writer = &cmd->writers[which]; struct intel_bo *new_bo; void *new_ptr; if (new_size < writer->size << 1) new_size = writer->size << 1; /* STATE_BASE_ADDRESS requires page-aligned buffers */ new_size = u_align(new_size, 4096); new_bo = alloc_writer_bo(cmd->dev->winsys, which, new_size); if (!new_bo) { cmd_writer_discard(cmd, which); cmd_fail(cmd, VK_ERROR_OUT_OF_DEVICE_MEMORY); return; } /* map and copy the data over */ new_ptr = intel_bo_map(new_bo, true); if (!new_ptr) { intel_bo_unref(new_bo); cmd_writer_discard(cmd, which); cmd_fail(cmd, VK_ERROR_VALIDATION_FAILED_EXT); return; } memcpy(new_ptr, writer->ptr, writer->used); intel_bo_unmap(writer->bo); intel_bo_unref(writer->bo); writer->size = new_size; writer->bo = new_bo; writer->ptr = new_ptr; }
static void query_init_pipeline_statistics( struct intel_dev *dev, const VkQueryPoolCreateInfo *info, struct intel_query *query) { /* * Note: order defined by Vulkan spec. */ const uint32_t regs[][2] = { {VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT, GEN6_REG_IA_PRIMITIVES_COUNT}, {VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT, GEN6_REG_VS_INVOCATION_COUNT}, {VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT, GEN6_REG_GS_INVOCATION_COUNT}, {VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT, GEN6_REG_GS_PRIMITIVES_COUNT}, {VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT, GEN6_REG_CL_INVOCATION_COUNT}, {VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT, GEN6_REG_CL_PRIMITIVES_COUNT}, {VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT, GEN6_REG_PS_INVOCATION_COUNT}, {VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT, (intel_gpu_gen(dev->gpu) >= INTEL_GEN(7)) ? GEN7_REG_HS_INVOCATION_COUNT : 0}, {VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT, (intel_gpu_gen(dev->gpu) >= INTEL_GEN(7)) ? GEN7_REG_DS_INVOCATION_COUNT : 0}, {VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT, 0} }; STATIC_ASSERT(ARRAY_SIZE(regs) < 32); uint32_t i; uint32_t reg_count = 0; /* * Only query registers indicated via pipeline statistics flags. * If HW does not support a flag, fill value with 0. */ for (i=0; i < ARRAY_SIZE(regs); i++) { if ((regs[i][0] & info->pipelineStatistics)) { query->regs[reg_count] = regs[i][1]; reg_count++; } } query->reg_count = reg_count; query->slot_stride = u_align(reg_count * sizeof(uint64_t) * 2, 64); }
/**
 * Report the memory requirements of a buffer object.
 *
 * \param base          base object of the buffer
 * \param pRequirements receives the size, alignment and memory type mask
 * \return always VK_SUCCESS
 */
static VkResult buf_get_memory_requirements(struct intel_base *base,
                                            VkMemoryRequirements *pRequirements)
{
    const struct intel_buf *buf = intel_buf_from_base(base);

    /*
     * From the Sandy Bridge PRM, volume 1 part 1, page 118:
     *
     *     "For buffers, which have no inherent "height," padding
     *      requirements are different. A buffer must be padded to the
     *      next multiple of 256 array elements, with an additional 16
     *      bytes added beyond that to account for the L1 cache line."
     *
     * Storage texel buffers are padded the same way as uniform texel
     * buffers.  Padding uniform buffers as well is kept from the previous
     * behavior; over-padding only costs memory.
     *
     * NOTE(review): confirm that every usage listed here (and no other) is
     * accessed as a buffer surface subject to this rule.
     */
    pRequirements->size = buf->size;
    if (buf->usage & (VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT |
                      VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT |
                      VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT)) {
        pRequirements->size = u_align(pRequirements->size, 256) + 16;
    }

    pRequirements->alignment = 4096;
    /* every memory type is acceptable; shift an unsigned 1 to build the mask */
    pRequirements->memoryTypeBits = (1u << INTEL_MEMORY_TYPE_COUNT) - 1;

    return VK_SUCCESS;
}
VkResult intel_img_create(struct intel_dev *dev, const VkImageCreateInfo *info, const VkAllocationCallbacks *allocator, bool scanout, struct intel_img **img_ret) { struct intel_img *img; struct intel_layout *layout; img = (struct intel_img *) intel_base_create(&dev->base.handle, sizeof(*img), dev->base.dbg, VK_DEBUG_REPORT_OBJECT_TYPE_IMAGE_EXT, info, 0); if (!img) return VK_ERROR_OUT_OF_HOST_MEMORY; layout = &img->layout; img->type = info->imageType; img->depth = info->extent.depth; img->mip_levels = info->mipLevels; img->array_size = info->arrayLayers; img->usage = info->usage; img->sample_count = (uint32_t) info->samples; intel_layout_init(layout, dev, info, scanout); img->total_size = img->layout.bo_stride * img->layout.bo_height; if (layout->aux != INTEL_LAYOUT_AUX_NONE) { img->aux_offset = u_align(img->total_size, 4096); img->total_size = img->aux_offset + layout->aux_stride * layout->aux_height; } if (layout->separate_stencil) { VkImageCreateInfo s8_info; img->s8_layout = intel_alloc(img, sizeof(*img->s8_layout), sizeof(int), VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (!img->s8_layout) { intel_img_destroy(img); return VK_ERROR_OUT_OF_HOST_MEMORY; } s8_info = *info; s8_info.format = VK_FORMAT_S8_UINT; /* no stencil texturing */ s8_info.usage &= ~VK_IMAGE_USAGE_SAMPLED_BIT; assert(icd_format_is_ds(info->format)); intel_layout_init(img->s8_layout, dev, &s8_info, scanout); img->s8_offset = u_align(img->total_size, 4096); img->total_size = img->s8_offset + img->s8_layout->bo_stride * img->s8_layout->bo_height; } if (scanout) { VkResult ret = intel_wsi_img_init(img); if (ret != VK_SUCCESS) { intel_img_destroy(img); return ret; } } img->obj.destroy = img_destroy; img->obj.base.get_memory_requirements = img_get_memory_requirements; *img_ret = img; return VK_SUCCESS; }
/**
 * Compute the size and the per-LOD state of the HiZ auxiliary buffer.
 *
 * Writes layout->aux_stride and layout->aux_height, ORs the LODs that may
 * use HiZ into layout->aux_enables, and, depending on the walk type used for
 * HiZ, fills layout->aux_offsets (LOD walk) or layout->aux_layer_height
 * (layer walk).
 *
 * Must only be called when layout->aux is INTEL_LAYOUT_AUX_HIZ and the main
 * surface uses the layer or the 3D walk (both are asserted).
 */
static void layout_calculate_hiz_size(struct intel_layout *layout,
                                      struct intel_layout_params *params)
{
    const VkImageCreateInfo *info = params->info;
    /* vertical alignment applied to every slice height below */
    const unsigned hz_align_j = 8;
    enum intel_layout_walk_type hz_walk;
    unsigned hz_width, hz_height, lv;
    unsigned hz_clear_w, hz_clear_h;

    assert(layout->aux == INTEL_LAYOUT_AUX_HIZ);

    assert(layout->walk == INTEL_LAYOUT_WALK_LAYER ||
           layout->walk == INTEL_LAYOUT_WALK_3D);

    /*
     * From the Sandy Bridge PRM, volume 2 part 1, page 312:
     *
     *     "The hierarchical depth buffer does not support the LOD field, it
     *      is assumed by hardware to be zero. A separate hierarchical depth
     *      buffer is required for each LOD used, and the corresponding
     *      buffer's state delivered to hardware each time a new depth buffer
     *      state with modified LOD is delivered."
     *
     * We will put all LODs in a single bo with INTEL_LAYOUT_WALK_LOD.
     */
    if (intel_gpu_gen(params->gpu) >= INTEL_GEN(7))
        hz_walk = layout->walk;
    else
        hz_walk = INTEL_LAYOUT_WALK_LOD;

    /*
     * See the Sandy Bridge PRM, volume 2 part 1, page 312, and the Ivy Bridge
     * PRM, volume 2 part 1, page 312-313.
     *
     * It seems HiZ buffer is aligned to 8x8, with every two rows packed into a
     * memory row.
     */
    switch (hz_walk) {
    case INTEL_LAYOUT_WALK_LOD:
        {
            /* position of each LOD, counted in tiles */
            unsigned lod_tx[INTEL_LAYOUT_MAX_LEVELS];
            unsigned lod_ty[INTEL_LAYOUT_MAX_LEVELS];
            unsigned cur_tx, cur_ty;

            /* figure out the tile offsets of LODs */
            hz_width = 0;
            hz_height = 0;
            cur_tx = 0;
            cur_ty = 0;
            for (lv = 0; lv < info->mipLevels; lv++) {
                unsigned tw, th;

                lod_tx[lv] = cur_tx;
                lod_ty[lv] = cur_ty;

                tw = u_align(layout->lods[lv].slice_width, 16);
                /* all layers of the LOD are stacked; two rows per memory row */
                th = u_align(layout->lods[lv].slice_height, hz_align_j) *
                    info->arrayLayers / 2;
                /* convert to Y-tiles */
                tw = u_align(tw, 128) / 128;
                th = u_align(th, 32) / 32;

                /* grow the bounding box, still counted in tiles */
                if (hz_width < cur_tx + tw)
                    hz_width = cur_tx + tw;
                if (hz_height < cur_ty + th)
                    hz_height = cur_ty + th;

                /* only LOD 1 advances horizontally; the others go down */
                if (lv == 1)
                    cur_tx += tw;
                else
                    cur_ty += th;
            }

            /* convert tile offsets to memory offsets */
            for (lv = 0; lv < info->mipLevels; lv++) {
                layout->aux_offsets[lv] =
                    (lod_ty[lv] * hz_width + lod_tx[lv]) * 4096;
            }

            /* convert the bounding box from tiles back to bytes and rows */
            hz_width *= 128;
            hz_height *= 32;
        }
        break;
    case INTEL_LAYOUT_WALK_LAYER:
        {
            const unsigned h0 = u_align(params->h0, hz_align_j);
            const unsigned h1 = u_align(params->h1, hz_align_j);
            const unsigned htail =
                ((intel_gpu_gen(params->gpu) >= INTEL_GEN(7)) ? 12 : 11) *
                hz_align_j;
            const unsigned hz_qpitch = h0 + h1 + htail;

            hz_width = u_align(layout->lods[0].slice_width, 16);

            hz_height = hz_qpitch * info->arrayLayers / 2;
            if (intel_gpu_gen(params->gpu) >= INTEL_GEN(7))
                hz_height = u_align(hz_height, 8);

            layout->aux_layer_height = hz_qpitch;
        }
        break;
    case INTEL_LAYOUT_WALK_3D:
        hz_width = u_align(layout->lods[0].slice_width, 16);

        hz_height = 0;
        for (lv = 0; lv < info->mipLevels; lv++) {
            const unsigned h =
                u_align(layout->lods[lv].slice_height, hz_align_j);
            /* according to the formula, slices are packed together vertically */
            hz_height += h * u_minify(info->extent.depth, lv);
        }
        hz_height /= 2;
        break;
    default:
        assert(!"unknown layout walk");
        hz_width = 0;
        hz_height = 0;
        break;
    }

    /*
     * In hiz_align_fb(), we will align the LODs to 8x4 sample blocks.
     * Experiments on Haswell show that aligning the RECTLIST primitive and
     * 3DSTATE_DRAWING_RECTANGLE alone are not enough. The LOD sizes must be
     * aligned.
     */
    hz_clear_w = 8;
    hz_clear_h = 4;
    /* shrink the required pixel alignment according to the sample count */
    switch (info->samples) {
    case VK_SAMPLE_COUNT_1_BIT:
    default:
        break;
    case VK_SAMPLE_COUNT_2_BIT:
        hz_clear_w /= 2;
        break;
    case VK_SAMPLE_COUNT_4_BIT:
        hz_clear_w /= 2;
        hz_clear_h /= 2;
        break;
    case VK_SAMPLE_COUNT_8_BIT:
        hz_clear_w /= 4;
        hz_clear_h /= 2;
        break;
    case VK_SAMPLE_COUNT_16_BIT:
        hz_clear_w /= 4;
        hz_clear_h /= 4;
        break;
    }

    /* enable HiZ for the leading LODs whose sizes are suitably aligned */
    for (lv = 0; lv < info->mipLevels; lv++) {
        if (u_minify(layout->width0, lv) % hz_clear_w ||
            u_minify(layout->height0, lv) % hz_clear_h)
            break;
        layout->aux_enables |= 1 << lv;
    }

    /* we padded to allow this in layout_align() */
    if (info->mipLevels == 1 && info->arrayLayers == 1 &&
        info->extent.depth == 1)
        layout->aux_enables |= 0x1;

    /* align to Y-tile */
    layout->aux_stride = u_align(hz_width, 128);
    layout->aux_height = u_align(hz_height, 32);
}
/* note that this may force the texture to be linear */ static void layout_calculate_bo_size(struct intel_layout *layout, struct intel_layout_params *params) { assert(params->max_x % layout->block_width == 0); assert(params->max_y % layout->block_height == 0); assert(layout->layer_height % layout->block_height == 0); layout->bo_stride = (params->max_x / layout->block_width) * layout->block_size; layout->bo_height = params->max_y / layout->block_height; while (true) { unsigned w = layout->bo_stride, h = layout->bo_height; unsigned align_w, align_h; /* * From the Haswell PRM, volume 5, page 163: * * "For linear surfaces, additional padding of 64 bytes is required * at the bottom of the surface. This is in addition to the padding * required above." */ if (intel_gpu_gen(params->gpu) >= INTEL_GEN(7.5) && (params->info->usage & VK_IMAGE_USAGE_SAMPLED_BIT) && layout->tiling == GEN6_TILING_NONE) h += (64 + layout->bo_stride - 1) / layout->bo_stride; /* * From the Sandy Bridge PRM, volume 4 part 1, page 81: * * "- For linear render target surfaces, the pitch must be a * multiple of the element size for non-YUV surface formats. * Pitch must be a multiple of 2 * element size for YUV surface * formats. * - For other linear surfaces, the pitch can be any multiple of * bytes. * - For tiled surfaces, the pitch must be a multiple of the tile * width." * * Different requirements may exist when the bo is used in different * places, but our alignments here should be good enough that we do not * need to check layout->info->usage. */ switch (layout->tiling) { case GEN6_TILING_X: align_w = 512; align_h = 8; break; case GEN6_TILING_Y: align_w = 128; align_h = 32; break; case GEN8_TILING_W: /* * From the Sandy Bridge PRM, volume 1 part 2, page 22: * * "A 4KB tile is subdivided into 8-high by 8-wide array of * Blocks for W-Major Tiles (W Tiles). Each Block is 8 rows by 8 * bytes." 
*/ align_w = 64; align_h = 64; break; default: assert(layout->tiling == GEN6_TILING_NONE); /* some good enough values */ align_w = 64; align_h = 2; break; } w = u_align(w, align_w); h = u_align(h, align_h); /* make sure the bo is mappable */ if (layout->tiling != GEN6_TILING_NONE) { /* * Usually only the first 256MB of the GTT is mappable. * * See also how intel_context::max_gtt_map_object_size is calculated. */ const size_t mappable_gtt_size = 256 * 1024 * 1024; /* * Be conservative. We may be able to switch from VALIGN_4 to * VALIGN_2 if the layout was Y-tiled, but let's keep it simple. */ if (mappable_gtt_size / w / 4 < h) { if (layout->valid_tilings & LAYOUT_TILING_NONE) { layout->tiling = GEN6_TILING_NONE; /* MCS support for non-MSRTs is limited to tiled RTs */ if (layout->aux == INTEL_LAYOUT_AUX_MCS && params->info->samples == VK_SAMPLE_COUNT_1_BIT) layout->aux = INTEL_LAYOUT_AUX_NONE; continue; } else { /* mapping will fail */ } } } layout->bo_stride = w; layout->bo_height = h; break; } }
static void layout_align(struct intel_layout *layout, struct intel_layout_params *params) { const VkImageCreateInfo *info = params->info; int align_w = 1, align_h = 1, pad_h = 0; /* * From the Sandy Bridge PRM, volume 1 part 1, page 118: * * "To determine the necessary padding on the bottom and right side of * the surface, refer to the table in Section 7.18.3.4 for the i and j * parameters for the surface format in use. The surface must then be * extended to the next multiple of the alignment unit size in each * dimension, and all texels contained in this extended surface must * have valid GTT entries." * * "For cube surfaces, an additional two rows of padding are required * at the bottom of the surface. This must be ensured regardless of * whether the surface is stored tiled or linear. This is due to the * potential rotation of cache line orientation from memory to cache." * * "For compressed textures (BC* and FXT1 surface formats), padding at * the bottom of the surface is to an even compressed row, which is * equal to a multiple of 8 uncompressed texel rows. Thus, for padding * purposes, these surfaces behave as if j = 8 only for surface * padding purposes. The value of 4 for j still applies for mip level * alignment and QPitch calculation." */ if (info->usage & VK_IMAGE_USAGE_SAMPLED_BIT) { if (align_w < layout->align_i) align_w = layout->align_i; if (align_h < layout->align_j) align_h = layout->align_j; /* in case it is used as a cube */ if (info->imageType == VK_IMAGE_TYPE_2D) pad_h += 2; if (params->compressed && align_h < layout->align_j * 2) align_h = layout->align_j * 2; } /* * From the Sandy Bridge PRM, volume 1 part 1, page 118: * * "If the surface contains an odd number of rows of data, a final row * below the surface must be allocated." */ if ((info->usage & VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT) && align_h < 2) align_h = 2; /* * Depth Buffer Clear/Resolve works in 8x4 sample blocks. 
In * intel_texture_can_enable_hiz(), we always return true for the first slice. * To avoid out-of-bound access, we have to pad. */ if (layout->aux == INTEL_LAYOUT_AUX_HIZ && info->mipLevels == 1 && info->arrayLayers == 1 && info->extent.depth == 1) { if (align_w < 8) align_w = 8; if (align_h < 4) align_h = 4; } params->max_x = u_align(params->max_x, align_w); params->max_y = u_align(params->max_y + pad_h, align_h); }
static void layout_get_slice_size(const struct intel_layout *layout, const struct intel_layout_params *params, unsigned level, unsigned *width, unsigned *height) { const VkImageCreateInfo *info = params->info; unsigned w, h; w = u_minify(layout->width0, level); h = u_minify(layout->height0, level); /* * From the Sandy Bridge PRM, volume 1 part 1, page 114: * * "The dimensions of the mip maps are first determined by applying the * sizing algorithm presented in Non-Power-of-Two Mipmaps above. Then, * if necessary, they are padded out to compression block boundaries." */ w = u_align(w, layout->block_width); h = u_align(h, layout->block_height); /* * From the Sandy Bridge PRM, volume 1 part 1, page 111: * * "If the surface is multisampled (4x), these values must be adjusted * as follows before proceeding: * * W_L = ceiling(W_L / 2) * 4 * H_L = ceiling(H_L / 2) * 4" * * From the Ivy Bridge PRM, volume 1 part 1, page 108: * * "If the surface is multisampled and it is a depth or stencil surface * or Multisampled Surface StorageFormat in SURFACE_STATE is * MSFMT_DEPTH_STENCIL, W_L and H_L must be adjusted as follows before * proceeding: * * #samples W_L = H_L = * 2 ceiling(W_L / 2) * 4 HL [no adjustment] * 4 ceiling(W_L / 2) * 4 ceiling(H_L / 2) * 4 * 8 ceiling(W_L / 2) * 8 ceiling(H_L / 2) * 4 * 16 ceiling(W_L / 2) * 8 ceiling(H_L / 2) * 8" * * For interleaved samples (4x), where pixels * * (x, y ) (x+1, y ) * (x, y+1) (x+1, y+1) * * would be is occupied by * * (x, y , si0) (x+1, y , si0) (x, y , si1) (x+1, y , si1) * (x, y+1, si0) (x+1, y+1, si0) (x, y+1, si1) (x+1, y+1, si1) * (x, y , si2) (x+1, y , si2) (x, y , si3) (x+1, y , si3) * (x, y+1, si2) (x+1, y+1, si2) (x, y+1, si3) (x+1, y+1, si3) * * Thus the need to * * w = align(w, 2) * 2; * y = align(y, 2) * 2; */ if (layout->interleaved_samples) { switch (info->samples) { case VK_SAMPLE_COUNT_1_BIT: break; case VK_SAMPLE_COUNT_2_BIT: w = u_align(w, 2) * 2; break; case VK_SAMPLE_COUNT_4_BIT: w = u_align(w, 2) * 2; h = 
u_align(h, 2) * 2; break; case VK_SAMPLE_COUNT_8_BIT: w = u_align(w, 2) * 4; h = u_align(h, 2) * 2; break; case VK_SAMPLE_COUNT_16_BIT: w = u_align(w, 2) * 4; h = u_align(h, 2) * 4; break; default: assert(!"unsupported sample count"); break; } } /* * From the Ivy Bridge PRM, volume 1 part 1, page 108: * * "For separate stencil buffer, the width must be mutiplied by 2 and * height divided by 2..." * * To make things easier (for transfer), we will just double the stencil * stride in 3DSTATE_STENCIL_BUFFER. */ w = u_align(w, layout->align_i); h = u_align(h, layout->align_j); *width = w; *height = h; }
static void layout_init_lods(struct intel_layout *layout, struct intel_layout_params *params) { const VkImageCreateInfo *info = params->info; unsigned cur_x, cur_y; unsigned lv; cur_x = 0; cur_y = 0; for (lv = 0; lv < info->mipLevels; lv++) { unsigned lod_w, lod_h; layout_get_slice_size(layout, params, lv, &lod_w, &lod_h); layout->lods[lv].x = cur_x; layout->lods[lv].y = cur_y; layout->lods[lv].slice_width = lod_w; layout->lods[lv].slice_height = lod_h; switch (layout->walk) { case INTEL_LAYOUT_WALK_LOD: lod_h *= layout_get_num_layers(layout, params); if (lv == 1) cur_x += lod_w; else cur_y += lod_h; /* every LOD begins at tile boundaries */ if (info->mipLevels > 1) { assert(layout->format == VK_FORMAT_S8_UINT); cur_x = u_align(cur_x, 64); cur_y = u_align(cur_y, 64); } break; case INTEL_LAYOUT_WALK_LAYER: /* MIPLAYOUT_BELOW */ if (lv == 1) cur_x += lod_w; else cur_y += lod_h; break; case INTEL_LAYOUT_WALK_3D: { const unsigned num_slices = u_minify(info->extent.depth, lv); const unsigned num_slices_per_row = 1 << lv; const unsigned num_rows = (num_slices + num_slices_per_row - 1) / num_slices_per_row; lod_w *= num_slices_per_row; lod_h *= num_rows; cur_y += lod_h; } break; } if (params->max_x < layout->lods[lv].x + lod_w) params->max_x = layout->lods[lv].x + lod_w; if (params->max_y < layout->lods[lv].y + lod_h) params->max_y = layout->lods[lv].y + lod_h; } if (layout->walk == INTEL_LAYOUT_WALK_LAYER) { params->h0 = layout->lods[0].slice_height; if (info->mipLevels > 1) params->h1 = layout->lods[1].slice_height; else layout_get_slice_size(layout, params, 1, &cur_x, ¶ms->h1); } }
static void layout_calculate_mcs_size(struct intel_layout *layout, struct intel_layout_params *params) { const VkImageCreateInfo *info = params->info; int mcs_width, mcs_height, mcs_cpp; int downscale_x, downscale_y; assert(layout->aux == INTEL_LAYOUT_AUX_MCS); if (info->samples != VK_SAMPLE_COUNT_1_BIT) { /* * From the Ivy Bridge PRM, volume 2 part 1, page 326, the clear * rectangle is scaled down by 8x2 for 4X MSAA and 2x2 for 8X MSAA. The * need of scale down could be that the clear rectangle is used to clear * the MCS instead of the RT. * * For 8X MSAA, we need 32 bits in MCS for every pixel in the RT. The * 2x2 factor could come from that the hardware writes 128 bits (an * OWord) at a time, and the OWord in MCS maps to a 2x2 pixel block in * the RT. For 4X MSAA, we need 8 bits in MCS for every pixel in the * RT. Similarly, we could reason that an OWord in 4X MCS maps to a 8x2 * pixel block in the RT. */ switch (info->samples) { case VK_SAMPLE_COUNT_2_BIT: case VK_SAMPLE_COUNT_4_BIT: downscale_x = 8; downscale_y = 2; mcs_cpp = 1; break; case VK_SAMPLE_COUNT_8_BIT: downscale_x = 2; downscale_y = 2; mcs_cpp = 4; break; case VK_SAMPLE_COUNT_16_BIT: downscale_x = 2; downscale_y = 1; mcs_cpp = 8; break; default: assert(!"unsupported sample count"); return; break; } /* * It also appears that the 2x2 subspans generated by the scaled-down * clear rectangle cannot be masked. The scale-down clear rectangle * thus must be aligned to 2x2, and we need to pad. */ mcs_width = u_align(layout->width0, downscale_x * 2); mcs_height = u_align(layout->height0, downscale_y * 2); } else { /* * From the Ivy Bridge PRM, volume 2 part 1, page 327: * * " Pixels Lines * TiledY RT CL * bpp * 32 8 4 * 64 4 4 * 128 2 4 * * TiledX RT CL * bpp * 32 16 2 * 64 8 2 * 128 4 2" * * This table and the two following tables define the RT alignments, the * clear rectangle alignments, and the clear rectangle scale factors. 
* Viewing the RT alignments as the sizes of 128-byte blocks, we can see * that the clear rectangle alignments are 16x32 blocks, and the clear * rectangle scale factors are 8x16 blocks. * * For non-MSAA RT, we need 1 bit in MCS for every 128-byte block in the * RT. Similar to the MSAA cases, we can argue that an OWord maps to * 8x16 blocks. * * One problem with this reasoning is that a Y-tile in MCS has 8x32 * OWords and maps to 64x512 128-byte blocks. This differs from i965, * which says that a Y-tile maps to 128x256 blocks (\see * intel_get_non_msrt_mcs_alignment). It does not really change * anything except for the size of the allocated MCS. Let's see if we * hit out-of-bound access. */ switch (layout->tiling) { case GEN6_TILING_X: downscale_x = 64 / layout->block_size; downscale_y = 2; break; case GEN6_TILING_Y: downscale_x = 32 / layout->block_size; downscale_y = 4; break; default: assert(!"unsupported tiling mode"); return; break; } downscale_x *= 8; downscale_y *= 16; /* * From the Haswell PRM, volume 7, page 652: * * "Clear rectangle must be aligned to two times the number of * pixels in the table shown below due to 16X16 hashing across the * slice." * * The scaled-down clear rectangle must be aligned to 4x4 instead of * 2x2, and we need to pad. */ mcs_width = u_align(layout->width0, downscale_x * 4) / downscale_x; mcs_height = u_align(layout->height0, downscale_y * 4) / downscale_y; mcs_cpp = 16; /* an OWord */ } layout->aux_enables = (1 << info->mipLevels) - 1; /* align to Y-tile */ layout->aux_stride = u_align(mcs_width * mcs_cpp, 128); layout->aux_height = u_align(mcs_height, 32); }