static void
i915_texture_layout_2d(struct i915_texture *tex)
{
   struct pipe_resource *pt = &tex->b.b;
   unsigned level;
   unsigned width = util_next_power_of_two(pt->width0);
   unsigned height = util_next_power_of_two(pt->height0);
   /* block rows come from the height, not the width */
   unsigned nblocksy = util_format_get_nblocksy(pt->format, height);
   unsigned align_y = 2;

   if (util_format_is_s3tc(pt->format))
      align_y = 1;

   tex->stride = align(util_format_get_stride(pt->format, width), 4);
   tex->total_nblocksy = 0;

   for (level = 0; level <= pt->last_level; level++) {
      i915_texture_set_level_info(tex, level, 1);
      i915_texture_set_image_offset(tex, level, 0, 0, tex->total_nblocksy);

      tex->total_nblocksy += nblocksy;

      width = u_minify(width, 1);
      height = u_minify(height, 1);
      nblocksy = align_nblocksy(pt->format, height, align_y);
   }
}
struct pipe_video_decoder *
vl_create_decoder(struct pipe_context *pipe,
                  enum pipe_video_profile profile,
                  enum pipe_video_entrypoint entrypoint,
                  enum pipe_video_chroma_format chroma_format,
                  unsigned width, unsigned height,
                  unsigned max_references,
                  bool expect_chunked_decode)
{
   unsigned buffer_width, buffer_height;
   bool pot_buffers;

   assert(pipe);
   assert(width > 0 && height > 0);

   pot_buffers = !pipe->screen->get_video_param
   (
      pipe->screen,
      profile,
      PIPE_VIDEO_CAP_NPOT_TEXTURES
   );

   buffer_width = pot_buffers ? util_next_power_of_two(width)
                              : align(width, VL_MACROBLOCK_WIDTH);
   buffer_height = pot_buffers ? util_next_power_of_two(height)
                               : align(height, VL_MACROBLOCK_HEIGHT);

   switch (u_reduce_video_profile(profile)) {
   case PIPE_VIDEO_CODEC_MPEG12:
      return vl_create_mpeg12_decoder(pipe, profile, entrypoint, chroma_format,
                                      buffer_width, buffer_height,
                                      max_references, expect_chunked_decode);
   default:
      return NULL;
   }
   return NULL;
}
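/* Hedged illustration of the buffer-size selection above: for a 720x480
 * stream, a driver that needs POT textures gets 1024x512 buffers, while a
 * driver with NPOT support only pads to the macroblock grid (assuming the
 * usual VL_MACROBLOCK_WIDTH/HEIGHT of 16):
 *   pot:  util_next_power_of_two(720) = 1024, util_next_power_of_two(480) = 512
 *   npot: align(720, 16) = 720,               align(480, 16) = 480
 */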
/**
 * Round up the requested multisample count to the next supported sample size.
 */
static unsigned
framebuffer_quantize_num_samples(struct st_context *st, unsigned num_samples)
{
   struct pipe_screen *screen = st->pipe->screen;
   int quantized_samples = 0;
   unsigned msaa_mode;

   if (!num_samples)
      return 0;

   /* Assumes the highest supported MSAA is a power of 2 */
   msaa_mode = util_next_power_of_two(st->ctx->Const.MaxFramebufferSamples);
   assert(!(num_samples > msaa_mode)); /* be safe from infinite loops */

   /* Walk down from the maximum mode; the last supported mode that is
    * still >= num_samples wins, i.e. the smallest supported sample count
    * that satisfies the request.
    */
   for (; msaa_mode >= num_samples; msaa_mode = msaa_mode / 2) {
      /* For ARB_framebuffer_no_attachment, a format of PIPE_FORMAT_NONE
       * asks whether this sample count is supported for a framebuffer
       * with no attachment, so the driver callback must handle it.
       */
      if (screen->is_format_supported(screen, PIPE_FORMAT_NONE,
                                      PIPE_TEXTURE_2D, msaa_mode,
                                      PIPE_BIND_RENDER_TARGET))
         quantized_samples = msaa_mode;
   }
   return quantized_samples;
}
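/* A minimal standalone sketch (not Mesa code) of the quantization loop
 * above: scanning downward keeps overwriting the result, so the final value
 * is the smallest supported count that still satisfies the request. The
 * supported sample counts here are hypothetical; like the real function,
 * the sketch returns 0 early for num_samples == 0.
 */
#include <stdio.h>
#include <stdbool.h>

static bool is_supported(unsigned samples)
{
   /* hypothetical hardware: only 4x, 8x and 16x MSAA */
   return samples == 4 || samples == 8 || samples == 16;
}

static unsigned quantize(unsigned num_samples, unsigned max_mode)
{
   unsigned result = 0;

   if (!num_samples)
      return 0;

   for (unsigned mode = max_mode; mode >= num_samples; mode /= 2) {
      if (is_supported(mode))
         result = mode; /* last hit is the smallest mode >= num_samples */
   }
   return result;
}

int main(void)
{
   printf("%u\n", quantize(3, 16)); /* 4: smallest supported count >= 3 */
   printf("%u\n", quantize(9, 16)); /* 16: 8 is too small, 16 is next */
   return 0;
}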
void r600_texture_get_cmask_info(struct r600_common_screen *rscreen,
                                 struct r600_texture *rtex,
                                 struct r600_cmask_info *out)
{
   unsigned cmask_tile_width = 8;
   unsigned cmask_tile_height = 8;
   unsigned cmask_tile_elements = cmask_tile_width * cmask_tile_height;
   unsigned element_bits = 4;
   unsigned cmask_cache_bits = 1024;
   unsigned num_pipes = rscreen->tiling_info.num_channels;
   unsigned pipe_interleave_bytes = rscreen->tiling_info.group_bytes;

   unsigned elements_per_macro_tile = (cmask_cache_bits / element_bits) * num_pipes;
   unsigned pixels_per_macro_tile = elements_per_macro_tile * cmask_tile_elements;
   unsigned sqrt_pixels_per_macro_tile = sqrt(pixels_per_macro_tile);
   unsigned macro_tile_width = util_next_power_of_two(sqrt_pixels_per_macro_tile);
   unsigned macro_tile_height = pixels_per_macro_tile / macro_tile_width;

   unsigned pitch_elements = align(rtex->surface.npix_x, macro_tile_width);
   unsigned height = align(rtex->surface.npix_y, macro_tile_height);

   unsigned base_align = num_pipes * pipe_interleave_bytes;
   unsigned slice_bytes =
      ((pitch_elements * height * element_bits + 7) / 8) / cmask_tile_elements;

   assert(macro_tile_width % 128 == 0);
   assert(macro_tile_height % 128 == 0);

   out->slice_tile_max = ((pitch_elements * height) / (128*128)) - 1;
   out->alignment = MAX2(256, base_align);
   out->size = (util_max_layer(&rtex->resource.b.b, 0) + 1) *
               align(slice_bytes, base_align);
}
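/* Hedged worked example of the macro-tile derivation above, assuming a
 * 2-pipe part (num_pipes == 2):
 *   elements_per_macro_tile = (1024 / 4) * 2        = 512
 *   pixels_per_macro_tile   = 512 * (8 * 8)         = 32768
 *   sqrt(32768)                                     = 181 (truncated)
 *   macro_tile_width  = util_next_power_of_two(181) = 256
 *   macro_tile_height = 32768 / 256                 = 128
 * Both dimensions come out as multiples of 128, which is exactly what the
 * asserts verify.
 */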
/**
 * Cube layout used on i915 and for non-compressed textures on i945.
 */
static void
i9x5_texture_layout_cube(struct i915_texture *tex)
{
   struct pipe_resource *pt = &tex->b.b;
   unsigned width = util_next_power_of_two(pt->width0);
   const unsigned nblocks = util_format_get_nblocksx(pt->format, width);
   unsigned level;
   unsigned face;

   assert(pt->width0 == pt->height0); /* cubemap images are square */

   /* double pitch for cube layouts */
   tex->stride = align(nblocks * util_format_get_blocksize(pt->format) * 2, 4);
   tex->total_nblocksy = nblocks * 4;

   for (level = 0; level <= pt->last_level; level++)
      i915_texture_set_level_info(tex, level, 6);

   for (face = 0; face < 6; face++) {
      unsigned x = initial_offsets[face][0] * nblocks;
      unsigned y = initial_offsets[face][1] * nblocks;
      unsigned d = nblocks;

      for (level = 0; level <= pt->last_level; level++) {
         i915_texture_set_image_offset(tex, level, face, x, y);
         d >>= 1;
         x += step_offsets[face][0] * d;
         y += step_offsets[face][1] * d;
      }
   }
}
static void r600_setup_miptree(struct r600_screen *rscreen,
                               struct r600_resource_texture *rtex)
{
   struct pipe_resource *ptex = &rtex->resource.base.b;
   unsigned long w, h, pitch, size, layer_size, i, offset;

   rtex->bpt = util_format_get_blocksize(ptex->format);
   for (i = 0, offset = 0; i <= ptex->last_level; i++) {
      w = u_minify(ptex->width0, i);
      h = u_minify(ptex->height0, i);
      h = util_next_power_of_two(h);
      pitch = util_format_get_stride(ptex->format, align(w, 64));
      pitch = align(pitch, 256);
      layer_size = pitch * h;
      if (ptex->target == PIPE_TEXTURE_CUBE)
         size = layer_size * 6;
      else
         size = layer_size * u_minify(ptex->depth0, i);
      rtex->offset[i] = offset;
      rtex->layer_size[i] = layer_size;
      rtex->pitch[i] = pitch;
      rtex->width[i] = w;
      rtex->height[i] = h;
      offset += size;
   }
   rtex->size = offset;
}
static void
i945_texture_layout_3d(struct i915_texture *tex)
{
   struct pipe_resource *pt = &tex->b.b;
   unsigned width = util_next_power_of_two(pt->width0);
   unsigned height = util_next_power_of_two(pt->height0);
   unsigned depth = util_next_power_of_two(pt->depth0);
   /* block rows come from the height, not the width */
   unsigned nblocksy = util_format_get_nblocksy(pt->format, height);
   unsigned pack_x_pitch, pack_x_nr;
   unsigned pack_y_pitch;
   unsigned level;

   tex->stride = align(util_format_get_stride(pt->format, width), 4);
   tex->total_nblocksy = 0;

   pack_y_pitch = MAX2(nblocksy, 2);
   pack_x_pitch = tex->stride / util_format_get_blocksize(pt->format);
   pack_x_nr = 1;

   for (level = 0; level <= pt->last_level; level++) {
      int x = 0;
      int y = 0;
      unsigned q, j;

      i915_texture_set_level_info(tex, level, depth);

      for (q = 0; q < depth;) {
         for (j = 0; j < pack_x_nr && q < depth; j++, q++) {
            i915_texture_set_image_offset(tex, level, q, x,
                                          y + tex->total_nblocksy);
            x += pack_x_pitch;
         }

         x = 0;
         y += pack_y_pitch;
      }

      tex->total_nblocksy += y;

      if (pack_x_pitch > 4) {
         pack_x_pitch >>= 1;
         pack_x_nr <<= 1;
         assert(pack_x_pitch * pack_x_nr *
                util_format_get_blocksize(pt->format) <= tex->stride);
      }

      if (pack_y_pitch > 2) {
         pack_y_pitch >>= 1;
      }

      /* each mip level packs at most half as many slices as the previous */
      depth = u_minify(depth, 1);
   }
}
static unsigned mip_minify(unsigned size, unsigned level)
{
   unsigned val;

   val = u_minify(size, level);
   if (level > 0)
      val = util_next_power_of_two(val);
   return val;
}
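/* Worked example: for an NPOT base size of 100,
 *   level 0: 100        (the base level keeps its NPOT size)
 *   level 1: 50 -> 64   (minified, then rounded up to a power of two)
 *   level 2: 25 -> 32
 *   level 3: 12 -> 16
 * assuming the usual u_minify(s, l) == MAX2(s >> l, 1) and
 * util_next_power_of_two() rounding up to the nearest power of two.
 */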
static void
i915_texture_layout_3d(struct i915_texture *tex)
{
   struct pipe_resource *pt = &tex->b.b;
   unsigned level;

   unsigned width = util_next_power_of_two(pt->width0);
   unsigned height = util_next_power_of_two(pt->height0);
   unsigned depth = util_next_power_of_two(pt->depth0);
   unsigned nblocksy = util_format_get_nblocksy(pt->format, height);
   unsigned stack_nblocksy = 0;

   /* Calculate the size of a single slice. */
   tex->stride = align(util_format_get_stride(pt->format, width), 4);

   /* XXX: hardware expects/requires 9 levels at minimum. */
   for (level = 0; level <= MAX2(8, pt->last_level); level++) {
      i915_texture_set_level_info(tex, level, depth);

      stack_nblocksy += MAX2(2, nblocksy);

      width = u_minify(width, 1);
      height = u_minify(height, 1);
      nblocksy = util_format_get_nblocksy(pt->format, height);
   }

   /* Fixup depth image_offsets: */
   for (level = 0; level <= pt->last_level; level++) {
      unsigned i;
      for (i = 0; i < depth; i++)
         i915_texture_set_image_offset(tex, level, i, 0, i * stack_nblocksy);

      depth = u_minify(depth, 1);
   }

   /* Multiply slice size by texture depth for total size.  It's
    * remarkable how wasteful of memory the i915 texture layouts
    * are.  They are largely fixed in the i945.
    */
   tex->total_nblocksy = stack_nblocksy * util_next_power_of_two(pt->depth0);
}
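/* Hedged illustration of the waste noted above: for pt->depth0 == 5 the
 * allocation is stack_nblocksy * util_next_power_of_two(5) == 8 full mip
 * stacks, of which at most 5 are ever addressed, so at least 3/8 of the
 * slice-stack space is pure padding; each slice also reserves the full
 * 9-level stack height regardless of how many levels the texture has.
 */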
struct pipe_video_buffer *
vl_video_buffer_create(struct pipe_context *pipe,
                       const struct pipe_video_buffer *tmpl)
{
   const enum pipe_format *resource_formats;
   struct pipe_video_buffer templat, *result;
   bool pot_buffers;

   assert(pipe);
   assert(tmpl->width > 0 && tmpl->height > 0);

   pot_buffers = !pipe->screen->get_video_param
   (
      pipe->screen,
      PIPE_VIDEO_PROFILE_UNKNOWN,
      PIPE_VIDEO_CAP_NPOT_TEXTURES
   );

   resource_formats = vl_video_buffer_formats(pipe->screen, tmpl->buffer_format);
   if (!resource_formats)
      return NULL;

   templat = *tmpl;
   templat.width = pot_buffers ? util_next_power_of_two(tmpl->width)
                               : align(tmpl->width, VL_MACROBLOCK_WIDTH);
   templat.height = pot_buffers ? util_next_power_of_two(tmpl->height)
                                : align(tmpl->height, VL_MACROBLOCK_HEIGHT);

   if (tmpl->interlaced)
      templat.height /= 2;

   result = vl_video_buffer_create_ex
   (
      pipe, &templat, resource_formats,
      1, tmpl->interlaced ? 2 : 1, PIPE_USAGE_STATIC
   );

   if (result && tmpl->interlaced)
      result->height *= 2;

   return result;
}
void cayman_emit_msaa_config(struct radeon_winsys_cs *cs, int nr_samples,
                             int ps_iter_samples, int overrast_samples)
{
   int setup_samples = nr_samples > 1 ? nr_samples :
                       overrast_samples > 1 ? overrast_samples : 0;

   if (setup_samples > 1) {
      /* indexed by log2(nr_samples) */
      unsigned max_dist[] = {
         0,
         eg_max_dist_2x,
         eg_max_dist_4x,
         cm_max_dist_8x,
         cm_max_dist_16x
      };
      unsigned log_samples = util_logbase2(setup_samples);
      unsigned log_ps_iter_samples =
         util_logbase2(util_next_power_of_two(ps_iter_samples));

      radeon_set_context_reg_seq(cs, CM_R_028BDC_PA_SC_LINE_CNTL, 2);
      radeon_emit(cs, S_028BDC_LAST_PIXEL(1) |
                  S_028BDC_EXPAND_LINE_WIDTH(1)); /* CM_R_028BDC_PA_SC_LINE_CNTL */
      radeon_emit(cs, S_028BE0_MSAA_NUM_SAMPLES(log_samples) |
                  S_028BE0_MAX_SAMPLE_DIST(max_dist[log_samples]) |
                  S_028BE0_MSAA_EXPOSED_SAMPLES(log_samples)); /* CM_R_028BE0_PA_SC_AA_CONFIG */

      if (nr_samples > 1) {
         radeon_set_context_reg(cs, CM_R_028804_DB_EQAA,
                                S_028804_MAX_ANCHOR_SAMPLES(log_samples) |
                                S_028804_PS_ITER_SAMPLES(log_ps_iter_samples) |
                                S_028804_MASK_EXPORT_NUM_SAMPLES(log_samples) |
                                S_028804_ALPHA_TO_MASK_NUM_SAMPLES(log_samples) |
                                S_028804_HIGH_QUALITY_INTERSECTIONS(1) |
                                S_028804_STATIC_ANCHOR_ASSOCIATIONS(1));
         radeon_set_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1,
                                EG_S_028A4C_PS_ITER_SAMPLE(ps_iter_samples > 1));
      } else if (overrast_samples > 1) {
         radeon_set_context_reg(cs, CM_R_028804_DB_EQAA,
                                S_028804_HIGH_QUALITY_INTERSECTIONS(1) |
                                S_028804_STATIC_ANCHOR_ASSOCIATIONS(1) |
                                S_028804_OVERRASTERIZATION_AMOUNT(log_samples));
         radeon_set_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, 0);
      }
   } else {
      radeon_set_context_reg_seq(cs, CM_R_028BDC_PA_SC_LINE_CNTL, 2);
      radeon_emit(cs, S_028BDC_LAST_PIXEL(1)); /* CM_R_028BDC_PA_SC_LINE_CNTL */
      radeon_emit(cs, 0); /* CM_R_028BE0_PA_SC_AA_CONFIG */

      radeon_set_context_reg(cs, CM_R_028804_DB_EQAA,
                             S_028804_HIGH_QUALITY_INTERSECTIONS(1) |
                             S_028804_STATIC_ANCHOR_ASSOCIATIONS(1));
      radeon_set_context_reg(cs, EG_R_028A4C_PA_SC_MODE_CNTL_1, 0);
   }
}
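/* Hedged example of the register-field encoding above: with nr_samples = 8
 * and ps_iter_samples = 2,
 *   log_samples         = util_logbase2(8) = 3
 *   log_ps_iter_samples = util_logbase2(util_next_power_of_two(2)) = 1
 * so MSAA_NUM_SAMPLES is programmed with 3 and PS_ITER_SAMPLES with 1. The
 * util_next_power_of_two() call only matters if ps_iter_samples is not
 * already a power of two (e.g. 3 -> 4 -> log2 == 2).
 */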
static unsigned r300_texture_get_nblocksy(struct r300_resource *tex,
                                          unsigned level,
                                          boolean *out_aligned_for_cbzb)
{
   unsigned height, tile_height;

   height = u_minify(tex->tex.height0, level);

   /* Mipmapped and 3D textures must have their height aligned to POT. */
   if ((tex->b.b.b.target != PIPE_TEXTURE_1D &&
        tex->b.b.b.target != PIPE_TEXTURE_2D &&
        tex->b.b.b.target != PIPE_TEXTURE_RECT) ||
       tex->b.b.b.last_level != 0) {
      height = util_next_power_of_two(height);
   }

   if (util_format_is_plain(tex->b.b.b.format)) {
      tile_height = r300_get_pixel_alignment(tex->b.b.b.format,
                                             tex->b.b.b.nr_samples,
                                             tex->tex.microtile,
                                             tex->tex.macrotile[level],
                                             DIM_HEIGHT, 0);
      height = align(height, tile_height);

      /* See if the CBZB clear can be used on the buffer,
       * taking the texture size into account. */
      if (out_aligned_for_cbzb) {
         if (tex->tex.macrotile[level]) {
            /* When clearing, the layer (width*height) is horizontally split
             * into two, and the upper and lower halves are cleared by the CB
             * and ZB units, respectively. Therefore, the number of macrotiles
             * in the Y direction must be even. */

            /* Align the height so that there is an even number of macrotiles.
             * Do so for 3 or more macrotiles in the Y direction. */
            if (level == 0 && tex->b.b.b.last_level == 0 &&
                (tex->b.b.b.target == PIPE_TEXTURE_1D ||
                 tex->b.b.b.target == PIPE_TEXTURE_2D ||
                 tex->b.b.b.target == PIPE_TEXTURE_RECT) &&
                height >= tile_height * 3) {
               height = align(height, tile_height * 2);
            }

            *out_aligned_for_cbzb = height % (tile_height * 2) == 0;
         } else {
            *out_aligned_for_cbzb = FALSE;
         }
      }
   }

   return util_format_get_nblocksy(tex->b.b.b.format, height);
}
struct pipe_video_buffer *
vl_video_buffer_create(struct pipe_context *pipe,
                       enum pipe_format buffer_format,
                       enum pipe_video_chroma_format chroma_format,
                       unsigned width, unsigned height)
{
   const enum pipe_format *resource_formats;
   struct pipe_video_buffer *result;
   unsigned buffer_width, buffer_height;
   bool pot_buffers;

   assert(pipe);
   assert(width > 0 && height > 0);

   pot_buffers = !pipe->screen->get_video_param
   (
      pipe->screen,
      PIPE_VIDEO_PROFILE_UNKNOWN,
      PIPE_VIDEO_CAP_NPOT_TEXTURES
   );

   resource_formats = vl_video_buffer_formats(pipe->screen, buffer_format);
   if (!resource_formats)
      return NULL;

   buffer_width = pot_buffers ? util_next_power_of_two(width)
                              : align(width, MACROBLOCK_WIDTH);
   buffer_height = pot_buffers ? util_next_power_of_two(height)
                               : align(height, MACROBLOCK_HEIGHT);

   result = vl_video_buffer_create_ex
   (
      pipe, buffer_width, buffer_height, 1,
      chroma_format, resource_formats, PIPE_USAGE_STATIC
   );

   if (result)
      result->buffer_format = buffer_format;

   return result;
}
static int
nv50_tls_alloc(struct nv50_screen *screen, unsigned tls_space,
               uint64_t *tls_size)
{
   struct nouveau_device *dev = screen->base.device;
   int ret;

   screen->cur_tls_space = util_next_power_of_two(tls_space / ONE_TEMP_SIZE) *
         ONE_TEMP_SIZE;
   if (nouveau_mesa_debug)
      debug_printf("allocating space for %u temps\n",
                   util_next_power_of_two(tls_space / ONE_TEMP_SIZE));
   *tls_size = screen->cur_tls_space * util_next_power_of_two(screen->TPs) *
         screen->MPsInTP * LOCAL_WARPS_ALLOC * THREADS_IN_WARP;

   ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 16,
                        *tls_size, NULL, &screen->tls_bo);
   if (ret) {
      NOUVEAU_ERR("Failed to allocate local bo: %d\n", ret);
      return ret;
   }

   return 0;
}
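/* Hedged example of the sizing above: if tls_space works out to 100
 * temporaries (tls_space / ONE_TEMP_SIZE == 100), cur_tls_space is rounded
 * up to util_next_power_of_two(100) = 128 temps' worth of bytes, and the
 * total size also rounds the TP count up to a power of two, so a 14-TP
 * part (a hypothetical configuration) is sized as if it had 16 TPs.
 */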
void r600_test_dma(struct r600_common_screen *rscreen)
{
   struct pipe_screen *screen = &rscreen->b;
   struct pipe_context *ctx = screen->context_create(screen, NULL, 0);
   struct r600_common_context *rctx = (struct r600_common_context*)ctx;
   uint64_t max_alloc_size;
   unsigned i, iterations, num_partial_copies, max_levels, max_tex_side;
   unsigned num_pass = 0, num_fail = 0;

   max_levels = screen->get_param(screen, PIPE_CAP_MAX_TEXTURE_2D_LEVELS);
   max_tex_side = 1 << (max_levels - 1);

   /* Max 128 MB allowed for both textures. */
   max_alloc_size = 128 * 1024 * 1024;

   /* the seed for random test parameters */
   srand(0x9b47d95b);
   /* the seed for random pixel data */
   s_rand_xorshift128plus(seed_xorshift128plus, false);

   iterations = 1000000000; /* just kill it when you are bored */
   num_partial_copies = 30;

   /* These parameters are randomly generated per test:
    * - whether to do one whole-surface copy or N partial copies per test
    * - which tiling modes to use (LINEAR_ALIGNED, 1D, 2D)
    * - which texture dimensions to use
    * - whether to use VRAM (all tiling modes) and GTT (staging, linear
    *   only) allocations
    * - random initial pixels in src
    * - generate random subrectangle copies for partial blits
    */
   for (i = 0; i < iterations; i++) {
      struct pipe_resource tsrc = {}, tdst = {}, *src, *dst;
      struct r600_texture *rdst;
      struct r600_texture *rsrc;
      struct cpu_texture src_cpu, dst_cpu;
      unsigned bpp, max_width, max_height, max_depth, j, num;
      unsigned gfx_blits = 0, dma_blits = 0, max_tex_side_gen;
      unsigned max_tex_layers;
      bool pass;
      bool do_partial_copies = rand() & 1;

      /* generate a random test case */
      tsrc.target = tdst.target = PIPE_TEXTURE_2D_ARRAY;
      tsrc.depth0 = tdst.depth0 = 1;

      bpp = 1 << (rand() % 5);
      tsrc.format = tdst.format = get_format_from_bpp(bpp);

      max_tex_side_gen = generate_max_tex_side(max_tex_side);
      max_tex_layers = rand() % 4 ? 1 : 5;

      tsrc.width0 = (rand() % max_tex_side_gen) + 1;
      tsrc.height0 = (rand() % max_tex_side_gen) + 1;
      tsrc.array_size = (rand() % max_tex_layers) + 1;

      /* Have a 1/4 chance of getting power-of-two dimensions. */
      if (rand() % 4 == 0) {
         tsrc.width0 = util_next_power_of_two(tsrc.width0);
         tsrc.height0 = util_next_power_of_two(tsrc.height0);
      }

      if (!do_partial_copies) {
         /* whole-surface copies only, same dimensions */
         tdst = tsrc;
      } else {
         max_tex_side_gen = generate_max_tex_side(max_tex_side);
         max_tex_layers = rand() % 4 ? 1 : 5;

         /* many partial copies, dimensions can be different */
         tdst.width0 = (rand() % max_tex_side_gen) + 1;
         tdst.height0 = (rand() % max_tex_side_gen) + 1;
         tdst.array_size = (rand() % max_tex_layers) + 1;

         /* Have a 1/4 chance of getting power-of-two dimensions. */
         if (rand() % 4 == 0) {
            tdst.width0 = util_next_power_of_two(tdst.width0);
            tdst.height0 = util_next_power_of_two(tdst.height0);
         }
      }

      /* check texture sizes */
      if ((uint64_t)tsrc.width0 * tsrc.height0 * tsrc.array_size * bpp +
          (uint64_t)tdst.width0 * tdst.height0 * tdst.array_size * bpp >
          max_alloc_size) {
         /* too large, try again */
         i--;
         continue;
      }

      /* VRAM + the tiling mode depends on dimensions (3/4 of cases),
       * or GTT + linear only (1/4 of cases)
       */
      tsrc.usage = rand() % 4 ? PIPE_USAGE_DEFAULT : PIPE_USAGE_STAGING;
      tdst.usage = rand() % 4 ? PIPE_USAGE_DEFAULT : PIPE_USAGE_STAGING;

      /* Allocate textures (both the GPU and CPU copies).
       * The CPU will emulate what the GPU should be doing.
       */
      src = screen->resource_create(screen, &tsrc);
      dst = screen->resource_create(screen, &tdst);
      assert(src);
      assert(dst);
      rdst = (struct r600_texture*)dst;
      rsrc = (struct r600_texture*)src;
      alloc_cpu_texture(&src_cpu, &tsrc, bpp);
      alloc_cpu_texture(&dst_cpu, &tdst, bpp);

      printf("%4u: dst = (%5u x %5u x %u, %s), "
             " src = (%5u x %5u x %u, %s), bpp = %2u, ",
             i, tdst.width0, tdst.height0, tdst.array_size,
             array_mode_to_string(rscreen, &rdst->surface),
             tsrc.width0, tsrc.height0, tsrc.array_size,
             array_mode_to_string(rscreen, &rsrc->surface), bpp);
      fflush(stdout);

      /* set src pixels */
      set_random_pixels(ctx, src, &src_cpu);

      /* clear dst pixels */
      rctx->clear_buffer(ctx, dst, 0, rdst->surface.surf_size, 0, true);
      memset(dst_cpu.ptr, 0, dst_cpu.layer_stride * tdst.array_size);

      /* preparation */
      max_width = MIN2(tsrc.width0, tdst.width0);
      max_height = MIN2(tsrc.height0, tdst.height0);
      max_depth = MIN2(tsrc.array_size, tdst.array_size);

      num = do_partial_copies ? num_partial_copies : 1;
      for (j = 0; j < num; j++) {
         int width, height, depth;
         int srcx, srcy, srcz, dstx, dsty, dstz;
         struct pipe_box box;
         unsigned old_num_draw_calls = rctx->num_draw_calls;
         unsigned old_num_dma_calls = rctx->num_dma_calls;

         if (!do_partial_copies) {
            /* copy whole src to dst */
            width = max_width;
            height = max_height;
            depth = max_depth;

            srcx = srcy = srcz = dstx = dsty = dstz = 0;
         } else {
            /* random sub-rectangle copies from src to dst */
            depth = (rand() % max_depth) + 1;
            srcz = rand() % (tsrc.array_size - depth + 1);
            dstz = rand() % (tdst.array_size - depth + 1);

            /* special code path to hit the tiled partial copies */
            if (!rsrc->surface.is_linear &&
                !rdst->surface.is_linear &&
                rand() & 1) {
               if (max_width < 8 || max_height < 8)
                  continue;
               width = ((rand() % (max_width / 8)) + 1) * 8;
               height = ((rand() % (max_height / 8)) + 1) * 8;

               srcx = rand() % (tsrc.width0 - width + 1) & ~0x7;
               srcy = rand() % (tsrc.height0 - height + 1) & ~0x7;

               dstx = rand() % (tdst.width0 - width + 1) & ~0x7;
               dsty = rand() % (tdst.height0 - height + 1) & ~0x7;
            } else {
               /* just make sure that it doesn't divide by zero */
               assert(max_width > 0 && max_height > 0);

               width = (rand() % max_width) + 1;
               height = (rand() % max_height) + 1;

               srcx = rand() % (tsrc.width0 - width + 1);
               srcy = rand() % (tsrc.height0 - height + 1);

               dstx = rand() % (tdst.width0 - width + 1);
               dsty = rand() % (tdst.height0 - height + 1);
            }

            /* special code path to hit out-of-bounds reads in L2T */
            if (rsrc->surface.is_linear &&
                !rdst->surface.is_linear &&
                rand() % 4 == 0) {
               srcx = 0;
               srcy = 0;
               srcz = 0;
            }
         }

         /* GPU copy */
         u_box_3d(srcx, srcy, srcz, width, height, depth, &box);
         rctx->dma_copy(ctx, dst, 0, dstx, dsty, dstz, src, 0, &box);

         /* See which engine was used. */
         gfx_blits += rctx->num_draw_calls > old_num_draw_calls;
         dma_blits += rctx->num_dma_calls > old_num_dma_calls;

         /* CPU copy */
         util_copy_box(dst_cpu.ptr, tdst.format, dst_cpu.stride,
                       dst_cpu.layer_stride, dstx, dsty, dstz,
                       width, height, depth,
                       src_cpu.ptr, src_cpu.stride,
                       src_cpu.layer_stride, srcx, srcy, srcz);
      }

      pass = compare_textures(ctx, dst, &dst_cpu, bpp);
      if (pass)
         num_pass++;
      else
         num_fail++;

      printf("BLITs: GFX = %2u, DMA = %2u, %s [%u/%u]\n",
             gfx_blits, dma_blits, pass ? "pass" : "fail",
             num_pass, num_pass + num_fail);

      /* cleanup */
      pipe_resource_reference(&src, NULL);
      pipe_resource_reference(&dst, NULL);
      free(src_cpu.ptr);
      free(dst_cpu.ptr);
   }

   ctx->destroy(ctx);
   exit(0);
}
static void
i945_texture_layout_2d(struct i915_texture *tex)
{
   struct pipe_resource *pt = &tex->b.b;
   int align_x = 4, align_y = 2;
   unsigned level;
   unsigned x = 0;
   unsigned y = 0;
   unsigned width = util_next_power_of_two(pt->width0);
   unsigned height = util_next_power_of_two(pt->height0);
   unsigned nblocksx = util_format_get_nblocksx(pt->format, width);
   unsigned nblocksy = util_format_get_nblocksy(pt->format, height);

   if (util_format_is_s3tc(pt->format)) {
      align_x = 1;
      align_y = 1;
   }

   tex->stride = align(util_format_get_stride(pt->format, width), 4);

   /* May need to adjust pitch to accommodate the placement of
    * the 2nd mipmap level.  This occurs when the alignment
    * constraints of mipmap placement push the right edge of the
    * 2nd mipmap level out past the width of its parent.
    */
   if (pt->last_level > 0) {
      unsigned mip1_nblocksx =
         align_nblocksx(pt->format, u_minify(width, 1), align_x) +
         util_format_get_nblocksx(pt->format, u_minify(width, 2));

      if (mip1_nblocksx > nblocksx)
         tex->stride = mip1_nblocksx * util_format_get_blocksize(pt->format);
   }

   /* Pitch must be a whole number of dwords */
   tex->stride = align(tex->stride, 64);
   tex->total_nblocksy = 0;

   for (level = 0; level <= pt->last_level; level++) {
      i915_texture_set_level_info(tex, level, 1);
      i915_texture_set_image_offset(tex, level, 0, x, y);

      /* Because the images are packed better, the final offset
       * might not be the maximal one:
       */
      tex->total_nblocksy = MAX2(tex->total_nblocksy, y + nblocksy);

      /* Layout_below: step right after second mipmap level.
       */
      if (level == 1) {
         x += nblocksx;
      } else {
         y += nblocksy;
      }

      width = u_minify(width, 1);
      height = u_minify(height, 1);
      nblocksx = align_nblocksx(pt->format, width, align_x);
      nblocksy = align_nblocksy(pt->format, height, align_y);
   }
}
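/* Hedged example of the pitch fixup above, assuming a 4-byte RGBA8 format:
 * for a 4-pixel-wide mipmapped texture, nblocksx == 4, but level 1
 * (2 pixels, aligned up to align_x == 4) plus level 2 (1 pixel) occupy
 * 4 + 1 = 5 blocks side by side, so the stride grows to 5 * 4 = 20 bytes
 * before the final alignment above rounds it up further.
 */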
void r300_texture_desc_init(struct r300_screen *rscreen,
                            struct r300_resource *tex,
                            const struct pipe_resource *base)
{
   tex->b.b.target = base->target;
   tex->b.b.format = base->format;
   tex->b.b.width0 = base->width0;
   tex->b.b.height0 = base->height0;
   tex->b.b.depth0 = base->depth0;
   tex->b.b.array_size = base->array_size;
   tex->b.b.last_level = base->last_level;
   tex->b.b.nr_samples = base->nr_samples;
   tex->tex.width0 = base->width0;
   tex->tex.height0 = base->height0;
   tex->tex.depth0 = base->depth0;

   /* There is a CB memory addressing hardware bug that limits the width
    * of the MSAA buffer in some cases in R520. In order to get around it,
    * the following code lowers the sample count depending on the format and
    * the width.
    *
    * The only catch is that all MSAA colorbuffers and a zbuffer which are
    * supposed to be used together should always be bound together. Only
    * then the correct minimum sample count of all bound buffers is used
    * for rendering. */
   if (rscreen->caps.is_r500) {
      /* FP16 6x MSAA buffers are limited to a width of 1360 pixels. */
      if ((tex->b.b.format == PIPE_FORMAT_R16G16B16A16_FLOAT ||
           tex->b.b.format == PIPE_FORMAT_R16G16B16X16_FLOAT) &&
          tex->b.b.nr_samples == 6 && tex->b.b.width0 > 1360) {
         tex->b.b.nr_samples = 4;
      }

      /* FP16 4x MSAA buffers are limited to a width of 2048 pixels. */
      if ((tex->b.b.format == PIPE_FORMAT_R16G16B16A16_FLOAT ||
           tex->b.b.format == PIPE_FORMAT_R16G16B16X16_FLOAT) &&
          tex->b.b.nr_samples == 4 && tex->b.b.width0 > 2048) {
         tex->b.b.nr_samples = 2;
      }
   }

   /* 32-bit 6x MSAA buffers are limited to a width of 2720 pixels.
    * This applies to all R300-R500 cards. */
   if (util_format_get_blocksizebits(tex->b.b.format) == 32 &&
       !util_format_is_depth_or_stencil(tex->b.b.format) &&
       tex->b.b.nr_samples == 6 && tex->b.b.width0 > 2720) {
      tex->b.b.nr_samples = 4;
   }

   r300_setup_flags(tex);

   /* Align a 3D NPOT texture to POT. */
   if (base->target == PIPE_TEXTURE_3D && tex->tex.is_npot) {
      tex->tex.width0 = util_next_power_of_two(tex->tex.width0);
      tex->tex.height0 = util_next_power_of_two(tex->tex.height0);
      tex->tex.depth0 = util_next_power_of_two(tex->tex.depth0);
   }

   /* Setup tiling. */
   if (tex->tex.microtile == RADEON_LAYOUT_UNKNOWN) {
      r300_setup_tiling(rscreen, tex);
   }

   r300_setup_cbzb_flags(rscreen, tex);

   /* Setup the miptree description. */
   r300_setup_miptree(rscreen, tex, TRUE);
   /* If the required buffer size is larger than the given max size,
    * try again without the alignment for the CBZB clear. */
   if (tex->buf && tex->tex.size_in_bytes > tex->buf->size) {
      r300_setup_miptree(rscreen, tex, FALSE);

      /* Make sure the buffer we got is large enough. */
      if (tex->tex.size_in_bytes > tex->buf->size) {
         fprintf(stderr,
                 "r300: I got a pre-allocated buffer to use it as a texture "
                 "storage, but the buffer is too small. I'll use the buffer "
                 "anyway, because I can't crash here, but it's dangerous. "
                 "This can be a DDX bug. Got: %iB, Need: %iB, Info:\n",
                 tex->buf->size, tex->tex.size_in_bytes);
         r300_tex_print_info(tex, "texture_desc_init");
         /* Ooops, what now. Apps will break if we fail this,
          * so just pretend everything's okay. */
      }
   }

   r300_setup_hyperz_properties(rscreen, tex);
   r300_setup_cmask_properties(rscreen, tex);

   if (SCREEN_DBG_ON(rscreen, DBG_TEX))
      r300_tex_print_info(tex, "texture_desc_init");
}
struct pipe_video_decoder *
vl_create_mpeg12_decoder(struct pipe_context *context,
                         enum pipe_video_profile profile,
                         enum pipe_video_entrypoint entrypoint,
                         enum pipe_video_chroma_format chroma_format,
                         unsigned width, unsigned height,
                         unsigned max_references,
                         bool expect_chunked_decode)
{
   const unsigned block_size_pixels = VL_BLOCK_WIDTH * VL_BLOCK_HEIGHT;
   const struct format_config *format_config;
   struct vl_mpeg12_decoder *dec;

   assert(u_reduce_video_profile(profile) == PIPE_VIDEO_CODEC_MPEG12);

   dec = CALLOC_STRUCT(vl_mpeg12_decoder);
   if (!dec)
      return NULL;

   dec->base.context = context;
   dec->base.profile = profile;
   dec->base.entrypoint = entrypoint;
   dec->base.chroma_format = chroma_format;
   dec->base.width = width;
   dec->base.height = height;
   dec->base.max_references = max_references;

   dec->base.destroy = vl_mpeg12_destroy;
   dec->base.begin_frame = vl_mpeg12_begin_frame;
   dec->base.decode_macroblock = vl_mpeg12_decode_macroblock;
   dec->base.decode_bitstream = vl_mpeg12_decode_bitstream;
   dec->base.end_frame = vl_mpeg12_end_frame;
   dec->base.flush = vl_mpeg12_flush;

   dec->blocks_per_line =
      MAX2(util_next_power_of_two(dec->base.width) / block_size_pixels, 4);
   dec->num_blocks = (dec->base.width * dec->base.height) / block_size_pixels;
   dec->width_in_macroblocks =
      align(dec->base.width, VL_MACROBLOCK_WIDTH) / VL_MACROBLOCK_WIDTH;
   dec->expect_chunked_decode = expect_chunked_decode;

   /* TODO: Implement 422, 444 */
   assert(dec->base.chroma_format == PIPE_VIDEO_CHROMA_FORMAT_420);

   if (dec->base.chroma_format == PIPE_VIDEO_CHROMA_FORMAT_420) {
      dec->chroma_width = dec->base.width / 2;
      dec->chroma_height = dec->base.height / 2;
      dec->num_blocks = dec->num_blocks * 2;
   } else if (dec->base.chroma_format == PIPE_VIDEO_CHROMA_FORMAT_422) {
      dec->chroma_width = dec->base.width;
      dec->chroma_height = dec->base.height / 2;
      dec->num_blocks = dec->num_blocks * 2 + dec->num_blocks;
   } else {
      dec->chroma_width = dec->base.width;
      dec->chroma_height = dec->base.height;
      dec->num_blocks = dec->num_blocks * 3;
   }

   dec->quads = vl_vb_upload_quads(dec->base.context);
   dec->pos = vl_vb_upload_pos(
      dec->base.context,
      dec->base.width / VL_MACROBLOCK_WIDTH,
      dec->base.height / VL_MACROBLOCK_HEIGHT
   );

   dec->ves_ycbcr = vl_vb_get_ves_ycbcr(dec->base.context);
   dec->ves_mv = vl_vb_get_ves_mv(dec->base.context);

   switch (entrypoint) {
   case PIPE_VIDEO_ENTRYPOINT_BITSTREAM:
      format_config = find_format_config(dec, bitstream_format_config,
                                         num_bitstream_format_configs);
      break;

   case PIPE_VIDEO_ENTRYPOINT_IDCT:
      format_config = find_format_config(dec, idct_format_config,
                                         num_idct_format_configs);
      break;

   case PIPE_VIDEO_ENTRYPOINT_MC:
      format_config = find_format_config(dec, mc_format_config,
                                         num_mc_format_configs);
      break;

   default:
      assert(0);
      FREE(dec);
      return NULL;
   }

   if (!format_config) {
      FREE(dec);
      return NULL;
   }

   if (!init_zscan(dec, format_config))
      goto error_zscan;

   if (entrypoint <= PIPE_VIDEO_ENTRYPOINT_IDCT) {
      if (!init_idct(dec, format_config))
         goto error_sources;
   } else {
      if (!init_mc_source_widthout_idct(dec, format_config))
         goto error_sources;
   }

   if (!vl_mc_init(&dec->mc_y, dec->base.context, dec->base.width,
                   dec->base.height, VL_MACROBLOCK_HEIGHT,
                   format_config->mc_scale,
                   mc_vert_shader_callback, mc_frag_shader_callback, dec))
      goto error_mc_y;

   // TODO
   if (!vl_mc_init(&dec->mc_c, dec->base.context, dec->base.width,
                   dec->base.height, VL_BLOCK_HEIGHT,
                   format_config->mc_scale,
                   mc_vert_shader_callback, mc_frag_shader_callback, dec))
      goto error_mc_c;

   if (!init_pipe_state(dec))
      goto error_pipe_state;

   return &dec->base;

error_pipe_state:
   vl_mc_cleanup(&dec->mc_c);

error_mc_c:
   vl_mc_cleanup(&dec->mc_y);

error_mc_y:
   if (entrypoint <= PIPE_VIDEO_ENTRYPOINT_IDCT) {
      vl_idct_cleanup(&dec->idct_y);
      vl_idct_cleanup(&dec->idct_c);
      dec->idct_source->destroy(dec->idct_source);
   }
   dec->mc_source->destroy(dec->mc_source);

error_sources:
   vl_zscan_cleanup(&dec->zscan_y);
   vl_zscan_cleanup(&dec->zscan_c);

error_zscan:
   FREE(dec);
   return NULL;
}
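/* Hedged worked example of the sizing above for a 720x480 MPEG-2 stream:
 *   block_size_pixels    = 8 * 8 = 64
 *   blocks_per_line      = MAX2(util_next_power_of_two(720) / 64, 4) = 16
 *   num_blocks (luma)    = 720 * 480 / 64 = 5400
 *   width_in_macroblocks = align(720, 16) / 16 = 45
 * and the 4:2:0 branch then doubles num_blocks to 10800 to cover the two
 * subsampled chroma planes. VL_BLOCK_WIDTH/HEIGHT == 8 and
 * VL_MACROBLOCK_WIDTH == 16 are assumptions based on the usual vl defines.
 */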
static void
vc4_setup_slices(struct vc4_resource *rsc)
{
   struct pipe_resource *prsc = &rsc->base.b;
   uint32_t width = prsc->width0;
   uint32_t height = prsc->height0;
   uint32_t pot_width = util_next_power_of_two(width);
   uint32_t pot_height = util_next_power_of_two(height);
   uint32_t offset = 0;
   uint32_t utile_w = vc4_utile_width(rsc->cpp);
   uint32_t utile_h = vc4_utile_height(rsc->cpp);

   for (int i = prsc->last_level; i >= 0; i--) {
      struct vc4_resource_slice *slice = &rsc->slices[i];
      uint32_t level_width, level_height;

      if (i == 0) {
         level_width = width;
         level_height = height;
      } else {
         level_width = u_minify(pot_width, i);
         level_height = u_minify(pot_height, i);
      }

      if (rsc->tiled == VC4_TILING_FORMAT_LINEAR) {
         slice->tiling = VC4_TILING_FORMAT_LINEAR;
         level_width = align(level_width, 16);
      } else {
         if (vc4_size_is_lt(level_width, level_height, rsc->cpp)) {
            slice->tiling = VC4_TILING_FORMAT_LT;
            level_width = align(level_width, utile_w);
            level_height = align(level_height, utile_h);
         } else {
            slice->tiling = VC4_TILING_FORMAT_T;
            level_width = align(level_width, 4 * 2 * utile_w);
            level_height = align(level_height, 4 * 2 * utile_h);
         }
      }

      slice->offset = offset;
      slice->stride = level_width * rsc->cpp;
      slice->size = level_height * slice->stride;

      offset += slice->size;
   }

   /* The texture base pointer that has to point to level 0 doesn't have
    * intra-page bits, so we have to align it, and thus shift up all the
    * smaller slices.
    */
   uint32_t page_align_offset = (align(rsc->slices[0].offset, 4096) -
                                 rsc->slices[0].offset);
   if (page_align_offset) {
      for (int i = 0; i <= prsc->last_level; i++)
         rsc->slices[i].offset += page_align_offset;
   }

   /* Cube map faces appear as whole miptrees at a page-aligned offset
    * from the first face's miptree.
    */
   if (prsc->target == PIPE_TEXTURE_CUBE) {
      rsc->cube_map_stride = align(rsc->slices[0].offset +
                                   rsc->slices[0].size, 4096);
   }
}
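/* Hedged example of the POT mip sizing above: a 100x100 texture keeps
 * 100x100 for level 0, but levels 1+ derive from the power-of-two-rounded
 * base (128x128), so level 1 is 64x64 and level 2 is 32x32, before the
 * per-tiling-format alignment is applied.
 */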
static INLINE unsigned
get_pot_stride(enum pipe_format format, unsigned width)
{
   return util_next_power_of_two(util_format_get_stride(format, width));
}
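/* Hedged example: for PIPE_FORMAT_B8G8R8A8_UNORM at width 100 the linear
 * stride is 100 * 4 = 400 bytes, so get_pot_stride() returns
 * util_next_power_of_two(400) = 512.
 */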
void *
r300_texture_transfer_map(struct pipe_context *ctx,
                          struct pipe_resource *texture,
                          unsigned level,
                          unsigned usage,
                          const struct pipe_box *box,
                          struct pipe_transfer **transfer)
{
   struct r300_context *r300 = r300_context(ctx);
   struct r300_resource *tex = r300_resource(texture);
   struct r300_transfer *trans;
   boolean referenced_cs, referenced_hw;
   enum pipe_format format = tex->b.b.format;
   char *map;

   referenced_cs =
      r300->rws->cs_is_buffer_referenced(r300->cs, tex->cs_buf,
                                         RADEON_USAGE_READWRITE);
   if (referenced_cs) {
      referenced_hw = TRUE;
   } else {
      referenced_hw =
         r300->rws->buffer_is_busy(tex->buf, RADEON_USAGE_READWRITE);
   }

   trans = CALLOC_STRUCT(r300_transfer);
   if (trans) {
      /* Initialize the transfer object. */
      trans->transfer.resource = texture;
      trans->transfer.level = level;
      trans->transfer.usage = usage;
      trans->transfer.box = *box;

      /* If the texture is tiled, we must create a temporary detiled texture
       * for this transfer.
       * Also make write transfers pipelined. */
      if (tex->tex.microtile || tex->tex.macrotile[level] ||
          (referenced_hw && !(usage & PIPE_TRANSFER_READ) &&
           r300_is_blit_supported(texture->format))) {
         struct pipe_resource base;

         if (r300->blitter->running) {
            fprintf(stderr, "r300: ERROR: Blitter recursion in "
                    "texture_get_transfer.\n");
            os_break();
         }

         memset(&base, 0, sizeof(base));
         base.target = PIPE_TEXTURE_2D;
         base.format = texture->format;
         base.width0 = box->width;
         base.height0 = box->height;
         base.depth0 = 1;
         base.array_size = 1;
         base.usage = PIPE_USAGE_STAGING;
         base.flags = R300_RESOURCE_FLAG_TRANSFER;

         /* We must set the correct texture target and dimensions if needed
          * for a 3D transfer. */
         if (box->depth > 1 && util_max_layer(texture, level) > 0) {
            base.target = texture->target;

            if (base.target == PIPE_TEXTURE_3D) {
               base.depth0 = util_next_power_of_two(box->depth);
            }
         }

         /* Create the temporary texture. */
         trans->linear_texture = r300_resource(
            ctx->screen->resource_create(ctx->screen, &base));

         if (!trans->linear_texture) {
            /* Oh crap, the thing can't create the texture.
             * Let's flush and try again. */
            r300_flush(ctx, 0, NULL);

            trans->linear_texture = r300_resource(
               ctx->screen->resource_create(ctx->screen, &base));

            if (!trans->linear_texture) {
               fprintf(stderr,
                       "r300: Failed to create a transfer object.\n");
               FREE(trans);
               return NULL;
            }
         }

         assert(!trans->linear_texture->tex.microtile &&
                !trans->linear_texture->tex.macrotile[0]);

         /* Set the stride. */
         trans->transfer.stride =
            trans->linear_texture->tex.stride_in_bytes[0];
         trans->transfer.layer_stride =
            trans->linear_texture->tex.layer_size_in_bytes[0];

         if (usage & PIPE_TRANSFER_READ) {
            /* We cannot map a tiled texture directly because the data is
             * in a different order, therefore we do detiling using a blit. */
            r300_copy_from_tiled_texture(ctx, trans);

            /* Always referenced in the blit. */
            r300_flush(ctx, 0, NULL);
         }
      } else {
         /* Unpipelined transfer. */
         trans->transfer.stride = tex->tex.stride_in_bytes[level];
         trans->transfer.layer_stride = tex->tex.layer_size_in_bytes[level];
         trans->offset = r300_texture_get_offset(tex, level, box->z);

         if (referenced_cs &&
             !(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) {
            r300_flush(ctx, 0, NULL);
         }
      }
   }

   if (trans->linear_texture) {
      /* The detiled texture is of the same size as the region being mapped
       * (no offset needed). */
      map = r300->rws->buffer_map(trans->linear_texture->cs_buf,
                                  r300->cs, usage);
      if (!map) {
         pipe_resource_reference(
            (struct pipe_resource**)&trans->linear_texture, NULL);
         FREE(trans);
         return NULL;
      }

      *transfer = &trans->transfer;
      return map;
   } else {
      /* Tiling is disabled. */
      map = r300->rws->buffer_map(tex->cs_buf, r300->cs, usage);
      if (!map) {
         FREE(trans);
         return NULL;
      }

      *transfer = &trans->transfer;
      return map + trans->offset +
             box->y / util_format_get_blockheight(format) * trans->transfer.stride +
             box->x / util_format_get_blockwidth(format) * util_format_get_blocksize(format);
   }
}
void
st_setup_current(struct st_context *st,
                 const struct st_vertex_program *vp,
                 const struct st_vp_variant *vp_variant,
                 struct pipe_vertex_element *velements,
                 struct pipe_vertex_buffer *vbuffer, unsigned *num_vbuffers)
{
   struct gl_context *ctx = st->ctx;
   const GLbitfield inputs_read = vp_variant->vert_attrib_mask;

   /* Process values that would better have been uniforms in the application. */
   GLbitfield curmask = inputs_read & _mesa_draw_current_bits(ctx);
   if (curmask) {
      /* vertex program validation must be done before this */
      const struct st_vertex_program *vp = st->vp;
      const ubyte *input_to_index = vp->input_to_index;

      /* For each attribute, upload the maximum possible size. */
      GLubyte data[VERT_ATTRIB_MAX * sizeof(GLdouble) * 4];
      GLubyte *cursor = data;
      const unsigned bufidx = (*num_vbuffers)++;
      unsigned max_alignment = 1;

      while (curmask) {
         const gl_vert_attrib attr = u_bit_scan(&curmask);
         const struct gl_array_attributes *const attrib
            = _mesa_draw_current_attrib(ctx, attr);
         const unsigned size = attrib->Format._ElementSize;
         const unsigned alignment = util_next_power_of_two(size);
         max_alignment = MAX2(max_alignment, alignment);
         memcpy(cursor, attrib->Ptr, size);
         if (alignment != size)
            memset(cursor + size, 0, alignment - size);

         init_velement_lowered(vp, velements, &attrib->Format, cursor - data, 0,
                               bufidx, input_to_index[attr]);

         cursor += alignment;
      }

      vbuffer[bufidx].is_user_buffer = false;
      vbuffer[bufidx].buffer.resource = NULL;
      /* vbuffer[bufidx].buffer_offset is set below */
      vbuffer[bufidx].stride = 0;

      /* Use const_uploader for zero-stride vertex attributes, because
       * it may use a better memory placement than stream_uploader.
       * The reason is that zero-stride attributes can be fetched many
       * times (thousands of times), so a better placement is going to
       * perform better.
       */
      struct u_upload_mgr *uploader = st->can_bind_const_buffer_as_vertex ?
                                      st->pipe->const_uploader :
                                      st->pipe->stream_uploader;
      u_upload_data(uploader, 0, cursor - data, max_alignment, data,
                    &vbuffer[bufidx].buffer_offset,
                    &vbuffer[bufidx].buffer.resource);
      /* Always unmap. The uploader might use explicit flushes. */
      u_upload_unmap(uploader);
   }
}
boolean r300_texture_desc_init(struct r300_screen *rscreen,
                               struct r300_resource *tex,
                               const struct pipe_resource *base)
{
   tex->b.b.b.target = base->target;
   tex->b.b.b.format = base->format;
   tex->b.b.b.width0 = base->width0;
   tex->b.b.b.height0 = base->height0;
   tex->b.b.b.depth0 = base->depth0;
   tex->b.b.b.array_size = base->array_size;
   tex->b.b.b.last_level = base->last_level;
   tex->b.b.b.nr_samples = base->nr_samples;
   tex->tex.width0 = base->width0;
   tex->tex.height0 = base->height0;
   tex->tex.depth0 = base->depth0;

   r300_setup_flags(tex);

   /* Align a 3D NPOT texture to POT. */
   if (base->target == PIPE_TEXTURE_3D && tex->tex.is_npot) {
      tex->tex.width0 = util_next_power_of_two(tex->tex.width0);
      tex->tex.height0 = util_next_power_of_two(tex->tex.height0);
      tex->tex.depth0 = util_next_power_of_two(tex->tex.depth0);
   }

   /* Setup tiling. */
   if (tex->tex.microtile == RADEON_LAYOUT_UNKNOWN) {
      r300_setup_tiling(rscreen, tex);
   }

   r300_setup_cbzb_flags(rscreen, tex);

   /* Setup the miptree description. */
   r300_setup_miptree(rscreen, tex, TRUE);
   /* If the required buffer size is larger than the given max size,
    * try again without the alignment for the CBZB clear. */
   if (tex->buf_size && tex->tex.size_in_bytes > tex->buf_size) {
      r300_setup_miptree(rscreen, tex, FALSE);
   }

   r300_setup_hyperz_properties(rscreen, tex);

   if (tex->buf_size) {
      /* Make sure the buffer we got is large enough. */
      if (tex->tex.size_in_bytes > tex->buf_size) {
         fprintf(stderr, "r300: texture_desc_init: The buffer is not "
                 "large enough. Got: %i, Need: %i, Info:\n",
                 tex->buf_size, tex->tex.size_in_bytes);
         r300_tex_print_info(tex, "texture_desc_init");
         return FALSE;
      }

      tex->tex.buffer_size_in_bytes = tex->buf_size;
   } else {
      tex->tex.buffer_size_in_bytes = tex->tex.size_in_bytes;
   }

   if (SCREEN_DBG_ON(rscreen, DBG_TEX))
      r300_tex_print_info(tex, "texture_desc_init");

   return TRUE;
}
/**
 * Fetch texels from a texture, returning them in SoA layout.
 *
 * \param type  the desired return type for 'rgba'. The vector length
 *              is the number of texels to fetch
 * \param aligned  if the offset is guaranteed to be aligned to element width
 *
 * \param base_ptr  points to the base of the texture mip tree.
 * \param offset    offset to start of the texture image block. For non-
 *                  compressed formats, this simply is an offset to the texel.
 *                  For compressed formats, it is an offset to the start of the
 *                  compressed data block.
 *
 * \param i, j  the sub-block pixel coordinates. For non-compressed formats
 *              these will always be (0,0). For compressed formats, i will
 *              be in [0, block_width-1] and j will be in [0, block_height-1].
 * \param cache  optional value pointing to a lp_build_format_cache structure
 */
void
lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
                        const struct util_format_description *format_desc,
                        struct lp_type type,
                        boolean aligned,
                        LLVMValueRef base_ptr,
                        LLVMValueRef offset,
                        LLVMValueRef i,
                        LLVMValueRef j,
                        LLVMValueRef cache,
                        LLVMValueRef rgba_out[4])
{
   LLVMBuilderRef builder = gallivm->builder;
   enum pipe_format format = format_desc->format;
   struct lp_type fetch_type;

   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
       (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB ||
        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
       format_desc->block.width == 1 &&
       format_desc->block.height == 1 &&
       format_desc->block.bits <= type.width &&
       (format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT ||
        format_desc->channel[0].size == 32 ||
        format_desc->channel[0].size == 16))
   {
      /*
       * The packed pixel fits into an element of the destination format. Put
       * the packed pixels into a vector and extract each component for all
       * vector elements in parallel.
       */

      LLVMValueRef packed;

      /*
       * gather the texels from the texture
       * Ex: packed = {XYZW, XYZW, XYZW, XYZW}
       */
      assert(format_desc->block.bits <= type.width);
      fetch_type = lp_type_uint(type.width);
      packed = lp_build_gather(gallivm,
                               type.length,
                               format_desc->block.bits,
                               fetch_type,
                               aligned,
                               base_ptr, offset, FALSE);

      /*
       * convert texels to float rgba
       */
      lp_build_unpack_rgba_soa(gallivm,
                               format_desc,
                               type,
                               packed, rgba_out);
      return;
   }

   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
       (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) &&
       format_desc->block.width == 1 &&
       format_desc->block.height == 1 &&
       format_desc->block.bits > type.width &&
       ((format_desc->block.bits <= type.width * type.length &&
         format_desc->channel[0].size <= type.width) ||
        (format_desc->channel[0].size == 64 &&
         format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
         type.floating)))
   {
      /*
       * Similar to above, but the packed pixel is larger than what fits
       * into an element of the destination format. The packed pixels will be
       * shuffled into SoA vectors appropriately, and then the extraction will
       * be done in parallel as much as possible.
       * Good for 16xn (n > 2) and 32xn (n > 1) formats, care is taken so
       * the gathered vectors can be shuffled easily (even with avx).
       * 64xn float -> 32xn float is handled too but it's a bit special as
       * it does the conversion pre-shuffle.
       */

      LLVMValueRef packed[4], dst[4], output[4], shuffles[LP_MAX_VECTOR_WIDTH/32];
      struct lp_type fetch_type, gather_type = type;
      unsigned num_gather, fetch_width, i, j;
      struct lp_build_context bld;
      boolean fp64 = format_desc->channel[0].size == 64;

      lp_build_context_init(&bld, gallivm, type);

      assert(type.width == 32);
      assert(format_desc->block.bits > type.width);

      /*
       * First, figure out fetch order.
       */
      fetch_width = util_next_power_of_two(format_desc->block.bits);
      /*
       * fp64 are treated like fp32 except we fetch twice wide values
       * (as we shuffle after trunc). The shuffles for that work out
       * mostly fine (slightly suboptimal for 4-wide, perfect for AVX)
       * albeit we miss the potential opportunity for hw gather (as it
       * only handles native size).
       */
      num_gather = fetch_width / type.width;
      gather_type.width *= num_gather;
      if (fp64) {
         num_gather /= 2;
      }
      gather_type.length /= num_gather;

      for (i = 0; i < num_gather; i++) {
         LLVMValueRef offsetr, shuf_vec;
         if (num_gather == 4) {
            for (j = 0; j < gather_type.length; j++) {
               unsigned idx = i + 4*j;
               shuffles[j] = lp_build_const_int32(gallivm, idx);
            }
            shuf_vec = LLVMConstVector(shuffles, gather_type.length);
            offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, "");
         } else if (num_gather == 2) {
            assert(num_gather == 2);
            for (j = 0; j < gather_type.length; j++) {
               unsigned idx = i*2 + (j%2) + (j/2)*4;
               shuffles[j] = lp_build_const_int32(gallivm, idx);
            }
            shuf_vec = LLVMConstVector(shuffles, gather_type.length);
            offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, "");
         } else {
            assert(num_gather == 1);
            offsetr = offset;
         }
         if (gather_type.length == 1) {
            LLVMValueRef zero = lp_build_const_int32(gallivm, 0);
            offsetr = LLVMBuildExtractElement(builder, offsetr, zero, "");
         }

         /*
          * Determine whether to use float or int loads. This is mostly
          * to outsmart the (stupid) llvm int/float shuffle logic, we
          * don't really care much if the data is floats or ints...
          * But llvm will refuse to use single float shuffle with int data
          * and instead use 3 int shuffles instead, the code looks atrocious.
          * (Note bitcasts often won't help, as llvm is too smart to be
          * fooled by that.)
          * Nobody cares about simd float<->int domain transition penalties,
          * which usually don't even exist for shuffles anyway.
          * With 4x32bit (and 3x32bit) fetch, we use float vec (the data is
          * going into transpose, which is unpacks, so doesn't really matter
          * much).
          * With 2x32bit or 4x16bit fetch, we use float vec, since those
          * go into the weird channel separation shuffle. With floats,
          * this is (with 128bit vectors):
          * - 2 movq, 2 movhpd, 2 shufps
          * With ints it would be:
          * - 4 movq, 2 punpcklqdq, 4 pshufd, 2 blendw
          * I've seen texture functions increase in code size by 15% just due
          * to that (there's lots of such fetches in them...)
          * (We could chose a different gather order to improve this somewhat
          * for the int path, but it would basically just drop the blends,
          * so the float path with this order really is optimal.)
          * Albeit it is tricky sometimes llvm doesn't ignore the float->int
          * casts so must avoid them until we're done with the float shuffle...
          * 3x16bit formats (the same is also true for 3x8) are pretty bad but
          * there's nothing we can do about them (we could overallocate by
          * those couple bytes and use unaligned but pot sized load).
          * Note that this is very much x86 specific. I don't know if this
          * affects other archs at all.
          */
         if (num_gather > 1) {
            /*
             * We always want some float type here (with x86)
             * due to shuffles being float ones afterwards (albeit for
             * the num_gather == 4 case int should work fine too
             * (unless there's some problems with avx but not avx2).
             */
            if (format_desc->channel[0].size == 64) {
               fetch_type = lp_type_float_vec(64, gather_type.width);
            } else {
               fetch_type = lp_type_int_vec(32, gather_type.width);
            }
         } else {
            /* type doesn't matter much */
            if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
                (format_desc->channel[0].size == 32 ||
                 format_desc->channel[0].size == 64)) {
               fetch_type = lp_type_float(gather_type.width);
            } else {
               fetch_type = lp_type_uint(gather_type.width);
            }
         }

         /* Now finally gather the values */
         packed[i] = lp_build_gather(gallivm, gather_type.length,
                                     format_desc->block.bits,
                                     fetch_type, aligned,
                                     base_ptr, offsetr, FALSE);
         if (fp64) {
            struct lp_type conv_type = type;
            conv_type.width *= 2;
            packed[i] = LLVMBuildBitCast(builder, packed[i],
                                         lp_build_vec_type(gallivm, conv_type), "");
            packed[i] = LLVMBuildFPTrunc(builder, packed[i], bld.vec_type, "");
         }
      }

      /* shuffle the gathered values to SoA */
      if (num_gather == 2) {
         for (i = 0; i < num_gather; i++) {
            for (j = 0; j < type.length; j++) {
               unsigned idx = (j%2)*2 + (j/4)*4 + i;
               if ((j/2)%2)
                  idx += type.length;
               shuffles[j] = lp_build_const_int32(gallivm, idx);
            }
            dst[i] = LLVMBuildShuffleVector(builder, packed[0], packed[1],
                                            LLVMConstVector(shuffles, type.length), "");
         }
      } else if (num_gather == 4) {
         lp_build_transpose_aos(gallivm, lp_int_type(type), packed, dst);
      } else {
         assert(num_gather == 1);
         dst[0] = packed[0];
      }

      /*
       * And finally unpack exactly as above, except that
       * chan shift is adjusted and the right vector selected.
       */
      if (!fp64) {
         for (i = 0; i < num_gather; i++) {
            dst[i] = LLVMBuildBitCast(builder, dst[i], bld.int_vec_type, "");
         }
         for (i = 0; i < format_desc->nr_channels; i++) {
            struct util_format_channel_description chan_desc = format_desc->channel[i];
            unsigned blockbits = type.width;
            unsigned vec_nr = chan_desc.shift / type.width;
            chan_desc.shift %= type.width;

            output[i] = lp_build_extract_soa_chan(&bld,
                                                  blockbits,
                                                  FALSE,
                                                  chan_desc,
                                                  dst[vec_nr]);
         }
      } else {
         for (i = 0; i < format_desc->nr_channels; i++) {
            output[i] = dst[i];
         }
      }

      lp_build_format_swizzle_soa(format_desc, &bld, output, rgba_out);
      return;
   }

   if (format == PIPE_FORMAT_R11G11B10_FLOAT ||
       format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
      /*
       * similar conceptually to above but requiring special
       * AoS packed -> SoA float conversion code.
       */
      LLVMValueRef packed;
      struct lp_type fetch_type = lp_type_uint(type.width);

      assert(type.floating);
      assert(type.width == 32);

      packed = lp_build_gather(gallivm, type.length,
                               format_desc->block.bits,
                               fetch_type, aligned,
                               base_ptr, offset, FALSE);
      if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
         lp_build_r11g11b10_to_float(gallivm, packed, rgba_out);
      } else {
         lp_build_rgb9e5_to_float(gallivm, packed, rgba_out);
      }
      return;
   }

   if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS &&
       format_desc->block.bits == 64) {
      /*
       * special case the format is 64 bits but we only require
       * 32bit (or 8bit) from each block.
       */
      LLVMValueRef packed;
      struct lp_type fetch_type = lp_type_uint(type.width);

      if (format == PIPE_FORMAT_X32_S8X24_UINT) {
         /*
          * for stencil simply fix up offsets - could in fact change
          * base_ptr instead even outside the shader.
          */
         unsigned mask = (1 << 8) - 1;
         LLVMValueRef s_offset = lp_build_const_int_vec(gallivm, type, 4);
         offset = LLVMBuildAdd(builder, offset, s_offset, "");
         packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
                                  aligned, base_ptr, offset, FALSE);
         packed = LLVMBuildAnd(builder, packed,
                               lp_build_const_int_vec(gallivm, type, mask), "");
      } else {
         assert(format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
         packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
                                  aligned, base_ptr, offset, TRUE);
         packed = LLVMBuildBitCast(builder, packed,
                                   lp_build_vec_type(gallivm, type), "");
      }
      /* for consistency with lp_build_unpack_rgba_soa() return sss1 or zzz1 */
      rgba_out[0] = rgba_out[1] = rgba_out[2] = packed;
      rgba_out[3] = lp_build_const_vec(gallivm, type, 1.0f);
      return;
   }

   /*
    * Try calling lp_build_fetch_rgba_aos for all pixels.
    * Should only really hit subsampled, compressed
    * (for s3tc srgb too, for rgtc the unorm ones only) by now.
    * (This is invalid for plain 8unorm formats because we're lazy with
    * the swizzle since some results would arrive swizzled, some not.)
    */
   if ((format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) &&
       (util_format_fits_8unorm(format_desc) ||
        format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) &&
       type.floating && type.width == 32 &&
       (type.length == 1 || (type.length % 4 == 0))) {
      struct lp_type tmp_type;
      struct lp_build_context bld;
      LLVMValueRef packed, rgba[4];
      const struct util_format_description *flinear_desc;
      const struct util_format_description *frgba8_desc;
      unsigned chan;

      lp_build_context_init(&bld, gallivm, type);

      /*
       * Make sure the conversion in aos really only does convert to rgba8
       * and not anything more (so use linear format, adjust type).
       */
      flinear_desc = util_format_description(util_format_linear(format));
      memset(&tmp_type, 0, sizeof tmp_type);
      tmp_type.width = 8;
      tmp_type.length = type.length * 4;
      tmp_type.norm = TRUE;

      packed = lp_build_fetch_rgba_aos(gallivm, flinear_desc, tmp_type,
                                       aligned, base_ptr, offset, i, j, cache);
      packed = LLVMBuildBitCast(builder, packed, bld.int_vec_type, "");

      /*
       * The values are now packed so they match ordinary (srgb) RGBA8 format,
       * hence need to use matching format for unpack.
       */
      frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_UNORM);
      if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
         assert(format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC);
         frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_SRGB);
      }
      lp_build_unpack_rgba_soa(gallivm,
                               frgba8_desc,
                               type,
                               packed, rgba);

      /*
       * We converted 4 channels. Make sure llvm can drop unneeded ones
       * (luckily the rgba order is fixed, only LA needs special case).
       */
      for (chan = 0; chan < 4; chan++) {
         enum pipe_swizzle swizzle = format_desc->swizzle[chan];
         if (chan == 3 && util_format_is_luminance_alpha(format)) {
            swizzle = PIPE_SWIZZLE_W;
         }
         rgba_out[chan] = lp_build_swizzle_soa_channel(&bld, rgba, swizzle);
      }
      return;
   }

   /*
    * Fallback to calling lp_build_fetch_rgba_aos for each pixel.
    *
    * This is not the most efficient way of fetching pixels, as we
    * miss some opportunities to do vectorization, but this is
    * convenient for formats or scenarios for which there was no
    * opportunity or incentive to optimize.
    *
    * We do NOT want to end up here, this typically is quite terrible,
    * in particular if the formats have less than 4 channels.
    *
    * Right now, this should only be hit for:
    * - RGTC snorm formats
    *   (those miss fast fetch functions hence they are terrible anyway)
    */
   {
      unsigned k;
      struct lp_type tmp_type;
      LLVMValueRef aos_fetch[LP_MAX_VECTOR_WIDTH / 32];

      if (gallivm_debug & GALLIVM_DEBUG_PERF) {
         debug_printf("%s: AoS fetch fallback for %s\n",
                      __FUNCTION__, format_desc->short_name);
      }

      tmp_type = type;
      tmp_type.length = 4;

      /*
       * Note that vector transpose can be worse compared to insert/extract
       * for aos->soa conversion (for formats with 1 or 2 channels). However,
       * we should try to avoid getting here for just about all formats, so
       * don't bother.
       */

      /* loop over number of pixels */
      for (k = 0; k < type.length; ++k) {
         LLVMValueRef index = lp_build_const_int32(gallivm, k);
         LLVMValueRef offset_elem;
         LLVMValueRef i_elem, j_elem;

         offset_elem = LLVMBuildExtractElement(builder, offset, index, "");

         i_elem = LLVMBuildExtractElement(builder, i, index, "");
         j_elem = LLVMBuildExtractElement(builder, j, index, "");

         /* Get a single float[4]={R,G,B,A} pixel */
         aos_fetch[k] = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
                                                aligned, base_ptr, offset_elem,
                                                i_elem, j_elem, cache);
      }
      convert_to_soa(gallivm, aos_fetch, rgba_out, type);
   }
}
PUBLIC
Status XvMCCreateSubpicture(Display *dpy, XvMCContext *context,
                            XvMCSubpicture *subpicture,
                            unsigned short width, unsigned short height,
                            int xvimage_id)
{
   XvMCContextPrivate *context_priv;
   XvMCSubpicturePrivate *subpicture_priv;
   struct pipe_context *pipe;
   struct pipe_resource tex_templ, *tex;
   struct pipe_sampler_view sampler_templ;
   Status ret;

   XVMC_MSG(XVMC_TRACE, "[XvMC] Creating subpicture %p.\n", subpicture);

   assert(dpy);

   if (!context)
      return XvMCBadContext;

   context_priv = context->privData;
   pipe = context_priv->pipe;

   if (!subpicture)
      return XvMCBadSubpicture;

   if (width > context_priv->subpicture_max_width ||
       height > context_priv->subpicture_max_height)
      return BadValue;

   ret = Validate(dpy, context->port, context->surface_type_id, xvimage_id);
   if (ret != Success)
      return ret;

   subpicture_priv = CALLOC(1, sizeof(XvMCSubpicturePrivate));
   if (!subpicture_priv)
      return BadAlloc;

   memset(&tex_templ, 0, sizeof(tex_templ));
   tex_templ.target = PIPE_TEXTURE_2D;
   tex_templ.format = XvIDToPipe(xvimage_id);
   tex_templ.last_level = 0;
   if (pipe->screen->get_video_param(pipe->screen,
                                     PIPE_VIDEO_PROFILE_UNKNOWN,
                                     PIPE_VIDEO_ENTRYPOINT_UNKNOWN,
                                     PIPE_VIDEO_CAP_NPOT_TEXTURES)) {
      tex_templ.width0 = width;
      tex_templ.height0 = height;
   } else {
      tex_templ.width0 = util_next_power_of_two(width);
      tex_templ.height0 = util_next_power_of_two(height);
   }
   tex_templ.depth0 = 1;
   tex_templ.array_size = 1;
   tex_templ.usage = PIPE_USAGE_DYNAMIC;
   tex_templ.bind = PIPE_BIND_SAMPLER_VIEW;
   tex_templ.flags = 0;

   tex = pipe->screen->resource_create(pipe->screen, &tex_templ);

   memset(&sampler_templ, 0, sizeof(sampler_templ));
   u_sampler_view_default_template(&sampler_templ, tex, tex->format);

   subpicture_priv->sampler = pipe->create_sampler_view(pipe, tex, &sampler_templ);
   pipe_resource_reference(&tex, NULL);
   if (!subpicture_priv->sampler) {
      FREE(subpicture_priv);
      return BadAlloc;
   }

   subpicture_priv->context = context;
   subpicture->subpicture_id = XAllocID(dpy);
   subpicture->context_id = context->context_id;
   subpicture->xvimage_id = xvimage_id;
   subpicture->width = width;
   subpicture->height = height;
   subpicture->num_palette_entries = NumPaletteEntries4XvID(xvimage_id);
   subpicture->entry_bytes = PipeToComponentOrder(tex_templ.format,
                                                  subpicture->component_order);
   subpicture->privData = subpicture_priv;

   if (subpicture->num_palette_entries > 0) {
      tex_templ.target = PIPE_TEXTURE_1D;
      tex_templ.format = PIPE_FORMAT_R8G8B8X8_UNORM;
      tex_templ.width0 = subpicture->num_palette_entries;
      tex_templ.height0 = 1;
      tex_templ.usage = PIPE_USAGE_DEFAULT;

      tex = pipe->screen->resource_create(pipe->screen, &tex_templ);

      memset(&sampler_templ, 0, sizeof(sampler_templ));
      u_sampler_view_default_template(&sampler_templ, tex, tex->format);
      sampler_templ.swizzle_a = PIPE_SWIZZLE_ONE;
      subpicture_priv->palette = pipe->create_sampler_view(pipe, tex, &sampler_templ);
      pipe_resource_reference(&tex, NULL);
      /* check the palette view we just created, not the sampler that was
       * already validated above */
      if (!subpicture_priv->palette) {
         FREE(subpicture_priv);
         return BadAlloc;
      }
   }

   SyncHandle();

   XVMC_MSG(XVMC_TRACE, "[XvMC] Subpicture %p created.\n", subpicture);

   return Success;
}