/**
 * Convert linear float SoA values to packed sRGB AoS values.
 * This only handles packed formats which are 4x8bit in size
 * (rgba and rgbx plus swizzles), and 16bit 565-style formats
 * with no alpha. (In the latter case the return values won't be
 * fully packed, it will look like r5g6b5x16r5g6b5x16...)
 *
 * @param src   float SoA (vector) values to convert.
 */
LLVMValueRef
lp_build_float_to_srgb_packed(struct gallivm_state *gallivm,
                              const struct util_format_description *dst_fmt,
                              struct lp_type src_type,
                              LLVMValueRef *src)
{
   LLVMBuilderRef builder = gallivm->builder;
   unsigned chan;
   struct lp_build_context f32_bld;
   struct lp_type int32_type = lp_int_type(src_type);
   LLVMValueRef tmpsrgb[4], alpha, dst;

   lp_build_context_init(&f32_bld, gallivm, src_type);

   /* rgb is subject to linear->srgb conversion, alpha is not */
   for (chan = 0; chan < 3; chan++) {
      unsigned chan_bits = dst_fmt->channel[dst_fmt->swizzle[chan]].size;
      tmpsrgb[chan] = lp_build_linear_to_srgb(gallivm, src_type,
                                              chan_bits, src[chan]);
   }

   /*
    * can't use lp_build_conv since we want to keep values as 32bit
    * here so we can interleave with rgb to go from SoA->AoS.
    */
   alpha = lp_build_clamp_zero_one_nanzero(&f32_bld, src[3]);
   alpha = lp_build_mul(&f32_bld, alpha,
                        lp_build_const_vec(gallivm, src_type, 255.0f));
   tmpsrgb[3] = lp_build_iround(&f32_bld, alpha);

   dst = lp_build_zero(gallivm, int32_type);
   for (chan = 0; chan < dst_fmt->nr_channels; chan++) {
      if (dst_fmt->swizzle[chan] <= PIPE_SWIZZLE_W) {
         unsigned ls;
         LLVMValueRef shifted, shift_val;
         ls = dst_fmt->channel[dst_fmt->swizzle[chan]].shift;
         shift_val = lp_build_const_int_vec(gallivm, int32_type, ls);
         shifted = LLVMBuildShl(builder, tmpsrgb[chan], shift_val, "");
         dst = LLVMBuildOr(builder, dst, shifted, "");
      }
   }
   return dst;
}
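/*
 * A minimal scalar sketch of the shift/OR packing loop above (the helper
 * name is hypothetical, not gallivm code): given per-channel integer values
 * already converted to their target bit width, each value is placed at its
 * channel shift. For PIPE_FORMAT_R8G8B8A8_UNORM the shifts are 0/8/16/24;
 * for a 565-style format they are 0/5/11 with 5/6/5-bit values.
 */
static inline unsigned
pack_channels_scalar(const unsigned chan_vals[4],
                     const unsigned chan_shifts[4],
                     unsigned nr_channels)
{
   unsigned chan, dst = 0;
   for (chan = 0; chan < nr_channels; chan++) {
      dst |= chan_vals[chan] << chan_shifts[chan];
   }
   return dst;
}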
void
lp_build_tgsi_aos(struct gallivm_state *gallivm,
                  const struct tgsi_token *tokens,
                  struct lp_type type,
                  const unsigned char swizzles[4],
                  LLVMValueRef consts_ptr,
                  const LLVMValueRef *inputs,
                  LLVMValueRef *outputs,
                  struct lp_build_sampler_aos *sampler,
                  const struct tgsi_shader_info *info)
{
   struct lp_build_tgsi_aos_context bld;
   struct tgsi_parse_context parse;
   uint num_immediates = 0;
   unsigned chan;
   int pc = 0;

   /* Setup build context */
   memset(&bld, 0, sizeof bld);
   lp_build_context_init(&bld.bld_base.base, gallivm, type);
   lp_build_context_init(&bld.bld_base.uint_bld, gallivm, lp_uint_type(type));
   lp_build_context_init(&bld.bld_base.int_bld, gallivm, lp_int_type(type));
   lp_build_context_init(&bld.int_bld, gallivm, lp_int_type(type));

   for (chan = 0; chan < 4; ++chan) {
      bld.swizzles[chan] = swizzles[chan];
      bld.inv_swizzles[swizzles[chan]] = chan;
   }

   bld.inputs = inputs;
   bld.outputs = outputs;
   bld.consts_ptr = consts_ptr;
   bld.sampler = sampler;
   bld.indirect_files = info->indirect_files;
   bld.bld_base.emit_swizzle = swizzle_aos;
   bld.bld_base.info = info;

   bld.bld_base.emit_fetch_funcs[TGSI_FILE_CONSTANT] = emit_fetch_constant;
   bld.bld_base.emit_fetch_funcs[TGSI_FILE_IMMEDIATE] = emit_fetch_immediate;
   bld.bld_base.emit_fetch_funcs[TGSI_FILE_INPUT] = emit_fetch_input;
   bld.bld_base.emit_fetch_funcs[TGSI_FILE_TEMPORARY] = emit_fetch_temporary;

   /* Set opcode actions */
   lp_set_default_actions_cpu(&bld.bld_base);

   if (!lp_bld_tgsi_list_init(&bld.bld_base)) {
      return;
   }

   tgsi_parse_init(&parse, tokens);

   while (!tgsi_parse_end_of_tokens(&parse)) {
      tgsi_parse_token(&parse);

      switch(parse.FullToken.Token.Type) {
      case TGSI_TOKEN_TYPE_DECLARATION:
         /* Inputs already interpolated */
         lp_emit_declaration_aos(&bld, &parse.FullToken.FullDeclaration);
         break;

      case TGSI_TOKEN_TYPE_INSTRUCTION:
         /* save expanded instruction */
         lp_bld_tgsi_add_instruction(&bld.bld_base,
                                     &parse.FullToken.FullInstruction);
         break;

      case TGSI_TOKEN_TYPE_IMMEDIATE:
         /* simply copy the immediate values into the next immediates[] slot */
         {
            const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
            float imm[4];
            assert(size <= 4);
            assert(num_immediates < LP_MAX_TGSI_IMMEDIATES);
            for (chan = 0; chan < 4; ++chan) {
               imm[chan] = 0.0f;
            }
            for (chan = 0; chan < size; ++chan) {
               unsigned swizzle = bld.swizzles[chan];
               imm[swizzle] = parse.FullToken.FullImmediate.u[chan].Float;
            }
            bld.immediates[num_immediates] =
               lp_build_const_aos(gallivm, type,
                                  imm[0], imm[1], imm[2], imm[3],
                                  NULL);
            num_immediates++;
         }
         break;

      case TGSI_TOKEN_TYPE_PROPERTY:
         break;

      default:
         assert(0);
      }
   }

   while (pc != -1) {
      struct tgsi_full_instruction *instr = bld.bld_base.instructions + pc;
      const struct tgsi_opcode_info *opcode_info =
         tgsi_get_opcode_info(instr->Instruction.Opcode);
      if (!lp_emit_instruction_aos(&bld, instr, opcode_info, &pc))
         _debug_printf("warning: failed to translate tgsi opcode %s to LLVM\n",
                       opcode_info->mnemonic);
   }

   if (0) {
      LLVMBasicBlockRef block = LLVMGetInsertBlock(gallivm->builder);
      LLVMValueRef function = LLVMGetBasicBlockParent(block);
      debug_printf("11111111111111111111111111111 \n");
      tgsi_dump(tokens, 0);
      lp_debug_dump_value(function);
      debug_printf("2222222222222222222222222222 \n");
   }
   tgsi_parse_free(&parse);

   FREE(bld.bld_base.instructions);

   if (0) {
      LLVMModuleRef module = LLVMGetGlobalParent(
         LLVMGetBasicBlockParent(LLVMGetInsertBlock(gallivm->builder)));
      LLVMDumpModule(module);
   }
}
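/*
 * A minimal sketch of the swizzle bookkeeping done in the setup loop above
 * (the helper name is hypothetical). bld.swizzles maps a TGSI channel to
 * its position in the AoS vector, and bld.inv_swizzles is the inverse map.
 * E.g. for a BGRA layout, swizzles = {2, 1, 0, 3}: TGSI X lives at AoS
 * position 2, and inv_swizzles[2] == 0 recovers X from that position.
 */
static void
build_inverse_swizzle(const unsigned char swizzles[4],
                      unsigned char inv_swizzles[4])
{
   unsigned chan;
   for (chan = 0; chan < 4; ++chan) {
      inv_swizzles[swizzles[chan]] = chan;
   }
}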
/**
 * Generate code for performing depth and/or stencil tests.
 * We operate on a vector of values (typically n 2x2 quads).
 *
 * \param depth  the depth test state
 * \param stencil  the front/back stencil state
 * \param type  the data type of the fragment depth/stencil values
 * \param format_desc  description of the depth/stencil surface
 * \param mask  the alive/dead pixel mask for the quad (vector)
 * \param stencil_refs  the front/back stencil ref values (scalar)
 * \param z_src  the incoming depth/stencil values (n 2x2 quad values, float32)
 * \param zs_dst  the depth/stencil values in framebuffer
 * \param face  contains boolean value indicating front/back facing polygon
 */
void
lp_build_depth_stencil_test(struct gallivm_state *gallivm,
                            const struct pipe_depth_state *depth,
                            const struct pipe_stencil_state stencil[2],
                            struct lp_type z_src_type,
                            const struct util_format_description *format_desc,
                            struct lp_build_mask_context *mask,
                            LLVMValueRef stencil_refs[2],
                            LLVMValueRef z_src,
                            LLVMValueRef z_fb,
                            LLVMValueRef s_fb,
                            LLVMValueRef face,
                            LLVMValueRef *z_value,
                            LLVMValueRef *s_value,
                            boolean do_branch)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type z_type;
   struct lp_build_context z_bld;
   struct lp_build_context s_bld;
   struct lp_type s_type;
   unsigned z_shift = 0, z_width = 0, z_mask = 0;
   LLVMValueRef z_dst = NULL;
   LLVMValueRef stencil_vals = NULL;
   LLVMValueRef z_bitmask = NULL, stencil_shift = NULL;
   LLVMValueRef z_pass = NULL, s_pass_mask = NULL;
   LLVMValueRef orig_mask = lp_build_mask_value(mask);
   LLVMValueRef front_facing = NULL;
   boolean have_z, have_s;

   /*
    * Depths are expected to be between 0 and 1, even if they are stored in
    * floats. Setting these bits here will ensure that the lp_build_conv()
    * call below won't try to unnecessarily clamp the incoming values.
    */
   if (z_src_type.floating) {
      z_src_type.sign = FALSE;
      z_src_type.norm = TRUE;
   }
   else {
      assert(!z_src_type.sign);
      assert(z_src_type.norm);
   }

   /* Pick the type matching the depth-stencil format. */
   z_type = lp_depth_type(format_desc, z_src_type.length);

   /* Pick the intermediate type for depth operations. */
   z_type.width = z_src_type.width;
   assert(z_type.length == z_src_type.length);

   /* FIXME: for non-float depth/stencil might generate better code
    * if we'd always split it up to use 128bit operations.
    * For stencil we'd almost certainly want to pack to 8xi16 values,
    * for z just run twice.
    */

   /* Sanity checking */
   {
      const unsigned z_swizzle = format_desc->swizzle[0];
      const unsigned s_swizzle = format_desc->swizzle[1];

      assert(z_swizzle != UTIL_FORMAT_SWIZZLE_NONE ||
             s_swizzle != UTIL_FORMAT_SWIZZLE_NONE);

      assert(depth->enabled || stencil[0].enabled);

      assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
      assert(format_desc->block.width == 1);
      assert(format_desc->block.height == 1);

      if (stencil[0].enabled) {
         assert(s_swizzle < 4);
         assert(format_desc->channel[s_swizzle].type ==
                UTIL_FORMAT_TYPE_UNSIGNED);
         assert(format_desc->channel[s_swizzle].pure_integer);
         assert(!format_desc->channel[s_swizzle].normalized);
         assert(format_desc->channel[s_swizzle].size == 8);
      }

      if (depth->enabled) {
         assert(z_swizzle < 4);
         if (z_type.floating) {
            assert(z_swizzle == 0);
            assert(format_desc->channel[z_swizzle].type ==
                   UTIL_FORMAT_TYPE_FLOAT);
            assert(format_desc->channel[z_swizzle].size == 32);
         }
         else {
            assert(format_desc->channel[z_swizzle].type ==
                   UTIL_FORMAT_TYPE_UNSIGNED);
            assert(format_desc->channel[z_swizzle].normalized);
            assert(!z_type.fixed);
         }
      }
   }

   /* Setup build context for Z vals */
   lp_build_context_init(&z_bld, gallivm, z_type);

   /* Setup build context for stencil vals */
   s_type = lp_int_type(z_type);
   lp_build_context_init(&s_bld, gallivm, s_type);

   /* Compute and apply the Z/stencil bitmasks and shifts. */
   {
      unsigned s_shift, s_mask;

      z_dst = z_fb;
      stencil_vals = s_fb;

      have_z = get_z_shift_and_mask(format_desc, &z_shift, &z_width, &z_mask);
      have_s = get_s_shift_and_mask(format_desc, &s_shift, &s_mask);

      if (have_z) {
         if (z_mask != 0xffffffff) {
            z_bitmask = lp_build_const_int_vec(gallivm, z_type, z_mask);
         }

         /*
          * Align the framebuffer Z's LSB to the right.
          */
         if (z_shift) {
            LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_type, z_shift);
            z_dst = LLVMBuildLShr(builder, z_dst, shift, "z_dst");
         } else if (z_bitmask) {
            z_dst = LLVMBuildAnd(builder, z_dst, z_bitmask, "z_dst");
         } else {
            lp_build_name(z_dst, "z_dst");
         }
      }

      if (have_s) {
         if (s_shift) {
            LLVMValueRef shift = lp_build_const_int_vec(gallivm, s_type, s_shift);
            stencil_vals = LLVMBuildLShr(builder, stencil_vals, shift, "");
            stencil_shift = shift;  /* used below */
         }

         if (s_mask != 0xffffffff) {
            LLVMValueRef mask = lp_build_const_int_vec(gallivm, s_type, s_mask);
            stencil_vals = LLVMBuildAnd(builder, stencil_vals, mask, "");
         }

         lp_build_name(stencil_vals, "s_dst");
      }
   }

   if (stencil[0].enabled) {

      if (face) {
         LLVMValueRef zero = lp_build_const_int32(gallivm, 0);

         /* front_facing = face != 0 ? ~0 : 0 */
         front_facing = LLVMBuildICmp(builder, LLVMIntNE, face, zero, "");
         front_facing = LLVMBuildSExt(builder, front_facing,
                                      LLVMIntTypeInContext(gallivm->context,
                                             s_bld.type.length*s_bld.type.width),
                                      "");
         front_facing = LLVMBuildBitCast(builder, front_facing,
                                         s_bld.int_vec_type, "");
      }

      /* convert scalar stencil refs into vectors */
      stencil_refs[0] = lp_build_broadcast_scalar(&s_bld, stencil_refs[0]);
      stencil_refs[1] = lp_build_broadcast_scalar(&s_bld, stencil_refs[1]);

      s_pass_mask = lp_build_stencil_test(&s_bld, stencil,
                                          stencil_refs, stencil_vals,
                                          front_facing);

      /* apply stencil-fail operator */
      {
         LLVMValueRef s_fail_mask = lp_build_andnot(&s_bld, orig_mask, s_pass_mask);
         stencil_vals = lp_build_stencil_op(&s_bld, stencil, S_FAIL_OP,
                                            stencil_refs, stencil_vals,
                                            s_fail_mask, front_facing);
      }
   }

   if (depth->enabled) {
      /*
       * Convert fragment Z to the desired type, aligning the LSB to the right.
       */

      assert(z_type.width == z_src_type.width);
      assert(z_type.length == z_src_type.length);
      assert(lp_check_value(z_src_type, z_src));
      if (z_src_type.floating) {
         /*
          * Convert from floating point values
          */

         if (!z_type.floating) {
            z_src = lp_build_clamped_float_to_unsigned_norm(gallivm,
                                                            z_src_type,
                                                            z_width,
                                                            z_src);
         }
      } else {
         /*
          * Convert from unsigned normalized values.
          */

         assert(!z_src_type.sign);
         assert(!z_src_type.fixed);
         assert(z_src_type.norm);
         assert(!z_type.floating);
         if (z_src_type.width > z_width) {
            LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_src_type,
                                                        z_src_type.width - z_width);
            z_src = LLVMBuildLShr(builder, z_src, shift, "");
         }
      }
      assert(lp_check_value(z_type, z_src));

      lp_build_name(z_src, "z_src");

      /* compare src Z to dst Z, returning 'pass' mask */
      z_pass = lp_build_cmp(&z_bld, depth->func, z_src, z_dst);

      if (!stencil[0].enabled) {
         /* We can potentially skip all remaining operations here, but only
          * if stencil is disabled because we still need to update the stencil
          * buffer values.  Don't need to update Z buffer values.
          */
         lp_build_mask_update(mask, z_pass);

         if (do_branch) {
            lp_build_mask_check(mask);
            do_branch = FALSE;
         }
      }

      if (depth->writemask) {
         LLVMValueRef zselectmask;

         /* mask off bits that failed Z test */
         zselectmask = LLVMBuildAnd(builder, orig_mask, z_pass, "");

         /* mask off bits that failed stencil test */
         if (s_pass_mask) {
            zselectmask = LLVMBuildAnd(builder, zselectmask, s_pass_mask, "");
         }

         /* Mix the old and new Z buffer values.
          * z_dst[i] = zselectmask[i] ? z_src[i] : z_dst[i]
          */
         z_dst = lp_build_select(&z_bld, zselectmask, z_src, z_dst);
      }

      if (stencil[0].enabled) {
         /* update stencil buffer values according to z pass/fail result */
         LLVMValueRef z_fail_mask, z_pass_mask;

         /* apply Z-fail operator */
         z_fail_mask = lp_build_andnot(&s_bld, orig_mask, z_pass);
         stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_FAIL_OP,
                                            stencil_refs, stencil_vals,
                                            z_fail_mask, front_facing);

         /* apply Z-pass operator */
         z_pass_mask = LLVMBuildAnd(builder, orig_mask, z_pass, "");
         stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_PASS_OP,
                                            stencil_refs, stencil_vals,
                                            z_pass_mask, front_facing);
      }
   }
   else {
      /* No depth test: apply Z-pass operator to stencil buffer values which
       * passed the stencil test.
       */
      s_pass_mask = LLVMBuildAnd(builder, orig_mask, s_pass_mask, "");
      stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_PASS_OP,
                                         stencil_refs, stencil_vals,
                                         s_pass_mask, front_facing);
   }

   /* Put Z and stencil bits in the right place */
   if (have_z && z_shift) {
      LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_type, z_shift);
      z_dst = LLVMBuildShl(builder, z_dst, shift, "");
   }
   if (stencil_vals && stencil_shift)
      stencil_vals = LLVMBuildShl(builder, stencil_vals,
                                  stencil_shift, "");

   /* Finally, merge the z/stencil values */
   if (format_desc->block.bits <= 32) {
      if (have_z && have_s)
         *z_value = LLVMBuildOr(builder, z_dst, stencil_vals, "");
      else if (have_z)
         *z_value = z_dst;
      else
         *z_value = stencil_vals;
      *s_value = *z_value;
   }
   else {
      *z_value = z_dst;
      *s_value = stencil_vals;
   }

   if (s_pass_mask)
      lp_build_mask_update(mask, s_pass_mask);

   if (depth->enabled && stencil[0].enabled)
      lp_build_mask_update(mask, z_pass);
}
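/*
 * A minimal scalar sketch (hypothetical helper, not part of llvmpipe) of
 * what the vectorized depth write path above computes per pixel, assuming
 * PIPE_FUNC_LEQUAL: compare source Z against framebuffer Z, and write the
 * new Z back only where the pixel is still alive, passed the stencil test,
 * passed the depth test, and depth writes are enabled.
 */
static inline uint32_t
depth_test_scalar(uint32_t z_src, uint32_t z_dst,
                  boolean alive, boolean s_pass, boolean write_enable,
                  boolean *z_pass)
{
   *z_pass = z_src <= z_dst;                /* depth->func == PIPE_FUNC_LEQUAL */
   if (alive && s_pass && *z_pass && write_enable)
      return z_src;                         /* like lp_build_select with zselectmask set */
   return z_dst;                            /* keep the framebuffer value */
}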
/**
 * Convert linear float values to srgb int values.
 * Several possibilities how to do this, e.g.
 * - use table (based on exponent/highest order mantissa bits) and do
 *   linear interpolation (https://gist.github.com/rygorous/2203834)
 * - Chebyshev polynomial
 * - Approximation using reciprocals
 * - using int-to-float and float-to-int tricks for pow()
 *   (http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent)
 *
 * @param src   float (vector) value(s) to convert.
 * @param chan_bits   bit width of the target channel; the result is scaled
 *                    to [0, 2^chan_bits - 1] (this matches the call site in
 *                    lp_build_float_to_srgb_packed above, which passes the
 *                    per-channel size to handle 565-style formats).
 */
static LLVMValueRef
lp_build_linear_to_srgb(struct gallivm_state *gallivm,
                        struct lp_type src_type,
                        unsigned chan_bits,
                        LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_build_context f32_bld;
   LLVMValueRef lin_thresh, lin, lin_const, is_linear, tmp, pow_final;
   float chan_scale = (float)((1 << chan_bits) - 1);

   lp_build_context_init(&f32_bld, gallivm, src_type);

   src = lp_build_clamp(&f32_bld, src, f32_bld.zero, f32_bld.one);

   if (0) {
      /*
       * using int-to-float and float-to-int trick for pow().
       * This is much more accurate than necessary thanks to the correction,
       * but it most certainly makes no sense without rsqrt available.
       * Bonus points if you understand how this works...
       * All in all (including min/max clamp, conversion) 19 instructions.
       */

      float exp_f = 2.0f / 3.0f;
      /* some compilers can't do exp2f, so this is exp2f(127.0f/exp_f - 127.0f) */
      float exp2f_c = 1.30438178253e+19f;
      float coeff_f = 0.62996f;
      LLVMValueRef pow_approx, coeff, x2, exponent, pow_1, pow_2;
      struct lp_type int_type = lp_int_type(src_type);

      /* this path hardcodes the 255.0 (8-bit) output scale */
      assert(chan_bits == 8);

      /*
       * First calculate approx x^8/12
       */
      exponent = lp_build_const_vec(gallivm, src_type, exp_f);
      coeff = lp_build_const_vec(gallivm, src_type,
                                 exp2f_c * powf(coeff_f, 1.0f / exp_f));

      /* premultiply src */
      tmp = lp_build_mul(&f32_bld, coeff, src);
      /* "log2" */
      tmp = LLVMBuildBitCast(builder, tmp,
                             lp_build_vec_type(gallivm, int_type), "");
      tmp = lp_build_int_to_float(&f32_bld, tmp);
      /* multiply for pow */
      tmp = lp_build_mul(&f32_bld, tmp, exponent);
      /* "exp2" */
      pow_approx = lp_build_itrunc(&f32_bld, tmp);
      pow_approx = LLVMBuildBitCast(builder, pow_approx,
                                    lp_build_vec_type(gallivm, src_type), "");

      /*
       * Since that pow was inaccurate (like 3 bits, though each sqrt step
       * would give another bit), compensate the error (which is why we
       * chose another exponent in the first place).
       */
      /* x * x^(8/12) = x^(20/12) */
      pow_1 = lp_build_mul(&f32_bld, pow_approx, src);

      /* x * x * x^(-4/12) = x^(20/12) */
      /* Should avoid using rsqrt if it's not available, but
       * using x * x^(4/12) * x^(4/12) instead will change error weight */
      tmp = lp_build_fast_rsqrt(&f32_bld, pow_approx);
      x2 = lp_build_mul(&f32_bld, src, src);
      pow_2 = lp_build_mul(&f32_bld, x2, tmp);

      /* average the values so the errors cancel out, compensate bias,
       * we also squeeze the 1.055 mul of the srgb conversion plus the 255.0
       * mul for conversion to int in here */
      tmp = lp_build_add(&f32_bld, pow_1, pow_2);
      coeff = lp_build_const_vec(gallivm, src_type,
                                 1.0f / (3.0f * coeff_f) * 0.999852f *
                                 powf(1.055f * 255.0f, 4.0f));
      pow_final = lp_build_mul(&f32_bld, tmp, coeff);

      /* x^(5/12) = rsqrt(rsqrt(x^20/12)) */
      if (lp_build_fast_rsqrt_available(src_type)) {
         pow_final = lp_build_fast_rsqrt(&f32_bld,
                        lp_build_fast_rsqrt(&f32_bld, pow_final));
      }
      else {
         pow_final = lp_build_sqrt(&f32_bld, lp_build_sqrt(&f32_bld, pow_final));
      }
      pow_final = lp_build_add(&f32_bld, pow_final,
                               lp_build_const_vec(gallivm, src_type,
                                                  -0.055f * 255.0f));
   }
   else {
      /*
       * using "rational polynomial" approximation here.
       * Essentially y = a*x^0.375 + b*x^0.5 + c, with also
       * factoring in the chan_scale mul (255.0 for 8-bit channels).
       * (a is closer to the actual value so it has higher weight than b.)
       * Note: the constants are magic values. They were found empirically;
       * they could possibly be improved but are good enough (be VERY careful
       * with the error metric if you want to tweak them, and they also MUST
       * fit with the crappy polynomial above for srgb->linear since it is
       * required that each srgb value maps back to the same value).
       * This function has an error of max +-0.17 (and we'd only require
       * +-0.6); for the approximated srgb->linear values the error is
       * naturally larger (+-0.42) but still accurate enough (required +-0.5
       * essentially).
       * All in all (including min/max clamp, conversion) 15 instructions.
       * FMA would help (minus 2 instructions).
       */
      LLVMValueRef x05, x0375, a_const, b_const, c_const, tmp2;

      if (lp_build_fast_rsqrt_available(src_type)) {
         tmp = lp_build_fast_rsqrt(&f32_bld, src);
         x05 = lp_build_mul(&f32_bld, src, tmp);
      }
      else {
         /*
          * I don't really expect this to be practical without rsqrt
          * but there's no reason for triple punishment so at least
          * save the otherwise resulting division and unnecessary mul...
          */
         x05 = lp_build_sqrt(&f32_bld, src);
      }

      tmp = lp_build_mul(&f32_bld, x05, src);
      if (lp_build_fast_rsqrt_available(src_type)) {
         x0375 = lp_build_fast_rsqrt(&f32_bld, lp_build_fast_rsqrt(&f32_bld, tmp));
      }
      else {
         x0375 = lp_build_sqrt(&f32_bld, lp_build_sqrt(&f32_bld, tmp));
      }

      a_const = lp_build_const_vec(gallivm, src_type, 0.675f * 1.0622 * chan_scale);
      b_const = lp_build_const_vec(gallivm, src_type, 0.325f * 1.0622 * chan_scale);
      c_const = lp_build_const_vec(gallivm, src_type, -0.0620f * chan_scale);

      tmp = lp_build_mul(&f32_bld, a_const, x0375);
      tmp2 = lp_build_mul(&f32_bld, b_const, x05);
      tmp2 = lp_build_add(&f32_bld, tmp2, c_const);
      pow_final = lp_build_add(&f32_bld, tmp, tmp2);
   }

   /* linear part is easy */
   lin_const = lp_build_const_vec(gallivm, src_type, 12.92f * chan_scale);
   lin = lp_build_mul(&f32_bld, src, lin_const);

   lin_thresh = lp_build_const_vec(gallivm, src_type, 0.0031308f);
   is_linear = lp_build_compare(gallivm, src_type, PIPE_FUNC_LEQUAL,
                                src, lin_thresh);
   tmp = lp_build_select(&f32_bld, is_linear, lin, pow_final);

   f32_bld.type.sign = 0;
   return lp_build_iround(&f32_bld, tmp);
}
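/*
 * For reference, the exact piecewise linear->sRGB encoding that the code
 * above approximates (a plain scalar sketch using libm, not gallivm); the
 * final integer value is this result times (2^chan_bits - 1), e.g. 255
 * for 8-bit channels:
 */
static inline float
linear_to_srgb_ref(float x)
{
   if (x <= 0.0031308f)
      return 12.92f * x;
   return 1.055f * powf(x, 1.0f / 2.4f) - 0.055f;
}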
/**
 * Fetch texels from a texture, returning them in SoA layout.
 *
 * \param type  the desired return type for 'rgba'. The vector length
 *              is the number of texels to fetch
 * \param aligned  if the offset is guaranteed to be aligned to element width
 *
 * \param base_ptr  points to the base of the texture mip tree.
 * \param offset    offset to start of the texture image block. For non-
 *                  compressed formats, this simply is an offset to the texel.
 *                  For compressed formats, it is an offset to the start of the
 *                  compressed data block.
 *
 * \param i, j  the sub-block pixel coordinates. For non-compressed formats
 *              these will always be (0,0). For compressed formats, i will
 *              be in [0, block_width-1] and j will be in [0, block_height-1].
 * \param cache  optional value pointing to a lp_build_format_cache structure
 */
void
lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
                        const struct util_format_description *format_desc,
                        struct lp_type type,
                        boolean aligned,
                        LLVMValueRef base_ptr,
                        LLVMValueRef offset,
                        LLVMValueRef i,
                        LLVMValueRef j,
                        LLVMValueRef cache,
                        LLVMValueRef rgba_out[4])
{
   LLVMBuilderRef builder = gallivm->builder;
   enum pipe_format format = format_desc->format;
   struct lp_type fetch_type;

   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
       (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB ||
        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
       format_desc->block.width == 1 &&
       format_desc->block.height == 1 &&
       format_desc->block.bits <= type.width &&
       (format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT ||
        format_desc->channel[0].size == 32 ||
        format_desc->channel[0].size == 16))
   {
      /*
       * The packed pixel fits into an element of the destination format. Put
       * the packed pixels into a vector and extract each component for all
       * vector elements in parallel.
       */

      LLVMValueRef packed;

      /*
       * gather the texels from the texture
       * Ex: packed = {XYZW, XYZW, XYZW, XYZW}
       */
      assert(format_desc->block.bits <= type.width);
      fetch_type = lp_type_uint(type.width);
      packed = lp_build_gather(gallivm,
                               type.length,
                               format_desc->block.bits,
                               fetch_type,
                               aligned,
                               base_ptr, offset, FALSE);

      /*
       * convert texels to float rgba
       */
      lp_build_unpack_rgba_soa(gallivm,
                               format_desc,
                               type,
                               packed, rgba_out);
      return;
   }

   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
       (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) &&
       format_desc->block.width == 1 &&
       format_desc->block.height == 1 &&
       format_desc->block.bits > type.width &&
       ((format_desc->block.bits <= type.width * type.length &&
         format_desc->channel[0].size <= type.width) ||
        (format_desc->channel[0].size == 64 &&
         format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
         type.floating)))
   {
      /*
       * Similar to above, but the packed pixel is larger than what fits
       * into an element of the destination format. The packed pixels will
       * be shuffled into SoA vectors appropriately, and then the extraction
       * will be done in parallel as much as possible.
       * Good for 16xn (n > 2) and 32xn (n > 1) formats, care is taken so
       * the gathered vectors can be shuffled easily (even with avx).
       * 64xn float -> 32xn float is handled too but it's a bit special as
       * it does the conversion pre-shuffle.
       */

      LLVMValueRef packed[4], dst[4], output[4], shuffles[LP_MAX_VECTOR_WIDTH/32];
      struct lp_type fetch_type, gather_type = type;
      unsigned num_gather, fetch_width, i, j;
      struct lp_build_context bld;
      boolean fp64 = format_desc->channel[0].size == 64;

      lp_build_context_init(&bld, gallivm, type);

      assert(type.width == 32);
      assert(format_desc->block.bits > type.width);

      /*
       * First, figure out fetch order.
       */
      fetch_width = util_next_power_of_two(format_desc->block.bits);
      /*
       * fp64 are treated like fp32 except we fetch twice wide values
       * (as we shuffle after trunc). The shuffles for that work out
       * mostly fine (slightly suboptimal for 4-wide, perfect for AVX)
       * albeit we miss the potential opportunity for hw gather (as it
       * only handles native size).
       */
      num_gather = fetch_width / type.width;
      gather_type.width *= num_gather;
      if (fp64) {
         num_gather /= 2;
      }
      gather_type.length /= num_gather;

      for (i = 0; i < num_gather; i++) {
         LLVMValueRef offsetr, shuf_vec;
         if (num_gather == 4) {
            for (j = 0; j < gather_type.length; j++) {
               unsigned idx = i + 4*j;
               shuffles[j] = lp_build_const_int32(gallivm, idx);
            }
            shuf_vec = LLVMConstVector(shuffles, gather_type.length);
            offsetr = LLVMBuildShuffleVector(builder, offset, offset,
                                             shuf_vec, "");
         }
         else if (num_gather == 2) {
            assert(num_gather == 2);
            for (j = 0; j < gather_type.length; j++) {
               unsigned idx = i*2 + (j%2) + (j/2)*4;
               shuffles[j] = lp_build_const_int32(gallivm, idx);
            }
            shuf_vec = LLVMConstVector(shuffles, gather_type.length);
            offsetr = LLVMBuildShuffleVector(builder, offset, offset,
                                             shuf_vec, "");
         }
         else {
            assert(num_gather == 1);
            offsetr = offset;
         }
         if (gather_type.length == 1) {
            LLVMValueRef zero = lp_build_const_int32(gallivm, 0);
            offsetr = LLVMBuildExtractElement(builder, offsetr, zero, "");
         }

         /*
          * Determine whether to use float or int loads. This is mostly
          * to outsmart the (stupid) llvm int/float shuffle logic, we
          * don't really care much if the data is floats or ints...
          * But llvm will refuse to use single float shuffle with int data
          * and instead use 3 int shuffles instead, the code looks atrocious.
          * (Note bitcasts often won't help, as llvm is too smart to be
          * fooled by that.)
          * Nobody cares about simd float<->int domain transition penalties,
          * which usually don't even exist for shuffles anyway.
          * With 4x32bit (and 3x32bit) fetch, we use float vec (the data is
          * going into transpose, which is unpacks, so doesn't really matter
          * much).
          * With 2x32bit or 4x16bit fetch, we use float vec, since those
          * go into the weird channel separation shuffle. With floats,
          * this is (with 128bit vectors):
          * - 2 movq, 2 movhpd, 2 shufps
          * With ints it would be:
          * - 4 movq, 2 punpcklqdq, 4 pshufd, 2 blendw
          * I've seen texture functions increase in code size by 15% just due
          * to that (there's lots of such fetches in them...)
          * (We could choose a different gather order to improve this somewhat
          * for the int path, but it would basically just drop the blends,
          * so the float path with this order really is optimal.)
          * Albeit it is tricky sometimes llvm doesn't ignore the float->int
          * casts so must avoid them until we're done with the float shuffle...
          * 3x16bit formats (the same is also true for 3x8) are pretty bad but
          * there's nothing we can do about them (we could overallocate by
          * those couple bytes and use unaligned but pot sized load).
          * Note that this is very much x86 specific. I don't know if this
          * affects other archs at all.
          */
         if (num_gather > 1) {
            /*
             * We always want some float type here (with x86)
             * due to shuffles being float ones afterwards (albeit for
             * the num_gather == 4 case int should work fine too
             * (unless there's some problems with avx but not avx2).
             */
            if (format_desc->channel[0].size == 64) {
               fetch_type = lp_type_float_vec(64, gather_type.width);
            }
            else {
               fetch_type = lp_type_int_vec(32, gather_type.width);
            }
         }
         else {
            /* type doesn't matter much */
            if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT &&
                (format_desc->channel[0].size == 32 ||
                 format_desc->channel[0].size == 64)) {
               fetch_type = lp_type_float(gather_type.width);
            }
            else {
               fetch_type = lp_type_uint(gather_type.width);
            }
         }

         /* Now finally gather the values */
         packed[i] = lp_build_gather(gallivm, gather_type.length,
                                     format_desc->block.bits,
                                     fetch_type, aligned,
                                     base_ptr, offsetr, FALSE);
         if (fp64) {
            struct lp_type conv_type = type;
            conv_type.width *= 2;
            packed[i] = LLVMBuildBitCast(builder, packed[i],
                                         lp_build_vec_type(gallivm, conv_type), "");
            packed[i] = LLVMBuildFPTrunc(builder, packed[i], bld.vec_type, "");
         }
      }

      /* shuffle the gathered values to SoA */
      if (num_gather == 2) {
         for (i = 0; i < num_gather; i++) {
            for (j = 0; j < type.length; j++) {
               unsigned idx = (j%2)*2 + (j/4)*4 + i;
               if ((j/2)%2)
                  idx += type.length;
               shuffles[j] = lp_build_const_int32(gallivm, idx);
            }
            dst[i] = LLVMBuildShuffleVector(builder, packed[0], packed[1],
                                            LLVMConstVector(shuffles, type.length), "");
         }
      }
      else if (num_gather == 4) {
         lp_build_transpose_aos(gallivm, lp_int_type(type), packed, dst);
      }
      else {
         assert(num_gather == 1);
         dst[0] = packed[0];
      }

      /*
       * And finally unpack exactly as above, except that
       * chan shift is adjusted and the right vector selected.
       */
      if (!fp64) {
         for (i = 0; i < num_gather; i++) {
            dst[i] = LLVMBuildBitCast(builder, dst[i], bld.int_vec_type, "");
         }
         for (i = 0; i < format_desc->nr_channels; i++) {
            struct util_format_channel_description chan_desc = format_desc->channel[i];
            unsigned blockbits = type.width;
            unsigned vec_nr = chan_desc.shift / type.width;
            chan_desc.shift %= type.width;

            output[i] = lp_build_extract_soa_chan(&bld,
                                                  blockbits,
                                                  FALSE,
                                                  chan_desc,
                                                  dst[vec_nr]);
         }
      }
      else {
         for (i = 0; i < format_desc->nr_channels; i++) {
            output[i] = dst[i];
         }
      }

      lp_build_format_swizzle_soa(format_desc, &bld, output, rgba_out);
      return;
   }

   if (format == PIPE_FORMAT_R11G11B10_FLOAT ||
       format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
      /*
       * similar conceptually to above but requiring special
       * AoS packed -> SoA float conversion code.
       */
      LLVMValueRef packed;
      struct lp_type fetch_type = lp_type_uint(type.width);

      assert(type.floating);
      assert(type.width == 32);

      packed = lp_build_gather(gallivm, type.length,
                               format_desc->block.bits,
                               fetch_type, aligned,
                               base_ptr, offset, FALSE);
      if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
         lp_build_r11g11b10_to_float(gallivm, packed, rgba_out);
      }
      else {
         lp_build_rgb9e5_to_float(gallivm, packed, rgba_out);
      }
      return;
   }

   if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS &&
       format_desc->block.bits == 64) {
      /*
       * special case the format is 64 bits but we only require
       * 32bit (or 8bit) from each block.
       */
      LLVMValueRef packed;
      struct lp_type fetch_type = lp_type_uint(type.width);

      if (format == PIPE_FORMAT_X32_S8X24_UINT) {
         /*
          * for stencil simply fix up offsets - could in fact change
          * base_ptr instead even outside the shader.
          */
         unsigned mask = (1 << 8) - 1;
         LLVMValueRef s_offset = lp_build_const_int_vec(gallivm, type, 4);
         offset = LLVMBuildAdd(builder, offset, s_offset, "");
         packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
                                  aligned, base_ptr, offset, FALSE);
         packed = LLVMBuildAnd(builder, packed,
                               lp_build_const_int_vec(gallivm, type, mask), "");
      }
      else {
         assert(format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
         packed = lp_build_gather(gallivm, type.length, 32, fetch_type,
                                  aligned, base_ptr, offset, TRUE);
         packed = LLVMBuildBitCast(builder, packed,
                                   lp_build_vec_type(gallivm, type), "");
      }
      /* for consistency with lp_build_unpack_rgba_soa() return sss1 or zzz1 */
      rgba_out[0] = rgba_out[1] = rgba_out[2] = packed;
      rgba_out[3] = lp_build_const_vec(gallivm, type, 1.0f);
      return;
   }

   /*
    * Try calling lp_build_fetch_rgba_aos for all pixels.
    * Should only really hit subsampled, compressed
    * (for s3tc srgb too, for rgtc the unorm ones only) by now.
    * (This is invalid for plain 8unorm formats because we're lazy with
    * the swizzle since some results would arrive swizzled, some not.)
    */

   if ((format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) &&
       (util_format_fits_8unorm(format_desc) ||
        format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) &&
       type.floating && type.width == 32 &&
       (type.length == 1 || (type.length % 4 == 0)))
   {
      struct lp_type tmp_type;
      struct lp_build_context bld;
      LLVMValueRef packed, rgba[4];
      const struct util_format_description *flinear_desc;
      const struct util_format_description *frgba8_desc;
      unsigned chan;

      lp_build_context_init(&bld, gallivm, type);

      /*
       * Make sure the conversion in aos really only does convert to rgba8
       * and not anything more (so use linear format, adjust type).
       */
      flinear_desc = util_format_description(util_format_linear(format));
      memset(&tmp_type, 0, sizeof tmp_type);
      tmp_type.width = 8;
      tmp_type.length = type.length * 4;
      tmp_type.norm = TRUE;

      packed = lp_build_fetch_rgba_aos(gallivm, flinear_desc, tmp_type,
                                       aligned, base_ptr, offset, i, j, cache);
      packed = LLVMBuildBitCast(builder, packed, bld.int_vec_type, "");

      /*
       * The values are now packed so they match ordinary (srgb) RGBA8 format,
       * hence need to use matching format for unpack.
       */
      frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_UNORM);
      if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
         assert(format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC);
         frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_SRGB);
      }
      lp_build_unpack_rgba_soa(gallivm,
                               frgba8_desc,
                               type,
                               packed, rgba);

      /*
       * We converted 4 channels. Make sure llvm can drop unneeded ones
       * (luckily the rgba order is fixed, only LA needs special case).
       */
      for (chan = 0; chan < 4; chan++) {
         enum pipe_swizzle swizzle = format_desc->swizzle[chan];
         if (chan == 3 && util_format_is_luminance_alpha(format)) {
            swizzle = PIPE_SWIZZLE_W;
         }
         rgba_out[chan] = lp_build_swizzle_soa_channel(&bld, rgba, swizzle);
      }
      return;
   }

   /*
    * Fallback to calling lp_build_fetch_rgba_aos for each pixel.
    *
    * This is not the most efficient way of fetching pixels, as we
    * miss some opportunities to do vectorization, but this is
    * convenient for formats or scenarios for which there was no
    * opportunity or incentive to optimize.
    *
    * We do NOT want to end up here, this typically is quite terrible,
    * in particular if the formats have less than 4 channels.
    *
    * Right now, this should only be hit for:
    * - RGTC snorm formats
    *   (those miss fast fetch functions hence they are terrible anyway)
    */

   {
      unsigned k;
      struct lp_type tmp_type;
      LLVMValueRef aos_fetch[LP_MAX_VECTOR_WIDTH / 32];

      if (gallivm_debug & GALLIVM_DEBUG_PERF) {
         debug_printf("%s: AoS fetch fallback for %s\n",
                      __FUNCTION__, format_desc->short_name);
      }

      tmp_type = type;
      tmp_type.length = 4;

      /*
       * Note that vector transpose can be worse compared to insert/extract
       * for aos->soa conversion (for formats with 1 or 2 channels). However,
       * we should try to avoid getting here for just about all formats, so
       * don't bother.
       */

      /* loop over number of pixels */
      for (k = 0; k < type.length; ++k) {
         LLVMValueRef index = lp_build_const_int32(gallivm, k);
         LLVMValueRef offset_elem;
         LLVMValueRef i_elem, j_elem;

         offset_elem = LLVMBuildExtractElement(builder, offset,
                                               index, "");

         i_elem = LLVMBuildExtractElement(builder, i, index, "");
         j_elem = LLVMBuildExtractElement(builder, j, index, "");

         /* Get a single float[4]={R,G,B,A} pixel */
         aos_fetch[k] = lp_build_fetch_rgba_aos(gallivm, format_desc,
                                                tmp_type, aligned,
                                                base_ptr, offset_elem,
                                                i_elem, j_elem, cache);
      }
      convert_to_soa(gallivm, aos_fetch, rgba_out, type);
   }
}
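/*
 * Layout illustration for the per-pixel fallback above (a hypothetical
 * scalar sketch, not gallivm code): each lp_build_fetch_rgba_aos call
 * yields one RGBA pixel (AoS), and the SoA result collects the same
 * channel across all pixels. For a 4-pixel vector:
 *
 *    AoS:  {R0 G0 B0 A0} {R1 G1 B1 A1} {R2 G2 B2 A2} {R3 G3 B3 A3}
 *    SoA:  {R0 R1 R2 R3} {G0 G1 G2 G3} {B0 B1 B2 B3} {A0 A1 A2 A3}
 */
static void
aos_to_soa_ref(const float aos[4][4], float soa[4][4])
{
   unsigned pix, chan;
   for (pix = 0; pix < 4; pix++) {
      for (chan = 0; chan < 4; chan++) {
         soa[chan][pix] = aos[pix][chan];
      }
   }
}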
/**
 * Expand the relevant bits of mask_input to a 4-dword mask for the
 * four pixels in a 2x2 quad.  This will set the four elements of the
 * quad mask vector to 0 or ~0.
 *
 * \param quad  which quad of the quad group to test, in [0,3]
 * \param mask_input  bitwise mask for the whole 4x4 stamp
 */
static LLVMValueRef
generate_quad_mask(LLVMBuilderRef builder,
                   struct lp_type fs_type,
                   unsigned quad,
                   LLVMValueRef mask_input) /* int32 */
{
   struct lp_type mask_type;
   LLVMTypeRef i32t = LLVMInt32Type();
   LLVMValueRef bits[4];
   LLVMValueRef mask;
   int shift;

   /*
    * XXX: We'll need a different path for 16 x u8
    */
   assert(fs_type.width == 32);
   assert(fs_type.length == 4);
   mask_type = lp_int_type(fs_type);

   /*
    * Shift the quad's bits down; the quads start at bits 0, 2, 8 and 10
    * of the row-major 4x4 stamp mask (see the worked example after this
    * function).
    */
   switch (quad) {
   case 0:
      shift = 0;
      break;
   case 1:
      shift = 2;
      break;
   case 2:
      shift = 8;
      break;
   case 3:
      shift = 10;
      break;
   default:
      assert(0);
      shift = 0;
   }

   mask_input = LLVMBuildLShr(builder,
                              mask_input,
                              LLVMConstInt(i32t, shift, 0),
                              "");

   /*
    * mask = { mask_input & (1 << i), for i in [0,3] }
    */
   mask = lp_build_broadcast(builder,
                             lp_build_vec_type(mask_type),
                             mask_input);

   bits[0] = LLVMConstInt(i32t, 1 << 0, 0);
   bits[1] = LLVMConstInt(i32t, 1 << 1, 0);
   bits[2] = LLVMConstInt(i32t, 1 << 4, 0);
   bits[3] = LLVMConstInt(i32t, 1 << 5, 0);

   mask = LLVMBuildAnd(builder, mask, LLVMConstVector(bits, 4), "");

   /*
    * mask = mask != 0 ? ~0 : 0
    */
   mask = lp_build_compare(builder,
                           mask_type, PIPE_FUNC_NOTEQUAL,
                           mask,
                           lp_build_const_int_vec(mask_type, 0));

   return mask;
}
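/*
 * Worked example of the bit selection above (a scalar sketch, not llvmpipe
 * code; the helper name is hypothetical). The 4x4 stamp mask is laid out
 * row-major, 4 bits per row, so the four 2x2 quads own these bits:
 *
 *    quad 0 (top-left):     bits  0, 1,  4,  5   -> shift 0
 *    quad 1 (top-right):    bits  2, 3,  6,  7   -> shift 2
 *    quad 2 (bottom-left):  bits  8, 9, 12, 13   -> shift 8
 *    quad 3 (bottom-right): bits 10, 11, 14, 15  -> shift 10
 *
 * After the shift, bits 0, 1, 4 and 5 of the shifted value are exactly the
 * quad's four pixels, which is why the constant vector tests 1 << 0,
 * 1 << 1, 1 << 4 and 1 << 5.
 */
static inline boolean
stamp_pixel_alive(uint32_t mask_input, unsigned quad, unsigned pixel)
{
   static const unsigned quad_shift[4] = { 0, 2, 8, 10 };
   static const unsigned pixel_bit[4]  = { 0, 1, 4, 5 };
   return ((mask_input >> quad_shift[quad]) >> pixel_bit[pixel]) & 1;
}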