/**
 * Convert a vector of rgba8 values into 32bit wide SoA vectors.
 *
 * \param dst_type  The desired return type. For pure integer formats
 *                  this should be a 32bit wide int or uint vector type,
 *                  otherwise a float vector type.
 *
 * \param packed    The rgba8 values to pack.
 *
 * \param rgba      The 4 SoA return vectors.
 */
void
lp_build_rgba8_to_fi32_soa(struct gallivm_state *gallivm,
                           struct lp_type dst_type,
                           LLVMValueRef packed,
                           LLVMValueRef *rgba)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef mask = lp_build_const_int_vec(gallivm, dst_type, 0xff);
   unsigned chan;

   /* XXX technically shouldn't use that for uint dst_type */
   packed = LLVMBuildBitCast(builder, packed,
                             lp_build_int_vec_type(gallivm, dst_type), "");

   /* Decode the input vector components */
   for (chan = 0; chan < 4; ++chan) {
      unsigned start = chan*8;
      unsigned stop = start + 8;
      LLVMValueRef input;

      input = packed;

      if (start)
         input = LLVMBuildLShr(builder, input,
                               lp_build_const_int_vec(gallivm, dst_type, start), "");

      if (stop < 32)
         input = LLVMBuildAnd(builder, input, mask, "");

      if (dst_type.floating)
         input = lp_build_unsigned_norm_to_float(gallivm, 8, dst_type, input);

      rgba[chan] = input;
   }
}
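/*
 * Illustrative scalar sketch (not part of the library) of the per-channel
 * decode above: each byte of a packed little-endian rgba8 value is shifted
 * down and masked, and the float path then maps [0,255] to [0,1].
 * Assumes <stdint.h>.
 */
static inline float
rgba8_channel_to_float(uint32_t packed, unsigned chan)
{
   uint32_t bits = (packed >> (chan * 8)) & 0xff;   /* shift + mask decode */
   return (float)bits / 255.0f;                     /* unsigned norm -> float */
}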
/**
 * Do the one or two-sided stencil test op/update.
 */
static LLVMValueRef
lp_build_stencil_op(struct lp_build_context *bld,
                    const struct pipe_stencil_state stencil[2],
                    enum stencil_op op,
                    LLVMValueRef stencilRefs[2],
                    LLVMValueRef stencilVals,
                    LLVMValueRef mask,
                    LLVMValueRef front_facing)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef res;

   assert(stencil[0].enabled);

   /* do front face op */
   res = lp_build_stencil_op_single(bld, &stencil[0], op,
                                    stencilRefs[0], stencilVals);

   if (stencil[1].enabled && front_facing != NULL) {
      /* do back face op */
      LLVMValueRef back_res;

      back_res = lp_build_stencil_op_single(bld, &stencil[1], op,
                                            stencilRefs[1], stencilVals);

      res = lp_build_select(bld, front_facing, res, back_res);
   }

   if (stencil[0].writemask != 0xff ||
       (stencil[1].enabled && front_facing != NULL &&
        stencil[1].writemask != 0xff)) {
      /* mask &= stencil[0].writemask */
      LLVMValueRef writemask = lp_build_const_int_vec(bld->gallivm, bld->type,
                                                      stencil[0].writemask);

      if (stencil[1].enabled &&
          stencil[1].writemask != stencil[0].writemask &&
          front_facing != NULL) {
         LLVMValueRef back_writemask =
            lp_build_const_int_vec(bld->gallivm, bld->type,
                                   stencil[1].writemask);
         writemask = lp_build_select(bld, front_facing,
                                     writemask, back_writemask);
      }

      mask = LLVMBuildAnd(builder, mask, writemask, "");
      /* res = (res & mask) | (stencilVals & ~mask) */
      res = lp_build_select_bitwise(bld, mask, res, stencilVals);
   }
   else {
      /* res = mask ? res : stencilVals */
      res = lp_build_select(bld, mask, res, stencilVals);
   }

   return res;
}
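/*
 * Illustrative scalar sketch (not part of the library) of the bitwise-select
 * write-mask update above: keep 'res' where the mask is set and the old
 * stencil value elsewhere. Assumes <stdint.h>.
 */
static inline uint32_t
select_bitwise(uint32_t mask, uint32_t res, uint32_t old)
{
   return (res & mask) | (old & ~mask);   /* res where mask, old elsewhere */
}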
/**
 * Shift right with immediate.
 */
LLVMValueRef
lp_build_shr_imm(struct lp_build_context *bld, LLVMValueRef a, unsigned imm)
{
   LLVMValueRef b = lp_build_const_int_vec(bld->gallivm, bld->type, imm);
   assert(imm <= bld->type.width);
   return lp_build_shr(bld, a, b);
}
/**
 * Compute the partial offset of a pixel block along an arbitrary axis.
 *
 * @param coord   coordinate in pixels
 * @param stride  number of bytes between rows of successive pixel blocks
 * @param block_length  number of pixels in a pixel block along the
 *                      coordinate axis
 * @param out_offset    resulting relative offset of the pixel block in bytes
 * @param out_subcoord  resulting sub-block pixel coordinate
 */
void
lp_build_sample_partial_offset(struct lp_build_context *bld,
                               unsigned block_length,
                               LLVMValueRef coord,
                               LLVMValueRef stride,
                               LLVMValueRef *out_offset,
                               LLVMValueRef *out_subcoord)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef offset;
   LLVMValueRef subcoord;

   if (block_length == 1) {
      subcoord = bld->zero;
   }
   else {
      /*
       * Pixel blocks have power of two dimensions. LLVM should convert the
       * rem/div to bit arithmetic.
       * TODO: Verify this.
       * It does indeed BUT it does transform it to scalar (and back) when
       * doing so (using roughly extract, shift/and, mov, unpack) (llvm 2.7).
       * The generated code looks seriously unfunny and is quite expensive.
       */
#if 0
      LLVMValueRef block_width = lp_build_const_int_vec(bld->type, block_length);
      subcoord = LLVMBuildURem(builder, coord, block_width, "");
      coord    = LLVMBuildUDiv(builder, coord, block_width, "");
#else
      unsigned logbase2 = util_logbase2(block_length);
      LLVMValueRef block_shift = lp_build_const_int_vec(bld->gallivm, bld->type,
                                                        logbase2);
      LLVMValueRef block_mask = lp_build_const_int_vec(bld->gallivm, bld->type,
                                                       block_length - 1);
      subcoord = LLVMBuildAnd(builder, coord, block_mask, "");
      coord = LLVMBuildLShr(builder, coord, block_shift, "");
#endif
   }

   offset = lp_build_mul(bld, coord, stride);

   assert(out_offset);
   assert(out_subcoord);

   *out_offset = offset;
   *out_subcoord = subcoord;
}
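/*
 * Illustrative scalar sketch (not part of the library) of the identity used
 * above: for a power-of-two block_length, coord % block_length equals
 * coord & (block_length - 1), and coord / block_length equals
 * coord >> log2(block_length). __builtin_ctz (GCC/Clang) stands in for
 * util_logbase2 here. Assumes <stdint.h> and <assert.h>.
 */
static inline void
split_block_coord(uint32_t coord, uint32_t block_length,
                  uint32_t *block, uint32_t *subcoord)
{
   assert((block_length & (block_length - 1)) == 0);  /* power of two */
   *subcoord = coord & (block_length - 1);            /* coord % block_length */
   *block = coord >> __builtin_ctz(block_length);     /* coord / block_length */
}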
/**
 * Extract Y, U, V channels from packed YUYV.
 * @param packed  is a <n x i32> vector with the packed YUYV blocks
 * @param i  is a <n x i32> vector with the x pixel coordinate (0 or 1)
 */
static void
yuyv_to_yuv_soa(struct gallivm_state *gallivm,
                unsigned n,
                LLVMValueRef packed,
                LLVMValueRef i,
                LLVMValueRef *y,
                LLVMValueRef *u,
                LLVMValueRef *v)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type type;
   LLVMValueRef mask;

   memset(&type, 0, sizeof type);
   type.width = 32;
   type.length = n;

   assert(lp_check_value(type, packed));
   assert(lp_check_value(type, i));

   /*
    * y = (yuyv >> 16*i) & 0xff
    * u = (yuyv >> 8   ) & 0xff
    * v = (yuyv >> 24  ) & 0xff
    */

#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
   /*
    * Avoid shift with per-element count.
    * No support on x86, gets translated to roughly 5 instructions
    * per element. Didn't measure performance but cuts shader size
    * by quite a bit (less difference if cpu has no sse4.1 support).
    */
   if (util_cpu_caps.has_sse2 && n == 4) {
      LLVMValueRef sel, tmp;
      struct lp_build_context bld32;

      lp_build_context_init(&bld32, gallivm, type);

      tmp = LLVMBuildLShr(builder, packed,
                          lp_build_const_int_vec(gallivm, type, 16), "");
      sel = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL, i,
                             lp_build_const_int_vec(gallivm, type, 0));
      *y = lp_build_select(&bld32, sel, packed, tmp);
   } else
#endif
   {
      LLVMValueRef shift;
      shift = LLVMBuildMul(builder, i,
                           lp_build_const_int_vec(gallivm, type, 16), "");
      *y = LLVMBuildLShr(builder, packed, shift, "");
   }

   *u = LLVMBuildLShr(builder, packed,
                      lp_build_const_int_vec(gallivm, type, 8), "");
   *v = LLVMBuildLShr(builder, packed,
                      lp_build_const_int_vec(gallivm, type, 24), "");

   mask = lp_build_const_int_vec(gallivm, type, 0xff);

   *y = LLVMBuildAnd(builder, *y, mask, "y");
   *u = LLVMBuildAnd(builder, *u, mask, "u");
   *v = LLVMBuildAnd(builder, *v, mask, "v");
}
static LLVMValueRef
rgb_to_rgba_aos(struct gallivm_state *gallivm,
                unsigned n,
                LLVMValueRef r, LLVMValueRef g, LLVMValueRef b)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type type;
   LLVMValueRef a;
   LLVMValueRef rgba;

   memset(&type, 0, sizeof type);
   type.sign = TRUE;
   type.width = 32;
   type.length = n;

   assert(lp_check_value(type, r));
   assert(lp_check_value(type, g));
   assert(lp_check_value(type, b));

   /*
    * Make a 4 x unorm8 vector
    */

   /* r already occupies the low byte */
   g = LLVMBuildShl(builder, g, lp_build_const_int_vec(gallivm, type, 8), "");
   b = LLVMBuildShl(builder, b, lp_build_const_int_vec(gallivm, type, 16), "");
   a = lp_build_const_int_vec(gallivm, type, 0xff000000);

   rgba = r;
   rgba = LLVMBuildOr(builder, rgba, g, "");
   rgba = LLVMBuildOr(builder, rgba, b, "");
   rgba = LLVMBuildOr(builder, rgba, a, "");

   rgba = LLVMBuildBitCast(builder, rgba,
                           LLVMVectorType(LLVMInt8TypeInContext(gallivm->context), 4*n),
                           "");

   return rgba;
}
/**
 * Extract Y, U, V channels from packed UYVY.
 * @param packed  is a <n x i32> vector with the packed UYVY blocks
 * @param i  is a <n x i32> vector with the x pixel coordinate (0 or 1)
 */
static void
uyvy_to_yuv_soa(LLVMBuilderRef builder,
                unsigned n,
                LLVMValueRef packed,
                LLVMValueRef i,
                LLVMValueRef *y,
                LLVMValueRef *u,
                LLVMValueRef *v)
{
   struct lp_type type;
   LLVMValueRef shift, mask;

   memset(&type, 0, sizeof type);
   type.width = 32;
   type.length = n;

   assert(lp_check_value(type, packed));
   assert(lp_check_value(type, i));

   /*
    * y = (uyvy >> (16*i + 8)) & 0xff
    * u = (uyvy              ) & 0xff
    * v = (uyvy >> 16        ) & 0xff
    */

   shift = LLVMBuildMul(builder, i, lp_build_const_int_vec(type, 16), "");
   shift = LLVMBuildAdd(builder, shift, lp_build_const_int_vec(type, 8), "");
   *y = LLVMBuildLShr(builder, packed, shift, "");
   *u = packed;
   *v = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(type, 16), "");

   mask = lp_build_const_int_vec(type, 0xff);

   *y = LLVMBuildAnd(builder, *y, mask, "y");
   *u = LLVMBuildAnd(builder, *u, mask, "u");
   *v = LLVMBuildAnd(builder, *v, mask, "v");
}
LLVMValueRef
lp_build_one(struct lp_type type)
{
   LLVMTypeRef elem_type;
   LLVMValueRef elems[LP_MAX_VECTOR_LENGTH];
   unsigned i;

   assert(type.length <= LP_MAX_VECTOR_LENGTH);

   elem_type = lp_build_elem_type(type);

   if(type.floating)
      elems[0] = LLVMConstReal(elem_type, 1.0);
   else if(type.fixed)
      elems[0] = LLVMConstInt(elem_type, 1LL << (type.width/2), 0);
   else if(!type.norm)
      elems[0] = LLVMConstInt(elem_type, 1, 0);
   else if(type.sign)
      elems[0] = LLVMConstInt(elem_type, (1LL << (type.width - 1)) - 1, 0);
   else {
      /* special case -- 1.0 for normalized types is more easily attained if
       * we start with a vector consisting of all bits set */
      LLVMTypeRef vec_type = LLVMVectorType(elem_type, type.length);
      LLVMValueRef vec = LLVMConstAllOnes(vec_type);

#if 0
      if(type.sign)
         /* TODO: Unfortunately this caused "Tried to create a shift operation
          * on a non-integer type!" */
         vec = LLVMConstLShr(vec, lp_build_const_int_vec(type, 1));
#endif

      return vec;
   }

   for(i = 1; i < type.length; ++i)
      elems[i] = elems[0];

   if (type.length == 1)
      return elems[0];
   else
      return LLVMConstVector(elems, type.length);
}
/**
 * Convert linear float soa values to packed srgb AoS values.
 * This only handles packed formats which are 4x8bit in size
 * (rgba and rgbx plus swizzles), and 16bit 565-style formats
 * with no alpha. (In the latter case the return values won't be
 * fully packed, it will look like r5g6b5x16r5g6b5x16...)
 *
 * @param src  float SoA (vector) values to convert.
 */
LLVMValueRef
lp_build_float_to_srgb_packed(struct gallivm_state *gallivm,
                              const struct util_format_description *dst_fmt,
                              struct lp_type src_type,
                              LLVMValueRef *src)
{
   LLVMBuilderRef builder = gallivm->builder;
   unsigned chan;
   struct lp_build_context f32_bld;
   struct lp_type int32_type = lp_int_type(src_type);
   LLVMValueRef tmpsrgb[4], alpha, dst;

   lp_build_context_init(&f32_bld, gallivm, src_type);

   /* rgb is subject to linear->srgb conversion, alpha is not */
   for (chan = 0; chan < 3; chan++) {
      unsigned chan_bits = dst_fmt->channel[dst_fmt->swizzle[chan]].size;
      tmpsrgb[chan] = lp_build_linear_to_srgb(gallivm, src_type,
                                              chan_bits, src[chan]);
   }

   /*
    * can't use lp_build_conv since we want to keep values as 32bit
    * here so we can interleave with rgb to go from SoA->AoS.
    */
   alpha = lp_build_clamp_zero_one_nanzero(&f32_bld, src[3]);
   alpha = lp_build_mul(&f32_bld, alpha,
                        lp_build_const_vec(gallivm, src_type, 255.0f));
   tmpsrgb[3] = lp_build_iround(&f32_bld, alpha);

   dst = lp_build_zero(gallivm, int32_type);
   for (chan = 0; chan < dst_fmt->nr_channels; chan++) {
      if (dst_fmt->swizzle[chan] <= PIPE_SWIZZLE_W) {
         unsigned ls;
         LLVMValueRef shifted, shift_val;
         ls = dst_fmt->channel[dst_fmt->swizzle[chan]].shift;
         shift_val = lp_build_const_int_vec(gallivm, int32_type, ls);
         shifted = LLVMBuildShl(builder, tmpsrgb[chan], shift_val, "");
         dst = LLVMBuildOr(builder, dst, shifted, "");
      }
   }
   return dst;
}
/**
 * Double the bit width.
 *
 * This only changes the number of bits with which the values are
 * represented, not the values themselves.
 */
void
lp_build_unpack2(struct gallivm_state *gallivm,
                 struct lp_type src_type,
                 struct lp_type dst_type,
                 LLVMValueRef src,
                 LLVMValueRef *dst_lo,
                 LLVMValueRef *dst_hi)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMValueRef msb;
   LLVMTypeRef dst_vec_type;

   assert(!src_type.floating);
   assert(!dst_type.floating);
   assert(dst_type.width == src_type.width * 2);
   assert(dst_type.length * 2 == src_type.length);

   if(dst_type.sign && src_type.sign) {
      /* Replicate the sign bit in the most significant bits */
      msb = LLVMBuildAShr(builder, src,
                          lp_build_const_int_vec(gallivm, src_type,
                                                 src_type.width - 1), "");
   }
   else
      /* Most significant bits always zero */
      msb = lp_build_zero(gallivm, src_type);

   /* Interleave bits */
#ifdef PIPE_ARCH_LITTLE_ENDIAN
   *dst_lo = lp_build_interleave2(gallivm, src_type, src, msb, 0);
   *dst_hi = lp_build_interleave2(gallivm, src_type, src, msb, 1);
#else
   *dst_lo = lp_build_interleave2(gallivm, src_type, msb, src, 0);
   *dst_hi = lp_build_interleave2(gallivm, src_type, msb, src, 1);
#endif

   /* Cast the result into the new type (twice as wide) */
   dst_vec_type = lp_build_vec_type(gallivm, dst_type);
   *dst_lo = LLVMBuildBitCast(builder, *dst_lo, dst_vec_type, "");
   *dst_hi = LLVMBuildBitCast(builder, *dst_hi, dst_vec_type, "");
}
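/*
 * Illustrative scalar sketch (not part of the library): widening one i16
 * lane to i32 by pairing it with an "msb" word, as the interleave above does
 * for whole vectors. On little endian the low half holds the source lane and
 * the high half holds either zeros or the replicated sign bit. The helper
 * returns the widened bit pattern. Assumes <stdint.h>.
 */
static inline uint32_t
widen_lane_i16(int16_t x, int is_signed)
{
   uint16_t msb = (is_signed && x < 0) ? 0xffff : 0;  /* replicated sign bit */
   /* low half: source lane; high half: sign/zero word (little-endian view) */
   return ((uint32_t)msb << 16) | (uint16_t)x;
}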
/**
 * Converts int16 half-float to float32.
 * Note this can be performed in 1 instruction if vcvtph2ps exists (the F16C
 * extension) [llvm.x86.vcvtph2ps / _mm_cvtph_ps].
 *
 * @param src_type  <vector> type of int16
 * @param src       value to convert
 *
 * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quick/
 */
LLVMValueRef
lp_build_half_to_float(struct gallivm_state *gallivm,
                       struct lp_type src_type,
                       LLVMValueRef src)
{
   struct lp_type f32_type = lp_type_float_vec(32, 32 * src_type.length);
   struct lp_type i32_type = lp_type_int_vec(32, 32 * src_type.length);

   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type);
   LLVMTypeRef float_vec_type = lp_build_vec_type(gallivm, f32_type);

   /* Constants */
   LLVMValueRef i32_13          = lp_build_const_int_vec(gallivm, i32_type, 13);
   LLVMValueRef i32_16          = lp_build_const_int_vec(gallivm, i32_type, 16);
   LLVMValueRef i32_mask_nosign = lp_build_const_int_vec(gallivm, i32_type, 0x7fff);
   LLVMValueRef i32_was_infnan  = lp_build_const_int_vec(gallivm, i32_type, 0x7bff);
   LLVMValueRef i32_exp_infnan  = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23);
   LLVMValueRef f32_magic       = LLVMBuildBitCast(builder,
                                                   lp_build_const_int_vec(gallivm, i32_type,
                                                                          (254 - 15) << 23),
                                                   float_vec_type, "");

   /* Convert int16 vector to int32 vector by zero ext */
   LLVMValueRef h = LLVMBuildZExt(builder, src, int_vec_type, "");

   /* Exponent / mantissa bits */
   LLVMValueRef expmant = LLVMBuildAnd(builder, i32_mask_nosign, h, "");
   LLVMValueRef shifted = LLVMBuildBitCast(builder,
                                           LLVMBuildShl(builder, expmant, i32_13, ""),
                                           float_vec_type, "");

   /* Exponent adjust */
   LLVMValueRef scaled = LLVMBuildBitCast(builder,
                                          LLVMBuildFMul(builder, shifted, f32_magic, ""),
                                          int_vec_type, "");

   /* Make sure Inf/NaN survive */
   LLVMValueRef b_wasinfnan = lp_build_compare(gallivm, i32_type,
                                               PIPE_FUNC_GREATER,
                                               expmant, i32_was_infnan);
   LLVMValueRef infnanexp = LLVMBuildAnd(builder, b_wasinfnan, i32_exp_infnan, "");

   /* Sign bit */
   LLVMValueRef justsign = LLVMBuildXor(builder, h, expmant, "");
   LLVMValueRef sign = LLVMBuildShl(builder, justsign, i32_16, "");

   /* Combine result */
   LLVMValueRef sign_inf = LLVMBuildOr(builder, sign, infnanexp, "");
   LLVMValueRef final = LLVMBuildOr(builder, scaled, sign_inf, "");

   /* Cast from int32 vector to float32 vector */
   return LLVMBuildBitCast(builder, final, float_vec_type, "");
}
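/*
 * Illustrative scalar reference (not part of the library) of the same bit
 * trick, after fgiesen's "half to float done quick": shift exponent/mantissa
 * into float32 position, multiply by 2^(127-15) to rebias the exponent, then
 * patch Inf/NaN exponents and OR the sign back in. Assumes <stdint.h> and
 * <string.h>.
 */
static inline float
half_to_float_scalar(uint16_t h)
{
   uint32_t expmant = h & 0x7fff;            /* exponent + mantissa bits */
   uint32_t justsign = h ^ expmant;          /* isolated sign bit */
   uint32_t bits = expmant << 13;            /* align to float32 layout */
   uint32_t magic_bits = (254 - 15) << 23;   /* 2^(127-15) as float bits */
   float f, magic;

   memcpy(&magic, &magic_bits, sizeof magic);
   memcpy(&f, &bits, sizeof f);
   f *= magic;                               /* exponent rebias */
   memcpy(&bits, &f, sizeof bits);
   if (expmant > 0x7bff)                     /* source was Inf/NaN */
      bits |= 0xffu << 23;
   bits |= justsign << 16;                   /* restore sign */
   memcpy(&f, &bits, sizeof f);
   return f;
}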
/**
 * Non-interleaved pack and saturate.
 *
 * Same as lp_build_pack2 but will saturate values so that they fit into the
 * destination type.
 */
LLVMValueRef
lp_build_packs2(struct gallivm_state *gallivm,
                struct lp_type src_type,
                struct lp_type dst_type,
                LLVMValueRef lo,
                LLVMValueRef hi)
{
   boolean clamp;

   assert(!src_type.floating);
   assert(!dst_type.floating);
   assert(src_type.sign == dst_type.sign);
   assert(src_type.width == dst_type.width * 2);
   assert(src_type.length * 2 == dst_type.length);

   clamp = TRUE;

   /* All X86 SSE non-interleaved pack instructions take signed inputs and
    * saturate them, so no need to clamp for those cases. */
   if(util_cpu_caps.has_sse2 &&
      src_type.width * src_type.length >= 128 &&
      src_type.sign &&
      (src_type.width == 32 || src_type.width == 16))
      clamp = FALSE;

   if(clamp) {
      struct lp_build_context bld;
      unsigned dst_bits = dst_type.sign ? dst_type.width - 1 : dst_type.width;
      LLVMValueRef dst_max = lp_build_const_int_vec(gallivm, src_type,
                                ((unsigned long long)1 << dst_bits) - 1);
      lp_build_context_init(&bld, gallivm, src_type);
      lo = lp_build_min(&bld, lo, dst_max);
      hi = lp_build_min(&bld, hi, dst_max);
      /* FIXME: What about the lower bound? */
   }

   return lp_build_pack2(gallivm, src_type, dst_type, lo, hi);
}
/**
 * Perform the occlusion test and increase the counter.
 * Test the depth mask. Add the number of channels which have a non-zero mask
 * into the occlusion counter. E.g. if maskvalue is {~0, ~0, ~0, ~0} the
 * counter is incremented by 4.
 *
 * \param type  holds element type of the mask vector.
 * \param maskvalue  is the depth test mask.
 * \param counter  is a pointer of the uint32 counter.
 */
static void
lp_build_occlusion_count(LLVMBuilderRef builder,
                         struct lp_type type,
                         LLVMValueRef maskvalue,
                         LLVMValueRef counter)
{
   LLVMValueRef countmask = lp_build_const_int_vec(type, 1);
   LLVMValueRef countv = LLVMBuildAnd(builder, maskvalue, countmask, "countv");
   LLVMTypeRef i8v16 = LLVMVectorType(LLVMInt8Type(), 16);
   LLVMValueRef counti = LLVMBuildBitCast(builder, countv, i8v16, "counti");
   LLVMValueRef maskarray[4] = {
      LLVMConstInt(LLVMInt32Type(), 0, 0),
      LLVMConstInt(LLVMInt32Type(), 4, 0),
      LLVMConstInt(LLVMInt32Type(), 8, 0),
      LLVMConstInt(LLVMInt32Type(), 12, 0),
   };
   LLVMValueRef shufflemask = LLVMConstVector(maskarray, 4);
   LLVMValueRef shufflev = LLVMBuildShuffleVector(builder, counti,
                                                  LLVMGetUndef(i8v16),
                                                  shufflemask, "shufflev");
   LLVMValueRef shuffle = LLVMBuildBitCast(builder, shufflev,
                                           LLVMInt32Type(), "shuffle");
   LLVMValueRef count = lp_build_intrinsic_unary(builder, "llvm.ctpop.i32",
                                                 LLVMInt32Type(), shuffle);
   LLVMValueRef orig = LLVMBuildLoad(builder, counter, "orig");
   LLVMValueRef incr = LLVMBuildAdd(builder, orig, count, "incr");
   LLVMBuildStore(builder, incr, counter);
}
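/*
 * Illustrative scalar sketch (not part of the library) of the counting trick
 * above: each 32bit lane of the mask is reduced to 0 or 1, the low byte of
 * every lane is packed into a single 32bit word, and a popcount of that word
 * yields the number of passing lanes. __builtin_popcount (GCC/Clang) stands
 * in for llvm.ctpop.i32. Assumes <stdint.h>.
 */
static inline unsigned
count_passing_lanes(const uint32_t mask[4])
{
   uint32_t packed = 0;
   unsigned chan;
   for (chan = 0; chan < 4; chan++)
      packed |= (mask[chan] & 1) << (chan * 8);   /* byte-gather the 0/1 bits */
   return __builtin_popcount(packed);
}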
/**
 * Do the stencil test comparison (compare FB stencil values against ref value).
 * This will be used twice when generating two-sided stencil code.
 * \param stencil  the front/back stencil state
 * \param stencilRef  the stencil reference value, replicated as a vector
 * \param stencilVals  vector of stencil values from framebuffer
 * \return  vector mask of pass/fail values (~0 or 0)
 */
static LLVMValueRef
lp_build_stencil_test_single(struct lp_build_context *bld,
                             const struct pipe_stencil_state *stencil,
                             LLVMValueRef stencilRef,
                             LLVMValueRef stencilVals)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const unsigned stencilMax = 255; /* XXX fix */
   struct lp_type type = bld->type;
   LLVMValueRef res;

   /*
    * SSE2 has intrinsics for signed comparisons, but not unsigned ones. Values
    * are between 0..255 so ensure we generate the fastest comparisons for
    * wider elements.
    */
   if (type.width <= 8) {
      assert(!type.sign);
   } else {
      assert(type.sign);
   }

   assert(stencil->enabled);

   if (stencil->valuemask != stencilMax) {
      /* compute stencilRef = stencilRef & valuemask */
      LLVMValueRef valuemask = lp_build_const_int_vec(bld->gallivm, type,
                                                      stencil->valuemask);
      stencilRef = LLVMBuildAnd(builder, stencilRef, valuemask, "");
      /* compute stencilVals = stencilVals & valuemask */
      stencilVals = LLVMBuildAnd(builder, stencilVals, valuemask, "");
   }

   res = lp_build_cmp(bld, stencil->func, stencilRef, stencilVals);

   return res;
}
/**
 * Generic type conversion.
 *
 * TODO: Take a precision argument, or even better, add a new precision member
 * to the lp_type union.
 */
void
lp_build_conv(struct gallivm_state *gallivm,
              struct lp_type src_type,
              struct lp_type dst_type,
              const LLVMValueRef *src, unsigned num_srcs,
              LLVMValueRef *dst, unsigned num_dsts)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type tmp_type;
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
   unsigned num_tmps;
   unsigned i;

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
   assert(num_dsts <= LP_MAX_VECTOR_LENGTH);

   tmp_type = src_type;
   for(i = 0; i < num_srcs; ++i) {
      assert(lp_check_value(src_type, src[i]));
      tmp[i] = src[i];
   }
   num_tmps = num_srcs;

   /* Special case 4x4f --> 1x16ub */
   if (src_type.floating == 1 &&
       src_type.fixed    == 0 &&
       src_type.sign     == 1 &&
       src_type.norm     == 0 &&
       src_type.width    == 32 &&
       src_type.length   == 4 &&

       dst_type.floating == 0 &&
       dst_type.fixed    == 0 &&
       dst_type.sign     == 0 &&
       dst_type.norm     == 1 &&
       dst_type.width    == 8 &&
       dst_type.length   == 16 &&

       4 * num_dsts      == num_srcs &&

       util_cpu_caps.has_sse2)
   {
      struct lp_build_context bld;
      struct lp_type int16_type = dst_type;
      struct lp_type int32_type = dst_type;
      LLVMValueRef const_255f;
      unsigned i, j;

      lp_build_context_init(&bld, gallivm, src_type);

      int16_type.width *= 2;
      int16_type.length /= 2;
      int16_type.sign = 1;

      int32_type.width *= 4;
      int32_type.length /= 4;
      int32_type.sign = 1;

      const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);

      for (i = 0; i < num_dsts; ++i, src += 4) {
         LLVMValueRef lo, hi;

         for (j = 0; j < 4; ++j) {
            tmp[j] = LLVMBuildFMul(builder, src[j], const_255f, "");
            tmp[j] = lp_build_iround(&bld, tmp[j]);
         }

         /* relying on clamping behavior of sse2 intrinsics here */
         lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
         hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi);
      }

      return;
   }

   /* Special case 2x8f --> 1x16ub */
   else if (src_type.floating == 1 &&
      src_type.fixed    == 0 &&
      src_type.sign     == 1 &&
      src_type.norm     == 0 &&
      src_type.width    == 32 &&
      src_type.length   == 8 &&

      dst_type.floating == 0 &&
      dst_type.fixed    == 0 &&
      dst_type.sign     == 0 &&
      dst_type.norm     == 1 &&
      dst_type.width    == 8 &&
      dst_type.length   == 16 &&

      2 * num_dsts      == num_srcs &&

      util_cpu_caps.has_avx)
   {
      struct lp_build_context bld;
      struct lp_type int16_type = dst_type;
      struct lp_type int32_type = dst_type;
      LLVMValueRef const_255f;
      unsigned i;

      lp_build_context_init(&bld, gallivm, src_type);

      int16_type.width *= 2;
      int16_type.length /= 2;
      int16_type.sign = 1;

      int32_type.width *= 4;
      int32_type.length /= 4;
      int32_type.sign = 1;

      const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);

      for (i = 0; i < num_dsts; ++i, src += 2) {
         LLVMValueRef lo, hi, a, b;

         a = LLVMBuildFMul(builder, src[0], const_255f, "");
         b = LLVMBuildFMul(builder, src[1], const_255f, "");

         a = lp_build_iround(&bld, a);
         b = lp_build_iround(&bld, b);

         tmp[0] = lp_build_extract_range(gallivm, a, 0, 4);
         tmp[1] = lp_build_extract_range(gallivm, a, 4, 4);
         tmp[2] = lp_build_extract_range(gallivm, b, 0, 4);
         tmp[3] = lp_build_extract_range(gallivm, b, 4, 4);

         /* relying on clamping behavior of sse2 intrinsics here */
         lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
         hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi);
      }
      return;
   }

   /* Pre convert half-floats to floats */
   else if (src_type.floating && src_type.width == 16)
   {
      for(i = 0; i < num_tmps; ++i)
         tmp[i] = lp_build_half_to_float(gallivm, src_type, tmp[i]);

      tmp_type.width = 32;
   }

   /*
    * Clamp if necessary
    */

   if(memcmp(&src_type, &dst_type, sizeof src_type) != 0) {
      struct lp_build_context bld;
      double src_min = lp_const_min(src_type);
      double dst_min = lp_const_min(dst_type);
      double src_max = lp_const_max(src_type);
      double dst_max = lp_const_max(dst_type);
      LLVMValueRef thres;

      lp_build_context_init(&bld, gallivm, tmp_type);

      if(src_min < dst_min) {
         if(dst_min == 0.0)
            thres = bld.zero;
         else
            thres = lp_build_const_vec(gallivm, src_type, dst_min);
         for(i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_max(&bld, tmp[i], thres);
      }

      if(src_max > dst_max) {
         if(dst_max == 1.0)
            thres = bld.one;
         else
            thres = lp_build_const_vec(gallivm, src_type, dst_max);
         for(i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_min(&bld, tmp[i], thres);
      }
   }

   /*
    * Scale to the narrowest range
    */

   if(dst_type.floating) {
      /* Nothing to do */
   }
   else if(tmp_type.floating) {
      if(!dst_type.fixed && !dst_type.sign && dst_type.norm) {
         for(i = 0; i < num_tmps; ++i) {
            tmp[i] = lp_build_clamped_float_to_unsigned_norm(gallivm,
                                                             tmp_type,
                                                             dst_type.width,
                                                             tmp[i]);
         }
         tmp_type.floating = FALSE;
      }
      else {
         double dst_scale = lp_const_scale(dst_type);
         LLVMTypeRef tmp_vec_type;

         if (dst_scale != 1.0) {
            LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, dst_scale);
            for(i = 0; i < num_tmps; ++i)
               tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
         }

         /* Use an equally sized integer for intermediate computations */
         tmp_type.floating = FALSE;
         tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
         for(i = 0; i < num_tmps; ++i) {
#if 0
            if(dst_type.sign)
               tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
            else
               tmp[i] = LLVMBuildFPToUI(builder, tmp[i], tmp_vec_type, "");
#else
            /* FIXME: there is no SSE counterpart for LLVMBuildFPToUI */
            tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
#endif
         }
      }
   }
   else {
      unsigned src_shift = lp_const_shift(src_type);
      unsigned dst_shift = lp_const_shift(dst_type);
      unsigned src_offset = lp_const_offset(src_type);
      unsigned dst_offset = lp_const_offset(dst_type);

      /* Compensate for different offsets */
      if (dst_offset > src_offset && src_type.width > dst_type.width) {
         for (i = 0; i < num_tmps; ++i) {
            LLVMValueRef shifted;
            LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type,
                                                        src_shift - 1);
            if(src_type.sign)
               shifted = LLVMBuildAShr(builder, tmp[i], shift, "");
            else
               shifted = LLVMBuildLShr(builder, tmp[i], shift, "");

            tmp[i] = LLVMBuildSub(builder, tmp[i], shifted, "");
         }
      }

      if(src_shift > dst_shift) {
         LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type,
                                                     src_shift - dst_shift);
         for(i = 0; i < num_tmps; ++i)
            if(src_type.sign)
               tmp[i] = LLVMBuildAShr(builder, tmp[i], shift, "");
            else
               tmp[i] = LLVMBuildLShr(builder, tmp[i], shift, "");
      }
   }

   /*
    * Truncate or expand bit width
    *
    * No data conversion should happen here, although the sign bits are
    * crucial to avoid bad clamping.
    */

   {
      struct lp_type new_type;

      new_type = tmp_type;
      new_type.sign   = dst_type.sign;
      new_type.width  = dst_type.width;
      new_type.length = dst_type.length;

      lp_build_resize(gallivm, tmp_type, new_type, tmp, num_srcs, tmp, num_dsts);

      tmp_type = new_type;
      num_tmps = num_dsts;
   }

   /*
    * Scale to the widest range
    */

   if(src_type.floating) {
      /* Nothing to do */
   }
   else if(!src_type.floating && dst_type.floating) {
      if(!src_type.fixed && !src_type.sign && src_type.norm) {
         for(i = 0; i < num_tmps; ++i) {
            tmp[i] = lp_build_unsigned_norm_to_float(gallivm,
                                                     src_type.width,
                                                     dst_type,
                                                     tmp[i]);
         }
         tmp_type.floating = TRUE;
      }
      else {
         double src_scale = lp_const_scale(src_type);
         LLVMTypeRef tmp_vec_type;

         /* Use an equally sized float for intermediate computations */
         tmp_type.floating = TRUE;
         tmp_type.sign = TRUE;
         tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
         for(i = 0; i < num_tmps; ++i) {
#if 0
            if(dst_type.sign)
               tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
            else
               tmp[i] = LLVMBuildUIToFP(builder, tmp[i], tmp_vec_type, "");
#else
            /* FIXME: there is no SSE counterpart for LLVMBuildUIToFP */
            tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
#endif
         }

         if (src_scale != 1.0) {
            LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, 1.0/src_scale);
            for(i = 0; i < num_tmps; ++i)
               tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
         }
      }
   }
   else {
      unsigned src_shift = lp_const_shift(src_type);
      unsigned dst_shift = lp_const_shift(dst_type);
      unsigned src_offset = lp_const_offset(src_type);
      unsigned dst_offset = lp_const_offset(dst_type);

      if (src_shift < dst_shift) {
         LLVMValueRef pre_shift[LP_MAX_VECTOR_LENGTH];
         LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type,
                                                     dst_shift - src_shift);

         for (i = 0; i < num_tmps; ++i) {
            pre_shift[i] = tmp[i];
            tmp[i] = LLVMBuildShl(builder, tmp[i], shift, "");
         }

         /* Compensate for different offsets */
         if (dst_offset > src_offset) {
            for (i = 0; i < num_tmps; ++i) {
               tmp[i] = LLVMBuildSub(builder, tmp[i], pre_shift[i], "");
            }
         }
      }
   }

   for(i = 0; i < num_dsts; ++i) {
      dst[i] = tmp[i];
      assert(lp_check_value(dst_type, dst[i]));
   }
}
/**
 * Inverse of lp_build_clamped_float_to_unsigned_norm below.
 * Ex: src = { i32, i32, i32, i32 } with values in range [0, 2^src_width-1]
 * return {float, float, float, float} with values in range [0, 1].
 */
LLVMValueRef
lp_build_unsigned_norm_to_float(struct gallivm_state *gallivm,
                                unsigned src_width,
                                struct lp_type dst_type,
                                LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef vec_type = lp_build_vec_type(gallivm, dst_type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, dst_type);
   LLVMValueRef bias_;
   LLVMValueRef res;
   unsigned mantissa;
   unsigned n;
   unsigned long long ubound;
   unsigned long long mask;
   double scale;
   double bias;

   assert(dst_type.floating);

   mantissa = lp_mantissa(dst_type);

   if (src_width <= (mantissa + 1)) {
      /*
       * The source width fits within what can be represented in floating
       * point (i.e., mantissa + 1 bits). So do a straight conversion
       * followed by scaling. No further rounding is necessary.
       */
      scale = 1.0/(double)((1ULL << src_width) - 1);
      res = LLVMBuildSIToFP(builder, src, vec_type, "");
      res = LLVMBuildFMul(builder, res,
                          lp_build_const_vec(gallivm, dst_type, scale), "");
      return res;
   }
   else {
      /*
       * The source width exceeds what can be represented in floating
       * point. So truncate the incoming values.
       */
      n = MIN2(mantissa, src_width);

      ubound = ((unsigned long long)1 << n);
      mask = ubound - 1;
      scale = (double)ubound/mask;
      bias = (double)((unsigned long long)1 << (mantissa - n));

      res = src;

      if (src_width > mantissa) {
         int shift = src_width - mantissa;
         res = LLVMBuildLShr(builder, res,
                             lp_build_const_int_vec(gallivm, dst_type, shift), "");
      }

      bias_ = lp_build_const_vec(gallivm, dst_type, bias);

      res = LLVMBuildOr(builder,
                        res,
                        LLVMBuildBitCast(builder, bias_, int_vec_type, ""), "");

      res = LLVMBuildBitCast(builder, res, vec_type, "");

      res = LLVMBuildFSub(builder, res, bias_, "");
      res = LLVMBuildFMul(builder, res,
                          lp_build_const_vec(gallivm, dst_type, scale), "");
   }

   return res;
}
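/*
 * Illustrative scalar sketch (not part of the library) of the bias trick in
 * the wide-source path above: OR-ing the truncated integer into the mantissa
 * of a power-of-two float and subtracting that float afterwards converts
 * without any int->float instruction. Shown for src_width = 32, where
 * float32 (mantissa = 23) cannot hold the value directly. Assumes <stdint.h>
 * and <string.h>.
 */
static inline float
u32_norm_to_float(uint32_t x)
{
   const unsigned mantissa = 23;
   const float bias = 1.0f;                /* 2^(mantissa - n) with n == 23 */
   float f;
   uint32_t bits, bias_bits;

   x >>= 32 - mantissa;                    /* truncate to mantissa width */
   memcpy(&bias_bits, &bias, sizeof bias_bits);
   bits = x | bias_bits;                   /* splice integer into mantissa */
   memcpy(&f, &bits, sizeof f);
   f -= bias;                              /* f == x * 2^-23 */
   /* rescale so the maximum truncated value maps to exactly 1.0 */
   return f * (float)((double)(1u << mantissa) / ((1u << mantissa) - 1));
}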
/**
 * Special case for converting clamped IEEE-754 floats to unsigned norms.
 *
 * The mathematical voodoo below may seem excessive but it is actually
 * paramount we do it this way for several reasons. First, there is no single
 * precision FP to unsigned integer conversion Intel SSE instruction. Second,
 * even if there were, since the FP's mantissa takes only a fraction of
 * register bits the typical scale-and-cast approach would require double
 * precision for accurate results, and therefore half the throughput.
 *
 * Although the result values can be scaled to an arbitrary bit width specified
 * by dst_width, the actual result type will have the same width.
 *
 * Ex: src = { float, float, float, float }
 * return { i32, i32, i32, i32 } where each value is in [0, 2^dst_width-1].
 */
LLVMValueRef
lp_build_clamped_float_to_unsigned_norm(struct gallivm_state *gallivm,
                                        struct lp_type src_type,
                                        unsigned dst_width,
                                        LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, src_type);
   LLVMValueRef res;
   unsigned mantissa;

   assert(src_type.floating);
   assert(dst_width <= src_type.width);
   src_type.sign = FALSE;

   mantissa = lp_mantissa(src_type);

   if (dst_width <= mantissa) {
      /*
       * Apply magic coefficients that will make the desired result appear
       * in the lowest significant bits of the mantissa, with correct
       * rounding.
       *
       * This only works if the destination width fits in the mantissa.
       */

      unsigned long long ubound;
      unsigned long long mask;
      double scale;
      double bias;

      ubound = (1ULL << dst_width);
      mask = ubound - 1;
      scale = (double)mask/ubound;
      bias = (double)(1ULL << (mantissa - dst_width));

      res = LLVMBuildFMul(builder, src,
                          lp_build_const_vec(gallivm, src_type, scale), "");
      res = LLVMBuildFAdd(builder, res,
                          lp_build_const_vec(gallivm, src_type, bias), "");
      res = LLVMBuildBitCast(builder, res, int_vec_type, "");
      res = LLVMBuildAnd(builder, res,
                         lp_build_const_int_vec(gallivm, src_type, mask), "");
   }
   else if (dst_width == (mantissa + 1)) {
      /*
       * The destination width matches exactly what can be represented in
       * floating point (i.e., mantissa + 1 bits). So do a straight
       * multiplication followed by casting. No further rounding is necessary.
       */

      double scale;

      scale = (double)((1ULL << dst_width) - 1);

      res = LLVMBuildFMul(builder, src,
                          lp_build_const_vec(gallivm, src_type, scale), "");
      res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
   }
   else {
      /*
       * The destination exceeds what can be represented in the floating point.
       * So multiply by the largest power of two we can get away with, and then
       * subtract the most significant bit to rescale to normalized values.
       *
       * The largest power of two factor we can get away with is
       * (1 << (src_type.width - 1)), because we need to use a signed
       * conversion. In theory it should be (1 << (src_type.width - 2)), but
       * IEEE 754 states INT_MIN should be returned by FPToSI, which happens
       * to be the correct result for values near 1.0!
       *
       * This means we get (src_type.width - 1) correct bits for values near
       * 0.0, and (mantissa + 1) correct bits for values near 1.0. Equally or
       * more important, we also get exact results for 0.0 and 1.0.
       */

      unsigned n = MIN2(src_type.width - 1, dst_width);

      double scale = (double)(1ULL << n);
      unsigned lshift = dst_width - n;
      unsigned rshift = n;
      LLVMValueRef lshifted;
      LLVMValueRef rshifted;

      res = LLVMBuildFMul(builder, src,
                          lp_build_const_vec(gallivm, src_type, scale), "");
      res = LLVMBuildFPToSI(builder, res, int_vec_type, "");

      /*
       * Align the most significant bit to its final place.
       *
       * This will cause 1.0 to overflow to 0, but the later adjustment will
       * get it right.
       */
      if (lshift) {
         lshifted = LLVMBuildShl(builder, res,
                                 lp_build_const_int_vec(gallivm, src_type,
                                                        lshift), "");
      } else {
         lshifted = res;
      }

      /*
       * Align the most significant bit to the right.
       */
      rshifted = LLVMBuildLShr(builder, res,
                               lp_build_const_int_vec(gallivm, src_type,
                                                      rshift), "");

      /*
       * Subtract the right-shifted MSB copy from the left-shifted value,
       * thereby re-scaling from (1 << dst_width) to ((1 << dst_width) - 1).
       */
      res = LLVMBuildSub(builder, lshifted, rshifted, "");
   }

   return res;
}
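/*
 * Illustrative scalar sketch (not part of the library) of the magic
 * scale/bias path above, for dst_width = 8 and float32 (mantissa = 23):
 * after multiplying by 255/256 and adding 2^(23-8), the 8bit result sits in
 * the low mantissa bits with round-to-nearest applied by the FP adder.
 * Assumes <stdint.h> and <string.h>.
 */
static inline uint32_t
clamped_float_to_unorm8(float x)   /* x must be in [0,1] */
{
   const double ubound = 256.0;                 /* 1 << dst_width */
   const float scale = (float)(255.0 / ubound); /* mask / ubound */
   const float bias = (float)(1 << (23 - 8));   /* 2^(mantissa - dst_width) */
   float f = x * scale + bias;
   uint32_t bits;

   memcpy(&bits, &f, sizeof bits);
   return bits & 0xff;                          /* low mantissa bits */
}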
/**
 * Build code to compare two values 'a' and 'b' of 'type' using the given func.
 * \param func  one of PIPE_FUNC_x
 * The result values will be 0 for false or ~0 for true.
 */
LLVMValueRef
lp_build_compare(struct gallivm_state *gallivm,
                 const struct lp_type type,
                 unsigned func,
                 LLVMValueRef a,
                 LLVMValueRef b)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, type);
   LLVMValueRef zeros = LLVMConstNull(int_vec_type);
   LLVMValueRef ones = LLVMConstAllOnes(int_vec_type);
   LLVMValueRef cond;
   LLVMValueRef res;

   assert(func >= PIPE_FUNC_NEVER);
   assert(func <= PIPE_FUNC_ALWAYS);
   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(func == PIPE_FUNC_NEVER)
      return zeros;
   if(func == PIPE_FUNC_ALWAYS)
      return ones;

#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
   /*
    * There are no unsigned integer comparison instructions in SSE.
    */
   if (!type.floating && !type.sign &&
       type.width * type.length == 128 &&
       util_cpu_caps.has_sse2 &&
       (func == PIPE_FUNC_LESS ||
        func == PIPE_FUNC_LEQUAL ||
        func == PIPE_FUNC_GREATER ||
        func == PIPE_FUNC_GEQUAL) &&
       (gallivm_debug & GALLIVM_DEBUG_PERF)) {
      debug_printf("%s: inefficient <%u x i%u> unsigned comparison\n",
                   __FUNCTION__, type.length, type.width);
   }
#endif

#if HAVE_LLVM < 0x0207
#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
   if(type.width * type.length == 128) {
      if(type.floating && util_cpu_caps.has_sse) {
         /* float[4] comparison */
         LLVMTypeRef vec_type = lp_build_vec_type(gallivm, type);
         LLVMValueRef args[3];
         unsigned cc;
         boolean swap;

         swap = FALSE;
         switch(func) {
         case PIPE_FUNC_EQUAL:
            cc = 0;
            break;
         case PIPE_FUNC_NOTEQUAL:
            cc = 4;
            break;
         case PIPE_FUNC_LESS:
            cc = 1;
            break;
         case PIPE_FUNC_LEQUAL:
            cc = 2;
            break;
         case PIPE_FUNC_GREATER:
            cc = 1;
            swap = TRUE;
            break;
         case PIPE_FUNC_GEQUAL:
            cc = 2;
            swap = TRUE;
            break;
         default:
            assert(0);
            return lp_build_undef(gallivm, type);
         }

         if(swap) {
            args[0] = b;
            args[1] = a;
         }
         else {
            args[0] = a;
            args[1] = b;
         }

         args[2] = LLVMConstInt(LLVMInt8TypeInContext(gallivm->context), cc, 0);
         res = lp_build_intrinsic(builder,
                                  "llvm.x86.sse.cmp.ps",
                                  vec_type,
                                  args, 3);
         res = LLVMBuildBitCast(builder, res, int_vec_type, "");
         return res;
      }
      else if(util_cpu_caps.has_sse2) {
         /* int[4] comparison */
         static const struct {
            unsigned swap:1;
            unsigned eq:1;
            unsigned gt:1;
            unsigned not:1;
         } table[] = {
            {0, 0, 0, 1}, /* PIPE_FUNC_NEVER */
            {1, 0, 1, 0}, /* PIPE_FUNC_LESS */
            {0, 1, 0, 0}, /* PIPE_FUNC_EQUAL */
            {0, 0, 1, 1}, /* PIPE_FUNC_LEQUAL */
            {0, 0, 1, 0}, /* PIPE_FUNC_GREATER */
            {0, 1, 0, 1}, /* PIPE_FUNC_NOTEQUAL */
            {1, 0, 1, 1}, /* PIPE_FUNC_GEQUAL */
            {0, 0, 0, 0}  /* PIPE_FUNC_ALWAYS */
         };
         const char *pcmpeq;
         const char *pcmpgt;
         LLVMValueRef args[2];
         LLVMValueRef res;
         LLVMTypeRef vec_type = lp_build_vec_type(gallivm, type);

         switch (type.width) {
         case 8:
            pcmpeq = "llvm.x86.sse2.pcmpeq.b";
            pcmpgt = "llvm.x86.sse2.pcmpgt.b";
            break;
         case 16:
            pcmpeq = "llvm.x86.sse2.pcmpeq.w";
            pcmpgt = "llvm.x86.sse2.pcmpgt.w";
            break;
         case 32:
            pcmpeq = "llvm.x86.sse2.pcmpeq.d";
            pcmpgt = "llvm.x86.sse2.pcmpgt.d";
            break;
         default:
            assert(0);
            return lp_build_undef(gallivm, type);
         }

         /* There are no unsigned comparison instructions. So flip the sign bit
          * so that the results match.
          */
         if (table[func].gt && !type.sign) {
            LLVMValueRef msb = lp_build_const_int_vec(gallivm, type,
                                  (unsigned long long)1 << (type.width - 1));
            a = LLVMBuildXor(builder, a, msb, "");
            b = LLVMBuildXor(builder, b, msb, "");
         }

         if(table[func].swap) {
            args[0] = b;
            args[1] = a;
         }
         else {
            args[0] = a;
            args[1] = b;
         }

         if(table[func].eq)
            res = lp_build_intrinsic(builder, pcmpeq, vec_type, args, 2);
         else if (table[func].gt)
            res = lp_build_intrinsic(builder, pcmpgt, vec_type, args, 2);
         else
            res = LLVMConstNull(vec_type);

         if(table[func].not)
            res = LLVMBuildNot(builder, res, "");

         return res;
      }
   } /* if (type.width * type.length == 128) */
#endif
#endif /* HAVE_LLVM < 0x0207 */

   /* XXX: It is not clear if we should use the ordered or unordered operators */

   if(type.floating) {
      LLVMRealPredicate op;
      switch(func) {
      case PIPE_FUNC_NEVER:
         op = LLVMRealPredicateFalse;
         break;
      case PIPE_FUNC_ALWAYS:
         op = LLVMRealPredicateTrue;
         break;
      case PIPE_FUNC_EQUAL:
         op = LLVMRealUEQ;
         break;
      case PIPE_FUNC_NOTEQUAL:
         op = LLVMRealUNE;
         break;
      case PIPE_FUNC_LESS:
         op = LLVMRealULT;
         break;
      case PIPE_FUNC_LEQUAL:
         op = LLVMRealULE;
         break;
      case PIPE_FUNC_GREATER:
         op = LLVMRealUGT;
         break;
      case PIPE_FUNC_GEQUAL:
         op = LLVMRealUGE;
         break;
      default:
         assert(0);
         return lp_build_undef(gallivm, type);
      }

#if HAVE_LLVM >= 0x0207
      cond = LLVMBuildFCmp(builder, op, a, b, "");
      res = LLVMBuildSExt(builder, cond, int_vec_type, "");
#else
      if (type.length == 1) {
         cond = LLVMBuildFCmp(builder, op, a, b, "");
         res = LLVMBuildSExt(builder, cond, int_vec_type, "");
      }
      else {
         unsigned i;

         res = LLVMGetUndef(int_vec_type);

         debug_printf("%s: warning: using slow element-wise float"
                      " vector comparison\n", __FUNCTION__);
         for (i = 0; i < type.length; ++i) {
            LLVMValueRef index = lp_build_const_int32(gallivm, i);
            cond = LLVMBuildFCmp(builder, op,
                                 LLVMBuildExtractElement(builder, a, index, ""),
                                 LLVMBuildExtractElement(builder, b, index, ""),
                                 "");
            cond = LLVMBuildSelect(builder, cond,
                                   LLVMConstExtractElement(ones, index),
                                   LLVMConstExtractElement(zeros, index),
                                   "");
            res = LLVMBuildInsertElement(builder, res, cond, index, "");
         }
      }
#endif
   }
   else {
      LLVMIntPredicate op;
      switch(func) {
      case PIPE_FUNC_EQUAL:
         op = LLVMIntEQ;
         break;
      case PIPE_FUNC_NOTEQUAL:
         op = LLVMIntNE;
         break;
      case PIPE_FUNC_LESS:
         op = type.sign ? LLVMIntSLT : LLVMIntULT;
         break;
      case PIPE_FUNC_LEQUAL:
         op = type.sign ? LLVMIntSLE : LLVMIntULE;
         break;
      case PIPE_FUNC_GREATER:
         op = type.sign ? LLVMIntSGT : LLVMIntUGT;
         break;
      case PIPE_FUNC_GEQUAL:
         op = type.sign ? LLVMIntSGE : LLVMIntUGE;
         break;
      default:
         assert(0);
         return lp_build_undef(gallivm, type);
      }

#if HAVE_LLVM >= 0x0207
      cond = LLVMBuildICmp(builder, op, a, b, "");
      res = LLVMBuildSExt(builder, cond, int_vec_type, "");
#else
      if (type.length == 1) {
         cond = LLVMBuildICmp(builder, op, a, b, "");
         res = LLVMBuildSExt(builder, cond, int_vec_type, "");
      }
      else {
         unsigned i;

         res = LLVMGetUndef(int_vec_type);

         if (gallivm_debug & GALLIVM_DEBUG_PERF) {
            debug_printf("%s: using slow element-wise int"
                         " vector comparison\n", __FUNCTION__);
         }

         for(i = 0; i < type.length; ++i) {
            LLVMValueRef index = lp_build_const_int32(gallivm, i);
            cond = LLVMBuildICmp(builder, op,
                                 LLVMBuildExtractElement(builder, a, index, ""),
                                 LLVMBuildExtractElement(builder, b, index, ""),
                                 "");
            cond = LLVMBuildSelect(builder, cond,
                                   LLVMConstExtractElement(ones, index),
                                   LLVMConstExtractElement(zeros, index),
                                   "");
            res = LLVMBuildInsertElement(builder, res, cond, index, "");
         }
      }
#endif
   }

   return res;
}
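/*
 * Illustrative scalar sketch (not part of the library) of the sign-bit flip
 * used above: SSE2 only has signed pcmpgt, but XOR-ing both operands with
 * 0x80000000 maps unsigned order onto signed order, so a signed compare of
 * the flipped values gives the unsigned result. Assumes <stdint.h>.
 */
static inline int
unsigned_gt_via_signed(uint32_t a, uint32_t b)
{
   int32_t sa = (int32_t)(a ^ 0x80000000u);   /* flip sign bits */
   int32_t sb = (int32_t)(b ^ 0x80000000u);
   return sa > sb;                            /* == (a > b) unsigned */
}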
/**
 * Fetch texels from a texture, returning them in SoA layout.
 *
 * \param type  the desired return type for 'rgba'. The vector length
 *              is the number of texels to fetch
 *
 * \param base_ptr  points to the base of the texture mip tree.
 * \param offset    offset to start of the texture image block. For non-
 *                  compressed formats, this simply is an offset to the texel.
 *                  For compressed formats, it is an offset to the start of the
 *                  compressed data block.
 *
 * \param i, j  the sub-block pixel coordinates. For non-compressed formats
 *              these will always be (0,0). For compressed formats, i will
 *              be in [0, block_width-1] and j will be in [0, block_height-1].
 */
void
lp_build_fetch_rgba_soa(struct gallivm_state *gallivm,
                        const struct util_format_description *format_desc,
                        struct lp_type type,
                        LLVMValueRef base_ptr,
                        LLVMValueRef offset,
                        LLVMValueRef i,
                        LLVMValueRef j,
                        LLVMValueRef rgba_out[4])
{
   LLVMBuilderRef builder = gallivm->builder;

   if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN &&
       (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB ||
        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB ||
        format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) &&
       format_desc->block.width == 1 &&
       format_desc->block.height == 1 &&
       format_desc->block.bits <= type.width &&
       (format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT ||
        format_desc->channel[0].size == 32))
   {
      /*
       * The packed pixel fits into an element of the destination format. Put
       * the packed pixels into a vector and extract each component for all
       * vector elements in parallel.
       */

      LLVMValueRef packed;

      /*
       * gather the texels from the texture
       * Ex: packed = {XYZW, XYZW, XYZW, XYZW}
       */
      assert(format_desc->block.bits <= type.width);
      packed = lp_build_gather(gallivm,
                               type.length,
                               format_desc->block.bits,
                               type.width,
                               base_ptr, offset, FALSE);

      /*
       * convert texels to float rgba
       */
      lp_build_unpack_rgba_soa(gallivm,
                               format_desc,
                               type,
                               packed, rgba_out);
      return;
   }

   if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT ||
       format_desc->format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
      /*
       * similar conceptually to above but requiring special
       * AoS packed -> SoA float conversion code.
       */
      LLVMValueRef packed;

      assert(type.floating);
      assert(type.width == 32);

      packed = lp_build_gather(gallivm, type.length,
                               format_desc->block.bits,
                               type.width, base_ptr, offset,
                               FALSE);
      if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT) {
         lp_build_r11g11b10_to_float(gallivm, packed, rgba_out);
      }
      else {
         lp_build_rgb9e5_to_float(gallivm, packed, rgba_out);
      }
      return;
   }

   if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS &&
       format_desc->block.bits == 64) {
      /*
       * special case: the format is 64 bits wide but we only need
       * 32bit (or 8bit) from each block.
       */
      LLVMValueRef packed;

      if (format_desc->format == PIPE_FORMAT_X32_S8X24_UINT) {
         /*
          * for stencil simply fix up offsets - could in fact change
          * base_ptr instead even outside the shader.
          */
         unsigned mask = (1 << 8) - 1;
         LLVMValueRef s_offset = lp_build_const_int_vec(gallivm, type, 4);
         offset = LLVMBuildAdd(builder, offset, s_offset, "");
         packed = lp_build_gather(gallivm, type.length,
                                  32, type.width, base_ptr, offset, FALSE);
         packed = LLVMBuildAnd(builder, packed,
                               lp_build_const_int_vec(gallivm, type, mask), "");
      }
      else {
         assert(format_desc->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
         packed = lp_build_gather(gallivm, type.length,
                                  32, type.width, base_ptr, offset, TRUE);
         packed = LLVMBuildBitCast(builder, packed,
                                   lp_build_vec_type(gallivm, type), "");
      }
      /* for consistency with lp_build_unpack_rgba_soa() return sss1 or zzz1 */
      rgba_out[0] = rgba_out[1] = rgba_out[2] = packed;
      rgba_out[3] = lp_build_const_vec(gallivm, type, 1.0f);
      return;
   }

   /*
    * Try calling lp_build_fetch_rgba_aos for all pixels.
    */

   if (util_format_fits_8unorm(format_desc) &&
       type.floating && type.width == 32 &&
       (type.length == 1 || (type.length % 4 == 0))) {
      struct lp_type tmp_type;
      LLVMValueRef tmp;

      memset(&tmp_type, 0, sizeof tmp_type);
      tmp_type.width = 8;
      tmp_type.length = type.length * 4;
      tmp_type.norm = TRUE;

      tmp = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
                                    base_ptr, offset, i, j);

      lp_build_rgba8_to_fi32_soa(gallivm,
                                 type,
                                 tmp,
                                 rgba_out);

      return;
   }

   /*
    * Fallback to calling lp_build_fetch_rgba_aos for each pixel.
    *
    * This is not the most efficient way of fetching pixels, as we
    * miss some opportunities to do vectorization, but this is
    * convenient for formats or scenarios for which there was no
    * opportunity or incentive to optimize.
    */

   {
      unsigned k, chan;
      struct lp_type tmp_type;

      if (gallivm_debug & GALLIVM_DEBUG_PERF) {
         debug_printf("%s: scalar unpacking of %s\n",
                      __FUNCTION__, format_desc->short_name);
      }

      tmp_type = type;
      tmp_type.length = 4;

      for (chan = 0; chan < 4; ++chan) {
         rgba_out[chan] = lp_build_undef(gallivm, type);
      }

      /* loop over number of pixels */
      for(k = 0; k < type.length; ++k) {
         LLVMValueRef index = lp_build_const_int32(gallivm, k);
         LLVMValueRef offset_elem;
         LLVMValueRef i_elem, j_elem;
         LLVMValueRef tmp;

         offset_elem = LLVMBuildExtractElement(builder, offset,
                                               index, "");

         i_elem = LLVMBuildExtractElement(builder, i, index, "");
         j_elem = LLVMBuildExtractElement(builder, j, index, "");

         /* Get a single float[4]={R,G,B,A} pixel */
         tmp = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type,
                                       base_ptr, offset_elem,
                                       i_elem, j_elem);

         /*
          * Insert the AoS tmp value channels into the SoA result vectors at
          * position = 'index'.
          */
         for (chan = 0; chan < 4; ++chan) {
            LLVMValueRef chan_val = lp_build_const_int32(gallivm, chan);
            LLVMValueRef tmp_chan = LLVMBuildExtractElement(builder, tmp,
                                                            chan_val, "");
            rgba_out[chan] = LLVMBuildInsertElement(builder, rgba_out[chan],
                                                    tmp_chan, index, "");
         }
      }
   }
}
/**
 * Apply the stencil operator (add/sub/keep/etc) to the given vector
 * of stencil values.
 * \return  new stencil values vector
 */
static LLVMValueRef
lp_build_stencil_op_single(struct lp_build_context *bld,
                           const struct pipe_stencil_state *stencil,
                           enum stencil_op op,
                           LLVMValueRef stencilRef,
                           LLVMValueRef stencilVals)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct lp_type type = bld->type;
   LLVMValueRef res;
   LLVMValueRef max = lp_build_const_int_vec(bld->gallivm, type, 0xff);
   unsigned stencil_op;

   assert(type.sign);

   switch (op) {
   case S_FAIL_OP:
      stencil_op = stencil->fail_op;
      break;
   case Z_FAIL_OP:
      stencil_op = stencil->zfail_op;
      break;
   case Z_PASS_OP:
      stencil_op = stencil->zpass_op;
      break;
   default:
      assert(0 && "Invalid stencil_op mode");
      stencil_op = PIPE_STENCIL_OP_KEEP;
   }

   switch (stencil_op) {
   case PIPE_STENCIL_OP_KEEP:
      res = stencilVals;
      /* we can return early for this case */
      return res;
   case PIPE_STENCIL_OP_ZERO:
      res = bld->zero;
      break;
   case PIPE_STENCIL_OP_REPLACE:
      res = stencilRef;
      break;
   case PIPE_STENCIL_OP_INCR:
      res = lp_build_add(bld, stencilVals, bld->one);
      res = lp_build_min(bld, res, max);
      break;
   case PIPE_STENCIL_OP_DECR:
      res = lp_build_sub(bld, stencilVals, bld->one);
      res = lp_build_max(bld, res, bld->zero);
      break;
   case PIPE_STENCIL_OP_INCR_WRAP:
      res = lp_build_add(bld, stencilVals, bld->one);
      res = LLVMBuildAnd(builder, res, max, "");
      break;
   case PIPE_STENCIL_OP_DECR_WRAP:
      res = lp_build_sub(bld, stencilVals, bld->one);
      res = LLVMBuildAnd(builder, res, max, "");
      break;
   case PIPE_STENCIL_OP_INVERT:
      res = LLVMBuildNot(builder, stencilVals, "");
      res = LLVMBuildAnd(builder, res, max, "");
      break;
   default:
      assert(0 && "bad stencil op mode");
      res = bld->undef;
   }

   return res;
}
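/*
 * Illustrative scalar sketch (not part of the library) of the wrapping
 * increment above: saturating INCR clamps at 255, while INCR_WRAP masks
 * instead, so 255 + 1 wraps to 0. Assumes <stdint.h>.
 */
static inline uint32_t
stencil_incr_wrap(uint32_t s)
{
   return (s + 1) & 0xff;   /* wrap within the 8bit stencil range */
}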
LLVMValueRef
lp_build_swizzle_aos(struct lp_build_context *bld,
                     LLVMValueRef a,
                     const unsigned char swizzles[4])
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   const unsigned n = type.length;
   unsigned i, j;

   if (swizzles[0] == PIPE_SWIZZLE_X &&
       swizzles[1] == PIPE_SWIZZLE_Y &&
       swizzles[2] == PIPE_SWIZZLE_Z &&
       swizzles[3] == PIPE_SWIZZLE_W) {
      return a;
   }

   if (swizzles[0] == swizzles[1] &&
       swizzles[1] == swizzles[2] &&
       swizzles[2] == swizzles[3]) {
      switch (swizzles[0]) {
      case PIPE_SWIZZLE_X:
      case PIPE_SWIZZLE_Y:
      case PIPE_SWIZZLE_Z:
      case PIPE_SWIZZLE_W:
         return lp_build_swizzle_scalar_aos(bld, a, swizzles[0], 4);
      case PIPE_SWIZZLE_0:
         return bld->zero;
      case PIPE_SWIZZLE_1:
         return bld->one;
      case LP_BLD_SWIZZLE_DONTCARE:
         return bld->undef;
      default:
         assert(0);
         return bld->undef;
      }
   }

   if (LLVMIsConstant(a) ||
       type.width >= 16) {
      /*
       * Shuffle.
       */
      LLVMValueRef undef = LLVMGetUndef(lp_build_elem_type(bld->gallivm, type));
      LLVMTypeRef i32t = LLVMInt32TypeInContext(bld->gallivm->context);
      LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
      LLVMValueRef aux[LP_MAX_VECTOR_LENGTH];

      memset(aux, 0, sizeof aux);

      for(j = 0; j < n; j += 4) {
         for(i = 0; i < 4; ++i) {
            unsigned shuffle;
            switch (swizzles[i]) {
            default:
               assert(0);
               /* fall through */
            case PIPE_SWIZZLE_X:
            case PIPE_SWIZZLE_Y:
            case PIPE_SWIZZLE_Z:
            case PIPE_SWIZZLE_W:
               shuffle = j + swizzles[i];
               shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0);
               break;
            case PIPE_SWIZZLE_0:
               shuffle = type.length + 0;
               shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0);
               if (!aux[0]) {
                  aux[0] = lp_build_const_elem(bld->gallivm, type, 0.0);
               }
               break;
            case PIPE_SWIZZLE_1:
               shuffle = type.length + 1;
               shuffles[j + i] = LLVMConstInt(i32t, shuffle, 0);
               if (!aux[1]) {
                  aux[1] = lp_build_const_elem(bld->gallivm, type, 1.0);
               }
               break;
            case LP_BLD_SWIZZLE_DONTCARE:
               shuffles[j + i] = LLVMGetUndef(i32t);
               break;
            }
         }
      }

      for (i = 0; i < n; ++i) {
         if (!aux[i]) {
            aux[i] = undef;
         }
      }

      return LLVMBuildShuffleVector(builder, a,
                                    LLVMConstVector(aux, n),
                                    LLVMConstVector(shuffles, n), "");
   }
   else {
      /*
       * Bit mask and shifts.
       *
       * For example, this will convert BGRA to RGBA by doing
       *
       * Little endian:
       *   rgba = (bgra & 0x00ff0000) >> 16
       *        | (bgra & 0xff00ff00)
       *        | (bgra & 0x000000ff) << 16
       *
       * Big endian:
       *   rgba = (bgra & 0x0000ff00) << 16
       *        | (bgra & 0x00ff00ff)
       *        | (bgra & 0xff000000) >> 16
       *
       * This is necessary not only for speed, but also because the X86
       * backend will refuse shuffles of <4 x i8> vectors.
       */
      LLVMValueRef res;
      struct lp_type type4;
      unsigned cond = 0;
      unsigned chan;
      int shift;

      /*
       * Start with a mixture of 1 and 0.
       */
      for (chan = 0; chan < 4; ++chan) {
         if (swizzles[chan] == PIPE_SWIZZLE_1) {
            cond |= 1 << chan;
         }
      }
      res = lp_build_select_aos(bld, cond, bld->one, bld->zero, 4);

      /*
       * Build a type where each element is an integer that cover the four
       * channels.
       */
      type4 = type;
      type4.floating = FALSE;
      type4.width *= 4;
      type4.length /= 4;

      a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, type4), "");
      res = LLVMBuildBitCast(builder, res, lp_build_vec_type(bld->gallivm, type4), "");

      /*
       * Mask and shift the channels, trying to group as many channels in the
       * same shift as possible. The shift amount is positive for shifts left
       * and negative for shifts right.
       */
      for (shift = -3; shift <= 3; ++shift) {
         uint64_t mask = 0;

         assert(type4.width <= sizeof(mask)*8);

         /*
          * Vector element numbers follow the XYZW order, so 0 is always X, etc.
          * After widening 4 times we have:
          *
          *                                 3210
          * Little-endian register layout:  WZYX
          *
          *                                 0123
          * Big-endian register layout:     XYZW
          *
          * For little-endian, higher-numbered channels are obtained by a
          * shift right (negative shift amount) and lower-numbered channels
          * by a shift left (positive shift amount). The opposite is true
          * for big-endian.
          */
         for (chan = 0; chan < 4; ++chan) {
            if (swizzles[chan] < 4) {
               /* We need to move channel swizzles[chan] into channel chan */
#ifdef PIPE_ARCH_LITTLE_ENDIAN
               if (swizzles[chan] - chan == -shift) {
                  mask |= ((1ULL << type.width) - 1) << (swizzles[chan] * type.width);
               }
#else
               if (swizzles[chan] - chan == shift) {
                  mask |= ((1ULL << type.width) - 1) << (type4.width - type.width)
                          >> (swizzles[chan] * type.width);
               }
#endif
            }
         }

         if (mask) {
            LLVMValueRef masked;
            LLVMValueRef shifted;
            if (0)
               debug_printf("shift = %i, mask = %" PRIx64 "\n", shift, mask);
            masked = LLVMBuildAnd(builder, a,
                                  lp_build_const_int_vec(bld->gallivm, type4, mask), "");
            if (shift > 0) {
               shifted = LLVMBuildShl(builder, masked,
                                      lp_build_const_int_vec(bld->gallivm, type4,
                                                             shift*type.width), "");
            }
            else if (shift < 0) {
               shifted = LLVMBuildLShr(builder, masked,
                                       lp_build_const_int_vec(bld->gallivm, type4,
                                                              -shift*type.width), "");
            }
            else {
               shifted = masked;
            }

            res = LLVMBuildOr(builder, res, shifted, "");
         }
      }

      return LLVMBuildBitCast(builder, res,
                              lp_build_vec_type(bld->gallivm, type), "");
   }
}
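/*
 * Illustrative scalar sketch (not part of the library) of the little-endian
 * mask/shift swizzle above, swapping R and B of one packed BGRA pixel while
 * keeping G and A in place. Assumes <stdint.h>.
 */
static inline uint32_t
bgra_to_rgba_scalar(uint32_t bgra)
{
   return ((bgra & 0x00ff0000) >> 16) |   /* R down to byte 0 */
          (bgra & 0xff00ff00) |           /* G and A unchanged */
          ((bgra & 0x000000ff) << 16);    /* B up to byte 2 */
}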
/** * Swizzle one channel into other channels. */ LLVMValueRef lp_build_swizzle_scalar_aos(struct lp_build_context *bld, LLVMValueRef a, unsigned channel, unsigned num_channels) { LLVMBuilderRef builder = bld->gallivm->builder; const struct lp_type type = bld->type; const unsigned n = type.length; unsigned i, j; if(a == bld->undef || a == bld->zero || a == bld->one || num_channels == 1) return a; assert(num_channels == 2 || num_channels == 4); /* XXX: SSE3 has PSHUFB which should be better than bitmasks, but forcing * using shuffles here actually causes worst results. More investigation is * needed. */ if (LLVMIsConstant(a) || type.width >= 16) { /* * Shuffle. */ LLVMTypeRef elem_type = LLVMInt32TypeInContext(bld->gallivm->context); LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH]; for(j = 0; j < n; j += num_channels) for(i = 0; i < num_channels; ++i) shuffles[j + i] = LLVMConstInt(elem_type, j + channel, 0); return LLVMBuildShuffleVector(builder, a, bld->undef, LLVMConstVector(shuffles, n), ""); } else if (num_channels == 2) { /* * Bit mask and shifts * * XY XY .... XY <= input * 0Y 0Y .... 0Y * YY YY .... YY * YY YY .... YY <= output */ struct lp_type type2; LLVMValueRef tmp = NULL; int shift; a = LLVMBuildAnd(builder, a, lp_build_const_mask_aos(bld->gallivm, type, 1 << channel, num_channels), ""); type2 = type; type2.floating = FALSE; type2.width *= 2; type2.length /= 2; a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, type2), ""); /* * Vector element 0 is always channel X. * * 76 54 32 10 (array numbering) * Little endian reg in: YX YX YX YX * Little endian reg out: YY YY YY YY if shift right (shift == -1) * XX XX XX XX if shift left (shift == 1) * * 01 23 45 67 (array numbering) * Big endian reg in: XY XY XY XY * Big endian reg out: YY YY YY YY if shift left (shift == 1) * XX XX XX XX if shift right (shift == -1) * */ #ifdef PIPE_ARCH_LITTLE_ENDIAN shift = channel == 0 ? 1 : -1; #else shift = channel == 0 ? -1 : 1; #endif if (shift > 0) { tmp = LLVMBuildShl(builder, a, lp_build_const_int_vec(bld->gallivm, type2, shift * type.width), ""); } else if (shift < 0) { tmp = LLVMBuildLShr(builder, a, lp_build_const_int_vec(bld->gallivm, type2, -shift * type.width), ""); } assert(tmp); if (tmp) { a = LLVMBuildOr(builder, a, tmp, ""); } return LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, type), ""); } else { /* * Bit mask and recursive shifts * * Little-endian registers: * * 7654 3210 * WZYX WZYX .... WZYX <= input * 00Y0 00Y0 .... 00Y0 <= mask * 00YY 00YY .... 00YY <= shift right 1 (shift amount -1) * YYYY YYYY .... YYYY <= shift left 2 (shift amount 2) * * Big-endian registers: * * 0123 4567 * XYZW XYZW .... XYZW <= input * 0Y00 0Y00 .... 0Y00 <= mask * YY00 YY00 .... YY00 <= shift left 1 (shift amount 1) * YYYY YYYY .... YYYY <= shift right 2 (shift amount -2) * * shifts[] gives little-endian shift amounts; we need to negate for big-endian. */ struct lp_type type4; const int shifts[4][2] = { { 1, 2}, {-1, 2}, { 1, -2}, {-1, -2} }; unsigned i; a = LLVMBuildAnd(builder, a, lp_build_const_mask_aos(bld->gallivm, type, 1 << channel, 4), ""); /* * Build a type where each element is an integer that cover the four * channels. 
*/ type4 = type; type4.floating = FALSE; type4.width *= 4; type4.length /= 4; a = LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, type4), ""); for(i = 0; i < 2; ++i) { LLVMValueRef tmp = NULL; int shift = shifts[channel][i]; /* See endianness diagram above */ #ifdef PIPE_ARCH_BIG_ENDIAN shift = -shift; #endif if(shift > 0) tmp = LLVMBuildShl(builder, a, lp_build_const_int_vec(bld->gallivm, type4, shift*type.width), ""); if(shift < 0) tmp = LLVMBuildLShr(builder, a, lp_build_const_int_vec(bld->gallivm, type4, -shift*type.width), ""); assert(tmp); if(tmp) a = LLVMBuildOr(builder, a, tmp, ""); } return LLVMBuildBitCast(builder, a, lp_build_vec_type(bld->gallivm, type), ""); } }
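/*
 * The "mask and recursive shifts" broadcast above boils down to the scalar
 * sketch below: after masking, the first shift-and-OR doubles the populated
 * channels and the second fills the remaining two. Illustrative only
 * (hypothetical helper; little-endian, 8-bit channels, assumes <stdint.h>).
 */
static uint32_t
broadcast_channel_scalar(uint32_t packed, unsigned channel)
{
   static const int shifts[4][2] = { {1, 2}, {-1, 2}, {1, -2}, {-1, -2} };
   uint32_t a = packed & (0xffu << (channel * 8));   /* e.g. 00Y0 */
   unsigned i;

   for (i = 0; i < 2; ++i) {
      int shift = shifts[channel][i];
      if (shift > 0)
         a |= a << (shift * 8);    /* e.g. 00Y0 -> 00YY (channel 1) */
      else
         a |= a >> (-shift * 8);   /* e.g. 00YY -> YYYY (channel 1) */
   }
   return a;
}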
/*
 * Do a cached lookup.
 *
 * Returns (vectors of) 4x8 rgba aos values
 */
LLVMValueRef
lp_build_fetch_cached_texels(struct gallivm_state *gallivm,
                             const struct util_format_description *format_desc,
                             unsigned n,
                             LLVMValueRef base_ptr,
                             LLVMValueRef offset,
                             LLVMValueRef i,
                             LLVMValueRef j,
                             LLVMValueRef cache)
{
   LLVMBuilderRef builder = gallivm->builder;
   unsigned count, low_bit, log2size;
   LLVMValueRef color, offset_stored, addr, ptr_addrtrunc, tmp;
   LLVMValueRef ij_index, hash_index, hash_mask, block_index;
   LLVMTypeRef i8t = LLVMInt8TypeInContext(gallivm->context);
   LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
   LLVMTypeRef i64t = LLVMInt64TypeInContext(gallivm->context);
   struct lp_type type;
   struct lp_build_context bld32;

   memset(&type, 0, sizeof type);
   type.width = 32;
   type.length = n;

   assert(format_desc->block.width == 4);
   assert(format_desc->block.height == 4);

   lp_build_context_init(&bld32, gallivm, type);

   /*
    * Compute the hash - we use a direct-mapped cache; the hash function
    * could be better, but it needs to be simple.
    * per-element:
    *    compare offset with offset stored at tag (hash)
    *    if not equal decode/store block, update tag
    *    extract color from cache
    *    assemble result vector
    */

   /* TODO: not ideal with 32bit pointers... */

   low_bit = util_logbase2(format_desc->block.bits / 8);
   log2size = util_logbase2(LP_BUILD_FORMAT_CACHE_SIZE);
   addr = LLVMBuildPtrToInt(builder, base_ptr, i64t, "");
   ptr_addrtrunc = LLVMBuildPtrToInt(builder, base_ptr, i32t, "");
   ptr_addrtrunc = lp_build_broadcast_scalar(&bld32, ptr_addrtrunc);
   /* For the hash function, first mask off the unused lowest bits. Then just
      do some xor with address bits - only use the lower 32 bits */
   ptr_addrtrunc = LLVMBuildAdd(builder, offset, ptr_addrtrunc, "");
   ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc,
                                 lp_build_const_int_vec(gallivm, type, low_bit), "");
   /* This only really makes sense for sizes 64, 128, 256 */
   hash_index = ptr_addrtrunc;
   ptr_addrtrunc = LLVMBuildLShr(builder, ptr_addrtrunc,
                                 lp_build_const_int_vec(gallivm, type, 2*log2size), "");
   hash_index = LLVMBuildXor(builder, ptr_addrtrunc, hash_index, "");
   tmp = LLVMBuildLShr(builder, hash_index,
                       lp_build_const_int_vec(gallivm, type, log2size), "");
   hash_index = LLVMBuildXor(builder, hash_index, tmp, "");

   hash_mask = lp_build_const_int_vec(gallivm, type, LP_BUILD_FORMAT_CACHE_SIZE - 1);
   hash_index = LLVMBuildAnd(builder, hash_index, hash_mask, "");
   ij_index = LLVMBuildShl(builder, i, lp_build_const_int_vec(gallivm, type, 2), "");
   ij_index = LLVMBuildAdd(builder, ij_index, j, "");
   block_index = LLVMBuildShl(builder, hash_index,
                              lp_build_const_int_vec(gallivm, type, 4), "");
   block_index = LLVMBuildAdd(builder, ij_index, block_index, "");

   if (n > 1) {
      color = LLVMGetUndef(LLVMVectorType(i32t, n));
      for (count = 0; count < n; count++) {
         LLVMValueRef index, cond, colorx;
         LLVMValueRef block_indexx, hash_indexx, addrx, offsetx, ptr_addrx;
         struct lp_build_if_state if_ctx;

         index = lp_build_const_int32(gallivm, count);
         offsetx = LLVMBuildExtractElement(builder, offset, index, "");
         addrx = LLVMBuildZExt(builder, offsetx, i64t, "");
         addrx = LLVMBuildAdd(builder, addrx, addr, "");
         block_indexx = LLVMBuildExtractElement(builder, block_index, index, "");
         hash_indexx = LLVMBuildLShr(builder, block_indexx,
                                     lp_build_const_int32(gallivm, 4), "");
         offset_stored = lookup_tag_data(gallivm, cache, hash_indexx);
         cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addrx, "");

         lp_build_if(&if_ctx, gallivm, cond);
         {
            ptr_addrx = LLVMBuildIntToPtr(builder, addrx,
                                          LLVMPointerType(i8t, 0), "");
update_cached_block(gallivm, format_desc, ptr_addrx, hash_indexx, cache); #if LP_BUILD_FORMAT_CACHE_DEBUG update_cache_access(gallivm, cache, 1, LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS); #endif } lp_build_endif(&if_ctx); colorx = lookup_cached_pixel(gallivm, cache, block_indexx); color = LLVMBuildInsertElement(builder, color, colorx, lp_build_const_int32(gallivm, count), ""); } } else { LLVMValueRef cond; struct lp_build_if_state if_ctx; tmp = LLVMBuildZExt(builder, offset, i64t, ""); addr = LLVMBuildAdd(builder, tmp, addr, ""); offset_stored = lookup_tag_data(gallivm, cache, hash_index); cond = LLVMBuildICmp(builder, LLVMIntNE, offset_stored, addr, ""); lp_build_if(&if_ctx, gallivm, cond); { tmp = LLVMBuildIntToPtr(builder, addr, LLVMPointerType(i8t, 0), ""); update_cached_block(gallivm, format_desc, tmp, hash_index, cache); #if LP_BUILD_FORMAT_CACHE_DEBUG update_cache_access(gallivm, cache, 1, LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_MISS); #endif } lp_build_endif(&if_ctx); color = lookup_cached_pixel(gallivm, cache, block_index); } #if LP_BUILD_FORMAT_CACHE_DEBUG update_cache_access(gallivm, cache, n, LP_BUILD_FORMAT_CACHE_MEMBER_ACCESS_TOTAL); #endif return LLVMBuildBitCast(builder, color, LLVMVectorType(i8t, n * 4), ""); }
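/*
 * The hash computed above corresponds to this scalar sketch (illustrative
 * only; assumes LP_BUILD_FORMAT_CACHE_SIZE is a power of two and that the
 * truncated texel address has already been formed as base + offset):
 */
static uint32_t
texel_cache_hash_scalar(uint32_t addr, unsigned low_bit, unsigned log2size)
{
   uint32_t h = addr >> low_bit;       /* drop the bits below the block size */
   h ^= h >> (2 * log2size);           /* fold higher address bits in */
   h ^= h >> log2size;
   return h & (LP_BUILD_FORMAT_CACHE_SIZE - 1);   /* direct-mapped index */
}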
/**
 * Generate code for performing depth and/or stencil tests.
 * We operate on a vector of values (typically a 2x2 quad).
 *
 * \param depth the depth test state
 * \param stencil the front/back stencil state
 * \param type the data type of the fragment depth/stencil values
 * \param format_desc description of the depth/stencil surface
 * \param mask the alive/dead pixel mask for the quad (vector)
 * \param stencil_refs the front/back stencil ref values (scalar)
 * \param z_src the incoming depth/stencil values (a 2x2 quad)
 * \param zs_dst_ptr pointer to depth/stencil values in framebuffer
 * \param face contains float value indicating front/back facing polygon
 * \param counter the occlusion counter to increment, or NULL
 */
void
lp_build_depth_stencil_test(LLVMBuilderRef builder,
                            const struct pipe_depth_state *depth,
                            const struct pipe_stencil_state stencil[2],
                            struct lp_type type,
                            const struct util_format_description *format_desc,
                            struct lp_build_mask_context *mask,
                            LLVMValueRef stencil_refs[2],
                            LLVMValueRef z_src,
                            LLVMValueRef zs_dst_ptr,
                            LLVMValueRef face,
                            LLVMValueRef counter)
{
   struct lp_build_context bld;
   struct lp_build_context sbld;
   struct lp_type s_type;
   LLVMValueRef zs_dst, z_dst = NULL;
   LLVMValueRef stencil_vals = NULL;
   LLVMValueRef z_bitmask = NULL, stencil_shift = NULL;
   LLVMValueRef z_pass = NULL, s_pass_mask = NULL;
   LLVMValueRef orig_mask = mask->value;

   /* Sanity checking */
   {
      const unsigned z_swizzle = format_desc->swizzle[0];
      const unsigned s_swizzle = format_desc->swizzle[1];

      assert(z_swizzle != UTIL_FORMAT_SWIZZLE_NONE ||
             s_swizzle != UTIL_FORMAT_SWIZZLE_NONE);

      assert(depth->enabled || stencil[0].enabled);

      assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
      assert(format_desc->block.width == 1);
      assert(format_desc->block.height == 1);

      if (stencil[0].enabled) {
         assert(format_desc->format == PIPE_FORMAT_Z24_UNORM_S8_USCALED ||
                format_desc->format == PIPE_FORMAT_S8_USCALED_Z24_UNORM);
      }

      assert(z_swizzle < 4);
      assert(format_desc->block.bits == type.width);
      if (type.floating) {
         assert(z_swizzle == 0);
         assert(format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_FLOAT);
         assert(format_desc->channel[z_swizzle].size == format_desc->block.bits);
      }
      else {
         assert(format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED);
         assert(format_desc->channel[z_swizzle].normalized);
         assert(!type.fixed);
         assert(!type.sign);
         assert(type.norm);
      }
   }

   /* Setup build context for Z vals */
   lp_build_context_init(&bld, builder, type);

   /* Setup build context for stencil vals */
   s_type = lp_type_int_vec(type.width);
   lp_build_context_init(&sbld, builder, s_type);

   /* Load current z/stencil value from z/stencil buffer */
   zs_dst = LLVMBuildLoad(builder, zs_dst_ptr, "");
   lp_build_name(zs_dst, "zsbufval");

   /* Compute and apply the Z/stencil bitmasks and shifts.
    */
   {
      unsigned z_shift, z_mask;
      unsigned s_shift, s_mask;

      if (get_z_shift_and_mask(format_desc, &z_shift, &z_mask)) {
         if (z_shift) {
            LLVMValueRef shift = lp_build_const_int_vec(type, z_shift);
            z_src = LLVMBuildLShr(builder, z_src, shift, "");
         }

         if (z_mask != 0xffffffff) {
            LLVMValueRef mask = lp_build_const_int_vec(type, z_mask);
            z_src = LLVMBuildAnd(builder, z_src, mask, "");
            z_dst = LLVMBuildAnd(builder, zs_dst, mask, "");
            z_bitmask = mask;  /* used below */
         }
         else {
            z_dst = zs_dst;
         }

         lp_build_name(z_dst, "zsbuf.z");
      }

      if (get_s_shift_and_mask(format_desc, &s_shift, &s_mask)) {
         if (s_shift) {
            LLVMValueRef shift = lp_build_const_int_vec(type, s_shift);
            stencil_vals = LLVMBuildLShr(builder, zs_dst, shift, "");
            stencil_shift = shift;  /* used below */
         }
         else {
            stencil_vals = zs_dst;
         }

         if (s_mask != 0xffffffff) {
            LLVMValueRef mask = lp_build_const_int_vec(type, s_mask);
            stencil_vals = LLVMBuildAnd(builder, stencil_vals, mask, "");
         }

         lp_build_name(stencil_vals, "stencil");
      }
   }

   if (stencil[0].enabled) {
      /* convert scalar stencil refs into vectors */
      stencil_refs[0] = lp_build_broadcast_scalar(&bld, stencil_refs[0]);
      stencil_refs[1] = lp_build_broadcast_scalar(&bld, stencil_refs[1]);

      s_pass_mask = lp_build_stencil_test(&sbld, stencil,
                                          stencil_refs, stencil_vals, face);

      /* apply stencil-fail operator */
      {
         LLVMValueRef s_fail_mask = lp_build_andc(&bld, orig_mask, s_pass_mask);
         stencil_vals = lp_build_stencil_op(&sbld, stencil, S_FAIL_OP,
                                            stencil_refs, stencil_vals,
                                            s_fail_mask, face);
      }
   }

   if (depth->enabled) {
      /* compare src Z to dst Z, returning 'pass' mask */
      z_pass = lp_build_cmp(&bld, depth->func, z_src, z_dst);

      if (!stencil[0].enabled) {
         /* We can potentially skip all remaining operations here, but only
          * if stencil is disabled, because otherwise we still need to update
          * the stencil buffer values.  We don't need to update the Z buffer
          * values.
          */
         lp_build_mask_update(mask, z_pass);
      }

      if (depth->writemask) {
         LLVMValueRef zselectmask = mask->value;

         /* mask off bits that failed Z test */
         zselectmask = LLVMBuildAnd(builder, zselectmask, z_pass, "");

         /* mask off bits that failed stencil test */
         if (s_pass_mask) {
            zselectmask = LLVMBuildAnd(builder, zselectmask, s_pass_mask, "");
         }

         /* if combined Z/stencil format, mask off the stencil bits */
         if (z_bitmask) {
            zselectmask = LLVMBuildAnd(builder, zselectmask, z_bitmask, "");
         }

         /* Mix the old and new Z buffer values.
          * z_dst[i] = (zselectmask[i] & z_src[i]) | (~zselectmask[i] & z_dst[i])
          */
         z_dst = lp_build_select_bitwise(&bld, zselectmask, z_src, z_dst);
      }

      if (stencil[0].enabled) {
         /* update stencil buffer values according to z pass/fail result */
         LLVMValueRef z_fail_mask, z_pass_mask;

         /* apply Z-fail operator */
         z_fail_mask = lp_build_andc(&bld, orig_mask, z_pass);
         stencil_vals = lp_build_stencil_op(&sbld, stencil, Z_FAIL_OP,
                                            stencil_refs, stencil_vals,
                                            z_fail_mask, face);

         /* apply Z-pass operator */
         z_pass_mask = LLVMBuildAnd(bld.builder, orig_mask, z_pass, "");
         stencil_vals = lp_build_stencil_op(&sbld, stencil, Z_PASS_OP,
                                            stencil_refs, stencil_vals,
                                            z_pass_mask, face);
      }
   }
   else {
      /* No depth test: apply Z-pass operator to stencil buffer values which
       * passed the stencil test.
       */
      s_pass_mask = LLVMBuildAnd(bld.builder, orig_mask, s_pass_mask, "");
      stencil_vals = lp_build_stencil_op(&sbld, stencil, Z_PASS_OP,
                                         stencil_refs, stencil_vals,
                                         s_pass_mask, face);
   }

   /* The Z bits are already in the right place but we may need to shift the
    * stencil bits before ORing Z with stencil to make the final pixel value.
*/ if (stencil_vals && stencil_shift) stencil_vals = LLVMBuildShl(bld.builder, stencil_vals, stencil_shift, ""); /* Finally, merge/store the z/stencil values */ if ((depth->enabled && depth->writemask) || (stencil[0].enabled && stencil[0].writemask)) { if (z_dst && stencil_vals) zs_dst = LLVMBuildOr(bld.builder, z_dst, stencil_vals, ""); else if (z_dst) zs_dst = z_dst; else zs_dst = stencil_vals; LLVMBuildStore(builder, zs_dst, zs_dst_ptr); } if (s_pass_mask) lp_build_mask_update(mask, s_pass_mask); if (depth->enabled && stencil[0].enabled) lp_build_mask_update(mask, z_pass); if (counter) lp_build_occlusion_count(builder, type, mask->value, counter); }
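/*
 * For a combined Z24/S8-style buffer, the store above amounts to this scalar
 * sketch (hypothetical helper; s_shift is the stencil position within the
 * 32-bit word, e.g. 24 for PIPE_FORMAT_Z24_UNORM_S8_USCALED):
 */
static uint32_t
merge_z_stencil_scalar(uint32_t z_bits, uint32_t s_bits, unsigned s_shift)
{
   /* the Z bits are already in place; shift stencil back up and OR it in */
   return z_bits | (s_bits << s_shift);
}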
/**
 * Unpack several pixels in SoA.
 *
 * It takes a vector of packed pixels:
 *
 *   packed = {P0, P1, P2, P3, ..., Pn}
 *
 * And will produce four vectors:
 *
 *   red    = {R0, R1, R2, R3, ..., Rn}
 *   green  = {G0, G1, G2, G3, ..., Gn}
 *   blue   = {B0, B1, B2, B3, ..., Bn}
 *   alpha  = {A0, A1, A2, A3, ..., An}
 *
 * It requires that a packed pixel fits into an element of the output
 * channels. The common case is converting pixels with a depth of 32 bits or
 * less into floats.
 *
 * \param format_desc the format of the 'packed' incoming pixel vector
 * \param type the desired type for rgba_out (type.length = n, above)
 * \param packed the incoming vector of packed pixels
 * \param rgba_out returns the SoA R,G,B,A vectors
 */
void
lp_build_unpack_rgba_soa(struct gallivm_state *gallivm,
                         const struct util_format_description *format_desc,
                         struct lp_type type,
                         LLVMValueRef packed,
                         LLVMValueRef rgba_out[4])
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_build_context bld;
   LLVMValueRef inputs[4];
   unsigned chan;

   assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN);
   assert(format_desc->block.width == 1);
   assert(format_desc->block.height == 1);
   assert(format_desc->block.bits <= type.width);
   /* FIXME: Support more output types */
   assert(type.width == 32);

   lp_build_context_init(&bld, gallivm, type);

   /* Decode the input vector components */
   for (chan = 0; chan < format_desc->nr_channels; ++chan) {
      const unsigned width = format_desc->channel[chan].size;
      const unsigned start = format_desc->channel[chan].shift;
      const unsigned stop = start + width;
      LLVMValueRef input;

      input = packed;

      switch(format_desc->channel[chan].type) {
      case UTIL_FORMAT_TYPE_VOID:
         input = lp_build_undef(gallivm, type);
         break;

      case UTIL_FORMAT_TYPE_UNSIGNED:
         /*
          * Align the LSB
          */
         if (start) {
            input = LLVMBuildLShr(builder, input,
                                  lp_build_const_int_vec(gallivm, type, start), "");
         }

         /*
          * Zero the MSBs
          */
         if (stop < format_desc->block.bits) {
            unsigned mask = ((unsigned long long)1 << width) - 1;
            input = LLVMBuildAnd(builder, input,
                                 lp_build_const_int_vec(gallivm, type, mask), "");
         }

         /*
          * Type conversion
          */
         if (type.floating) {
            if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) {
               assert(width == 8);
               if (format_desc->swizzle[3] == chan) {
                  input = lp_build_unsigned_norm_to_float(gallivm, width, type, input);
               }
               else {
                  struct lp_type conv_type = lp_uint_type(type);
                  input = lp_build_srgb_to_linear(gallivm, conv_type, input);
               }
            }
            else {
               if(format_desc->channel[chan].normalized)
                  input = lp_build_unsigned_norm_to_float(gallivm, width, type, input);
               else
                  input = LLVMBuildSIToFP(builder, input,
                                          lp_build_vec_type(gallivm, type), "");
            }
         }
         else if (format_desc->channel[chan].pure_integer) {
            /* Nothing to do */
         }
         else {
            /* FIXME */
            assert(0);
         }
         break;

      case UTIL_FORMAT_TYPE_SIGNED:
         /*
          * Align the sign bit first.
          */
         if (stop < type.width) {
            unsigned bits = type.width - stop;
            LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
            input = LLVMBuildShl(builder, input, bits_val, "");
         }

         /*
          * Align the LSB (with an arithmetic shift to preserve the sign)
          */
         if (format_desc->channel[chan].size < type.width) {
            unsigned bits = type.width - format_desc->channel[chan].size;
            LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits);
            input = LLVMBuildAShr(builder, input, bits_val, "");
         }

         /*
          * Type conversion
          */
         if (type.floating) {
            input = LLVMBuildSIToFP(builder, input,
                                    lp_build_vec_type(gallivm, type), "");
            if (format_desc->channel[chan].normalized) {
               double scale = 1.0 / ((1 << (format_desc->channel[chan].size - 1)) - 1);
               LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
               input = LLVMBuildFMul(builder, input, scale_val, "");
               /* The formula above yields a value slightly below -1.0 for the
                * most negative input, but everything seems happy with that,
                * hence the clamp is disabled for now.
                */
               if (0)
                  input = lp_build_max(&bld, input,
                                       lp_build_const_vec(gallivm, type, -1.0f));
            }
         }
         else if (format_desc->channel[chan].pure_integer) {
            /* Nothing to do */
         }
         else {
            /* FIXME */
            assert(0);
         }
         break;

      case UTIL_FORMAT_TYPE_FLOAT:
         if (type.floating) {
            assert(start == 0);
            assert(stop == 32);
            assert(type.width == 32);
            input = LLVMBuildBitCast(builder, input,
                                     lp_build_vec_type(gallivm, type), "");
         }
         else {
            /* FIXME */
            assert(0);
            input = lp_build_undef(gallivm, type);
         }
         break;

      case UTIL_FORMAT_TYPE_FIXED:
         if (type.floating) {
            double scale = 1.0 / ((1 << (format_desc->channel[chan].size/2)) - 1);
            LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale);
            input = LLVMBuildSIToFP(builder, input,
                                    lp_build_vec_type(gallivm, type), "");
            input = LLVMBuildFMul(builder, input, scale_val, "");
         }
         else {
            /* FIXME */
            assert(0);
            input = lp_build_undef(gallivm, type);
         }
         break;

      default:
         assert(0);
         input = lp_build_undef(gallivm, type);
         break;
      }

      inputs[chan] = input;
   }

   lp_build_format_swizzle_soa(format_desc, &bld, inputs, rgba_out);
}
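/*
 * Scalar sketches of the two normalized decodes generated above
 * (illustrative helpers only; assume <stdint.h> and 0 < width < 32):
 */
static float
unpack_unorm_scalar(uint32_t packed, unsigned start, unsigned width)
{
   /* align the LSB, zero the MSBs, scale into [0,1] */
   uint32_t bits = (packed >> start) & ((1u << width) - 1);
   return (float)bits / (float)((1u << width) - 1);
}

static float
unpack_snorm_scalar(uint32_t packed, unsigned start, unsigned width)
{
   /* left-align the sign bit, then arithmetic-shift the LSB back in place */
   int32_t bits = (int32_t)(packed << (32 - start - width)) >> (32 - width);
   /* as noted above, the most negative value lands slightly below -1.0 */
   return (float)bits / (float)((1 << (width - 1)) - 1);
}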
/**
 * Perform the occlusion test and increment the counter.
 * Test the depth mask: add the number of channels which have a non-zero mask
 * to the occlusion counter.  E.g. if maskvalue is {-1, -1, -1, -1} the
 * counter is incremented by 4.
 *
 * \param type holds the element type of the mask vector.
 * \param maskvalue is the depth test mask.
 * \param counter is a pointer to the uint32 counter.
 */
void
lp_build_occlusion_count(struct gallivm_state *gallivm,
                         struct lp_type type,
                         LLVMValueRef maskvalue,
                         LLVMValueRef counter)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMContextRef context = gallivm->context;
   LLVMValueRef countmask = lp_build_const_int_vec(gallivm, type, 1);
   LLVMValueRef count, newcount;

   assert(type.length <= 16);
   assert(type.floating);

   if(util_cpu_caps.has_sse && type.length == 4) {
      const char *movmskintr = "llvm.x86.sse.movmsk.ps";
      const char *popcntintr = "llvm.ctpop.i32";
      LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue,
                                           lp_build_vec_type(gallivm, type), "");
      bits = lp_build_intrinsic_unary(builder, movmskintr,
                                      LLVMInt32TypeInContext(context), bits);
      count = lp_build_intrinsic_unary(builder, popcntintr,
                                       LLVMInt32TypeInContext(context), bits);
   }
   else if(util_cpu_caps.has_avx && type.length == 8) {
      const char *movmskintr = "llvm.x86.avx.movmsk.ps.256";
      const char *popcntintr = "llvm.ctpop.i32";
      LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue,
                                           lp_build_vec_type(gallivm, type), "");
      bits = lp_build_intrinsic_unary(builder, movmskintr,
                                      LLVMInt32TypeInContext(context), bits);
      count = lp_build_intrinsic_unary(builder, popcntintr,
                                       LLVMInt32TypeInContext(context), bits);
   }
   else {
      unsigned i;
      LLVMValueRef countv = LLVMBuildAnd(builder, maskvalue, countmask, "countv");
      LLVMTypeRef counttype = LLVMIntTypeInContext(context, type.length * 8);
      LLVMTypeRef i8vntype = LLVMVectorType(LLVMInt8TypeInContext(context), type.length * 4);
      LLVMValueRef shufflev, countd;
      LLVMValueRef shuffles[16];
      const char *popcntintr = NULL;

      countv = LLVMBuildBitCast(builder, countv, i8vntype, "");

      for (i = 0; i < type.length; i++) {
         shuffles[i] = lp_build_const_int32(gallivm, 4*i);
      }

      shufflev = LLVMConstVector(shuffles, type.length);
      countd = LLVMBuildShuffleVector(builder, countv, LLVMGetUndef(i8vntype), shufflev, "");
      countd = LLVMBuildBitCast(builder, countd, counttype, "countd");

      /*
       * XXX FIXME
       * this is bad on cpus without popcount (on x86 supported by Intel
       * Nehalem, AMD Barcelona, and up - not tied to sse42).
       * Would be much faster to just sum the 4 elements of the vector with
       * some horizontal add (shuffle/add/shuffle/add after the initial and).
       */
      switch (type.length) {
      case 4:
         popcntintr = "llvm.ctpop.i32";
         break;
      case 8:
         popcntintr = "llvm.ctpop.i64";
         break;
      case 16:
         popcntintr = "llvm.ctpop.i128";
         break;
      default:
         assert(0);
      }
      count = lp_build_intrinsic_unary(builder, popcntintr, counttype, countd);

      if (type.length > 4) {
         count = LLVMBuildTrunc(builder, count, LLVMIntTypeInContext(context, 32), "");
      }
   }
   newcount = LLVMBuildLoad(builder, counter, "origcount");
   newcount = LLVMBuildAdd(builder, newcount, count, "newcount");
   LLVMBuildStore(builder, newcount, counter);
}
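/*
 * The fallback path above is equivalent to this scalar sketch: every live
 * lane of the mask is ~0, so ANDing with 1 and summing yields the number of
 * live fragments (illustrative helper, assumes <stdint.h>):
 */
static uint32_t
count_live_lanes_scalar(const int32_t *mask, unsigned length)
{
   uint32_t count = 0;
   unsigned i;
   for (i = 0; i < length; i++)
      count += mask[i] & 1;
   return count;
}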
static INLINE void
yuv_to_rgb_soa(struct gallivm_state *gallivm,
               unsigned n,
               LLVMValueRef y, LLVMValueRef u, LLVMValueRef v,
               LLVMValueRef *r, LLVMValueRef *g, LLVMValueRef *b)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type type;
   struct lp_build_context bld;

   LLVMValueRef c0;
   LLVMValueRef c8;
   LLVMValueRef c16;
   LLVMValueRef c128;
   LLVMValueRef c255;

   LLVMValueRef cy;
   LLVMValueRef cug;
   LLVMValueRef cub;
   LLVMValueRef cvr;
   LLVMValueRef cvg;

   memset(&type, 0, sizeof type);
   type.sign = TRUE;
   type.width = 32;
   type.length = n;

   lp_build_context_init(&bld, gallivm, type);

   assert(lp_check_value(type, y));
   assert(lp_check_value(type, u));
   assert(lp_check_value(type, v));

   /*
    * Constants
    */

   c0   = lp_build_const_int_vec(gallivm, type, 0);
   c8   = lp_build_const_int_vec(gallivm, type, 8);
   c16  = lp_build_const_int_vec(gallivm, type, 16);
   c128 = lp_build_const_int_vec(gallivm, type, 128);
   c255 = lp_build_const_int_vec(gallivm, type, 255);

   cy  = lp_build_const_int_vec(gallivm, type, 298);
   cug = lp_build_const_int_vec(gallivm, type, -100);
   cub = lp_build_const_int_vec(gallivm, type, 516);
   cvr = lp_build_const_int_vec(gallivm, type, 409);
   cvg = lp_build_const_int_vec(gallivm, type, -208);

   /*
    * y -= 16;
    * u -= 128;
    * v -= 128;
    */

   y = LLVMBuildSub(builder, y, c16, "");
   u = LLVMBuildSub(builder, u, c128, "");
   v = LLVMBuildSub(builder, v, c128, "");

   /*
    * Using the offset y, u, v from above:
    *
    * r = 298 * y + 409 * v + 128;
    * g = 298 * y - 100 * u - 208 * v + 128;
    * b = 298 * y + 516 * u + 128;
    */

   y = LLVMBuildMul(builder, y, cy, "");
   y = LLVMBuildAdd(builder, y, c128, "");

   *r = LLVMBuildMul(builder, v, cvr, "");
   *g = LLVMBuildAdd(builder,
                     LLVMBuildMul(builder, u, cug, ""),
                     LLVMBuildMul(builder, v, cvg, ""),
                     "");
   *b = LLVMBuildMul(builder, u, cub, "");

   *r = LLVMBuildAdd(builder, *r, y, "");
   *g = LLVMBuildAdd(builder, *g, y, "");
   *b = LLVMBuildAdd(builder, *b, y, "");

   /*
    * r >>= 8;
    * g >>= 8;
    * b >>= 8;
    */

   *r = LLVMBuildAShr(builder, *r, c8, "r");
   *g = LLVMBuildAShr(builder, *g, c8, "g");
   *b = LLVMBuildAShr(builder, *b, c8, "b");

   /*
    * Clamp
    */

   *r = lp_build_clamp(&bld, *r, c0, c255);
   *g = lp_build_clamp(&bld, *g, c0, c255);
   *b = lp_build_clamp(&bld, *b, c0, c255);
}
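/*
 * Scalar reference for the fixed-point BT.601 conversion above
 * (hypothetical helper; mirrors the generated code one pixel at a time):
 */
static void
yuv_to_rgb_scalar(int y, int u, int v, int *r, int *g, int *b)
{
   y -= 16;
   u -= 128;
   v -= 128;

   y = 298 * y + 128;   /* the +128 rounds the >> 8 below */

   *r = (y + 409 * v) >> 8;
   *g = (y - 100 * u - 208 * v) >> 8;
   *b = (y + 516 * u) >> 8;

   /* clamp to [0, 255] */
   *r = *r < 0 ? 0 : *r > 255 ? 255 : *r;
   *g = *g < 0 ? 0 : *g > 255 ? 255 : *g;
   *b = *b < 0 ? 0 : *b > 255 ? 255 : *b;
}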
/**
 * Generate code for performing depth and/or stencil tests.
 * We operate on a vector of values (typically n 2x2 quads).
 *
 * \param depth the depth test state
 * \param stencil the front/back stencil state
 * \param z_src_type the data type of the fragment depth/stencil values
 * \param format_desc description of the depth/stencil surface
 * \param mask the alive/dead pixel mask for the quad (vector)
 * \param stencil_refs the front/back stencil ref values (scalar)
 * \param z_src the incoming depth/stencil values (n 2x2 quad values, float32)
 * \param z_fb the depth values currently in the framebuffer
 * \param s_fb the stencil values currently in the framebuffer
 * \param face contains boolean value indicating front/back facing polygon
 * \param z_value returns the merged z/stencil value to be stored
 * \param s_value returns the stencil value to be stored
 * \param do_branch whether to branch out early when the mask becomes all-zero
 */
void
lp_build_depth_stencil_test(struct gallivm_state *gallivm,
                            const struct pipe_depth_state *depth,
                            const struct pipe_stencil_state stencil[2],
                            struct lp_type z_src_type,
                            const struct util_format_description *format_desc,
                            struct lp_build_mask_context *mask,
                            LLVMValueRef stencil_refs[2],
                            LLVMValueRef z_src,
                            LLVMValueRef z_fb,
                            LLVMValueRef s_fb,
                            LLVMValueRef face,
                            LLVMValueRef *z_value,
                            LLVMValueRef *s_value,
                            boolean do_branch)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type z_type;
   struct lp_build_context z_bld;
   struct lp_build_context s_bld;
   struct lp_type s_type;
   unsigned z_shift = 0, z_width = 0, z_mask = 0;
   LLVMValueRef z_dst = NULL;
   LLVMValueRef stencil_vals = NULL;
   LLVMValueRef z_bitmask = NULL, stencil_shift = NULL;
   LLVMValueRef z_pass = NULL, s_pass_mask = NULL;
   LLVMValueRef orig_mask = lp_build_mask_value(mask);
   LLVMValueRef front_facing = NULL;
   boolean have_z, have_s;

   /*
    * Depths are expected to be between 0 and 1, even if they are stored in
    * floats. Setting these bits here will ensure that the lp_build_conv() call
    * below won't try to unnecessarily clamp the incoming values.
    */
   if(z_src_type.floating) {
      z_src_type.sign = FALSE;
      z_src_type.norm = TRUE;
   }
   else {
      assert(!z_src_type.sign);
      assert(z_src_type.norm);
   }

   /* Pick the type matching the depth-stencil format. */
   z_type = lp_depth_type(format_desc, z_src_type.length);

   /* Pick the intermediate type for depth operations. */
   z_type.width = z_src_type.width;
   assert(z_type.length == z_src_type.length);

   /* FIXME: for non-float depth/stencil might generate better code
    * if we'd always split it up to use 128bit operations.
    * For stencil we'd almost certainly want to pack to 8xi16 values,
    * for z just run twice.
    */

   /* Sanity checking */
   {
      const unsigned z_swizzle = format_desc->swizzle[0];
      const unsigned s_swizzle = format_desc->swizzle[1];

      assert(z_swizzle != UTIL_FORMAT_SWIZZLE_NONE ||
             s_swizzle != UTIL_FORMAT_SWIZZLE_NONE);

      assert(depth->enabled || stencil[0].enabled);

      assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
      assert(format_desc->block.width == 1);
      assert(format_desc->block.height == 1);

      if (stencil[0].enabled) {
         assert(s_swizzle < 4);
         assert(format_desc->channel[s_swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED);
         assert(format_desc->channel[s_swizzle].pure_integer);
         assert(!format_desc->channel[s_swizzle].normalized);
         assert(format_desc->channel[s_swizzle].size == 8);
      }

      if (depth->enabled) {
         assert(z_swizzle < 4);
         if (z_type.floating) {
            assert(z_swizzle == 0);
            assert(format_desc->channel[z_swizzle].type ==
                   UTIL_FORMAT_TYPE_FLOAT);
            assert(format_desc->channel[z_swizzle].size == 32);
         }
         else {
            assert(format_desc->channel[z_swizzle].type ==
                   UTIL_FORMAT_TYPE_UNSIGNED);
            assert(format_desc->channel[z_swizzle].normalized);
            assert(!z_type.fixed);
         }
      }
   }

   /* Setup build context for Z vals */
   lp_build_context_init(&z_bld, gallivm, z_type);

   /* Setup build context for stencil vals */
   s_type = lp_int_type(z_type);
   lp_build_context_init(&s_bld, gallivm, s_type);

   /* Compute and apply the Z/stencil bitmasks and shifts.
    */
   {
      unsigned s_shift, s_mask;

      z_dst = z_fb;
      stencil_vals = s_fb;

      have_z = get_z_shift_and_mask(format_desc, &z_shift, &z_width, &z_mask);
      have_s = get_s_shift_and_mask(format_desc, &s_shift, &s_mask);

      if (have_z) {
         if (z_mask != 0xffffffff) {
            z_bitmask = lp_build_const_int_vec(gallivm, z_type, z_mask);
         }

         /*
          * Align the framebuffer Z's LSB to the right.
          */
         if (z_shift) {
            LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_type, z_shift);
            z_dst = LLVMBuildLShr(builder, z_dst, shift, "z_dst");
         } else if (z_bitmask) {
            z_dst = LLVMBuildAnd(builder, z_dst, z_bitmask, "z_dst");
         } else {
            lp_build_name(z_dst, "z_dst");
         }
      }

      if (have_s) {
         if (s_shift) {
            LLVMValueRef shift = lp_build_const_int_vec(gallivm, s_type, s_shift);
            stencil_vals = LLVMBuildLShr(builder, stencil_vals, shift, "");
            stencil_shift = shift;  /* used below */
         }

         if (s_mask != 0xffffffff) {
            LLVMValueRef mask = lp_build_const_int_vec(gallivm, s_type, s_mask);
            stencil_vals = LLVMBuildAnd(builder, stencil_vals, mask, "");
         }

         lp_build_name(stencil_vals, "s_dst");
      }
   }

   if (stencil[0].enabled) {

      if (face) {
         LLVMValueRef zero = lp_build_const_int32(gallivm, 0);

         /* front_facing = face != 0 ? ~0 : 0 */
         front_facing = LLVMBuildICmp(builder, LLVMIntNE, face, zero, "");
         front_facing = LLVMBuildSExt(builder, front_facing,
                                      LLVMIntTypeInContext(gallivm->context,
                                             s_bld.type.length*s_bld.type.width),
                                      "");
         front_facing = LLVMBuildBitCast(builder, front_facing,
                                         s_bld.int_vec_type, "");
      }

      /* convert scalar stencil refs into vectors */
      stencil_refs[0] = lp_build_broadcast_scalar(&s_bld, stencil_refs[0]);
      stencil_refs[1] = lp_build_broadcast_scalar(&s_bld, stencil_refs[1]);

      s_pass_mask = lp_build_stencil_test(&s_bld, stencil,
                                          stencil_refs, stencil_vals,
                                          front_facing);

      /* apply stencil-fail operator */
      {
         LLVMValueRef s_fail_mask = lp_build_andnot(&s_bld, orig_mask, s_pass_mask);
         stencil_vals = lp_build_stencil_op(&s_bld, stencil, S_FAIL_OP,
                                            stencil_refs, stencil_vals,
                                            s_fail_mask, front_facing);
      }
   }

   if (depth->enabled) {
      /*
       * Convert fragment Z to the desired type, aligning the LSB to the right.
       */

      assert(z_type.width == z_src_type.width);
      assert(z_type.length == z_src_type.length);
      assert(lp_check_value(z_src_type, z_src));
      if (z_src_type.floating) {
         /*
          * Convert from floating point values
          */

         if (!z_type.floating) {
            z_src = lp_build_clamped_float_to_unsigned_norm(gallivm,
                                                            z_src_type,
                                                            z_width,
                                                            z_src);
         }
      }
      else {
         /*
          * Convert from unsigned normalized values.
          */

         assert(!z_src_type.sign);
         assert(!z_src_type.fixed);
         assert(z_src_type.norm);
         assert(!z_type.floating);
         if (z_src_type.width > z_width) {
            LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_src_type,
                                                        z_src_type.width - z_width);
            z_src = LLVMBuildLShr(builder, z_src, shift, "");
         }
      }
      assert(lp_check_value(z_type, z_src));

      lp_build_name(z_src, "z_src");

      /* compare src Z to dst Z, returning 'pass' mask */
      z_pass = lp_build_cmp(&z_bld, depth->func, z_src, z_dst);

      if (!stencil[0].enabled) {
         /* We can potentially skip all remaining operations here, but only
          * if stencil is disabled, because otherwise we still need to update
          * the stencil buffer values.  We don't need to update the Z buffer
          * values.
          */
         lp_build_mask_update(mask, z_pass);

         if (do_branch) {
            lp_build_mask_check(mask);
            do_branch = FALSE;
         }
      }

      if (depth->writemask) {
         LLVMValueRef zselectmask;

         /* mask off bits that failed Z test */
         zselectmask = LLVMBuildAnd(builder, orig_mask, z_pass, "");

         /* mask off bits that failed stencil test */
         if (s_pass_mask) {
            zselectmask = LLVMBuildAnd(builder, zselectmask, s_pass_mask, "");
         }

         /* Mix the old and new Z buffer values.
          * z_dst[i] = zselectmask[i] ? z_src[i] : z_dst[i]
          */
         z_dst = lp_build_select(&z_bld, zselectmask, z_src, z_dst);
      }

      if (stencil[0].enabled) {
         /* update stencil buffer values according to z pass/fail result */
         LLVMValueRef z_fail_mask, z_pass_mask;

         /* apply Z-fail operator */
         z_fail_mask = lp_build_andnot(&s_bld, orig_mask, z_pass);
         stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_FAIL_OP,
                                            stencil_refs, stencil_vals,
                                            z_fail_mask, front_facing);

         /* apply Z-pass operator */
         z_pass_mask = LLVMBuildAnd(builder, orig_mask, z_pass, "");
         stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_PASS_OP,
                                            stencil_refs, stencil_vals,
                                            z_pass_mask, front_facing);
      }
   }
   else {
      /* No depth test: apply Z-pass operator to stencil buffer values which
       * passed the stencil test.
       */
      s_pass_mask = LLVMBuildAnd(builder, orig_mask, s_pass_mask, "");
      stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_PASS_OP,
                                         stencil_refs, stencil_vals,
                                         s_pass_mask, front_facing);
   }

   /* Put Z and stencil bits in the right place */
   if (have_z && z_shift) {
      LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_type, z_shift);
      z_dst = LLVMBuildShl(builder, z_dst, shift, "");
   }

   if (stencil_vals && stencil_shift)
      stencil_vals = LLVMBuildShl(builder, stencil_vals,
                                  stencil_shift, "");

   /* Finally, merge the z/stencil values */
   if (format_desc->block.bits <= 32) {
      if (have_z && have_s)
         *z_value = LLVMBuildOr(builder, z_dst, stencil_vals, "");
      else if (have_z)
         *z_value = z_dst;
      else
         *z_value = stencil_vals;
      *s_value = *z_value;
   }
   else {
      *z_value = z_dst;
      *s_value = stencil_vals;
   }

   if (s_pass_mask)
      lp_build_mask_update(mask, s_pass_mask);

   if (depth->enabled && stencil[0].enabled)
      lp_build_mask_update(mask, z_pass);
}
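/*
 * Scalar sketch of the fragment-Z conversion performed above for an integer
 * Z buffer (illustrative only; z_width is e.g. 24 for Z24 formats and must
 * be below 32 here; lp_build_clamped_float_to_unsigned_norm is assumed to
 * behave like the float branch below):
 */
static uint32_t
convert_z_scalar(float z, unsigned z_width)
{
   /* clamp to [0,1], then scale to the integer range and round */
   if (z < 0.0f)
      z = 0.0f;
   if (z > 1.0f)
      z = 1.0f;
   return (uint32_t)(z * (float)((1u << z_width) - 1) + 0.5f);
}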