/**
 * Return (a & ~b)
 */
LLVMValueRef
lp_build_andnot(struct lp_build_context *bld,
                LLVMValueRef a,
                LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   /* can't do bitwise ops on floating-point values */
   if (type.floating) {
      a = LLVMBuildBitCast(builder, a, bld->int_vec_type, "");
      b = LLVMBuildBitCast(builder, b, bld->int_vec_type, "");
   }

   res = LLVMBuildNot(builder, b, "");
   res = LLVMBuildAnd(builder, a, res, "");

   if (type.floating) {
      res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
   }

   return res;
}
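/*
 * Illustrative usage sketch (not from the original source; the names here
 * are hypothetical): andnot is the natural way to derive a "fail" mask from
 * a live-pixel mask and a "pass" mask, as the depth/stencil test code
 * further below does.
 */
#if 0
   /* pixels that are still alive but failed the test */
   LLVMValueRef fail_mask = lp_build_andnot(&bld, live_mask, pass_mask);
#endif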
/**
 * Extract Y, U, V channels from packed YUYV.
 * @param packed is a <n x i32> vector with the packed YUYV blocks
 * @param i is a <n x i32> vector with the x pixel coordinate (0 or 1)
 */
static void
yuyv_to_yuv_soa(struct gallivm_state *gallivm,
                unsigned n,
                LLVMValueRef packed,
                LLVMValueRef i,
                LLVMValueRef *y,
                LLVMValueRef *u,
                LLVMValueRef *v)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type type;
   LLVMValueRef mask;

   memset(&type, 0, sizeof type);
   type.width = 32;
   type.length = n;

   assert(lp_check_value(type, packed));
   assert(lp_check_value(type, i));

   /*
    * y = (yuyv >> 16*i) & 0xff
    * u = (yuyv >>  8  ) & 0xff
    * v = (yuyv >> 24  ) & 0xff
    */

#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
   /*
    * Avoid shift with per-element count.
    * No support on x86, gets translated to roughly 5 instructions
    * per element. Didn't measure performance but cuts shader size
    * by quite a bit (less difference if cpu has no sse4.1 support).
    */
   if (util_cpu_caps.has_sse2 && n == 4) {
      LLVMValueRef sel, tmp;
      struct lp_build_context bld32;

      lp_build_context_init(&bld32, gallivm, type);

      tmp = LLVMBuildLShr(builder, packed,
                          lp_build_const_int_vec(gallivm, type, 16), "");
      sel = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL, i,
                             lp_build_const_int_vec(gallivm, type, 0));
      *y = lp_build_select(&bld32, sel, packed, tmp);
   } else
#endif
   {
      LLVMValueRef shift;
      shift = LLVMBuildMul(builder, i,
                           lp_build_const_int_vec(gallivm, type, 16), "");
      *y = LLVMBuildLShr(builder, packed, shift, "");
   }

   *u = LLVMBuildLShr(builder, packed,
                      lp_build_const_int_vec(gallivm, type, 8), "");
   *v = LLVMBuildLShr(builder, packed,
                      lp_build_const_int_vec(gallivm, type, 24), "");

   mask = lp_build_const_int_vec(gallivm, type, 0xff);

   *y = LLVMBuildAnd(builder, *y, mask, "y");
   *u = LLVMBuildAnd(builder, *u, mask, "u");
   *v = LLVMBuildAnd(builder, *v, mask, "v");
}
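/*
 * Illustrative scalar reference (an assumption for clarity, not gallivm
 * code): what the vector code above computes for one little-endian packed
 * YUYV dword, whose byte layout is Y0 U Y1 V.
 */
#if 0
static inline void
yuyv_unpack_scalar(uint32_t yuyv, unsigned i,  /* i = x & 1 */
                   uint8_t *y, uint8_t *u, uint8_t *v)
{
   *y = (yuyv >> (16 * i)) & 0xff;   /* Y0 from byte 0, Y1 from byte 2 */
   *u = (yuyv >>  8      ) & 0xff;
   *v = (yuyv >> 24      ) & 0xff;
}
#endif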
/**
 * Return mask ? a : b;
 *
 * mask is a TGSI_WRITEMASK_xxx.
 */
LLVMValueRef
lp_build_select_aos(struct lp_build_context *bld,
                    unsigned mask,
                    LLVMValueRef a,
                    LLVMValueRef b,
                    unsigned num_channels)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   const unsigned n = type.length;
   unsigned i, j;

   assert((mask & ~0xf) == 0);
   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == b)
      return a;
   if((mask & 0xf) == 0xf)
      return a;
   if((mask & 0xf) == 0x0)
      return b;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   /*
    * There are two major ways of accomplishing this:
    * - with a shuffle
    * - with a select
    *
    * The flip between these is empirical and might need to be adjusted.
    */
   if (n <= 4) {
      /*
       * Shuffle.
       */
      LLVMTypeRef elem_type = LLVMInt32TypeInContext(bld->gallivm->context);
      LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];

      for(j = 0; j < n; j += num_channels)
         for(i = 0; i < num_channels; ++i)
            shuffles[j + i] = LLVMConstInt(elem_type,
                                           (mask & (1 << i) ? 0 : n) + j + i,
                                           0);

      return LLVMBuildShuffleVector(builder, a, b,
                                    LLVMConstVector(shuffles, n), "");
   }
   else {
      LLVMValueRef mask_vec = lp_build_const_mask_aos(bld->gallivm,
                                                      type, mask, num_channels);
      return lp_build_select(bld, mask_vec, a, b);
   }
}
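/*
 * Worked example (illustrative): with n == 4, num_channels == 4 and
 * mask == TGSI_WRITEMASK_XY (0x3), the loop above builds the shuffle
 * indices {0, 1, 6, 7}, i.e. x/y are taken from 'a' and z/w from 'b'.
 */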
/**
 * Shift left.
 */
LLVMValueRef
lp_build_shl(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(!type.floating);
   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   res = LLVMBuildShl(builder, a, b, "");

   return res;
}
static INLINE LLVMValueRef
lp_build_round_sse41(struct lp_build_context *bld,
                     LLVMValueRef a,
                     enum lp_build_round_sse41_mode mode)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef vec_type = lp_build_vec_type(type);
   const char *intrinsic;

   assert(type.floating);
   assert(type.width*type.length == 128);
   assert(lp_check_value(type, a));
   assert(util_cpu_caps.has_sse4_1);

   switch(type.width) {
   case 32:
      intrinsic = "llvm.x86.sse41.round.ps";
      break;
   case 64:
      intrinsic = "llvm.x86.sse41.round.pd";
      break;
   default:
      assert(0);
      return bld->undef;
   }

   return lp_build_intrinsic_binary(bld->builder, intrinsic, vec_type, a,
                                    LLVMConstInt(LLVMInt32Type(), mode, 0));
}
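/*
 * For reference (an assumption about the enum's values, matching the
 * ROUNDPS/ROUNDPD immediate): 0 = round to nearest even, 1 = floor,
 * 2 = ceil, 3 = truncate toward zero.  A minimal usage sketch:
 */
#if 0
   /* round a 4 x float vector to the nearest integer, still as floats */
   res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
#endif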
/**
 * Combined extract and broadcast (mere shuffle in most cases)
 */
LLVMValueRef
lp_build_extract_broadcast(struct gallivm_state *gallivm,
                           struct lp_type src_type,
                           struct lp_type dst_type,
                           LLVMValueRef vector,
                           LLVMValueRef index)
{
   LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
   LLVMValueRef res;

   assert(src_type.floating == dst_type.floating);
   assert(src_type.width    == dst_type.width);

   assert(lp_check_value(src_type, vector));
   assert(LLVMTypeOf(index) == i32t);

   if (src_type.length == 1) {
      if (dst_type.length == 1) {
         /*
          * Trivial scalar -> scalar.
          */
         res = vector;
      }
      else {
         /*
          * Broadcast scalar -> vector.
          */
         res = lp_build_broadcast(gallivm,
                                  lp_build_vec_type(gallivm, dst_type),
                                  vector);
      }
   }
   else {
      if (dst_type.length > 1) {
         /*
          * shuffle - result can be of different length.
          */
         LLVMValueRef shuffle;
         shuffle = lp_build_broadcast(gallivm,
                                      LLVMVectorType(i32t, dst_type.length),
                                      index);
         res = LLVMBuildShuffleVector(gallivm->builder, vector,
                                      LLVMGetUndef(lp_build_vec_type(gallivm, src_type)),
                                      shuffle, "");
      }
      else {
         /*
          * Trivial extract scalar from vector.
          */
         res = LLVMBuildExtractElement(gallivm->builder, vector, index, "");
      }
   }

   return res;
}
static LLVMValueRef
rgb_to_rgba_aos(struct gallivm_state *gallivm,
                unsigned n,
                LLVMValueRef r, LLVMValueRef g, LLVMValueRef b)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type type;
   LLVMValueRef a;
   LLVMValueRef rgba;

   memset(&type, 0, sizeof type);
   type.sign = TRUE;
   type.width = 32;
   type.length = n;

   assert(lp_check_value(type, r));
   assert(lp_check_value(type, g));
   assert(lp_check_value(type, b));

   /*
    * Make a 4 x unorm8 vector
    */

#ifdef PIPE_ARCH_LITTLE_ENDIAN
   r = r;
   g = LLVMBuildShl(builder, g, lp_build_const_int_vec(gallivm, type, 8), "");
   b = LLVMBuildShl(builder, b, lp_build_const_int_vec(gallivm, type, 16), "");
   a = lp_build_const_int_vec(gallivm, type, 0xff000000);
#else
   r = LLVMBuildShl(builder, r, lp_build_const_int_vec(gallivm, type, 24), "");
   g = LLVMBuildShl(builder, g, lp_build_const_int_vec(gallivm, type, 16), "");
   b = LLVMBuildShl(builder, b, lp_build_const_int_vec(gallivm, type, 8), "");
   a = lp_build_const_int_vec(gallivm, type, 0x000000ff);
#endif

   rgba = r;
   rgba = LLVMBuildOr(builder, rgba, g, "");
   rgba = LLVMBuildOr(builder, rgba, b, "");
   rgba = LLVMBuildOr(builder, rgba, a, "");

   rgba = LLVMBuildBitCast(builder, rgba,
                           LLVMVectorType(LLVMInt8TypeInContext(gallivm->context), 4*n),
                           "");

   return rgba;
}
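/*
 * Illustrative scalar reference (an assumption, little-endian case only):
 * the per-pixel packing performed above, with alpha forced to 255.
 */
#if 0
static inline uint32_t
pack_rgba_unorm8(uint32_t r, uint32_t g, uint32_t b)
{
   return r | (g << 8) | (b << 16) | 0xff000000u;
}
#endif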
/**
 * Shift right.
 * Arithmetic (sign-preserving) for signed types, logical otherwise.
 */
LLVMValueRef
lp_build_shr(struct lp_build_context *bld,
             LLVMValueRef a,
             LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(!type.floating);
   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (type.sign) {
      res = LLVMBuildAShr(builder, a, b, "");
   }
   else {
      res = LLVMBuildLShr(builder, a, b, "");
   }

   return res;
}
/**
 * Return (mask & a) | (~mask & b);
 */
LLVMValueRef
lp_build_select_bitwise(struct lp_build_context *bld,
                        LLVMValueRef mask,
                        LLVMValueRef a,
                        LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (a == b) {
      return a;
   }

   if(type.floating) {
      LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
      a = LLVMBuildBitCast(builder, a, int_vec_type, "");
      b = LLVMBuildBitCast(builder, b, int_vec_type, "");
   }

   a = LLVMBuildAnd(builder, a, mask, "");

   /* This often gets translated to PANDN, but sometimes the NOT is
    * pre-computed and stored in another constant. The best strategy depends
    * on available registers, so it is not a big deal -- hopefully LLVM makes
    * the right decision based on the rest of the program.
    */
   b = LLVMBuildAnd(builder, b, LLVMBuildNot(builder, mask, ""), "");

   res = LLVMBuildOr(builder, a, b, "");

   if(type.floating) {
      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
      res = LLVMBuildBitCast(builder, res, vec_type, "");
   }

   return res;
}
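/*
 * Illustrative scalar reference (not part of gallivm): the per-element
 * semantics of lp_build_select_bitwise().  Each mask element must be
 * 0 or ~0 for the result to be meaningful.
 */
#if 0
static inline uint32_t
select_bitwise_scalar(uint32_t mask, uint32_t a, uint32_t b)
{
   return (mask & a) | (~mask & b);
}
#endif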
/**
 * Convert float[] to int[], truncating toward zero (the behavior of a C
 * cast, and what LLVM's fptosi provides).
 */
LLVMValueRef
lp_build_itrunc(struct lp_build_context *bld,
                LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);

   assert(type.floating);
   assert(lp_check_value(type, a));

   return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
}
/**
 * Codegen equivalent for u_minify().
 * Return max(1, base_size >> level);
 */
LLVMValueRef
lp_build_minify(struct lp_build_context *bld,
                LLVMValueRef base_size,
                LLVMValueRef level)
{
   LLVMBuilderRef builder = bld->gallivm->builder;

   assert(lp_check_value(bld->type, base_size));
   assert(lp_check_value(bld->type, level));

   if (level == bld->zero) {
      /* if we're using mipmap level zero, no minification is needed */
      return base_size;
   }
   else {
      LLVMValueRef size =
         LLVMBuildLShr(builder, base_size, level, "minify");
      assert(bld->type.sign);
      size = lp_build_max(bld, size, bld->one);
      return size;
   }
}
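/*
 * For reference, the scalar helper this mirrors (shown as an illustration;
 * u_minify() lives in the gallium util headers):
 */
#if 0
static inline unsigned
u_minify(unsigned size, unsigned level)
{
   return MAX2(size >> level, 1);
}
#endif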
/**
 * Extract Y, U, V channels from packed UYVY.
 * @param packed is a <n x i32> vector with the packed UYVY blocks
 * @param i is a <n x i32> vector with the x pixel coordinate (0 or 1)
 */
static void
uyvy_to_yuv_soa(LLVMBuilderRef builder,
                unsigned n,
                LLVMValueRef packed,
                LLVMValueRef i,
                LLVMValueRef *y,
                LLVMValueRef *u,
                LLVMValueRef *v)
{
   struct lp_type type;
   LLVMValueRef shift, mask;

   memset(&type, 0, sizeof type);
   type.width = 32;
   type.length = n;

   assert(lp_check_value(type, packed));
   assert(lp_check_value(type, i));

   /*
    * y = (uyvy >> (16*i + 8)) & 0xff
    * u = (uyvy              ) & 0xff
    * v = (uyvy >> 16        ) & 0xff
    */

   shift = LLVMBuildMul(builder, i, lp_build_const_int_vec(type, 16), "");
   shift = LLVMBuildAdd(builder, shift, lp_build_const_int_vec(type, 8), "");
   *y = LLVMBuildLShr(builder, packed, shift, "");
   *u = packed;
   *v = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(type, 16), "");

   mask = lp_build_const_int_vec(type, 0xff);

   *y = LLVMBuildAnd(builder, *y, mask, "y");
   *u = LLVMBuildAnd(builder, *u, mask, "u");
   *v = LLVMBuildAnd(builder, *v, mask, "v");
}
/**
 * Convert float[] to int[] with floor().
 */
LLVMValueRef
lp_build_ifloor(struct lp_build_context *bld,
                LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if(util_cpu_caps.has_sse4_1) {
      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
   }
   else {
      /* Take the sign bit and add it to 1 constant */
      LLVMTypeRef vec_type = lp_build_vec_type(type);
      unsigned mantissa = lp_mantissa(type);
      LLVMValueRef mask = lp_build_int_const_scalar(type,
                              (unsigned long long)1 << (type.width - 1));
      LLVMValueRef sign;
      LLVMValueRef offset;

      /* sign = a < 0 ? ~0 : 0 */
      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
      sign = LLVMBuildAShr(bld->builder, sign,
                           lp_build_int_const_scalar(type, type.width - 1), "");
      lp_build_name(sign, "floor.sign");

      /* offset = -0.99999(9)f */
      offset = lp_build_const_scalar(type,
                   -(double)(((unsigned long long)1 << mantissa) - 1)/((unsigned long long)1 << mantissa));
      offset = LLVMConstBitCast(offset, int_vec_type);

      /* offset = a < 0 ? -0.99999(9)f : 0.0f */
      offset = LLVMBuildAnd(bld->builder, offset, sign, "");
      offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "");
      lp_build_name(offset, "floor.offset");

      res = LLVMBuildAdd(bld->builder, a, offset, "");
      lp_build_name(res, "floor.res");
   }

   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
   lp_build_name(res, "floor");

   return res;
}
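/*
 * Illustrative scalar model (an assumption, float case) of the non-SSE4.1
 * path above: bias negative values by an offset just under one, then let
 * fptosi's truncation round the sum down.
 */
#if 0
static inline int
ifloor_scalar(float a)
{
   /* largest float strictly less than 1.0: (2^23 - 1) / 2^23 */
   const float offset = (float)((1u << 23) - 1) / (float)(1u << 23);
   return (int)(a < 0.0f ? a - offset : a);
}
#endif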
/*
 * Combined log2 and brilinear lod computation.
 *
 * It is identical in all respects to calling lp_build_fast_log2() and
 * lp_build_brilinear_lod() above, but by combining we can compute the integer
 * and fractional part independently.
 */
static void
lp_build_brilinear_rho(struct lp_build_context *bld,
                       LLVMValueRef rho,
                       double factor,
                       LLVMValueRef *out_lod_ipart,
                       LLVMValueRef *out_lod_fpart)
{
   LLVMValueRef lod_ipart;
   LLVMValueRef lod_fpart;

   const double pre_factor = (2*factor - 0.5)/(M_SQRT2*factor);
   const double post_offset = 1 - 2*factor;

   assert(bld->type.floating);
   assert(lp_check_value(bld->type, rho));

   /*
    * The pre factor will make the intersections with the exact powers of two
    * happen precisely where we want them to be, which means that the integer
    * part will not need any post adjustments.
    */
   rho = lp_build_mul(bld, rho,
                      lp_build_const_vec(bld->gallivm, bld->type, pre_factor));

   /* ipart = ifloor(log2(rho)) */
   lod_ipart = lp_build_extract_exponent(bld, rho, 0);

   /* fpart = rho / 2**ipart */
   lod_fpart = lp_build_extract_mantissa(bld, rho);

   lod_fpart = lp_build_mul(bld, lod_fpart,
                            lp_build_const_vec(bld->gallivm, bld->type, factor));

   lod_fpart = lp_build_add(bld, lod_fpart,
                            lp_build_const_vec(bld->gallivm, bld->type, post_offset));

   /*
    * Like lp_build_brilinear_lod, it's not necessary to clamp lod_fpart since:
    * - the above expression will never produce numbers greater than one.
    * - the mip filtering branch is only taken if lod_fpart is positive
    */

   *out_lod_ipart = lod_ipart;
   *out_lod_fpart = lod_fpart;
}
/**
 * Return ceil(a), as float.
 */
LLVMValueRef
lp_build_ceil(struct lp_build_context *bld,
              LLVMValueRef a)
{
   const struct lp_type type = bld->type;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if(util_cpu_caps.has_sse4_1)
      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
   else {
      LLVMTypeRef vec_type = lp_build_vec_type(type);
      LLVMValueRef res;
      res = lp_build_iceil(bld, a);
      res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
      return res;
   }
}
/**
 * Convert float[] to int[] with ceil().
 * Note: only the SSE4.1 path is implemented here.
 */
LLVMValueRef
lp_build_iceil(struct lp_build_context *bld,
               LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if(util_cpu_caps.has_sse4_1) {
      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
   }
   else {
      assert(0);
      res = bld->undef;
   }

   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");

   return res;
}
/**
 * Convert float[] to int[] with round().
 */
LLVMValueRef
lp_build_iround(struct lp_build_context *bld,
                LLVMValueRef a)
{
   const struct lp_type type = bld->type;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
   LLVMValueRef res;

   assert(type.floating);
   assert(lp_check_value(type, a));

   if(util_cpu_caps.has_sse4_1) {
      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
   }
   else {
      LLVMTypeRef vec_type = lp_build_vec_type(type);
      LLVMValueRef mask = lp_build_int_const_scalar(type,
                              (unsigned long long)1 << (type.width - 1));
      LLVMValueRef sign;
      LLVMValueRef half;

      /* get sign bit */
      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
      sign = LLVMBuildAnd(bld->builder, sign, mask, "");

      /* sign * 0.5 */
      half = lp_build_const_scalar(type, 0.5);
      half = LLVMBuildBitCast(bld->builder, half, int_vec_type, "");
      half = LLVMBuildOr(bld->builder, sign, half, "");
      half = LLVMBuildBitCast(bld->builder, half, vec_type, "");

      res = LLVMBuildAdd(bld->builder, a, half, "");
   }

   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");

   return res;
}
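/*
 * Illustrative scalar model (an assumption) of the non-SSE4.1 path above:
 * round half away from zero by adding 0.5 with the sign bit copied from
 * the input, then truncate.
 */
#if 0
#include <math.h>

static inline int
iround_scalar(float a)
{
   return (int)(a + copysignf(0.5f, a));
}
#endif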
/**
 * Generate code for performing depth and/or stencil tests.
 * We operate on a vector of values (typically n 2x2 quads).
 *
 * \param depth  the depth test state
 * \param stencil  the front/back stencil state
 * \param z_src_type  the data type of the fragment depth/stencil values
 * \param format_desc  description of the depth/stencil surface
 * \param mask  the alive/dead pixel mask for the quad (vector)
 * \param stencil_refs  the front/back stencil ref values (scalar)
 * \param z_src  the incoming depth/stencil values (n 2x2 quad values, float32)
 * \param z_fb, s_fb  the depth/stencil values in the framebuffer
 * \param face  contains boolean value indicating front/back facing polygon
 */
void
lp_build_depth_stencil_test(struct gallivm_state *gallivm,
                            const struct pipe_depth_state *depth,
                            const struct pipe_stencil_state stencil[2],
                            struct lp_type z_src_type,
                            const struct util_format_description *format_desc,
                            struct lp_build_mask_context *mask,
                            LLVMValueRef stencil_refs[2],
                            LLVMValueRef z_src,
                            LLVMValueRef z_fb,
                            LLVMValueRef s_fb,
                            LLVMValueRef face,
                            LLVMValueRef *z_value,
                            LLVMValueRef *s_value,
                            boolean do_branch)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type z_type;
   struct lp_build_context z_bld;
   struct lp_build_context s_bld;
   struct lp_type s_type;
   unsigned z_shift = 0, z_width = 0, z_mask = 0;
   LLVMValueRef z_dst = NULL;
   LLVMValueRef stencil_vals = NULL;
   LLVMValueRef z_bitmask = NULL, stencil_shift = NULL;
   LLVMValueRef z_pass = NULL, s_pass_mask = NULL;
   LLVMValueRef orig_mask = lp_build_mask_value(mask);
   LLVMValueRef front_facing = NULL;
   boolean have_z, have_s;

   /*
    * Depths are expected to be between 0 and 1, even if they are stored in
    * floats. Setting these bits here will ensure that the lp_build_conv() call
    * below won't try to unnecessarily clamp the incoming values.
    */
   if(z_src_type.floating) {
      z_src_type.sign = FALSE;
      z_src_type.norm = TRUE;
   }
   else {
      assert(!z_src_type.sign);
      assert(z_src_type.norm);
   }

   /* Pick the type matching the depth-stencil format. */
   z_type = lp_depth_type(format_desc, z_src_type.length);

   /* Pick the intermediate type for depth operations. */
   z_type.width = z_src_type.width;
   assert(z_type.length == z_src_type.length);

   /* FIXME: for non-float depth/stencil might generate better code
    * if we'd always split it up to use 128bit operations.
    * For stencil we'd almost certainly want to pack to 8xi16 values,
    * for z just run twice.
    */

   /* Sanity checking */
   {
      const unsigned z_swizzle = format_desc->swizzle[0];
      const unsigned s_swizzle = format_desc->swizzle[1];

      assert(z_swizzle != UTIL_FORMAT_SWIZZLE_NONE ||
             s_swizzle != UTIL_FORMAT_SWIZZLE_NONE);

      assert(depth->enabled || stencil[0].enabled);

      assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
      assert(format_desc->block.width == 1);
      assert(format_desc->block.height == 1);

      if (stencil[0].enabled) {
         assert(s_swizzle < 4);
         assert(format_desc->channel[s_swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED);
         assert(format_desc->channel[s_swizzle].pure_integer);
         assert(!format_desc->channel[s_swizzle].normalized);
         assert(format_desc->channel[s_swizzle].size == 8);
      }

      if (depth->enabled) {
         assert(z_swizzle < 4);
         if (z_type.floating) {
            assert(z_swizzle == 0);
            assert(format_desc->channel[z_swizzle].type ==
                   UTIL_FORMAT_TYPE_FLOAT);
            assert(format_desc->channel[z_swizzle].size == 32);
         }
         else {
            assert(format_desc->channel[z_swizzle].type ==
                   UTIL_FORMAT_TYPE_UNSIGNED);
            assert(format_desc->channel[z_swizzle].normalized);
            assert(!z_type.fixed);
         }
      }
   }

   /* Setup build context for Z vals */
   lp_build_context_init(&z_bld, gallivm, z_type);

   /* Setup build context for stencil vals */
   s_type = lp_int_type(z_type);
   lp_build_context_init(&s_bld, gallivm, s_type);

   /* Compute and apply the Z/stencil bitmasks and shifts.
    */
   {
      unsigned s_shift, s_mask;

      z_dst = z_fb;
      stencil_vals = s_fb;

      have_z = get_z_shift_and_mask(format_desc, &z_shift, &z_width, &z_mask);
      have_s = get_s_shift_and_mask(format_desc, &s_shift, &s_mask);

      if (have_z) {
         if (z_mask != 0xffffffff) {
            z_bitmask = lp_build_const_int_vec(gallivm, z_type, z_mask);
         }

         /*
          * Align the framebuffer Z's LSB to the right.
          */
         if (z_shift) {
            LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_type, z_shift);
            z_dst = LLVMBuildLShr(builder, z_dst, shift, "z_dst");
         } else if (z_bitmask) {
            z_dst = LLVMBuildAnd(builder, z_dst, z_bitmask, "z_dst");
         } else {
            lp_build_name(z_dst, "z_dst");
         }
      }

      if (have_s) {
         if (s_shift) {
            LLVMValueRef shift = lp_build_const_int_vec(gallivm, s_type, s_shift);
            stencil_vals = LLVMBuildLShr(builder, stencil_vals, shift, "");
            stencil_shift = shift;  /* used below */
         }

         if (s_mask != 0xffffffff) {
            LLVMValueRef mask = lp_build_const_int_vec(gallivm, s_type, s_mask);
            stencil_vals = LLVMBuildAnd(builder, stencil_vals, mask, "");
         }

         lp_build_name(stencil_vals, "s_dst");
      }
   }

   if (stencil[0].enabled) {

      if (face) {
         LLVMValueRef zero = lp_build_const_int32(gallivm, 0);

         /* front_facing = face != 0 ? ~0 : 0 */
         front_facing = LLVMBuildICmp(builder, LLVMIntNE, face, zero, "");
         front_facing = LLVMBuildSExt(builder, front_facing,
                                      LLVMIntTypeInContext(gallivm->context,
                                             s_bld.type.length*s_bld.type.width),
                                      "");
         front_facing = LLVMBuildBitCast(builder, front_facing,
                                         s_bld.int_vec_type, "");
      }

      /* convert scalar stencil refs into vectors */
      stencil_refs[0] = lp_build_broadcast_scalar(&s_bld, stencil_refs[0]);
      stencil_refs[1] = lp_build_broadcast_scalar(&s_bld, stencil_refs[1]);

      s_pass_mask = lp_build_stencil_test(&s_bld, stencil,
                                          stencil_refs, stencil_vals,
                                          front_facing);

      /* apply stencil-fail operator */
      {
         LLVMValueRef s_fail_mask = lp_build_andnot(&s_bld, orig_mask, s_pass_mask);
         stencil_vals = lp_build_stencil_op(&s_bld, stencil, S_FAIL_OP,
                                            stencil_refs, stencil_vals,
                                            s_fail_mask, front_facing);
      }
   }

   if (depth->enabled) {
      /*
       * Convert fragment Z to the desired type, aligning the LSB to the right.
       */

      assert(z_type.width == z_src_type.width);
      assert(z_type.length == z_src_type.length);
      assert(lp_check_value(z_src_type, z_src));
      if (z_src_type.floating) {
         /*
          * Convert from floating point values
          */

         if (!z_type.floating) {
            z_src = lp_build_clamped_float_to_unsigned_norm(gallivm,
                                                            z_src_type,
                                                            z_width,
                                                            z_src);
         }
      } else {
         /*
          * Convert from unsigned normalized values.
          */

         assert(!z_src_type.sign);
         assert(!z_src_type.fixed);
         assert(z_src_type.norm);
         assert(!z_type.floating);
         if (z_src_type.width > z_width) {
            LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_src_type,
                                                        z_src_type.width - z_width);
            z_src = LLVMBuildLShr(builder, z_src, shift, "");
         }
      }
      assert(lp_check_value(z_type, z_src));

      lp_build_name(z_src, "z_src");

      /* compare src Z to dst Z, returning 'pass' mask */
      z_pass = lp_build_cmp(&z_bld, depth->func, z_src, z_dst);

      if (!stencil[0].enabled) {
         /* We can potentially skip all remaining operations here, but only
          * if stencil is disabled because we still need to update the stencil
          * buffer values.  Don't need to update Z buffer values.
          */
         lp_build_mask_update(mask, z_pass);

         if (do_branch) {
            lp_build_mask_check(mask);
            do_branch = FALSE;
         }
      }

      if (depth->writemask) {
         LLVMValueRef zselectmask;

         /* mask off bits that failed Z test */
         zselectmask = LLVMBuildAnd(builder, orig_mask, z_pass, "");

         /* mask off bits that failed stencil test */
         if (s_pass_mask) {
            zselectmask = LLVMBuildAnd(builder, zselectmask, s_pass_mask, "");
         }

         /* Mix the old and new Z buffer values.
          * z_dst[i] = zselectmask[i] ? z_src[i] : z_dst[i]
          */
         z_dst = lp_build_select(&z_bld, zselectmask, z_src, z_dst);
      }

      if (stencil[0].enabled) {
         /* update stencil buffer values according to z pass/fail result */
         LLVMValueRef z_fail_mask, z_pass_mask;

         /* apply Z-fail operator */
         z_fail_mask = lp_build_andnot(&s_bld, orig_mask, z_pass);
         stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_FAIL_OP,
                                            stencil_refs, stencil_vals,
                                            z_fail_mask, front_facing);

         /* apply Z-pass operator */
         z_pass_mask = LLVMBuildAnd(builder, orig_mask, z_pass, "");
         stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_PASS_OP,
                                            stencil_refs, stencil_vals,
                                            z_pass_mask, front_facing);
      }
   }
   else {
      /* No depth test: apply Z-pass operator to stencil buffer values which
       * passed the stencil test.
       */
      s_pass_mask = LLVMBuildAnd(builder, orig_mask, s_pass_mask, "");
      stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_PASS_OP,
                                         stencil_refs, stencil_vals,
                                         s_pass_mask, front_facing);
   }

   /* Put Z and stencil bits in the right place */
   if (have_z && z_shift) {
      LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_type, z_shift);
      z_dst = LLVMBuildShl(builder, z_dst, shift, "");
   }

   if (stencil_vals && stencil_shift)
      stencil_vals = LLVMBuildShl(builder, stencil_vals,
                                  stencil_shift, "");

   /* Finally, merge the z/stencil values */
   if (format_desc->block.bits <= 32) {
      if (have_z && have_s)
         *z_value = LLVMBuildOr(builder, z_dst, stencil_vals, "");
      else if (have_z)
         *z_value = z_dst;
      else
         *z_value = stencil_vals;
      *s_value = *z_value;
   }
   else {
      *z_value = z_dst;
      *s_value = stencil_vals;
   }

   if (s_pass_mask)
      lp_build_mask_update(mask, s_pass_mask);

   if (depth->enabled && stencil[0].enabled)
      lp_build_mask_update(mask, z_pass);
}
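/*
 * Worked example (illustrative, not from the original source): for a
 * PIPE_FORMAT_Z24_UNORM_S8_UINT surface, z_shift is 0, z_mask is 0xffffff
 * and the stencil shift is 24, so the merge step above amounts to:
 */
#if 0
   *z_value = (z_dst & 0xffffff) | (stencil_vals << 24);
#endif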
static INLINE void
yuv_to_rgb_soa(struct gallivm_state *gallivm,
               unsigned n,
               LLVMValueRef y, LLVMValueRef u, LLVMValueRef v,
               LLVMValueRef *r, LLVMValueRef *g, LLVMValueRef *b)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type type;
   struct lp_build_context bld;

   LLVMValueRef c0;
   LLVMValueRef c8;
   LLVMValueRef c16;
   LLVMValueRef c128;
   LLVMValueRef c255;

   LLVMValueRef cy;
   LLVMValueRef cug;
   LLVMValueRef cub;
   LLVMValueRef cvr;
   LLVMValueRef cvg;

   memset(&type, 0, sizeof type);
   type.sign = TRUE;
   type.width = 32;
   type.length = n;

   lp_build_context_init(&bld, gallivm, type);

   assert(lp_check_value(type, y));
   assert(lp_check_value(type, u));
   assert(lp_check_value(type, v));

   /*
    * Constants
    */

   c0   = lp_build_const_int_vec(gallivm, type, 0);
   c8   = lp_build_const_int_vec(gallivm, type, 8);
   c16  = lp_build_const_int_vec(gallivm, type, 16);
   c128 = lp_build_const_int_vec(gallivm, type, 128);
   c255 = lp_build_const_int_vec(gallivm, type, 255);

   cy  = lp_build_const_int_vec(gallivm, type, 298);
   cug = lp_build_const_int_vec(gallivm, type, -100);
   cub = lp_build_const_int_vec(gallivm, type, 516);
   cvr = lp_build_const_int_vec(gallivm, type, 409);
   cvg = lp_build_const_int_vec(gallivm, type, -208);

   /*
    *  y -= 16;
    *  u -= 128;
    *  v -= 128;
    */

   y = LLVMBuildSub(builder, y, c16, "");
   u = LLVMBuildSub(builder, u, c128, "");
   v = LLVMBuildSub(builder, v, c128, "");

   /*
    * r = 298 * _y            + 409 * _v + 128;
    * g = 298 * _y - 100 * _u - 208 * _v + 128;
    * b = 298 * _y + 516 * _u            + 128;
    */

   y = LLVMBuildMul(builder, y, cy, "");
   y = LLVMBuildAdd(builder, y, c128, "");

   *r = LLVMBuildMul(builder, v, cvr, "");
   *g = LLVMBuildAdd(builder,
                     LLVMBuildMul(builder, u, cug, ""),
                     LLVMBuildMul(builder, v, cvg, ""),
                     "");
   *b = LLVMBuildMul(builder, u, cub, "");

   *r = LLVMBuildAdd(builder, *r, y, "");
   *g = LLVMBuildAdd(builder, *g, y, "");
   *b = LLVMBuildAdd(builder, *b, y, "");

   /*
    * r >>= 8;
    * g >>= 8;
    * b >>= 8;
    */

   *r = LLVMBuildAShr(builder, *r, c8, "r");
   *g = LLVMBuildAShr(builder, *g, c8, "g");
   *b = LLVMBuildAShr(builder, *b, c8, "b");

   /*
    * Clamp
    */

   *r = lp_build_clamp(&bld, *r, c0, c255);
   *g = lp_build_clamp(&bld, *g, c0, c255);
   *b = lp_build_clamp(&bld, *b, c0, c255);
}
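/*
 * Illustrative scalar reference (an assumption): the fixed-point ITU-R
 * BT.601 video-range conversion implemented above.  CLAMP() is assumed to
 * behave like the gallium util macro of the same name.
 */
#if 0
static inline void
yuv_to_rgb_scalar(int y, int u, int v, int *r, int *g, int *b)
{
   const int _y = y - 16, _u = u - 128, _v = v - 128;
   *r = CLAMP((298 * _y            + 409 * _v + 128) >> 8, 0, 255);
   *g = CLAMP((298 * _y - 100 * _u - 208 * _v + 128) >> 8, 0, 255);
   *b = CLAMP((298 * _y + 516 * _u            + 128) >> 8, 0, 255);
}
#endif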
/**
 * Generic type conversion.
 *
 * TODO: Take a precision argument, or even better, add a new precision member
 * to the lp_type union.
 */
void
lp_build_conv(struct gallivm_state *gallivm,
              struct lp_type src_type,
              struct lp_type dst_type,
              const LLVMValueRef *src, unsigned num_srcs,
              LLVMValueRef *dst, unsigned num_dsts)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type tmp_type;
   LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH];
   unsigned num_tmps;
   unsigned i;

   /* We must not lose or gain channels. Only precision */
   assert(src_type.length * num_srcs == dst_type.length * num_dsts);

   assert(src_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(dst_type.length <= LP_MAX_VECTOR_LENGTH);
   assert(num_srcs <= LP_MAX_VECTOR_LENGTH);
   assert(num_dsts <= LP_MAX_VECTOR_LENGTH);

   tmp_type = src_type;
   for(i = 0; i < num_srcs; ++i) {
      assert(lp_check_value(src_type, src[i]));
      tmp[i] = src[i];
   }
   num_tmps = num_srcs;

   /* Special case 4x4f --> 1x16ub */
   if (src_type.floating == 1 &&
       src_type.fixed    == 0 &&
       src_type.sign     == 1 &&
       src_type.norm     == 0 &&
       src_type.width    == 32 &&
       src_type.length   == 4 &&

       dst_type.floating == 0 &&
       dst_type.fixed    == 0 &&
       dst_type.sign     == 0 &&
       dst_type.norm     == 1 &&
       dst_type.width    == 8 &&
       dst_type.length   == 16 &&

       4 * num_dsts      == num_srcs &&

       util_cpu_caps.has_sse2)
   {
      struct lp_build_context bld;
      struct lp_type int16_type = dst_type;
      struct lp_type int32_type = dst_type;
      LLVMValueRef const_255f;
      unsigned i, j;

      lp_build_context_init(&bld, gallivm, src_type);

      int16_type.width *= 2;
      int16_type.length /= 2;
      int16_type.sign = 1;

      int32_type.width *= 4;
      int32_type.length /= 4;
      int32_type.sign = 1;

      const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);

      for (i = 0; i < num_dsts; ++i, src += 4) {
         LLVMValueRef lo, hi;

         for (j = 0; j < 4; ++j) {
            tmp[j] = LLVMBuildFMul(builder, src[j], const_255f, "");
            tmp[j] = lp_build_iround(&bld, tmp[j]);
         }

         /* relying on clamping behavior of sse2 intrinsics here */
         lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
         hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi);
      }
      return;
   }

   /* Special case 2x8f --> 1x16ub */
   else if (src_type.floating == 1 &&
      src_type.fixed    == 0 &&
      src_type.sign     == 1 &&
      src_type.norm     == 0 &&
      src_type.width    == 32 &&
      src_type.length   == 8 &&

      dst_type.floating == 0 &&
      dst_type.fixed    == 0 &&
      dst_type.sign     == 0 &&
      dst_type.norm     == 1 &&
      dst_type.width    == 8 &&
      dst_type.length   == 16 &&

      2 * num_dsts      == num_srcs &&

      util_cpu_caps.has_avx)
   {
      struct lp_build_context bld;
      struct lp_type int16_type = dst_type;
      struct lp_type int32_type = dst_type;
      LLVMValueRef const_255f;
      unsigned i;

      lp_build_context_init(&bld, gallivm, src_type);

      int16_type.width *= 2;
      int16_type.length /= 2;
      int16_type.sign = 1;

      int32_type.width *= 4;
      int32_type.length /= 4;
      int32_type.sign = 1;

      const_255f = lp_build_const_vec(gallivm, src_type, 255.0f);

      for (i = 0; i < num_dsts; ++i, src += 2) {
         LLVMValueRef lo, hi, a, b;

         a = LLVMBuildFMul(builder, src[0], const_255f, "");
         b = LLVMBuildFMul(builder, src[1], const_255f, "");

         a = lp_build_iround(&bld, a);
         b = lp_build_iround(&bld, b);

         tmp[0] = lp_build_extract_range(gallivm, a, 0, 4);
         tmp[1] = lp_build_extract_range(gallivm, a, 4, 4);
         tmp[2] = lp_build_extract_range(gallivm, b, 0, 4);
         tmp[3] = lp_build_extract_range(gallivm, b, 4, 4);

         /* relying on clamping behavior of sse2 intrinsics here */
         lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
         hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi);
      }
      return;
   }

   /* Pre convert half-floats to floats */
   else if (src_type.floating && src_type.width == 16)
   {
      for(i = 0; i < num_tmps; ++i)
         tmp[i] = lp_build_half_to_float(gallivm, src_type, tmp[i]);

      tmp_type.width = 32;
   }

   /*
    * Clamp if necessary
    */

   if(memcmp(&src_type, &dst_type, sizeof src_type) != 0) {
      struct lp_build_context bld;
      double src_min = lp_const_min(src_type);
      double dst_min = lp_const_min(dst_type);
      double src_max = lp_const_max(src_type);
      double dst_max = lp_const_max(dst_type);
      LLVMValueRef thres;

      lp_build_context_init(&bld, gallivm, tmp_type);

      if(src_min < dst_min) {
         if(dst_min == 0.0)
            thres = bld.zero;
         else
            thres = lp_build_const_vec(gallivm, src_type, dst_min);
         for(i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_max(&bld, tmp[i], thres);
      }

      if(src_max > dst_max) {
         if(dst_max == 1.0)
            thres = bld.one;
         else
            thres = lp_build_const_vec(gallivm, src_type, dst_max);
         for(i = 0; i < num_tmps; ++i)
            tmp[i] = lp_build_min(&bld, tmp[i], thres);
      }
   }

   /*
    * Scale to the narrowest range
    */

   if(dst_type.floating) {
      /* Nothing to do */
   }
   else if(tmp_type.floating) {
      if(!dst_type.fixed && !dst_type.sign && dst_type.norm) {
         for(i = 0; i < num_tmps; ++i) {
            tmp[i] = lp_build_clamped_float_to_unsigned_norm(gallivm,
                                                             tmp_type,
                                                             dst_type.width,
                                                             tmp[i]);
         }
         tmp_type.floating = FALSE;
      }
      else {
         double dst_scale = lp_const_scale(dst_type);
         LLVMTypeRef tmp_vec_type;

         if (dst_scale != 1.0) {
            LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, dst_scale);
            for(i = 0; i < num_tmps; ++i)
               tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
         }

         /* Use an equally sized integer for intermediate computations */
         tmp_type.floating = FALSE;
         tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
         for(i = 0; i < num_tmps; ++i) {
#if 0
            if(dst_type.sign)
               tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
            else
               tmp[i] = LLVMBuildFPToUI(builder, tmp[i], tmp_vec_type, "");
#else
            /* FIXME: there is no SSE counterpart for LLVMBuildFPToUI */
            tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, "");
#endif
         }
      }
   }
   else {
      unsigned src_shift = lp_const_shift(src_type);
      unsigned dst_shift = lp_const_shift(dst_type);
      unsigned src_offset = lp_const_offset(src_type);
      unsigned dst_offset = lp_const_offset(dst_type);

      /* Compensate for different offsets */
      if (dst_offset > src_offset && src_type.width > dst_type.width) {
         for (i = 0; i < num_tmps; ++i) {
            LLVMValueRef shifted;
            LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type, src_shift - 1);
            if(src_type.sign)
               shifted = LLVMBuildAShr(builder, tmp[i], shift, "");
            else
               shifted = LLVMBuildLShr(builder, tmp[i], shift, "");

            tmp[i] = LLVMBuildSub(builder, tmp[i], shifted, "");
         }
      }

      if(src_shift > dst_shift) {
         LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type,
                                                     src_shift - dst_shift);
         for(i = 0; i < num_tmps; ++i)
            if(src_type.sign)
               tmp[i] = LLVMBuildAShr(builder, tmp[i], shift, "");
            else
               tmp[i] = LLVMBuildLShr(builder, tmp[i], shift, "");
      }
   }

   /*
    * Truncate or expand bit width
    *
    * No data conversion should happen here, although the sign bits are
    * crucial to avoid bad clamping.
    */

   {
      struct lp_type new_type;

      new_type = tmp_type;
      new_type.sign   = dst_type.sign;
      new_type.width  = dst_type.width;
      new_type.length = dst_type.length;

      lp_build_resize(gallivm, tmp_type, new_type, tmp, num_srcs, tmp, num_dsts);

      tmp_type = new_type;
      num_tmps = num_dsts;
   }

   /*
    * Scale to the widest range
    */

   if(src_type.floating) {
      /* Nothing to do */
   }
   else if(!src_type.floating && dst_type.floating) {
      if(!src_type.fixed && !src_type.sign && src_type.norm) {
         for(i = 0; i < num_tmps; ++i) {
            tmp[i] = lp_build_unsigned_norm_to_float(gallivm,
                                                     src_type.width,
                                                     dst_type,
                                                     tmp[i]);
         }
         tmp_type.floating = TRUE;
      }
      else {
         double src_scale = lp_const_scale(src_type);
         LLVMTypeRef tmp_vec_type;

         /* Use an equally sized integer for intermediate computations */
         tmp_type.floating = TRUE;
         tmp_type.sign = TRUE;
         tmp_vec_type = lp_build_vec_type(gallivm, tmp_type);
         for(i = 0; i < num_tmps; ++i) {
#if 0
            if(dst_type.sign)
               tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
            else
               tmp[i] = LLVMBuildUIToFP(builder, tmp[i], tmp_vec_type, "");
#else
            /* FIXME: there is no SSE counterpart for LLVMBuildUIToFP */
            tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, "");
#endif
         }

         if (src_scale != 1.0) {
            LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, 1.0/src_scale);
            for(i = 0; i < num_tmps; ++i)
               tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, "");
         }
      }
   }
   else {
      unsigned src_shift = lp_const_shift(src_type);
      unsigned dst_shift = lp_const_shift(dst_type);
      unsigned src_offset = lp_const_offset(src_type);
      unsigned dst_offset = lp_const_offset(dst_type);

      if (src_shift < dst_shift) {
         LLVMValueRef pre_shift[LP_MAX_VECTOR_LENGTH];
         LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type,
                                                     dst_shift - src_shift);

         for (i = 0; i < num_tmps; ++i) {
            pre_shift[i] = tmp[i];
            tmp[i] = LLVMBuildShl(builder, tmp[i], shift, "");
         }

         /* Compensate for different offsets */
         if (dst_offset > src_offset) {
            for (i = 0; i < num_tmps; ++i) {
               tmp[i] = LLVMBuildSub(builder, tmp[i], pre_shift[i], "");
            }
         }
      }
   }

   for(i = 0; i < num_dsts; ++i) {
      dst[i] = tmp[i];
      assert(lp_check_value(dst_type, dst[i]));
   }
}
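/*
 * Illustrative usage sketch (an assumption, not from the original source):
 * converting four <4 x float> vectors in [0,1] into a single <16 x i8>
 * unorm vector, which hits the 4x4f --> 1x16ub fast path above on SSE2.
 */
#if 0
   struct lp_type src_type, dst_type;
   LLVMValueRef src[4];   /* four <4 x float> values computed earlier */
   LLVMValueRef dst[1];

   memset(&src_type, 0, sizeof src_type);
   src_type.floating = TRUE;
   src_type.sign = TRUE;
   src_type.width = 32;
   src_type.length = 4;

   memset(&dst_type, 0, sizeof dst_type);
   dst_type.norm = TRUE;
   dst_type.width = 8;
   dst_type.length = 16;

   lp_build_conv(gallivm, src_type, dst_type, src, 4, dst, 1);
#endif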
/**
 * Build code to compare two values 'a' and 'b' of 'type' using the given func.
 * \param func  one of PIPE_FUNC_x
 * The result values will be 0 for false or ~0 for true.
 */
LLVMValueRef
lp_build_compare(struct gallivm_state *gallivm,
                 const struct lp_type type,
                 unsigned func,
                 LLVMValueRef a,
                 LLVMValueRef b)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, type);
   LLVMValueRef zeros = LLVMConstNull(int_vec_type);
   LLVMValueRef ones = LLVMConstAllOnes(int_vec_type);
   LLVMValueRef cond;
   LLVMValueRef res;

   assert(func >= PIPE_FUNC_NEVER);
   assert(func <= PIPE_FUNC_ALWAYS);
   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(func == PIPE_FUNC_NEVER)
      return zeros;
   if(func == PIPE_FUNC_ALWAYS)
      return ones;

#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
   /*
    * There are no unsigned integer comparison instructions in SSE.
    */

   if (!type.floating && !type.sign &&
       type.width * type.length == 128 &&
       util_cpu_caps.has_sse2 &&
       (func == PIPE_FUNC_LESS ||
        func == PIPE_FUNC_LEQUAL ||
        func == PIPE_FUNC_GREATER ||
        func == PIPE_FUNC_GEQUAL) &&
       (gallivm_debug & GALLIVM_DEBUG_PERF)) {
      debug_printf("%s: inefficient <%u x i%u> unsigned comparison\n",
                   __FUNCTION__, type.length, type.width);
   }
#endif

#if HAVE_LLVM < 0x0207
#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
   if(type.width * type.length == 128) {
      if(type.floating && util_cpu_caps.has_sse) {
         /* float[4] comparison */
         LLVMTypeRef vec_type = lp_build_vec_type(gallivm, type);
         LLVMValueRef args[3];
         unsigned cc;
         boolean swap;

         swap = FALSE;
         switch(func) {
         case PIPE_FUNC_EQUAL:
            cc = 0;
            break;
         case PIPE_FUNC_NOTEQUAL:
            cc = 4;
            break;
         case PIPE_FUNC_LESS:
            cc = 1;
            break;
         case PIPE_FUNC_LEQUAL:
            cc = 2;
            break;
         case PIPE_FUNC_GREATER:
            cc = 1;
            swap = TRUE;
            break;
         case PIPE_FUNC_GEQUAL:
            cc = 2;
            swap = TRUE;
            break;
         default:
            assert(0);
            return lp_build_undef(gallivm, type);
         }

         if(swap) {
            args[0] = b;
            args[1] = a;
         }
         else {
            args[0] = a;
            args[1] = b;
         }

         args[2] = LLVMConstInt(LLVMInt8TypeInContext(gallivm->context), cc, 0);
         res = lp_build_intrinsic(builder,
                                  "llvm.x86.sse.cmp.ps",
                                  vec_type,
                                  args, 3);
         res = LLVMBuildBitCast(builder, res, int_vec_type, "");
         return res;
      }
      else if(util_cpu_caps.has_sse2) {
         /* int[4] comparison */
         static const struct {
            unsigned swap:1;
            unsigned eq:1;
            unsigned gt:1;
            unsigned not:1;
         } table[] = {
            {0, 0, 0, 1}, /* PIPE_FUNC_NEVER */
            {1, 0, 1, 0}, /* PIPE_FUNC_LESS */
            {0, 1, 0, 0}, /* PIPE_FUNC_EQUAL */
            {0, 0, 1, 1}, /* PIPE_FUNC_LEQUAL */
            {0, 0, 1, 0}, /* PIPE_FUNC_GREATER */
            {0, 1, 0, 1}, /* PIPE_FUNC_NOTEQUAL */
            {1, 0, 1, 1}, /* PIPE_FUNC_GEQUAL */
            {0, 0, 0, 0}  /* PIPE_FUNC_ALWAYS */
         };
         const char *pcmpeq;
         const char *pcmpgt;
         LLVMValueRef args[2];
         LLVMValueRef res;
         LLVMTypeRef vec_type = lp_build_vec_type(gallivm, type);

         switch (type.width) {
         case 8:
            pcmpeq = "llvm.x86.sse2.pcmpeq.b";
            pcmpgt = "llvm.x86.sse2.pcmpgt.b";
            break;
         case 16:
            pcmpeq = "llvm.x86.sse2.pcmpeq.w";
            pcmpgt = "llvm.x86.sse2.pcmpgt.w";
            break;
         case 32:
            pcmpeq = "llvm.x86.sse2.pcmpeq.d";
            pcmpgt = "llvm.x86.sse2.pcmpgt.d";
            break;
         default:
            assert(0);
            return lp_build_undef(gallivm, type);
         }

         /* There are no unsigned comparison instructions. So flip the sign bit
          * so that the results match.
          */
         if (table[func].gt && !type.sign) {
            LLVMValueRef msb = lp_build_const_int_vec(gallivm, type,
                                     (unsigned long long)1 << (type.width - 1));
            a = LLVMBuildXor(builder, a, msb, "");
            b = LLVMBuildXor(builder, b, msb, "");
         }

         if(table[func].swap) {
            args[0] = b;
            args[1] = a;
         }
         else {
            args[0] = a;
            args[1] = b;
         }

         if(table[func].eq)
            res = lp_build_intrinsic(builder, pcmpeq, vec_type, args, 2);
         else if (table[func].gt)
            res = lp_build_intrinsic(builder, pcmpgt, vec_type, args, 2);
         else
            res = LLVMConstNull(vec_type);

         if(table[func].not)
            res = LLVMBuildNot(builder, res, "");

         return res;
      }
   } /* if (type.width * type.length == 128) */
#endif
#endif /* HAVE_LLVM < 0x0207 */

   /* XXX: It is not clear if we should use the ordered or unordered operators */

   if(type.floating) {
      LLVMRealPredicate op;
      switch(func) {
      case PIPE_FUNC_NEVER:
         op = LLVMRealPredicateFalse;
         break;
      case PIPE_FUNC_ALWAYS:
         op = LLVMRealPredicateTrue;
         break;
      case PIPE_FUNC_EQUAL:
         op = LLVMRealUEQ;
         break;
      case PIPE_FUNC_NOTEQUAL:
         op = LLVMRealUNE;
         break;
      case PIPE_FUNC_LESS:
         op = LLVMRealULT;
         break;
      case PIPE_FUNC_LEQUAL:
         op = LLVMRealULE;
         break;
      case PIPE_FUNC_GREATER:
         op = LLVMRealUGT;
         break;
      case PIPE_FUNC_GEQUAL:
         op = LLVMRealUGE;
         break;
      default:
         assert(0);
         return lp_build_undef(gallivm, type);
      }

#if HAVE_LLVM >= 0x0207
      cond = LLVMBuildFCmp(builder, op, a, b, "");
      res = LLVMBuildSExt(builder, cond, int_vec_type, "");
#else
      if (type.length == 1) {
         cond = LLVMBuildFCmp(builder, op, a, b, "");
         res = LLVMBuildSExt(builder, cond, int_vec_type, "");
      }
      else {
         unsigned i;

         res = LLVMGetUndef(int_vec_type);

         debug_printf("%s: warning: using slow element-wise float"
                      " vector comparison\n", __FUNCTION__);

         for (i = 0; i < type.length; ++i) {
            LLVMValueRef index = lp_build_const_int32(gallivm, i);
            cond = LLVMBuildFCmp(builder, op,
                                 LLVMBuildExtractElement(builder, a, index, ""),
                                 LLVMBuildExtractElement(builder, b, index, ""),
                                 "");
            cond = LLVMBuildSelect(builder, cond,
                                   LLVMConstExtractElement(ones, index),
                                   LLVMConstExtractElement(zeros, index),
                                   "");
            res = LLVMBuildInsertElement(builder, res, cond, index, "");
         }
      }
#endif
   }
   else {
      LLVMIntPredicate op;
      switch(func) {
      case PIPE_FUNC_EQUAL:
         op = LLVMIntEQ;
         break;
      case PIPE_FUNC_NOTEQUAL:
         op = LLVMIntNE;
         break;
      case PIPE_FUNC_LESS:
         op = type.sign ? LLVMIntSLT : LLVMIntULT;
         break;
      case PIPE_FUNC_LEQUAL:
         op = type.sign ? LLVMIntSLE : LLVMIntULE;
         break;
      case PIPE_FUNC_GREATER:
         op = type.sign ? LLVMIntSGT : LLVMIntUGT;
         break;
      case PIPE_FUNC_GEQUAL:
         op = type.sign ? LLVMIntSGE : LLVMIntUGE;
         break;
      default:
         assert(0);
         return lp_build_undef(gallivm, type);
      }

#if HAVE_LLVM >= 0x0207
      cond = LLVMBuildICmp(builder, op, a, b, "");
      res = LLVMBuildSExt(builder, cond, int_vec_type, "");
#else
      if (type.length == 1) {
         cond = LLVMBuildICmp(builder, op, a, b, "");
         res = LLVMBuildSExt(builder, cond, int_vec_type, "");
      }
      else {
         unsigned i;

         res = LLVMGetUndef(int_vec_type);

         if (gallivm_debug & GALLIVM_DEBUG_PERF) {
            debug_printf("%s: using slow element-wise int"
                         " vector comparison\n", __FUNCTION__);
         }

         for(i = 0; i < type.length; ++i) {
            LLVMValueRef index = lp_build_const_int32(gallivm, i);
            cond = LLVMBuildICmp(builder, op,
                                 LLVMBuildExtractElement(builder, a, index, ""),
                                 LLVMBuildExtractElement(builder, b, index, ""),
                                 "");
            cond = LLVMBuildSelect(builder, cond,
                                   LLVMConstExtractElement(ones, index),
                                   LLVMConstExtractElement(zeros, index),
                                   "");
            res = LLVMBuildInsertElement(builder, res, cond, index, "");
         }
      }
#endif
   }

   return res;
}
/**
 * Return mask ? a : b;
 *
 * mask is a bitwise mask, composed of 0 or ~0 for each element. Any other value
 * will yield unpredictable results.
 */
LLVMValueRef
lp_build_select(struct lp_build_context *bld,
                LLVMValueRef mask,
                LLVMValueRef a,
                LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMContextRef lc = bld->gallivm->context;
   struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == b)
      return a;

   if (type.length == 1) {
      mask = LLVMBuildTrunc(builder, mask, LLVMInt1TypeInContext(lc), "");
      res = LLVMBuildSelect(builder, mask, a, b, "");
   }
   else if (0) {
      /* Generate a vector select.
       *
       * XXX: Using vector selects would avoid emitting intrinsics, but they aren't
       * properly supported yet.
       *
       * LLVM 3.0 includes experimental support provided the -promote-elements
       * options is passed to LLVM's command line (e.g., via
       * llvm::cl::ParseCommandLineOptions), but resulting code quality is much
       * worse, probably because some optimization passes don't know how to
       * handle vector selects.
       *
       * See also:
       * - http://lists.cs.uiuc.edu/pipermail/llvmdev/2011-October/043659.html
       */

      /* Convert the mask to a vector of booleans.
       * XXX: There are two ways to do this. Decide what's best.
       */
      if (1) {
         LLVMTypeRef bool_vec_type = LLVMVectorType(LLVMInt1TypeInContext(lc),
                                                    type.length);
         mask = LLVMBuildTrunc(builder, mask, bool_vec_type, "");
      } else {
         mask = LLVMBuildICmp(builder, LLVMIntNE, mask,
                              LLVMConstNull(bld->int_vec_type), "");
      }

      res = LLVMBuildSelect(builder, mask, a, b, "");
   }
   else if (((util_cpu_caps.has_sse4_1 &&
              type.width * type.length == 128) ||
             (util_cpu_caps.has_avx &&
              type.width * type.length == 256 && type.width >= 32)) &&
            !LLVMIsConstant(a) &&
            !LLVMIsConstant(b) &&
            !LLVMIsConstant(mask)) {
      const char *intrinsic;
      LLVMTypeRef arg_type;
      LLVMValueRef args[3];

      /*
       * There's only float blend in AVX but can just cast i32/i64
       * to float.
       */
      if (type.width * type.length == 256) {
         if (type.width == 64) {
            intrinsic = "llvm.x86.avx.blendv.pd.256";
            arg_type = LLVMVectorType(LLVMDoubleTypeInContext(lc), 4);
         }
         else {
            intrinsic = "llvm.x86.avx.blendv.ps.256";
            arg_type = LLVMVectorType(LLVMFloatTypeInContext(lc), 8);
         }
      }
      else if (type.floating &&
               type.width == 64) {
         intrinsic = "llvm.x86.sse41.blendvpd";
         arg_type = LLVMVectorType(LLVMDoubleTypeInContext(lc), 2);
      }
      else if (type.floating &&
               type.width == 32) {
         intrinsic = "llvm.x86.sse41.blendvps";
         arg_type = LLVMVectorType(LLVMFloatTypeInContext(lc), 4);
      }
      else {
         intrinsic = "llvm.x86.sse41.pblendvb";
         arg_type = LLVMVectorType(LLVMInt8TypeInContext(lc), 16);
      }

      if (arg_type != bld->int_vec_type) {
         mask = LLVMBuildBitCast(builder, mask, arg_type, "");
      }

      if (arg_type != bld->vec_type) {
         a = LLVMBuildBitCast(builder, a, arg_type, "");
         b = LLVMBuildBitCast(builder, b, arg_type, "");
      }

      /* the blendv instructions select from the second source operand where
       * the mask (sign) bit is set, hence b goes first */
      args[0] = b;
      args[1] = a;
      args[2] = mask;

      res = lp_build_intrinsic(builder, intrinsic,
                               arg_type, args, Elements(args));

      if (arg_type != bld->vec_type) {
         res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
      }
   }
   else {
      res = lp_build_select_bitwise(bld, mask, a, b);
   }

   return res;
}