/** * Convert srgb int values to linear float values. * Several possibilities how to do this, e.g. * - table * - doing the pow() with int-to-float and float-to-int tricks * (http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent) * - just using standard polynomial approximation * (3rd order polynomial is required for crappy but just sufficient accuracy) * * @param src integer (vector) value(s) to convert * (chan_bits bit values unpacked to 32 bit already). */ LLVMValueRef lp_build_srgb_to_linear(struct gallivm_state *gallivm, struct lp_type src_type, unsigned chan_bits, LLVMValueRef src) { struct lp_type f32_type = lp_type_float_vec(32, src_type.length * 32); struct lp_build_context f32_bld; LLVMValueRef srcf, part_lin, part_pow, is_linear, lin_const, lin_thresh; double coeffs[4] = {0.0023f, 0.0030f / 255.0f, 0.6935f / (255.0f * 255.0f), 0.3012f / (255.0f * 255.0f * 255.0f) }; assert(src_type.width == 32); /* Technically this would work with more bits too but would be inaccurate. */ assert(chan_bits <= 8); lp_build_context_init(&f32_bld, gallivm, f32_type); /* * using polynomial: (src * (src * (src * 0.3012 + 0.6935) + 0.0030) + 0.0023) * ( poly = 0.3012*x^3 + 0.6935*x^2 + 0.0030*x + 0.0023) * (found with octave polyfit and some magic as I couldn't get the error * function right). Using the above mentioned error function, the values stay * within +-0.35, except for the lowest values - hence tweaking linear segment * to cover the first 16 instead of the first 11 values (the error stays * just about acceptable there too). * Hence: lin = src > 15 ? poly : src / 12.6 * This function really only makes sense for vectors, should use LUT otherwise. * All in all (including float conversion) 11 instructions (with sse4.1), * 6 constants (polynomial could be done with 1 instruction less at the cost * of slightly worse dependency chain, fma should also help). */ /* doing the 1/255 mul as part of the approximation */ srcf = lp_build_int_to_float(&f32_bld, src); if (chan_bits != 8) { /* could adjust all the constants instead */ LLVMValueRef rescale_const = lp_build_const_vec(gallivm, f32_type, 255.0f / ((1 << chan_bits) - 1)); srcf = lp_build_mul(&f32_bld, srcf, rescale_const); } lin_const = lp_build_const_vec(gallivm, f32_type, 1.0f / (12.6f * 255.0f)); part_lin = lp_build_mul(&f32_bld, srcf, lin_const); part_pow = lp_build_polynomial(&f32_bld, srcf, coeffs, 4); lin_thresh = lp_build_const_vec(gallivm, f32_type, 15.0f); is_linear = lp_build_compare(gallivm, f32_type, PIPE_FUNC_LEQUAL, srcf, lin_thresh); return lp_build_select(&f32_bld, is_linear, part_lin, part_pow); }
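/*
 * For illustration only (not part of the generated code): a scalar reference
 * of the piecewise approximation used above for the 8-bit case. The helper
 * name is hypothetical; the constants mirror coeffs[] and lin_const above
 * (the 1/255 scaling is folded into the polynomial coefficients).
 */
static float
srgb8_to_linear_ref(unsigned x /* srgb value, 0..255 */)
{
   float xf = (float)x;

   if (xf <= 15.0f) {
      /* linear segment covering the lowest 16 values */
      return xf / (12.6f * 255.0f);
   }

   /* 3rd order polynomial in Horner form, as evaluated by lp_build_polynomial() */
   return 0.0023f +
          xf * (0.0030f / 255.0f +
          xf * (0.6935f / (255.0f * 255.0f) +
          xf * (0.3012f / (255.0f * 255.0f * 255.0f))));
}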
/** * Inverse of lp_build_clamped_float_to_unsigned_norm above. * Ex: src = { i32, i32, i32, i32 } with values in range [0, 2^src_width-1] * return {float, float, float, float} with values in range [0, 1]. */ LLVMValueRef lp_build_unsigned_norm_to_float(LLVMBuilderRef builder, unsigned src_width, struct lp_type dst_type, LLVMValueRef src) { LLVMTypeRef vec_type = lp_build_vec_type(dst_type); LLVMTypeRef int_vec_type = lp_build_int_vec_type(dst_type); LLVMValueRef bias_; LLVMValueRef res; unsigned mantissa; unsigned n; unsigned long long ubound; unsigned long long mask; double scale; double bias; assert(dst_type.floating); mantissa = lp_mantissa(dst_type); n = MIN2(mantissa, src_width); ubound = ((unsigned long long)1 << n); mask = ubound - 1; scale = (double)ubound/mask; bias = (double)((unsigned long long)1 << (mantissa - n)); res = src; if(src_width > mantissa) { int shift = src_width - mantissa; res = LLVMBuildLShr(builder, res, lp_build_const_int_vec(dst_type, shift), ""); } bias_ = lp_build_const_vec(dst_type, bias); res = LLVMBuildOr(builder, res, LLVMBuildBitCast(builder, bias_, int_vec_type, ""), ""); res = LLVMBuildBitCast(builder, res, vec_type, ""); res = LLVMBuildFSub(builder, res, bias_, ""); res = LLVMBuildFMul(builder, res, lp_build_const_vec(dst_type, scale), ""); return res; }
/*
 * Combined log2 and brilinear lod computation.
 *
 * It is identical in all respects to calling lp_build_fast_log2() and
 * lp_build_brilinear_lod() above, but by combining we can compute the integer
 * and fractional part independently.
 */
static void
lp_build_brilinear_rho(struct lp_build_context *bld,
                       LLVMValueRef rho,
                       double factor,
                       LLVMValueRef *out_lod_ipart,
                       LLVMValueRef *out_lod_fpart)
{
   LLVMValueRef lod_ipart;
   LLVMValueRef lod_fpart;

   const double pre_factor = (2*factor - 0.5)/(M_SQRT2*factor);
   const double post_offset = 1 - 2*factor;

   assert(bld->type.floating);

   assert(lp_check_value(bld->type, rho));

   /*
    * The pre factor will make the intersections with the exact powers of two
    * happen precisely where we want them to be, which means that the integer
    * part will not need any post adjustments.
    */
   rho = lp_build_mul(bld, rho,
                      lp_build_const_vec(bld->gallivm, bld->type, pre_factor));

   /* ipart = ifloor(log2(rho)) */
   lod_ipart = lp_build_extract_exponent(bld, rho, 0);

   /* fpart = rho / 2**ipart */
   lod_fpart = lp_build_extract_mantissa(bld, rho);

   lod_fpart = lp_build_mul(bld, lod_fpart,
                            lp_build_const_vec(bld->gallivm, bld->type, factor));

   lod_fpart = lp_build_add(bld, lod_fpart,
                            lp_build_const_vec(bld->gallivm, bld->type, post_offset));

   /*
    * Like lp_build_brilinear_lod, it's not necessary to clamp lod_fpart since:
    * - the above expression will never produce numbers greater than one.
    * - the mip filtering branch is only taken if lod_fpart is positive
    */

   *out_lod_ipart = lod_ipart;
   *out_lod_fpart = lod_fpart;
}
/** Helper used by lp_build_cube_lookup() */ static LLVMValueRef lp_build_cube_ima(struct lp_build_context *coord_bld, LLVMValueRef coord) { /* ima = -0.5 / abs(coord); */ LLVMValueRef negHalf = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, -0.5); LLVMValueRef absCoord = lp_build_abs(coord_bld, coord); LLVMValueRef ima = lp_build_div(coord_bld, negHalf, absCoord); return ima; }
/*
 * Bri-linear lod computation
 *
 * Use a piece-wise linear approximation of log2 such that:
 * - round to nearest, for values in the neighborhood of -1, 0, 1, 2, etc.
 * - linear approximation for values in the neighborhood of 0.5, 1.5, etc.,
 *   with the steepness specified in 'factor'
 * - exact result for 0.5, 1.5, etc.
 *
 *
 *   1.0 -              /----*
 *                     /
 *                    /
 *                   /
 *   0.5 -          *
 *                 /
 *                /
 *               /
 *   0.0 - *----/
 *
 *         |                 |
 *        2^0               2^1
 *
 * This is a technique also commonly used in hardware:
 * - http://ixbtlabs.com/articles2/gffx/nv40-rx800-3.html
 *
 * TODO: For correctness, this should only be applied when texture is known to
 * have regular mipmaps, i.e., mipmaps derived from the base level.
 *
 * TODO: This could be done in fixed point, where applicable.
 */
static void
lp_build_brilinear_lod(struct lp_build_context *bld,
                       LLVMValueRef lod,
                       double factor,
                       LLVMValueRef *out_lod_ipart,
                       LLVMValueRef *out_lod_fpart)
{
   LLVMValueRef lod_fpart;
   double pre_offset = (factor - 0.5)/factor - 0.5;
   double post_offset = 1 - factor;

   if (0) {
      lp_build_printf(bld->gallivm, "lod = %f\n", lod);
   }

   lod = lp_build_add(bld, lod,
                      lp_build_const_vec(bld->gallivm, bld->type, pre_offset));

   lp_build_ifloor_fract(bld, lod, out_lod_ipart, &lod_fpart);

   lod_fpart = lp_build_mul(bld, lod_fpart,
                            lp_build_const_vec(bld->gallivm, bld->type, factor));

   lod_fpart = lp_build_add(bld, lod_fpart,
                            lp_build_const_vec(bld->gallivm, bld->type, post_offset));

   /*
    * It's not necessary to clamp lod_fpart since:
    * - the above expression will never produce numbers greater than one.
    * - the mip filtering branch is only taken if lod_fpart is positive
    */

   *out_lod_fpart = lod_fpart;

   if (0) {
      lp_build_printf(bld->gallivm, "lod_ipart = %i\n", *out_lod_ipart);
      lp_build_printf(bld->gallivm, "lod_fpart = %f\n\n", *out_lod_fpart);
   }
}
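/*
 * Scalar sketch of the bri-linear mapping above, for illustration only (the
 * helper name is hypothetical and it is not part of the generated code).
 * Given a lod and the same 'factor', it produces the integer mip level and
 * the adjusted fractional weight exactly as the vector code does.
 */
static void
brilinear_lod_ref(float lod, float factor, int *out_ipart, float *out_fpart)
{
   const float pre_offset = (factor - 0.5f) / factor - 0.5f;
   const float post_offset = 1.0f - factor;
   int ipart;
   float fpart;

   lod += pre_offset;

   /* ifloor/fract split (floor must round down for negative lod too) */
   ipart = (int)lod;
   if ((float)ipart > lod)
      ipart--;
   fpart = lod - (float)ipart;

   *out_ipart = ipart;
   /* a negative fpart means "snap to the nearest level", see above */
   *out_fpart = fpart * factor + post_offset;
}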
/** * Compute the offset of a pixel block. * * x, y, z, y_stride, z_stride are vectors, and they refer to pixels. * * Returns the relative offset and i,j sub-block coordinates */ void lp_build_sample_offset(struct lp_build_context *bld, const struct util_format_description *format_desc, LLVMValueRef x, LLVMValueRef y, LLVMValueRef z, LLVMValueRef y_stride, LLVMValueRef z_stride, LLVMValueRef *out_offset, LLVMValueRef *out_i, LLVMValueRef *out_j) { LLVMValueRef x_stride; LLVMValueRef offset; x_stride = lp_build_const_vec(bld->gallivm, bld->type, format_desc->block.bits/8); lp_build_sample_partial_offset(bld, format_desc->block.width, x, x_stride, &offset, out_i); if (y && y_stride) { LLVMValueRef y_offset; lp_build_sample_partial_offset(bld, format_desc->block.height, y, y_stride, &y_offset, out_j); offset = lp_build_add(bld, offset, y_offset); } else { *out_j = bld->zero; } if (z && z_stride) { LLVMValueRef z_offset; LLVMValueRef k; lp_build_sample_partial_offset(bld, 1, /* pixel blocks are always 2D */ z, z_stride, &z_offset, &k); offset = lp_build_add(bld, offset, z_offset); } *out_offset = offset; }
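/*
 * For reference, the scalar arithmetic the vector code above performs for a
 * single pixel (illustrative only; the helper name is hypothetical).
 * block_bytes corresponds to format_desc->block.bits/8, i.e. x_stride.
 */
static unsigned
sample_offset_ref(unsigned x, unsigned y, unsigned z,
                  unsigned block_width, unsigned block_height,
                  unsigned block_bytes,
                  unsigned y_stride, unsigned z_stride,
                  unsigned *out_i, unsigned *out_j)
{
   unsigned offset = (x / block_width) * block_bytes;   /* horizontal block offset */
   *out_i = x % block_width;                            /* sub-block x coordinate */

   offset += (y / block_height) * y_stride;             /* vertical block offset */
   *out_j = y % block_height;

   offset += z * z_stride;                              /* pixel blocks are always 2D */
   return offset;
}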
/** * Convert linear float soa values to packed srgb AoS values. * This only handles packed formats which are 4x8bit in size * (rgba and rgbx plus swizzles), and 16bit 565-style formats * with no alpha. (In the latter case the return values won't be * fully packed, it will look like r5g6b5x16r5g6b5x16...) * * @param src float SoA (vector) values to convert. */ LLVMValueRef lp_build_float_to_srgb_packed(struct gallivm_state *gallivm, const struct util_format_description *dst_fmt, struct lp_type src_type, LLVMValueRef *src) { LLVMBuilderRef builder = gallivm->builder; unsigned chan; struct lp_build_context f32_bld; struct lp_type int32_type = lp_int_type(src_type); LLVMValueRef tmpsrgb[4], alpha, dst; lp_build_context_init(&f32_bld, gallivm, src_type); /* rgb is subject to linear->srgb conversion, alpha is not */ for (chan = 0; chan < 3; chan++) { unsigned chan_bits = dst_fmt->channel[dst_fmt->swizzle[chan]].size; tmpsrgb[chan] = lp_build_linear_to_srgb(gallivm, src_type, chan_bits, src[chan]); } /* * can't use lp_build_conv since we want to keep values as 32bit * here so we can interleave with rgb to go from SoA->AoS. */ alpha = lp_build_clamp_zero_one_nanzero(&f32_bld, src[3]); alpha = lp_build_mul(&f32_bld, alpha, lp_build_const_vec(gallivm, src_type, 255.0f)); tmpsrgb[3] = lp_build_iround(&f32_bld, alpha); dst = lp_build_zero(gallivm, int32_type); for (chan = 0; chan < dst_fmt->nr_channels; chan++) { if (dst_fmt->swizzle[chan] <= PIPE_SWIZZLE_W) { unsigned ls; LLVMValueRef shifted, shift_val; ls = dst_fmt->channel[dst_fmt->swizzle[chan]].shift; shift_val = lp_build_const_int_vec(gallivm, int32_type, ls); shifted = LLVMBuildShl(builder, tmpsrgb[chan], shift_val, ""); dst = LLVMBuildOr(builder, dst, shifted, ""); } } return dst; }
void lp_build_alpha_to_coverage(struct gallivm_state *gallivm, struct lp_type type, struct lp_build_mask_context *mask, LLVMValueRef alpha, boolean do_branch) { struct lp_build_context bld; LLVMValueRef test; LLVMValueRef alpha_ref_value; lp_build_context_init(&bld, gallivm, type); alpha_ref_value = lp_build_const_vec(gallivm, type, 0.5); test = lp_build_cmp(&bld, PIPE_FUNC_GREATER, alpha, alpha_ref_value); lp_build_name(test, "alpha_to_coverage"); lp_build_mask_update(mask, test); if (do_branch) lp_build_mask_check(mask); }
/** * Helper used by lp_build_cube_lookup() * \param sign scalar +1 or -1 * \param coord float vector * \param ima float vector */ static LLVMValueRef lp_build_cube_coord(struct lp_build_context *coord_bld, LLVMValueRef sign, int negate_coord, LLVMValueRef coord, LLVMValueRef ima) { /* return negate(coord) * ima * sign + 0.5; */ LLVMValueRef half = lp_build_const_vec(coord_bld->gallivm, coord_bld->type, 0.5); LLVMValueRef res; assert(negate_coord == +1 || negate_coord == -1); if (negate_coord == -1) { coord = lp_build_negate(coord_bld, coord); } res = lp_build_mul(coord_bld, coord, ima); if (sign) { sign = lp_build_broadcast_scalar(coord_bld, sign); res = lp_build_mul(coord_bld, res, sign); } res = lp_build_add(coord_bld, res, half); return res; }
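/*
 * Illustrative scalar example (not generated code) of how lp_build_cube_ima()
 * and lp_build_cube_coord() combine for one face. For the +X face, with the
 * usual cube map rules sc = -rz, tc = -ry, ma = |rx|, the face coordinates
 * reduce to a multiply-add with ima = -0.5 / |rx|. The helper name and the
 * assumption that rx is the positive major axis are for illustration only.
 */
static void
cube_face_coords_posx_ref(float rx, float ry, float rz, float *s, float *t)
{
   float ima = -0.5f / rx;      /* -0.5 / abs(coord), rx > 0 assumed */

   *s = rz * ima + 0.5f;        /* (-rz / |rx| + 1) / 2 */
   *t = ry * ima + 0.5f;        /* (-ry / |rx| + 1) / 2 */
}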
/**
 * Special case for converting clamped IEEE-754 floats to unsigned norms.
 *
 * The mathematical voodoo below may seem excessive but it is actually
 * paramount we do it this way for several reasons. First, there is no single
 * precision FP to unsigned integer conversion Intel SSE instruction. Second,
 * even if there was, since the FP's mantissa takes only a fraction of
 * register bits the typical scale-and-cast approach would require double
 * precision for accurate results, and therefore half the throughput.
 *
 * Although the result values can be scaled to an arbitrary bit width specified
 * by dst_width, the actual result type will have the same width.
 *
 * Ex: src = { float, float, float, float }
 * return { i32, i32, i32, i32 } where each value is in [0, 2^dst_width-1].
 */
LLVMValueRef
lp_build_clamped_float_to_unsigned_norm(struct gallivm_state *gallivm,
                                        struct lp_type src_type,
                                        unsigned dst_width,
                                        LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, src_type);
   LLVMValueRef res;
   unsigned mantissa;

   assert(src_type.floating);
   assert(dst_width <= src_type.width);
   src_type.sign = FALSE;

   mantissa = lp_mantissa(src_type);

   if (dst_width <= mantissa) {
      /*
       * Apply magic coefficients that will make the desired result appear
       * in the lowest significant bits of the mantissa, with correct rounding.
       *
       * This only works if the destination width fits in the mantissa.
       */

      unsigned long long ubound;
      unsigned long long mask;
      double scale;
      double bias;

      ubound = (1ULL << dst_width);
      mask = ubound - 1;
      scale = (double)mask/ubound;
      bias = (double)(1ULL << (mantissa - dst_width));

      res = LLVMBuildFMul(builder, src,
                          lp_build_const_vec(gallivm, src_type, scale), "");
      res = LLVMBuildFAdd(builder, res,
                          lp_build_const_vec(gallivm, src_type, bias), "");
      res = LLVMBuildBitCast(builder, res, int_vec_type, "");
      res = LLVMBuildAnd(builder, res,
                         lp_build_const_int_vec(gallivm, src_type, mask), "");
   }
   else if (dst_width == (mantissa + 1)) {
      /*
       * The destination width matches exactly what can be represented in
       * floating point (i.e., mantissa + 1 bits). So do a straight
       * multiplication followed by casting. No further rounding is necessary.
       */

      double scale;

      scale = (double)((1ULL << dst_width) - 1);

      res = LLVMBuildFMul(builder, src,
                          lp_build_const_vec(gallivm, src_type, scale), "");
      res = LLVMBuildFPToSI(builder, res, int_vec_type, "");
   }
   else {
      /*
       * The destination exceeds what can be represented in the floating point.
       * So multiply by the largest power of two we can get away with, and then
       * subtract the most significant bit to rescale to normalized values.
       *
       * The largest power of two factor we can get away with is
       * (1 << (src_type.width - 1)), because we need to use signed values. In
       * theory it should be (1 << (src_type.width - 2)), but IEEE 754 rules
       * state INT_MIN should be returned in FPToSI, which is the correct
       * result for values near 1.0!
       *
       * This means we get (src_type.width - 1) correct bits for values near
       * 0.0, and (mantissa + 1) correct bits for values near 1.0. Equally or
       * more important, we also get exact results for 0.0 and 1.0.
       */

      unsigned n = MIN2(src_type.width - 1, dst_width);

      double scale = (double)(1ULL << n);
      unsigned lshift = dst_width - n;
      unsigned rshift = n;
      LLVMValueRef lshifted;
      LLVMValueRef rshifted;

      res = LLVMBuildFMul(builder, src,
                          lp_build_const_vec(gallivm, src_type, scale), "");
      res = LLVMBuildFPToSI(builder, res, int_vec_type, "");

      /*
       * Align the most significant bit to its final place.
       *
       * This will cause 1.0 to overflow to 0, but the later adjustment will
       * get it right.
       */
      if (lshift) {
         lshifted = LLVMBuildShl(builder, res,
                                 lp_build_const_int_vec(gallivm, src_type,
                                                        lshift), "");
      } else {
         lshifted = res;
      }

      /*
       * Align the most significant bit to the right.
       */
      rshifted = LLVMBuildLShr(builder, res,
                               lp_build_const_int_vec(gallivm, src_type, rshift),
                               "");

      /*
       * Subtract the MSB (moved down to the LSB), thereby re-scaling from
       * (1 << dst_width) to ((1 << dst_width) - 1).
       */
      res = LLVMBuildSub(builder, lshifted, rshifted, "");
   }

   return res;
}
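/*
 * Scalar illustration of the magic-bias trick used in the dst_width <= mantissa
 * branch above (hypothetical helper; assumes IEEE-754 single precision and a
 * bit copy via union). For dst_width = 8 this rounds a clamped float in [0,1]
 * to an 8-bit unorm without any float->int conversion instruction.
 */
static unsigned
float_to_unorm8_ref(float src /* already clamped to [0,1] */)
{
   union { float f; unsigned u; } magic;
   const float scale = 255.0f / 256.0f;          /* mask / ubound */
   const float bias = (float)(1 << (23 - 8));    /* 1 << (mantissa - dst_width) */

   magic.f = src * scale + bias;
   /* the desired value now sits in the low 8 mantissa bits, correctly rounded */
   return magic.u & 0xff;
}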
static LLVMValueRef lp_build_extract_soa_chan(struct lp_build_context *bld, unsigned blockbits, boolean srgb_chan, struct util_format_channel_description chan_desc, LLVMValueRef packed) { struct gallivm_state *gallivm = bld->gallivm; LLVMBuilderRef builder = gallivm->builder; struct lp_type type = bld->type; LLVMValueRef input = packed; const unsigned width = chan_desc.size; const unsigned start = chan_desc.shift; const unsigned stop = start + width; /* Decode the input vector component */ switch(chan_desc.type) { case UTIL_FORMAT_TYPE_VOID: input = bld->undef; break; case UTIL_FORMAT_TYPE_UNSIGNED: /* * Align the LSB */ if (start) { input = LLVMBuildLShr(builder, input, lp_build_const_int_vec(gallivm, type, start), ""); } /* * Zero the MSBs */ if (stop < blockbits) { unsigned mask = ((unsigned long long)1 << width) - 1; input = LLVMBuildAnd(builder, input, lp_build_const_int_vec(gallivm, type, mask), ""); } /* * Type conversion */ if (type.floating) { if (srgb_chan) { struct lp_type conv_type = lp_uint_type(type); input = lp_build_srgb_to_linear(gallivm, conv_type, width, input); } else { if(chan_desc.normalized) input = lp_build_unsigned_norm_to_float(gallivm, width, type, input); else input = LLVMBuildSIToFP(builder, input, bld->vec_type, ""); } } else if (chan_desc.pure_integer) { /* Nothing to do */ } else { /* FIXME */ assert(0); } break; case UTIL_FORMAT_TYPE_SIGNED: /* * Align the sign bit first. */ if (stop < type.width) { unsigned bits = type.width - stop; LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits); input = LLVMBuildShl(builder, input, bits_val, ""); } /* * Align the LSB (with an arithmetic shift to preserve the sign) */ if (chan_desc.size < type.width) { unsigned bits = type.width - chan_desc.size; LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits); input = LLVMBuildAShr(builder, input, bits_val, ""); } /* * Type conversion */ if (type.floating) { input = LLVMBuildSIToFP(builder, input, bld->vec_type, ""); if (chan_desc.normalized) { double scale = 1.0 / ((1 << (chan_desc.size - 1)) - 1); LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale); input = LLVMBuildFMul(builder, input, scale_val, ""); /* * The formula above will produce value below -1.0 for most negative * value but everything seems happy with that hence disable for now. */ if (0) input = lp_build_max(bld, input, lp_build_const_vec(gallivm, type, -1.0f)); } } else if (chan_desc.pure_integer) { /* Nothing to do */ } else { /* FIXME */ assert(0); } break; case UTIL_FORMAT_TYPE_FLOAT: if (type.floating) { if (chan_desc.size == 16) { struct lp_type f16i_type = type; f16i_type.width /= 2; f16i_type.floating = 0; if (start) { input = LLVMBuildLShr(builder, input, lp_build_const_int_vec(gallivm, type, start), ""); } input = LLVMBuildTrunc(builder, input, lp_build_vec_type(gallivm, f16i_type), ""); input = lp_build_half_to_float(gallivm, input); } else { assert(start == 0); assert(stop == 32); assert(type.width == 32); } input = LLVMBuildBitCast(builder, input, bld->vec_type, ""); } else { /* FIXME */ assert(0); input = bld->undef; } break; case UTIL_FORMAT_TYPE_FIXED: if (type.floating) { double scale = 1.0 / ((1 << (chan_desc.size/2)) - 1); LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale); input = LLVMBuildSIToFP(builder, input, bld->vec_type, ""); input = LLVMBuildFMul(builder, input, scale_val, ""); } else { /* FIXME */ assert(0); input = bld->undef; } break; default: assert(0); input = bld->undef; break; } return input; }
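/*
 * The scalar equivalent of the UTIL_FORMAT_TYPE_UNSIGNED, normalized path
 * above for one packed pixel (illustration only; the helper name is
 * hypothetical and width is assumed to be less than 32). Shift the channel
 * down to the LSB, mask off the higher bits, then rescale to [0,1].
 */
static float
extract_unorm_chan_ref(unsigned packed, unsigned shift, unsigned width)
{
   unsigned mask = (1u << width) - 1;
   unsigned bits = (packed >> shift) & mask;

   return (float)bits / (float)mask;
}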
/** * Convert linear float values to srgb int values. * Several possibilities how to do this, e.g. * - use table (based on exponent/highest order mantissa bits) and do * linear interpolation (https://gist.github.com/rygorous/2203834) * - Chebyshev polynomial * - Approximation using reciprocals * - using int-to-float and float-to-int tricks for pow() * (http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent) * * @param src float (vector) value(s) to convert. */ static LLVMValueRef lp_build_linear_to_srgb(struct gallivm_state *gallivm, struct lp_type src_type, LLVMValueRef src) { LLVMBuilderRef builder = gallivm->builder; struct lp_build_context f32_bld; LLVMValueRef lin_thresh, lin, lin_const, is_linear, tmp, pow_final; lp_build_context_init(&f32_bld, gallivm, src_type); src = lp_build_clamp(&f32_bld, src, f32_bld.zero, f32_bld.one); if (0) { /* * using int-to-float and float-to-int trick for pow(). * This is much more accurate than necessary thanks to the correction, * but it most certainly makes no sense without rsqrt available. * Bonus points if you understand how this works... * All in all (including min/max clamp, conversion) 19 instructions. */ float exp_f = 2.0f / 3.0f; /* some compilers can't do exp2f, so this is exp2f(127.0f/exp_f - 127.0f) */ float exp2f_c = 1.30438178253e+19f; float coeff_f = 0.62996f; LLVMValueRef pow_approx, coeff, x2, exponent, pow_1, pow_2; struct lp_type int_type = lp_int_type(src_type); /* * First calculate approx x^8/12 */ exponent = lp_build_const_vec(gallivm, src_type, exp_f); coeff = lp_build_const_vec(gallivm, src_type, exp2f_c * powf(coeff_f, 1.0f / exp_f)); /* premultiply src */ tmp = lp_build_mul(&f32_bld, coeff, src); /* "log2" */ tmp = LLVMBuildBitCast(builder, tmp, lp_build_vec_type(gallivm, int_type), ""); tmp = lp_build_int_to_float(&f32_bld, tmp); /* multiply for pow */ tmp = lp_build_mul(&f32_bld, tmp, exponent); /* "exp2" */ pow_approx = lp_build_itrunc(&f32_bld, tmp); pow_approx = LLVMBuildBitCast(builder, pow_approx, lp_build_vec_type(gallivm, src_type), ""); /* * Since that pow was inaccurate (like 3 bits, though each sqrt step would * give another bit), compensate the error (which is why we chose another * exponent in the first place). */ /* x * x^(8/12) = x^(20/12) */ pow_1 = lp_build_mul(&f32_bld, pow_approx, src); /* x * x * x^(-4/12) = x^(20/12) */ /* Should avoid using rsqrt if it's not available, but * using x * x^(4/12) * x^(4/12) instead will change error weight */ tmp = lp_build_fast_rsqrt(&f32_bld, pow_approx); x2 = lp_build_mul(&f32_bld, src, src); pow_2 = lp_build_mul(&f32_bld, x2, tmp); /* average the values so the errors cancel out, compensate bias, * we also squeeze the 1.055 mul of the srgb conversion plus the 255.0 mul * for conversion to int in here */ tmp = lp_build_add(&f32_bld, pow_1, pow_2); coeff = lp_build_const_vec(gallivm, src_type, 1.0f / (3.0f * coeff_f) * 0.999852f * powf(1.055f * 255.0f, 4.0f)); pow_final = lp_build_mul(&f32_bld, tmp, coeff); /* x^(5/12) = rsqrt(rsqrt(x^20/12)) */ if (lp_build_fast_rsqrt_available(src_type)) { pow_final = lp_build_fast_rsqrt(&f32_bld, lp_build_fast_rsqrt(&f32_bld, pow_final)); } else { pow_final = lp_build_sqrt(&f32_bld, lp_build_sqrt(&f32_bld, pow_final)); } pow_final = lp_build_add(&f32_bld, pow_final, lp_build_const_vec(gallivm, src_type, -0.055f * 255.0f)); } else { /* * using "rational polynomial" approximation here. * Essentially y = a*x^0.375 + b*x^0.5 + c, with also * factoring in the 255.0 mul and the scaling mul. 
* (a is closer to actual value so has higher weight than b.) * Note: the constants are magic values. They were found empirically, * possibly could be improved but good enough (be VERY careful with * error metric if you'd want to tweak them, they also MUST fit with * the crappy polynomial above for srgb->linear since it is required * that each srgb value maps back to the same value). * This function has an error of max +-0.17 (and we'd only require +-0.6), * for the approximated srgb->linear values the error is naturally larger * (+-0.42) but still accurate enough (required +-0.5 essentially). * All in all (including min/max clamp, conversion) 15 instructions. * FMA would help (minus 2 instructions). */ LLVMValueRef x05, x0375, a_const, b_const, c_const, tmp2; if (lp_build_fast_rsqrt_available(src_type)) { tmp = lp_build_fast_rsqrt(&f32_bld, src); x05 = lp_build_mul(&f32_bld, src, tmp); } else { /* * I don't really expect this to be practical without rsqrt * but there's no reason for triple punishment so at least * save the otherwise resulting division and unnecessary mul... */ x05 = lp_build_sqrt(&f32_bld, src); } tmp = lp_build_mul(&f32_bld, x05, src); if (lp_build_fast_rsqrt_available(src_type)) { x0375 = lp_build_fast_rsqrt(&f32_bld, lp_build_fast_rsqrt(&f32_bld, tmp)); } else { x0375 = lp_build_sqrt(&f32_bld, lp_build_sqrt(&f32_bld, tmp)); } a_const = lp_build_const_vec(gallivm, src_type, 0.675f * 1.0622 * 255.0f); b_const = lp_build_const_vec(gallivm, src_type, 0.325f * 1.0622 * 255.0f); c_const = lp_build_const_vec(gallivm, src_type, -0.0620f * 255.0f); tmp = lp_build_mul(&f32_bld, a_const, x0375); tmp2 = lp_build_mul(&f32_bld, b_const, x05); tmp2 = lp_build_add(&f32_bld, tmp2, c_const); pow_final = lp_build_add(&f32_bld, tmp, tmp2); } /* linear part is easy */ lin_const = lp_build_const_vec(gallivm, src_type, 12.92f * 255.0f); lin = lp_build_mul(&f32_bld, src, lin_const); lin_thresh = lp_build_const_vec(gallivm, src_type, 0.0031308f); is_linear = lp_build_compare(gallivm, src_type, PIPE_FUNC_LEQUAL, src, lin_thresh); tmp = lp_build_select(&f32_bld, is_linear, lin, pow_final); f32_bld.type.sign = 0; return lp_build_iround(&f32_bld, tmp); }
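/*
 * Scalar sketch of the "rational polynomial" branch above (illustrative only;
 * the helper name is hypothetical and sqrtf() from <math.h> is assumed). The
 * sqrt chain builds x^0.5 and x^0.375, which are then combined with the same
 * magic constants as the vector code, and the linear segment is selected for
 * small inputs.
 */
static unsigned
linear_to_srgb8_ref(float x /* already clamped to [0,1] */)
{
   float val;

   if (x <= 0.0031308f) {
      val = x * 12.92f * 255.0f;                  /* linear segment */
   } else {
      float x05 = sqrtf(x);                       /* x^0.5 */
      float x0375 = sqrtf(sqrtf(x05 * x));        /* (x^1.5)^0.25 = x^0.375 */

      val = 0.675f * 1.0622f * 255.0f * x0375 +
            0.325f * 1.0622f * 255.0f * x05 +
            -0.0620f * 255.0f;
   }

   return (unsigned)(val + 0.5f);                 /* round to nearest */
}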
/** * Fetch a texels from a texture, returning them in SoA layout. * * \param type the desired return type for 'rgba'. The vector length * is the number of texels to fetch * \param aligned if the offset is guaranteed to be aligned to element width * * \param base_ptr points to the base of the texture mip tree. * \param offset offset to start of the texture image block. For non- * compressed formats, this simply is an offset to the texel. * For compressed formats, it is an offset to the start of the * compressed data block. * * \param i, j the sub-block pixel coordinates. For non-compressed formats * these will always be (0,0). For compressed formats, i will * be in [0, block_width-1] and j will be in [0, block_height-1]. * \param cache optional value pointing to a lp_build_format_cache structure */ void lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, const struct util_format_description *format_desc, struct lp_type type, boolean aligned, LLVMValueRef base_ptr, LLVMValueRef offset, LLVMValueRef i, LLVMValueRef j, LLVMValueRef cache, LLVMValueRef rgba_out[4]) { LLVMBuilderRef builder = gallivm->builder; enum pipe_format format = format_desc->format; struct lp_type fetch_type; if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB || format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB || format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) && format_desc->block.width == 1 && format_desc->block.height == 1 && format_desc->block.bits <= type.width && (format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT || format_desc->channel[0].size == 32 || format_desc->channel[0].size == 16)) { /* * The packed pixel fits into an element of the destination format. Put * the packed pixels into a vector and extract each component for all * vector elements in parallel. */ LLVMValueRef packed; /* * gather the texels from the texture * Ex: packed = {XYZW, XYZW, XYZW, XYZW} */ assert(format_desc->block.bits <= type.width); fetch_type = lp_type_uint(type.width); packed = lp_build_gather(gallivm, type.length, format_desc->block.bits, fetch_type, aligned, base_ptr, offset, FALSE); /* * convert texels to float rgba */ lp_build_unpack_rgba_soa(gallivm, format_desc, type, packed, rgba_out); return; } if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB) && format_desc->block.width == 1 && format_desc->block.height == 1 && format_desc->block.bits > type.width && ((format_desc->block.bits <= type.width * type.length && format_desc->channel[0].size <= type.width) || (format_desc->channel[0].size == 64 && format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT && type.floating))) { /* * Similar to above, but the packed pixel is larger than what fits * into an element of the destination format. The packed pixels will be * shuffled into SoA vectors appropriately, and then the extraction will * be done in parallel as much as possible. * Good for 16xn (n > 2) and 32xn (n > 1) formats, care is taken so * the gathered vectors can be shuffled easily (even with avx). * 64xn float -> 32xn float is handled too but it's a bit special as * it does the conversion pre-shuffle. 
*/ LLVMValueRef packed[4], dst[4], output[4], shuffles[LP_MAX_VECTOR_WIDTH/32]; struct lp_type fetch_type, gather_type = type; unsigned num_gather, fetch_width, i, j; struct lp_build_context bld; boolean fp64 = format_desc->channel[0].size == 64; lp_build_context_init(&bld, gallivm, type); assert(type.width == 32); assert(format_desc->block.bits > type.width); /* * First, figure out fetch order. */ fetch_width = util_next_power_of_two(format_desc->block.bits); num_gather = fetch_width / type.width; /* * fp64 are treated like fp32 except we fetch twice wide values * (as we shuffle after trunc). The shuffles for that work out * mostly fine (slightly suboptimal for 4-wide, perfect for AVX) * albeit we miss the potential opportunity for hw gather (as it * only handles native size). */ num_gather = fetch_width / type.width; gather_type.width *= num_gather; if (fp64) { num_gather /= 2; } gather_type.length /= num_gather; for (i = 0; i < num_gather; i++) { LLVMValueRef offsetr, shuf_vec; if(num_gather == 4) { for (j = 0; j < gather_type.length; j++) { unsigned idx = i + 4*j; shuffles[j] = lp_build_const_int32(gallivm, idx); } shuf_vec = LLVMConstVector(shuffles, gather_type.length); offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, ""); } else if (num_gather == 2) { assert(num_gather == 2); for (j = 0; j < gather_type.length; j++) { unsigned idx = i*2 + (j%2) + (j/2)*4; shuffles[j] = lp_build_const_int32(gallivm, idx); } shuf_vec = LLVMConstVector(shuffles, gather_type.length); offsetr = LLVMBuildShuffleVector(builder, offset, offset, shuf_vec, ""); } else { assert(num_gather == 1); offsetr = offset; } if (gather_type.length == 1) { LLVMValueRef zero = lp_build_const_int32(gallivm, 0); offsetr = LLVMBuildExtractElement(builder, offsetr, zero, ""); } /* * Determine whether to use float or int loads. This is mostly * to outsmart the (stupid) llvm int/float shuffle logic, we * don't really care much if the data is floats or ints... * But llvm will refuse to use single float shuffle with int data * and instead use 3 int shuffles instead, the code looks atrocious. * (Note bitcasts often won't help, as llvm is too smart to be * fooled by that.) * Nobody cares about simd float<->int domain transition penalties, * which usually don't even exist for shuffles anyway. * With 4x32bit (and 3x32bit) fetch, we use float vec (the data is * going into transpose, which is unpacks, so doesn't really matter * much). * With 2x32bit or 4x16bit fetch, we use float vec, since those * go into the weird channel separation shuffle. With floats, * this is (with 128bit vectors): * - 2 movq, 2 movhpd, 2 shufps * With ints it would be: * - 4 movq, 2 punpcklqdq, 4 pshufd, 2 blendw * I've seen texture functions increase in code size by 15% just due * to that (there's lots of such fetches in them...) * (We could chose a different gather order to improve this somewhat * for the int path, but it would basically just drop the blends, * so the float path with this order really is optimal.) * Albeit it is tricky sometimes llvm doesn't ignore the float->int * casts so must avoid them until we're done with the float shuffle... * 3x16bit formats (the same is also true for 3x8) are pretty bad but * there's nothing we can do about them (we could overallocate by * those couple bytes and use unaligned but pot sized load). * Note that this is very much x86 specific. I don't know if this * affect other archs at all. 
*/ if (num_gather > 1) { /* * We always want some float type here (with x86) * due to shuffles being float ones afterwards (albeit for * the num_gather == 4 case int should work fine too * (unless there's some problems with avx but not avx2). */ if (format_desc->channel[0].size == 64) { fetch_type = lp_type_float_vec(64, gather_type.width); } else { fetch_type = lp_type_int_vec(32, gather_type.width); } } else { /* type doesn't matter much */ if (format_desc->channel[0].type == UTIL_FORMAT_TYPE_FLOAT && (format_desc->channel[0].size == 32 || format_desc->channel[0].size == 64)) { fetch_type = lp_type_float(gather_type.width); } else { fetch_type = lp_type_uint(gather_type.width); } } /* Now finally gather the values */ packed[i] = lp_build_gather(gallivm, gather_type.length, format_desc->block.bits, fetch_type, aligned, base_ptr, offsetr, FALSE); if (fp64) { struct lp_type conv_type = type; conv_type.width *= 2; packed[i] = LLVMBuildBitCast(builder, packed[i], lp_build_vec_type(gallivm, conv_type), ""); packed[i] = LLVMBuildFPTrunc(builder, packed[i], bld.vec_type, ""); } } /* shuffle the gathered values to SoA */ if (num_gather == 2) { for (i = 0; i < num_gather; i++) { for (j = 0; j < type.length; j++) { unsigned idx = (j%2)*2 + (j/4)*4 + i; if ((j/2)%2) idx += type.length; shuffles[j] = lp_build_const_int32(gallivm, idx); } dst[i] = LLVMBuildShuffleVector(builder, packed[0], packed[1], LLVMConstVector(shuffles, type.length), ""); } } else if (num_gather == 4) { lp_build_transpose_aos(gallivm, lp_int_type(type), packed, dst); } else { assert(num_gather == 1); dst[0] = packed[0]; } /* * And finally unpack exactly as above, except that * chan shift is adjusted and the right vector selected. */ if (!fp64) { for (i = 0; i < num_gather; i++) { dst[i] = LLVMBuildBitCast(builder, dst[i], bld.int_vec_type, ""); } for (i = 0; i < format_desc->nr_channels; i++) { struct util_format_channel_description chan_desc = format_desc->channel[i]; unsigned blockbits = type.width; unsigned vec_nr = chan_desc.shift / type.width; chan_desc.shift %= type.width; output[i] = lp_build_extract_soa_chan(&bld, blockbits, FALSE, chan_desc, dst[vec_nr]); } } else { for (i = 0; i < format_desc->nr_channels; i++) { output[i] = dst[i]; } } lp_build_format_swizzle_soa(format_desc, &bld, output, rgba_out); return; } if (format == PIPE_FORMAT_R11G11B10_FLOAT || format == PIPE_FORMAT_R9G9B9E5_FLOAT) { /* * similar conceptually to above but requiring special * AoS packed -> SoA float conversion code. */ LLVMValueRef packed; struct lp_type fetch_type = lp_type_uint(type.width); assert(type.floating); assert(type.width == 32); packed = lp_build_gather(gallivm, type.length, format_desc->block.bits, fetch_type, aligned, base_ptr, offset, FALSE); if (format == PIPE_FORMAT_R11G11B10_FLOAT) { lp_build_r11g11b10_to_float(gallivm, packed, rgba_out); } else { lp_build_rgb9e5_to_float(gallivm, packed, rgba_out); } return; } if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS && format_desc->block.bits == 64) { /* * special case the format is 64 bits but we only require * 32bit (or 8bit) from each block. */ LLVMValueRef packed; struct lp_type fetch_type = lp_type_uint(type.width); if (format == PIPE_FORMAT_X32_S8X24_UINT) { /* * for stencil simply fix up offsets - could in fact change * base_ptr instead even outside the shader. 
*/ unsigned mask = (1 << 8) - 1; LLVMValueRef s_offset = lp_build_const_int_vec(gallivm, type, 4); offset = LLVMBuildAdd(builder, offset, s_offset, ""); packed = lp_build_gather(gallivm, type.length, 32, fetch_type, aligned, base_ptr, offset, FALSE); packed = LLVMBuildAnd(builder, packed, lp_build_const_int_vec(gallivm, type, mask), ""); } else { assert (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT); packed = lp_build_gather(gallivm, type.length, 32, fetch_type, aligned, base_ptr, offset, TRUE); packed = LLVMBuildBitCast(builder, packed, lp_build_vec_type(gallivm, type), ""); } /* for consistency with lp_build_unpack_rgba_soa() return sss1 or zzz1 */ rgba_out[0] = rgba_out[1] = rgba_out[2] = packed; rgba_out[3] = lp_build_const_vec(gallivm, type, 1.0f); return; } /* * Try calling lp_build_fetch_rgba_aos for all pixels. * Should only really hit subsampled, compressed * (for s3tc srgb too, for rgtc the unorm ones only) by now. * (This is invalid for plain 8unorm formats because we're lazy with * the swizzle since some results would arrive swizzled, some not.) */ if ((format_desc->layout != UTIL_FORMAT_LAYOUT_PLAIN) && (util_format_fits_8unorm(format_desc) || format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC) && type.floating && type.width == 32 && (type.length == 1 || (type.length % 4 == 0))) { struct lp_type tmp_type; struct lp_build_context bld; LLVMValueRef packed, rgba[4]; const struct util_format_description *flinear_desc; const struct util_format_description *frgba8_desc; unsigned chan; lp_build_context_init(&bld, gallivm, type); /* * Make sure the conversion in aos really only does convert to rgba8 * and not anything more (so use linear format, adjust type). */ flinear_desc = util_format_description(util_format_linear(format)); memset(&tmp_type, 0, sizeof tmp_type); tmp_type.width = 8; tmp_type.length = type.length * 4; tmp_type.norm = TRUE; packed = lp_build_fetch_rgba_aos(gallivm, flinear_desc, tmp_type, aligned, base_ptr, offset, i, j, cache); packed = LLVMBuildBitCast(builder, packed, bld.int_vec_type, ""); /* * The values are now packed so they match ordinary (srgb) RGBA8 format, * hence need to use matching format for unpack. */ frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_UNORM); if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) { assert(format_desc->layout == UTIL_FORMAT_LAYOUT_S3TC); frgba8_desc = util_format_description(PIPE_FORMAT_R8G8B8A8_SRGB); } lp_build_unpack_rgba_soa(gallivm, frgba8_desc, type, packed, rgba); /* * We converted 4 channels. Make sure llvm can drop unneeded ones * (luckily the rgba order is fixed, only LA needs special case). */ for (chan = 0; chan < 4; chan++) { enum pipe_swizzle swizzle = format_desc->swizzle[chan]; if (chan == 3 && util_format_is_luminance_alpha(format)) { swizzle = PIPE_SWIZZLE_W; } rgba_out[chan] = lp_build_swizzle_soa_channel(&bld, rgba, swizzle); } return; } /* * Fallback to calling lp_build_fetch_rgba_aos for each pixel. * * This is not the most efficient way of fetching pixels, as we * miss some opportunities to do vectorization, but this is * convenient for formats or scenarios for which there was no * opportunity or incentive to optimize. * * We do NOT want to end up here, this typically is quite terrible, * in particular if the formats have less than 4 channels. 
* * Right now, this should only be hit for: * - RGTC snorm formats * (those miss fast fetch functions hence they are terrible anyway) */ { unsigned k; struct lp_type tmp_type; LLVMValueRef aos_fetch[LP_MAX_VECTOR_WIDTH / 32]; if (gallivm_debug & GALLIVM_DEBUG_PERF) { debug_printf("%s: AoS fetch fallback for %s\n", __FUNCTION__, format_desc->short_name); } tmp_type = type; tmp_type.length = 4; /* * Note that vector transpose can be worse compared to insert/extract * for aos->soa conversion (for formats with 1 or 2 channels). However, * we should try to avoid getting here for just about all formats, so * don't bother. */ /* loop over number of pixels */ for(k = 0; k < type.length; ++k) { LLVMValueRef index = lp_build_const_int32(gallivm, k); LLVMValueRef offset_elem; LLVMValueRef i_elem, j_elem; offset_elem = LLVMBuildExtractElement(builder, offset, index, ""); i_elem = LLVMBuildExtractElement(builder, i, index, ""); j_elem = LLVMBuildExtractElement(builder, j, index, ""); /* Get a single float[4]={R,G,B,A} pixel */ aos_fetch[k] = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type, aligned, base_ptr, offset_elem, i_elem, j_elem, cache); } convert_to_soa(gallivm, aos_fetch, rgba_out, type); } }
/** * Interpolate the shader input attribute values. * This is called for each (group of) quad(s). */ static void attribs_update_simple(struct lp_build_interp_soa_context *bld, struct gallivm_state *gallivm, LLVMValueRef loop_iter, int start, int end) { LLVMBuilderRef builder = gallivm->builder; struct lp_build_context *coeff_bld = &bld->coeff_bld; struct lp_build_context *setup_bld = &bld->setup_bld; LLVMValueRef oow = NULL; unsigned attrib; LLVMValueRef pixoffx; LLVMValueRef pixoffy; LLVMValueRef ptr; /* could do this with code-generated passed in pixel offsets too */ assert(loop_iter); ptr = LLVMBuildGEP(builder, bld->xoffset_store, &loop_iter, 1, ""); pixoffx = LLVMBuildLoad(builder, ptr, ""); ptr = LLVMBuildGEP(builder, bld->yoffset_store, &loop_iter, 1, ""); pixoffy = LLVMBuildLoad(builder, ptr, ""); pixoffx = LLVMBuildFAdd(builder, pixoffx, lp_build_broadcast_scalar(coeff_bld, bld->x), ""); pixoffy = LLVMBuildFAdd(builder, pixoffy, lp_build_broadcast_scalar(coeff_bld, bld->y), ""); for (attrib = start; attrib < end; attrib++) { const unsigned mask = bld->mask[attrib]; const unsigned interp = bld->interp[attrib]; unsigned chan; for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { if (mask & (1 << chan)) { LLVMValueRef index; LLVMValueRef dadx = coeff_bld->zero; LLVMValueRef dady = coeff_bld->zero; LLVMValueRef a = coeff_bld->zero; index = lp_build_const_int32(gallivm, chan); switch (interp) { case LP_INTERP_PERSPECTIVE: /* fall-through */ case LP_INTERP_LINEAR: if (attrib == 0 && chan == 0) { dadx = coeff_bld->one; if (bld->pos_offset) { a = lp_build_const_vec(gallivm, coeff_bld->type, bld->pos_offset); } } else if (attrib == 0 && chan == 1) { dady = coeff_bld->one; if (bld->pos_offset) { a = lp_build_const_vec(gallivm, coeff_bld->type, bld->pos_offset); } } else { dadx = lp_build_extract_broadcast(gallivm, setup_bld->type, coeff_bld->type, bld->dadxaos[attrib], index); dady = lp_build_extract_broadcast(gallivm, setup_bld->type, coeff_bld->type, bld->dadyaos[attrib], index); a = lp_build_extract_broadcast(gallivm, setup_bld->type, coeff_bld->type, bld->a0aos[attrib], index); } /* * a = a0 + (x * dadx + y * dady) */ a = lp_build_fmuladd(builder, dadx, pixoffx, a); a = lp_build_fmuladd(builder, dady, pixoffy, a); if (interp == LP_INTERP_PERSPECTIVE) { if (oow == NULL) { LLVMValueRef w = bld->attribs[0][3]; assert(attrib != 0); assert(bld->mask[0] & TGSI_WRITEMASK_W); oow = lp_build_rcp(coeff_bld, w); } a = lp_build_mul(coeff_bld, a, oow); } break; case LP_INTERP_CONSTANT: case LP_INTERP_FACING: a = lp_build_extract_broadcast(gallivm, setup_bld->type, coeff_bld->type, bld->a0aos[attrib], index); break; case LP_INTERP_POSITION: assert(attrib > 0); a = bld->attribs[0][chan]; break; default: assert(0); break; } if ((attrib == 0) && (chan == 2)){ /* FIXME: Depth values can exceed 1.0, due to the fact that * setup interpolation coefficients refer to (0,0) which causes * precision loss. So we must clamp to 1.0 here to avoid artifacts */ a = lp_build_min(coeff_bld, a, coeff_bld->one); } bld->attribs[attrib][chan] = a; } } } }
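/*
 * Scalar form of the per-channel interpolation performed above for one pixel
 * (illustration only; the helper name is hypothetical). 'w' is the
 * interpolated position w used for the perspective-correct case (oow = 1/w).
 */
static float
interp_attrib_ref(float a0, float dadx, float dady,
                  float x, float y, float w, int perspective)
{
   float a = a0 + (x * dadx + y * dady);   /* plane equation evaluation */

   if (perspective)
      a *= 1.0f / w;                       /* perspective divide */

   return a;
}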
/** * Sample a single texture image with nearest sampling. * If sampling a cube texture, r = cube face in [0,5]. * Return filtered color as two vectors of 16-bit fixed point values. */ static void lp_build_sample_image_nearest(struct lp_build_sample_context *bld, LLVMValueRef int_size, LLVMValueRef row_stride_vec, LLVMValueRef img_stride_vec, LLVMValueRef data_ptr, LLVMValueRef s, LLVMValueRef t, LLVMValueRef r, LLVMValueRef *colors_lo, LLVMValueRef *colors_hi) { const unsigned dims = bld->dims; LLVMBuilderRef builder = bld->gallivm->builder; struct lp_build_context i32, h16, u8n; LLVMTypeRef i32_vec_type, h16_vec_type, u8n_vec_type; LLVMValueRef i32_c8; LLVMValueRef width_vec, height_vec, depth_vec; LLVMValueRef s_ipart, t_ipart = NULL, r_ipart = NULL; LLVMValueRef x_stride; LLVMValueRef x_offset, offset; LLVMValueRef x_subcoord, y_subcoord, z_subcoord; lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32)); lp_build_context_init(&h16, bld->gallivm, lp_type_ufixed(16)); lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8)); i32_vec_type = lp_build_vec_type(bld->gallivm, i32.type); h16_vec_type = lp_build_vec_type(bld->gallivm, h16.type); u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type); lp_build_extract_image_sizes(bld, bld->int_size_type, bld->int_coord_type, int_size, &width_vec, &height_vec, &depth_vec); if (bld->static_state->normalized_coords) { LLVMValueRef scaled_size; LLVMValueRef flt_size; /* scale size by 256 (8 fractional bits) */ scaled_size = lp_build_shl_imm(&bld->int_size_bld, int_size, 8); flt_size = lp_build_int_to_float(&bld->float_size_bld, scaled_size); lp_build_unnormalized_coords(bld, flt_size, &s, &t, &r); } else { /* scale coords by 256 (8 fractional bits) */ s = lp_build_mul_imm(&bld->coord_bld, s, 256); if (dims >= 2) t = lp_build_mul_imm(&bld->coord_bld, t, 256); if (dims >= 3) r = lp_build_mul_imm(&bld->coord_bld, r, 256); } /* convert float to int */ s = LLVMBuildFPToSI(builder, s, i32_vec_type, ""); if (dims >= 2) t = LLVMBuildFPToSI(builder, t, i32_vec_type, ""); if (dims >= 3) r = LLVMBuildFPToSI(builder, r, i32_vec_type, ""); /* compute floor (shift right 8) */ i32_c8 = lp_build_const_int_vec(bld->gallivm, i32.type, 8); s_ipart = LLVMBuildAShr(builder, s, i32_c8, ""); if (dims >= 2) t_ipart = LLVMBuildAShr(builder, t, i32_c8, ""); if (dims >= 3) r_ipart = LLVMBuildAShr(builder, r, i32_c8, ""); /* get pixel, row, image strides */ x_stride = lp_build_const_vec(bld->gallivm, bld->int_coord_bld.type, bld->format_desc->block.bits/8); /* Do texcoord wrapping, compute texel offset */ lp_build_sample_wrap_nearest_int(bld, bld->format_desc->block.width, s_ipart, width_vec, x_stride, bld->static_state->pot_width, bld->static_state->wrap_s, &x_offset, &x_subcoord); offset = x_offset; if (dims >= 2) { LLVMValueRef y_offset; lp_build_sample_wrap_nearest_int(bld, bld->format_desc->block.height, t_ipart, height_vec, row_stride_vec, bld->static_state->pot_height, bld->static_state->wrap_t, &y_offset, &y_subcoord); offset = lp_build_add(&bld->int_coord_bld, offset, y_offset); if (dims >= 3) { LLVMValueRef z_offset; lp_build_sample_wrap_nearest_int(bld, 1, /* block length (depth) */ r_ipart, depth_vec, img_stride_vec, bld->static_state->pot_height, bld->static_state->wrap_r, &z_offset, &z_subcoord); offset = lp_build_add(&bld->int_coord_bld, offset, z_offset); } else if (bld->static_state->target == PIPE_TEXTURE_CUBE) { LLVMValueRef z_offset; /* The r coord is the cube face in [0,5] */ z_offset = lp_build_mul(&bld->int_coord_bld, r, img_stride_vec); 
offset = lp_build_add(&bld->int_coord_bld, offset, z_offset); } } /* * Fetch the pixels as 4 x 32bit (rgba order might differ): * * rgba0 rgba1 rgba2 rgba3 * * bit cast them into 16 x u8 * * r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3 * * unpack them into two 8 x i16: * * r0 g0 b0 a0 r1 g1 b1 a1 * r2 g2 b2 a2 r3 g3 b3 a3 * * The higher 8 bits of the resulting elements will be zero. */ { LLVMValueRef rgba8; if (util_format_is_rgba8_variant(bld->format_desc)) { /* * Given the format is a rgba8, just read the pixels as is, * without any swizzling. Swizzling will be done later. */ rgba8 = lp_build_gather(bld->gallivm, bld->texel_type.length, bld->format_desc->block.bits, bld->texel_type.width, data_ptr, offset); rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, ""); } else { rgba8 = lp_build_fetch_rgba_aos(bld->gallivm, bld->format_desc, u8n.type, data_ptr, offset, x_subcoord, y_subcoord); } /* Expand one 4*rgba8 to two 2*rgba16 */ lp_build_unpack2(bld->gallivm, u8n.type, h16.type, rgba8, colors_lo, colors_hi); } }
/** * Sample a single texture image with (bi-)(tri-)linear sampling. * Return filtered color as two vectors of 16-bit fixed point values. */ static void lp_build_sample_image_linear(struct lp_build_sample_context *bld, LLVMValueRef int_size, LLVMValueRef row_stride_vec, LLVMValueRef img_stride_vec, LLVMValueRef data_ptr, LLVMValueRef s, LLVMValueRef t, LLVMValueRef r, LLVMValueRef *colors_lo, LLVMValueRef *colors_hi) { const unsigned dims = bld->dims; LLVMBuilderRef builder = bld->gallivm->builder; struct lp_build_context i32, h16, u8n; LLVMTypeRef i32_vec_type, h16_vec_type, u8n_vec_type; LLVMValueRef i32_c8, i32_c128, i32_c255; LLVMValueRef width_vec, height_vec, depth_vec; LLVMValueRef s_ipart, s_fpart, s_fpart_lo, s_fpart_hi; LLVMValueRef t_ipart = NULL, t_fpart = NULL, t_fpart_lo = NULL, t_fpart_hi = NULL; LLVMValueRef r_ipart = NULL, r_fpart = NULL, r_fpart_lo = NULL, r_fpart_hi = NULL; LLVMValueRef x_stride, y_stride, z_stride; LLVMValueRef x_offset0, x_offset1; LLVMValueRef y_offset0, y_offset1; LLVMValueRef z_offset0, z_offset1; LLVMValueRef offset[2][2][2]; /* [z][y][x] */ LLVMValueRef x_subcoord[2], y_subcoord[2], z_subcoord[2]; LLVMValueRef neighbors_lo[2][2][2]; /* [z][y][x] */ LLVMValueRef neighbors_hi[2][2][2]; /* [z][y][x] */ LLVMValueRef packed_lo, packed_hi; unsigned x, y, z; unsigned i, j, k; unsigned numj, numk; lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32)); lp_build_context_init(&h16, bld->gallivm, lp_type_ufixed(16)); lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8)); i32_vec_type = lp_build_vec_type(bld->gallivm, i32.type); h16_vec_type = lp_build_vec_type(bld->gallivm, h16.type); u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type); lp_build_extract_image_sizes(bld, bld->int_size_type, bld->int_coord_type, int_size, &width_vec, &height_vec, &depth_vec); if (bld->static_state->normalized_coords) { LLVMValueRef scaled_size; LLVMValueRef flt_size; /* scale size by 256 (8 fractional bits) */ scaled_size = lp_build_shl_imm(&bld->int_size_bld, int_size, 8); flt_size = lp_build_int_to_float(&bld->float_size_bld, scaled_size); lp_build_unnormalized_coords(bld, flt_size, &s, &t, &r); } else { /* scale coords by 256 (8 fractional bits) */ s = lp_build_mul_imm(&bld->coord_bld, s, 256); if (dims >= 2) t = lp_build_mul_imm(&bld->coord_bld, t, 256); if (dims >= 3) r = lp_build_mul_imm(&bld->coord_bld, r, 256); } /* convert float to int */ s = LLVMBuildFPToSI(builder, s, i32_vec_type, ""); if (dims >= 2) t = LLVMBuildFPToSI(builder, t, i32_vec_type, ""); if (dims >= 3) r = LLVMBuildFPToSI(builder, r, i32_vec_type, ""); /* subtract 0.5 (add -128) */ i32_c128 = lp_build_const_int_vec(bld->gallivm, i32.type, -128); s = LLVMBuildAdd(builder, s, i32_c128, ""); if (dims >= 2) { t = LLVMBuildAdd(builder, t, i32_c128, ""); } if (dims >= 3) { r = LLVMBuildAdd(builder, r, i32_c128, ""); } /* compute floor (shift right 8) */ i32_c8 = lp_build_const_int_vec(bld->gallivm, i32.type, 8); s_ipart = LLVMBuildAShr(builder, s, i32_c8, ""); if (dims >= 2) t_ipart = LLVMBuildAShr(builder, t, i32_c8, ""); if (dims >= 3) r_ipart = LLVMBuildAShr(builder, r, i32_c8, ""); /* compute fractional part (AND with 0xff) */ i32_c255 = lp_build_const_int_vec(bld->gallivm, i32.type, 255); s_fpart = LLVMBuildAnd(builder, s, i32_c255, ""); if (dims >= 2) t_fpart = LLVMBuildAnd(builder, t, i32_c255, ""); if (dims >= 3) r_fpart = LLVMBuildAnd(builder, r, i32_c255, ""); /* get pixel, row and image strides */ x_stride = lp_build_const_vec(bld->gallivm, bld->int_coord_bld.type, 
bld->format_desc->block.bits/8); y_stride = row_stride_vec; z_stride = img_stride_vec; /* do texcoord wrapping and compute texel offsets */ lp_build_sample_wrap_linear_int(bld, bld->format_desc->block.width, s_ipart, width_vec, x_stride, bld->static_state->pot_width, bld->static_state->wrap_s, &x_offset0, &x_offset1, &x_subcoord[0], &x_subcoord[1]); for (z = 0; z < 2; z++) { for (y = 0; y < 2; y++) { offset[z][y][0] = x_offset0; offset[z][y][1] = x_offset1; } } if (dims >= 2) { lp_build_sample_wrap_linear_int(bld, bld->format_desc->block.height, t_ipart, height_vec, y_stride, bld->static_state->pot_height, bld->static_state->wrap_t, &y_offset0, &y_offset1, &y_subcoord[0], &y_subcoord[1]); for (z = 0; z < 2; z++) { for (x = 0; x < 2; x++) { offset[z][0][x] = lp_build_add(&bld->int_coord_bld, offset[z][0][x], y_offset0); offset[z][1][x] = lp_build_add(&bld->int_coord_bld, offset[z][1][x], y_offset1); } } } if (dims >= 3) { lp_build_sample_wrap_linear_int(bld, bld->format_desc->block.height, r_ipart, depth_vec, z_stride, bld->static_state->pot_depth, bld->static_state->wrap_r, &z_offset0, &z_offset1, &z_subcoord[0], &z_subcoord[1]); for (y = 0; y < 2; y++) { for (x = 0; x < 2; x++) { offset[0][y][x] = lp_build_add(&bld->int_coord_bld, offset[0][y][x], z_offset0); offset[1][y][x] = lp_build_add(&bld->int_coord_bld, offset[1][y][x], z_offset1); } } } else if (bld->static_state->target == PIPE_TEXTURE_CUBE) { LLVMValueRef z_offset; z_offset = lp_build_mul(&bld->int_coord_bld, r, img_stride_vec); for (y = 0; y < 2; y++) { for (x = 0; x < 2; x++) { /* The r coord is the cube face in [0,5] */ offset[0][y][x] = lp_build_add(&bld->int_coord_bld, offset[0][y][x], z_offset); } } } /* * Transform 4 x i32 in * * s_fpart = {s0, s1, s2, s3} * * into 8 x i16 * * s_fpart = {00, s0, 00, s1, 00, s2, 00, s3} * * into two 8 x i16 * * s_fpart_lo = {s0, s0, s0, s0, s1, s1, s1, s1} * s_fpart_hi = {s2, s2, s2, s2, s3, s3, s3, s3} * * and likewise for t_fpart. There is no risk of loosing precision here * since the fractional parts only use the lower 8bits. 
*/ s_fpart = LLVMBuildBitCast(builder, s_fpart, h16_vec_type, ""); if (dims >= 2) t_fpart = LLVMBuildBitCast(builder, t_fpart, h16_vec_type, ""); if (dims >= 3) r_fpart = LLVMBuildBitCast(builder, r_fpart, h16_vec_type, ""); { LLVMTypeRef elem_type = LLVMInt32TypeInContext(bld->gallivm->context); LLVMValueRef shuffles_lo[LP_MAX_VECTOR_LENGTH]; LLVMValueRef shuffles_hi[LP_MAX_VECTOR_LENGTH]; LLVMValueRef shuffle_lo; LLVMValueRef shuffle_hi; for (j = 0; j < h16.type.length; j += 4) { #ifdef PIPE_ARCH_LITTLE_ENDIAN unsigned subindex = 0; #else unsigned subindex = 1; #endif LLVMValueRef index; index = LLVMConstInt(elem_type, j/2 + subindex, 0); for (i = 0; i < 4; ++i) shuffles_lo[j + i] = index; index = LLVMConstInt(elem_type, h16.type.length/2 + j/2 + subindex, 0); for (i = 0; i < 4; ++i) shuffles_hi[j + i] = index; } shuffle_lo = LLVMConstVector(shuffles_lo, h16.type.length); shuffle_hi = LLVMConstVector(shuffles_hi, h16.type.length); s_fpart_lo = LLVMBuildShuffleVector(builder, s_fpart, h16.undef, shuffle_lo, ""); s_fpart_hi = LLVMBuildShuffleVector(builder, s_fpart, h16.undef, shuffle_hi, ""); if (dims >= 2) { t_fpart_lo = LLVMBuildShuffleVector(builder, t_fpart, h16.undef, shuffle_lo, ""); t_fpart_hi = LLVMBuildShuffleVector(builder, t_fpart, h16.undef, shuffle_hi, ""); } if (dims >= 3) { r_fpart_lo = LLVMBuildShuffleVector(builder, r_fpart, h16.undef, shuffle_lo, ""); r_fpart_hi = LLVMBuildShuffleVector(builder, r_fpart, h16.undef, shuffle_hi, ""); } } /* * Fetch the pixels as 4 x 32bit (rgba order might differ): * * rgba0 rgba1 rgba2 rgba3 * * bit cast them into 16 x u8 * * r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3 * * unpack them into two 8 x i16: * * r0 g0 b0 a0 r1 g1 b1 a1 * r2 g2 b2 a2 r3 g3 b3 a3 * * The higher 8 bits of the resulting elements will be zero. */ numj = 1 + (dims >= 2); numk = 1 + (dims >= 3); for (k = 0; k < numk; k++) { for (j = 0; j < numj; j++) { for (i = 0; i < 2; i++) { LLVMValueRef rgba8; if (util_format_is_rgba8_variant(bld->format_desc)) { /* * Given the format is a rgba8, just read the pixels as is, * without any swizzling. Swizzling will be done later. */ rgba8 = lp_build_gather(bld->gallivm, bld->texel_type.length, bld->format_desc->block.bits, bld->texel_type.width, data_ptr, offset[k][j][i]); rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, ""); } else { rgba8 = lp_build_fetch_rgba_aos(bld->gallivm, bld->format_desc, u8n.type, data_ptr, offset[k][j][i], x_subcoord[i], y_subcoord[j]); } /* Expand one 4*rgba8 to two 2*rgba16 */ lp_build_unpack2(bld->gallivm, u8n.type, h16.type, rgba8, &neighbors_lo[k][j][i], &neighbors_hi[k][j][i]); } } } /* * Linear interpolation with 8.8 fixed point. 
*/ if (dims == 1) { /* 1-D lerp */ packed_lo = lp_build_lerp(&h16, s_fpart_lo, neighbors_lo[0][0][0], neighbors_lo[0][0][1]); packed_hi = lp_build_lerp(&h16, s_fpart_hi, neighbors_hi[0][0][0], neighbors_hi[0][0][1]); } else { /* 2-D lerp */ packed_lo = lp_build_lerp_2d(&h16, s_fpart_lo, t_fpart_lo, neighbors_lo[0][0][0], neighbors_lo[0][0][1], neighbors_lo[0][1][0], neighbors_lo[0][1][1]); packed_hi = lp_build_lerp_2d(&h16, s_fpart_hi, t_fpart_hi, neighbors_hi[0][0][0], neighbors_hi[0][0][1], neighbors_hi[0][1][0], neighbors_hi[0][1][1]); if (dims >= 3) { LLVMValueRef packed_lo2, packed_hi2; /* lerp in the second z slice */ packed_lo2 = lp_build_lerp_2d(&h16, s_fpart_lo, t_fpart_lo, neighbors_lo[1][0][0], neighbors_lo[1][0][1], neighbors_lo[1][1][0], neighbors_lo[1][1][1]); packed_hi2 = lp_build_lerp_2d(&h16, s_fpart_hi, t_fpart_hi, neighbors_hi[1][0][0], neighbors_hi[1][0][1], neighbors_hi[1][1][0], neighbors_hi[1][1][1]); /* interp between two z slices */ packed_lo = lp_build_lerp(&h16, r_fpart_lo, packed_lo, packed_lo2); packed_hi = lp_build_lerp(&h16, r_fpart_hi, packed_hi, packed_hi2); } } *colors_lo = packed_lo; *colors_hi = packed_hi; }
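/*
 * Illustrative scalar version of the 8.8 fixed point lerp that the code above
 * applies to whole vectors of 16-bit channel values (hypothetical helpers,
 * not generated code). 'frac' is the 8-bit fractional weight (0..255); a and
 * b are 8-bit channel values already expanded to 16 bits.
 */
static unsigned short
lerp_8_8_ref(unsigned short a, unsigned short b, unsigned frac)
{
   /* essentially a + frac * (b - a), with the product shifted back down */
   return (unsigned short)(a + ((((int)b - (int)a) * (int)frac) >> 8));
}

static unsigned short
lerp_2d_8_8_ref(unsigned short v00, unsigned short v01,
                unsigned short v10, unsigned short v11,
                unsigned s_frac, unsigned t_frac)
{
   /* bilinear: lerp along s for both rows, then lerp the two rows along t */
   unsigned short row0 = lerp_8_8_ref(v00, v01, s_frac);
   unsigned short row1 = lerp_8_8_ref(v10, v11, s_frac);

   return lerp_8_8_ref(row0, row1, t_frac);
}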
/** * Unpack several pixels in SoA. * * It takes a vector of packed pixels: * * packed = {P0, P1, P2, P3, ..., Pn} * * And will produce four vectors: * * red = {R0, R1, R2, R3, ..., Rn} * green = {G0, G1, G2, G3, ..., Gn} * blue = {B0, B1, B2, B3, ..., Bn} * alpha = {A0, A1, A2, A3, ..., An} * * It requires that a packed pixel fits into an element of the output * channels. The common case is when converting pixel with a depth of 32 bit or * less into floats. * * \param format_desc the format of the 'packed' incoming pixel vector * \param type the desired type for rgba_out (type.length = n, above) * \param packed the incoming vector of packed pixels * \param rgba_out returns the SoA R,G,B,A vectors */ void lp_build_unpack_rgba_soa(struct gallivm_state *gallivm, const struct util_format_description *format_desc, struct lp_type type, LLVMValueRef packed, LLVMValueRef rgba_out[4]) { LLVMBuilderRef builder = gallivm->builder; struct lp_build_context bld; LLVMValueRef inputs[4]; unsigned chan; assert(format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN); assert(format_desc->block.width == 1); assert(format_desc->block.height == 1); assert(format_desc->block.bits <= type.width); /* FIXME: Support more output types */ assert(type.width == 32); lp_build_context_init(&bld, gallivm, type); /* Decode the input vector components */ for (chan = 0; chan < format_desc->nr_channels; ++chan) { const unsigned width = format_desc->channel[chan].size; const unsigned start = format_desc->channel[chan].shift; const unsigned stop = start + width; LLVMValueRef input; input = packed; switch(format_desc->channel[chan].type) { case UTIL_FORMAT_TYPE_VOID: input = lp_build_undef(gallivm, type); break; case UTIL_FORMAT_TYPE_UNSIGNED: /* * Align the LSB */ if (start) { input = LLVMBuildLShr(builder, input, lp_build_const_int_vec(gallivm, type, start), ""); } /* * Zero the MSBs */ if (stop < format_desc->block.bits) { unsigned mask = ((unsigned long long)1 << width) - 1; input = LLVMBuildAnd(builder, input, lp_build_const_int_vec(gallivm, type, mask), ""); } /* * Type conversion */ if (type.floating) { if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB) { assert(width == 8); if (format_desc->swizzle[3] == chan) { input = lp_build_unsigned_norm_to_float(gallivm, width, type, input); } else { struct lp_type conv_type = lp_uint_type(type); input = lp_build_srgb_to_linear(gallivm, conv_type, input); } } else { if(format_desc->channel[chan].normalized) input = lp_build_unsigned_norm_to_float(gallivm, width, type, input); else input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(gallivm, type), ""); } } else if (format_desc->channel[chan].pure_integer) { /* Nothing to do */ } else { /* FIXME */ assert(0); } break; case UTIL_FORMAT_TYPE_SIGNED: /* * Align the sign bit first. 
*/ if (stop < type.width) { unsigned bits = type.width - stop; LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits); input = LLVMBuildShl(builder, input, bits_val, ""); } /* * Align the LSB (with an arithmetic shift to preserve the sign) */ if (format_desc->channel[chan].size < type.width) { unsigned bits = type.width - format_desc->channel[chan].size; LLVMValueRef bits_val = lp_build_const_int_vec(gallivm, type, bits); input = LLVMBuildAShr(builder, input, bits_val, ""); } /* * Type conversion */ if (type.floating) { input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(gallivm, type), ""); if (format_desc->channel[chan].normalized) { double scale = 1.0 / ((1 << (format_desc->channel[chan].size - 1)) - 1); LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale); input = LLVMBuildFMul(builder, input, scale_val, ""); /* the formula above will produce value below -1.0 for most negative * value but everything seems happy with that hence disable for now */ if (0) input = lp_build_max(&bld, input, lp_build_const_vec(gallivm, type, -1.0f)); } } else if (format_desc->channel[chan].pure_integer) { /* Nothing to do */ } else { /* FIXME */ assert(0); } break; case UTIL_FORMAT_TYPE_FLOAT: if (type.floating) { assert(start == 0); assert(stop == 32); assert(type.width == 32); input = LLVMBuildBitCast(builder, input, lp_build_vec_type(gallivm, type), ""); } else { /* FIXME */ assert(0); input = lp_build_undef(gallivm, type); } break; case UTIL_FORMAT_TYPE_FIXED: if (type.floating) { double scale = 1.0 / ((1 << (format_desc->channel[chan].size/2)) - 1); LLVMValueRef scale_val = lp_build_const_vec(gallivm, type, scale); input = LLVMBuildSIToFP(builder, input, lp_build_vec_type(gallivm, type), ""); input = LLVMBuildFMul(builder, input, scale_val, ""); } else { /* FIXME */ assert(0); input = lp_build_undef(gallivm, type); } break; default: assert(0); input = lp_build_undef(gallivm, type); break; } inputs[chan] = input; } lp_build_format_swizzle_soa(format_desc, &bld, inputs, rgba_out); }
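/*
 * Illustrative scalar sketch (not part of the driver) of what the unsigned
 * and signed normalized branches above compute per channel, assuming the
 * channel occupies bits [start, start + width) of a 32-bit packed pixel and
 * width < 32.  The _ref names are hypothetical.
 */
#if 0
#include <stdint.h>

/* UTIL_FORMAT_TYPE_UNSIGNED, normalized: shift to the LSB, mask off the
 * MSBs, then scale to [0, 1] */
static float
unpack_unorm_chan_ref(uint32_t packed, unsigned start, unsigned width)
{
   uint32_t bits = (packed >> start) & ((1u << width) - 1);
   return (float)bits / (float)((1u << width) - 1);
}

/* UTIL_FORMAT_TYPE_SIGNED, normalized: move the sign bit to bit 31, then
 * arithmetic-shift back to both align the LSB and sign-extend, then scale
 * to [-1, 1] (the most negative code lands slightly below -1, as the
 * comment above notes) */
static float
unpack_snorm_chan_ref(uint32_t packed, unsigned start, unsigned width)
{
   int32_t bits = (int32_t)(packed << (32 - start - width)) >> (32 - width);
   return (float)bits / (float)((1 << (width - 1)) - 1);
}
#endif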
/**
 * Inverse of lp_build_clamped_float_to_unsigned_norm above.
 * Ex: src = { i32, i32, i32, i32 } with values in range [0, 2^src_width-1]
 * return {float, float, float, float} with values in range [0, 1].
 */
LLVMValueRef
lp_build_unsigned_norm_to_float(struct gallivm_state *gallivm,
                                unsigned src_width,
                                struct lp_type dst_type,
                                LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef vec_type = lp_build_vec_type(gallivm, dst_type);
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, dst_type);
   LLVMValueRef bias_;
   LLVMValueRef res;
   unsigned mantissa;
   unsigned n;
   unsigned long long ubound;
   unsigned long long mask;
   double scale;
   double bias;

   assert(dst_type.floating);

   mantissa = lp_mantissa(dst_type);

   if (src_width <= (mantissa + 1)) {
      /*
       * The source width fits what can be represented in floating
       * point (i.e., mantissa + 1 bits). So do a straight multiplication
       * followed by casting. No further rounding is necessary.
       */

      scale = 1.0/(double)((1ULL << src_width) - 1);
      res = LLVMBuildSIToFP(builder, src, vec_type, "");
      res = LLVMBuildFMul(builder, res,
                          lp_build_const_vec(gallivm, dst_type, scale), "");
      return res;
   }
   else {
      /*
       * The source width exceeds what can be represented in floating
       * point. So truncate the incoming values.
       */

      n = MIN2(mantissa, src_width);

      ubound = ((unsigned long long)1 << n);
      mask = ubound - 1;
      scale = (double)ubound/mask;
      bias = (double)((unsigned long long)1 << (mantissa - n));

      res = src;

      if (src_width > mantissa) {
         int shift = src_width - mantissa;
         res = LLVMBuildLShr(builder, res,
                             lp_build_const_int_vec(gallivm, dst_type, shift), "");
      }

      bias_ = lp_build_const_vec(gallivm, dst_type, bias);

      res = LLVMBuildOr(builder,
                        res,
                        LLVMBuildBitCast(builder, bias_, int_vec_type, ""), "");

      res = LLVMBuildBitCast(builder, res, vec_type, "");

      res = LLVMBuildFSub(builder, res, bias_, "");
      res = LLVMBuildFMul(builder, res,
                          lp_build_const_vec(gallivm, dst_type, scale), "");
   }

   return res;
}
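/*
 * Illustrative scalar model (not part of the driver) of the wide-source
 * path above (src_width > mantissa + 1), assuming IEEE-754 binary32 with a
 * 23-bit mantissa.  OR-ing the integer bits into the mantissa of the
 * constant `bias' yields exactly bias + x/ubound; subtracting bias and
 * multiplying by ubound/mask then gives x/(2^n - 1) in [0, 1].  The _ref
 * name is hypothetical.
 */
#if 0
#include <stdint.h>
#include <string.h>

static float
unorm_to_float_wide_ref(uint32_t src, unsigned src_width)
{
   const unsigned mantissa = 23;
   unsigned n = src_width < mantissa ? src_width : mantissa;
   uint64_t ubound = 1ull << n;
   uint64_t mask = ubound - 1;
   float scale = (float)((double)ubound / (double)mask);
   float bias = (float)(1ull << (mantissa - n));
   uint32_t bias_bits, res_bits;
   float res;

   if (src_width > mantissa)
      src >>= src_width - mantissa;        /* truncate the extra precision */

   memcpy(&bias_bits, &bias, sizeof bias_bits);
   res_bits = src | bias_bits;             /* plant src in the mantissa */
   memcpy(&res, &res_bits, sizeof res);

   return (res - bias) * scale;            /* remove bias, scale to [0, 1] */
}
#endif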
/**
 * Special case for converting clamped IEEE-754 floats to unsigned norms.
 *
 * The mathematical voodoo below may seem excessive but it is actually
 * paramount we do it this way for several reasons. First, there is no
 * single-precision FP to unsigned integer conversion Intel SSE instruction.
 * Second, even if there were, since the FP mantissa takes only a fraction of
 * the register bits, the typical scale-and-cast approach would require double
 * precision for accurate results, and therefore half the throughput.
 *
 * Although the result values can be scaled to an arbitrary bit width specified
 * by dst_width, the actual result type will have the same width as the input.
 *
 * Ex: src = { float, float, float, float }
 * return { i32, i32, i32, i32 } where each value is in [0, 2^dst_width-1].
 */
LLVMValueRef
lp_build_clamped_float_to_unsigned_norm(LLVMBuilderRef builder,
                                        struct lp_type src_type,
                                        unsigned dst_width,
                                        LLVMValueRef src)
{
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(src_type);
   LLVMValueRef res;
   unsigned mantissa;
   unsigned n;
   unsigned long long ubound;
   unsigned long long mask;
   double scale;
   double bias;

   assert(src_type.floating);

   mantissa = lp_mantissa(src_type);

   /* We cannot carry more bits than the mantissa */
   n = MIN2(mantissa, dst_width);

   /* These magic coefficients make the desired result appear in the least
    * significant bits of the mantissa.
    */
   ubound = ((unsigned long long)1 << n);
   mask = ubound - 1;
   scale = (double)mask/ubound;
   bias = (double)((unsigned long long)1 << (mantissa - n));

   res = LLVMBuildFMul(builder, src, lp_build_const_vec(src_type, scale), "");
   res = LLVMBuildFAdd(builder, res, lp_build_const_vec(src_type, bias), "");
   res = LLVMBuildBitCast(builder, res, int_vec_type, "");

   if(dst_width > n) {
      int shift = dst_width - n;
      res = LLVMBuildShl(builder, res,
                         lp_build_const_int_vec(src_type, shift), "");

      /* TODO: Fill in the empty lower bits for additional precision? */
      /* YES: this fixes progs/trivial/tri-z-eq.c.
       * Otherwise vertex Z=1.0 values get converted to something like
       * 0xfffffb00 and the test for equality with 0xffffffff fails.
       */
#if 0
      {
         LLVMValueRef msb;
         msb = LLVMBuildLShr(builder, res,
                             lp_build_const_int_vec(src_type, dst_width - 1), "");
         msb = LLVMBuildShl(builder, msb,
                            lp_build_const_int_vec(src_type, shift), "");
         msb = LLVMBuildSub(builder, msb,
                            lp_build_const_int_vec(src_type, 1), "");
         res = LLVMBuildOr(builder, res, msb, "");
      }
#elif 0
      while(shift > 0) {
         res = LLVMBuildOr(builder, res,
                           LLVMBuildLShr(builder, res,
                                         lp_build_const_int_vec(src_type, n), ""), "");
         shift -= n;
         n *= 2;
      }
#endif
   }
   else
      res = LLVMBuildAnd(builder, res,
                         lp_build_const_int_vec(src_type, mask), "");

   return res;
}
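/*
 * Illustrative scalar model (not part of the driver) of the trick above for
 * a clamped input in [0, 1], assuming IEEE-754 binary32.  Multiplying by
 * mask/ubound and adding bias = 2^(mantissa - n) places the desired n-bit
 * integer directly in the low mantissa bits, so a bitcast plus a mask or
 * shift finishes the conversion without any float->int instruction.  As in
 * the code above, the dst_width > n branch relies on dst_width being large
 * enough (32 in practice) for the exponent bits to shift out.  The _ref
 * name is hypothetical.
 */
#if 0
#include <stdint.h>
#include <string.h>

static uint32_t
clamped_float_to_unorm_ref(float src, unsigned dst_width)
{
   const unsigned mantissa = 23;
   unsigned n = dst_width < mantissa ? dst_width : mantissa;
   uint64_t ubound = 1ull << n;
   uint64_t mask = ubound - 1;
   float tmp = src * (float)((double)mask / (double)ubound)
             + (float)(1ull << (mantissa - n));
   uint32_t bits;

   memcpy(&bits, &tmp, sizeof bits);        /* bitcast float -> int */
   if (dst_width > n)
      return bits << (dst_width - n);       /* result in the upper bits */
   return bits & (uint32_t)mask;            /* keep just the n result bits */
}
#endif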
/** * Generic type conversion. * * TODO: Take a precision argument, or even better, add a new precision member * to the lp_type union. */ void lp_build_conv(LLVMBuilderRef builder, struct lp_type src_type, struct lp_type dst_type, const LLVMValueRef *src, unsigned num_srcs, LLVMValueRef *dst, unsigned num_dsts) { struct lp_type tmp_type; LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH]; unsigned num_tmps; unsigned i; /* We must not loose or gain channels. Only precision */ assert(src_type.length * num_srcs == dst_type.length * num_dsts); assert(src_type.length <= LP_MAX_VECTOR_LENGTH); assert(dst_type.length <= LP_MAX_VECTOR_LENGTH); assert(num_srcs <= LP_MAX_VECTOR_LENGTH); assert(num_dsts <= LP_MAX_VECTOR_LENGTH); tmp_type = src_type; for(i = 0; i < num_srcs; ++i) { assert(lp_check_value(src_type, src[i])); tmp[i] = src[i]; } num_tmps = num_srcs; /* * Clamp if necessary */ if(memcmp(&src_type, &dst_type, sizeof src_type) != 0) { struct lp_build_context bld; double src_min = lp_const_min(src_type); double dst_min = lp_const_min(dst_type); double src_max = lp_const_max(src_type); double dst_max = lp_const_max(dst_type); LLVMValueRef thres; lp_build_context_init(&bld, builder, tmp_type); if(src_min < dst_min) { if(dst_min == 0.0) thres = bld.zero; else thres = lp_build_const_vec(src_type, dst_min); for(i = 0; i < num_tmps; ++i) tmp[i] = lp_build_max(&bld, tmp[i], thres); } if(src_max > dst_max) { if(dst_max == 1.0) thres = bld.one; else thres = lp_build_const_vec(src_type, dst_max); for(i = 0; i < num_tmps; ++i) tmp[i] = lp_build_min(&bld, tmp[i], thres); } } /* * Scale to the narrowest range */ if(dst_type.floating) { /* Nothing to do */ } else if(tmp_type.floating) { if(!dst_type.fixed && !dst_type.sign && dst_type.norm) { for(i = 0; i < num_tmps; ++i) { tmp[i] = lp_build_clamped_float_to_unsigned_norm(builder, tmp_type, dst_type.width, tmp[i]); } tmp_type.floating = FALSE; } else { double dst_scale = lp_const_scale(dst_type); LLVMTypeRef tmp_vec_type; if (dst_scale != 1.0) { LLVMValueRef scale = lp_build_const_vec(tmp_type, dst_scale); for(i = 0; i < num_tmps; ++i) tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, ""); } /* Use an equally sized integer for intermediate computations */ tmp_type.floating = FALSE; tmp_vec_type = lp_build_vec_type(tmp_type); for(i = 0; i < num_tmps; ++i) { #if 0 if(dst_type.sign) tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, ""); else tmp[i] = LLVMBuildFPToUI(builder, tmp[i], tmp_vec_type, ""); #else /* FIXME: there is no SSE counterpart for LLVMBuildFPToUI */ tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, ""); #endif } } } else { unsigned src_shift = lp_const_shift(src_type); unsigned dst_shift = lp_const_shift(dst_type); /* FIXME: compensate different offsets too */ if(src_shift > dst_shift) { LLVMValueRef shift = lp_build_const_int_vec(tmp_type, src_shift - dst_shift); for(i = 0; i < num_tmps; ++i) if(src_type.sign) tmp[i] = LLVMBuildAShr(builder, tmp[i], shift, ""); else tmp[i] = LLVMBuildLShr(builder, tmp[i], shift, ""); } } /* * Truncate or expand bit width * * No data conversion should happen here, although the sign bits are * crucial to avoid bad clamping. 
*/ { struct lp_type new_type; new_type = tmp_type; new_type.sign = dst_type.sign; new_type.width = dst_type.width; new_type.length = dst_type.length; lp_build_resize(builder, tmp_type, new_type, tmp, num_srcs, tmp, num_dsts); tmp_type = new_type; num_tmps = num_dsts; } /* * Scale to the widest range */ if(src_type.floating) { /* Nothing to do */ } else if(!src_type.floating && dst_type.floating) { if(!src_type.fixed && !src_type.sign && src_type.norm) { for(i = 0; i < num_tmps; ++i) { tmp[i] = lp_build_unsigned_norm_to_float(builder, src_type.width, dst_type, tmp[i]); } tmp_type.floating = TRUE; } else { double src_scale = lp_const_scale(src_type); LLVMTypeRef tmp_vec_type; /* Use an equally sized integer for intermediate computations */ tmp_type.floating = TRUE; tmp_type.sign = TRUE; tmp_vec_type = lp_build_vec_type(tmp_type); for(i = 0; i < num_tmps; ++i) { #if 0 if(dst_type.sign) tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, ""); else tmp[i] = LLVMBuildUIToFP(builder, tmp[i], tmp_vec_type, ""); #else /* FIXME: there is no SSE counterpart for LLVMBuildUIToFP */ tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, ""); #endif } if (src_scale != 1.0) { LLVMValueRef scale = lp_build_const_vec(tmp_type, 1.0/src_scale); for(i = 0; i < num_tmps; ++i) tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, ""); } } } else { unsigned src_shift = lp_const_shift(src_type); unsigned dst_shift = lp_const_shift(dst_type); /* FIXME: compensate different offsets too */ if(src_shift < dst_shift) { LLVMValueRef shift = lp_build_const_int_vec(tmp_type, dst_shift - src_shift); for(i = 0; i < num_tmps; ++i) tmp[i] = LLVMBuildShl(builder, tmp[i], shift, ""); } } for(i = 0; i < num_dsts; ++i) { dst[i] = tmp[i]; assert(lp_check_value(dst_type, dst[i])); } }
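/*
 * Illustrative scalar walk-through (not part of the driver) of the same
 * stages for one concrete case, float -> 16-bit snorm: clamp to the
 * destination range, scale by the destination's fixed-point scale, convert
 * to integer (FPToSI truncates toward zero, as in the IR above), then
 * resize to the destination width.  The constants stand in for
 * lp_const_min/max/scale of the snorm16 type and are assumed here, not
 * taken from lp_bld_const.c.
 */
#if 0
#include <stdint.h>

static int16_t
float_to_snorm16_ref(float src)
{
   const float dst_min = -1.0f, dst_max = 1.0f;   /* lp_const_min/max */
   const float dst_scale = 32767.0f;              /* lp_const_scale   */
   float tmp = src;

   if (tmp < dst_min) tmp = dst_min;              /* clamp               */
   if (tmp > dst_max) tmp = dst_max;
   tmp = tmp * dst_scale;                         /* scale to narrowest  */
   return (int16_t)(int32_t)tmp;                  /* fp->int + resize    */
}
#endif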
/** * Generic type conversion. * * TODO: Take a precision argument, or even better, add a new precision member * to the lp_type union. */ void lp_build_conv(struct gallivm_state *gallivm, struct lp_type src_type, struct lp_type dst_type, const LLVMValueRef *src, unsigned num_srcs, LLVMValueRef *dst, unsigned num_dsts) { LLVMBuilderRef builder = gallivm->builder; struct lp_type tmp_type; LLVMValueRef tmp[LP_MAX_VECTOR_LENGTH]; unsigned num_tmps; unsigned i; /* We must not loose or gain channels. Only precision */ assert(src_type.length * num_srcs == dst_type.length * num_dsts); assert(src_type.length <= LP_MAX_VECTOR_LENGTH); assert(dst_type.length <= LP_MAX_VECTOR_LENGTH); assert(num_srcs <= LP_MAX_VECTOR_LENGTH); assert(num_dsts <= LP_MAX_VECTOR_LENGTH); tmp_type = src_type; for(i = 0; i < num_srcs; ++i) { assert(lp_check_value(src_type, src[i])); tmp[i] = src[i]; } num_tmps = num_srcs; /* Special case 4x4f --> 1x16ub */ if (src_type.floating == 1 && src_type.fixed == 0 && src_type.sign == 1 && src_type.norm == 0 && src_type.width == 32 && src_type.length == 4 && dst_type.floating == 0 && dst_type.fixed == 0 && dst_type.sign == 0 && dst_type.norm == 1 && dst_type.width == 8 && dst_type.length == 16 && 4 * num_dsts == num_srcs && util_cpu_caps.has_sse2) { struct lp_build_context bld; struct lp_type int16_type = dst_type; struct lp_type int32_type = dst_type; LLVMValueRef const_255f; unsigned i, j; lp_build_context_init(&bld, gallivm, src_type); int16_type.width *= 2; int16_type.length /= 2; int16_type.sign = 1; int32_type.width *= 4; int32_type.length /= 4; int32_type.sign = 1; const_255f = lp_build_const_vec(gallivm, src_type, 255.0f); for (i = 0; i < num_dsts; ++i, src += 4) { LLVMValueRef lo, hi; for (j = 0; j < 4; ++j) { tmp[j] = LLVMBuildFMul(builder, src[j], const_255f, ""); tmp[j] = lp_build_iround(&bld, tmp[j]); } /* relying on clamping behavior of sse2 intrinsics here */ lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]); hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]); dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi); } return; } /* Special case 2x8f --> 1x16ub */ else if (src_type.floating == 1 && src_type.fixed == 0 && src_type.sign == 1 && src_type.norm == 0 && src_type.width == 32 && src_type.length == 8 && dst_type.floating == 0 && dst_type.fixed == 0 && dst_type.sign == 0 && dst_type.norm == 1 && dst_type.width == 8 && dst_type.length == 16 && 2 * num_dsts == num_srcs && util_cpu_caps.has_avx) { struct lp_build_context bld; struct lp_type int16_type = dst_type; struct lp_type int32_type = dst_type; LLVMValueRef const_255f; unsigned i; lp_build_context_init(&bld, gallivm, src_type); int16_type.width *= 2; int16_type.length /= 2; int16_type.sign = 1; int32_type.width *= 4; int32_type.length /= 4; int32_type.sign = 1; const_255f = lp_build_const_vec(gallivm, src_type, 255.0f); for (i = 0; i < num_dsts; ++i, src += 2) { LLVMValueRef lo, hi, a, b; a = LLVMBuildFMul(builder, src[0], const_255f, ""); b = LLVMBuildFMul(builder, src[1], const_255f, ""); a = lp_build_iround(&bld, a); b = lp_build_iround(&bld, b); tmp[0] = lp_build_extract_range(gallivm, a, 0, 4); tmp[1] = lp_build_extract_range(gallivm, a, 4, 4); tmp[2] = lp_build_extract_range(gallivm, b, 0, 4); tmp[3] = lp_build_extract_range(gallivm, b, 4, 4); /* relying on clamping behavior of sse2 intrinsics here */ lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]); hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]); dst[i] = 
lp_build_pack2(gallivm, int16_type, dst_type, lo, hi); } return; } /* Pre convert half-floats to floats */ else if (src_type.floating && src_type.width == 16) { for(i = 0; i < num_tmps; ++i) tmp[i] = lp_build_half_to_float(gallivm, src_type, tmp[i]); tmp_type.width = 32; } /* * Clamp if necessary */ if(memcmp(&src_type, &dst_type, sizeof src_type) != 0) { struct lp_build_context bld; double src_min = lp_const_min(src_type); double dst_min = lp_const_min(dst_type); double src_max = lp_const_max(src_type); double dst_max = lp_const_max(dst_type); LLVMValueRef thres; lp_build_context_init(&bld, gallivm, tmp_type); if(src_min < dst_min) { if(dst_min == 0.0) thres = bld.zero; else thres = lp_build_const_vec(gallivm, src_type, dst_min); for(i = 0; i < num_tmps; ++i) tmp[i] = lp_build_max(&bld, tmp[i], thres); } if(src_max > dst_max) { if(dst_max == 1.0) thres = bld.one; else thres = lp_build_const_vec(gallivm, src_type, dst_max); for(i = 0; i < num_tmps; ++i) tmp[i] = lp_build_min(&bld, tmp[i], thres); } } /* * Scale to the narrowest range */ if(dst_type.floating) { /* Nothing to do */ } else if(tmp_type.floating) { if(!dst_type.fixed && !dst_type.sign && dst_type.norm) { for(i = 0; i < num_tmps; ++i) { tmp[i] = lp_build_clamped_float_to_unsigned_norm(gallivm, tmp_type, dst_type.width, tmp[i]); } tmp_type.floating = FALSE; } else { double dst_scale = lp_const_scale(dst_type); LLVMTypeRef tmp_vec_type; if (dst_scale != 1.0) { LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, dst_scale); for(i = 0; i < num_tmps; ++i) tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, ""); } /* Use an equally sized integer for intermediate computations */ tmp_type.floating = FALSE; tmp_vec_type = lp_build_vec_type(gallivm, tmp_type); for(i = 0; i < num_tmps; ++i) { #if 0 if(dst_type.sign) tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, ""); else tmp[i] = LLVMBuildFPToUI(builder, tmp[i], tmp_vec_type, ""); #else /* FIXME: there is no SSE counterpart for LLVMBuildFPToUI */ tmp[i] = LLVMBuildFPToSI(builder, tmp[i], tmp_vec_type, ""); #endif } } } else { unsigned src_shift = lp_const_shift(src_type); unsigned dst_shift = lp_const_shift(dst_type); unsigned src_offset = lp_const_offset(src_type); unsigned dst_offset = lp_const_offset(dst_type); /* Compensate for different offsets */ if (dst_offset > src_offset && src_type.width > dst_type.width) { for (i = 0; i < num_tmps; ++i) { LLVMValueRef shifted; LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type, src_shift - 1); if(src_type.sign) shifted = LLVMBuildAShr(builder, tmp[i], shift, ""); else shifted = LLVMBuildLShr(builder, tmp[i], shift, ""); tmp[i] = LLVMBuildSub(builder, tmp[i], shifted, ""); } } if(src_shift > dst_shift) { LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type, src_shift - dst_shift); for(i = 0; i < num_tmps; ++i) if(src_type.sign) tmp[i] = LLVMBuildAShr(builder, tmp[i], shift, ""); else tmp[i] = LLVMBuildLShr(builder, tmp[i], shift, ""); } } /* * Truncate or expand bit width * * No data conversion should happen here, although the sign bits are * crucial to avoid bad clamping. 
*/ { struct lp_type new_type; new_type = tmp_type; new_type.sign = dst_type.sign; new_type.width = dst_type.width; new_type.length = dst_type.length; lp_build_resize(gallivm, tmp_type, new_type, tmp, num_srcs, tmp, num_dsts); tmp_type = new_type; num_tmps = num_dsts; } /* * Scale to the widest range */ if(src_type.floating) { /* Nothing to do */ } else if(!src_type.floating && dst_type.floating) { if(!src_type.fixed && !src_type.sign && src_type.norm) { for(i = 0; i < num_tmps; ++i) { tmp[i] = lp_build_unsigned_norm_to_float(gallivm, src_type.width, dst_type, tmp[i]); } tmp_type.floating = TRUE; } else { double src_scale = lp_const_scale(src_type); LLVMTypeRef tmp_vec_type; /* Use an equally sized integer for intermediate computations */ tmp_type.floating = TRUE; tmp_type.sign = TRUE; tmp_vec_type = lp_build_vec_type(gallivm, tmp_type); for(i = 0; i < num_tmps; ++i) { #if 0 if(dst_type.sign) tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, ""); else tmp[i] = LLVMBuildUIToFP(builder, tmp[i], tmp_vec_type, ""); #else /* FIXME: there is no SSE counterpart for LLVMBuildUIToFP */ tmp[i] = LLVMBuildSIToFP(builder, tmp[i], tmp_vec_type, ""); #endif } if (src_scale != 1.0) { LLVMValueRef scale = lp_build_const_vec(gallivm, tmp_type, 1.0/src_scale); for(i = 0; i < num_tmps; ++i) tmp[i] = LLVMBuildFMul(builder, tmp[i], scale, ""); } } } else { unsigned src_shift = lp_const_shift(src_type); unsigned dst_shift = lp_const_shift(dst_type); unsigned src_offset = lp_const_offset(src_type); unsigned dst_offset = lp_const_offset(dst_type); if (src_shift < dst_shift) { LLVMValueRef pre_shift[LP_MAX_VECTOR_LENGTH]; LLVMValueRef shift = lp_build_const_int_vec(gallivm, tmp_type, dst_shift - src_shift); for (i = 0; i < num_tmps; ++i) { pre_shift[i] = tmp[i]; tmp[i] = LLVMBuildShl(builder, tmp[i], shift, ""); } /* Compensate for different offsets */ if (dst_offset > src_offset) { for (i = 0; i < num_tmps; ++i) { tmp[i] = LLVMBuildSub(builder, tmp[i], pre_shift[i], ""); } } } } for(i = 0; i < num_dsts; ++i) { dst[i] = tmp[i]; assert(lp_check_value(dst_type, dst[i])); } }
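/*
 * Illustrative scalar model (not part of the driver) of the "4x4f -> 1x16ub"
 * fast path above: scale each float by 255, round to nearest, then rely on
 * saturation when narrowing to 16 and then 8 bits (the generated IR gets the
 * saturation from the SSE2 pack intrinsics).  The _ref name is hypothetical.
 */
#if 0
#include <stdint.h>
#include <math.h>

static void
float4x4_to_ub16_ref(const float src[16], uint8_t dst[16])
{
   unsigned i;
   for (i = 0; i < 16; i++) {
      long v = lrintf(src[i] * 255.0f);   /* FMul by 255 + iround */
      if (v < 0)   v = 0;                 /* packssdw/packuswb saturate */
      if (v > 255) v = 255;
      dst[i] = (uint8_t)v;
   }
}
#endif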
/** * Register store. */ void lp_emit_store_aos( struct lp_build_tgsi_aos_context *bld, const struct tgsi_full_instruction *inst, unsigned index, LLVMValueRef value) { LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder; const struct tgsi_full_dst_register *reg = &inst->Dst[index]; LLVMValueRef mask = NULL; LLVMValueRef ptr; /* * Saturate the value */ switch (inst->Instruction.Saturate) { case TGSI_SAT_NONE: break; case TGSI_SAT_ZERO_ONE: value = lp_build_max(&bld->bld_base.base, value, bld->bld_base.base.zero); value = lp_build_min(&bld->bld_base.base, value, bld->bld_base.base.one); break; case TGSI_SAT_MINUS_PLUS_ONE: value = lp_build_max(&bld->bld_base.base, value, lp_build_const_vec(bld->bld_base.base.gallivm, bld->bld_base.base.type, -1.0)); value = lp_build_min(&bld->bld_base.base, value, bld->bld_base.base.one); break; default: assert(0); } /* * Translate the register file */ assert(!reg->Register.Indirect); switch (reg->Register.File) { case TGSI_FILE_OUTPUT: ptr = bld->outputs[reg->Register.Index]; break; case TGSI_FILE_TEMPORARY: ptr = bld->temps[reg->Register.Index]; break; case TGSI_FILE_ADDRESS: ptr = bld->addr[reg->Indirect.Index]; break; case TGSI_FILE_PREDICATE: ptr = bld->preds[reg->Register.Index]; break; default: assert(0); return; } if (!ptr) return; /* * Predicate */ if (inst->Instruction.Predicate) { LLVMValueRef pred; assert(inst->Predicate.Index < LP_MAX_TGSI_PREDS); pred = LLVMBuildLoad(builder, bld->preds[inst->Predicate.Index], ""); /* * Convert the value to an integer mask. */ pred = lp_build_compare(bld->bld_base.base.gallivm, bld->bld_base.base.type, PIPE_FUNC_NOTEQUAL, pred, bld->bld_base.base.zero); if (inst->Predicate.Negate) { pred = LLVMBuildNot(builder, pred, ""); } pred = bld->bld_base.emit_swizzle(&bld->bld_base, pred, inst->Predicate.SwizzleX, inst->Predicate.SwizzleY, inst->Predicate.SwizzleZ, inst->Predicate.SwizzleW); if (mask) { mask = LLVMBuildAnd(builder, mask, pred, ""); } else { mask = pred; } } /* * Writemask */ if (reg->Register.WriteMask != TGSI_WRITEMASK_XYZW) { LLVMValueRef writemask; writemask = lp_build_const_mask_aos_swizzled(bld->bld_base.base.gallivm, bld->bld_base.base.type, reg->Register.WriteMask, TGSI_NUM_CHANNELS, bld->swizzles); if (mask) { mask = LLVMBuildAnd(builder, mask, writemask, ""); } else { mask = writemask; } } if (mask) { LLVMValueRef orig_value; orig_value = LLVMBuildLoad(builder, ptr, ""); value = lp_build_select(&bld->bld_base.base, mask, value, orig_value); } LLVMBuildStore(builder, value, ptr); }
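/*
 * Illustrative scalar model (not part of the driver) of the masked store at
 * the end of lp_emit_store_aos(): when a predicate and/or writemask is
 * present, the register is read back and each channel keeps either the new
 * or the original value, then the merged vector is written.  `mask' holds
 * all-ones per channel that should be written, which is how lp_build_select()
 * behaves on integer masks.  The _ref name is hypothetical.
 */
#if 0
#include <stdint.h>

static void
masked_store_ref(uint32_t reg[4], const uint32_t value[4],
                 const uint32_t mask[4])
{
   unsigned chan;
   for (chan = 0; chan < 4; chan++)
      reg[chan] = (value[chan] & mask[chan]) | (reg[chan] & ~mask[chan]);
}
#endif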
/** * Fetch a texels from a texture, returning them in SoA layout. * * \param type the desired return type for 'rgba'. The vector length * is the number of texels to fetch * * \param base_ptr points to the base of the texture mip tree. * \param offset offset to start of the texture image block. For non- * compressed formats, this simply is an offset to the texel. * For compressed formats, it is an offset to the start of the * compressed data block. * * \param i, j the sub-block pixel coordinates. For non-compressed formats * these will always be (0,0). For compressed formats, i will * be in [0, block_width-1] and j will be in [0, block_height-1]. */ void lp_build_fetch_rgba_soa(struct gallivm_state *gallivm, const struct util_format_description *format_desc, struct lp_type type, LLVMValueRef base_ptr, LLVMValueRef offset, LLVMValueRef i, LLVMValueRef j, LLVMValueRef rgba_out[4]) { LLVMBuilderRef builder = gallivm->builder; if (format_desc->layout == UTIL_FORMAT_LAYOUT_PLAIN && (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB || format_desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB || format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS) && format_desc->block.width == 1 && format_desc->block.height == 1 && format_desc->block.bits <= type.width && (format_desc->channel[0].type != UTIL_FORMAT_TYPE_FLOAT || format_desc->channel[0].size == 32)) { /* * The packed pixel fits into an element of the destination format. Put * the packed pixels into a vector and extract each component for all * vector elements in parallel. */ LLVMValueRef packed; /* * gather the texels from the texture * Ex: packed = {XYZW, XYZW, XYZW, XYZW} */ assert(format_desc->block.bits <= type.width); packed = lp_build_gather(gallivm, type.length, format_desc->block.bits, type.width, base_ptr, offset, FALSE); /* * convert texels to float rgba */ lp_build_unpack_rgba_soa(gallivm, format_desc, type, packed, rgba_out); return; } if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT || format_desc->format == PIPE_FORMAT_R9G9B9E5_FLOAT) { /* * similar conceptually to above but requiring special * AoS packed -> SoA float conversion code. */ LLVMValueRef packed; assert(type.floating); assert(type.width == 32); packed = lp_build_gather(gallivm, type.length, format_desc->block.bits, type.width, base_ptr, offset, FALSE); if (format_desc->format == PIPE_FORMAT_R11G11B10_FLOAT) { lp_build_r11g11b10_to_float(gallivm, packed, rgba_out); } else { lp_build_rgb9e5_to_float(gallivm, packed, rgba_out); } return; } if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS && format_desc->block.bits == 64) { /* * special case the format is 64 bits but we only require * 32bit (or 8bit) from each block. */ LLVMValueRef packed; if (format_desc->format == PIPE_FORMAT_X32_S8X24_UINT) { /* * for stencil simply fix up offsets - could in fact change * base_ptr instead even outside the shader. 
*/ unsigned mask = (1 << 8) - 1; LLVMValueRef s_offset = lp_build_const_int_vec(gallivm, type, 4); offset = LLVMBuildAdd(builder, offset, s_offset, ""); packed = lp_build_gather(gallivm, type.length, 32, type.width, base_ptr, offset, FALSE); packed = LLVMBuildAnd(builder, packed, lp_build_const_int_vec(gallivm, type, mask), ""); } else { assert (format_desc->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT); packed = lp_build_gather(gallivm, type.length, 32, type.width, base_ptr, offset, TRUE); packed = LLVMBuildBitCast(builder, packed, lp_build_vec_type(gallivm, type), ""); } /* for consistency with lp_build_unpack_rgba_soa() return sss1 or zzz1 */ rgba_out[0] = rgba_out[1] = rgba_out[2] = packed; rgba_out[3] = lp_build_const_vec(gallivm, type, 1.0f); return; } /* * Try calling lp_build_fetch_rgba_aos for all pixels. */ if (util_format_fits_8unorm(format_desc) && type.floating && type.width == 32 && (type.length == 1 || (type.length % 4 == 0))) { struct lp_type tmp_type; LLVMValueRef tmp; memset(&tmp_type, 0, sizeof tmp_type); tmp_type.width = 8; tmp_type.length = type.length * 4; tmp_type.norm = TRUE; tmp = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type, base_ptr, offset, i, j); lp_build_rgba8_to_fi32_soa(gallivm, type, tmp, rgba_out); return; } /* * Fallback to calling lp_build_fetch_rgba_aos for each pixel. * * This is not the most efficient way of fetching pixels, as we * miss some opportunities to do vectorization, but this is * convenient for formats or scenarios for which there was no * opportunity or incentive to optimize. */ { unsigned k, chan; struct lp_type tmp_type; if (gallivm_debug & GALLIVM_DEBUG_PERF) { debug_printf("%s: scalar unpacking of %s\n", __FUNCTION__, format_desc->short_name); } tmp_type = type; tmp_type.length = 4; for (chan = 0; chan < 4; ++chan) { rgba_out[chan] = lp_build_undef(gallivm, type); } /* loop over number of pixels */ for(k = 0; k < type.length; ++k) { LLVMValueRef index = lp_build_const_int32(gallivm, k); LLVMValueRef offset_elem; LLVMValueRef i_elem, j_elem; LLVMValueRef tmp; offset_elem = LLVMBuildExtractElement(builder, offset, index, ""); i_elem = LLVMBuildExtractElement(builder, i, index, ""); j_elem = LLVMBuildExtractElement(builder, j, index, ""); /* Get a single float[4]={R,G,B,A} pixel */ tmp = lp_build_fetch_rgba_aos(gallivm, format_desc, tmp_type, base_ptr, offset_elem, i_elem, j_elem); /* * Insert the AoS tmp value channels into the SoA result vectors at * position = 'index'. */ for (chan = 0; chan < 4; ++chan) { LLVMValueRef chan_val = lp_build_const_int32(gallivm, chan), tmp_chan = LLVMBuildExtractElement(builder, tmp, chan_val, ""); rgba_out[chan] = LLVMBuildInsertElement(builder, rgba_out[chan], tmp_chan, index, ""); } } } }
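/*
 * Illustrative sketch (not part of the driver) of the per-pixel fallback at
 * the end of lp_build_fetch_rgba_soa(): fetch every pixel in AoS form and
 * scatter its four channels into the SoA result vectors, i.e. an AoS -> SoA
 * transpose.  fetch_one_rgba is a hypothetical stand-in for a scalar
 * lp_build_fetch_rgba_aos() fetch, as are the other _ref names.
 */
#if 0
static void
fetch_rgba_soa_fallback_ref(unsigned num_pixels,
                            const unsigned offsets[],
                            void (*fetch_one_rgba)(unsigned offset,
                                                   float rgba[4]),
                            float *red, float *green,
                            float *blue, float *alpha)
{
   unsigned k;
   for (k = 0; k < num_pixels; k++) {
      float rgba[4];
      fetch_one_rgba(offsets[k], rgba);   /* one float[4] = {R,G,B,A}   */
      red[k]   = rgba[0];                 /* insert at element k of     */
      green[k] = rgba[1];                 /* each SoA channel vector    */
      blue[k]  = rgba[2];
      alpha[k] = rgba[3];
   }
}
#endif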
/**
 * Emit LLVM for one TGSI instruction.
 * \return TRUE for success, FALSE otherwise
 */
boolean
lp_emit_instruction_aos(
   struct lp_build_tgsi_aos_context *bld,
   const struct tgsi_full_instruction *inst,
   const struct tgsi_opcode_info *info,
   int *pc)
{
   LLVMValueRef src0, src1, src2;
   LLVMValueRef tmp0, tmp1;
   LLVMValueRef dst0 = NULL;

   /*
    * Stores and write masks are handled in a general fashion after the long
    * instruction opcode switch statement.
    *
    * Although not strictly necessary, we avoid generating instructions for
    * channels which won't be stored, in cases where that's easy. For some
    * complex instructions, like texture sampling, it is more convenient to
    * assume a full writemask and then let LLVM optimization passes eliminate
    * redundant code.
    */

   (*pc)++;

   assert(info->num_dst <= 1);
   if (info->num_dst) {
      dst0 = bld->bld_base.base.undef;
   }

   switch (inst->Instruction.Opcode) {
   case TGSI_OPCODE_ARL:
      src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL);
      dst0 = lp_build_floor(&bld->bld_base.base, src0);
      break;

   case TGSI_OPCODE_MOV:
      dst0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL);
      break;

   case TGSI_OPCODE_LIT:
      return FALSE;

   case TGSI_OPCODE_RCP:
   /* TGSI_OPCODE_RECIP */
      src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL);
      dst0 = lp_build_rcp(&bld->bld_base.base, src0);
      break;

   case TGSI_OPCODE_RSQ:
   /* TGSI_OPCODE_RECIPSQRT */
      src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL);
      tmp0 = lp_build_emit_llvm_unary(&bld->bld_base, TGSI_OPCODE_ABS, src0);
      dst0 = lp_build_rsqrt(&bld->bld_base.base, tmp0);
      break;

   case TGSI_OPCODE_EXP:
      return FALSE;

   case TGSI_OPCODE_LOG:
      return FALSE;

   case TGSI_OPCODE_MUL:
      src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL);
      src1 = lp_build_emit_fetch(&bld->bld_base, inst, 1, LP_CHAN_ALL);
      dst0 = lp_build_mul(&bld->bld_base.base, src0, src1);
      break;

   case TGSI_OPCODE_ADD:
      src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL);
      src1 = lp_build_emit_fetch(&bld->bld_base, inst, 1, LP_CHAN_ALL);
      dst0 = lp_build_add(&bld->bld_base.base, src0, src1);
      break;

   case TGSI_OPCODE_DP3:
   /* TGSI_OPCODE_DOT3 */
      return FALSE;

   case TGSI_OPCODE_DP4:
   /* TGSI_OPCODE_DOT4 */
      return FALSE;

   case TGSI_OPCODE_DST:
      return FALSE;

   case TGSI_OPCODE_MIN:
      src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL);
      src1 = lp_build_emit_fetch(&bld->bld_base, inst, 1, LP_CHAN_ALL);
      dst0 = lp_build_min(&bld->bld_base.base, src0, src1);
      break;

   case TGSI_OPCODE_MAX:
      src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL);
      src1 = lp_build_emit_fetch(&bld->bld_base, inst, 1, LP_CHAN_ALL);
      dst0 = lp_build_max(&bld->bld_base.base, src0, src1);
      break;

   case TGSI_OPCODE_SLT:
   /* TGSI_OPCODE_SETLT */
      src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL);
      src1 = lp_build_emit_fetch(&bld->bld_base, inst, 1, LP_CHAN_ALL);
      tmp0 = lp_build_cmp(&bld->bld_base.base, PIPE_FUNC_LESS, src0, src1);
      dst0 = lp_build_select(&bld->bld_base.base,
                             tmp0, bld->bld_base.base.one, bld->bld_base.base.zero);
      break;

   case TGSI_OPCODE_SGE:
   /* TGSI_OPCODE_SETGE */
      src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL);
      src1 = lp_build_emit_fetch(&bld->bld_base, inst, 1, LP_CHAN_ALL);
      tmp0 = lp_build_cmp(&bld->bld_base.base, PIPE_FUNC_GEQUAL, src0, src1);
      dst0 = lp_build_select(&bld->bld_base.base,
                             tmp0, bld->bld_base.base.one, bld->bld_base.base.zero);
      break;

   case TGSI_OPCODE_MAD:
   /* TGSI_OPCODE_MADD */
      src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL);
      src1 = lp_build_emit_fetch(&bld->bld_base, inst, 1,
LP_CHAN_ALL); src2 = lp_build_emit_fetch(&bld->bld_base, inst, 2, LP_CHAN_ALL); tmp0 = lp_build_mul(&bld->bld_base.base, src0, src1); dst0 = lp_build_add(&bld->bld_base.base, tmp0, src2); break; case TGSI_OPCODE_SUB: src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL); src1 = lp_build_emit_fetch(&bld->bld_base, inst, 1, LP_CHAN_ALL); dst0 = lp_build_sub(&bld->bld_base.base, src0, src1); break; case TGSI_OPCODE_LRP: src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL); src1 = lp_build_emit_fetch(&bld->bld_base, inst, 1, LP_CHAN_ALL); src2 = lp_build_emit_fetch(&bld->bld_base, inst, 2, LP_CHAN_ALL); tmp0 = lp_build_sub(&bld->bld_base.base, src1, src2); tmp0 = lp_build_mul(&bld->bld_base.base, src0, tmp0); dst0 = lp_build_add(&bld->bld_base.base, tmp0, src2); break; case TGSI_OPCODE_CND: src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL); src1 = lp_build_emit_fetch(&bld->bld_base, inst, 1, LP_CHAN_ALL); src2 = lp_build_emit_fetch(&bld->bld_base, inst, 2, LP_CHAN_ALL); tmp1 = lp_build_const_vec(bld->bld_base.base.gallivm, bld->bld_base.base.type, 0.5); tmp0 = lp_build_cmp(&bld->bld_base.base, PIPE_FUNC_GREATER, src2, tmp1); dst0 = lp_build_select(&bld->bld_base.base, tmp0, src0, src1); break; case TGSI_OPCODE_DP2A: return FALSE; case TGSI_OPCODE_FRC: src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL); tmp0 = lp_build_floor(&bld->bld_base.base, src0); dst0 = lp_build_sub(&bld->bld_base.base, src0, tmp0); break; case TGSI_OPCODE_CLAMP: src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL); src1 = lp_build_emit_fetch(&bld->bld_base, inst, 1, LP_CHAN_ALL); src2 = lp_build_emit_fetch(&bld->bld_base, inst, 2, LP_CHAN_ALL); tmp0 = lp_build_max(&bld->bld_base.base, src0, src1); dst0 = lp_build_min(&bld->bld_base.base, tmp0, src2); break; case TGSI_OPCODE_FLR: src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL); dst0 = lp_build_floor(&bld->bld_base.base, src0); break; case TGSI_OPCODE_ROUND: src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL); dst0 = lp_build_round(&bld->bld_base.base, src0); break; case TGSI_OPCODE_EX2: src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL); tmp0 = lp_build_swizzle_scalar_aos(&bld->bld_base.base, src0, TGSI_SWIZZLE_X, TGSI_NUM_CHANNELS); dst0 = lp_build_exp2(&bld->bld_base.base, tmp0); break; case TGSI_OPCODE_LG2: src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL); tmp0 = swizzle_scalar_aos(bld, src0, TGSI_SWIZZLE_X); dst0 = lp_build_log2(&bld->bld_base.base, tmp0); break; case TGSI_OPCODE_POW: src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL); src0 = swizzle_scalar_aos(bld, src0, TGSI_SWIZZLE_X); src1 = lp_build_emit_fetch(&bld->bld_base, inst, 1, LP_CHAN_ALL); src1 = swizzle_scalar_aos(bld, src1, TGSI_SWIZZLE_X); dst0 = lp_build_pow(&bld->bld_base.base, src0, src1); break; case TGSI_OPCODE_XPD: return FALSE; case TGSI_OPCODE_RCC: /* deprecated? 
*/ assert(0); return FALSE; case TGSI_OPCODE_DPH: return FALSE; case TGSI_OPCODE_COS: src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL); tmp0 = swizzle_scalar_aos(bld, src0, TGSI_SWIZZLE_X); dst0 = lp_build_cos(&bld->bld_base.base, tmp0); break; case TGSI_OPCODE_DDX: return FALSE; case TGSI_OPCODE_DDY: return FALSE; case TGSI_OPCODE_KILP: /* predicated kill */ return FALSE; case TGSI_OPCODE_KIL: /* conditional kill */ return FALSE; case TGSI_OPCODE_PK2H: return FALSE; break; case TGSI_OPCODE_PK2US: return FALSE; break; case TGSI_OPCODE_PK4B: return FALSE; break; case TGSI_OPCODE_PK4UB: return FALSE; case TGSI_OPCODE_RFL: return FALSE; case TGSI_OPCODE_SEQ: src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL); src1 = lp_build_emit_fetch(&bld->bld_base, inst, 1, LP_CHAN_ALL); tmp0 = lp_build_cmp(&bld->bld_base.base, PIPE_FUNC_EQUAL, src0, src1); dst0 = lp_build_select(&bld->bld_base.base, tmp0, bld->bld_base.base.one, bld->bld_base.base.zero); break; case TGSI_OPCODE_SFL: dst0 = bld->bld_base.base.zero; break; case TGSI_OPCODE_SGT: src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL); src1 = lp_build_emit_fetch(&bld->bld_base, inst, 1, LP_CHAN_ALL); tmp0 = lp_build_cmp(&bld->bld_base.base, PIPE_FUNC_GREATER, src0, src1); dst0 = lp_build_select(&bld->bld_base.base, tmp0, bld->bld_base.base.one, bld->bld_base.base.zero); break; case TGSI_OPCODE_SIN: src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL); tmp0 = swizzle_scalar_aos(bld, src0, TGSI_SWIZZLE_X); dst0 = lp_build_sin(&bld->bld_base.base, tmp0); break; case TGSI_OPCODE_SLE: src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL); src1 = lp_build_emit_fetch(&bld->bld_base, inst, 1, LP_CHAN_ALL); tmp0 = lp_build_cmp(&bld->bld_base.base, PIPE_FUNC_LEQUAL, src0, src1); dst0 = lp_build_select(&bld->bld_base.base, tmp0, bld->bld_base.base.one, bld->bld_base.base.zero); break; case TGSI_OPCODE_SNE: src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL); src1 = lp_build_emit_fetch(&bld->bld_base, inst, 1, LP_CHAN_ALL); tmp0 = lp_build_cmp(&bld->bld_base.base, PIPE_FUNC_NOTEQUAL, src0, src1); dst0 = lp_build_select(&bld->bld_base.base, tmp0, bld->bld_base.base.one, bld->bld_base.base.zero); break; case TGSI_OPCODE_STR: dst0 = bld->bld_base.base.one; break; case TGSI_OPCODE_TEX: dst0 = emit_tex(bld, inst, LP_BLD_TEX_MODIFIER_NONE); break; case TGSI_OPCODE_TXD: dst0 = emit_tex(bld, inst, LP_BLD_TEX_MODIFIER_EXPLICIT_DERIV); break; case TGSI_OPCODE_UP2H: /* deprecated */ assert (0); return FALSE; break; case TGSI_OPCODE_UP2US: /* deprecated */ assert(0); return FALSE; break; case TGSI_OPCODE_UP4B: /* deprecated */ assert(0); return FALSE; break; case TGSI_OPCODE_UP4UB: /* deprecated */ assert(0); return FALSE; break; case TGSI_OPCODE_X2D: /* deprecated? 
*/ assert(0); return FALSE; break; case TGSI_OPCODE_ARA: /* deprecated */ assert(0); return FALSE; break; case TGSI_OPCODE_ARR: src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL); dst0 = lp_build_round(&bld->bld_base.base, src0); break; case TGSI_OPCODE_BRA: /* deprecated */ assert(0); return FALSE; break; case TGSI_OPCODE_CAL: return FALSE; case TGSI_OPCODE_RET: return FALSE; case TGSI_OPCODE_END: *pc = -1; break; case TGSI_OPCODE_SSG: /* TGSI_OPCODE_SGN */ tmp0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL); dst0 = lp_build_sgn(&bld->bld_base.base, tmp0); break; case TGSI_OPCODE_CMP: src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL); src1 = lp_build_emit_fetch(&bld->bld_base, inst, 1, LP_CHAN_ALL); src2 = lp_build_emit_fetch(&bld->bld_base, inst, 2, LP_CHAN_ALL); tmp0 = lp_build_cmp(&bld->bld_base.base, PIPE_FUNC_LESS, src0, bld->bld_base.base.zero); dst0 = lp_build_select(&bld->bld_base.base, tmp0, src1, src2); break; case TGSI_OPCODE_SCS: return FALSE; case TGSI_OPCODE_TXB: dst0 = emit_tex(bld, inst, LP_BLD_TEX_MODIFIER_LOD_BIAS); break; case TGSI_OPCODE_NRM: /* fall-through */ case TGSI_OPCODE_NRM4: return FALSE; case TGSI_OPCODE_DIV: /* deprecated */ assert(0); return FALSE; break; case TGSI_OPCODE_DP2: return FALSE; case TGSI_OPCODE_TXL: dst0 = emit_tex(bld, inst, LP_BLD_TEX_MODIFIER_EXPLICIT_LOD); break; case TGSI_OPCODE_TXP: dst0 = emit_tex(bld, inst, LP_BLD_TEX_MODIFIER_PROJECTED); break; case TGSI_OPCODE_BRK: return FALSE; case TGSI_OPCODE_IF: return FALSE; case TGSI_OPCODE_BGNLOOP: return FALSE; case TGSI_OPCODE_BGNSUB: return FALSE; case TGSI_OPCODE_ELSE: return FALSE; case TGSI_OPCODE_ENDIF: return FALSE; case TGSI_OPCODE_ENDLOOP: return FALSE; case TGSI_OPCODE_ENDSUB: return FALSE; case TGSI_OPCODE_PUSHA: /* deprecated? */ assert(0); return FALSE; break; case TGSI_OPCODE_POPA: /* deprecated? */ assert(0); return FALSE; break; case TGSI_OPCODE_CEIL: src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL); dst0 = lp_build_ceil(&bld->bld_base.base, src0); break; case TGSI_OPCODE_I2F: /* deprecated? */ assert(0); return FALSE; break; case TGSI_OPCODE_NOT: /* deprecated? */ assert(0); return FALSE; break; case TGSI_OPCODE_TRUNC: src0 = lp_build_emit_fetch(&bld->bld_base, inst, 0, LP_CHAN_ALL); dst0 = lp_build_trunc(&bld->bld_base.base, src0); break; case TGSI_OPCODE_SHL: /* deprecated? */ assert(0); return FALSE; break; case TGSI_OPCODE_ISHR: /* deprecated? */ assert(0); return FALSE; break; case TGSI_OPCODE_AND: /* deprecated? */ assert(0); return FALSE; break; case TGSI_OPCODE_OR: /* deprecated? */ assert(0); return FALSE; break; case TGSI_OPCODE_MOD: /* deprecated? */ assert(0); return FALSE; break; case TGSI_OPCODE_XOR: /* deprecated? */ assert(0); return FALSE; break; case TGSI_OPCODE_SAD: /* deprecated? */ assert(0); return FALSE; break; case TGSI_OPCODE_TXF: /* deprecated? */ assert(0); return FALSE; break; case TGSI_OPCODE_TXQ: /* deprecated? */ assert(0); return FALSE; break; case TGSI_OPCODE_CONT: return FALSE; case TGSI_OPCODE_EMIT: return FALSE; break; case TGSI_OPCODE_ENDPRIM: return FALSE; break; case TGSI_OPCODE_NOP: break; default: return FALSE; } if (info->num_dst) { lp_emit_store_aos(bld, inst, 0, dst0); } return TRUE; }
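/*
 * Illustrative per-channel scalar references (not part of the driver) for a
 * few of the opcode lowerings above, matching the sequences of lp_build_*
 * calls emitted in the switch.  The _ref names are hypothetical.
 */
#if 0
#include <math.h>

static float lrp_ref(float a, float b, float c) { return a * (b - c) + c; }  /* LRP */
static float frc_ref(float a)                   { return a - floorf(a); }    /* FRC */
static float cnd_ref(float a, float b, float c) { return c > 0.5f ? a : b; } /* CND */
static float mad_ref(float a, float b, float c) { return a * b + c; }        /* MAD */
static float clamp_ref(float a, float lo, float hi)
{
   float t = a > lo ? a : lo;                                                /* CLAMP */
   return t < hi ? t : hi;
}
#endif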