/**
 * Extract Y, U, V channels from packed YUYV.
 * @param packed  is a <n x i32> vector with the packed YUYV blocks
 * @param i  is a <n x i32> vector with the x pixel coordinate (0 or 1)
 */
static void
yuyv_to_yuv_soa(struct gallivm_state *gallivm,
                unsigned n,
                LLVMValueRef packed,
                LLVMValueRef i,
                LLVMValueRef *y,
                LLVMValueRef *u,
                LLVMValueRef *v)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type type;
   LLVMValueRef mask;

   memset(&type, 0, sizeof type);
   type.width = 32;
   type.length = n;

   assert(lp_check_value(type, packed));
   assert(lp_check_value(type, i));

   /*
    * y = (yuyv >> 16*i) & 0xff
    * u = (yuyv >>  8  ) & 0xff
    * v = (yuyv >> 24  ) & 0xff
    */

#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
   /*
    * Avoid a shift with a per-element count: there is no support for it
    * on x86, so it gets translated to roughly 5 instructions per element.
    * Didn't measure performance, but this cuts shader size by quite a bit
    * (less of a difference if the cpu has no sse4.1 support).
    */
   if (util_cpu_caps.has_sse2 && n == 4) {
      LLVMValueRef sel, tmp;
      struct lp_build_context bld32;

      lp_build_context_init(&bld32, gallivm, type);

      tmp = LLVMBuildLShr(builder, packed,
                          lp_build_const_int_vec(gallivm, type, 16), "");
      sel = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL, i,
                             lp_build_const_int_vec(gallivm, type, 0));
      *y = lp_build_select(&bld32, sel, packed, tmp);
   } else
#endif
   {
      LLVMValueRef shift;
      shift = LLVMBuildMul(builder, i,
                           lp_build_const_int_vec(gallivm, type, 16), "");
      *y = LLVMBuildLShr(builder, packed, shift, "");
   }

   *u = LLVMBuildLShr(builder, packed,
                      lp_build_const_int_vec(gallivm, type, 8), "");
   *v = LLVMBuildLShr(builder, packed,
                      lp_build_const_int_vec(gallivm, type, 24), "");

   mask = lp_build_const_int_vec(gallivm, type, 0xff);

   *y = LLVMBuildAnd(builder, *y, mask, "y");
   *u = LLVMBuildAnd(builder, *u, mask, "u");
   *v = LLVMBuildAnd(builder, *v, mask, "v");
}
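/*
 * For reference: a minimal scalar sketch of the extraction the vector code
 * above emits, assuming <stdint.h>.  The helper name is hypothetical and not
 * part of gallivm; it only illustrates the YUYV bit layout (Y0 in bits 0..7,
 * U in 8..15, Y1 in 16..23, V in 24..31).
 */
static inline void
yuyv_extract_scalar_sketch(uint32_t yuyv, unsigned i,
                           uint8_t *y, uint8_t *u, uint8_t *v)
{
   *y = (yuyv >> (16 * i)) & 0xff;  /* Y0 for i == 0, Y1 for i == 1 */
   *u = (yuyv >> 8) & 0xff;         /* U is shared by both pixels */
   *v = (yuyv >> 24) & 0xff;        /* V is shared by both pixels */
}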
/**
 * Build code to compare two values 'a' and 'b' using the given func.
 * \param func  one of PIPE_FUNC_x
 * The result values will be 0 for false or ~0 for true.
 */
LLVMValueRef
lp_build_cmp(struct lp_build_context *bld,
             unsigned func,
             LLVMValueRef a,
             LLVMValueRef b)
{
   return lp_build_compare(bld->gallivm, bld->type, func, a, b);
}
static LLVMValueRef
generate_scissor_test(LLVMBuilderRef builder,
                      LLVMValueRef context_ptr,
                      const struct lp_build_interp_soa_context *interp,
                      struct lp_type type)
{
   LLVMTypeRef vec_type = lp_build_vec_type(type);
   LLVMValueRef xpos = interp->pos[0], ypos = interp->pos[1];
   LLVMValueRef xmin, ymin, xmax, ymax;
   LLVMValueRef m0, m1, m2, m3, m;

   /* xpos, ypos contain the window coords for the four pixels in the quad */
   assert(xpos);
   assert(ypos);

   /* get the current scissor bounds, convert to vectors */
   xmin = lp_jit_context_scissor_xmin_value(builder, context_ptr);
   xmin = lp_build_broadcast(builder, vec_type, xmin);

   ymin = lp_jit_context_scissor_ymin_value(builder, context_ptr);
   ymin = lp_build_broadcast(builder, vec_type, ymin);

   xmax = lp_jit_context_scissor_xmax_value(builder, context_ptr);
   xmax = lp_build_broadcast(builder, vec_type, xmax);

   ymax = lp_jit_context_scissor_ymax_value(builder, context_ptr);
   ymax = lp_build_broadcast(builder, vec_type, ymax);

   /* compare the fragment's position coordinates against the scissor bounds */
   m0 = lp_build_compare(builder, type, PIPE_FUNC_GEQUAL, xpos, xmin);
   m1 = lp_build_compare(builder, type, PIPE_FUNC_GEQUAL, ypos, ymin);
   m2 = lp_build_compare(builder, type, PIPE_FUNC_LESS, xpos, xmax);
   m3 = lp_build_compare(builder, type, PIPE_FUNC_LESS, ypos, ymax);

   /* AND all the masks together */
   m = LLVMBuildAnd(builder, m0, m1, "");
   m = LLVMBuildAnd(builder, m, m2, "");
   m = LLVMBuildAnd(builder, m, m3, "");

   lp_build_name(m, "scissormask");

   return m;
}
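/*
 * For reference: the per-fragment predicate the vector code above evaluates,
 * written as a scalar sketch.  The helper name is hypothetical; min bounds
 * are inclusive and max bounds exclusive, matching the GEQUAL/LESS compare
 * funcs used above.
 */
static inline int
scissor_test_scalar_sketch(float x, float y,
                           float xmin, float ymin, float xmax, float ymax)
{
   return x >= xmin && y >= ymin && x < xmax && y < ymax;
}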
/**
 * Convert srgb int values to linear float values.
 * There are several possibilities for how to do this, e.g.
 * - table
 * - doing the pow() with int-to-float and float-to-int tricks
 *   (http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent)
 * - just using a standard polynomial approximation
 *   (a 3rd order polynomial is required for crappy but just sufficient
 *   accuracy)
 *
 * @param src   integer (vector) value(s) to convert
 *              (chan_bits bit values unpacked to 32 bit already).
 */
LLVMValueRef
lp_build_srgb_to_linear(struct gallivm_state *gallivm,
                        struct lp_type src_type,
                        unsigned chan_bits,
                        LLVMValueRef src)
{
   struct lp_type f32_type = lp_type_float_vec(32, src_type.length * 32);
   struct lp_build_context f32_bld;
   LLVMValueRef srcf, part_lin, part_pow, is_linear, lin_const, lin_thresh;
   double coeffs[4] = {0.0023f,
                       0.0030f / 255.0f,
                       0.6935f / (255.0f * 255.0f),
                       0.3012f / (255.0f * 255.0f * 255.0f)};

   assert(src_type.width == 32);
   /* Technically this would work with more bits too but would be inaccurate. */
   assert(chan_bits <= 8);

   lp_build_context_init(&f32_bld, gallivm, f32_type);

   /*
    * using polynomial: (src * (src * (src * 0.3012 + 0.6935) + 0.0030) + 0.0023)
    * (poly =  0.3012*x^3 + 0.6935*x^2 + 0.0030*x + 0.0023)
    * (found with octave polyfit and some magic as I couldn't get the error
    * function right). Using the above mentioned error function, the values stay
    * within +-0.35, except for the lowest values - hence tweaking the linear
    * segment to cover the first 16 instead of the first 11 values (the error
    * stays just about acceptable there too).
    * Hence: lin = src > 15 ? poly : src / 12.6
    * This function really only makes sense for vectors, should use a LUT
    * otherwise.
    * All in all (including float conversion) 11 instructions (with sse4.1),
    * 6 constants (polynomial could be done with 1 instruction less at the cost
    * of a slightly worse dependency chain, fma should also help).
    */

   /* doing the 1/255 mul as part of the approximation */
   srcf = lp_build_int_to_float(&f32_bld, src);
   if (chan_bits != 8) {
      /* could adjust all the constants instead */
      LLVMValueRef rescale_const = lp_build_const_vec(gallivm, f32_type,
                                                      255.0f / ((1 << chan_bits) - 1));
      srcf = lp_build_mul(&f32_bld, srcf, rescale_const);
   }
   lin_const = lp_build_const_vec(gallivm, f32_type, 1.0f / (12.6f * 255.0f));
   part_lin = lp_build_mul(&f32_bld, srcf, lin_const);

   part_pow = lp_build_polynomial(&f32_bld, srcf, coeffs, 4);

   lin_thresh = lp_build_const_vec(gallivm, f32_type, 15.0f);
   is_linear = lp_build_compare(gallivm, f32_type, PIPE_FUNC_LEQUAL, srcf, lin_thresh);

   return lp_build_select(&f32_bld, is_linear, part_lin, part_pow);
}
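/*
 * For reference: a scalar sketch of the approximation above for the 8-bit
 * case (assumes <stdint.h>; the helper name is hypothetical).  The 1/255
 * normalization is folded into the polynomial coefficients, exactly as in
 * the coeffs[] array above.
 */
static inline float
srgb_to_linear_scalar_sketch(uint8_t srgb)
{
   float x = (float)srgb;
   if (x <= 15.0f)
      return x * (1.0f / (12.6f * 255.0f));        /* linear segment */
   /* poly = 0.3012*x^3 + 0.6935*x^2 + 0.0030*x + 0.0023, x in [0,1] */
   return ((0.3012f / (255.0f * 255.0f * 255.0f) * x +
            0.6935f / (255.0f * 255.0f)) * x +
           0.0030f / 255.0f) * x + 0.0023f;
}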
/**
 * Converts an int16 half-float to float32.
 * Note this can be performed in 1 instruction if vcvtph2ps exists (F16C)
 * [llvm.x86.vcvtph2ps / _mm_cvtph_ps]
 *
 * @param src_type  <vector> type of int16
 * @param src       value to convert
 *
 * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
 */
LLVMValueRef
lp_build_half_to_float(struct gallivm_state *gallivm,
                       struct lp_type src_type,
                       LLVMValueRef src)
{
   struct lp_type f32_type = lp_type_float_vec(32, 32 * src_type.length);
   struct lp_type i32_type = lp_type_int_vec(32, 32 * src_type.length);

   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type);
   LLVMTypeRef float_vec_type = lp_build_vec_type(gallivm, f32_type);

   /* Constants */
   LLVMValueRef i32_13          = lp_build_const_int_vec(gallivm, i32_type, 13);
   LLVMValueRef i32_16          = lp_build_const_int_vec(gallivm, i32_type, 16);
   LLVMValueRef i32_mask_nosign = lp_build_const_int_vec(gallivm, i32_type, 0x7fff);
   LLVMValueRef i32_was_infnan  = lp_build_const_int_vec(gallivm, i32_type, 0x7bff);
   LLVMValueRef i32_exp_infnan  = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23);
   LLVMValueRef f32_magic       = LLVMBuildBitCast(builder,
                                                   lp_build_const_int_vec(gallivm, i32_type, (254 - 15) << 23),
                                                   float_vec_type, "");

   /* Convert int16 vector to int32 vector by zero ext */
   LLVMValueRef h = LLVMBuildZExt(builder, src, int_vec_type, "");

   /* Exponent / mantissa bits */
   LLVMValueRef expmant = LLVMBuildAnd(builder, i32_mask_nosign, h, "");
   LLVMValueRef shifted = LLVMBuildBitCast(builder,
                                           LLVMBuildShl(builder, expmant, i32_13, ""),
                                           float_vec_type, "");

   /* Exponent adjust */
   LLVMValueRef scaled = LLVMBuildBitCast(builder,
                                          LLVMBuildFMul(builder, shifted, f32_magic, ""),
                                          int_vec_type, "");

   /* Make sure Inf/NaN survive */
   LLVMValueRef b_wasinfnan = lp_build_compare(gallivm, i32_type,
                                               PIPE_FUNC_GREATER,
                                               expmant, i32_was_infnan);
   LLVMValueRef infnanexp   = LLVMBuildAnd(builder, b_wasinfnan, i32_exp_infnan, "");

   /* Sign bit */
   LLVMValueRef justsign = LLVMBuildXor(builder, h, expmant, "");
   LLVMValueRef sign     = LLVMBuildShl(builder, justsign, i32_16, "");

   /* Combine result */
   LLVMValueRef sign_inf = LLVMBuildOr(builder, sign, infnanexp, "");
   LLVMValueRef final    = LLVMBuildOr(builder, scaled, sign_inf, "");

   /* Cast from int32 vector to float32 vector */
   return LLVMBuildBitCast(builder, final, float_vec_type, "");
}
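/*
 * For reference: a scalar C rendition of the same bit trick (cf. the Giesen
 * post referenced above), assuming <stdint.h> and that denormals are honored
 * (no flush-to-zero).  The helper name is hypothetical, not part of gallivm.
 */
static inline float
half_to_float_scalar_sketch(uint16_t h)
{
   union { uint32_t u; float f; } magic = { (254 - 15) << 23 };
   union { uint32_t u; float f; } o;

   o.u = (uint32_t)(h & 0x7fff) << 13;     /* exponent/mantissa bits */
   o.f *= magic.f;                         /* exponent adjust */
   if ((h & 0x7fff) > 0x7bff)              /* was Inf/NaN */
      o.u |= 0xff << 23;                   /* force float32 exponent to 255 */
   o.u |= (uint32_t)(h & 0x8000) << 16;    /* sign bit */
   return o.f;
}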
/**
 * Register store.
 */
void
lp_emit_store_aos(
   struct lp_build_tgsi_aos_context *bld,
   const struct tgsi_full_instruction *inst,
   unsigned index,
   LLVMValueRef value)
{
   LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
   const struct tgsi_full_dst_register *reg = &inst->Dst[index];
   LLVMValueRef mask = NULL;
   LLVMValueRef ptr;

   /*
    * Saturate the value
    */
   switch (inst->Instruction.Saturate) {
   case TGSI_SAT_NONE:
      break;

   case TGSI_SAT_ZERO_ONE:
      value = lp_build_max(&bld->bld_base.base, value, bld->bld_base.base.zero);
      value = lp_build_min(&bld->bld_base.base, value, bld->bld_base.base.one);
      break;

   case TGSI_SAT_MINUS_PLUS_ONE:
      value = lp_build_max(&bld->bld_base.base, value,
                           lp_build_const_vec(bld->bld_base.base.gallivm,
                                              bld->bld_base.base.type, -1.0));
      value = lp_build_min(&bld->bld_base.base, value, bld->bld_base.base.one);
      break;

   default:
      assert(0);
   }

   /*
    * Translate the register file
    */
   assert(!reg->Register.Indirect);

   switch (reg->Register.File) {
   case TGSI_FILE_OUTPUT:
      ptr = bld->outputs[reg->Register.Index];
      break;

   case TGSI_FILE_TEMPORARY:
      ptr = bld->temps[reg->Register.Index];
      break;

   case TGSI_FILE_ADDRESS:
      ptr = bld->addr[reg->Indirect.Index];
      break;

   case TGSI_FILE_PREDICATE:
      ptr = bld->preds[reg->Register.Index];
      break;

   default:
      assert(0);
      return;
   }

   if (!ptr)
      return;

   /*
    * Predicate
    */
   if (inst->Instruction.Predicate) {
      LLVMValueRef pred;

      assert(inst->Predicate.Index < LP_MAX_TGSI_PREDS);

      pred = LLVMBuildLoad(builder, bld->preds[inst->Predicate.Index], "");

      /*
       * Convert the value to an integer mask.
       */
      pred = lp_build_compare(bld->bld_base.base.gallivm,
                              bld->bld_base.base.type,
                              PIPE_FUNC_NOTEQUAL,
                              pred,
                              bld->bld_base.base.zero);

      if (inst->Predicate.Negate) {
         pred = LLVMBuildNot(builder, pred, "");
      }

      pred = bld->bld_base.emit_swizzle(&bld->bld_base, pred,
                                        inst->Predicate.SwizzleX,
                                        inst->Predicate.SwizzleY,
                                        inst->Predicate.SwizzleZ,
                                        inst->Predicate.SwizzleW);

      if (mask) {
         mask = LLVMBuildAnd(builder, mask, pred, "");
      } else {
         mask = pred;
      }
   }

   /*
    * Writemask
    */
   if (reg->Register.WriteMask != TGSI_WRITEMASK_XYZW) {
      LLVMValueRef writemask;

      writemask = lp_build_const_mask_aos_swizzled(bld->bld_base.base.gallivm,
                                                   bld->bld_base.base.type,
                                                   reg->Register.WriteMask,
                                                   TGSI_NUM_CHANNELS,
                                                   bld->swizzles);

      if (mask) {
         mask = LLVMBuildAnd(builder, mask, writemask, "");
      } else {
         mask = writemask;
      }
   }

   if (mask) {
      LLVMValueRef orig_value;

      orig_value = LLVMBuildLoad(builder, ptr, "");
      value = lp_build_select(&bld->bld_base.base, mask, value, orig_value);
   }

   LLVMBuildStore(builder, value, ptr);
}
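/*
 * For reference: the select semantics the final masked store above boils
 * down to, per 32-bit element -- keep the old destination bits wherever the
 * combined predicate/writemask is 0.  A hypothetical sketch only; the JIT
 * may lower lp_build_select differently.
 */
static inline uint32_t
masked_store_scalar_sketch(uint32_t orig, uint32_t value, uint32_t mask)
{
   return (value & mask) | (orig & ~mask);
}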
/**
 * Generate the code to do inside/outside triangle testing for the
 * four pixels in a 2x2 quad.  This will set the four elements of the
 * quad mask vector to 0 or ~0.
 * \param i  which quad of the quad group to test, in [0,3]
 */
static void
generate_tri_edge_mask(LLVMBuilderRef builder,
                       unsigned i,
                       LLVMValueRef *mask,      /* ivec4, out */
                       LLVMValueRef c0,         /* int32 */
                       LLVMValueRef c1,         /* int32 */
                       LLVMValueRef c2,         /* int32 */
                       LLVMValueRef step0_ptr,  /* ivec4 */
                       LLVMValueRef step1_ptr,  /* ivec4 */
                       LLVMValueRef step2_ptr)  /* ivec4 */
{
#define OPTIMIZE_IN_OUT_TEST 0
#if OPTIMIZE_IN_OUT_TEST
   struct lp_build_if_state ifctx;
   LLVMValueRef not_draw_all;
#endif
   struct lp_build_flow_context *flow;
   struct lp_type i32_type;
   LLVMTypeRef i32vec4_type, mask_type;
   LLVMValueRef c0_vec, c1_vec, c2_vec;
   LLVMValueRef in_out_mask;

   assert(i < 4);

   /* int32 vector type */
   memset(&i32_type, 0, sizeof i32_type);
   i32_type.floating = FALSE; /* values are integers */
   i32_type.sign = TRUE;      /* values are signed */
   i32_type.norm = FALSE;     /* values are not normalized */
   i32_type.width = 32;       /* 32-bit int values */
   i32_type.length = 4;       /* 4 elements per vector */

   i32vec4_type = lp_build_int32_vec4_type();

   mask_type = LLVMIntType(32 * 4);

   /*
    * Use a conditional here to do detailed pixel in/out testing.
    * We only have to do this if c0 != INT_MIN.
    */
   flow = lp_build_flow_create(builder);
   lp_build_flow_scope_begin(flow);

   {
#if OPTIMIZE_IN_OUT_TEST
      /* not_draw_all = (c0 != INT_MIN) */
      not_draw_all = LLVMBuildICmp(builder,
                                   LLVMIntNE,
                                   c0,
                                   LLVMConstInt(LLVMInt32Type(), INT_MIN, 0),
                                   "");

      in_out_mask = lp_build_int_const_scalar(i32_type, ~0);

      lp_build_flow_scope_declare(flow, &in_out_mask);

      /* if (not_draw_all) {... */
      lp_build_if(&ifctx, flow, builder, not_draw_all);
#endif
      {
         LLVMValueRef step0_vec, step1_vec, step2_vec;
         LLVMValueRef m0_vec, m1_vec, m2_vec;
         LLVMValueRef index, m;

         /*
          * c0_vec = {c0, c0, c0, c0}
          * Note that we emit this code four times but LLVM optimizes away
          * three instances of it.
          */
         c0_vec = lp_build_broadcast(builder, i32vec4_type, c0);
         c1_vec = lp_build_broadcast(builder, i32vec4_type, c1);
         c2_vec = lp_build_broadcast(builder, i32vec4_type, c2);
         lp_build_name(c0_vec, "edgeconst0vec");
         lp_build_name(c1_vec, "edgeconst1vec");
         lp_build_name(c2_vec, "edgeconst2vec");

         /* load step0vec, step1vec, step2vec from memory */
         index = LLVMConstInt(LLVMInt32Type(), i, 0);
         step0_vec = LLVMBuildLoad(builder, LLVMBuildGEP(builder, step0_ptr, &index, 1, ""), "");
         step1_vec = LLVMBuildLoad(builder, LLVMBuildGEP(builder, step1_ptr, &index, 1, ""), "");
         step2_vec = LLVMBuildLoad(builder, LLVMBuildGEP(builder, step2_ptr, &index, 1, ""), "");
         lp_build_name(step0_vec, "step0vec");
         lp_build_name(step1_vec, "step1vec");
         lp_build_name(step2_vec, "step2vec");

         /* m0_vec = step0_ptr[i] > c0_vec */
         m0_vec = lp_build_compare(builder, i32_type, PIPE_FUNC_GREATER, step0_vec, c0_vec);
         m1_vec = lp_build_compare(builder, i32_type, PIPE_FUNC_GREATER, step1_vec, c1_vec);
         m2_vec = lp_build_compare(builder, i32_type, PIPE_FUNC_GREATER, step2_vec, c2_vec);

         /* in_out_mask = m0_vec & m1_vec & m2_vec */
         m = LLVMBuildAnd(builder, m0_vec, m1_vec, "");
         in_out_mask = LLVMBuildAnd(builder, m, m2_vec, "");
         lp_build_name(in_out_mask, "inoutmaskvec");
      }
#if OPTIMIZE_IN_OUT_TEST
      lp_build_endif(&ifctx);
#endif

   }
   lp_build_flow_scope_end(flow);
   lp_build_flow_destroy(flow);

   /* This is the initial alive/dead pixel mask for a quad of four pixels.
    * It's an int[4] vector with each word set to 0 or ~0.
    * Words will get cleared when pixels fail the Z test, etc.
    */
   *mask = in_out_mask;
}
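/*
 * For reference: the scalar in/out test applied to each pixel above.  A
 * pixel is inside the triangle only when all three edge functions pass
 * (strict PIPE_FUNC_GREATER, as in the vector compares).  Hypothetical
 * helper, assuming <stdint.h>.
 */
static inline int
tri_edge_test_scalar_sketch(int32_t step0, int32_t step1, int32_t step2,
                            int32_t c0, int32_t c1, int32_t c2)
{
   return step0 > c0 && step1 > c1 && step2 > c2;
}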
/**
 * Build LLVM code for texture coord wrapping, for linear filtering,
 * for scaled integer texcoords.
 * \param block_length  is the length of the pixel block along the
 *                      coordinate axis
 * \param coord0  the incoming texcoord (s,t,r or q) scaled to the texture size
 * \param length  the texture size along one dimension
 * \param stride  pixel stride along the coordinate axis (in bytes)
 * \param is_pot  if TRUE, length is a power of two
 * \param wrap_mode  one of PIPE_TEX_WRAP_x
 * \param offset0  resulting relative offset for coord0
 * \param offset1  resulting relative offset for coord0 + 1
 * \param i0  resulting sub-block pixel coordinate for coord0
 * \param i1  resulting sub-block pixel coordinate for coord0 + 1
 */
static void
lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
                                unsigned block_length,
                                LLVMValueRef coord0,
                                LLVMValueRef length,
                                LLVMValueRef stride,
                                boolean is_pot,
                                unsigned wrap_mode,
                                LLVMValueRef *offset0,
                                LLVMValueRef *offset1,
                                LLVMValueRef *i0,
                                LLVMValueRef *i1)
{
   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef length_minus_one;
   LLVMValueRef lmask, umask, mask;

   if (block_length != 1) {
      /*
       * If the pixel block covers more than one pixel then there is no easy
       * way to calculate offset1 relative to offset0. Instead, compute them
       * independently.
       */

      LLVMValueRef coord1;

      lp_build_sample_wrap_nearest_int(bld,
                                       block_length,
                                       coord0,
                                       length,
                                       stride,
                                       is_pot,
                                       wrap_mode,
                                       offset0, i0);

      coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);

      lp_build_sample_wrap_nearest_int(bld,
                                       block_length,
                                       coord1,
                                       length,
                                       stride,
                                       is_pot,
                                       wrap_mode,
                                       offset1, i1);

      return;
   }

   /*
    * Scalar pixels -- try to compute offset0 and offset1 with a single stride
    * multiplication.
    */

   *i0 = int_coord_bld->zero;
   *i1 = int_coord_bld->zero;

   length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);

   switch (wrap_mode) {
   case PIPE_TEX_WRAP_REPEAT:
      if (is_pot) {
         coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
      }
      else {
         /* Add a bias to the texcoord to handle negative coords */
         LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024);
         coord0 = LLVMBuildAdd(builder, coord0, bias, "");
         coord0 = LLVMBuildURem(builder, coord0, length, "");
      }

      mask = lp_build_compare(bld->gallivm, int_coord_bld->type,
                              PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);

      *offset0 = lp_build_mul(int_coord_bld, coord0, stride);
      *offset1 = LLVMBuildAnd(builder,
                              lp_build_add(int_coord_bld, *offset0, stride),
                              mask, "");
      break;

   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
      lmask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
                               PIPE_FUNC_GEQUAL, coord0, int_coord_bld->zero);
      umask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
                               PIPE_FUNC_LESS, coord0, length_minus_one);

      coord0 = lp_build_select(int_coord_bld, lmask, coord0, int_coord_bld->zero);
      coord0 = lp_build_select(int_coord_bld, umask, coord0, length_minus_one);

      mask = LLVMBuildAnd(builder, lmask, umask, "");

      *offset0 = lp_build_mul(int_coord_bld, coord0, stride);
      *offset1 = lp_build_add(int_coord_bld,
                              *offset0,
                              LLVMBuildAnd(builder, stride, mask, ""));
      break;

   case PIPE_TEX_WRAP_CLAMP:
   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
   case PIPE_TEX_WRAP_MIRROR_REPEAT:
   case PIPE_TEX_WRAP_MIRROR_CLAMP:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
   default:
      assert(0);
      *offset0 = int_coord_bld->zero;
      *offset1 = int_coord_bld->zero;
      break;
   }
}
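/*
 * For reference: a scalar sketch of the PIPE_TEX_WRAP_REPEAT branch above
 * (hypothetical helper, assuming <stdint.h>).  offset1 is offset0 + stride,
 * except at the last texel, where masking with the NOTEQUAL compare makes
 * the neighbour wrap back to texel 0 (byte offset 0).
 */
static inline void
wrap_repeat_linear_scalar_sketch(int32_t coord0, int32_t length,
                                 int32_t stride, int is_pot,
                                 int32_t *offset0, int32_t *offset1)
{
   if (is_pot)
      coord0 &= length - 1;
   else
      coord0 = (coord0 + length * 1024) % length;  /* bias handles negatives */

   *offset0 = coord0 * stride;
   *offset1 = (coord0 != length - 1) ? *offset0 + stride : 0;
}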
/**
 * Convert linear float values to srgb int values.
 * There are several possibilities for how to do this, e.g.
 * - use a table (based on exponent/highest order mantissa bits) and do
 *   linear interpolation (https://gist.github.com/rygorous/2203834)
 * - Chebyshev polynomial
 * - approximation using reciprocals
 * - using int-to-float and float-to-int tricks for pow()
 *   (http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent)
 *
 * @param src   float (vector) value(s) to convert.
 */
static LLVMValueRef
lp_build_linear_to_srgb(struct gallivm_state *gallivm,
                        struct lp_type src_type,
                        LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_build_context f32_bld;
   LLVMValueRef lin_thresh, lin, lin_const, is_linear, tmp, pow_final;

   lp_build_context_init(&f32_bld, gallivm, src_type);

   src = lp_build_clamp(&f32_bld, src, f32_bld.zero, f32_bld.one);

   if (0) {
      /*
       * using int-to-float and float-to-int trick for pow().
       * This is much more accurate than necessary thanks to the correction,
       * but it most certainly makes no sense without rsqrt available.
       * Bonus points if you understand how this works...
       * All in all (including min/max clamp, conversion) 19 instructions.
       */

      float exp_f = 2.0f / 3.0f;
      /* some compilers can't do exp2f, so this is exp2f(127.0f/exp_f - 127.0f) */
      float exp2f_c = 1.30438178253e+19f;
      float coeff_f = 0.62996f;
      LLVMValueRef pow_approx, coeff, x2, exponent, pow_1, pow_2;
      struct lp_type int_type = lp_int_type(src_type);

      /*
       * First calculate approx x^8/12
       */
      exponent = lp_build_const_vec(gallivm, src_type, exp_f);
      coeff = lp_build_const_vec(gallivm, src_type,
                                 exp2f_c * powf(coeff_f, 1.0f / exp_f));

      /* premultiply src */
      tmp = lp_build_mul(&f32_bld, coeff, src);
      /* "log2" */
      tmp = LLVMBuildBitCast(builder, tmp,
                             lp_build_vec_type(gallivm, int_type), "");
      tmp = lp_build_int_to_float(&f32_bld, tmp);
      /* multiply for pow */
      tmp = lp_build_mul(&f32_bld, tmp, exponent);
      /* "exp2" */
      pow_approx = lp_build_itrunc(&f32_bld, tmp);
      pow_approx = LLVMBuildBitCast(builder, pow_approx,
                                    lp_build_vec_type(gallivm, src_type), "");

      /*
       * Since that pow was inaccurate (like 3 bits, though each sqrt step would
       * give another bit), compensate the error (which is why we chose another
       * exponent in the first place).
       */
      /* x * x^(8/12) = x^(20/12) */
      pow_1 = lp_build_mul(&f32_bld, pow_approx, src);

      /* x * x * x^(-4/12) = x^(20/12) */
      /* Should avoid using rsqrt if it's not available, but
       * using x * x^(4/12) * x^(4/12) instead will change the error weight */
      tmp = lp_build_fast_rsqrt(&f32_bld, pow_approx);
      x2 = lp_build_mul(&f32_bld, src, src);
      pow_2 = lp_build_mul(&f32_bld, x2, tmp);

      /* average the values so the errors cancel out, compensate bias,
       * we also squeeze the 1.055 mul of the srgb conversion plus the 255.0 mul
       * for conversion to int in here
       */
      tmp = lp_build_add(&f32_bld, pow_1, pow_2);
      coeff = lp_build_const_vec(gallivm, src_type,
                                 1.0f / (3.0f * coeff_f) * 0.999852f *
                                 powf(1.055f * 255.0f, 4.0f));
      pow_final = lp_build_mul(&f32_bld, tmp, coeff);

      /* x^(5/12) = rsqrt(rsqrt(x^20/12)) */
      if (lp_build_fast_rsqrt_available(src_type)) {
         pow_final = lp_build_fast_rsqrt(&f32_bld,
                        lp_build_fast_rsqrt(&f32_bld, pow_final));
      }
      else {
         pow_final = lp_build_sqrt(&f32_bld, lp_build_sqrt(&f32_bld, pow_final));
      }
      pow_final = lp_build_add(&f32_bld, pow_final,
                               lp_build_const_vec(gallivm, src_type, -0.055f * 255.0f));
   }

   else {
      /*
       * using "rational polynomial" approximation here.
       * Essentially y = a*x^0.375 + b*x^0.5 + c, with also
       * factoring in the 255.0 mul and the scaling mul.
       * (a is closer to the actual value so it has higher weight than b.)
       * Note: the constants are magic values.  They were found empirically,
       * possibly could be improved but good enough (be VERY careful with the
       * error metric if you'd want to tweak them, they also MUST fit with
       * the crappy polynomial above for srgb->linear since it is required
       * that each srgb value maps back to the same value).
       * This function has an error of max +-0.17 (and we'd only require +-0.6),
       * for the approximated srgb->linear values the error is naturally larger
       * (+-0.42) but still accurate enough (required +-0.5 essentially).
       * All in all (including min/max clamp, conversion) 15 instructions.
       * FMA would help (minus 2 instructions).
       */

      LLVMValueRef x05, x0375, a_const, b_const, c_const, tmp2;

      if (lp_build_fast_rsqrt_available(src_type)) {
         tmp = lp_build_fast_rsqrt(&f32_bld, src);
         x05 = lp_build_mul(&f32_bld, src, tmp);
      }
      else {
         /*
          * I don't really expect this to be practical without rsqrt
          * but there's no reason for triple punishment so at least
          * save the otherwise resulting division and unnecessary mul...
          */
         x05 = lp_build_sqrt(&f32_bld, src);
      }

      tmp = lp_build_mul(&f32_bld, x05, src);
      if (lp_build_fast_rsqrt_available(src_type)) {
         x0375 = lp_build_fast_rsqrt(&f32_bld, lp_build_fast_rsqrt(&f32_bld, tmp));
      }
      else {
         x0375 = lp_build_sqrt(&f32_bld, lp_build_sqrt(&f32_bld, tmp));
      }

      a_const = lp_build_const_vec(gallivm, src_type, 0.675f * 1.0622 * 255.0f);
      b_const = lp_build_const_vec(gallivm, src_type, 0.325f * 1.0622 * 255.0f);
      c_const = lp_build_const_vec(gallivm, src_type, -0.0620f * 255.0f);

      tmp = lp_build_mul(&f32_bld, a_const, x0375);
      tmp2 = lp_build_mul(&f32_bld, b_const, x05);
      tmp2 = lp_build_add(&f32_bld, tmp2, c_const);
      pow_final = lp_build_add(&f32_bld, tmp, tmp2);
   }

   /* linear part is easy */
   lin_const = lp_build_const_vec(gallivm, src_type, 12.92f * 255.0f);
   lin = lp_build_mul(&f32_bld, src, lin_const);

   lin_thresh = lp_build_const_vec(gallivm, src_type, 0.0031308f);
   is_linear = lp_build_compare(gallivm, src_type, PIPE_FUNC_LEQUAL, src, lin_thresh);
   tmp = lp_build_select(&f32_bld, is_linear, lin, pow_final);

   f32_bld.type.sign = 0;
   return lp_build_iround(&f32_bld, tmp);
}
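/*
 * For reference: a scalar sketch of the default (rational polynomial) path
 * above, mapping a pre-clamped linear value in [0,1] to an 8-bit srgb value.
 * Assumes <math.h>; the helper name is hypothetical and sqrtf() stands in
 * for the fast-rsqrt sequences used by the JIT code.
 */
static inline int
linear_to_srgb_scalar_sketch(float x)
{
   float v;
   if (x <= 0.0031308f) {
      v = x * 12.92f * 255.0f;                  /* linear segment */
   }
   else {
      float x05 = sqrtf(x);                     /* x^0.5 */
      float x0375 = sqrtf(sqrtf(x05 * x));      /* (x^1.5)^0.25 = x^0.375 */
      v = 0.675f * 1.0622f * 255.0f * x0375 +
          0.325f * 1.0622f * 255.0f * x05 -
          0.0620f * 255.0f;
   }
   return (int)(v + 0.5f);                      /* round; v is non-negative */
}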
/**
 * Expand the relevant bits of mask_input to a 4-dword mask for the
 * four pixels in a 2x2 quad.  This will set the four elements of the
 * quad mask vector to 0 or ~0.
 *
 * \param quad  which quad of the quad group to test, in [0,3]
 * \param mask_input  bitwise mask for the whole 4x4 stamp
 */
static LLVMValueRef
generate_quad_mask(LLVMBuilderRef builder,
                   struct lp_type fs_type,
                   unsigned quad,
                   LLVMValueRef mask_input) /* int32 */
{
   struct lp_type mask_type;
   LLVMTypeRef i32t = LLVMInt32Type();
   LLVMValueRef bits[4];
   LLVMValueRef mask;
   int shift;

   /*
    * XXX: We'll need a different path for 16 x u8
    */
   assert(fs_type.width == 32);
   assert(fs_type.length == 4);
   mask_type = lp_int_type(fs_type);

   /*
    * Shift mask_input so this quad's bits end up in the low bits.
    */
   switch (quad) {
   case 0:
      shift = 0;
      break;
   case 1:
      shift = 2;
      break;
   case 2:
      shift = 8;
      break;
   case 3:
      shift = 10;
      break;
   default:
      assert(0);
      shift = 0;
   }

   mask_input = LLVMBuildLShr(builder,
                              mask_input,
                              LLVMConstInt(i32t, shift, 0),
                              "");

   /*
    * mask = { mask_input & (1 << i), for i in [0,3] }
    */
   mask = lp_build_broadcast(builder, lp_build_vec_type(mask_type), mask_input);

   bits[0] = LLVMConstInt(i32t, 1 << 0, 0);
   bits[1] = LLVMConstInt(i32t, 1 << 1, 0);
   bits[2] = LLVMConstInt(i32t, 1 << 4, 0);
   bits[3] = LLVMConstInt(i32t, 1 << 5, 0);

   mask = LLVMBuildAnd(builder, mask, LLVMConstVector(bits, 4), "");

   /*
    * mask = mask != 0 ? ~0 : 0
    */
   mask = lp_build_compare(builder,
                           mask_type, PIPE_FUNC_NOTEQUAL,
                           mask,
                           lp_build_const_int_vec(mask_type, 0));

   return mask;
}
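/*
 * For reference: a scalar expansion of the stamp mask (hypothetical helper,
 * assuming <stdint.h>).  Bit (y*4 + x) of mask_input covers pixel (x, y) of
 * the 4x4 stamp, which is why the quad start bits are 0, 2, 8 and 10 and the
 * in-quad pixel bits are 0, 1, 4 and 5.
 */
static inline void
quad_mask_scalar_sketch(uint32_t mask_input, unsigned quad, uint32_t out[4])
{
   static const unsigned shifts[4] = { 0, 2, 8, 10 };
   static const unsigned bits[4] = { 1 << 0, 1 << 1, 1 << 4, 1 << 5 };
   uint32_t m = mask_input >> shifts[quad];
   unsigned i;

   for (i = 0; i < 4; i++)
      out[i] = (m & bits[i]) ? ~0u : 0u;
}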