/**
 * Extract Y, U, V channels from packed YUYV.
 * @param packed  is a <n x i32> vector with the packed YUYV blocks
 * @param i  is a <n x i32> vector with the x pixel coordinate (0 or 1)
 */
static void
yuyv_to_yuv_soa(struct gallivm_state *gallivm,
                unsigned n,
                LLVMValueRef packed,
                LLVMValueRef i,
                LLVMValueRef *y,
                LLVMValueRef *u,
                LLVMValueRef *v)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_type type;
   LLVMValueRef mask;

   memset(&type, 0, sizeof type);
   type.width = 32;
   type.length = n;

   assert(lp_check_value(type, packed));
   assert(lp_check_value(type, i));

   /*
    * y = (yuyv >> 16*i) & 0xff
    * u = (yuyv >> 8   ) & 0xff
    * v = (yuyv >> 24  ) & 0xff
    */

#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
   /*
    * Avoid shift with per-element count.
    * No support on x86, gets translated to roughly 5 instructions
    * per element. Didn't measure performance but cuts shader size
    * by quite a bit (less difference if cpu has no sse4.1 support).
    */
   if (util_cpu_caps.has_sse2 && n == 4) {
      LLVMValueRef sel, tmp;
      struct lp_build_context bld32;

      lp_build_context_init(&bld32, gallivm, type);

      tmp = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(gallivm, type, 16), "");
      sel = lp_build_compare(gallivm, type, PIPE_FUNC_EQUAL, i, lp_build_const_int_vec(gallivm, type, 0));
       *y = lp_build_select(&bld32, sel, packed, tmp);
   } else
#endif
   {
      LLVMValueRef shift;
      shift = LLVMBuildMul(builder, i, lp_build_const_int_vec(gallivm, type, 16), "");
      *y = LLVMBuildLShr(builder, packed, shift, "");
   }

   *u = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(gallivm, type, 8), "");
   *v = LLVMBuildLShr(builder, packed, lp_build_const_int_vec(gallivm, type, 24), "");

   mask = lp_build_const_int_vec(gallivm, type, 0xff);

   *y = LLVMBuildAnd(builder, *y, mask, "y");
   *u = LLVMBuildAnd(builder, *u, mask, "u");
   *v = LLVMBuildAnd(builder, *v, mask, "v");
}
/**
 * Build code to compare two values 'a' and 'b' using the given func.
 * \param func  one of PIPE_FUNC_x
 * The result values will be 0 for false or ~0 for true.
 */
LLVMValueRef
lp_build_cmp(struct lp_build_context *bld,
             unsigned func,
             LLVMValueRef a,
             LLVMValueRef b)
{
   return lp_build_compare(bld->gallivm, bld->type, func, a, b);
}
Example #3
0
static LLVMValueRef
generate_scissor_test(LLVMBuilderRef builder,
                      LLVMValueRef context_ptr,
                      const struct lp_build_interp_soa_context *interp,
                      struct lp_type type)
{
   LLVMTypeRef vec_type = lp_build_vec_type(type);
   LLVMValueRef xpos = interp->pos[0], ypos = interp->pos[1];
   LLVMValueRef xmin, ymin, xmax, ymax;
   LLVMValueRef m0, m1, m2, m3, m;

   /* xpos, ypos contain the window coords for the four pixels in the quad */
   assert(xpos);
   assert(ypos);

   /* get the current scissor bounds, convert to vectors */
   xmin = lp_jit_context_scissor_xmin_value(builder, context_ptr);
   xmin = lp_build_broadcast(builder, vec_type, xmin);

   ymin = lp_jit_context_scissor_ymin_value(builder, context_ptr);
   ymin = lp_build_broadcast(builder, vec_type, ymin);

   xmax = lp_jit_context_scissor_xmax_value(builder, context_ptr);
   xmax = lp_build_broadcast(builder, vec_type, xmax);

   ymax = lp_jit_context_scissor_ymax_value(builder, context_ptr);
   ymax = lp_build_broadcast(builder, vec_type, ymax);

   /* compare the fragment's position coordinates against the scissor bounds */
   m0 = lp_build_compare(builder, type, PIPE_FUNC_GEQUAL, xpos, xmin);
   m1 = lp_build_compare(builder, type, PIPE_FUNC_GEQUAL, ypos, ymin);
   m2 = lp_build_compare(builder, type, PIPE_FUNC_LESS, xpos, xmax);
   m3 = lp_build_compare(builder, type, PIPE_FUNC_LESS, ypos, ymax);

   /* AND all the masks together */
   m = LLVMBuildAnd(builder, m0, m1, "");
   m = LLVMBuildAnd(builder, m, m2, "");
   m = LLVMBuildAnd(builder, m, m3, "");

   lp_build_name(m, "scissormask");

   return m;
}
/**
 * Convert srgb int values to linear float values.
 * Several possibilities how to do this, e.g.
 * - table
 * - doing the pow() with int-to-float and float-to-int tricks
 *   (http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent)
 * - just using standard polynomial approximation
 *   (3rd order polynomial is required for crappy but just sufficient accuracy)
 *
 * @param src   integer (vector) value(s) to convert
 *              (chan_bits bit values unpacked to 32 bit already).
 */
LLVMValueRef
lp_build_srgb_to_linear(struct gallivm_state *gallivm,
                        struct lp_type src_type,
                        unsigned chan_bits,
                        LLVMValueRef src)
{
   struct lp_type f32_type = lp_type_float_vec(32, src_type.length * 32);
   struct lp_build_context f32_bld;
   LLVMValueRef srcf, part_lin, part_pow, is_linear, lin_const, lin_thresh;
   double coeffs[4] = {0.0023f,
                       0.0030f / 255.0f,
                       0.6935f / (255.0f * 255.0f),
                       0.3012f / (255.0f * 255.0f * 255.0f)
   };

   assert(src_type.width == 32);
   /* Technically this would work with more bits too but would be inaccurate. */
   assert(chan_bits <= 8);

   lp_build_context_init(&f32_bld, gallivm, f32_type);

   /*
    * using polynomial: (src * (src * (src * 0.3012 + 0.6935) + 0.0030) + 0.0023)
    * ( poly =  0.3012*x^3 + 0.6935*x^2 + 0.0030*x + 0.0023)
    * (found with octave polyfit and some magic as I couldn't get the error
    * function right). Using the above mentioned error function, the values stay
    * within +-0.35, except for the lowest values - hence tweaking linear segment
    * to cover the first 16 instead of the first 11 values (the error stays
    * just about acceptable there too).
    * Hence: lin = src > 15 ? poly : src / 12.6
    * This function really only makes sense for vectors, should use LUT otherwise.
    * All in all (including float conversion) 11 instructions (with sse4.1),
    * 6 constants (polynomial could be done with 1 instruction less at the cost
    * of slightly worse dependency chain, fma should also help).
    */
   /* doing the 1/255 mul as part of the approximation */
   srcf = lp_build_int_to_float(&f32_bld, src);
   if (chan_bits != 8) {
      /* could adjust all the constants instead */
      LLVMValueRef rescale_const = lp_build_const_vec(gallivm, f32_type,
                                                      255.0f / ((1 << chan_bits) - 1));
      srcf = lp_build_mul(&f32_bld, srcf, rescale_const);
   }
   lin_const = lp_build_const_vec(gallivm, f32_type, 1.0f / (12.6f * 255.0f));
   part_lin = lp_build_mul(&f32_bld, srcf, lin_const);

   part_pow = lp_build_polynomial(&f32_bld, srcf, coeffs, 4);

   lin_thresh = lp_build_const_vec(gallivm, f32_type, 15.0f);
   is_linear = lp_build_compare(gallivm, f32_type, PIPE_FUNC_LEQUAL, srcf, lin_thresh);
   return lp_build_select(&f32_bld, is_linear, part_lin, part_pow);
}
Example #5
0
/**
 * Converts int16 half-float to float32
 * Note this can be performed in 1 instruction if vcvtph2ps exists (sse5 i think?)
 * [llvm.x86.vcvtph2ps / _mm_cvtph_ps]
 *
 * @param src_type      <vector> type of int16
 * @param src           value to convert
 *
 * ref http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
 */
LLVMValueRef
lp_build_half_to_float(struct gallivm_state *gallivm,
                       struct lp_type src_type,
                       LLVMValueRef src)
{
    struct lp_type f32_type = lp_type_float_vec(32, 32 * src_type.length);
    struct lp_type i32_type = lp_type_int_vec(32, 32 * src_type.length);

    LLVMBuilderRef builder = gallivm->builder;
    LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm, i32_type);
    LLVMTypeRef float_vec_type = lp_build_vec_type(gallivm, f32_type);

    /* Constants */
    LLVMValueRef i32_13          = lp_build_const_int_vec(gallivm, i32_type, 13);
    LLVMValueRef i32_16          = lp_build_const_int_vec(gallivm, i32_type, 16);
    LLVMValueRef i32_mask_nosign = lp_build_const_int_vec(gallivm, i32_type, 0x7fff);
    LLVMValueRef i32_was_infnan  = lp_build_const_int_vec(gallivm, i32_type, 0x7bff);
    LLVMValueRef i32_exp_infnan  = lp_build_const_int_vec(gallivm, i32_type, 0xff << 23);
    LLVMValueRef f32_magic       = LLVMBuildBitCast(builder,
                                   lp_build_const_int_vec(gallivm, i32_type, (254 - 15) << 23),
                                   float_vec_type, "");

    /* Convert int16 vector to int32 vector by zero ext */
    LLVMValueRef h             = LLVMBuildZExt(builder, src, int_vec_type, "");

    /* Exponent / mantissa bits */
    LLVMValueRef expmant       = LLVMBuildAnd(builder, i32_mask_nosign, h, "");
    LLVMValueRef shifted       = LLVMBuildBitCast(builder, LLVMBuildShl(builder, expmant, i32_13, ""), float_vec_type, "");

    /* Exponent adjust */
    LLVMValueRef scaled        = LLVMBuildBitCast(builder, LLVMBuildFMul(builder, shifted, f32_magic, ""), int_vec_type, "");

    /* Make sure Inf/NaN survive */
    LLVMValueRef b_wasinfnan   = lp_build_compare(gallivm, i32_type, PIPE_FUNC_GREATER, expmant, i32_was_infnan);
    LLVMValueRef infnanexp     = LLVMBuildAnd(builder, b_wasinfnan, i32_exp_infnan, "");

    /* Sign bit */
    LLVMValueRef justsign      = LLVMBuildXor(builder, h, expmant, "");
    LLVMValueRef sign          = LLVMBuildShl(builder, justsign, i32_16, "");

    /* Combine result */
    LLVMValueRef sign_inf      = LLVMBuildOr(builder, sign, infnanexp, "");
    LLVMValueRef final         = LLVMBuildOr(builder, scaled, sign_inf, "");

    /* Cast from int32 vector to float32 vector */
    return LLVMBuildBitCast(builder, final, float_vec_type, "");
}
/**
 * Register store.
 */
void
lp_emit_store_aos(
   struct lp_build_tgsi_aos_context *bld,
   const struct tgsi_full_instruction *inst,
   unsigned index,
   LLVMValueRef value)
{
   LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
   const struct tgsi_full_dst_register *reg = &inst->Dst[index];
   LLVMValueRef mask = NULL;
   LLVMValueRef ptr;

   /*
    * Saturate the value
    */

   switch (inst->Instruction.Saturate) {
   case TGSI_SAT_NONE:
      break;

   case TGSI_SAT_ZERO_ONE:
      value = lp_build_max(&bld->bld_base.base, value, bld->bld_base.base.zero);
      value = lp_build_min(&bld->bld_base.base, value, bld->bld_base.base.one);
      break;

   case TGSI_SAT_MINUS_PLUS_ONE:
      value = lp_build_max(&bld->bld_base.base, value, lp_build_const_vec(bld->bld_base.base.gallivm, bld->bld_base.base.type, -1.0));
      value = lp_build_min(&bld->bld_base.base, value, bld->bld_base.base.one);
      break;

   default:
      assert(0);
   }

   /*
    * Translate the register file
    */

   assert(!reg->Register.Indirect);

   switch (reg->Register.File) {
   case TGSI_FILE_OUTPUT:
      ptr = bld->outputs[reg->Register.Index];
      break;

   case TGSI_FILE_TEMPORARY:
      ptr = bld->temps[reg->Register.Index];
      break;

   case TGSI_FILE_ADDRESS:
      ptr = bld->addr[reg->Indirect.Index];
      break;

   case TGSI_FILE_PREDICATE:
      ptr = bld->preds[reg->Register.Index];
      break;

   default:
      assert(0);
      return;
   }

   if (!ptr)
      return;
   /*
    * Predicate
    */

   if (inst->Instruction.Predicate) {
      LLVMValueRef pred;

      assert(inst->Predicate.Index < LP_MAX_TGSI_PREDS);

      pred = LLVMBuildLoad(builder,
                           bld->preds[inst->Predicate.Index], "");

      /*
       * Convert the value to an integer mask.
       */
      pred = lp_build_compare(bld->bld_base.base.gallivm,
                               bld->bld_base.base.type,
                               PIPE_FUNC_NOTEQUAL,
                               pred,
                               bld->bld_base.base.zero);

      if (inst->Predicate.Negate) {
         pred = LLVMBuildNot(builder, pred, "");
      }

      pred = bld->bld_base.emit_swizzle(&bld->bld_base, pred,
                         inst->Predicate.SwizzleX,
                         inst->Predicate.SwizzleY,
                         inst->Predicate.SwizzleZ,
                         inst->Predicate.SwizzleW);

      if (mask) {
         mask = LLVMBuildAnd(builder, mask, pred, "");
      } else {
         mask = pred;
      }
   }

   /*
    * Writemask
    */

   if (reg->Register.WriteMask != TGSI_WRITEMASK_XYZW) {
      LLVMValueRef writemask;

      writemask = lp_build_const_mask_aos_swizzled(bld->bld_base.base.gallivm,
                                                   bld->bld_base.base.type,
                                                   reg->Register.WriteMask,
                                                   TGSI_NUM_CHANNELS,
                                                   bld->swizzles);

      if (mask) {
         mask = LLVMBuildAnd(builder, mask, writemask, "");
      } else {
         mask = writemask;
      }
   }

   if (mask) {
      LLVMValueRef orig_value;

      orig_value = LLVMBuildLoad(builder, ptr, "");
      value = lp_build_select(&bld->bld_base.base,
                              mask, value, orig_value);
   }

   LLVMBuildStore(builder, value, ptr);
}
Example #7
0
/**
 * Generate the code to do inside/outside triangle testing for the
 * four pixels in a 2x2 quad.  This will set the four elements of the
 * quad mask vector to 0 or ~0.
 * \param i  which quad of the quad group to test, in [0,3]
 */
static void
generate_tri_edge_mask(LLVMBuilderRef builder,
                       unsigned i,
                       LLVMValueRef *mask,      /* ivec4, out */
                       LLVMValueRef c0,         /* int32 */
                       LLVMValueRef c1,         /* int32 */
                       LLVMValueRef c2,         /* int32 */
                       LLVMValueRef step0_ptr,  /* ivec4 */
                       LLVMValueRef step1_ptr,  /* ivec4 */
                       LLVMValueRef step2_ptr)  /* ivec4 */
{
#define OPTIMIZE_IN_OUT_TEST 0
#if OPTIMIZE_IN_OUT_TEST
   struct lp_build_if_state ifctx;
   LLVMValueRef not_draw_all;
#endif
   struct lp_build_flow_context *flow;
   struct lp_type i32_type;
   LLVMTypeRef i32vec4_type, mask_type;
   LLVMValueRef c0_vec, c1_vec, c2_vec;
   LLVMValueRef in_out_mask;

   assert(i < 4);
   
   /* int32 vector type */
   memset(&i32_type, 0, sizeof i32_type);
   i32_type.floating = FALSE; /* values are integers */
   i32_type.sign = TRUE;      /* values are signed */
   i32_type.norm = FALSE;     /* values are not normalized */
   i32_type.width = 32;       /* 32-bit int values */
   i32_type.length = 4;       /* 4 elements per vector */

   i32vec4_type = lp_build_int32_vec4_type();

   mask_type = LLVMIntType(32 * 4);

   /*
    * Use a conditional here to do detailed pixel in/out testing.
    * We only have to do this if c0 != INT_MIN.
    */
   flow = lp_build_flow_create(builder);
   lp_build_flow_scope_begin(flow);

   {
#if OPTIMIZE_IN_OUT_TEST
      /* not_draw_all = (c0 != INT_MIN) */
      not_draw_all = LLVMBuildICmp(builder,
                                   LLVMIntNE,
                                   c0,
                                   LLVMConstInt(LLVMInt32Type(), INT_MIN, 0),
                                   "");

      in_out_mask = lp_build_int_const_scalar(i32_type, ~0);


      lp_build_flow_scope_declare(flow, &in_out_mask);

      /* if (not_draw_all) {... */
      lp_build_if(&ifctx, flow, builder, not_draw_all);
#endif
      {
         LLVMValueRef step0_vec, step1_vec, step2_vec;
         LLVMValueRef m0_vec, m1_vec, m2_vec;
         LLVMValueRef index, m;

         /* c0_vec = {c0, c0, c0, c0}
          * Note that we emit this code four times but LLVM optimizes away
          * three instances of it.
          */
         c0_vec = lp_build_broadcast(builder, i32vec4_type, c0);
         c1_vec = lp_build_broadcast(builder, i32vec4_type, c1);
         c2_vec = lp_build_broadcast(builder, i32vec4_type, c2);
         lp_build_name(c0_vec, "edgeconst0vec");
         lp_build_name(c1_vec, "edgeconst1vec");
         lp_build_name(c2_vec, "edgeconst2vec");

         /* load step0vec, step1, step2 vec from memory */
         index = LLVMConstInt(LLVMInt32Type(), i, 0);
         step0_vec = LLVMBuildLoad(builder, LLVMBuildGEP(builder, step0_ptr, &index, 1, ""), "");
         step1_vec = LLVMBuildLoad(builder, LLVMBuildGEP(builder, step1_ptr, &index, 1, ""), "");
         step2_vec = LLVMBuildLoad(builder, LLVMBuildGEP(builder, step2_ptr, &index, 1, ""), "");
         lp_build_name(step0_vec, "step0vec");
         lp_build_name(step1_vec, "step1vec");
         lp_build_name(step2_vec, "step2vec");

         /* m0_vec = step0_ptr[i] > c0_vec */
         m0_vec = lp_build_compare(builder, i32_type, PIPE_FUNC_GREATER, step0_vec, c0_vec);
         m1_vec = lp_build_compare(builder, i32_type, PIPE_FUNC_GREATER, step1_vec, c1_vec);
         m2_vec = lp_build_compare(builder, i32_type, PIPE_FUNC_GREATER, step2_vec, c2_vec);

         /* in_out_mask = m0_vec & m1_vec & m2_vec */
         m = LLVMBuildAnd(builder, m0_vec, m1_vec, "");
         in_out_mask = LLVMBuildAnd(builder, m, m2_vec, "");
         lp_build_name(in_out_mask, "inoutmaskvec");
      }
#if OPTIMIZE_IN_OUT_TEST
      lp_build_endif(&ifctx);
#endif

   }
   lp_build_flow_scope_end(flow);
   lp_build_flow_destroy(flow);

   /* This is the initial alive/dead pixel mask for a quad of four pixels.
    * It's an int[4] vector with each word set to 0 or ~0.
    * Words will get cleared when pixels faile the Z test, etc.
    */
   *mask = in_out_mask;
}
Example #8
0
/**
 * Build LLVM code for texture coord wrapping, for linear filtering,
 * for scaled integer texcoords.
 * \param block_length  is the length of the pixel block along the
 *                      coordinate axis
 * \param coord0  the incoming texcoord (s,t,r or q) scaled to the texture size
 * \param length  the texture size along one dimension
 * \param stride  pixel stride along the coordinate axis (in bytes)
 * \param is_pot  if TRUE, length is a power of two
 * \param wrap_mode  one of PIPE_TEX_WRAP_x
 * \param offset0  resulting relative offset for coord0
 * \param offset1  resulting relative offset for coord0 + 1
 * \param i0  resulting sub-block pixel coordinate for coord0
 * \param i1  resulting sub-block pixel coordinate for coord0 + 1
 */
static void
lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
                                unsigned block_length,
                                LLVMValueRef coord0,
                                LLVMValueRef length,
                                LLVMValueRef stride,
                                boolean is_pot,
                                unsigned wrap_mode,
                                LLVMValueRef *offset0,
                                LLVMValueRef *offset1,
                                LLVMValueRef *i0,
                                LLVMValueRef *i1)
{
   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef length_minus_one;
   LLVMValueRef lmask, umask, mask;

   if (block_length != 1) {
      /*
       * If the pixel block covers more than one pixel then there is no easy
       * way to calculate offset1 relative to offset0. Instead, compute them
       * independently.
       */

      LLVMValueRef coord1;

      lp_build_sample_wrap_nearest_int(bld,
                                       block_length,
                                       coord0,
                                       length,
                                       stride,
                                       is_pot,
                                       wrap_mode,
                                       offset0, i0);

      coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);

      lp_build_sample_wrap_nearest_int(bld,
                                       block_length,
                                       coord1,
                                       length,
                                       stride,
                                       is_pot,
                                       wrap_mode,
                                       offset1, i1);

      return;
   }

   /*
    * Scalar pixels -- try to compute offset0 and offset1 with a single stride
    * multiplication.
    */

   *i0 = int_coord_bld->zero;
   *i1 = int_coord_bld->zero;

   length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);

   switch(wrap_mode) {
   case PIPE_TEX_WRAP_REPEAT:
      if (is_pot) {
         coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
      }
      else {
         /* Add a bias to the texcoord to handle negative coords */
         LLVMValueRef bias = lp_build_mul_imm(int_coord_bld, length, 1024);
         coord0 = LLVMBuildAdd(builder, coord0, bias, "");
         coord0 = LLVMBuildURem(builder, coord0, length, "");
      }

      mask = lp_build_compare(bld->gallivm, int_coord_bld->type,
                              PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);

      *offset0 = lp_build_mul(int_coord_bld, coord0, stride);
      *offset1 = LLVMBuildAnd(builder,
                              lp_build_add(int_coord_bld, *offset0, stride),
                              mask, "");
      break;

   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
      lmask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
                               PIPE_FUNC_GEQUAL, coord0, int_coord_bld->zero);
      umask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
                               PIPE_FUNC_LESS, coord0, length_minus_one);

      coord0 = lp_build_select(int_coord_bld, lmask, coord0, int_coord_bld->zero);
      coord0 = lp_build_select(int_coord_bld, umask, coord0, length_minus_one);

      mask = LLVMBuildAnd(builder, lmask, umask, "");

      *offset0 = lp_build_mul(int_coord_bld, coord0, stride);
      *offset1 = lp_build_add(int_coord_bld,
                              *offset0,
                              LLVMBuildAnd(builder, stride, mask, ""));
      break;

   case PIPE_TEX_WRAP_CLAMP:
   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
   case PIPE_TEX_WRAP_MIRROR_REPEAT:
   case PIPE_TEX_WRAP_MIRROR_CLAMP:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
   default:
      assert(0);
      *offset0 = int_coord_bld->zero;
      *offset1 = int_coord_bld->zero;
      break;
   }
}
/**
 * Convert linear float values to srgb int values.
 * Several possibilities how to do this, e.g.
 * - use table (based on exponent/highest order mantissa bits) and do
 *   linear interpolation (https://gist.github.com/rygorous/2203834)
 * - Chebyshev polynomial
 * - Approximation using reciprocals
 * - using int-to-float and float-to-int tricks for pow()
 *   (http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent)
 *
 * @param src   float (vector) value(s) to convert.
 */
static LLVMValueRef
lp_build_linear_to_srgb(struct gallivm_state *gallivm,
                        struct lp_type src_type,
                        LLVMValueRef src)
{
   LLVMBuilderRef builder = gallivm->builder;
   struct lp_build_context f32_bld;
   LLVMValueRef lin_thresh, lin, lin_const, is_linear, tmp, pow_final;

   lp_build_context_init(&f32_bld, gallivm, src_type);

   src = lp_build_clamp(&f32_bld, src, f32_bld.zero, f32_bld.one);

   if (0) {
      /*
       * using int-to-float and float-to-int trick for pow().
       * This is much more accurate than necessary thanks to the correction,
       * but it most certainly makes no sense without rsqrt available.
       * Bonus points if you understand how this works...
       * All in all (including min/max clamp, conversion) 19 instructions.
       */

      float exp_f = 2.0f / 3.0f;
      /* some compilers can't do exp2f, so this is exp2f(127.0f/exp_f - 127.0f) */
      float exp2f_c = 1.30438178253e+19f;
      float coeff_f = 0.62996f;
      LLVMValueRef pow_approx, coeff, x2, exponent, pow_1, pow_2;
      struct lp_type int_type = lp_int_type(src_type);

      /*
       * First calculate approx x^8/12
       */
      exponent = lp_build_const_vec(gallivm, src_type, exp_f);
      coeff = lp_build_const_vec(gallivm, src_type,
                                 exp2f_c * powf(coeff_f, 1.0f / exp_f));

      /* premultiply src */
      tmp = lp_build_mul(&f32_bld, coeff, src);
      /* "log2" */
      tmp = LLVMBuildBitCast(builder, tmp, lp_build_vec_type(gallivm, int_type), "");
      tmp = lp_build_int_to_float(&f32_bld, tmp);
      /* multiply for pow */
      tmp = lp_build_mul(&f32_bld, tmp, exponent);
      /* "exp2" */
      pow_approx = lp_build_itrunc(&f32_bld, tmp);
      pow_approx = LLVMBuildBitCast(builder, pow_approx,
                                    lp_build_vec_type(gallivm, src_type), "");

      /*
       * Since that pow was inaccurate (like 3 bits, though each sqrt step would
       * give another bit), compensate the error (which is why we chose another
       * exponent in the first place).
       */
      /* x * x^(8/12) = x^(20/12) */
      pow_1 = lp_build_mul(&f32_bld, pow_approx, src);

      /* x * x * x^(-4/12) = x^(20/12) */
      /* Should avoid using rsqrt if it's not available, but
       * using x * x^(4/12) * x^(4/12) instead will change error weight */
      tmp = lp_build_fast_rsqrt(&f32_bld, pow_approx);
      x2 = lp_build_mul(&f32_bld, src, src);
      pow_2 = lp_build_mul(&f32_bld, x2, tmp);

      /* average the values so the errors cancel out, compensate bias,
       * we also squeeze the 1.055 mul of the srgb conversion plus the 255.0 mul
       * for conversion to int in here */
      tmp = lp_build_add(&f32_bld, pow_1, pow_2);
      coeff = lp_build_const_vec(gallivm, src_type,
                                 1.0f / (3.0f * coeff_f) * 0.999852f *
                                 powf(1.055f * 255.0f, 4.0f));
      pow_final = lp_build_mul(&f32_bld, tmp, coeff);

      /* x^(5/12) = rsqrt(rsqrt(x^20/12)) */
      if (lp_build_fast_rsqrt_available(src_type)) {
         pow_final = lp_build_fast_rsqrt(&f32_bld,
                        lp_build_fast_rsqrt(&f32_bld, pow_final));
      }
      else {
         pow_final = lp_build_sqrt(&f32_bld, lp_build_sqrt(&f32_bld, pow_final));
      }
      pow_final = lp_build_add(&f32_bld, pow_final,
                               lp_build_const_vec(gallivm, src_type, -0.055f * 255.0f));
   }

   else {
      /*
       * using "rational polynomial" approximation here.
       * Essentially y = a*x^0.375 + b*x^0.5 + c, with also
       * factoring in the 255.0 mul and the scaling mul.
       * (a is closer to actual value so has higher weight than b.)
       * Note: the constants are magic values. They were found empirically,
       * possibly could be improved but good enough (be VERY careful with
       * error metric if you'd want to tweak them, they also MUST fit with
       * the crappy polynomial above for srgb->linear since it is required
       * that each srgb value maps back to the same value).
       * This function has an error of max +-0.17 (and we'd only require +-0.6),
       * for the approximated srgb->linear values the error is naturally larger
       * (+-0.42) but still accurate enough (required +-0.5 essentially).
       * All in all (including min/max clamp, conversion) 15 instructions.
       * FMA would help (minus 2 instructions).
       */

      LLVMValueRef x05, x0375, a_const, b_const, c_const, tmp2;

      if (lp_build_fast_rsqrt_available(src_type)) {
         tmp = lp_build_fast_rsqrt(&f32_bld, src);
         x05 = lp_build_mul(&f32_bld, src, tmp);
      }
      else {
         /*
          * I don't really expect this to be practical without rsqrt
          * but there's no reason for triple punishment so at least
          * save the otherwise resulting division and unnecessary mul...
          */
         x05 = lp_build_sqrt(&f32_bld, src);
      }

      tmp = lp_build_mul(&f32_bld, x05, src);
      if (lp_build_fast_rsqrt_available(src_type)) {
         x0375 = lp_build_fast_rsqrt(&f32_bld, lp_build_fast_rsqrt(&f32_bld, tmp));
      }
      else {
         x0375 = lp_build_sqrt(&f32_bld, lp_build_sqrt(&f32_bld, tmp));
      }

      a_const = lp_build_const_vec(gallivm, src_type, 0.675f * 1.0622 * 255.0f);
      b_const = lp_build_const_vec(gallivm, src_type, 0.325f * 1.0622 * 255.0f);
      c_const = lp_build_const_vec(gallivm, src_type, -0.0620f * 255.0f);

      tmp = lp_build_mul(&f32_bld, a_const, x0375);
      tmp2 = lp_build_mul(&f32_bld, b_const, x05);
      tmp2 = lp_build_add(&f32_bld, tmp2, c_const);
      pow_final = lp_build_add(&f32_bld, tmp, tmp2);
   }

   /* linear part is easy */
   lin_const = lp_build_const_vec(gallivm, src_type, 12.92f * 255.0f);
   lin = lp_build_mul(&f32_bld, src, lin_const);

   lin_thresh = lp_build_const_vec(gallivm, src_type, 0.0031308f);
   is_linear = lp_build_compare(gallivm, src_type, PIPE_FUNC_LEQUAL, src, lin_thresh);
   tmp = lp_build_select(&f32_bld, is_linear, lin, pow_final);

   f32_bld.type.sign = 0;
   return lp_build_iround(&f32_bld, tmp);
}
/**
 * Expand the relevent bits of mask_input to a 4-dword mask for the 
 * four pixels in a 2x2 quad.  This will set the four elements of the
 * quad mask vector to 0 or ~0.
 *
 * \param quad  which quad of the quad group to test, in [0,3]
 * \param mask_input  bitwise mask for the whole 4x4 stamp
 */
static LLVMValueRef
generate_quad_mask(LLVMBuilderRef builder,
                   struct lp_type fs_type,
                   unsigned quad,
                   LLVMValueRef mask_input) /* int32 */
{
   struct lp_type mask_type;
   LLVMTypeRef i32t = LLVMInt32Type();
   LLVMValueRef bits[4];
   LLVMValueRef mask;
   int shift;

   /*
    * XXX: We'll need a different path for 16 x u8
    */
   assert(fs_type.width == 32);
   assert(fs_type.length == 4);
   mask_type = lp_int_type(fs_type);

   /*
    * mask_input >>= (quad * 4)
    */
   
   switch (quad) {
   case 0:
      shift = 0;
      break;
   case 1:
      shift = 2;
      break;
   case 2:
      shift = 8;
      break;
   case 3:
      shift = 10;
      break;
   default:
      assert(0);
      shift = 0;
   }

   mask_input = LLVMBuildLShr(builder,
                              mask_input,
                              LLVMConstInt(i32t, shift, 0),
                              "");

   /*
    * mask = { mask_input & (1 << i), for i in [0,3] }
    */

   mask = lp_build_broadcast(builder, lp_build_vec_type(mask_type), mask_input);

   bits[0] = LLVMConstInt(i32t, 1 << 0, 0);
   bits[1] = LLVMConstInt(i32t, 1 << 1, 0);
   bits[2] = LLVMConstInt(i32t, 1 << 4, 0);
   bits[3] = LLVMConstInt(i32t, 1 << 5, 0);
   
   mask = LLVMBuildAnd(builder, mask, LLVMConstVector(bits, 4), "");

   /*
    * mask = mask != 0 ? ~0 : 0
    */

   mask = lp_build_compare(builder,
                           mask_type, PIPE_FUNC_NOTEQUAL,
                           mask,
                           lp_build_const_int_vec(mask_type, 0));

   return mask;
}